diff --git a/apis/r/.Rbuildignore b/apis/r/.Rbuildignore index 7ec1437e7f..ea196e0fcd 100644 --- a/apis/r/.Rbuildignore +++ b/apis/r/.Rbuildignore @@ -23,3 +23,4 @@ tiledbsoma/libtiledbsoma.tar.gz.current # vscode ^\.vscode$ ^\.devcontainer$ +^data-raw$ diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 8bfd37e133..ec9d9bf3a5 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -4,7 +4,7 @@ Title: TileDB SOMA Description: Interface for working with 'TileDB'-based Stack of Matrices, Annotated ('SOMA'): an open data model for representing annotated matrices, like those commonly used for single cell data analysis. -Version: 0.0.0.9021 +Version: 0.0.0.9022 Authors@R: c( person(given = "Aaron", family = "Wolen", diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index b76f144800..36498e17fb 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -49,6 +49,9 @@ export(ScalarMap) export(TileDBArray) export(TileDBGroup) export(TileDBObject) +export(extract_dataset) +export(list_datasets) +export(load_dataset) export(nnz) export(pad_matrix) export(shape) @@ -87,6 +90,7 @@ importFrom(spdl,debug) importFrom(spdl,info) importFrom(spdl,setup) importFrom(stats,setNames) +importFrom(tools,file_path_sans_ext) importFrom(urltools,url_compose) importFrom(urltools,url_parse) importFrom(utils,packageVersion) diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 2daadee463..5c2616f103 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -16,3 +16,4 @@ * Add Seurat outgestors for `SOMAExperimentAxisQuery` objects * Numeric coordinates passed to SOMADataFrame$read() are now automatically upcast to int64 when necessary * Add ingestors to read data from `Seurat` objects +* Add methods for listing and accessing bundled datasets, which now include a `SOMAExperiment` containing the pbmc_small dataset from the SeuratObject package diff --git a/apis/r/R/datasets.R b/apis/r/R/datasets.R new file mode 100644 index 0000000000..ec47823c3d --- /dev/null +++ b/apis/r/R/datasets.R @@ -0,0 
+1,90 @@
+#' SOMA Example Datasets
+#'
+#' @description
+#' Access example SOMA objects bundled with the tiledbsoma package.
+#'
+#' Use `list_datasets()` to list the available datasets and `load_dataset()` to
+#' load a dataset into memory using the appropriate SOMA class. The
+#' `extract_dataset()` method returns the path to the extracted dataset without
+#' loading it into memory.
+#'
+#' @details
+#' The SOMA objects are stored as `tar.gz` files in the package's `extdata`
+#' directory. Calling `load_dataset()` extracts the `tar.gz` file to the
+#' specified `dir`, inspects its metadata to determine the appropriate SOMA
+#' class to instantiate, and returns the SOMA object.
+#'
+#' @examples
+#' soma_pbmc_small <- load_dataset("soma-exp-pbmc-small")
+#'
+#' @name example-datasets
+NULL
+
+#' @rdname example-datasets
+#' @return
+#' - `list_datasets()` returns a character vector of the available datasets.
+#' @importFrom tools file_path_sans_ext
+#' @export
+list_datasets <- function() {
+  # Only files carrying the full `.tar.gz` extension count as datasets
+  files <- list.files(example_data_dir(), pattern = "\\.tar\\.gz$")
+  tools::file_path_sans_ext(basename(files), compression = TRUE)
+}
+
+#' @rdname example-datasets
+#' @param name The name of the dataset.
+#' @param dir The directory where the dataset will be extracted to (default:
+#' `tempdir()`).
+#' @return
+#' - `extract_dataset()` returns the path to the extracted dataset.
+#' @export
+extract_dataset <- function(name, dir = tempdir()) {
+  data_dir <- example_data_dir()
+  datasets <- list_datasets()
+
+  stopifnot(
+    "The specified directory does not exist" = dir.exists(dir),
+    "Provide the name of a single dataset" = is_scalar_character(name),
+    assert_subset(name, datasets, type = "dataset")
+  )
+
+  # Build the archive path from the exact dataset name. Using `name` as a
+  # regular expression could match several archives (or the wrong one) when it
+  # contains regex metacharacters or is a substring of another dataset's name.
+  tarfile <- file.path(data_dir, paste0(name, ".tar.gz"))
+  stopifnot("The specified dataset does not exist" = file.exists(tarfile))
+
+  # Each dataset is unpacked into its own sub-directory of `dir`
+  dataset_uri <- file.path(dir, name)
+  utils::untar(tarfile, exdir = dataset_uri)
+  dataset_uri
+}
+
+#' @rdname example-datasets
+#' @return
+#' - `load_dataset()` returns a SOMA object.
+#' @export
+load_dataset <- function(name, dir = tempdir()) {
+  dataset_uri <- extract_dataset(name, dir)
+
+  # Wrap the extracted object in the generic TileDB class matching its type
+  object <- switch(
+    tiledb::tiledb_object_type(dataset_uri),
+    "ARRAY" = TileDBArray$new(dataset_uri, internal_use_only = "allowed_use"),
+    "GROUP" = TileDBGroup$new(dataset_uri, internal_use_only = "allowed_use"),
+    stop("The dataset is not a TileDB Array or Group", call. = FALSE)
+  )
+
+  # Dispatch on the recorded SOMA object type to the matching opener
+  switch(
+    object$get_metadata("soma_object_type"),
+    "SOMAExperiment" = SOMAExperimentOpen(dataset_uri),
+    "SOMADataFrame" = SOMADataFrameOpen(dataset_uri),
+    stop("The dataset is an unsupported SOMA object", call. = FALSE)
+  )
+}
+
+# Locate the installed package's `extdata` directory; errors if it is absent
+example_data_dir <- function() {
+  system.file("extdata", package = "tiledbsoma", mustWork = TRUE)
+}
diff --git a/apis/r/data-raw/create-soma-exp-pbmc-small.R b/apis/r/data-raw/create-soma-exp-pbmc-small.R
new file mode 100644
index 0000000000..3b6f53b689
--- /dev/null
+++ b/apis/r/data-raw/create-soma-exp-pbmc-small.R
@@ -0,0 +1,23 @@
+# Create SOMAExperiment for the pbmc_small dataset
+# https://mojaveazure.github.io/seurat-object/reference/pbmc_small.html
+
+library(tiledbsoma)
+library(SeuratObject)
+
+# load the pbmc_small dataset
+data(pbmc_small, package = "SeuratObject")
+pbmc_small
+
+# variables
+data_dir <- normalizePath(file.path("inst", "extdata"))
+soma_exp_name <- "soma-exp-pbmc-small"
+soma_exp_uri <- file.path(tempdir(), soma_exp_name)
+tar_file <- file.path(data_dir, paste0(soma_exp_name, ".tar.gz"))
+
+# create the SOMAExperiment
+write_soma(pbmc_small, uri = soma_exp_uri)
+
+# create tar.gz file containing the SOMAExperiment
+od <- setwd(soma_exp_uri)
+tar(tar_file, compression = "gzip")
+setwd(od)
diff --git a/apis/r/inst/extdata/soma-dataframe-pbmc3k-processed-obs.tar.gz b/apis/r/inst/extdata/soma-dataframe-pbmc3k-processed-obs.tar.gz
new file mode 100644
index 0000000000..eb39ac98dc
Binary files /dev/null and b/apis/r/inst/extdata/soma-dataframe-pbmc3k-processed-obs.tar.gz differ
diff --git a/apis/r/inst/extdata/soma-exp-pbmc-small.tar.gz b/apis/r/inst/extdata/soma-exp-pbmc-small.tar.gz
new file mode 100644
index 0000000000..2698e1c4f0
Binary files /dev/null and b/apis/r/inst/extdata/soma-exp-pbmc-small.tar.gz differ
diff --git a/apis/r/inst/raw-data/soco-pbmc3k_processed-obs.tar.gz b/apis/r/inst/raw-data/soco-pbmc3k_processed-obs.tar.gz
deleted file mode 100644
index 220fc8cf2b..0000000000
Binary files a/apis/r/inst/raw-data/soco-pbmc3k_processed-obs.tar.gz and /dev/null differ
diff --git a/apis/r/man/example-datasets.Rd b/apis/r/man/example-datasets.Rd
new file mode 100644
index 0000000000..b6e4cb12da
--- /dev/null
+++ 
b/apis/r/man/example-datasets.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{example-datasets} +\alias{example-datasets} +\alias{list_datasets} +\alias{extract_dataset} +\alias{load_dataset} +\title{SOMA Example Datasets} +\usage{ +list_datasets() + +extract_dataset(name, dir = tempdir()) + +load_dataset(name, dir = tempdir()) +} +\arguments{ +\item{name}{The name of the dataset.} + +\item{dir}{The directory where the dataset will be extracted to (default: +\code{tempdir()}).} +} +\value{ +\itemize{ +\item \code{list_datasets()} returns a character vector of the available datasets. +} + +\itemize{ +\item \code{extract_dataset()} returns the path to the extracted dataset. +} + +\itemize{ +\item \code{load_dataset()} returns a SOMA object. +} +} +\description{ +Access example SOMA objects bundled with the tiledbsoma package. + +Use \code{list_datasets()} to list the available datasets and \code{load_dataset()} to +load a dataset into memory using the appropriate SOMA class. The +\code{extract_dataset()} method returns the path to the extracted dataset without +loading it into memory. +} +\details{ +The SOMA objects are stored as \code{tar.gz} files in the package's \code{extdata} +directory. Calling \code{load_dataset()} extracts the \code{tar.gz} file to the +specified \code{dir}, inspects its metadata to determine the appropriate SOMA +class to instantiate, and returns the SOMA object. 
+} +\examples{ +soma_pbmc_small <- load_dataset("soma-exp-pbmc-small") + +} diff --git a/apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R b/apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R index 020be25987..2a1d57e8c8 100644 --- a/apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R +++ b/apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R @@ -3,12 +3,8 @@ test_that("Arrow Interface from SOMAArrayReader", { library(tiledb) skip_if_not_installed("dplyr") # a Suggests + uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs") - tdir <- tempfile() - tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma") - untar(tarfile = tgzfile, exdir = tdir) - - uri <- file.path(tdir, "obs") columns <- c("n_counts", "n_genes", "louvain") z <- soma_array_reader(uri, columns) diff --git a/apis/r/tests/testthat/test-SOMAArrayReader-Basics.R b/apis/r/tests/testthat/test-SOMAArrayReader-Basics.R index 85bb53c1c3..d7048bf57f 100644 --- a/apis/r/tests/testthat/test-SOMAArrayReader-Basics.R +++ b/apis/r/tests/testthat/test-SOMAArrayReader-Basics.R @@ -1,9 +1,5 @@ test_that("Basic SOMAArrayReader", { - tdir <- tempfile() - tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma") - untar(tarfile = tgzfile, exdir = tdir) - - uri <- file.path(tdir, "obs") + uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs") df <- arrow_to_dt(soma_array_reader(uri)) expect_equal(nrow(df), 2638L) diff --git a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R index 062ff3ebd2..7557996a72 100644 --- a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R +++ b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R @@ -58,10 +58,7 @@ test_that("Iterated Interface from SOMAArrayReader", { expect_equal(ncol(rl), 3) ## test completeness predicate on shorter data - tdir <- tempfile() - tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", 
package="tiledbsoma") - untar(tarfile = tgzfile, exdir = tdir) - uri <- file.path(tdir, "obs") + uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs") sr <- sr_setup(uri, config=as.character(config(ctx))) expect_false(tiledbsoma:::sr_complete(sr)) diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index 9cb998e502..983ae37755 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -158,11 +158,7 @@ test_that("int64 values are stored correctly", { }) test_that("SOMADataFrame read", { - tdir <- tempfile() - tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma") - untar(tarfile = tgzfile, exdir = tdir) - - uri <- file.path(tdir, "obs") + uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs") sdf <- SOMADataFrame$new(uri, internal_use_only = "allowed_use") z <- sdf$read() diff --git a/apis/r/tests/testthat/test-example-datasets.R b/apis/r/tests/testthat/test-example-datasets.R new file mode 100644 index 0000000000..8026c67a87 --- /dev/null +++ b/apis/r/tests/testthat/test-example-datasets.R @@ -0,0 +1,18 @@ +test_that("example dataset access", { + expect_length( + list_datasets(), + length(dir(example_data_dir())) + ) + + # Test that the dataset can be extracted + dataset_uri <- extract_dataset("soma-exp-pbmc-small") + expect_true(dir.exists(dataset_uri)) + expect_equal(tiledb::tiledb_object_type(dataset_uri), "GROUP") + + # Test the datasets can be loaded + exp <- load_dataset("soma-exp-pbmc-small") + expect_s3_class(exp, "SOMAExperiment") + + sdf <- load_dataset("soma-dataframe-pbmc3k-processed-obs") + expect_s3_class(sdf, "SOMADataFrame") +})