Skip to content

Commit

Permalink
[r] Add new example dataset and methods for accessing example datasets (
Browse files Browse the repository at this point in the history
#1298)

* Add pbmc-small SOMAExperiment
* Add methods for accessing bundled soma objects
* Add helper for getting the example data directory
* Rename existing bundled pbmc3k obs
* Add tests for dataset utils
* Update existing tests to use new datasets API
* Rename raw-data to extdata
* Bump version and update news and docs
  • Loading branch information
aaronwolen authored Apr 26, 2023
1 parent 2d4d1b7 commit 508c96c
Show file tree
Hide file tree
Showing 15 changed files with 190 additions and 20 deletions.
1 change: 1 addition & 0 deletions apis/r/.Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ tiledbsoma/libtiledbsoma.tar.gz.current
# vscode
^\.vscode$
^\.devcontainer$
^data-raw$
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Title: TileDB SOMA
Description: Interface for working with 'TileDB'-based Stack of Matrices,
Annotated ('SOMA'): an open data model for representing annotated matrices,
like those commonly used for single cell data analysis.
Version: 0.0.0.9021
Version: 0.0.0.9022
Authors@R: c(
person(given = "Aaron",
family = "Wolen",
Expand Down
4 changes: 4 additions & 0 deletions apis/r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ export(ScalarMap)
export(TileDBArray)
export(TileDBGroup)
export(TileDBObject)
export(extract_dataset)
export(list_datasets)
export(load_dataset)
export(nnz)
export(pad_matrix)
export(shape)
Expand Down Expand Up @@ -87,6 +90,7 @@ importFrom(spdl,debug)
importFrom(spdl,info)
importFrom(spdl,setup)
importFrom(stats,setNames)
importFrom(tools,file_path_sans_ext)
importFrom(urltools,url_compose)
importFrom(urltools,url_parse)
importFrom(utils,packageVersion)
Expand Down
1 change: 1 addition & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
* Add Seurat outgestors for `SOMAExperimentAxisQuery` objects
* Numeric coordinates passed to SOMADataFrame$read() are now automatically upcast to int64 when necessary
* Add ingestors to read data from `Seurat` objects
* Add methods for listing and accessing bundled datasets, which now include a `SOMAExperiment` containing the pbmc_small dataset from the SeuratObject package
86 changes: 86 additions & 0 deletions apis/r/R/datasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#' SOMA Example Datasets
#'
#' @description
#' Access example SOMA objects bundled with the tiledbsoma package.
#'
#' Use `list_datasets()` to list the available datasets and `load_dataset()` to
#' extract a dataset and instantiate it using the appropriate SOMA class. The
#' `extract_dataset()` function returns the path to the extracted dataset
#' without instantiating a SOMA object.
#'
#' @details
#' The SOMA objects are stored as `tar.gz` files in the package's `extdata`
#' directory; a dataset's name is its archive file name without the extension.
#' Calling `load_dataset()` extracts the `tar.gz` file into a subdirectory of
#' the specified `dir` named after the dataset, inspects its metadata to
#' determine the appropriate SOMA class to instantiate, and returns the SOMA
#' object.
#'
#' @examples
#' soma_pbmc_small <- load_dataset("soma-exp-pbmc-small")
#'
#' @name example-datasets
NULL

#' @rdname example-datasets
#' @return
#' - `list_datasets()` returns a character vector of the available datasets.
#' @importFrom tools file_path_sans_ext
#' @export
list_datasets <- function() {
  data_dir <- example_data_dir()
  # Anchor on the literal ".tar.gz" suffix so only real archives are listed;
  # the looser "tar\\.gz$" would also accept a file such as "footar.gz".
  archives <- list.files(data_dir, pattern = "\\.tar\\.gz$")
  # With compression = TRUE the ".gz" suffix is removed before the ".tar"
  # extension, leaving just the dataset name.
  tools::file_path_sans_ext(basename(archives), compression = TRUE)
}

#' @rdname example-datasets
#' @param name The name of the dataset.
#' @param dir The directory where the dataset will be extracted to (default:
#' `tempdir()`).
#' @return
#' - `extract_dataset()` returns the path to the extracted dataset.
#' @export
extract_dataset <- function(name, dir = tempdir()) {
  data_dir <- example_data_dir()
  datasets <- list_datasets()

  stopifnot(
    "The specified directory does not exist" = dir.exists(dir),
    "Provide the name of a single dataset" = is_scalar_character(name),
    assert_subset(name, datasets, type = "dataset")
  )

  # Build the archive path directly instead of searching the directory with
  # `name` as a regular expression: a regex lookup is unanchored, so a name
  # that is a prefix of another archive (or that contains metacharacters such
  # as ".") could resolve to several files or to the wrong one. This also
  # avoids calling base::dir() while the `dir` argument shadows it.
  tarfile <- file.path(data_dir, paste0(name, ".tar.gz"))
  stopifnot("The specified dataset does not exist" = file.exists(tarfile))

  # The archive is unpacked into its own subdirectory of `dir`
  dataset_uri <- file.path(dir, name)
  utils::untar(tarfile, exdir = dataset_uri)
  dataset_uri
}

#' @rdname example-datasets
#' @return
#' - `load_dataset()` returns a SOMA object.
#' @export
load_dataset <- function(name, dir = tempdir()) {
  # Unpack the bundled archive and keep the location of the extracted object
  dataset_uri <- extract_dataset(name, dir)

  # Wrap the extracted object in a generic TileDB handle so that its metadata
  # can be queried
  tiledb_type <- tiledb::tiledb_object_type(dataset_uri)
  handle <- switch(
    tiledb_type,
    "ARRAY" = TileDBArray$new(dataset_uri, internal_use_only = "allowed_use"),
    "GROUP" = TileDBGroup$new(dataset_uri, internal_use_only = "allowed_use"),
    stop("The dataset is not a TileDB Array or Group", call. = FALSE)
  )

  # Dispatch on the recorded SOMA type to open the matching SOMA class
  soma_type <- handle$get_metadata("soma_object_type")
  switch(
    soma_type,
    "SOMAExperiment" = SOMAExperimentOpen(dataset_uri),
    "SOMADataFrame" = SOMADataFrameOpen(dataset_uri),
    stop("The dataset is an unsupported SOMA object", call. = FALSE)
  )
}

# Path to the `extdata` directory of the installed tiledbsoma package.
# Errors (via `mustWork = TRUE`) when the directory cannot be found.
example_data_dir <- function() {
  extdata_path <- system.file(
    "extdata",
    package = "tiledbsoma",
    mustWork = TRUE
  )
  extdata_path
}
23 changes: 23 additions & 0 deletions apis/r/data-raw/create-soma-exp-pbmc-small.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Create SOMAExperiment for the pbmc_small dataset
# https://mojaveazure.github.io/seurat-object/reference/pbmc_small.html
#
# The output path is resolved relative to the current working directory, so
# run this script from the directory that contains `inst/extdata`.

library(tiledbsoma)
library(SeuratObject)

# load the pbmc_small dataset
data(pbmc_small, package = "SeuratObject")
pbmc_small

# variables
# mustWork = TRUE: fail immediately if inst/extdata is missing; otherwise
# normalizePath() only warns and returns a relative path, which would point
# somewhere else once the working directory is changed below
data_dir <- normalizePath(file.path("inst", "extdata"), mustWork = TRUE)
soma_exp_name <- "soma-exp-pbmc-small"
soma_exp_uri <- file.path(tempdir(), soma_exp_name)
tar_file <- file.path(data_dir, paste0(soma_exp_name, ".tar.gz"))

# create the SOMAExperiment
write_soma(pbmc_small, uri = soma_exp_uri)

# create tar.gz file containing the SOMAExperiment
# tar() is run from inside the experiment directory so the archive holds the
# experiment's contents at its top level; the previous working directory is
# restored even if tar() fails
od <- setwd(soma_exp_uri)
invisible(tryCatch(
  tar(tar_file, compression = "gzip"),
  finally = setwd(od)
))
Binary file not shown.
Binary file added apis/r/inst/extdata/soma-exp-pbmc-small.tar.gz
Binary file not shown.
Binary file not shown.
52 changes: 52 additions & 0 deletions apis/r/man/example-datasets.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 1 addition & 5 deletions apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@ test_that("Arrow Interface from SOMAArrayReader", {
library(tiledb)

skip_if_not_installed("dplyr") # a Suggests
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")

tdir <- tempfile()
tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma")
untar(tarfile = tgzfile, exdir = tdir)

uri <- file.path(tdir, "obs")
columns <- c("n_counts", "n_genes", "louvain")

z <- soma_array_reader(uri, columns)
Expand Down
6 changes: 1 addition & 5 deletions apis/r/tests/testthat/test-SOMAArrayReader-Basics.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
test_that("Basic SOMAArrayReader", {
tdir <- tempfile()
tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma")
untar(tarfile = tgzfile, exdir = tdir)

uri <- file.path(tdir, "obs")
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")

df <- arrow_to_dt(soma_array_reader(uri))
expect_equal(nrow(df), 2638L)
Expand Down
5 changes: 1 addition & 4 deletions apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,7 @@ test_that("Iterated Interface from SOMAArrayReader", {
expect_equal(ncol(rl), 3)

## test completeness predicate on shorter data
tdir <- tempfile()
tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma")
untar(tarfile = tgzfile, exdir = tdir)
uri <- file.path(tdir, "obs")
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")
sr <- sr_setup(uri, config=as.character(config(ctx)))

expect_false(tiledbsoma:::sr_complete(sr))
Expand Down
6 changes: 1 addition & 5 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,7 @@ test_that("int64 values are stored correctly", {
})

test_that("SOMADataFrame read", {
tdir <- tempfile()
tgzfile <- system.file("raw-data", "soco-pbmc3k_processed-obs.tar.gz", package="tiledbsoma")
untar(tarfile = tgzfile, exdir = tdir)

uri <- file.path(tdir, "obs")
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")

sdf <- SOMADataFrame$new(uri, internal_use_only = "allowed_use")
z <- sdf$read()
Expand Down
18 changes: 18 additions & 0 deletions apis/r/tests/testthat/test-example-datasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
test_that("example dataset access", {
  # Every file in the example data directory should be reported as a dataset
  bundled_files <- dir(example_data_dir())
  expect_length(list_datasets(), length(bundled_files))

  # Extraction should produce a directory containing a TileDB group
  extracted_uri <- extract_dataset("soma-exp-pbmc-small")
  expect_true(dir.exists(extracted_uri))
  expect_equal(tiledb::tiledb_object_type(extracted_uri), "GROUP")

  # Loading should return the SOMA class matching each bundled dataset
  pbmc_small_exp <- load_dataset("soma-exp-pbmc-small")
  expect_s3_class(pbmc_small_exp, "SOMAExperiment")

  pbmc3k_obs <- load_dataset("soma-dataframe-pbmc3k-processed-obs")
  expect_s3_class(pbmc3k_obs, "SOMADataFrame")
})

0 comments on commit 508c96c

Please sign in to comment.