diff --git a/DESCRIPTION b/DESCRIPTION index 6ad0f1bc3..32b27e33e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.15.17 +Version: 1.15.18 Authors@R: c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), email = "tuomas.v.borman@utu.fi", diff --git a/NAMESPACE b/NAMESPACE index 89f8f960e..739a0b590 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -400,6 +400,7 @@ importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,rename) +importFrom(dplyr,summarise) importFrom(dplyr,sym) importFrom(dplyr,tally) importFrom(mediation,mediate) @@ -432,6 +433,7 @@ importFrom(tibble,rownames_to_column) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) +importFrom(tidyr,unnest) importFrom(utils,assignInMyNamespace) importFrom(utils,combn) importFrom(utils,getFromNamespace) diff --git a/R/addDivergence.R b/R/addDivergence.R index d57c0e0e3..9cbfa1c61 100644 --- a/R/addDivergence.R +++ b/R/addDivergence.R @@ -204,7 +204,7 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"), reference <- rep(reference, ncol(mat)) } # Check that all reference samples are included in the data - if( !all(reference %in% colnames(mat) | is.na(reference)) ){ + if( !all(unlist(reference) %in% colnames(mat) | is.na(unlist(reference))) ){ stop("All reference samples must be included in the data.", call. = FALSE) } @@ -214,12 +214,21 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"), } # For each sample-pair, this function calculates dissimilarity. -#' @importFrom dplyr mutate +#' @importFrom dplyr group_by summarise +#' @importFrom tidyr unnest .calc_divergence <- function(mat, reference, method, ...){ # Create sample-pair data.frame - reference <- data.frame(sample = colnames(mat), reference = reference) + reference <- data.frame( + sample = colnames(mat), reference = I(unname(reference))) + # Check if there are multiple reference samples assigned for samples + if( any(lengths(reference[["reference"]]) > 1L) ){ + reference <- reference |> unnest(cols = reference) + warning("Some samples are associated with multiple reference samples. ", + "In these cases, the reference time point includes multiple ", + "samples, and their average is used.", call. = FALSE) + } # Exclude NA values - reference <- reference[!is.na(reference$reference), ] + reference <- reference[!is.na(reference[["reference"]]), ] # For dissimilarity calculation, the samples must be in rows mat <- t(mat) # Loop through sample-pairs @@ -235,5 +244,12 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"), # Add values to data.frame that holds sample pairs temp <- unlist(temp) reference[["value"]] <- temp + # If there were multiple reference samples, take average + if( anyDuplicated(reference[["sample"]]) ){ + reference <- reference |> + group_by(sample) |> + summarise(value = mean(value)) |> + as.data.frame() + } return(reference) } diff --git a/tests/testthat/test-0divergence.R b/tests/testthat/test-0divergence.R index 5a0ee0a2b..d49ba659a 100644 --- a/tests/testthat/test-0divergence.R +++ b/tests/testthat/test-0divergence.R @@ -1,19 +1,19 @@ context("divergence estimates") test_that("divergence estimates", { - + skip_if_not(requireNamespace("vegan", quietly = TRUE)) - + data(esophagus, package="mia") tse <- esophagus - + tse_divergence <- addDivergence(tse) - + # Checks that the type of output is the same as the type of input. expect_true(typeof(tse_divergence) == typeof(tse)) - + # Check that colData includes divergence results expect_true( "divergence" %in% colnames(colData(tse_divergence))) - + # Expect errors when input is wrong expect_error(addDivergence( tse, name = 123, reference = "median", FUN = vegan::vegdist, @@ -47,26 +47,26 @@ test_that("divergence estimates", { method = "euclidean")) expect_error(addDivergence( tse, reference = FALSE, FUN = stats::dist, method = "euclidean")) - + # Reference values from microbiome pkg's divergence function expect_equal(unname( round(colData(addDivergence( tse, reference = "mean", FUN = stats::dist, method = "euclidean"))$divergence, 6)), round(c(35.35534, 42.16634, 59.44746)),6) - + expect_equal(unname( round(colData(addDivergence( tse, reference = assay(tse, "counts")[,3], FUN = stats::dist, method = "manhattan"))$divergence, 6)), round(c(210, 280, 0)),6) - + expect_equal(unname( round(colData(addDivergence( tse, reference = assay(tse, "counts")[,1], FUN = vegan::vegdist, method = "chao"))$divergence, 6)), round(c(0.00000000, 0.10115766, 0.08239422)),6) - + # Check different input types for reference sample <- sample(colnames(tse), 1) tse[["ref_name"]] <- rep(sample, ncol(tse)) @@ -77,7 +77,7 @@ test_that("divergence estimates", { expect_equal(in_coldata, single_sample) expect_equal(single_sample, as_num_vector) expect_equal(as_num_vector, as_char_vector) - + # Check that divergence is calculated correctly if we have different reference # samples for each sample. # Assign reference randomly @@ -90,7 +90,7 @@ test_that("divergence estimates", { return(val) }) expect_equal(test_values, ref_values) - + # When the divergence is calculated based on dissimilarity matrix, the result # should differ from calculating it from counts tse <- runMDS( @@ -103,4 +103,33 @@ test_that("divergence estimates", { assays = SimpleList(counts = t(reducedDim(tse, "MDS")))) test <- getDivergence(se, method = "euclidean") expect_identical(test, ref) + + # Check that divergence is correctly calculated with multiple reference + # samples. + tse <- makeTSE(nrow = 1000, ncol = 20) + assayNames(tse) <- "counts" + reference <- sample(colnames(tse), 40, replace = TRUE) + names(reference) <- sample(colnames(tse), 40, replace = TRUE) + reference <- split(reference, names(reference)) + reference <- reference[ match(colnames(tse), names(reference)) ] + tse[["reference"]] <- reference + res <- getDivergence(tse, reference = "reference", method = "euclidean") |> + expect_warning() + # For all samples, calculate divergence + sams <- colnames(tse) + ref <- lapply(sams, function(sam){ + # Get data on sample + sam_dat <- colData(tse[, sam]) + # Get its reference samples + ref_sams <- sam_dat[["reference"]] |> unlist() + # Loop through each reference sample, calculate its distance to + # the sample, and take mean + ref_vals <- vapply(ref_sams, function(ref_sam){ + dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ), + method = "euclidean") + }, numeric(1)) + ref_vals <- mean(ref_vals) + return(ref_vals) + }) |> unlist() + expect_equal(res, ref) })