Handle duplicates in getDivergence (#680)

microbiome · Jan 23, 2025 · 11eba6c · 11eba6c
1 parent 255ac68
commit 11eba6c
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mia
 Type: Package
-Version: 1.15.17
+Version: 1.15.18
 Authors@R:
     c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
              email = "tuomas.v.borman@utu.fi",

diff --git a/NAMESPACE b/NAMESPACE
@@ -400,6 +400,7 @@ importFrom(dplyr,group_by)
 importFrom(dplyr,mutate)
 importFrom(dplyr,n)
 importFrom(dplyr,rename)
+importFrom(dplyr,summarise)
 importFrom(dplyr,sym)
 importFrom(dplyr,tally)
 importFrom(mediation,mediate)
@@ -432,6 +433,7 @@ importFrom(tibble,rownames_to_column)
 importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
+importFrom(tidyr,unnest)
 importFrom(utils,assignInMyNamespace)
 importFrom(utils,combn)
 importFrom(utils,getFromNamespace)

diff --git a/R/addDivergence.R b/R/addDivergence.R
@@ -204,7 +204,7 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
         reference <- rep(reference, ncol(mat))
     }
     # Check that all reference samples are included in the data
-    if( !all(reference %in% colnames(mat) | is.na(reference)) ){
+    if( !all(unlist(reference) %in% colnames(mat) | is.na(unlist(reference))) ){
         stop("All reference samples must be included in the data.",
             call. = FALSE)
     }
@@ -214,12 +214,21 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
 }
 
 # For each sample-pair, this function calculates dissimilarity.
-#' @importFrom dplyr mutate
+#' @importFrom dplyr group_by summarise
+#' @importFrom tidyr unnest
 .calc_divergence <- function(mat, reference, method, ...){
     # Create sample-pair data.frame
-    reference <- data.frame(sample = colnames(mat), reference = reference)
+    reference <- data.frame(
+        sample = colnames(mat), reference = I(unname(reference)))
+    # Check if there are multiple reference samples assigned for samples
+    if( any(lengths(reference[["reference"]]) > 1L) ){
+        reference <- reference |> unnest(cols = reference)
+        warning("Some samples are associated with multiple reference samples. ",
+                "In these cases, the reference time point includes multiple ",
+                "samples, and their average is used.", call. = FALSE)
+    }
     # Exclude NA values
-    reference <- reference[!is.na(reference$reference), ]
+    reference <- reference[!is.na(reference[["reference"]]), ]
     # For dissimilarity calculation, the samples must be in rows
     mat <- t(mat)
     # Loop through sample-pairs
@@ -235,5 +244,12 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
     # Add values to data.frame that holds sample pairs
     temp <- unlist(temp)
     reference[["value"]] <- temp
+    # If there were multiple reference samples, take average
+    if( anyDuplicated(reference[["sample"]]) ){
+        reference <- reference |>
+            group_by(sample) |>
+            summarise(value = mean(value)) |>
+            as.data.frame()
+    }
     return(reference)
 }
diff --git a/tests/testthat/test-0divergence.R b/tests/testthat/test-0divergence.R
@@ -1,19 +1,19 @@
 context("divergence estimates")
 test_that("divergence estimates", {
-  
+
   skip_if_not(requireNamespace("vegan", quietly = TRUE))
-  
+
   data(esophagus, package="mia")
   tse <- esophagus
-  
+
   tse_divergence <- addDivergence(tse)
-  
+
   # Checks that the type of output is the same as the type of input.
   expect_true(typeof(tse_divergence) == typeof(tse))
-  
+
   # Check that colData includes divergence results
   expect_true( "divergence" %in% colnames(colData(tse_divergence)))
-  
+
   # Expect errors when input is wrong
   expect_error(addDivergence(
       tse, name = 123, reference = "median", FUN = vegan::vegdist,
@@ -47,26 +47,26 @@ test_that("divergence estimates", {
       method = "euclidean"))
   expect_error(addDivergence(
       tse, reference = FALSE, FUN = stats::dist, method = "euclidean"))
-  
+
   # Reference values from microbiome pkg's divergence function
   expect_equal(unname(
       round(colData(addDivergence(
           tse, reference = "mean", FUN = stats::dist,
           method = "euclidean"))$divergence, 6)),
       round(c(35.35534, 42.16634, 59.44746)),6)
-  
+
   expect_equal(unname(
       round(colData(addDivergence(
           tse, reference = assay(tse, "counts")[,3], FUN = stats::dist,
           method = "manhattan"))$divergence, 6)),
       round(c(210, 280, 0)),6)
-  
+
   expect_equal(unname(
       round(colData(addDivergence(
           tse, reference = assay(tse, "counts")[,1], FUN = vegan::vegdist,
           method = "chao"))$divergence, 6)),
       round(c(0.00000000, 0.10115766, 0.08239422)),6)
-  
+
   # Check different input types for reference
   sample <- sample(colnames(tse), 1)
   tse[["ref_name"]] <- rep(sample, ncol(tse))
@@ -77,7 +77,7 @@ test_that("divergence estimates", {
   expect_equal(in_coldata, single_sample)
   expect_equal(single_sample, as_num_vector)
   expect_equal(as_num_vector, as_char_vector)
-  
+
   # Check that divergence is calculated correctly if we have different reference
   # samples for each sample.
   # Assign reference randomly
@@ -90,7 +90,7 @@ test_that("divergence estimates", {
       return(val)
   })
   expect_equal(test_values, ref_values)
-  
+
   # When the divergence is calculated based on dissimilarity matrix, the result
   # should differ from calculating it from counts
   tse <- runMDS(
@@ -103,4 +103,33 @@ test_that("divergence estimates", {
       assays = SimpleList(counts = t(reducedDim(tse, "MDS"))))
   test <- getDivergence(se, method = "euclidean")
   expect_identical(test, ref)
+
+  # Check that divergence is correctly calculated with multiple reference
+  # samples.
+  tse <- makeTSE(nrow = 1000, ncol = 20)
+  assayNames(tse) <- "counts"
+  reference <- sample(colnames(tse), 40, replace = TRUE)
+  names(reference) <- sample(colnames(tse), 40, replace = TRUE)
+  reference <- split(reference, names(reference))
+  reference <- reference[ match(colnames(tse), names(reference)) ]
+  tse[["reference"]] <- reference
+  res <- getDivergence(tse, reference = "reference", method = "euclidean") |>
+    expect_warning()
+  # For all samples, calculate divergence
+  sams <- colnames(tse)
+  ref <- lapply(sams, function(sam){
+    # Get data on sample
+    sam_dat <- colData(tse[, sam])
+    # Get its reference samples
+    ref_sams <- sam_dat[["reference"]] |> unlist()
+    # Loop through each reference sample, calculate its distance to
+    # the sample, and take mean
+    ref_vals <- vapply(ref_sams, function(ref_sam){
+      dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ),
+           method = "euclidean")
+    }, numeric(1))
+    ref_vals <- mean(ref_vals)
+    return(ref_vals)
+  }) |> unlist()
+  expect_equal(res, ref)
 })