Skip to content

Commit

Permalink
Handle duplicates in getDivergence (#680)
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman authored Jan 23, 2025
1 parent 255ac68 commit 11eba6c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mia
Type: Package
Version: 1.15.17
Version: 1.15.18
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "tuomas.v.borman@utu.fi",
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,rename)
importFrom(dplyr,summarise)
importFrom(dplyr,sym)
importFrom(dplyr,tally)
importFrom(mediation,mediate)
Expand Down Expand Up @@ -432,6 +433,7 @@ importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unnest)
importFrom(utils,assignInMyNamespace)
importFrom(utils,combn)
importFrom(utils,getFromNamespace)
Expand Down
24 changes: 20 additions & 4 deletions R/addDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
reference <- rep(reference, ncol(mat))
}
# Check that all reference samples are included in the data
if( !all(reference %in% colnames(mat) | is.na(reference)) ){
if( !all(unlist(reference) %in% colnames(mat) | is.na(unlist(reference))) ){
stop("All reference samples must be included in the data.",
call. = FALSE)
}
Expand All @@ -214,12 +214,21 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
}

# For each sample-pair, this function calculates dissimilarity.
#' @importFrom dplyr mutate
#' @importFrom dplyr group_by summarise
#' @importFrom tidyr unnest
.calc_divergence <- function(mat, reference, method, ...){
# Create sample-pair data.frame
reference <- data.frame(sample = colnames(mat), reference = reference)
reference <- data.frame(
sample = colnames(mat), reference = I(unname(reference)))
# Check if there are multiple reference samples assigned for samples
if( any(lengths(reference[["reference"]]) > 1L) ){
reference <- reference |> unnest(cols = reference)
warning("Some samples are associated with multiple reference samples. ",
"In these cases, the reference time point includes multiple ",
"samples, and their average is used.", call. = FALSE)
}
# Exclude NA values
reference <- reference[!is.na(reference$reference), ]
reference <- reference[!is.na(reference[["reference"]]), ]
# For dissimilarity calculation, the samples must be in rows
mat <- t(mat)
# Loop through sample-pairs
Expand All @@ -235,5 +244,12 @@ setMethod("getDivergence", signature = c(x="SummarizedExperiment"),
# Add values to data.frame that holds sample pairs
temp <- unlist(temp)
reference[["value"]] <- temp
# If there were multiple reference samples, take average
if( anyDuplicated(reference[["sample"]]) ){
reference <- reference |>
group_by(sample) |>
summarise(value = mean(value)) |>
as.data.frame()
}
return(reference)
}
53 changes: 41 additions & 12 deletions tests/testthat/test-0divergence.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
context("divergence estimates")
test_that("divergence estimates", {

skip_if_not(requireNamespace("vegan", quietly = TRUE))

data(esophagus, package="mia")
tse <- esophagus

tse_divergence <- addDivergence(tse)

# Checks that the type of output is the same as the type of input.
expect_true(typeof(tse_divergence) == typeof(tse))

# Check that colData includes divergence results
expect_true( "divergence" %in% colnames(colData(tse_divergence)))

# Expect errors when input is wrong
expect_error(addDivergence(
tse, name = 123, reference = "median", FUN = vegan::vegdist,
Expand Down Expand Up @@ -47,26 +47,26 @@ test_that("divergence estimates", {
method = "euclidean"))
expect_error(addDivergence(
tse, reference = FALSE, FUN = stats::dist, method = "euclidean"))

# Reference values from microbiome pkg's divergence function
expect_equal(unname(
round(colData(addDivergence(
tse, reference = "mean", FUN = stats::dist,
method = "euclidean"))$divergence, 6)),
round(c(35.35534, 42.16634, 59.44746)),6)

expect_equal(unname(
round(colData(addDivergence(
tse, reference = assay(tse, "counts")[,3], FUN = stats::dist,
method = "manhattan"))$divergence, 6)),
round(c(210, 280, 0)),6)

expect_equal(unname(
round(colData(addDivergence(
tse, reference = assay(tse, "counts")[,1], FUN = vegan::vegdist,
method = "chao"))$divergence, 6)),
round(c(0.00000000, 0.10115766, 0.08239422)),6)

# Check different input types for reference
sample <- sample(colnames(tse), 1)
tse[["ref_name"]] <- rep(sample, ncol(tse))
Expand All @@ -77,7 +77,7 @@ test_that("divergence estimates", {
expect_equal(in_coldata, single_sample)
expect_equal(single_sample, as_num_vector)
expect_equal(as_num_vector, as_char_vector)

# Check that divergence is calculated correctly if we have different reference
# samples for each sample.
# Assign reference randomly
Expand All @@ -90,7 +90,7 @@ test_that("divergence estimates", {
return(val)
})
expect_equal(test_values, ref_values)

# When the divergence is calculated based on dissimilarity matrix, the result
# should differ from calculating it from counts
tse <- runMDS(
Expand All @@ -103,4 +103,33 @@ test_that("divergence estimates", {
assays = SimpleList(counts = t(reducedDim(tse, "MDS"))))
test <- getDivergence(se, method = "euclidean")
expect_identical(test, ref)

# Check that divergence is correctly calculated with multiple reference
# samples.
tse <- makeTSE(nrow = 1000, ncol = 20)
assayNames(tse) <- "counts"
reference <- sample(colnames(tse), 40, replace = TRUE)
names(reference) <- sample(colnames(tse), 40, replace = TRUE)
reference <- split(reference, names(reference))
reference <- reference[ match(colnames(tse), names(reference)) ]
tse[["reference"]] <- reference
res <- getDivergence(tse, reference = "reference", method = "euclidean") |>
expect_warning()
# For all samples, calculate divergence
sams <- colnames(tse)
ref <- lapply(sams, function(sam){
# Get data on sample
sam_dat <- colData(tse[, sam])
# Get its reference samples
ref_sams <- sam_dat[["reference"]] |> unlist()
# Loop through each reference sample, calculate its distance to
# the sample, and take mean
ref_vals <- vapply(ref_sams, function(ref_sam){
dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ),
method = "euclidean")
}, numeric(1))
ref_vals <- mean(ref_vals)
return(ref_vals)
}) |> unlist()
expect_equal(res, ref)
})

0 comments on commit 11eba6c

Please sign in to comment.