Skip to content

Commit

Permalink
Merge pull request #72 from elsasserlab/aggregate-refactor
Browse files Browse the repository at this point in the history
bw_bed refactor
  • Loading branch information
cnluzon authored Aug 19, 2020
2 parents 828bbdc + d8c841b commit 477f446
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 93 deletions.
2 changes: 1 addition & 1 deletion elsasserlib/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: elsasserlib
Type: Package
Title: General utilities used within Elsasser lab
Version: 1.0.6
Version: 1.0.7
Authors@R: c(
person("Carmen", "Navarro",
email = "carmen.navarro@scilifelab.se",
Expand Down
208 changes: 118 additions & 90 deletions elsasserlib/R/bwtools.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,58 +39,45 @@ bw_bed <- function(bwfiles,
bed <- import(bedfile)
bed <- sortSeqlevels(bed)
bed <- sort(bed, ignore.strand = FALSE)

result <- NULL
if (is.null(bg_bwfiles) || !is.null(aggregate_by)) {
result <- multi_bw_ranges(bwfiles, labels,
granges = bed,
per_locus_stat = per_locus_stat
)
} else {
if (is.null(aggregate_by)) {
if (is.null(aggregate_by)) {
if (is.null(bg_bwfiles)) {
result <- multi_bw_ranges(bwfiles, labels,
granges = bed,
per_locus_stat = per_locus_stat
)
}
else {
# Only want to normalize per-locus if not aggregating
result <- multi_bw_ranges_norm(
bwfiles,
bg_bwfilelist = bg_bwfiles,
labels = labels,
granges = bed,
per_locus_stat = per_locus_stat,
norm_func = norm_func
)
bwfiles,
bg_bwfilelist = bg_bwfiles,
labels = labels,
granges = bed,
per_locus_stat = per_locus_stat,
norm_func = norm_func
)
}
}

if ("name" %in% names(mcols(bed))) {
result$name <- bed$name
}

if (!is.null(aggregate_by)) {
df <- aggregate_scores(
result,
group_col = "name",
aggregate_by = aggregate_by
)

result <- natural_sort_by_field(df, "name")
} else {
result <- multi_bw_ranges_aggregated(bwfiles,
labels = labels,
granges = bed,
per_locus_stat = per_locus_stat,
aggregate_by = aggregate_by
)

if (!is.null(bg_bwfiles)) {
bg <- multi_bw_ranges(bg_bwfiles, labels,
bg <- multi_bw_ranges_aggregated(bg_bwfiles,
labels = labels,
granges = bed,
per_locus_stat = per_locus_stat
per_locus_stat = per_locus_stat,
aggregate_by = aggregate_by
)

if ("name" %in% names(mcols(bed))) {
bg$name <- bed$name
}

bg_df <- aggregate_scores(bg,
group_col = "name",
aggregate_by = aggregate_by
)

values <- cbind(norm_func(df[, labels] / bg_df[, labels]), df$name)
colnames(values) <- c(labels, "name")
result <- natural_sort_by_field(data.frame(values), "name")
rows <- rownames(result)
result <- data.frame(norm_func(result[rows, labels] / bg[rows, labels]))
rownames(result) <- rows
colnames(result) <- labels
}
}

Expand Down Expand Up @@ -153,7 +140,6 @@ bw_bins <- function(bwfiles,
selection = selection
)
} else {
# FIXME: mcols of result may end up being <matrix> instead of <numeric>.
result <- multi_bw_ranges_norm(bwfiles, bg_bwfiles, labels, tiles,
per_locus_stat = per_locus_stat,
selection = selection,
Expand Down Expand Up @@ -324,10 +310,42 @@ multi_bw_ranges <- function(bwfiles,
# granges_cbind sorts each element so it's safer to merge and no need to
# sort after
result <- granges_cbind(summaries, labels)

# Include names if granges has them
if ("name" %in% names(mcols(granges))) {
result$name <- granges$name
}

result
}


#' Intersect a list of bigWig files with a GRanges object and aggregate by name
#'
#' @inheritParams bw_bed
#' @param granges GRanges object to summarize. Should have a valid name field.
#' @return An aggregated dataframe
multi_bw_ranges_aggregated <- function(bwfiles,
labels,
granges,
per_locus_stat,
aggregate_by) {

result <- multi_bw_ranges(bwfiles, labels,
granges = granges,
per_locus_stat = per_locus_stat
)

df <- aggregate_scores(
result,
group_col = "name",
aggregate_by = aggregate_by
)

natural_sort_by_field(df, "name")
}


#' Intersect a list of bigWig files with a GRanges object (with background)
#'
#' Intersect a list of bigWig files with a GRanges object and normalize values
Expand Down Expand Up @@ -364,12 +382,13 @@ multi_bw_ranges_norm <- function(bwfilelist,
)

result_df <- data.frame(result)
result_df[, labels] <-
norm_func(as.matrix(mcols(result)) / as.matrix(mcols(bg)))
bg_df <- data.frame(bg)
result_df[, labels] <- norm_func(result_df[, labels] / bg_df[, labels])

makeGRangesFromDataFrame(result_df, keep.extra.columns = TRUE)
}


#' Score a GRanges object against a BigWig file
#'
#' Build a scored GRanges object from a GRanges object and a single BigWig file.
Expand Down Expand Up @@ -419,54 +438,50 @@ bw_ranges <- function(bwfile,
#' @importFrom rtracklayer mcols
#' @return A data frame with aggregated scores.
aggregate_scores <- function(scored_granges, group_col, aggregate_by) {
if (!is.null(group_col) && group_col %in% names(mcols(scored_granges))) {

# GRanges objects are 1-based and inclusive [start, end]
end_value <- GenomicRanges::end(scored_granges)
start_value <- GenomicRanges::start(scored_granges)
scored_granges$length <- end_value - start_value + 1

df <- data.frame(mcols(scored_granges))
validate_categories(df[, group_col])
# Make sure special characters are taken into account
score_cols <- colnames(mcols(scored_granges))
score_cols <- make.names(score_cols)
score_cols <- score_cols[!score_cols %in% c(group_col)]

if (aggregate_by == "true_mean") {
sum_vals <- df[, score_cols] * df$length
colnames(sum_vals) <- score_cols
sum_vals[, group_col] <- df[, group_col]
sum_vals$length <- df$length

# Summarize SUM only
sum_vals <- sum_vals %>%
group_by_at(group_col) %>%
summarise(across(where(is.numeric), sum))

# Divide sum(scores) by sum(length) and keep only scores
df <- sum_vals[, score_cols] / sum_vals$length
df[, group_col] <- sum_vals[, group_col]

} else if (aggregate_by %in% c("mean", "median")) {
f <- get(aggregate_by)
df <- df %>%
group_by_at(group_col) %>%
summarise(across(where(is.numeric), f))

} else {
stop(paste("Function not implemented as aggregate_by:", aggregate_by))
}

score_cols <- score_cols[!score_cols %in% c("length")]
df <- df[, c(score_cols, group_col), drop = FALSE]
data.frame(df)
# print(scored_granges)
validate_group_col(scored_granges, group_col)

# GRanges objects are 1-based and inclusive [start, end]
end_value <- GenomicRanges::end(scored_granges)
start_value <- GenomicRanges::start(scored_granges)
scored_granges$length <- end_value - start_value + 1

df <- data.frame(mcols(scored_granges))
validate_categories(df[, group_col])
# Make sure special characters are taken into account
score_cols <- colnames(mcols(scored_granges))
score_cols <- make.names(score_cols)
score_cols <- score_cols[!score_cols %in% c(group_col)]

if (aggregate_by == "true_mean") {
sum_vals <- df[, score_cols] * df$length
colnames(sum_vals) <- score_cols
sum_vals[, group_col] <- df[, group_col]
sum_vals$length <- df$length

# Summarize SUM only
sum_vals <- sum_vals %>%
group_by_at(group_col) %>%
summarise(across(where(is.numeric), sum))

# Divide sum(scores) by sum(length) and keep only scores
df <- sum_vals[, score_cols] / sum_vals$length
df[, group_col] <- sum_vals[, group_col]

} else if (aggregate_by %in% c("mean", "median")) {
f <- get(aggregate_by)
df <- df %>%
group_by_at(group_col) %>%
summarise(across(where(is.numeric), f))

} else {
stop("Grouping column not provided or not present in GRanges object.")
stop(paste("Function not implemented as aggregate_by:", aggregate_by))
}
}

score_cols <- score_cols[!score_cols %in% c("length")]
df <- df[, c(score_cols, group_col), drop = FALSE]
data.frame(df)
}


#' Calculate profile values of a bigWig file over GRanges object.
Expand Down Expand Up @@ -564,6 +579,7 @@ calculate_bw_profile <- function(bw,
summarize_matrix(full, label)
}


#' Calculate matrix for stretch mode
#'
#' @param bw BigWigFile object
Expand Down Expand Up @@ -759,3 +775,15 @@ validate_filelist <- function(filelist) {
stop(msg)
}
}


#' Validate that group col exists in granges
#'
#' @param granges GRanges object to check
#' @param group_col Group column name. Usually, name.
validate_group_col <- function(granges, group_col) {
if (!group_col %in% names(mcols(granges))) {
stop(paste("Invalid group column not present in granges", group_col))
}
}

34 changes: 34 additions & 0 deletions elsasserlib/man/multi_bw_ranges_aggregated.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions elsasserlib/man/validate_group_col.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 16 additions & 2 deletions elsasserlib/tests/testthat/test_bwtools.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ toy_example <- function(bw1,
bw_special,
bed_with_names,
bg1,
bg2) {
bg2,
unnamed_bed) {
granges <- GRanges(
seqnames = Rle(c("chr1", "chr2"), c(10, 10)),
ranges = IRanges(c(seq(1, 181, by = 20), seq(1, 181, by = 20)),
Expand Down Expand Up @@ -43,6 +44,7 @@ toy_example <- function(bw1,
c(40, 100, 40, 130, 180))
)

export(labeled_gr, unnamed_bed)
labeled_gr$name <- c("typeA", "typeB", "typeA", "typeB", "typeB")

chromsizes <- c(200, 200)
Expand All @@ -67,14 +69,15 @@ bg1 <- tempfile("bigwig_bg1", fileext = ".bw")
bg2 <- tempfile("bigwig_bg2", fileext = ".bw")
bw_special <- tempfile("bigwig", fileext = ".bw")
bed_with_names <- tempfile("bed", fileext = ".bed")
unnamed_bed <- tempfile("bed2", fileext=".bed")

tiles <- tileGenome(c(chr1 = 200, chr2 = 200),
tilewidth = 20,
cut.last.tile.in.chrom = TRUE
)


setup(toy_example(bw1, bw2, bw_special, bed_with_names, bg1, bg2))
setup(toy_example(bw1, bw2, bw_special, bed_with_names, bg1, bg2, unnamed_bed))

teardown({
unlink(c(bw1, bw2, bg1, bg2, bw_special, bed_with_names))
Expand Down Expand Up @@ -571,3 +574,14 @@ test_that("bw_bed runs with background and aggregate_by parameter", {
expect_is(values, "data.frame")
})

test_that("bw_bed fails if aggregate_by in an unnamed bed file", {
expect_error({
values <- bw_bed(bw1, unnamed_bed, bg_bwfiles = bw2,
labels = "bw1",
per_locus_stat = "mean",
aggregate_by = "mean"
)},
"missing values in 'row.names' are not allowed"
)

})

0 comments on commit 477f446

Please sign in to comment.