Skip to content

Commit

Permalink
add skipRanges for summarization
Browse files Browse the repository at this point in the history
  • Loading branch information
mikelove committed Jan 16, 2025
1 parent eab5dbe commit 5a06002
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 88 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: tximeta
Version: 1.25.0
Version: 1.25.1
Title: Transcript Quantification Import with Automatic Metadata
Description: Transcript quantification import from Salmon and
other quantifiers with automatic attachment of transcript ranges
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# tximeta 1.25.1

* Added `skipRanges` to `summarizeToGene` which allows
summarization of assay data when `skipMeta` was used
and therefore ranges should not be used / output in
summarization.

# tximeta 1.23.5

* GENCODE 47 (H.s.), M36 (M.m), and Ensembl 113
Expand Down
192 changes: 106 additions & 86 deletions R/summarizeToGene.R
Original file line number Diff line number Diff line change
@@ -1,41 +1,54 @@
summarizeToGene.SummarizedExperiment <- function(object,
assignRanges=c("range","abundant"),
varReduce=FALSE, ...) {
varReduce=FALSE,
skipRanges=FALSE,
...) {

# arguments:
# assignRanges -
# "range" or "abundant", whether to return an SE
# with the GRanges set to the *range* of the isoforms,
# or the ranges of the most *abundant* isoform
# varReduce - see ?tximport
# skipRanges - whether to not make use of, or output, rowRanges

assignRanges <- match.arg(assignRanges)

missingMetadata(object, summarize=TRUE)

txomeInfo <- metadata(object)$txomeInfo
txdb <- getTxDb(txomeInfo)
message("obtaining transcript-to-gene mapping from database")

suppressMessages({
tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME")
})

suppressWarnings({
g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene")
})

# need to add seqinfo for GENCODE and RefSeq
if (all(is.na(seqlengths(g)))) {
seqinfo(g) <- seqinfo(object)
# some preparation of txdb etc for working on ranges
if (!skipRanges) {
missingMetadata(object, summarize=TRUE)
txomeInfo <- metadata(object)$txomeInfo
txdb <- getTxDb(txomeInfo)
message("obtaining transcript-to-gene mapping from database")
suppressMessages({
tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME")
})
suppressWarnings({
g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene")
})
# need to add seqinfo for GENCODE and RefSeq
if (all(is.na(seqlengths(g)))) {
seqinfo(g) <- seqinfo(object)
}
# note here about how ranges will be assigned
if (assignRanges == "abundant") {
message("assignRanges='abundant': gene ranges assigned by isoform abundance
see details at: ?summarizeToGene,SummarizedExperiment-method")
} else {
message("assignRanges='range': gene ranges assigned by total range of isoforms
see details at: ?summarizeToGene,SummarizedExperiment-method")
}
}


# make a txi list like tximport would output
txi <- list(
abundance=assays(object)[["abundance"]],
counts=assays(object)[["counts"]],
length=assays(object)[["length"]],
countsFromAbundance="no"
)

# deal with inferential replicates, if present
if ("infRep1" %in% assayNames(object)) {
infReps <- list(assays(object)[grep("infRep", assayNames(object))])
if (varReduce) {
Expand All @@ -48,17 +61,12 @@ summarizeToGene.SummarizedExperiment <- function(object,
if (varReduce) stop("cannot calculate inferential variance without inferential replicates")
}

# note here about how ranges are assigned
if (assignRanges == "abundant") {
message("assignRanges='abundant': gene ranges assigned by isoform abundance
see details at: ?summarizeToGene,SummarizedExperiment-method")
if (skipRanges) {
txi.gene <- summarizeToGene(object=txi, varReduce=varReduce, ...)
} else {
message("assignRanges='range': gene ranges assigned by total range of isoforms
see details at: ?summarizeToGene,SummarizedExperiment-method")
txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...)
}

txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...)

# put 'counts' in front to facilitate DESeqDataSet construction
assays <- txi.gene[c("counts","abundance","length")]
if (varReduce) {
Expand All @@ -68,73 +76,83 @@ summarizeToGene.SummarizedExperiment <- function(object,
assays <- c(assays, infReps)
}

# here do the same check/subset but with the gene-level
# tximport assay matrices and gene ranges
assays <- checkAssays2Txps(assays, g)

# TODO give a warning here if there are genes in TxDb not in Salmon index?
g <- g[rownames(assays[["counts"]])]

# store txp IDS
tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID))
if (all(names(g) %in% names(tx_ids))) {
tx_ids <- tx_ids[names(g)]
mcols(g)$tx_ids <- tx_ids
}

# calculate duplication
if ("hasDuplicate" %in% colnames(mcols(object))) {
stopifnot(all(rownames(object) %in% tx2gene[,1]))
t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),]
has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID))
mcols(g)$numDupObjects <- sum(has.dup.list)
}

# assign ranges based on most abundant isoform
# (for the expressed genes)
if (assignRanges == "abundant") {
stopifnot(!is.null(rowRanges(object)))
iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]],
t2g=tx2gene)
# order by the rowRanges of the outgoing object
iso_prop <- iso_prop[names(g)]
# remove unexpressed genes
expressed <- !sapply(iso_prop, \(x) all(is.nan(x)))
# compute the most abundant isoform of expressed genes
mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)])
# assign all this data to mcols(g)
mcols(g)$isoform <- NA
assignIdx <- names(g) %in% names(mostAbundant)
assignNms <- names(g)[assignIdx]
mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms]
mcols(g)$max_prop <- sapply(iso_prop, max)
mcols(g)$iso_prop <- NumericList(iso_prop)
# bring over new start and end positions for the subset
# of genes that have a most abundant isoform
txp <- rowRanges(object)
# if addExons has been called, here we flatten
# we could also return a GRangesList but this would be complex
if (is(txp, "GRangesList")) {
txp <- unlist(range(txp))
if (!skipRanges) {
# here do the same check/subset but with the gene-level
# tximport assay matrices and gene ranges
assays <- checkAssays2Txps(assays, g)

# TODO give a warning here if there are genes in TxDb not in Salmon index?
g <- g[rownames(assays[["counts"]])]

# store txp IDS
tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID))
if (all(names(g) %in% names(tx_ids))) {
tx_ids <- tx_ids[names(g)]
mcols(g)$tx_ids <- tx_ids
}

# calculate duplication
if ("hasDuplicate" %in% colnames(mcols(object))) {
stopifnot(all(rownames(object) %in% tx2gene[,1]))
t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),]
has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID))
mcols(g)$numDupObjects <- sum(has.dup.list)
}

# assign ranges based on most abundant isoform
# (for the expressed genes)
if (assignRanges == "abundant") {
stopifnot(!is.null(rowRanges(object)))
iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]],
t2g=tx2gene)
# order by the rowRanges of the outgoing object
iso_prop <- iso_prop[names(g)]
# remove unexpressed genes
expressed <- !sapply(iso_prop, \(x) all(is.nan(x)))
# compute the most abundant isoform of expressed genes
mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)])
# assign all this data to mcols(g)
mcols(g)$isoform <- NA
assignIdx <- names(g) %in% names(mostAbundant)
assignNms <- names(g)[assignIdx]
mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms]
mcols(g)$max_prop <- sapply(iso_prop, max)
mcols(g)$iso_prop <- NumericList(iso_prop)
# bring over new start and end positions for the subset
# of genes that have a most abundant isoform
txp <- rowRanges(object)
# if addExons has been called, here we flatten
# we could also return a GRangesList but this would be complex
if (is(txp, "GRangesList")) {
txp <- unlist(range(txp))
}
txp <- txp[mcols(g)$isoform[assignIdx]]
# assume seqnames and strand are the same
stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp)))
stopifnot(all(strand(g)[assignIdx] == strand(txp)))
start(g)[assignIdx] <- start(txp)
end(g)[assignIdx] <- end(txp)
}
txp <- txp[mcols(g)$isoform[assignIdx]]
# assume seqnames and strand are the same
stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp)))
stopifnot(all(strand(g)[assignIdx] == strand(txp)))
start(g)[assignIdx] <- start(txp)
end(g)[assignIdx] <- end(txp)
}

metadata <- metadata(object)
# stash countsFromAbundance value
metadata$countsFromAbundance <- txi.gene$countsFromAbundance
metadata$level <- "gene"
metadata$assignRanges <- assignRanges # how were ranges assigned

if (skipRanges) {
return(SummarizedExperiment(assays=assays, colData=colData(object),
metadata=metadata))
}

gse <- SummarizedExperiment(
assays=assays,
rowRanges=g,
colData=colData(object),
metadata=metadata
)

gse <- SummarizedExperiment(assays=assays,
rowRanges=g,
colData=colData(object),
metadata=metadata)
gse
}

Expand Down Expand Up @@ -165,6 +183,8 @@ summarizeToGene.SummarizedExperiment <- function(object,
#' isoform proportions) are also returned in \code{mcols}
#' @param varReduce whether to reduce per-sample inferential replicates
#' information into a matrix of sample variances \code{variance} (default FALSE)
#' @param skipRanges whether to skip making use of, or outputting, rowRanges
#' (default FALSE)
#' @param ... arguments passed to \code{tximport}
#'
#' @return a SummarizedExperiment with summarized quantifications
Expand Down
2 changes: 1 addition & 1 deletion R/tximeta.R
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ missingMetadata <- function(se, summarize=FALSE) {
If (2), use a linkedTxome to provide the missing metadata and rerun tximeta"
if (summarize) {
msg <- paste0(msg, "
or use tx2gene, txOut=FALSE (and skipMeta=TRUE if Salmon/piscem)")
or provide a `tx2gene` data.frame and set `skipRanges=TRUE`")
}
if (is.null(metadata(se)$txomeInfo)) stop(msg)
}
Expand Down
4 changes: 4 additions & 0 deletions man/summarizeToGene.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5a06002

Please sign in to comment.