diff --git a/DESCRIPTION b/DESCRIPTION index 877d8ef..3a86bde 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: tximeta -Version: 1.25.0 +Version: 1.25.1 Title: Transcript Quantification Import with Automatic Metadata Description: Transcript quantification import from Salmon and other quantifiers with automatic attachment of transcript ranges diff --git a/NEWS.md b/NEWS.md index b19045c..b461b4a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# tximeta 1.25.1 + +* Added `skipRanges` to `summarizeToGene` which allows + summarization of assay data when `skipMeta` was used + and therefore ranges should not be used / output in + summarization. + # tximeta 1.23.5 * GENCODE 47 (H.s.), M36 (M.m), and Ensembl 113 diff --git a/R/summarizeToGene.R b/R/summarizeToGene.R index aa79c85..f7f6082 100644 --- a/R/summarizeToGene.R +++ b/R/summarizeToGene.R @@ -1,6 +1,8 @@ summarizeToGene.SummarizedExperiment <- function(object, assignRanges=c("range","abundant"), - varReduce=FALSE, ...) { + varReduce=FALSE, + skipRanges=FALSE, + ...) { # arguments: # assignRanges - @@ -8,34 +10,45 @@ summarizeToGene.SummarizedExperiment <- function(object, # with the GRanges set to the *range* of the isoforms, # or the ranges of the most *abundant* isoform # varReduce - see ?tximport + # skipRanges - whether to not make use of, or output, rowRanges assignRanges <- match.arg(assignRanges) - - missingMetadata(object, summarize=TRUE) - - txomeInfo <- metadata(object)$txomeInfo - txdb <- getTxDb(txomeInfo) - message("obtaining transcript-to-gene mapping from database") - suppressMessages({ - tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME") - }) - - suppressWarnings({ - g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene") - }) - - # need to add seqinfo for GENCODE and RefSeq - if (all(is.na(seqlengths(g)))) { - seqinfo(g) <- seqinfo(object) + # some preparation of txdb etc for working on ranges + if (!skipRanges) { + missingMetadata(object, summarize=TRUE) + txomeInfo <- metadata(object)$txomeInfo + txdb <- getTxDb(txomeInfo) + message("obtaining transcript-to-gene mapping from database") + suppressMessages({ + tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME") + }) + suppressWarnings({ + g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene") + }) + # need to add seqinfo for GENCODE and RefSeq + if (all(is.na(seqlengths(g)))) { + seqinfo(g) <- seqinfo(object) + } + # note here about how ranges will be assigned + if (assignRanges == "abundant") { + message("assignRanges='abundant': gene ranges assigned by isoform abundance + see details at: ?summarizeToGene,SummarizedExperiment-method") + } else { + message("assignRanges='range': gene ranges assigned by total range of isoforms + see details at: ?summarizeToGene,SummarizedExperiment-method") + } } - + + # make a txi list like tximport would output txi <- list( abundance=assays(object)[["abundance"]], counts=assays(object)[["counts"]], length=assays(object)[["length"]], countsFromAbundance="no" ) + + # deal with inferential replicates, if present if ("infRep1" %in% assayNames(object)) { infReps <- list(assays(object)[grep("infRep", assayNames(object))]) if (varReduce) { @@ -48,17 +61,12 @@ summarizeToGene.SummarizedExperiment <- function(object, if (varReduce) stop("cannot calculate inferential variance without inferential replicates") } - # note here about how ranges are assigned - if (assignRanges == "abundant") { - message("assignRanges='abundant': gene ranges assigned by isoform abundance - see details at: ?summarizeToGene,SummarizedExperiment-method") + if (skipRanges) { + txi.gene <- summarizeToGene(object=txi, varReduce=varReduce, ...) } else { - message("assignRanges='range': gene ranges assigned by total range of isoforms - see details at: ?summarizeToGene,SummarizedExperiment-method") + txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...) } - txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...) - # put 'counts' in front to facilitate DESeqDataSet construction assays <- txi.gene[c("counts","abundance","length")] if (varReduce) { @@ -68,73 +76,83 @@ summarizeToGene.SummarizedExperiment <- function(object, assays <- c(assays, infReps) } - # here do the same check/subset but with the gene-level - # tximport assay matrices and gene ranges - assays <- checkAssays2Txps(assays, g) - - # TODO give a warning here if there are genes in TxDb not in Salmon index? - g <- g[rownames(assays[["counts"]])] - - # store txp IDS - tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID)) - if (all(names(g) %in% names(tx_ids))) { - tx_ids <- tx_ids[names(g)] - mcols(g)$tx_ids <- tx_ids - } - - # calculate duplication - if ("hasDuplicate" %in% colnames(mcols(object))) { - stopifnot(all(rownames(object) %in% tx2gene[,1])) - t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),] - has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID)) - mcols(g)$numDupObjects <- sum(has.dup.list) - } - - # assign ranges based on most abundant isoform - # (for the expressed genes) - if (assignRanges == "abundant") { - stopifnot(!is.null(rowRanges(object))) - iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]], - t2g=tx2gene) - # order by the rowRanges of the outgoing object - iso_prop <- iso_prop[names(g)] - # remove unexpressed genes - expressed <- !sapply(iso_prop, \(x) all(is.nan(x))) - # compute the most abundant isoform of expressed genes - mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)]) - # assign all this data to mcols(g) - mcols(g)$isoform <- NA - assignIdx <- names(g) %in% names(mostAbundant) - assignNms <- names(g)[assignIdx] - mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms] - mcols(g)$max_prop <- sapply(iso_prop, max) - mcols(g)$iso_prop <- NumericList(iso_prop) - # bring over new start and end positions for the subset - # of genes that have a most abundant isoform - txp <- rowRanges(object) - # if addExons has been called, here we flatten - # we could also return a GRangesList but this would be complex - if (is(txp, "GRangesList")) { - txp <- unlist(range(txp)) + if (!skipRanges) { + # here do the same check/subset but with the gene-level + # tximport assay matrices and gene ranges + assays <- checkAssays2Txps(assays, g) + + # TODO give a warning here if there are genes in TxDb not in Salmon index? + g <- g[rownames(assays[["counts"]])] + + # store txp IDS + tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID)) + if (all(names(g) %in% names(tx_ids))) { + tx_ids <- tx_ids[names(g)] + mcols(g)$tx_ids <- tx_ids + } + + # calculate duplication + if ("hasDuplicate" %in% colnames(mcols(object))) { + stopifnot(all(rownames(object) %in% tx2gene[,1])) + t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),] + has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID)) + mcols(g)$numDupObjects <- sum(has.dup.list) + } + + # assign ranges based on most abundant isoform + # (for the expressed genes) + if (assignRanges == "abundant") { + stopifnot(!is.null(rowRanges(object))) + iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]], + t2g=tx2gene) + # order by the rowRanges of the outgoing object + iso_prop <- iso_prop[names(g)] + # remove unexpressed genes + expressed <- !sapply(iso_prop, \(x) all(is.nan(x))) + # compute the most abundant isoform of expressed genes + mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)]) + # assign all this data to mcols(g) + mcols(g)$isoform <- NA + assignIdx <- names(g) %in% names(mostAbundant) + assignNms <- names(g)[assignIdx] + mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms] + mcols(g)$max_prop <- sapply(iso_prop, max) + mcols(g)$iso_prop <- NumericList(iso_prop) + # bring over new start and end positions for the subset + # of genes that have a most abundant isoform + txp <- rowRanges(object) + # if addExons has been called, here we flatten + # we could also return a GRangesList but this would be complex + if (is(txp, "GRangesList")) { + txp <- unlist(range(txp)) + } + txp <- txp[mcols(g)$isoform[assignIdx]] + # assume seqnames and strand are the same + stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp))) + stopifnot(all(strand(g)[assignIdx] == strand(txp))) + start(g)[assignIdx] <- start(txp) + end(g)[assignIdx] <- end(txp) } - txp <- txp[mcols(g)$isoform[assignIdx]] - # assume seqnames and strand are the same - stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp))) - stopifnot(all(strand(g)[assignIdx] == strand(txp))) - start(g)[assignIdx] <- start(txp) - end(g)[assignIdx] <- end(txp) } - + metadata <- metadata(object) # stash countsFromAbundance value metadata$countsFromAbundance <- txi.gene$countsFromAbundance metadata$level <- "gene" metadata$assignRanges <- assignRanges # how were ranges assigned + + if (skipRanges) { + return(SummarizedExperiment(assays=assays, colData=colData(object), + metadata=metadata)) + } + + gse <- SummarizedExperiment( + assays=assays, + rowRanges=g, + colData=colData(object), + metadata=metadata + ) - gse <- SummarizedExperiment(assays=assays, - rowRanges=g, - colData=colData(object), - metadata=metadata) gse } @@ -165,6 +183,8 @@ summarizeToGene.SummarizedExperiment <- function(object, #' isoform proportions) are also returned in \code{mcols} #' @param varReduce whether to reduce per-sample inferential replicates #' information into a matrix of sample variances \code{variance} (default FALSE) +#' @param skipRanges whether to skip making use of, or outputting, rowRanges +#' (default FALSE) #' @param ... arguments passed to \code{tximport} #' #' @return a SummarizedExperiment with summarized quantifications diff --git a/R/tximeta.R b/R/tximeta.R index ec9f0da..0f6727c 100644 --- a/R/tximeta.R +++ b/R/tximeta.R @@ -493,7 +493,7 @@ missingMetadata <- function(se, summarize=FALSE) { If (2), use a linkedTxome to provide the missing metadata and rerun tximeta" if (summarize) { msg <- paste0(msg, " - or use tx2gene, txOut=FALSE (and skipMeta=TRUE if Salmon/piscem)") + or provide a `tx2gene` data.frame and set `skipRanges=TRUE`") } if (is.null(metadata(se)$txomeInfo)) stop(msg) } diff --git a/man/summarizeToGene.Rd b/man/summarizeToGene.Rd index 0459e0c..359178f 100644 --- a/man/summarizeToGene.Rd +++ b/man/summarizeToGene.Rd @@ -9,6 +9,7 @@ object, assignRanges = c("range", "abundant"), varReduce = FALSE, + skipRanges = FALSE, ... ) } @@ -31,6 +32,9 @@ isoform proportions) are also returned in \code{mcols}} \item{varReduce}{whether to reduce per-sample inferential replicates information into a matrix of sample variances \code{variance} (default FALSE)} +\item{skipRanges}{whether to skip making use of, or outputting, rowRanges +(default FALSE)} + \item{...}{arguments passed to \code{tximport}} } \value{