add skipRanges for summarization

thelovelab · Jan 16, 2025 · 5a06002 · 5a06002
1 parent eab5dbe
commit 5a06002
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 88 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: tximeta
-Version: 1.25.0
+Version: 1.25.1
 Title: Transcript Quantification Import with Automatic Metadata
 Description: Transcript quantification import from Salmon and
     other quantifiers with automatic attachment of transcript ranges

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# tximeta 1.25.1
+
+* Added `skipRanges` to `summarizeToGene` which allows
+  summarization of assay data when `skipMeta` was used
+  and therefore ranges should not be used / output in
+  summarization.
+
 # tximeta 1.23.5
 
 * GENCODE 47 (H.s.), M36 (M.m), and Ensembl 113

diff --git a/R/summarizeToGene.R b/R/summarizeToGene.R
@@ -1,41 +1,54 @@
 summarizeToGene.SummarizedExperiment <- function(object,
                                                  assignRanges=c("range","abundant"),
-                                                 varReduce=FALSE, ...) {
+                                                 varReduce=FALSE,
+                                                 skipRanges=FALSE,
+                                                 ...) {
 
   # arguments:
   # assignRanges -
   #   "range" or "abundant", whether to return an SE
   #   with the GRanges set to the *range* of the isoforms,
   #   or the ranges of the most *abundant* isoform
   # varReduce - see ?tximport
+  # skipRanges - whether to not make use of, or output, rowRanges
 
   assignRanges <- match.arg(assignRanges)
-
-  missingMetadata(object, summarize=TRUE)
-
-  txomeInfo <- metadata(object)$txomeInfo
-  txdb <- getTxDb(txomeInfo)
-  message("obtaining transcript-to-gene mapping from database")
 
-  suppressMessages({
-    tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME")
-  })
-
-  suppressWarnings({
-    g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene")
-  })
-
-  # need to add seqinfo for GENCODE and RefSeq
-  if (all(is.na(seqlengths(g)))) {
-    seqinfo(g) <- seqinfo(object)
+  # some preparation of txdb etc for working on ranges
+  if (!skipRanges) {
+    missingMetadata(object, summarize=TRUE)
+    txomeInfo <- metadata(object)$txomeInfo
+    txdb <- getTxDb(txomeInfo)
+    message("obtaining transcript-to-gene mapping from database")
+    suppressMessages({
+      tx2gene <- select(txdb, keys(txdb, "TXNAME"), "GENEID", "TXNAME")
+    })
+    suppressWarnings({
+      g <- getRanges(txdb=txdb, txomeInfo=txomeInfo, type="gene")
+    })
+    # need to add seqinfo for GENCODE and RefSeq
+    if (all(is.na(seqlengths(g)))) {
+      seqinfo(g) <- seqinfo(object)
+    }
+    # note here about how ranges will be assigned
+    if (assignRanges == "abundant") {
+      message("assignRanges='abundant': gene ranges assigned by isoform abundance
+  see details at: ?summarizeToGene,SummarizedExperiment-method")
+    } else {
+      message("assignRanges='range': gene ranges assigned by total range of isoforms
+  see details at: ?summarizeToGene,SummarizedExperiment-method")
+    }
   }
-
+
+  # make a txi list like tximport would output
   txi <- list(
     abundance=assays(object)[["abundance"]],
     counts=assays(object)[["counts"]],
     length=assays(object)[["length"]],
     countsFromAbundance="no"
   )
+
+  # deal with inferential replicates, if present
   if ("infRep1" %in% assayNames(object)) {
     infReps <- list(assays(object)[grep("infRep", assayNames(object))])
     if (varReduce) {
@@ -48,17 +61,12 @@ summarizeToGene.SummarizedExperiment <- function(object,
     if (varReduce) stop("cannot calculate inferential variance without inferential replicates")
   }
 
-  # note here about how ranges are assigned
-  if (assignRanges == "abundant") {
-    message("assignRanges='abundant': gene ranges assigned by isoform abundance
-  see details at: ?summarizeToGene,SummarizedExperiment-method")
+  if (skipRanges) {
+    txi.gene <- summarizeToGene(object=txi, varReduce=varReduce, ...)
   } else {
-    message("assignRanges='range': gene ranges assigned by total range of isoforms
-  see details at: ?summarizeToGene,SummarizedExperiment-method")
+    txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...)
   }
 
-  txi.gene <- summarizeToGene(object=txi, tx2gene=tx2gene, varReduce=varReduce, ...)
-
   # put 'counts' in front to facilitate DESeqDataSet construction
   assays <- txi.gene[c("counts","abundance","length")]
   if (varReduce) {
@@ -68,73 +76,83 @@ summarizeToGene.SummarizedExperiment <- function(object,
     assays <- c(assays, infReps)
   }
 
-  # here do the same check/subset but with the gene-level
-  # tximport assay matrices and gene ranges
-  assays <- checkAssays2Txps(assays, g)
-
-  # TODO give a warning here if there are genes in TxDb not in Salmon index?
-  g <- g[rownames(assays[["counts"]])]
-
-  # store txp IDS
-  tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID))
-  if (all(names(g) %in% names(tx_ids))) {
-    tx_ids <- tx_ids[names(g)]
-    mcols(g)$tx_ids <- tx_ids
-  }
-
-  # calculate duplication
-  if ("hasDuplicate" %in% colnames(mcols(object))) {
-    stopifnot(all(rownames(object) %in% tx2gene[,1]))
-    t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),]
-    has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID))
-    mcols(g)$numDupObjects <- sum(has.dup.list)
-  }
-
-  # assign ranges based on most abundant isoform
-  # (for the expressed genes)
-  if (assignRanges == "abundant") {
-    stopifnot(!is.null(rowRanges(object)))
-    iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]],
-                            t2g=tx2gene)
-    # order by the rowRanges of the outgoing object
-    iso_prop <- iso_prop[names(g)]
-    # remove unexpressed genes
-    expressed <- !sapply(iso_prop, \(x) all(is.nan(x)))
-    # compute the most abundant isoform of expressed genes
-    mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)])
-    # assign all this data to mcols(g)
-    mcols(g)$isoform <- NA
-    assignIdx <- names(g) %in% names(mostAbundant)
-    assignNms <- names(g)[assignIdx]
-    mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms]
-    mcols(g)$max_prop <- sapply(iso_prop, max)
-    mcols(g)$iso_prop <- NumericList(iso_prop)
-    # bring over new start and end positions for the subset
-    # of genes that have a most abundant isoform
-    txp <- rowRanges(object)
-    # if addExons has been called, here we flatten
-    # we could also return a GRangesList but this would be complex
-    if (is(txp, "GRangesList")) {
-      txp <- unlist(range(txp))
+  if (!skipRanges) {
+    # here do the same check/subset but with the gene-level
+    # tximport assay matrices and gene ranges
+    assays <- checkAssays2Txps(assays, g)
+
+    # TODO give a warning here if there are genes in TxDb not in Salmon index?
+    g <- g[rownames(assays[["counts"]])]
+
+    # store txp IDS
+    tx_ids <- CharacterList(split(tx2gene$TXNAME, tx2gene$GENEID))
+    if (all(names(g) %in% names(tx_ids))) {
+      tx_ids <- tx_ids[names(g)]
+      mcols(g)$tx_ids <- tx_ids
+    }
+
+    # calculate duplication
+    if ("hasDuplicate" %in% colnames(mcols(object))) {
+      stopifnot(all(rownames(object) %in% tx2gene[,1]))
+      t2g.o <- tx2gene[match(rownames(object),tx2gene[,1]),]
+      has.dup.list <- LogicalList(split(mcols(object)$hasDuplicate, t2g.o$GENEID))
+      mcols(g)$numDupObjects <- sum(has.dup.list)
+    }
+
+    # assign ranges based on most abundant isoform
+    # (for the expressed genes)
+    if (assignRanges == "abundant") {
+      stopifnot(!is.null(rowRanges(object)))
+      iso_prop <- getIsoProps(tpm=assays(object)[["abundance"]],
+                              t2g=tx2gene)
+      # order by the rowRanges of the outgoing object
+      iso_prop <- iso_prop[names(g)]
+      # remove unexpressed genes
+      expressed <- !sapply(iso_prop, \(x) all(is.nan(x)))
+      # compute the most abundant isoform of expressed genes
+      mostAbundant <- sapply(iso_prop[expressed], \(x) names(x)[which.max(x)])
+      # assign all this data to mcols(g)
+      mcols(g)$isoform <- NA
+      assignIdx <- names(g) %in% names(mostAbundant)
+      assignNms <- names(g)[assignIdx]
+      mcols(g)$isoform[assignIdx] <- mostAbundant[assignNms]
+      mcols(g)$max_prop <- sapply(iso_prop, max)
+      mcols(g)$iso_prop <- NumericList(iso_prop)
+      # bring over new start and end positions for the subset
+      # of genes that have a most abundant isoform
+      txp <- rowRanges(object)
+      # if addExons has been called, here we flatten
+      # we could also return a GRangesList but this would be complex
+      if (is(txp, "GRangesList")) {
+        txp <- unlist(range(txp))
+      }
+      txp <- txp[mcols(g)$isoform[assignIdx]]
+      # assume seqnames and strand are the same
+      stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp)))
+      stopifnot(all(strand(g)[assignIdx] == strand(txp)))
+      start(g)[assignIdx] <- start(txp)
+      end(g)[assignIdx] <- end(txp)
     }
-    txp <- txp[mcols(g)$isoform[assignIdx]]
-    # assume seqnames and strand are the same
-    stopifnot(all(seqnames(g)[assignIdx] == seqnames(txp)))
-    stopifnot(all(strand(g)[assignIdx] == strand(txp)))
-    start(g)[assignIdx] <- start(txp)
-    end(g)[assignIdx] <- end(txp)
   }
-
+    
   metadata <- metadata(object)
   # stash countsFromAbundance value
   metadata$countsFromAbundance <- txi.gene$countsFromAbundance
   metadata$level <- "gene"
   metadata$assignRanges <- assignRanges # how were ranges assigned
+
+  if (skipRanges) {
+    return(SummarizedExperiment(assays=assays, colData=colData(object),
+                                metadata=metadata))
+  }
+
+  gse <- SummarizedExperiment(
+    assays=assays,
+    rowRanges=g,
+    colData=colData(object),
+    metadata=metadata
+  )
 
-  gse <- SummarizedExperiment(assays=assays,
-                              rowRanges=g,
-                              colData=colData(object),
-                              metadata=metadata)  
   gse
 }
 
@@ -165,6 +183,8 @@ summarizeToGene.SummarizedExperiment <- function(object,
 #' isoform proportions) are also returned in \code{mcols}
 #' @param varReduce whether to reduce per-sample inferential replicates
 #' information into a matrix of sample variances \code{variance} (default FALSE)
+#' @param skipRanges whether to skip making use of, or outputting, rowRanges
+#' (default FALSE)
 #' @param ... arguments passed to \code{tximport}
 #'
 #' @return a SummarizedExperiment with summarized quantifications

diff --git a/R/tximeta.R b/R/tximeta.R
@@ -493,7 +493,7 @@ missingMetadata <- function(se, summarize=FALSE) {
   If (2), use a linkedTxome to provide the missing metadata and rerun tximeta"
   if (summarize) {
     msg <- paste0(msg, "
-  or use tx2gene, txOut=FALSE (and skipMeta=TRUE if Salmon/piscem)")
+  or provide a `tx2gene` data.frame and set `skipRanges=TRUE`")
   }
   if (is.null(metadata(se)$txomeInfo)) stop(msg)
 }

diff --git a/man/summarizeToGene.Rd b/man/summarizeToGene.Rd