Skip to content

Commit

Permalink
Merge pull request #168 from nf-core/per-tool-quantification
Browse files Browse the repository at this point in the history
Add output containing per-tool quantification
  • Loading branch information
nictru authored Aug 9, 2024
2 parents 3e1ce6f + 7ac747e commit c9a5eb7
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 23 deletions.
19 changes: 19 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,25 @@ process {
]
}

// Per-sample, per-tool count extraction: reduce each BSJ BED file to a
// two-column TSV via gawk, headed by the sample id so the per-tool join
// downstream can label each sample's count column.
withName: 'EXTRACT_COUNTS' {
    // Add meta.id as header
    // Keep columns 4,5
    // NOTE(review): columns 4 and 5 are presumably the BED "name" (BSJ id)
    // and "score" (supporting-read count) fields — confirm against the
    // BED files emitted by the detection tools.
    ext.args = { "-v FS='\\t' -v OFS='\\t' 'BEGIN { print \"id\", \"${meta.id}\" } { print \$4, \$5 }'" }
    // Output files are named <prefix>.counts.tsv
    ext.suffix = {"counts.tsv"}
    // Intermediate only — not published; the joined per-tool matrix
    // produced by COMBINE_COUNTS_PER_TOOL is published instead.
    publishDir = [
        enabled: false
    ]
}

// Join the per-sample count tables of one detection tool into a single
// matrix (one row per BSJ, one column per sample) using csvtk join.
withName: 'COMBINE_COUNTS_PER_TOOL' {
    // csvtk join flags: -f 1 = join on the first column (BSJ id),
    // -t = tab-delimited input/output, -O = outer join (keep BSJs
    // missing from some samples).
    ext.args = "-f 1 -t -O"
    publishDir = [
        // meta.id here is the tool name (set upstream via [id: meta.tool]).
        path: { "${params.outdir}/bsj_detection/tools/${meta.id}" },
        mode: params.publish_dir_mode,
        // Suppress the versions.yml bookkeeping file from the published output.
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

withName: UPSET_SAMPLES {
ext.when = { params.tools.split(',').length > 1 }
publishDir = [
Expand Down
1 change: 1 addition & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ The rough workflow for the BSJ detection looks like this:
- `annotated`: Based on `masked`, but with additional columns for the circRNA type, the host gene(s), host transcript(s) and potential database hits. Contains a BED and a GTF file for each sample.
- `fasta`: Extracted sequences of the circRNAs in FASTA format. Based on `masked`.
- `intermediates`: Contains intermediate files generated by the BSJ detection tools, as explained below.
- `${tool}.csv`: Table of BSJ-supporting read counts reported by the given tool, with one row per back-splice junction and one column per sample.

</details>

Expand Down
65 changes: 42 additions & 23 deletions subworkflows/local/bsj_detection.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ include { GNU_SORT as CONCAT_TOOLS_PER_SAMPLE } from '../../modules/nf-co
include { BEDTOOLS_GROUPBY as COUNT_TOOLS } from '../../modules/nf-core/bedtools/groupby'
include { GAWK as FILTER_MIN_TOOLS } from '../../modules/nf-core/gawk'
include { GNU_SORT as CONCAT_SAMPLES } from '../../modules/nf-core/gnu/sort'
include { GAWK as EXTRACT_COUNTS } from '../../modules/nf-core/gawk'
include { CSVTK_JOIN as COMBINE_COUNTS_PER_TOOL } from '../../modules/nf-core/csvtk/join'
include { UPSET as UPSET_SAMPLES } from '../../modules/local/upset'
include { UPSET as UPSET_ALL } from '../../modules/local/upset'
include { BEDTOOLS_GETFASTA as FASTA_COMBINED } from '../../modules/nf-core/bedtools/getfasta'
Expand Down Expand Up @@ -43,7 +45,7 @@ workflow BSJ_DETECTION {

main:
ch_versions = Channel.empty()
ch_bed = Channel.empty()
ch_bsj_bed_per_sample_tool = Channel.empty()
ch_multiqc_files = Channel.empty()
fasta = ch_fasta.map{meta, fasta -> fasta}
gtf = ch_gtf.map{meta, gtf -> gtf}
Expand All @@ -66,59 +68,76 @@ workflow BSJ_DETECTION {

if (tools_selected.contains('segemehl')) {
SEGEMEHL( reads, fasta, params.segemehl )
ch_versions = ch_versions.mix(SEGEMEHL.out.versions)
ch_bed = ch_bed .mix(SEGEMEHL.out.bed)
ch_versions = ch_versions.mix(SEGEMEHL.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(SEGEMEHL.out.bed)
}

if (tools_selected.contains('circexplorer2')) {
CIRCEXPLORER2( gtf, fasta, STAR2PASS.out.junction )
ch_versions = ch_versions.mix(CIRCEXPLORER2.out.versions)
ch_bed = ch_bed .mix(CIRCEXPLORER2.out.bed)
ch_versions = ch_versions.mix(CIRCEXPLORER2.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCEXPLORER2.out.bed)
}

if (tools_selected.contains('circrna_finder')) {
CIRCRNA_FINDER( fasta, STAR2PASS.out.sam, STAR2PASS.out.junction,
STAR2PASS.out.tab )
ch_versions = ch_versions.mix(CIRCRNA_FINDER.out.versions)
ch_bed = ch_bed .mix(CIRCRNA_FINDER.out.bed)
ch_versions = ch_versions.mix(CIRCRNA_FINDER.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCRNA_FINDER.out.bed)
}

if (tools_selected.contains('find_circ')) {
FIND_CIRC( reads, bowtie2_index, ch_fasta )
ch_versions = ch_versions.mix(FIND_CIRC.out.versions)
ch_bed = ch_bed .mix(FIND_CIRC.out.bed)
ch_versions = ch_versions.mix(FIND_CIRC.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(FIND_CIRC.out.bed)
}

if (tools_selected.contains('ciriquant')) {
CIRIQUANT( reads, ch_gtf, ch_fasta, bwa_index, hisat2_index )
ch_versions = ch_versions.mix(CIRIQUANT.out.versions)
ch_bed = ch_bed .mix(CIRIQUANT.out.bed)
ch_versions = ch_versions.mix(CIRIQUANT.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRIQUANT.out.bed)
}

if (tools_selected.contains('dcc')) {
DCC( reads, ch_fasta, ch_gtf, star_index, STAR2PASS.out.junction,
star_ignore_sjdbgtf, seq_platform, seq_center, bsj_reads )
ch_versions = ch_versions.mix(DCC.out.versions)
ch_bed = ch_bed .mix(DCC.out.bed)
ch_versions = ch_versions.mix(DCC.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(DCC.out.bed)
}

if (tools_selected.contains('mapsplice')) {
MAPSPLICE( reads, gtf, fasta, bowtie_index, chromosomes,
STAR2PASS.out.junction )
ch_versions = ch_versions.mix(MAPSPLICE.out.versions)
ch_bed = ch_bed .mix(MAPSPLICE.out.bed)
ch_versions = ch_versions.mix(MAPSPLICE.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(MAPSPLICE.out.bed)
}

ch_bed = FILTER_BSJS( ch_bed, [] ).output
ch_versions = ch_versions.mix(FILTER_BSJS.out.versions)
//
// QUANTIFY BSJs PER TOOL
//

EXTRACT_COUNTS( ch_bsj_bed_per_sample_tool, [] )
ch_versions = ch_versions.mix(EXTRACT_COUNTS.out.versions)

COMBINE_COUNTS_PER_TOOL( EXTRACT_COUNTS.out.output
.map{ meta, bed -> [[id: meta.tool], bed]}
.groupTuple() )
ch_versions = ch_versions.mix(COMBINE_COUNTS_PER_TOOL.out.versions)

//
// APPLY bsj_reads FILTER
//

ch_bsj_bed_per_sample_tool_filtered = FILTER_BSJS( ch_bsj_bed_per_sample_tool, [] ).output
ch_versions = ch_versions.mix(FILTER_BSJS.out.versions)


//
// MERGE BED FILES
//

MASK_SCORES( ch_bed, [] )
MASK_SCORES( ch_bsj_bed_per_sample_tool_filtered, [] )
ch_versions = ch_versions.mix(MASK_SCORES.out.versions)
ch_bsj_bed_per_sample_tool = MASK_SCORES.out.output
ch_bsj_bed_per_sample_tool_masked = MASK_SCORES.out.output
.filter{ meta, bed -> !bed.empty }

CONCAT_TOOLS_PER_SAMPLE(
Expand All @@ -144,14 +163,14 @@ workflow BSJ_DETECTION {
// UPSET PLOTS
//

UPSET_SAMPLES( ch_bsj_bed_per_sample_tool
UPSET_SAMPLES( ch_bsj_bed_per_sample_tool_masked
.map{ meta, bed -> [meta.id, meta.tool, bed]}
.groupTuple()
.map{ sample, tools, beds -> [[id: sample], tools, beds]} )
ch_multiqc_files = ch_multiqc_files.mix(UPSET_SAMPLES.out.multiqc)
ch_versions = ch_versions.mix(UPSET_SAMPLES.out.versions)

UPSET_ALL( ch_bsj_bed_per_sample_tool
UPSET_ALL( ch_bsj_bed_per_sample_tool_masked
.map{ meta, bed -> ["all", meta.tool, bed] }
.groupTuple()
.map{ sample, tools, beds -> [[id: sample], tools, beds]} )
Expand All @@ -172,7 +191,7 @@ workflow BSJ_DETECTION {
ch_bsj_bed12_per_sample = ANNOTATE_PER_SAMPLE.out.bed
ch_bsj_gtf_per_sample = ANNOTATE_PER_SAMPLE.out.gtf

ANNOTATE_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, ch_gtf, exon_boundary, ch_annotation )
ANNOTATE_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool_masked, ch_gtf, exon_boundary, ch_annotation )
ch_versions = ch_versions.mix(ANNOTATE_PER_SAMPLE_TOOL.out.versions)
ch_bsj_bed12_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.bed
ch_bsj_gtf_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.gtf
Expand All @@ -189,7 +208,7 @@ workflow BSJ_DETECTION {
ch_versions = ch_versions.mix(FASTA_PER_SAMPLE.out.versions)
ch_bsj_fasta_per_sample = FASTA_PER_SAMPLE.out.fasta

FASTA_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, fasta )
FASTA_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool_masked, fasta )
ch_versions = ch_versions.mix(FASTA_PER_SAMPLE_TOOL.out.versions)
ch_bsj_fasta_per_sample_tool = FASTA_PER_SAMPLE_TOOL.out.fasta

Expand Down

0 comments on commit c9a5eb7

Please sign in to comment.