Skip to content

Commit

Permalink
Merge pull request #168 from nf-core/per-tool-quantification
Browse files Browse the repository at this point in the history
Add output containing per-tool quantification
  • Loading branch information
nictru authored Aug 9, 2024
2 parents 3e1ce6f + 7ac747e commit c9a5eb7
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 23 deletions.
19 changes: 19 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,25 @@ process {
]
}

// Per-sample, per-tool count extraction: reduce each BSJ BED file to a
// two-column TSV via gawk, headed by the sample id so the per-tool join
// downstream can label each sample's count column.
withName: 'EXTRACT_COUNTS' {
    // Add meta.id as header
    // Keep columns 4,5
    // NOTE(review): columns 4 and 5 are presumably the BED "name" (BSJ id)
    // and "score" (supporting-read count) fields — confirm against the
    // BED files emitted by the detection tools.
    ext.args = { "-v FS='\\t' -v OFS='\\t' 'BEGIN { print \"id\", \"${meta.id}\" } { print \$4, \$5 }'" }
    // Output files are named <prefix>.counts.tsv
    ext.suffix = {"counts.tsv"}
    // Intermediate only — not published; the joined per-tool matrix
    // produced by COMBINE_COUNTS_PER_TOOL is published instead.
    publishDir = [
        enabled: false
    ]
}

// Join the per-sample count tables of one detection tool into a single
// matrix (one row per BSJ, one column per sample) using csvtk join.
withName: 'COMBINE_COUNTS_PER_TOOL' {
    // csvtk join flags: -f 1 = join on the first column (BSJ id),
    // -t = tab-delimited input/output, -O = outer join (keep BSJs
    // missing from some samples).
    ext.args = "-f 1 -t -O"
    publishDir = [
        // meta.id here is the tool name (set upstream via [id: meta.tool]).
        path: { "${params.outdir}/bsj_detection/tools/${meta.id}" },
        mode: params.publish_dir_mode,
        // Suppress the versions.yml bookkeeping file from the published output.
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

withName: UPSET_SAMPLES {
ext.when = { params.tools.split(',').length > 1 }
publishDir = [
Expand Down
1 change: 1 addition & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ The rough workflow for the BSJ detection looks like this:
- `annotated`: Based on `masked`, but with additional columns for the circRNA type, the host gene(s), host transcript(s) and potential database hits. Contains a BED and a GTF file for each sample.
- `fasta`: Extracted sequences of the circRNAs in FASTA format. Based on `masked`.
- `intermediates`: Contains intermediate files generated by the BSJ detection tools, as explained below.
- `${tool}.csv`: Table of BSJ-supporting read counts reported by the given tool, with one row per back-splice junction and one column per sample.

</details>

Expand Down
65 changes: 42 additions & 23 deletions subworkflows/local/bsj_detection.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ include { GNU_SORT as CONCAT_TOOLS_PER_SAMPLE } from '../../modules/nf-co
include { BEDTOOLS_GROUPBY as COUNT_TOOLS } from '../../modules/nf-core/bedtools/groupby'
include { GAWK as FILTER_MIN_TOOLS } from '../../modules/nf-core/gawk'
include { GNU_SORT as CONCAT_SAMPLES } from '../../modules/nf-core/gnu/sort'
include { GAWK as EXTRACT_COUNTS } from '../../modules/nf-core/gawk'
include { CSVTK_JOIN as COMBINE_COUNTS_PER_TOOL } from '../../modules/nf-core/csvtk/join'
include { UPSET as UPSET_SAMPLES } from '../../modules/local/upset'
include { UPSET as UPSET_ALL } from '../../modules/local/upset'
include { BEDTOOLS_GETFASTA as FASTA_COMBINED } from '../../modules/nf-core/bedtools/getfasta'
Expand Down Expand Up @@ -43,7 +45,7 @@ workflow BSJ_DETECTION {

main:
ch_versions = Channel.empty()
ch_bed = Channel.empty()
ch_bsj_bed_per_sample_tool = Channel.empty()
ch_multiqc_files = Channel.empty()
fasta = ch_fasta.map{meta, fasta -> fasta}
gtf = ch_gtf.map{meta, gtf -> gtf}
Expand All @@ -66,59 +68,76 @@ workflow BSJ_DETECTION {

if (tools_selected.contains('segemehl')) {
SEGEMEHL( reads, fasta, params.segemehl )
ch_versions = ch_versions.mix(SEGEMEHL.out.versions)
ch_bed = ch_bed .mix(SEGEMEHL.out.bed)
ch_versions = ch_versions.mix(SEGEMEHL.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(SEGEMEHL.out.bed)
}

if (tools_selected.contains('circexplorer2')) {
CIRCEXPLORER2( gtf, fasta, STAR2PASS.out.junction )
ch_versions = ch_versions.mix(CIRCEXPLORER2.out.versions)
ch_bed = ch_bed .mix(CIRCEXPLORER2.out.bed)
ch_versions = ch_versions.mix(CIRCEXPLORER2.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCEXPLORER2.out.bed)
}

if (tools_selected.contains('circrna_finder')) {
CIRCRNA_FINDER( fasta, STAR2PASS.out.sam, STAR2PASS.out.junction,
STAR2PASS.out.tab )
ch_versions = ch_versions.mix(CIRCRNA_FINDER.out.versions)
ch_bed = ch_bed .mix(CIRCRNA_FINDER.out.bed)
ch_versions = ch_versions.mix(CIRCRNA_FINDER.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCRNA_FINDER.out.bed)
}

if (tools_selected.contains('find_circ')) {
FIND_CIRC( reads, bowtie2_index, ch_fasta )
ch_versions = ch_versions.mix(FIND_CIRC.out.versions)
ch_bed = ch_bed .mix(FIND_CIRC.out.bed)
ch_versions = ch_versions.mix(FIND_CIRC.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(FIND_CIRC.out.bed)
}

if (tools_selected.contains('ciriquant')) {
CIRIQUANT( reads, ch_gtf, ch_fasta, bwa_index, hisat2_index )
ch_versions = ch_versions.mix(CIRIQUANT.out.versions)
ch_bed = ch_bed .mix(CIRIQUANT.out.bed)
ch_versions = ch_versions.mix(CIRIQUANT.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRIQUANT.out.bed)
}

if (tools_selected.contains('dcc')) {
DCC( reads, ch_fasta, ch_gtf, star_index, STAR2PASS.out.junction,
star_ignore_sjdbgtf, seq_platform, seq_center, bsj_reads )
ch_versions = ch_versions.mix(DCC.out.versions)
ch_bed = ch_bed .mix(DCC.out.bed)
ch_versions = ch_versions.mix(DCC.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(DCC.out.bed)
}

if (tools_selected.contains('mapsplice')) {
MAPSPLICE( reads, gtf, fasta, bowtie_index, chromosomes,
STAR2PASS.out.junction )
ch_versions = ch_versions.mix(MAPSPLICE.out.versions)
ch_bed = ch_bed .mix(MAPSPLICE.out.bed)
ch_versions = ch_versions.mix(MAPSPLICE.out.versions)
ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(MAPSPLICE.out.bed)
}

ch_bed = FILTER_BSJS( ch_bed, [] ).output
ch_versions = ch_versions.mix(FILTER_BSJS.out.versions)
//
// QUANTIFY BSJs PER TOOL
//

EXTRACT_COUNTS( ch_bsj_bed_per_sample_tool, [] )
ch_versions = ch_versions.mix(EXTRACT_COUNTS.out.versions)

COMBINE_COUNTS_PER_TOOL( EXTRACT_COUNTS.out.output
.map{ meta, bed -> [[id: meta.tool], bed]}
.groupTuple() )
ch_versions = ch_versions.mix(COMBINE_COUNTS_PER_TOOL.out.versions)

//
// APPLY bsj_reads FILTER
//

ch_bsj_bed_per_sample_tool_filtered = FILTER_BSJS( ch_bsj_bed_per_sample_tool, [] ).output
ch_versions = ch_versions.mix(FILTER_BSJS.out.versions)


//
// MERGE BED FILES
//

MASK_SCORES( ch_bed, [] )
MASK_SCORES( ch_bsj_bed_per_sample_tool_filtered, [] )
ch_versions = ch_versions.mix(MASK_SCORES.out.versions)
ch_bsj_bed_per_sample_tool = MASK_SCORES.out.output
ch_bsj_bed_per_sample_tool_masked = MASK_SCORES.out.output
.filter{ meta, bed -> !bed.empty }

CONCAT_TOOLS_PER_SAMPLE(
Expand All @@ -144,14 +163,14 @@ workflow BSJ_DETECTION {
// UPSET PLOTS
//

UPSET_SAMPLES( ch_bsj_bed_per_sample_tool
UPSET_SAMPLES( ch_bsj_bed_per_sample_tool_masked
.map{ meta, bed -> [meta.id, meta.tool, bed]}
.groupTuple()
.map{ sample, tools, beds -> [[id: sample], tools, beds]} )
ch_multiqc_files = ch_multiqc_files.mix(UPSET_SAMPLES.out.multiqc)
ch_versions = ch_versions.mix(UPSET_SAMPLES.out.versions)

UPSET_ALL( ch_bsj_bed_per_sample_tool
UPSET_ALL( ch_bsj_bed_per_sample_tool_masked
.map{ meta, bed -> ["all", meta.tool, bed] }
.groupTuple()
.map{ sample, tools, beds -> [[id: sample], tools, beds]} )
Expand All @@ -172,7 +191,7 @@ workflow BSJ_DETECTION {
ch_bsj_bed12_per_sample = ANNOTATE_PER_SAMPLE.out.bed
ch_bsj_gtf_per_sample = ANNOTATE_PER_SAMPLE.out.gtf

ANNOTATE_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, ch_gtf, exon_boundary, ch_annotation )
ANNOTATE_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool_masked, ch_gtf, exon_boundary, ch_annotation )
ch_versions = ch_versions.mix(ANNOTATE_PER_SAMPLE_TOOL.out.versions)
ch_bsj_bed12_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.bed
ch_bsj_gtf_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.gtf
Expand All @@ -189,7 +208,7 @@ workflow BSJ_DETECTION {
ch_versions = ch_versions.mix(FASTA_PER_SAMPLE.out.versions)
ch_bsj_fasta_per_sample = FASTA_PER_SAMPLE.out.fasta

FASTA_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, fasta )
FASTA_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool_masked, fasta )
ch_versions = ch_versions.mix(FASTA_PER_SAMPLE_TOOL.out.versions)
ch_bsj_fasta_per_sample_tool = FASTA_PER_SAMPLE_TOOL.out.fasta

Expand Down

0 comments on commit c9a5eb7

Please sign in to comment.