Add start from bam #193

Open · wants to merge 10 commits into base: dev
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -3,10 +3,11 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## TDB
## X.X.X - TBD [XXXX-XX-XX]

### `Added`

- Added the option of starting from a bam file [#193](https://github.com/genomic-medicine-sweden/tomte/pull/193)
- Optionally run Peddy for per-sample sex- and heterozygosity checks [#190](https://github.com/genomic-medicine-sweden/tomte/pull/190)
- Optionally calculate percentage mapping to hemoglobin genes (or any other set of genes provided) [#190](https://github.com/genomic-medicine-sweden/tomte/pull/190)
- Added the option of providing sex as 0, 1, or 2 as in the raredisease pipeline [#192](https://github.com/genomic-medicine-sweden/tomte/pull/192)
28 changes: 27 additions & 1 deletion assets/schema_input.json
@@ -41,6 +41,20 @@
}
]
},
"bam": {
"errorMessage": "BAM file cannot contain spaces and must have extension '.bam'",
"type": "string",
"pattern": "^\\S+\\.bam$",
"format": "file-path",
"exists": true
},
"bai": {
"errorMessage": "BAM index file cannot contain spaces and must have extension '.bai'",
"type": "string",
"pattern": "^\\S+\\.bai$",
"format": "file-path",
"exists": true
},
Comment on lines +44 to +57

Contributor: We can add it later, but it would be nice to have the possibility to start from CRAM as well.

"strandedness": {
"type": "string",
"meta": ["strandedness"],
@@ -63,6 +77,18 @@
"errorMessage": "The valid input for sample sex is M, F, NA, other, 0, 1 or 2"
}
},
"required": ["case", "sample", "fastq_1", "strandedness"]
"anyOf": [
{
"dependentRequired": {
"lane": ["fastq_1"]
}
},
{
"dependentRequired": {
"lane": ["bam"]
}
}
],
"required": ["case", "sample", "strandedness"]
}
}
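The `anyOf` of two `dependentRequired` blocks above is what lets a samplesheet row supply either FASTQ or BAM input. Below is a minimal pure-Python sketch of the rule this fragment encodes — a simplified re-implementation for illustration only, not the actual nf-schema validator used by the pipeline:

```python
import re

def validate_row(row: dict) -> list:
    """Simplified mirror of the schema fragment above."""
    errors = []
    # "required": ["case", "sample", "strandedness"]
    for key in ("case", "sample", "strandedness"):
        if not row.get(key):
            errors.append(f"missing required field: {key}")
    # "pattern" checks for bam/bai, as in the JSON schema
    if row.get("bam") and not re.fullmatch(r"\S+\.bam", row["bam"]):
        errors.append("BAM file cannot contain spaces and must have extension '.bam'")
    if row.get("bai") and not re.fullmatch(r"\S+\.bai", row["bai"]):
        errors.append("BAM index file cannot contain spaces and must have extension '.bai'")
    # the anyOf/dependentRequired pair: a row with a lane must also carry
    # fastq_1 or bam
    if row.get("lane") and not (row.get("fastq_1") or row.get("bam")):
        errors.append("a lane entry requires fastq_1 or bam")
    return errors

row = {"case": "fam_1", "sample": "CONTROL_REP1", "strandedness": "reverse",
       "bam": "AEG588A1.bam", "bai": "AEG588A1.bam.bai"}
print(validate_row(row))  # → []
```

The regex checks mirror the schema's `pattern` keys; the `format`/`exists` checks are omitted for brevity.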
2 changes: 2 additions & 0 deletions conf/test.config
@@ -26,6 +26,8 @@ params {
// Skip when GITHUB actions
skip_drop_ae = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true
skip_drop_as = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true
skip_downsample = false
skip_subsample_region = true

// Peddy looks for sites beyond chromosome 21 and thus crashes for this test case
skip_peddy = true
1 change: 1 addition & 0 deletions docs/output.md
@@ -58,6 +58,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
#### Salmon

[`Salmon`](https://salmon.readthedocs.io/en/latest/) quantifies reads.
Note that as Salmon has been set up to start from FASTQ files, it will not run if the pipeline starts from BAM files.
Contributor: I don't know Salmon very well. Would it make sense to convert BAM to FASTQ in order to run Salmon when starting from BAM?

Collaborator (author): Yes, it would; however, I think that should not be done in this PR, perhaps in the next one.


<details markdown="1">
<summary>Output files</summary>
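The thread above leaves BAM-to-FASTQ conversion for a later PR. For reference, a common approach is `samtools collate` piped into `samtools fastq`; this sketch only assembles the command line (the file names are hypothetical, and none of this is part of the PR):

```python
def bam_to_fastq_cmd(bam, r1, r2):
    """Build the conventional samtools pipeline for regenerating paired
    FASTQ files from a BAM, so Salmon could still be run."""
    # samtools collate groups mates together; -u keeps the stream
    # uncompressed, -O writes to stdout.
    collate = ["samtools", "collate", "-u", "-O", bam]
    # samtools fastq splits the collated stream into R1/R2; singletons and
    # unflagged reads are discarded to /dev/null here.
    fastq = ["samtools", "fastq", "-1", r1, "-2", r2,
             "-0", "/dev/null", "-s", "/dev/null", "-n", "-"]
    # In a shell this is: samtools collate -u -O in.bam | samtools fastq ...
    return collate + ["|"] + fastq

print(" ".join(bam_to_fastq_cmd("sample.bam", "sample_R1.fastq.gz", "sample_R2.fastq.gz")))
```

Only the command construction is shown; actually executing it requires samtools on the PATH.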
25 changes: 17 additions & 8 deletions docs/usage.md
@@ -98,17 +98,19 @@ Running the pipeline involves three steps:

#### Samplesheet

A samplesheet is used to pass the information about the sample(s), such as the path to the FASTQ files and other meta data (sex, phenotype, etc.,) to the pipeline in csv format.
A samplesheet is used to pass information about the sample(s), such as the paths to the FASTQ/BAM files and other metadata (sex, phenotype, etc.), to the pipeline in CSV format.
Contributor: In long-read sequencing, unaligned reads are often stored as BAM files rather than FASTQ. Is it necessary to specify aligned reads for BAM, or would it be obvious to the user that BAM equals aligned reads?

Collaborator (author): I am unsure, to be honest; I even wonder whether it would work with uBAM.

Contributor: I guess for most people BAM = aligned reads, so I think you can ignore my question.


genomic-medicine-sweden/tomte requires the information given below.

| Fields | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `case` | Case ID, for the analysis used when generating a family VCF. |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
| `fastq_1` | Absolute path to FASTQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `fastq_2` | Absolute path to FASTQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `strandedness` | Sample strandness |
| Fields | Description | Mandatory? |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- |
| `case` | Case ID, for the analysis used when generating a family VCF. | Mandatory |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | Mandatory |
| `fastq_1` | Absolute path to FASTQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | Provide either fastq_1 or bam |
| `fastq_2` | Absolute path to FASTQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | Provide either fastq_2 or bai |
| `strandedness` | Sample strandedness                                                                                                                                                                    | Mandatory                     |
| `bam` | Full path to BAM file. | Provide either fastq_1 or bam |
| `bai` | Full path to BAM index file. | Provide either fastq_2 or bai |
Comment on lines +112 to +113

Contributor: Do the descriptions match here? "Provide either fastq_2 or bai"

Collaborator (author): That part states whether the file is mandatory or not; it's the third column.

Contributor: I thought it was a copy-paste error. Should bai not always be mandatory when you have bam, or will it work without bai?

Collaborator (author): You are right.


It is also possible to include multiple runs of the same sample in a samplesheet. For example, when you have re-sequenced the same sample more than once to increase sequencing depth. In that case, the `sample` identifiers in the samplesheet have to be the same. The pipeline will align the raw read/read-pairs independently before merging the alignments belonging to the same sample. Below is an example for a trio with the proband sequenced across two lanes:

Expand All @@ -119,6 +121,13 @@ It is also possible to include multiple runs of the same sample in a samplesheet
| fam_1 | PATIENT_1 | AEG588A3_S1_L001_R1_001.fastq.gz | AEG588A3_S1_L001_R2_001.fastq.gz | reverse |
| fam_1 | PATIENT_1 | AEG588A3_S1_L002_R1_001.fastq.gz | AEG588A3_S1_L002_R2_001.fastq.gz | reverse |

Here is an example of a samplesheet where BAM files are provided:

| case | sample | fastq_1 | fastq_2 | strandedness | bam | bai |
| ----- | ------------ | ------- | ------- | ------------ | ------------ | ---------------- |
| fam_1 | CONTROL_REP1 | | | reverse | AEG588A1.bam | AEG588A1.bam.bai |
| fam_1 | CONTROL_REP2 | | | reverse | AEG588A2.bam | AEG588A2.bam.bai |

If you would like to see more examples of what a typical samplesheet looks like for a duo, follow this link: [sample_sheet](https://github.com/genomic-medicine-sweden/tomte/blob/master/test_data/samplesheet_chr21.csv)

#### Reference files and parameters
31 changes: 25 additions & 6 deletions modules/local/drop_sample_annot.nf
@@ -24,19 +24,36 @@
task.ext.when == null || task.ext.when

script:
def id = "${ids}".replace("[","").replace("]","").replace(",","")
def single_end = "${single_ends}".replace("[","").replace("]","").replace(",","")
def sex_drop = "${sex}".replace("[","").replace("]","").replace(",","").replace("1","M").replace("2","F").replace("0","NA").replace("other","NA")
def strandedness = "${strandednesses}".replace("[","").replace("]","").replace(",","")
def id = ids.join(' ')
def single_end = single_ends.join(' ')
def sex_drop = sex.collect { it.replace("1","M").replace("2","F").replace("0","NA").replace("other","NA") }.join(' ')
def strandedness = strandednesses.join(' ')
Comment on lines -27 to +30

Contributor: Why was this changed?

Collaborator (author): It is more concise; just to prettify.

Contributor: Okay, so the .replace(...) calls are no longer necessary?

Collaborator (author): The join does the same, as it results in a single value separated by " " that Python understands.

def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","")
def reference_count_file = ref_gene_counts ? "--ref_count_file ${ref_gene_counts}" : ''
def reference_annotation = ref_annot ? "--ref_annot ${ref_annot}" : ''
"""
SINGLE_ENDS=(${single_end})
BAMS=(${bam.join(' ')})

# Check if single_end values are provided
updated_single_ends=()
for ((i=0; i<\${#SINGLE_ENDS[@]}; i++)); do
if [[ "\${SINGLE_ENDS[i]}" == "null" ]]; then
result=\$(samtools view -c -f 1 "\${BAMS[i]}" | awk '{print \$1 == 0 ? "true" : "false"}')
updated_single_ends+=("\$result")
else
updated_single_ends+=("\${SINGLE_ENDS[i]}")
fi
done

# Convert updated_single_ends array to space-separated string and save to file
echo "\${updated_single_ends[*]}" > updated_single_ends.txt

Comment on lines +35 to +51

Contributor: It's hard for me to understand what's happening here. Are you checking whether there are any paired-end reads in the BAM file and then printing true or false? Is this information not already in the meta/single_end input to this process?

Collaborator (author): It will be there if one starts from FASTQ, but it won't be if you start from BAM; that's why I added it. I am very open to any suggestion on how to make it more readable, because I totally agree 😄

drop_sample_annot.py \\
--bam ${bam} \\
--bam ${bam.join(' ')} \\
--samples $id \\
--strandedness $strandedness \\
--single_end $single_end \\
--single_end \$(cat updated_single_ends.txt) \\
--sex $sex_drop \\
$reference_count_file \\
$reference_annotation \\
@@ -46,6 +63,7 @@
cat <<-END_VERSIONS > versions.yml
"${task.process}":
drop_sample_annot: \$(drop_sample_annot.py --version)
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""

@@ -56,6 +74,7 @@
cat <<-END_VERSIONS > versions.yml
"${task.process}":
drop_sample_annot: \$(drop_sample_annot.py --version)
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
}
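The bash loop in the script above infers the library layout from the BAM itself: `samtools view -c -f 1` counts reads carrying the "read paired" SAM flag (0x1), and a count of zero is treated as single-end. The same flag arithmetic, sketched in Python on plain flag integers (a standalone illustration, not pipeline code):

```python
FLAG_PAIRED = 0x1  # SAM flag bit: template has multiple segments (read is paired)

def is_single_end(flags):
    """Mirror of `samtools view -c -f 1 | awk '{print $1 == 0 ? "true" : "false"}'`:
    if no read carries the paired bit, the library is treated as single-end."""
    paired_count = sum(1 for f in flags if f & FLAG_PAIRED)
    return paired_count == 0

# Flags as they might appear in a BAM: 99/147 form a proper pair,
# 0/16 are unpaired forward/reverse reads.
print(is_single_end([99, 147]))  # → False
print(is_single_end([0, 16]))    # → True
```

This only replaces the detection step when `single_end` is `null`, i.e. when the run started from BAM and the metadata was never set from FASTQ input.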
21 changes: 14 additions & 7 deletions subworkflows/local/alignment.nf
@@ -13,7 +13,8 @@ include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main'

workflow ALIGNMENT {
take:
reads // channel: [mandatory] [ val(meta), [path(reads)] ]
ch_fastq_reads // channel: [optional] [ val(meta), [path(reads)] ]
ch_bam_bai_reads // channel: [optional] [ val(meta), [path(bam), path(bai)] ]
Comment on lines +16 to +17

Contributor suggested aligning the whitespace of the channel comments:

    ch_fastq_reads      // channel: [optional] [ val(meta), [path(reads)] ]
    ch_bam_bai_reads    // channel: [optional] [ val(meta), [path(bam), path(bai)] ]

star_index // channel: [mandatory] [ val(meta), path(star_index) ]
ch_gtf // channel: [mandatory] [ val(meta), path(gtf) ]
ch_platform // channel: [mandatory] [ val(platform) ]
@@ -28,7 +29,7 @@
main:
ch_versions = Channel.empty()

ch_fastq = branchFastqToSingleAndMulti(reads)
ch_fastq = branchFastqToSingleAndMulti(ch_fastq_reads)

CAT_FASTQ(ch_fastq.multiple_fq)
.reads.mix(ch_fastq.single_fq)
@@ -38,13 +39,19 @@

STAR_ALIGN(FASTP.out.reads, star_index, ch_gtf, false, ch_platform, false)

ch_bam_reads = ch_bam_bai_reads.map { meta, bambai -> [ meta, bambai[0] ] }
ch_bai_reads = ch_bam_bai_reads.map { meta, bambai -> [ meta, bambai[1] ] }

ch_bam_aligned=ch_bam_reads.mix(STAR_ALIGN.out.bam_sorted_aligned)
Contributor suggested spaces around the assignment:

    ch_bam_aligned = ch_bam_reads.mix(STAR_ALIGN.out.bam_sorted_aligned)

Contributor: There can now also be aligned reads in ch_bam_reads, right? ch_bam_aligned and ch_bai are "equivalents"? I think I follow, but perhaps there could be more expressive names; it's a bit hard for me to see the difference between ch_bam_aligned + ch_bai, ch_bam_bai, and ch_bam_bai_out together with the different ifs.


SAMTOOLS_INDEX( STAR_ALIGN.out.bam_sorted_aligned )
ch_bai = ch_bai_reads.mix(SAMTOOLS_INDEX.out.bai)

ch_bam_bai = Channel.empty()
ch_bam_bai_out = Channel.empty()

if (!skip_subsample_region) {
RNA_SUBSAMPLE_REGION( STAR_ALIGN.out.bam_sorted_aligned, subsample_bed, seed_frac)
RNA_SUBSAMPLE_REGION( ch_bam_aligned, subsample_bed, seed_frac)
ch_bam_bai = ch_bam_bai.mix(RNA_SUBSAMPLE_REGION.out.bam_bai)
ch_versions = ch_versions.mix(RNA_SUBSAMPLE_REGION.out.versions.first())
if (skip_downsample) {
@@ -55,17 +62,17 @@
ch_versions = ch_versions.mix(RNA_DOWNSAMPLE.out.versions.first())
}
} else {
ch_bam_bai = ch_bam_bai.mix(STAR_ALIGN.out.bam_sorted_aligned.join(SAMTOOLS_INDEX.out.bai))
ch_bam_bai = ch_bam_bai.mix(ch_bam_aligned.join(ch_bai))
if (skip_downsample) {
ch_bam_bai_out = STAR_ALIGN.out.bam_sorted_aligned.join(SAMTOOLS_INDEX.out.bai)
ch_bam_bai_out = ch_bam_aligned.join(ch_bai)
} else {
RNA_DOWNSAMPLE( ch_bam_bai, num_reads)
ch_bam_bai_out = RNA_DOWNSAMPLE.out.bam_bai
ch_versions = ch_versions.mix(RNA_DOWNSAMPLE.out.versions.first())
}
}

SAMTOOLS_VIEW( STAR_ALIGN.out.bam_sorted_aligned.join(SAMTOOLS_INDEX.out.bai), ch_genome_fasta, [] )
SAMTOOLS_VIEW( ch_bam_aligned.join(ch_bai), ch_genome_fasta, [] )

SALMON_QUANT( FASTP.out.reads, salmon_index, ch_gtf.map{ meta, gtf -> gtf }, [], false, 'A')

@@ -79,7 +86,7 @@
emit:
merged_reads = CAT_FASTQ.out.reads // channel: [ val(meta), path(fastq) ]
fastp_report = FASTP.out.json // channel: [ val(meta), path(json) ]
bam = STAR_ALIGN.out.bam_sorted_aligned // channel: [ val(meta), path(bam) ]
bam = ch_bam_aligned // channel: [ val(meta), path(bam) ]
Contributor: Could you align the // comments?

bam_bai = ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ]
bam_ds_bai = ch_bam_bai_out // channel: [ val(meta), path(bam), path(bai) ]
gene_counts = STAR_ALIGN.out.read_per_gene_tab // channel: [ val(meta), path(tsv) ]
43 changes: 24 additions & 19 deletions subworkflows/local/utils_nfcore_tomte_pipeline/main.nf
@@ -72,45 +72,49 @@ workflow PIPELINE_INITIALISATION {
// Create channel from input file provided through params.input
//

Channel
Channel
.fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json"))
.tap { ch_original_input }
.map { meta, fastq_1, fastq_2 -> meta.id }
.reduce([:]) { counts, sample -> // get counts of each sample in the samplesheet - for groupTuple
.map { meta, fastq_1, fastq_2, bam, bai -> meta.id }
.reduce([:]) { counts, sample ->
counts[sample] = (counts[sample] ?: 0) + 1
counts
}
.combine ( ch_original_input )
.map { counts, meta, fastq_1, fastq_2 ->
if (!fastq_2) {
return [ meta + [ single_end:true, fq_pairs:counts[meta.id] ], [ fastq_1 ] ]
.map { counts, meta, fastq_1, fastq_2, bam, bai ->
if (bam) {
return [ meta + [ single_end:false, fq_pairs:counts[meta.id], is_bam:true ], [ bam, bai ] ]
} else if (!fastq_2) {
return [ meta + [ single_end:true, fq_pairs:counts[meta.id], is_bam:false ], [ fastq_1 ] ]
} else {
return [ meta + [ single_end:false, fq_pairs:counts[meta.id] ], [ fastq_1, fastq_2 ] ]
return [ meta + [ single_end:false, fq_pairs:counts[meta.id], is_bam:false ], [ fastq_1, fastq_2 ] ]
}
}
.tap { ch_input_counts }
.map { meta, fastqs -> fastqs }
.reduce([:]) { counts, fastqs -> // get number of fastq sets in the run - for creating unique ID:s
counts[fastqs] = counts.size() + 1
.map { meta, files -> [meta.id, files] }
.reduce([:]) { counts, id_files ->
counts[id_files[0]] = (counts[id_files[0]] ?: [:])
counts[id_files[0]][id_files[1]] = counts[id_files[0]].size() + 1
return counts
}
.combine( ch_input_counts )
.map { lineno, meta, fastqs -> // append line number to sample id for unique set ids
new_meta = meta + [id:meta.id+"_id"+lineno[fastqs]]
return [ new_meta, fastqs ]
.map { lineno, meta, files ->
new_meta = meta + [id:meta.id+"_id"+lineno[meta.id][files]]
return [ new_meta, files ]
}
.tap { ch_samplesheet } // Output, the rest is just for validation
.map { meta, fastqs ->
return [ meta.sample, groupKey( meta + [id:meta.sample], meta.fq_pairs ), fastqs ]
.tap { ch_samplesheet }
.map { meta, files ->
return [ meta.sample, groupKey( meta + [id:meta.sample], meta.fq_pairs ), files ]
}
.groupTuple()
.map {
validateInputSamplesheet(it)
}

emit:
samplesheet = ch_samplesheet
versions = ch_versions
ch_samplesheet.view()
emit:
samplesheet = ch_samplesheet
versions = ch_versions
Contributor: Indent everything (from line 75 to here)?

}
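The reworked channel chain above does two passes of bookkeeping: it counts rows per sample (stored as `meta.fq_pairs` for the later `groupTuple`), then numbers each distinct file set within a sample so repeated runs get unique IDs such as `PATIENT_1_id1`, `PATIENT_1_id2`. A sketch of the same logic in Python (the helper name and tuple layout are illustrative, not part of the PR):

```python
def assign_unique_ids(rows):
    """rows: list of (sample, files) tuples, files being a tuple of paths.
    Returns (unique_id, sample, fq_pairs, files) per row, mirroring the
    reduce/combine chain in the subworkflow."""
    # First pass: rows per sample -> meta.fq_pairs (groupTuple group size)
    counts = {}
    for sample, _ in rows:
        counts[sample] = counts.get(sample, 0) + 1
    # Second pass: per-sample numbering of distinct file sets -> unique id
    seen = {}
    out = []
    for sample, files in rows:
        per_sample = seen.setdefault(sample, {})
        if files not in per_sample:
            per_sample[files] = len(per_sample) + 1
        out.append((f"{sample}_id{per_sample[files]}", sample, counts[sample], files))
    return out

rows = [("PATIENT_1", ("L001_R1.fastq.gz", "L001_R2.fastq.gz")),
        ("PATIENT_1", ("L002_R1.fastq.gz", "L002_R2.fastq.gz")),
        ("CONTROL_REP1", ("AEG588A1.bam", "AEG588A1.bam.bai"))]
for r in assign_unique_ids(rows):
    print(r)
```

Keying the second pass on `meta.id` as well as the file set is what the PR changes: previously the counter was global over FASTQ sets, now it is per sample, which also works for BAM/BAI pairs.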

/*
@@ -186,6 +190,7 @@ def validateInputSamplesheet(input) {

return [ metas[0], fastqs ]
}

//
// Get attribute from genome config file e.g. fasta
//
16 changes: 14 additions & 2 deletions workflows/tomte.nf
@@ -121,14 +121,26 @@ workflow TOMTE {
).set { ch_references }
ch_versions = ch_versions.mix(PREPARE_REFERENCES.out.versions.first())

// Prepare input
ch_samplesheet
.branch {
fastq: it[1].any { it.toString().endsWith('.fastq.gz') || it.toString().endsWith('.fq.gz') }
bam: it[1].any { it.toString().endsWith('.bam') }
}
.set { ch_input_branch }

ch_bam_reads = ch_input_branch.bam
ch_fastq_reads = ch_input_branch.fastq

FASTQC (
ch_samplesheet
ch_fastq_reads
)
ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]})
ch_versions = ch_versions.mix(FASTQC.out.versions.first())

ALIGNMENT(
ch_samplesheet,
ch_fastq_reads,
ch_bam_reads,
ch_references.star_index,
ch_references.gtf,
ch_platform,
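The `.branch` above routes each samplesheet entry by file extension: FASTQ entries go through FastQC and the FASTQ arm of `ALIGNMENT`, while BAM entries feed the new BAM arm directly. The routing rule, sketched in Python (an illustration of the branching logic only):

```python
def branch_by_extension(entries):
    """entries: list of (meta, files); mirrors the .branch in the workflow,
    where the first matching branch claims the entry."""
    fastq, bam = [], []
    for meta, files in entries:
        if any(f.endswith((".fastq.gz", ".fq.gz")) for f in files):
            fastq.append((meta, files))
        elif any(f.endswith(".bam") for f in files):
            bam.append((meta, files))
    return fastq, bam

fastq, bam = branch_by_extension([
    ({"id": "s1"}, ["s1_R1.fq.gz", "s1_R2.fq.gz"]),
    ({"id": "s2"}, ["s2.bam", "s2.bam.bai"]),
])
print(len(fastq), len(bam))  # → 1 1
```

As in Nextflow's `.branch`, an entry lands in exactly one branch, which is why the FASTQ check runs first.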