add meta.yml for each local module

genomic-medicine-sweden · Jan 9, 2025 · d37ee4b · d37ee4b
1 parent d259ea4
commit d37ee4b
Show file tree

Hide file tree

Showing 12 changed files with 375 additions and 14 deletions.
diff --git a/modules/local/extract_viral_taxid/main.nf b/modules/local/extract_viral_taxid/main.nf
@@ -0,0 +1,34 @@
+// Report the taxids of viral taxa that were detected by the classifier.
+//
+// Rows of the taxpasta standardised profile containing "virus"
+// (case-insensitive) supply the candidate taxids (first column). They are
+// intersected with the taxids found in the classifier report:
+//   - kraken2 / centrifuge: column 5 of report rows whose column 3 is non-zero
+//   - diamond: column 2 of report rows whose column 3 is below `evalue`
+// Any other `meta.tool`, or a profile without "virus", yields no
+// `*viral_taxids.tsv`, which is why the output is declared optional.
+process EXTRACT_VIRAL_TAXID {
+
+    tag "$meta.id"
+    label 'process_low'
+
+    input:
+    val evalue // e-value threshold to filter the diamond report
+    tuple val(meta), path(taxpasta_standardised_profile)
+    tuple val(meta), path(report) // classification report
+
+    output:
+    tuple val(meta), path("*viral_taxids.tsv"), optional:true, emit: viral_taxid
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}_${meta.tool}"
+
+    // `sort -u` is used for the diamond taxids because `uniq` alone only
+    // collapses adjacent duplicates and the report is not ordered by taxid.
+    // NOTE(review): `grep -F -w -f` exits non-zero when no taxid matches;
+    // depending on the configured `process.shell` flags this may fail the task
+    // instead of skipping the optional output - confirm the intended behaviour.
+    """
+    if grep -qi "virus" $taxpasta_standardised_profile; then
+        grep -i "virus" $taxpasta_standardised_profile | cut -f 1 > taxpasta_viral_taxid.txt
+        if [[ "${meta.tool}" == "kraken2" || "${meta.tool}" == "centrifuge" ]]; then
+            awk -F'\t' '\$3 != 0 {print \$5}' ${report} > detected_taxid.txt
+            grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt > ${prefix}_viral_taxids.tsv
+        elif [[ "${meta.tool}" == "diamond" ]]; then
+            awk '\$3 < ${evalue}' ${report} | cut -f 2 | sort -u > detected_taxid.txt
+            grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
+        fi
+    else
+        echo "No viral taxids found." > "no_viral_taxid.txt"
+    fi
+    """
+}
diff --git a/modules/local/extract_viral_taxid/meta.yml b/modules/local/extract_viral_taxid/meta.yml
@@ -0,0 +1,38 @@
+name: extract_viral_taxid
+description: Extract the taxid of viruses identified by the classifier (Kraken2/Centrifuge/DIAMOND)
+keywords:
+  - taxid
+  - virus
+  - taxpasta
+  - kraken2
+  - centrifuge
+  - diamond
+# Entries that share one inner list belong to the same `tuple` in main.nf.
+input:
+  - evalue:
+      type: float
+      description: An e-value threshold to filter the DIAMOND classification result
+  - - meta:
+        type: map
+        description: |
+          Groovy map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - taxpasta_standardised_profile:
+        type: file
+        description: Path to the taxpasta standardised profile
+  - - meta:
+        type: map
+        description: |
+          Groovy map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - report:
+        type: file
+        description: |
+          Path to the classifier report (Kraken2/Centrifuge) or TSV file (DIAMOND)
+          containing the identified viral taxid.
+output:
+  - - meta:
+        type: map
+        description: |
+          Groovy map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - viral_taxid:
+        type: file
+        description: Extracted viral taxids
+        pattern: "*viral_taxids.tsv"
+authors:
+  - "@LilyAnderssonLee"
diff --git a/modules/local/extractcentrifugereads/main.nf b/modules/local/extractcentrifugereads/main.nf
@@ -0,0 +1,42 @@
+// Extract the reads that Centrifuge assigned to a given taxid.
+//
+// Read IDs are taken from rows of the Centrifuge results file whose column 3
+// equals `taxid` and whose column 8 equals 1, then pulled from the FastQ
+// file(s) with `seqkit grep`. One FastQ is written for single-end data, two
+// (read1/read2) for paired-end data; the output is optional because nothing
+// is written when `meta.single_end` is neither true nor false.
+process EXTRACTCENTRIFUGEREADS {
+
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::seqkit=2.8.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1':
+        'biocontainers/seqkit:2.8.2--h9ee0642_1' }"
+
+    input:
+    val taxid
+    tuple val(meta), path(results)
+    tuple val(meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz
+
+    output:
+    tuple val(meta), path("*.fastq"), optional:true, emit: extracted_centrifuge_reads
+    path "versions.yml"                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    awk -v taxID=$taxid '\$3 == taxID && \$8 == 1 {print \$1}' $results > readID.txt
+    if [ "${meta.single_end}" == 'true' ]; then
+        seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_centrifuge_read.fastq
+    elif [ "${meta.single_end}" == 'false' ]; then
+        seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_centrifuge_read1.fastq
+        seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_centrifuge_read2.fastq
+    fi
+
+    rm readID.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/extractcentrifugereads/meta.yml b/modules/local/extractcentrifugereads/meta.yml
@@ -0,0 +1,41 @@
+name: extractcentrifugereads
+description: Extract reads assigned to a specified taxonomic ID from the Centrifuge classification output using awk and seqkit grep
+keywords:
+  - taxid
+  - centrifuge
+  - results
+  - fastq
+  - extract_reads
+tools:
+  - seqkit:
+      description: A cross-platform and ultrafast toolkit for FASTA/Q file manipulation
+      homepage: https://bioinf.shenwei.me/seqkit/
+      documentation: https://bioinf.shenwei.me/seqkit/usage/
+      tool_dev_url: https://github.com/shenwei356/seqkit
+      doi: 10.1371/journal.pone.0163962
+      licence: ["MIT"]
+# Entries that share one inner list belong to the same `tuple` in main.nf.
+input:
+  - taxid:
+      type: integer
+      description: A taxonomic ID to extract the reads
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - results:
+        type: file
+        description: File containing classification results
+        pattern: "*.results.txt"
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - fastq:
+        type: file
+        description: FastQ files
+        pattern: "*.fastq.gz"
+output:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - extracted_centrifuge_reads:
+        type: file
+        description: FastQ files containing the extracted reads of a specified taxid
+        pattern: "*.fastq"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@LilyAnderssonLee"
diff --git a/modules/local/extractdiamondreads/main.nf b/modules/local/extractdiamondreads/main.nf
@@ -0,0 +1,43 @@
+// Extract the reads that DIAMOND assigned to a given taxid.
+//
+// Read IDs are taken from rows of the DIAMOND TSV whose column 2 equals
+// `taxid`, then pulled from the FastQ file(s) with `seqkit grep`. One FastQ is
+// written for single-end data, two (read1/read2) for paired-end data; the
+// output is optional because nothing is written when `meta.single_end` is
+// neither true nor false.
+//
+// NOTE(review): the process name contains an extra "C"
+// (EXTRACTCDIAMONDREADS). It is left untouched because callers include the
+// process by this name.
+process EXTRACTCDIAMONDREADS {
+
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::seqkit=2.8.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1':
+        'biocontainers/seqkit:2.8.2--h9ee0642_1' }"
+
+    input:
+    val taxid
+    tuple val(meta), path(tsv)
+    tuple val(meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz
+
+    output:
+    tuple val(meta), path("*.fastq"), optional:true, emit: extracted_diamond_reads
+    path "versions.yml"                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // `meta.single_end` is quoted in both tests so that an empty value cannot
+    // turn the `[ ... ]` expression into a syntax error.
+    """
+    awk -v taxID=$taxid '\$2 == taxID {print \$1}' $tsv > readID.txt
+    if [ "${meta.single_end}" == 'true' ]; then
+        seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_diamond_read.fastq
+    elif [ "${meta.single_end}" == 'false' ]; then
+        seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_diamond_read1.fastq
+        seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_diamond_read2.fastq
+    fi
+
+    rm readID.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/extractdiamondreads/meta.yml b/modules/local/extractdiamondreads/meta.yml
@@ -0,0 +1,36 @@
+name: extractdiamondreads
+description: Extract reads assigned to a specified taxonomic ID from the DIAMOND classification output using awk and seqkit grep
+keywords:
+  - taxid
+  - diamond
+  - blastx
+  - tsv
+  - fastq
+  - extract_reads
+# Entries that share one inner list belong to the same `tuple` in main.nf.
+input:
+  - taxid:
+      type: integer
+      description: A taxonomic ID to extract the reads
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - tsv:
+        type: file
+        description: Tab separated file containing DIAMOND taxonomic classification of hits
+        pattern: "*.tsv"
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - fastq:
+        type: file
+        description: FastQ files
+        pattern: "*.fastq.gz"
+output:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - extracted_diamond_reads:
+        type: file
+        description: FastQ files containing the extracted reads of a specified taxid
+        pattern: "*.fastq"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@LilyAnderssonLee"
diff --git a/modules/local/rm_empty_fastq/main.nf b/modules/local/rm_empty_fastq/main.nf
@@ -0,0 +1,27 @@
+// Delete zero-size `*.fastq` files from the given folder and re-emit the folder.
+//
+// Only the top level of `folder` is scanned. When `folder` is not a directory
+// a message is printed and nothing is removed.
+process RM_EMPTY_FASTQ {
+
+    label 'process_low'
+
+    input:
+    path folder
+
+    output:
+    path folder, optional: true
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // When the folder holds no *.fastq at all the glob is passed through
+    // unexpanded; the existence guard skips that literal pattern so `rm` is
+    // never called on a path that does not exist. Dangling symlinks are still
+    // removed, as before.
+    """
+    if [ -d ${folder} ]; then
+        for f in ${folder}/*.fastq; do
+            [ -e "\$f" ] || [ -L "\$f" ] || continue
+            if [ ! -s "\$f" ]; then
+                rm "\$f"
+            fi
+        done
+    else
+        echo "Folder ${folder} doesn't exist."
+    fi
+    """
+}
diff --git a/modules/local/rm_empty_fastq/meta.yml b/modules/local/rm_empty_fastq/meta.yml
@@ -0,0 +1,17 @@
+name: rm_empty_fastq
+description: |
+  Remove empty FastQ files from a folder of reads extracted for a user-defined taxid.
+  An empty file means that this taxid was not detected by the classifier (Kraken2/Centrifuge/DIAMOND).
+keywords:
+  - empty
+  - fastq
+input:
+  - folder:
+      type: directory
+      description: Directory storing the reads extracted from Kraken2/Centrifuge/DIAMOND
+output:
+  - folder:
+      type: directory
+      description: Directory storing the non-empty extracted reads from Kraken2/Centrifuge/DIAMOND
+authors:
+  - "@LilyAnderssonLee"
diff --git a/modules/local/subset_bam/main.nf b/modules/local/subset_bam/main.nf
@@ -0,0 +1,34 @@
+// Subset an indexed BAM file to the alignments on the given reference accessions.
+//
+// `taxid_accession` is a list of region/reference names; they are joined with
+// spaces and passed to `samtools view` as region arguments. The result is
+// written to `<prefix>.bam`.
+process SUBSET_BAM {
+
+    tag "$meta.id"
+    label 'process_low'
+
+    // Conda match specs separate name and version with `=`, not `:`.
+    conda "bioconda::samtools=1.21"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.21--h50ea8bc_0':
+        'biocontainers/samtools:1.21--h50ea8bc_0' }"
+
+    input:
+    tuple val(meta), path(bam), path(bai)
+    val taxid_accession
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def accessions = taxid_accession.join(" ")
+    // Writing to the same name as the staged input would truncate the input
+    // before it is read, so refuse to run in that case.
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+
+    """
+    samtools view -b -o ${prefix}.bam $bam $accessions
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(samtools --version |& sed '1!d ; s/samtools //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/subset_bam/meta.yml b/modules/local/subset_bam/meta.yml
@@ -0,0 +1,53 @@
+name: subset_bam
+description: |
+  Subset bam file for each taxid
+keywords:
+  - samtools
+  - view
+  - subset
+  - bam
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+      identifier: biotools:samtools
+# Entries that share one inner list belong to the same `tuple` in main.nf.
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - bam:
+        type: file
+        description: BAM file of reads aligned to the pathogen genomes
+        pattern: "*.bam"
+    - bai:
+        type: file
+        description: BAI file (BAM index) of BAM reads aligned to the pathogen genome
+        pattern: "*.bai"
+  - taxid_accession:
+      type: list
+      description: A list of accessions of a taxid
+output:
+  - bam:
+      - meta:
+          type: map
+          description: |
+            Groovy map containing sample information
+            e.g. [ id:'test', single_end:false ]
+      - "*.bam":
+          type: file
+          description: A subset bam file
+          pattern: "*.bam"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@LilyAnderssonLee"