use python instead of awk to extract diamond reads

genomic-medicine-sweden · Jan 9, 2025 · f8843f3 · f8843f3
1 parent 0ada1f5
commit f8843f3
Show file tree

Hide file tree

Showing 14 changed files with 103 additions and 24 deletions.
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -5,6 +5,7 @@ lint:
     - docs/images/nf-core-metaval_logo_dark.png
     - assets/samplesheet.csv
     - .github/ISSUE_TEMPLATE/config.yml
+    - conf/test_full.config
   files_unchanged:
     - assets/sendmail_template.txt
     - .github/CONTRIBUTING.md

diff --git a/bin/extractdiamondreads.py b/bin/extractdiamondreads.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+import argparse
+import subprocess
+import os
+
+def parse_args(args=None):
+    Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification."
+    parser = argparse.ArgumentParser(description=Description)
+    parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.")
+    parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads")
+    parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.")
+    parser.add_argument("--prefix", required=True, help="Prefix for output files")
+    parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads")
+    parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.")
+
+
+    return parser.parse_args(args)
+
+def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix):
+    read_id_file = f"{prefix}_readID.txt"
+
+    # Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid.
+    with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out:
+        for line in tsv:
+            parts = line.strip().split('\t')
+            current_taxid = parts[1]
+            if current_taxid == taxid and parts[2] < evalue:
+                out.write(parts[0] + "\n")
+
+    # Step2: Extract reads using seqkit
+    output_files = []
+
+    if single_end:
+        output_file = f"{prefix}_{taxid}.extracted_diamond_reads.fastq"
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file], check=True)
+        output_files.append(output_file)
+    else:
+        output_file1 = f"{prefix}_{taxid}.extracted_diamond_read1.fastq"
+        output_file2 = f"{prefix}_{taxid}.extracted_diamond_read2.fastq"
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file1], check=True)
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True)
+        output_files.extend([output_file1, output_file2])
+
+    # Clean up temporary files
+    os.remove(read_id_file)
+
+    return output_files
+
+def main(args=None):
+    args = parse_args(args)
+    extract_reads_by_taxid(
+        tsv_path=args.tsv,
+        taxid=args.taxid,
+        evalue=args.evalue,
+        prefix=args.prefix,
+        single_end=args.single_end,
+        fastq_paths=args.fastq
+    )
+
+if __name__ == "__main__":
+    main()
+
+
+
diff --git a/conf/base.config b/conf/base.config
@@ -60,7 +60,7 @@ process {
         maxRetries    = 3
     }
     withName: EXTRACTCDIAMONDREADS {
-        memory = { 60.GB * task.attempt }
+        memory = 180.GB
     }
     withName: SPADES {
         errorStrategy = { task.exitStatus in [143,137,21,2,1,0] ? 'retry' : 'ignore' }

diff --git a/conf/test_denovo.config b/conf/test_denovo.config
@@ -33,6 +33,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true

diff --git a/conf/test_screenpathogen.config b/conf/test_screenpathogen.config
@@ -2,7 +2,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Nextflow config file for running minimal tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Defines a input taxid list to run a fast and simple pipeline test.
+    Test the pathogen screening workflow.
 
     Use as follows:
         nextflow run genomic-medicine-sweden/metaval -profile test_screenpathogen,<docker/singularity> --outdir <OUTDIR>
@@ -34,6 +34,7 @@ params {
     fastq_output                  = false
     extract_centrifuge_reads      = false
     extract_diamond_reads         = false
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = false

diff --git a/conf/test_taxid.config b/conf/test_taxid.config
@@ -34,6 +34,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true

diff --git a/conf/test_virus.config b/conf/test_virus.config
@@ -33,6 +33,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true

diff --git a/modules/local/extract_viral_taxid/main.nf b/modules/local/extract_viral_taxid/main.nf
@@ -27,8 +27,6 @@ process EXTRACT_VIRAL_TAXID {
             awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt
             grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
         fi
-    else
-        echo "No viral taxids found." > "no_viral_taxid.txt"
     fi
     """
 }
diff --git a/modules/local/extract_viral_taxid/meta.yml b/modules/local/extract_viral_taxid/meta.yml
@@ -10,7 +10,7 @@ keywards:
 input:
   - evalue:
       type: ["number", "integer"]
-      description: A e-vaule threshold to filter the diamond classification result
+      description: An e-value threshold to filter the diamond classification result
   - - meta:
         type: map
         description: |

diff --git a/modules/local/extractdiamondreads/environment.yml b/modules/local/extractdiamondreads/environment.yml
@@ -0,0 +1,7 @@
+name: extractdiamondreads
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::seqkit=2.9.0
+  - conda-forge::python=3.13.1
diff --git a/modules/local/extractdiamondreads/main.nf b/modules/local/extractdiamondreads/main.nf
@@ -3,41 +3,41 @@ process EXTRACTCDIAMONDREADS {
     tag "$meta.id"
     label 'process_high'
 
-    conda "bioconda::seqkit=2.8.2"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1':
-        'biocontainers/seqkit:2.8.2--h9ee0642_1' }"
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/11/1195d7edbe47145bda13bf6809891dc2fbe9df749c2e567b8879518b8de4ca33/data':
+        'community.wave.seqera.io/library/seqkit_python:cf97bbadc3675f5b' }"
 
     input:
     val taxid
+    val evalue
     tuple val (meta), path(tsv)
-    tuple val (meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz
+    tuple val (meta), path(fastq)
 
     output:
-    tuple val(meta), path("*.fastq"), optional:true, emit: extracted_diamond_reads
-    path "versions.yml"                            , emit: versions
+    tuple val(meta), path("*.fastq"), optional: true, emit: extracted_diamond_reads
+    path "versions.yml"                             , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def single_end_flag = meta.single_end ? '--single_end' : ''
 
     """
-    awk -v taxID=$taxid '\$2 == taxID {print \$1}' $tsv > readID.txt
-    if [ ${meta.single_end} == 'true' ]; then
-        seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_diamond_read.fastq
-    elif [ "${meta.single_end}" == 'false' ]; then
-        seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_diamond_read1.fastq
-        seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_diamond_read2.fastq
-    fi
-
-    rm readID.txt
+    extractdiamondreads.py \\
+        --taxid $taxid \\
+        --tsv $tsv \\
+        --evalue $evalue \\
+        --prefix $prefix \\
+        $single_end_flag \\
+        --fastq ${fastq.join(' ')}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+        seqkit: \$( seqkit version | sed -e 's/seqkit v//g' )
+        python: \$( python --version | sed -e 's/Python //g')
     END_VERSIONS
     """
 }
diff --git a/modules/local/extractdiamondreads/meta.yml b/modules/local/extractdiamondreads/meta.yml
@@ -2,14 +2,17 @@ name: extractdiamondreads
 description: Use a custom python script to extract reads with specified taxonomic ID from the DIAMOND classification output
 keywards:
   - taxid
-  - DIAMOND/blastx
+  - DIAMOND
   - tsv
   - fastq
   - extract_reads
 input:
   - taxid:
       type: integer
       description: A taxonomic ID to extract the reads
+  - evalue:
+      type: ["number", "integer"]
+      description: An e-value threshold to filter the diamond classification result
   - meta:
       type: map
       description: |

diff --git a/nextflow.config b/nextflow.config
@@ -192,7 +192,6 @@ profiles {
         }
     }
     test                { includeConfig 'conf/test.config'      }
-    test_full           { includeConfig 'conf/test_full.config' }
     test_taxid          { includeConfig 'conf/test_taxid.config' }
     test_virus          { includeConfig 'conf/test_virus.config' }
     test_denovo         { includeConfig 'conf/test_denovo.config' }

diff --git a/subworkflows/local/taxid_reads.nf b/subworkflows/local/taxid_reads.nf
@@ -149,6 +149,7 @@ workflow TAXID_READS {
 
             EXTRACTCDIAMONDREADS(
                 diamond_params_taxid.taxid,
+                params.evalue,
                 diamond_params_taxid.diamond_tsv,
                 diamond_params_taxid.reads
             )
@@ -177,6 +178,7 @@ workflow TAXID_READS {
 
             EXTRACTCDIAMONDREADS(
                 diamond_combined_input.taxid,
+                params.evalue,
                 diamond_combined_input.diamond_tsv,
                 diamond_combined_input.reads,
             )