From f8843f3903dc3bb71b996566b5f129af04e6e22d Mon Sep 17 00:00:00 2001
From: LilyAnderssonLee <lilili2011me@gmail.com>
Date: Thu, 9 Jan 2025 17:14:45 +0100
Subject: [PATCH] use python instead of awk to extract diamond reads

---
 .nf-core.yml                                  |  1 +
 bin/extractdiamondreads.py                    | 65 +++++++++++++++++++
 conf/base.config                              |  2 +-
 conf/test_denovo.config                       |  1 +
 conf/test_screenpathogen.config               |  3 +-
 conf/test_taxid.config                        |  1 +
 conf/test_virus.config                        |  1 +
 modules/local/extract_viral_taxid/main.nf     |  2 -
 modules/local/extract_viral_taxid/meta.yml    |  2 +-
 .../local/extractdiamondreads/environment.yml |  7 ++
 modules/local/extractdiamondreads/main.nf     | 34 +++++-----
 modules/local/extractdiamondreads/meta.yml    |  5 +-
 nextflow.config                               |  1 -
 subworkflows/local/taxid_reads.nf             |  2 +
 14 files changed, 103 insertions(+), 24 deletions(-)
 create mode 100755 bin/extractdiamondreads.py
 create mode 100644 modules/local/extractdiamondreads/environment.yml

diff --git a/.nf-core.yml b/.nf-core.yml
index 45a97f5..894fe85 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -5,6 +5,7 @@ lint:
     - docs/images/nf-core-metaval_logo_dark.png
     - assets/samplesheet.csv
     - .github/ISSUE_TEMPLATE/config.yml
+    - conf/test_full.config
   files_unchanged:
     - assets/sendmail_template.txt
     - .github/CONTRIBUTING.md
diff --git a/bin/extractdiamondreads.py b/bin/extractdiamondreads.py
new file mode 100755
index 0000000..759badb
--- /dev/null
+++ b/bin/extractdiamondreads.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+import argparse
+import subprocess
+import os
+
+def parse_args(args=None):
+    Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification."
+    parser = argparse.ArgumentParser(description=Description)
+    parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.")
+    parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads")
+    parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.")
+    parser.add_argument("--prefix", required=True, help="Prefix for output files")
+    parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads")
+    parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.")
+
+
+    return parser.parse_args(args)
+
+def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix):
+    read_id_file = f"{prefix}_readID.txt"
+
+    # Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid.
+    with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out:
+        for line in tsv:
+            parts = line.strip().split('\t')
+            current_taxid = parts[1]
+            if current_taxid == taxid and parts[2] < evalue:
+                out.write(parts[0] + "\n")
+
+    # Step2: Extract reads using seqkit
+    output_files = []
+
+    if single_end:
+        output_file = f"{prefix}_{taxid}.extracted_diamond_reads.fastq"
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file], check=True)
+        output_files.append(output_file)
+    else:
+        output_file1 = f"{prefix}_{taxid}.extracted_diamond_read1.fastq"
+        output_file2 = f"{prefix}_{taxid}.extracted_diamond_read2.fastq"
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file1], check=True)
+        subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True)
+        output_files.extend([output_file1, output_file2])
+
+    # Clean up temporary files
+    os.remove(read_id_file)
+
+    return output_files
+
+def main(args=None):
+    args = parse_args(args)
+    extract_reads_by_taxid(
+        tsv_path=args.tsv,
+        taxid=args.taxid,
+        evalue=args.evalue,
+        prefix=args.prefix,
+        single_end=args.single_end,
+        fastq_paths=args.fastq
+    )
+
+if __name__ == "__main__":
+    main()
+
+
+
diff --git a/conf/base.config b/conf/base.config
index 43d57f3..87623a1 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -60,7 +60,7 @@ process {
         maxRetries    = 3
     }
     withName: EXTRACTCDIAMONDREADS {
-        memory = { 60.GB * task.attempt }
+        memory = 180.GB
     }
     withName: SPADES {
         errorStrategy = { task.exitStatus in [143,137,21,2,1,0] ? 'retry' : 'ignore' }
diff --git a/conf/test_denovo.config b/conf/test_denovo.config
index eb05437..8c2488c 100644
--- a/conf/test_denovo.config
+++ b/conf/test_denovo.config
@@ -33,6 +33,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true
diff --git a/conf/test_screenpathogen.config b/conf/test_screenpathogen.config
index c9f79d7..c1da2d9 100644
--- a/conf/test_screenpathogen.config
+++ b/conf/test_screenpathogen.config
@@ -2,7 +2,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Nextflow config file for running minimal tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Defines a input taxid list to run a fast and simple pipeline test.
+    Test the pathogen screening workflow.
 
     Use as follows:
         nextflow run genomic-medicine-sweden/metaval -profile test_screenpathogen,<docker/singularity> --outdir <OUTDIR>
@@ -34,6 +34,7 @@ params {
     fastq_output                  = false
     extract_centrifuge_reads      = false
     extract_diamond_reads         = false
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = false
diff --git a/conf/test_taxid.config b/conf/test_taxid.config
index b56cc0b..73fc051 100644
--- a/conf/test_taxid.config
+++ b/conf/test_taxid.config
@@ -34,6 +34,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true
diff --git a/conf/test_virus.config b/conf/test_virus.config
index 3c13efa..877a400 100644
--- a/conf/test_virus.config
+++ b/conf/test_virus.config
@@ -33,6 +33,7 @@ params {
     fastq_output                  = true
     extract_centrifuge_reads      = true
     extract_diamond_reads         = true
+    evalue                        = 1e-20
 
     // de novo
     perform_shortread_denovo      = true
diff --git a/modules/local/extract_viral_taxid/main.nf b/modules/local/extract_viral_taxid/main.nf
index 3a6b233..f9960e9 100644
--- a/modules/local/extract_viral_taxid/main.nf
+++ b/modules/local/extract_viral_taxid/main.nf
@@ -27,8 +27,6 @@ process EXTRACT_VIRAL_TAXID {
             awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt
             grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
         fi
-    else
-        echo "No viral taxids found." > "no_viral_taxid.txt"
     fi
     """
 }
diff --git a/modules/local/extract_viral_taxid/meta.yml b/modules/local/extract_viral_taxid/meta.yml
index 6ed01a3..e62e71f 100644
--- a/modules/local/extract_viral_taxid/meta.yml
+++ b/modules/local/extract_viral_taxid/meta.yml
@@ -10,7 +10,7 @@ keywards:
 input:
   - evalue:
       type: ["number", "integer"]
-      description: A e-vaule threshold to filter the diamond classification result
+      description: An e-value threshold to filter the diamond classification result
   - - meta:
         type: map
         description: |
diff --git a/modules/local/extractdiamondreads/environment.yml b/modules/local/extractdiamondreads/environment.yml
new file mode 100644
index 0000000..49335a4
--- /dev/null
+++ b/modules/local/extractdiamondreads/environment.yml
@@ -0,0 +1,7 @@
+name: extractdiamondreads
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::seqkit=2.9.0
+  - conda-forge::python=3.13.1
diff --git a/modules/local/extractdiamondreads/main.nf b/modules/local/extractdiamondreads/main.nf
index 0cac464..c6678c0 100644
--- a/modules/local/extractdiamondreads/main.nf
+++ b/modules/local/extractdiamondreads/main.nf
@@ -3,41 +3,41 @@ process EXTRACTCDIAMONDREADS {
     tag "$meta.id"
     label 'process_high'
 
-    conda "bioconda::seqkit=2.8.2"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1':
-        'biocontainers/seqkit:2.8.2--h9ee0642_1' }"
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/11/1195d7edbe47145bda13bf6809891dc2fbe9df749c2e567b8879518b8de4ca33/data':
+        'community.wave.seqera.io/library/seqkit_python:cf97bbadc3675f5b' }"
 
     input:
     val taxid
+    val evalue
     tuple val (meta), path(tsv)
-    tuple val (meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz
+    tuple val (meta), path(fastq)
 
     output:
-    tuple val(meta), path("*.fastq"), optional:true, emit: extracted_diamond_reads
-    path "versions.yml"                            , emit: versions
+    tuple val(meta), path("*.fastq"), optional: true, emit: extracted_diamond_reads
+    path "versions.yml"                             , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def single_end_flag = meta.single_end ? '--single_end' : ''
 
     """
-    awk -v taxID=$taxid '\$2 == taxID {print \$1}' $tsv > readID.txt
-    if [ ${meta.single_end} == 'true' ]; then
-        seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_diamond_read.fastq
-    elif [ "${meta.single_end}" == 'false' ]; then
-        seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_diamond_read1.fastq
-        seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_diamond_read2.fastq
-    fi
-
-    rm readID.txt
+    extractdiamondreads.py \\
+        --taxid $taxid \\
+        --tsv $tsv \\
+        --evalue $evalue \\
+        --prefix $prefix \\
+        $single_end_flag \\
+        --fastq ${fastq.join(' ')}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+        seqkit: \$( seqkit version | sed -e 's/seqkit v//g' )
+        python: \$( python --version | sed -e 's/Python //g')
     END_VERSIONS
     """
 }
diff --git a/modules/local/extractdiamondreads/meta.yml b/modules/local/extractdiamondreads/meta.yml
index 4e314ae..36c0278 100644
--- a/modules/local/extractdiamondreads/meta.yml
+++ b/modules/local/extractdiamondreads/meta.yml
@@ -2,7 +2,7 @@ name: extractdiamondreads
 description: Use a custom python script to extract reads with specified taxonomic ID from the DIAMOND classification output
 keywards:
   - taxid
-  - DIAMOND/blastx
+  - DIAMOND
   - tsv
   - fastq
   - extract_reads
@@ -10,6 +10,9 @@ input:
   - taxid:
       type: integer
       description: A taxonomic ID to extract the reads
+  - evalue:
+      type: ["number", "integer"]
+      description: An e-value threshold to filter the diamond classification result
   - meta:
       type: map
       description: |
diff --git a/nextflow.config b/nextflow.config
index 8e47cc6..f787b31 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -192,7 +192,6 @@ profiles {
         }
     }
     test                { includeConfig 'conf/test.config'      }
-    test_full           { includeConfig 'conf/test_full.config' }
     test_taxid          { includeConfig 'conf/test_taxid.config' }
     test_virus          { includeConfig 'conf/test_virus.config' }
     test_denovo         { includeConfig 'conf/test_denovo.config' }
diff --git a/subworkflows/local/taxid_reads.nf b/subworkflows/local/taxid_reads.nf
index 2d802eb..895f458 100644
--- a/subworkflows/local/taxid_reads.nf
+++ b/subworkflows/local/taxid_reads.nf
@@ -149,6 +149,7 @@ workflow TAXID_READS {
 
             EXTRACTCDIAMONDREADS(
                 diamond_params_taxid.taxid,
+                params.evalue,
                 diamond_params_taxid.diamond_tsv,
                 diamond_params_taxid.reads
             )
@@ -177,6 +178,7 @@ workflow TAXID_READS {
 
             EXTRACTCDIAMONDREADS(
                 diamond_combined_input.taxid,
+                params.evalue,
                 diamond_combined_input.diamond_tsv,
                 diamond_combined_input.reads,
             )