update extract_viral_taxid and it produces the uniq list of taxid for…

… diamond
genomic-medicine-sweden · Jan 10, 2025 · 50055d2 · 50055d2
1 parent 176c5f9
commit 50055d2
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 16 deletions.
diff --git a/bin/extractdiamondreads.py b/bin/extractdiamondreads.py
@@ -2,33 +2,32 @@
 
 import argparse
 import subprocess
-import os
 
 def parse_args(args=None):
     Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification."
     parser = argparse.ArgumentParser(description=Description)
     parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.")
-    parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads")
-    parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.")
-    parser.add_argument("--prefix", required=True, help="Prefix for output files")
-    parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads")
+    parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads.")
+    parser.add_argument("--evalue", required=True, type=float, help="E-value threshold to filter the DIAMOND classification result.")
+    parser.add_argument("--prefix", required=True, help="Prefix for output files.")
+    parser.add_argument("--single_end", action="store_true", help="Flag for single-end reads.")
     parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.")
 
-
     return parser.parse_args(args)
 
 def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix):
     read_id_file = f"{prefix}_readID.txt"
 
-    # Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid.
+    # Step 1: Filter the DIAMOND tsv file and collect read IDs by taxid and e-value threshold
     with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out:
         for line in tsv:
             parts = line.strip().split('\t')
             current_taxid = parts[1]
-            if current_taxid == taxid and parts[2] < evalue:
+            current_evalue = float(parts[2])
+            if current_taxid == taxid and current_evalue < evalue:
                 out.write(parts[0] + "\n")
 
-    # Step2: Extract reads using seqkit
+    # Step 2: Extract reads using seqkit
     output_files = []
 
     if single_end:
@@ -42,8 +41,8 @@ def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, pre
         subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True)
         output_files.extend([output_file1, output_file2])
 
-    # Clean up temporary files
-    os.remove(read_id_file)
+    # Clean up temporary files (optional, uncomment if needed)
+    # os.remove(read_id_file)
 
     return output_files
 
@@ -60,6 +59,3 @@ def main(args=None):
 
 if __name__ == "__main__":
     main()
-
-
-
diff --git a/modules/local/extract_viral_taxid/main.nf b/modules/local/extract_viral_taxid/main.nf
@@ -24,8 +24,8 @@ process EXTRACT_VIRAL_TAXID {
             awk -F'\t' '\$3 != 0 {print \$5}' ${report} > detected_taxid.txt
             grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt > ${prefix}_viral_taxids.tsv
         elif [[ "${meta.tool}" == "diamond" ]]; then
-            awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt
-            grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
+            awk '\$3 < ${evalue}' ${report} | cut -f 2 | sort | uniq > detected_taxid.txt
+            grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | sort | uniq > ${prefix}_viral_taxids.tsv
         fi
     fi
     """