Skip to content

Commit

Permalink
update extract_viral_taxid and it produces the uniq list of taxid for…
Browse files Browse the repository at this point in the history
… diamond
  • Loading branch information
LilyAnderssonLee committed Jan 10, 2025
1 parent 176c5f9 commit 50055d2
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 16 deletions.
24 changes: 10 additions & 14 deletions bin/extractdiamondreads.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,32 @@

import argparse
import subprocess
import os

def parse_args(args=None):
Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification."
parser = argparse.ArgumentParser(description=Description)
parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.")
parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads")
parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.")
parser.add_argument("--prefix", required=True, help="Prefix for output files")
parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads")
parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads.")
parser.add_argument("--evalue", required=True, type=float, help="E-value threshold to filter the DIAMOND classification result.")
parser.add_argument("--prefix", required=True, help="Prefix for output files.")
parser.add_argument("--single_end", action="store_true", help="Flag for single-end reads.")
parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.")


return parser.parse_args(args)

def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix):
read_id_file = f"{prefix}_readID.txt"

# Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid.
# Step 1: Filter the DIAMOND tsv file and collect read IDs by taxid and e-value threshold
with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out:
for line in tsv:
parts = line.strip().split('\t')
current_taxid = parts[1]
if current_taxid == taxid and parts[2] < evalue:
current_evalue = float(parts[2])
if current_taxid == taxid and current_evalue < evalue:
out.write(parts[0] + "\n")

# Step2: Extract reads using seqkit
# Step 2: Extract reads using seqkit
output_files = []

if single_end:
Expand All @@ -42,8 +41,8 @@ def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, pre
subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True)
output_files.extend([output_file1, output_file2])

# Clean up temporary files
os.remove(read_id_file)
# Clean up temporary files (optional, uncomment if needed)
# os.remove(read_id_file)

return output_files

Expand All @@ -60,6 +59,3 @@ def main(args=None):

if __name__ == "__main__":
main()



4 changes: 2 additions & 2 deletions modules/local/extract_viral_taxid/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ process EXTRACT_VIRAL_TAXID {
awk -F'\t' '\$3 != 0 {print \$5}' ${report} > detected_taxid.txt
grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt > ${prefix}_viral_taxids.tsv
elif [[ "${meta.tool}" == "diamond" ]]; then
awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt
grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
awk '\$3 < ${evalue}' ${report} | cut -f 2 | sort | uniq > detected_taxid.txt
grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | sort | uniq > ${prefix}_viral_taxids.tsv
fi
fi
"""
Expand Down

0 comments on commit 50055d2

Please sign in to comment.