From f8843f3903dc3bb71b996566b5f129af04e6e22d Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 9 Jan 2025 17:14:45 +0100 Subject: [PATCH] use python instead of awk to extract diamond reads --- .nf-core.yml | 1 + bin/extractdiamondreads.py | 65 +++++++++++++++++++ conf/base.config | 2 +- conf/test_denovo.config | 1 + conf/test_screenpathogen.config | 3 +- conf/test_taxid.config | 1 + conf/test_virus.config | 1 + modules/local/extract_viral_taxid/main.nf | 2 - modules/local/extract_viral_taxid/meta.yml | 2 +- .../local/extractdiamondreads/environment.yml | 7 ++ modules/local/extractdiamondreads/main.nf | 34 +++++----- modules/local/extractdiamondreads/meta.yml | 5 +- nextflow.config | 1 - subworkflows/local/taxid_reads.nf | 2 + 14 files changed, 103 insertions(+), 24 deletions(-) create mode 100755 bin/extractdiamondreads.py create mode 100644 modules/local/extractdiamondreads/environment.yml diff --git a/.nf-core.yml b/.nf-core.yml index 45a97f5..894fe85 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -5,6 +5,7 @@ lint: - docs/images/nf-core-metaval_logo_dark.png - assets/samplesheet.csv - .github/ISSUE_TEMPLATE/config.yml + - conf/test_full.config files_unchanged: - assets/sendmail_template.txt - .github/CONTRIBUTING.md diff --git a/bin/extractdiamondreads.py b/bin/extractdiamondreads.py new file mode 100755 index 0000000..759badb --- /dev/null +++ b/bin/extractdiamondreads.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import argparse +import subprocess +import os + +def parse_args(args=None): + Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification." + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.") + parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads") + parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.") + parser.add_argument("--prefix", required=True, help="Prefix for output files") + parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads") + parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.") + + + return parser.parse_args(args) + +def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix): + read_id_file = f"{prefix}_readID.txt" + + # Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid. + with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out: + for line in tsv: + parts = line.strip().split('\t') + current_taxid = parts[1] + if current_taxid == taxid and parts[2] < evalue: + out.write(parts[0] + "\n") + + # Step2: Extract reads using seqkit + output_files = [] + + if single_end: + output_file = f"{prefix}_{taxid}.extracted_diamond_reads.fastq" + subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file], check=True) + output_files.append(output_file) + else: + output_file1 = f"{prefix}_{taxid}.extracted_diamond_read1.fastq" + output_file2 = f"{prefix}_{taxid}.extracted_diamond_read2.fastq" + subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file1], check=True) + subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True) + output_files.extend([output_file1, output_file2]) + + # Clean up temporary files + os.remove(read_id_file) + + return output_files + +def main(args=None): + args = parse_args(args) + extract_reads_by_taxid( + tsv_path=args.tsv, + taxid=args.taxid, + evalue=args.evalue, + prefix=args.prefix, + single_end=args.single_end, + fastq_paths=args.fastq + ) + +if __name__ == "__main__": + main() + + + diff --git a/conf/base.config b/conf/base.config index 43d57f3..87623a1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -60,7 +60,7 @@ process { maxRetries = 3 } withName: EXTRACTCDIAMONDREADS { - memory = { 60.GB * task.attempt } + memory = 180.GB } withName: SPADES { errorStrategy = { task.exitStatus in [143,137,21,2,1,0] ? 'retry' : 'ignore' } diff --git a/conf/test_denovo.config b/conf/test_denovo.config index eb05437..8c2488c 100644 --- a/conf/test_denovo.config +++ b/conf/test_denovo.config @@ -33,6 +33,7 @@ params { fastq_output = true extract_centrifuge_reads = true extract_diamond_reads = true + evalue = 1e-20 // de novo perform_shortread_denovo = true diff --git a/conf/test_screenpathogen.config b/conf/test_screenpathogen.config index c9f79d7..c1da2d9 100644 --- a/conf/test_screenpathogen.config +++ b/conf/test_screenpathogen.config @@ -2,7 +2,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running minimal tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines a input taxid list to run a fast and simple pipeline test. + Test the pathogen screening workflow. Use as follows: nextflow run genomic-medicine-sweden/metaval -profile test_screenpathogen, --outdir @@ -34,6 +34,7 @@ params { fastq_output = false extract_centrifuge_reads = false extract_diamond_reads = false + evalue = 1e-20 // de novo perform_shortread_denovo = false diff --git a/conf/test_taxid.config b/conf/test_taxid.config index b56cc0b..73fc051 100644 --- a/conf/test_taxid.config +++ b/conf/test_taxid.config @@ -34,6 +34,7 @@ params { fastq_output = true extract_centrifuge_reads = true extract_diamond_reads = true + evalue = 1e-20 // de novo perform_shortread_denovo = true diff --git a/conf/test_virus.config b/conf/test_virus.config index 3c13efa..877a400 100644 --- a/conf/test_virus.config +++ b/conf/test_virus.config @@ -33,6 +33,7 @@ params { fastq_output = true extract_centrifuge_reads = true extract_diamond_reads = true + evalue = 1e-20 // de novo perform_shortread_denovo = true diff --git a/modules/local/extract_viral_taxid/main.nf b/modules/local/extract_viral_taxid/main.nf index 3a6b233..f9960e9 100644 --- a/modules/local/extract_viral_taxid/main.nf +++ b/modules/local/extract_viral_taxid/main.nf @@ -27,8 +27,6 @@ process EXTRACT_VIRAL_TAXID { awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv fi - else - echo "No viral taxids found." > "no_viral_taxid.txt" fi """ } diff --git a/modules/local/extract_viral_taxid/meta.yml b/modules/local/extract_viral_taxid/meta.yml index 6ed01a3..e62e71f 100644 --- a/modules/local/extract_viral_taxid/meta.yml +++ b/modules/local/extract_viral_taxid/meta.yml @@ -10,7 +10,7 @@ keywards: input: - evalue: type: ["number", "integer"] - description: A e-vaule threshold to filter the diamond classification result + description: An e-value threshold to filter the diamond classification result - - meta: type: map description: | diff --git a/modules/local/extractdiamondreads/environment.yml b/modules/local/extractdiamondreads/environment.yml new file mode 100644 index 0000000..49335a4 --- /dev/null +++ b/modules/local/extractdiamondreads/environment.yml @@ -0,0 +1,7 @@ +name: extractdiamondreads +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 + - conda-forge::python=3.13.1 diff --git a/modules/local/extractdiamondreads/main.nf b/modules/local/extractdiamondreads/main.nf index 0cac464..c6678c0 100644 --- a/modules/local/extractdiamondreads/main.nf +++ b/modules/local/extractdiamondreads/main.nf @@ -3,41 +3,41 @@ process EXTRACTCDIAMONDREADS { tag "$meta.id" label 'process_high' - conda "bioconda::seqkit=2.8.2" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1': - 'biocontainers/seqkit:2.8.2--h9ee0642_1' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/11/1195d7edbe47145bda13bf6809891dc2fbe9df749c2e567b8879518b8de4ca33/data': + 'community.wave.seqera.io/library/seqkit_python:cf97bbadc3675f5b' }" input: val taxid + val evalue tuple val (meta), path(tsv) - tuple val (meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz + tuple val (meta), path(fastq) output: - tuple val(meta), path("*.fastq"), optional:true, emit: extracted_diamond_reads - path "versions.yml" , emit: versions + tuple val(meta), path("*.fastq"), optional: true, emit: extracted_diamond_reads + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def single_end_flag = meta.single_end ? '--single_end' : '' """ - awk -v taxID=$taxid '\$2 == taxID {print \$1}' $tsv > readID.txt - if [ ${meta.single_end} == 'true' ]; then - seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_diamond_read.fastq - elif [ "${meta.single_end}" == 'false' ]; then - seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_diamond_read1.fastq - seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_diamond_read2.fastq - fi - - rm readID.txt + extractdiamondreads.py \\ + --taxid $taxid \\ + --tsv $tsv \\ + --evalue $evalue \\ + --prefix $prefix \\ + $single_end_flag \\ + --fastq ${fastq.join(' ')} cat <<-END_VERSIONS > versions.yml "${task.process}": - seqkit: \$( seqkit version | sed 's/seqkit v//' ) + seqkit: \$( seqkit version | sed -e 's/seqkit v//g' ) + python: \$( python --version | sed -e 's/Python //g') END_VERSIONS """ } diff --git a/modules/local/extractdiamondreads/meta.yml b/modules/local/extractdiamondreads/meta.yml index 4e314ae..36c0278 100644 --- a/modules/local/extractdiamondreads/meta.yml +++ b/modules/local/extractdiamondreads/meta.yml @@ -2,7 +2,7 @@ name: extractdiamondreads description: Use a custom python script to extract reads with specified taxonomic ID from the DIAMOND classification output keywards: - taxid - - DIAMOND/blastx + - DIAMOND - tsv - fastq - extract_reads @@ -10,6 +10,9 @@ input: - taxid: type: integer description: A taxonomic ID to extract the reads + - evalue: + type: ["number", "integer"] + description: An e-value threshold to filter the diamond classification result - meta: type: map description: | diff --git a/nextflow.config b/nextflow.config index 8e47cc6..f787b31 100644 --- a/nextflow.config +++ b/nextflow.config @@ -192,7 +192,6 @@ profiles { } } test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } test_taxid { includeConfig 'conf/test_taxid.config' } test_virus { includeConfig 'conf/test_virus.config' } test_denovo { includeConfig 'conf/test_denovo.config' } diff --git a/subworkflows/local/taxid_reads.nf b/subworkflows/local/taxid_reads.nf index 2d802eb..895f458 100644 --- a/subworkflows/local/taxid_reads.nf +++ b/subworkflows/local/taxid_reads.nf @@ -149,6 +149,7 @@ workflow TAXID_READS { EXTRACTCDIAMONDREADS( diamond_params_taxid.taxid, + params.evalue, diamond_params_taxid.diamond_tsv, diamond_params_taxid.reads ) @@ -177,6 +178,7 @@ workflow TAXID_READS { EXTRACTCDIAMONDREADS( diamond_combined_input.taxid, + params.evalue, diamond_combined_input.diamond_tsv, diamond_combined_input.reads, )