Skip to content

Commit

Permalink
use python instead of awk to extract diamond reads
Browse files Browse the repository at this point in the history
  • Loading branch information
LilyAnderssonLee committed Jan 9, 2025
1 parent 0ada1f5 commit f8843f3
Show file tree
Hide file tree
Showing 14 changed files with 103 additions and 24 deletions.
1 change: 1 addition & 0 deletions .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ lint:
- docs/images/nf-core-metaval_logo_dark.png
- assets/samplesheet.csv
- .github/ISSUE_TEMPLATE/config.yml
- conf/test_full.config
files_unchanged:
- assets/sendmail_template.txt
- .github/CONTRIBUTING.md
Expand Down
65 changes: 65 additions & 0 deletions bin/extractdiamondreads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python

import argparse
import subprocess
import os

def parse_args(args=None):
Description = "Extract reads of specified taxonomic ID from the output of DIAMOND/blastx classification."
parser = argparse.ArgumentParser(description=Description)
parser.add_argument("--tsv", required=True, help="Path to the DIAMOND output TSV file.")
parser.add_argument("--taxid", required=True, help="Taxonomic ID to extract the reads")
parser.add_argument("--evalue", required=True, help="A e-value to filter the DIAMOND classification result.")
parser.add_argument("--prefix", required=True, help="Prefix for output files")
parser.add_argument("--single_end", action="store_true", help="Flag for single_end reads")
parser.add_argument("--fastq", required=True, nargs='+', help="Paths to input FASTQ files.")


return parser.parse_args(args)

def extract_reads_by_taxid(tsv_path, taxid, evalue, fastq_paths, single_end, prefix):
read_id_file = f"{prefix}_readID.txt"

# Step1: Filter the DIAMOND tsv file line-by-line and collect read IDs by taxid.
with open(tsv_path, 'r') as tsv, open(read_id_file, 'w') as out:
for line in tsv:
parts = line.strip().split('\t')
current_taxid = parts[1]
if current_taxid == taxid and parts[2] < evalue:
out.write(parts[0] + "\n")

# Step2: Extract reads using seqkit
output_files = []

if single_end:
output_file = f"{prefix}_{taxid}.extracted_diamond_reads.fastq"
subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file], check=True)
output_files.append(output_file)
else:
output_file1 = f"{prefix}_{taxid}.extracted_diamond_read1.fastq"
output_file2 = f"{prefix}_{taxid}.extracted_diamond_read2.fastq"
subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[0], "-o", output_file1], check=True)
subprocess.run(["seqkit", "grep", "-f", read_id_file, fastq_paths[1], "-o", output_file2], check=True)
output_files.extend([output_file1, output_file2])

# Clean up temporary files
os.remove(read_id_file)

return output_files

def main(args=None):
args = parse_args(args)
extract_reads_by_taxid(
tsv_path=args.tsv,
taxid=args.taxid,
evalue=args.evalue,
prefix=args.prefix,
single_end=args.single_end,
fastq_paths=args.fastq
)

if __name__ == "__main__":
main()



2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ process {
maxRetries = 3
}
withName: EXTRACTCDIAMONDREADS {
memory = { 60.GB * task.attempt }
memory = 180.GB
}
withName: SPADES {
errorStrategy = { task.exitStatus in [143,137,21,2,1,0] ? 'retry' : 'ignore' }
Expand Down
1 change: 1 addition & 0 deletions conf/test_denovo.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ params {
fastq_output = true
extract_centrifuge_reads = true
extract_diamond_reads = true
evalue = 1e-20

// de novo
perform_shortread_denovo = true
Expand Down
3 changes: 2 additions & 1 deletion conf/test_screenpathogen.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines a input taxid list to run a fast and simple pipeline test.
Test the pathogen screening workflow.
Use as follows:
nextflow run genomic-medicine-sweden/metaval -profile test_screenpathogen,<docker/singularity> --outdir <OUTDIR>
Expand Down Expand Up @@ -34,6 +34,7 @@ params {
fastq_output = false
extract_centrifuge_reads = false
extract_diamond_reads = false
evalue = 1e-20

// de novo
perform_shortread_denovo = false
Expand Down
1 change: 1 addition & 0 deletions conf/test_taxid.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ params {
fastq_output = true
extract_centrifuge_reads = true
extract_diamond_reads = true
evalue = 1e-20

// de novo
perform_shortread_denovo = true
Expand Down
1 change: 1 addition & 0 deletions conf/test_virus.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ params {
fastq_output = true
extract_centrifuge_reads = true
extract_diamond_reads = true
evalue = 1e-20

// de novo
perform_shortread_denovo = true
Expand Down
2 changes: 0 additions & 2 deletions modules/local/extract_viral_taxid/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ process EXTRACT_VIRAL_TAXID {
awk '\$3 < ${evalue}' ${report} | cut -f 2 | uniq > detected_taxid.txt
grep -F -w -f taxpasta_viral_taxid.txt detected_taxid.txt | uniq > ${prefix}_viral_taxids.tsv
fi
else
echo "No viral taxids found." > "no_viral_taxid.txt"
fi
"""
}
2 changes: 1 addition & 1 deletion modules/local/extract_viral_taxid/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ keywards:
input:
- evalue:
type: ["number", "integer"]
description: A e-vaule threshold to filter the diamond classification result
description: An e-value threshold to filter the diamond classification result
- - meta:
type: map
description: |
Expand Down
7 changes: 7 additions & 0 deletions modules/local/extractdiamondreads/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: extractdiamondreads
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::seqkit=2.9.0
- conda-forge::python=3.13.1
34 changes: 17 additions & 17 deletions modules/local/extractdiamondreads/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,41 @@ process EXTRACTCDIAMONDREADS {
tag "$meta.id"
label 'process_high'

conda "bioconda::seqkit=2.8.2"
conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/seqkit:2.8.2--h9ee0642_1':
'biocontainers/seqkit:2.8.2--h9ee0642_1' }"
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/11/1195d7edbe47145bda13bf6809891dc2fbe9df749c2e567b8879518b8de4ca33/data':
'community.wave.seqera.io/library/seqkit_python:cf97bbadc3675f5b' }"

input:
val taxid
val evalue
tuple val (meta), path(tsv)
tuple val (meta), path(fastq) // bowtie2/align *unmapped_{1,2}.fastq.gz
tuple val (meta), path(fastq)

output:
tuple val(meta), path("*.fastq"), optional:true, emit: extracted_diamond_reads
path "versions.yml" , emit: versions
tuple val(meta), path("*.fastq"), optional: true, emit: extracted_diamond_reads
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def single_end_flag = meta.single_end ? '--single_end' : ''

"""
awk -v taxID=$taxid '\$2 == taxID {print \$1}' $tsv > readID.txt
if [ ${meta.single_end} == 'true' ]; then
seqkit grep -f readID.txt $fastq > ${prefix}_${taxid}.extracted_diamond_read.fastq
elif [ "${meta.single_end}" == 'false' ]; then
seqkit grep -f readID.txt ${fastq[0]} > ${prefix}_${taxid}.extracted_diamond_read1.fastq
seqkit grep -f readID.txt ${fastq[1]} > ${prefix}_${taxid}.extracted_diamond_read2.fastq
fi
rm readID.txt
extractdiamondreads.py \\
--taxid $taxid \\
--tsv $tsv \\
--evalue $evalue \\
--prefix $prefix \\
$single_end_flag \\
--fastq ${fastq.join(' ')}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
seqkit: \$( seqkit version | sed 's/seqkit v//' )
seqkit: \$( seqkit version | sed -e 's/seqkit v//g' )
python: \$( python --version | sed -e 's/Python //g')
END_VERSIONS
"""
}
5 changes: 4 additions & 1 deletion modules/local/extractdiamondreads/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ name: extractdiamondreads
description: Use a custom python script to extract reads with specified taxonomic ID from the DIAMOND classification output
keywards:
- taxid
- DIAMOND/blastx
- DIAMOND
- tsv
- fastq
- extract_reads
input:
- taxid:
type: integer
description: A taxonomic ID to extract the reads
- evalue:
type: ["number", "integer"]
description: An e-value threshold to filter the diamond classification result
- meta:
type: map
description: |
Expand Down
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ profiles {
}
}
test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' }
test_taxid { includeConfig 'conf/test_taxid.config' }
test_virus { includeConfig 'conf/test_virus.config' }
test_denovo { includeConfig 'conf/test_denovo.config' }
Expand Down
2 changes: 2 additions & 0 deletions subworkflows/local/taxid_reads.nf
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ workflow TAXID_READS {

EXTRACTCDIAMONDREADS(
diamond_params_taxid.taxid,
params.evalue,
diamond_params_taxid.diamond_tsv,
diamond_params_taxid.reads
)
Expand Down Expand Up @@ -177,6 +178,7 @@ workflow TAXID_READS {

EXTRACTCDIAMONDREADS(
diamond_combined_input.taxid,
params.evalue,
diamond_combined_input.diamond_tsv,
diamond_combined_input.reads,
)
Expand Down

0 comments on commit f8843f3

Please sign in to comment.