Updates for v1.6.0
Better handle failing samples, add per-rule installation, and general updates
jaleezyy authored Mar 17, 2023
2 parents b8a6388 + b6e5992 · commit f66f2f7
Showing 10 changed files with 560 additions and 163 deletions.
152 changes: 104 additions & 48 deletions README.md

Large diffs are not rendered by default.

96 changes: 84 additions & 12 deletions Snakefile
@@ -127,7 +127,9 @@ rule clean_reads:
expand('{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz', sn=sample_names, r=[1,2])

rule consensus:
-    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
+           'all_genomes.fa',
+           # 'failed_samples.log'

rule ivar_variants:
input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names)
@@ -136,14 +138,15 @@ rule breseq:
input: expand('{sn}/breseq/output/index.html', sn=sample_names)

rule freebayes:
    input:
+        'all_freebayes_genomes.fa',
+        # 'failed_samples.log',
expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
'freebayes_lineage_assignments.tsv',
expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names),
expand('{sn}/freebayes/{sn}_consensus_compare.vcf', sn=sample_names)


rule coverage:
input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)

@@ -239,7 +242,8 @@ rule ncov_tools:
negative_control_prefix = config['negative_control_prefix'],
freebayes_run = config['run_freebayes'],
pangolin = versions['pangolin'],
-        mode = pango_speed
+        mode = pango_speed,
+        failed = 'failed_samples.log'
input:
consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names),
@@ -327,7 +331,7 @@ rule raw_reads_composite_reference_bwa_map:
shell:
'(bwa mem -t {threads} {params.composite_index} '
'{input.raw_r1} {input.raw_r2} | '
-        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'
+        "{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log} || echo '' > {output}"

rule get_host_removed_reads:
threads: 2
@@ -375,7 +379,7 @@ rule run_trimgalore:
shell:
'trim_galore --quality {params.min_qual} --length {params.min_len} '
' -o {params.output_prefix} --cores {threads} --fastqc '
-        '--paired {input.raw_r1} {input.raw_r2} 2> {log} || touch {output}'
+        "--paired {input.raw_r1} {input.raw_r2} 2> {log} || (echo -e 'Total reads processed: 0\nReads written (passing filters): 0 (0.0%)\nTotal basepairs processed: 0 bp\nTotal written (filtered): 0 bp (0.0%)' >> {log}; touch {output})"

rule run_filtering_of_residual_adapters:
threads: 2
@@ -769,6 +773,32 @@ rule run_quast_freebayes:
'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'

+rule collect_core_genomes:
+    output:
+        "all_genomes.fa"
+    input:
+        expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        count=''
+        echo "Samples that failed to assemble:" > failed_samples.log
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                sample=$(echo $line | cut -d'.' -f1 | cut -d'_' -f2)
+            else
+                count=$(echo $line | wc -c)
+                if [[ $count -eq 1 ]]; then
+                    echo $sample >> failed_samples.log
+                else
+                    continue
+                fi
+            fi
+        done < {output}
+        """

rule run_lineage_assignment:
threads: 4
conda: 'conda_envs/assign_lineages.yaml'
@@ -777,7 +807,7 @@ rule run_lineage_assignment:
nextclade_ver_out = 'input_nextclade_versions.txt',
lin_out = 'lineage_assignments.tsv'
input:
-        expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+        'all_genomes.fa'
params:
pangolin_ver = versions['pangolin'],
pangolearn_ver = versions['pangolearn'],
@@ -794,8 +824,51 @@ rule run_lineage_assignment:
shell:
"echo -e 'pangolin: {params.pangolin_ver}\nconstellations: {params.constellations_ver}\nscorpio: {params.scorpio_ver}\npangolearn: {params.pangolearn_ver}\npango-designation: {params.designation_ver}\npangolin-data: {params.data_ver}' > {output.pango_ver_out} && "
"echo -e 'nextclade: {params.nextclade_ver}\nnextclade-dataset: {params.nextclade_data}\nnextclade-include-recomb: {params.nextclade_recomb}' > {output.nextclade_ver_out} && "
-        'cat {input} > all_genomes.fa && '
-        '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
+        '{params.assignment_script_path} -i {input} -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'

+rule collect_freebayes_genomes:
+    output:
+        "all_freebayes_genomes.fa"
+    input:
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        seq=''
+        count=''
+        out=''
+        if [[ -f 'failed_samples.log' ]]; then
+            out='.failed_freebayes_samples.tmp'
+            cat failed_samples.log | sed 1,1d > $out
+            echo "Samples that failed to assemble:" > failed_samples.log
+        else
+            out='failed_samples.log'
+            echo "Samples that failed to assemble:" > $out
+        fi
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                if [[ $(echo $seq | wc -c) -eq 1 ]]; then # check if new seq
+                    count=$(echo $seq | grep -vc 'N')
+                    if [[ $count -eq 0 ]]; then
+                        echo $sample >> $out
+                    fi
+                    sample=$(echo $line | cut -d'>' -f2) # start new seq
+                    seq=''
+                else
+                    sample=$(echo $line | cut -d'>' -f2) # first seq
+                fi
+            else
+                seq+=$line # append seq
+            fi
+        done < {output}
+        if [[ ! $out == 'failed_samples.log' ]]; then
+            sort -b -d -f $out | uniq >> failed_samples.log
+            rm $out
+        fi
+        """

rule run_lineage_assignment_freebayes:
threads: 4
@@ -805,10 +878,9 @@ rule run_lineage_assignment_freebayes:
input:
p_vers = 'input_pangolin_versions.txt',
n_vers = 'input_nextclade_versions.txt',
-        consensus = expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+        consensus = 'all_freebayes_genomes.fa'
params:
analysis_mode = pango_speed,
assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py')
shell:
-        'cat {input.consensus} > all_freebayes_genomes.fa && '
-        '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
+        '{params.assignment_script_path} -i {input.consensus} -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
1 change: 0 additions & 1 deletion conda_envs/assign_lineages.yaml
@@ -11,7 +11,6 @@ dependencies:
- python>=3.7
- snakemake-minimal
- gofasta
-  - nodejs
- usher
- pandas
- pysam==0.16.0.1
55 changes: 41 additions & 14 deletions scripts/assign_lineages.py
@@ -8,6 +8,7 @@
import shutil
import os, sys
from datetime import datetime
+import json


def check_file(path: str) -> Path:
@@ -136,27 +137,53 @@ def update_nextclade_dataset(vers, skip):

# If specific tag requested, attempt to install, otherwise install latest
accession = 'MN908947'
+    current_tag = None
+    if os.path.exists(os.path.join(output_dir, 'tag.json')):
+        j = open(os.path.join(output_dir, 'tag.json'))
+        data = json.load(j)
+        current_tag = data['tag']
+        j.close()
    if requested is not None:
+        # check existing database, if found
+        if requested == current_tag:
+            print(f"Nextclade dataset {requested} already installed! Skipping update!")
+        else:
+            try:
+                print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+                subprocess.run(f"nextclade dataset get "
+                               f"--name '{dataset}' "
+                               f"--reference '{accession}' "
+                               f"--tag {requested} "
+                               f"--output-dir '{output_dir}'", shell=True, check=True)
+            except subprocess.CalledProcessError:
+                print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
+                try:
+                    subprocess.run(f"nextclade dataset get "
+                                   f"--name '{dataset}' "
+                                   f"--reference '{accession}' "
+                                   f"--output-dir '{output_dir}'", shell=True, check=True)
+                except subprocess.CalledProcessError:
+                    if current_tag is not None:
+                        print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                        requested = current_tag
+                    else:
+                        print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                        requested = "Unknown"
+    else:
        try:
-            print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+            print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
            subprocess.run(f"nextclade dataset get "
                           f"--name '{dataset}' "
                           f"--reference '{accession}' "
-                          f"--tag {requested} "
                           f"--output-dir '{output_dir}'", shell=True, check=True)
        except subprocess.CalledProcessError:
-            print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
-            subprocess.run(f"nextclade dataset get "
-                           f"--name '{dataset}' "
-                           f"--reference '{accession}' "
-                           f"--output-dir '{output_dir}'", shell=True, check=True)
-    else:
-        print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
-        subprocess.run(f"nextclade dataset get "
-                       f"--name '{dataset}' "
-                       f"--reference '{accession}' "
-                       f"--output-dir '{output_dir}'", shell=True, check=True)

+            if current_tag is not None:
+                print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                requested = current_tag
+            else:
+                print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                requested = "Unknown"

# Obtain final version information for output
nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
if nextclade_version.startswith("nextclade"):
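
Net effect of the change: the installed dataset's tag.json is read first, a matching tag skips the download entirely, and a failed download falls back to whatever dataset is already on disk. A minimal sketch of the tag lookup, assuming tag.json carries at least a 'tag' key (the example value in the comment is made up):

    # Hedged sketch of the tag check; the tag value shown is a made-up example.
    import json, os

    def installed_tag(output_dir):
        tag_file = os.path.join(output_dir, "tag.json")  # e.g. {"tag": "2023-03-14T12:00:00Z"}
        if not os.path.exists(tag_file):
            return None
        with open(tag_file) as fh:
            return json.load(fh).get("tag")

    # The script skips the download when the requested tag matches:
    # if requested == installed_tag(output_dir): ... already installed ...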
66 changes: 46 additions & 20 deletions scripts/get_data_dependencies.sh
@@ -14,34 +14,42 @@ accession="MN908947.3"

HELP="""
Flags:
    -d : Directory to configure database within (~10GB)
    -a : Accession to use as viral reference (default=MN908947.3)
"""

while getopts ":d:a:" option; do
    case "${option}" in
        d) database_dir=$OPTARG;;
        a) accession=$OPTARG;;
    esac
done

if [ $database_dir = 0 ] ; then
echo "You must specify a data directory to install data dependencies."
echo "$HELP"
exit 1
echo "You must specify a data directory to install data dependencies."
echo "$HELP"
exit 1
fi

echo -e "Warning: \n - final databases require ~10GB of storage\n - building databases temporarily requires a peak of ~35GB of storage and ~4GB of memory \n - script takes up to ~1.5 hours (system depending)"

# make database dir and get abspath to it
-mkdir -p $database_dir
+if [ ! -d $database_dir ]; then mkdir -p $database_dir; fi
database_dir=$(realpath $database_dir)

# use curl to grab "simple data dependencies"
curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
if [ ! -f $database_dir/'NexteraPE-PE.fa' ]; then
curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
fi
if [ ! -f $database_dir/${accession}.gbk ]; then
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
fi
if [ ! -f $database_dir/${accession}.gff3 ]; then
curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
fi
if [ ! -f $database_dir/${accession}.fasta ]; then
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
fi

# install and activate env for kraken/bwa to build their databases/index
CONDA_BASE=$($CONDA_EXE info --base)
@@ -51,19 +59,37 @@ conda activate data_dependencies

# get the GRCh38 human genome
# as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
if [ ! -f $database_dir/"GRC38_no_alt_analysis_set.fna" ]; then
curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
fi

# create composite reference of human and virus for competitive bwt mapping
# based host removal
+if [ ! -f $database_dir/'composite_human_viral_reference.fna' ]; then
cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
-bwa index $database_dir/composite_human_viral_reference.fna
+fi
+for file in $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}; do
+    if [ ! -f $file ]; then
+        bwa index $database_dir/composite_human_viral_reference.fna
+        break
+    else
+        continue
+    fi
+done

# get kraken2 viral db
mkdir -p $database_dir/Kraken2/db
curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
cd $database_dir/Kraken2/db
tar xvf k2_viral_20210517.tar.gz
for file in $database_dir/Kraken2/db/{hash,opts,taxo}.k2d; do
if [ ! -f $file ]; then
curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
cd $database_dir/Kraken2/db
tar xvf k2_viral_20210517.tar.gz
break
else
continue
fi
done

# create blank fasta for 'phylo_include_seqs'
touch $database_dir/blank.fasta
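
With every download and index step now behind an existence check, re-running the script after a partial failure only redoes the missing pieces. A tiny illustration of the same skip-if-present idiom (function name and arguments are examples, not the script's interface):

    # Illustrative only: download a file unless it already exists.
    import os, subprocess

    def fetch_once(url, dest):
        if os.path.exists(dest):
            return  # same idea as the [ ! -f ... ] guards above
        subprocess.run(["curl", "-s", "-o", dest, url], check=True)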
