Updates for v1.6.0
Better handle failing samples, add per-rule installation, and general updates
jaleezyy authored Mar 17, 2023
2 parents b8a6388 + b6e5992 · commit f66f2f7
Showing 10 changed files with 560 additions and 163 deletions.
152 changes: 104 additions & 48 deletions README.md

Large diffs are not rendered by default.

96 changes: 84 additions & 12 deletions Snakefile
@@ -127,7 +127,9 @@ rule clean_reads:
expand('{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz', sn=sample_names, r=[1,2])

rule consensus:
-    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
+           'all_genomes.fa',
+           # 'failed_samples.log'

rule ivar_variants:
input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names)
@@ -136,14 +138,15 @@ rule breseq:
input: expand('{sn}/breseq/output/index.html', sn=sample_names)

rule freebayes:
    input:
+        'all_freebayes_genomes.fa',
+        # 'failed_samples.log',
expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
'freebayes_lineage_assignments.tsv',
expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names),
expand('{sn}/freebayes/{sn}_consensus_compare.vcf', sn=sample_names)


rule coverage:
input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)

@@ -239,7 +242,8 @@ rule ncov_tools:
negative_control_prefix = config['negative_control_prefix'],
freebayes_run = config['run_freebayes'],
pangolin = versions['pangolin'],
-        mode = pango_speed
+        mode = pango_speed,
+        failed = 'failed_samples.log'
input:
consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names),
@@ -327,7 +331,7 @@ rule raw_reads_composite_reference_bwa_map:
shell:
'(bwa mem -t {threads} {params.composite_index} '
'{input.raw_r1} {input.raw_r2} | '
-        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'
+        "{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log} || echo '' > {output}"

rule get_host_removed_reads:
threads: 2
@@ -375,7 +379,7 @@ rule run_trimgalore:
shell:
'trim_galore --quality {params.min_qual} --length {params.min_len} '
' -o {params.output_prefix} --cores {threads} --fastqc '
-        '--paired {input.raw_r1} {input.raw_r2} 2> {log} || touch {output}'
+        "--paired {input.raw_r1} {input.raw_r2} 2> {log} || (echo -e 'Total reads processed: 0\nReads written (passing filters): 0 (0.0%)\nTotal basepairs processed: 0 bp\nTotal written (filtered): 0 bp (0.0%)' >> {log}; touch {output})"

rule run_filtering_of_residual_adapters:
threads: 2
@@ -769,6 +773,32 @@ rule run_quast_freebayes:
'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'

+rule collect_core_genomes:
+    output:
+        "all_genomes.fa"
+    input:
+        expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        count=''
+        echo "Samples that failed to assemble:" > failed_samples.log
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                sample=$(echo $line | cut -d'.' -f1 | cut -d'_' -f2)
+            else
+                count=$(echo $line | wc -c)
+                if [[ $count -eq 1 ]]; then
+                    echo $sample >> failed_samples.log
+                else
+                    continue
+                fi
+            fi
+        done < {output}
+        """

rule run_lineage_assignment:
threads: 4
conda: 'conda_envs/assign_lineages.yaml'
@@ -777,7 +807,7 @@ rule run_lineage_assignment:
nextclade_ver_out = 'input_nextclade_versions.txt',
lin_out = 'lineage_assignments.tsv'
input:
-        expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+        'all_genomes.fa'
params:
pangolin_ver = versions['pangolin'],
pangolearn_ver = versions['pangolearn'],
@@ -794,8 +824,51 @@ rule run_lineage_assignment:
shell:
"echo -e 'pangolin: {params.pangolin_ver}\nconstellations: {params.constellations_ver}\nscorpio: {params.scorpio_ver}\npangolearn: {params.pangolearn_ver}\npango-designation: {params.designation_ver}\npangolin-data: {params.data_ver}' > {output.pango_ver_out} && "
"echo -e 'nextclade: {params.nextclade_ver}\nnextclade-dataset: {params.nextclade_data}\nnextclade-include-recomb: {params.nextclade_recomb}' > {output.nextclade_ver_out} && "
-        'cat {input} > all_genomes.fa && '
-        '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
+        '{params.assignment_script_path} -i {input} -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'

+rule collect_freebayes_genomes:
+    output:
+        "all_freebayes_genomes.fa"
+    input:
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        seq=''
+        count=''
+        out=''
+        if [[ -f 'failed_samples.log' ]]; then
+            out='.failed_freebayes_samples.tmp'
+            cat failed_samples.log | sed 1,1d > $out
+            echo "Samples that failed to assemble:" > failed_samples.log
+        else
+            out='failed_samples.log'
+            echo "Samples that failed to assemble:" > $out
+        fi
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                if [[ $(echo $seq | wc -c) -eq 1 ]]; then # check if new seq
+                    count=$(echo $seq | grep -vc 'N')
+                    if [[ $count -eq 0 ]]; then
+                        echo $sample >> $out
+                    fi
+                    sample=$(echo $line | cut -d'>' -f2) # start new seq
+                    seq=''
+                else
+                    sample=$(echo $line | cut -d'>' -f2) # first seq
+                fi
+            else
+                seq+=$line # append seq
+            fi
+        done < {output}
+        if [[ ! $out == 'failed_samples.log' ]]; then
+            sort -b -d -f $out | uniq >> failed_samples.log
+            rm $out
+        fi
+        """

rule run_lineage_assignment_freebayes:
threads: 4
@@ -805,10 +878,9 @@ rule run_lineage_assignment_freebayes:
input:
p_vers = 'input_pangolin_versions.txt',
n_vers = 'input_nextclade_versions.txt',
-        consensus = expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+        consensus = 'all_freebayes_genomes.fa'
params:
analysis_mode = pango_speed,
assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py')
shell:
-        'cat {input.consensus} > all_freebayes_genomes.fa && '
-        '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
+        '{params.assignment_script_path} -i {input.consensus} -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
1 change: 0 additions & 1 deletion conda_envs/assign_lineages.yaml
@@ -11,7 +11,6 @@ dependencies:
- python>=3.7
- snakemake-minimal
- gofasta
-  - nodejs
- usher
- pandas
- pysam==0.16.0.1
55 changes: 41 additions & 14 deletions scripts/assign_lineages.py
@@ -8,6 +8,7 @@
import shutil
import os, sys
from datetime import datetime
+import json


def check_file(path: str) -> Path:
@@ -136,27 +137,53 @@ def update_nextclade_dataset(vers, skip):

# If specific tag requested, attempt to install, otherwise install latest
accession = 'MN908947'
+    current_tag = None
+    if os.path.exists(os.path.join(output_dir, 'tag.json')):
+        j = open(os.path.join(output_dir, 'tag.json'))
+        data = json.load(j)
+        current_tag = data['tag']
+        j.close()
    if requested is not None:
+        # check existing database, if found
+        if requested == current_tag:
+            print(f"Nextclade dataset {requested} already installed! Skipping update!")
+        else:
+            try:
+                print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+                subprocess.run(f"nextclade dataset get "
+                               f"--name '{dataset}' "
+                               f"--reference '{accession}' "
+                               f"--tag {requested} "
+                               f"--output-dir '{output_dir}'", shell=True, check=True)
+            except subprocess.CalledProcessError:
+                print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
+                try:
+                    subprocess.run(f"nextclade dataset get "
+                                   f"--name '{dataset}' "
+                                   f"--reference '{accession}' "
+                                   f"--output-dir '{output_dir}'", shell=True, check=True)
+                except subprocess.CalledProcessError:
+                    if current_tag is not None:
+                        print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                        requested = current_tag
+                    else:
+                        print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                        requested = "Unknown"
+    else:
        try:
-            print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+            print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
            subprocess.run(f"nextclade dataset get "
                           f"--name '{dataset}' "
                           f"--reference '{accession}' "
-                          f"--tag {requested} "
                           f"--output-dir '{output_dir}'", shell=True, check=True)
        except subprocess.CalledProcessError:
-            print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
-            subprocess.run(f"nextclade dataset get "
-                           f"--name '{dataset}' "
-                           f"--reference '{accession}' "
-                           f"--output-dir '{output_dir}'", shell=True, check=True)
-    else:
-        print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
-        subprocess.run(f"nextclade dataset get "
-                       f"--name '{dataset}' "
-                       f"--reference '{accession}' "
-                       f"--output-dir '{output_dir}'", shell=True, check=True)

+            if current_tag is not None:
+                print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                requested = current_tag
+            else:
+                print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                requested = "Unknown"

# Obtain final version information for output
nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
if nextclade_version.startswith("nextclade"):
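
Net effect of the change: the installed dataset's tag.json is read first, a matching tag skips the download entirely, and a failed download falls back to whatever dataset is already on disk. A minimal sketch of the tag lookup, assuming tag.json carries at least a 'tag' key (the example value in the comment is made up):

    # Hedged sketch of the tag check; the tag value shown is a made-up example.
    import json, os

    def installed_tag(output_dir):
        tag_file = os.path.join(output_dir, "tag.json")  # e.g. {"tag": "2023-03-14T12:00:00Z"}
        if not os.path.exists(tag_file):
            return None
        with open(tag_file) as fh:
            return json.load(fh).get("tag")

    # The script skips the download when the requested tag matches:
    # if requested == installed_tag(output_dir): ... already installed ...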
66 changes: 46 additions & 20 deletions scripts/get_data_dependencies.sh
@@ -14,34 +14,42 @@ accession="MN908947.3"

HELP="""
Flags:
    -d : Directory to configure database within (~10GB)
    -a : Accession to use as viral reference (default=MN908947.3)
"""

while getopts ":d:a:" option; do
    case "${option}" in
        d) database_dir=$OPTARG;;
        a) accession=$OPTARG;;
    esac
done

if [ $database_dir = 0 ] ; then
echo "You must specify a data directory to install data dependencies."
echo "$HELP"
exit 1
echo "You must specify a data directory to install data dependencies."
echo "$HELP"
exit 1
fi

echo -e "Warning: \n - final databases require ~10GB of storage\n - building databases temporarily requires a peak of ~35GB of storage and ~4GB of memory \n - script takes up to ~1.5 hours (system depending)"

# make database dir and get abspath to it
-mkdir -p $database_dir
+if [ ! -d $database_dir ]; then mkdir -p $database_dir; fi
database_dir=$(realpath $database_dir)

# use curl to grab "simple data dependencies"
curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
if [ ! -f $database_dir/'NexteraPE-PE.fa' ]; then
curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
fi
if [ ! -f $database_dir/${accession}.gbk ]; then
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
fi
if [ ! -f $database_dir/${accession}.gff3 ]; then
curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
fi
if [ ! -f $database_dir/${accession}.fasta ]; then
curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
fi

# install and activate env for kraken/bwa to build their databases/index
CONDA_BASE=$($CONDA_EXE info --base)
@@ -51,19 +59,37 @@ conda activate data_dependencies

# get the GRCh38 human genome
# as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
if [ ! -f $database_dir/"GRC38_no_alt_analysis_set.fna" ]; then
curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
fi

# create composite reference of human and virus for competitive bwt mapping
# based host removal
+if [ ! -f $database_dir/'composite_human_viral_reference.fna' ]; then
cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
-bwa index $database_dir/composite_human_viral_reference.fna
+fi
+for file in $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}; do
+    if [ ! -f $file ]; then
+        bwa index $database_dir/composite_human_viral_reference.fna
+        break
+    else
+        continue
+    fi
+done

# get kraken2 viral db
mkdir -p $database_dir/Kraken2/db
curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
cd $database_dir/Kraken2/db
tar xvf k2_viral_20210517.tar.gz
for file in $database_dir/Kraken2/db/{hash,opts,taxo}.k2d; do
if [ ! -f $file ]; then
curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
cd $database_dir/Kraken2/db
tar xvf k2_viral_20210517.tar.gz
break
else
continue
fi
done

# create blank fasta for 'phylo_include_seqs'
touch $database_dir/blank.fasta
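
With every download and index step now behind an existence check, re-running the script after a partial failure only redoes the missing pieces. A tiny illustration of the same skip-if-present idiom (function name and arguments are examples, not the script's interface):

    # Illustrative only: download a file unless it already exists.
    import os, subprocess

    def fetch_once(url, dest):
        if os.path.exists(dest):
            return  # same idea as the [ ! -f ... ] guards above
        subprocess.run(["curl", "-s", "-o", dest, url], check=True)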
