diff --git a/README.md b/README.md
index f9cbea7..a5ec410 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ so alternatively install `mamba` and use that (snakemake has beta support for it
     conda install -c conda-forge mamba
     mamba create -c conda-forge -c bioconda -n signal snakemake pandas conda mamba
     conda activate signal
+    # mamba activate signal is equivalent
 
 Additional software dependencies are managed directly by `snakemake` using conda environment files:
 
@@ -79,31 +80,27 @@ Additional software dependencies are managed directly by `snakemake` using conda
 
 ## SIGNAL Help Screen:
 
-Using the provided `signal.py` script, the majority of SIGNAL functions can be accessed easily.
+Using the provided `signalexe.py` script, the majority of SIGNAL functions can be accessed easily.
 
 To display the help screen:
 
 ```
-python signal.py -h
+python signalexe.py -h
 
-Output:
-usage: signal.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq]
-                 [-neg NEG_PREFIX] [--dependencies] [-ri] [--unlock] [-F] [-n] [--verbose] [-v]
-                 [all ...] [postprocess ...] [ncov_tools ...]
+usage: signalexe.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq] [-neg NEG_PREFIX] [--dependencies] [--data DATA] [-ri] [-ii] [--unlock]
+                    [-F] [-n] [--quiet] [--verbose] [-v]
+                    [all ...] [postprocess ...] [ncov_tools ...] [install ...]
 
-SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly +
-variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome
-Coronavirus 2 (SARS-CoV-2).
+SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards
+the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).
 
 positional arguments:
-  all                  Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--
-                       directory' required. The latter will automatically generate a configuration file and sample table. If
-                       both provided, then '--configfile' will take priority
-  postprocess          Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '
-                       --directory' is provided
-  ncov_tools           Generate configuration file and filesystem setup required and then execute ncov-tools quality control
-                       assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--
-                       directory' is provided
+  all                  Run SIGNAL with all associated assembly rules. Does not include postprocessing. '--configfile' or '--directory' required. The latter will automatically generate a
+                       configuration file and sample table. If both provided, then '--configfile' will take priority
+  postprocess          Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided
+  ncov_tools           Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required
+                       but will be generated if '--directory' is provided
+  install              Install individual rule environments and ensure SIGNAL is functional. The only operable parameter will be '--data'. Will override other operations!
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -113,49 +110,57 @@ optional arguments:
                         Path to directory containing reads. Will be used to generate sample table and configuration file
   --cores CORES         Number of cores. Default = 1
   --config-only         Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required
-  --remove-freebayes    Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall
-                        speed)
-  --add-breseq          Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for
-                        analysis to complete)
+  --remove-freebayes    Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)
+  --add-breseq          Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)
   -neg NEG_PREFIX, --neg-prefix NEG_PREFIX
-                        Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or
-                        prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will
-                        be left empty, if not provided
-  --dependencies        Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit.
-                        Note: Will override other flags! (~10 GB storage required)
+                        Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc.
+                        Recommended if running ncov-tools. Will be left empty, if not provided
+  --dependencies        Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)
+  --data DATA           SIGNAL install and data dependencies parameter. Set location for data dependencies. If '--dependencies' is run, a folder will be created in the specified directory. If '--
+                        config-only' or '--directory' is used, the value will be applied to the configuration file. (Upcoming feature): When used with 'SIGNAL install', any tests run will use the
+                        dependencies located at this directory. Default = 'data'
   -ri, --rerun-incomplete
                         Snakemake parameter. Re-run any incomplete samples from a previously failed run
+  -ii, --ignore-incomplete
+                        Snakemake parameter. Do not check for incomplete output files
   --unlock              Snakemake parameter. Remove a lock on the working directory after a failed run
   -F, --forceall        Snakemake parameter. Force the re-run of all rules regardless of prior output
   -n, --dry-run         Snakemake parameter. Do not execute anything and only display what would be done
+  --quiet               Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run', it will just display a summary of the DAG of jobs
   --verbose             Snakemake parameter. Display snakemake debugging output
   -v, --version         Display version number
 ```
 
 ## Summary:
 
-`signal.py` simplies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads.
+`signalexe.py` simplifies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads.
 
 ```
-# Download dependances (only needs to be run once; ~10GB of storage required)
-python signal.py --dependencies
+# Download dependencies (only needs to be run once; ~10GB of storage required)
+# --data flag allows you to rename and relocate dependencies directory
+python signalexe.py --data data --dependencies
 
-# Generate configuration file and sample table (--neg_prefix can be used to note negative controls)
-python signal.py --config-only --directory /path/to/reads
+# Generate configuration file and sample table
+# --neg_prefix can be used to note negative controls
+# --data can be used to specify location of data dependencies
+python signalexe.py --config-only --directory /path/to/reads
 
 # Execute pipeline (step-by-step; --cores defaults to 1 if not provided)
-python signal.py --configfile config.yaml --cores NCORES aLL
-python signal.py --configfile config.yaml --cores NCORES postprocess
-python signal.py --configfile config.yaml --cores NCORES ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --configfile config.yaml --cores NCORES all
+python signalexe.py --configfile config.yaml --cores NCORES postprocess
+python signalexe.py --configfile config.yaml --cores NCORES ncov_tools
 
 # ALTERNATIVE
 # Execute pipeline (one line)
-python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools
 
 # ALTERNATIVE
 # Execute pipeline (one line; no prior configuration file or sample table steps)
 # --directory can be used in place of --configfile to automatically generate a configuration file
-python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools
 ```
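Since the wrapper passes `--dry-run` and `--quiet` straight through to snakemake, the planned jobs can be previewed before committing to a full run. A small sketch (the configuration file name is illustrative):

```
# list the jobs SIGNAL would run, without executing anything
python signalexe.py --configfile config.yaml --cores 4 --dry-run all

# same, but condensed to a summary of the DAG of jobs
python signalexe.py --configfile config.yaml --cores 4 --dry-run --quiet all
```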
 
 Each of the steps in SIGNAL can be run **manually** by accessing the individual scripts or running snakemake.
 
@@ -187,8 +192,9 @@ The pipeline requires:
 - kraken2 viral database
 - Human GRCh38 reference fasta (for composite human-viral BWA index)
 
-	python signal.py --dependencies
+	python signalexe.py --dependencies
 
 	# defaults to a directory called `data` in repository root
+	# --data can be used to rename and relocate the resultant directory
 
 OR
 
@@ -197,14 +203,24 @@ The pipeline requires:
 
 **Note: Downloading the database files requires ~10GB of storage, with up to ~35GB required for all temporary downloads!**
 
+### 1.5. Prepare per-rule conda environments (optional but recommended):
+
+SIGNAL uses controlled conda environments for individual steps in the workflow. These are generally produced upon first execution of SIGNAL with input data; however, an option to install the per-rule environments ahead of time is available through the `signalexe.py` script.
+
+	python signalexe.py install
+
+	# Will install per-rule environments
+	# Later versions of SIGNAL will include a testing module with curated data to verify functionality
+
 ### 2. Generate configuration file:
 
-You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv` (see step 4). The directory provided will be used to auto-generate a name for the run.
+You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv`. The directory provided will be used to auto-generate a name for the run.
 
 ```
-python signal.py --config-only --directory /path/to/reads
+python signalexe.py --config-only --directory /path/to/reads
 # Outputs: 'reads_config.yaml' and 'reads_sample_table.csv'
+# --data can be used to specify the location of data dependencies
 ```
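For orientation, the auto-generated sample table is a simple CSV mapping each sample name to its paired reads. The sketch below assumes the column layout of `example_sample_table.csv`; sample names and paths are hypothetical:

```
sample,r1_path,r2_path
sampleA,/path/to/reads/sampleA_R1.fastq.gz,/path/to/reads/sampleA_R2.fastq.gz
Blank1,/path/to/reads/Blank1_R1.fastq.gz,/path/to/reads/Blank1_R2.fastq.gz
```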
 
 You can also create the configuration file by modifying the `example_config.yaml` to suit your system.
 
@@ -215,7 +231,7 @@ You can also create the configuration file by modifying the `example_config.yaml` to suit your system.
 
 See the example table `example_sample_table.csv` for an idea of how to organise this table.
 
-**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 3) from a given directory path to reads.**
+**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 2) from a given directory path to reads.**
 
 Alternatively, you can attempt to use `generate_sample_table.sh` to circumvent manual creation of the table.
 
@@ -248,7 +264,7 @@ bash scripts/generate_sample_table.sh -d /path/to/more/reads -e sample_table.csv
 
 ### 4. Execute pipeline:
 
-For the main `signal.py` script, positional arguments inform the rules of the pipeline to execute with flags supplementing input parameters.
+For the main `signalexe.py` script, positional arguments specify which rules of the pipeline to execute, with flags supplementing input parameters.
 
-The main rules of the pipeline are as followed:
+The main rules of the pipeline are as follows:
 
@@ -258,7 +274,7 @@ The main rules of the pipeline are as followed:
 
 The generated configuration file from the above steps can be used as input. To run the general pipeline:
 
-`python signal.py --configfile config.yaml --cores 4 all`
+`python signalexe.py --configfile config.yaml --cores 4 all`
 
 is equivalent to running
 
@@ -266,9 +282,9 @@ is equivalent to running
 
-You can run the snakemake command as written above, but note that if the `--conda-prefix` is not set as this (i.e., `$PWD/.snakemake/conda`), then all envs will be reinstalled for each time you change the `results_dir` in the `config.yaml`.
+You can run the snakemake command as written above, but note that if the `--conda-prefix` is not set as this (i.e., `$PWD/.snakemake/conda`), then all envs will be reinstalled each time you change the `results_dir` in the `config.yaml`.
 
-Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script:
+Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script (see step 2):
 
-`python signal.py --directory /path/to/reads --cores 4 all`
+`python signalexe.py --directory /path/to/reads --cores 4 all`
 
-A configuartion file and sample table will automatically be generated prior to running SIGNAL `all`.
+A configuration file and sample table will automatically be generated prior to running SIGNAL `all`.
 
@@ -278,7 +294,7 @@ FreeBayes variant calling and BreSeq mutational analysis are technically optional
 
 As with the general pipeline, the generated configuration file from the above steps can be used as input. To run `postprocess` which summarizes the SIGNAL results:
 
-`python signal.py --configfile config.yaml --cores 1 postprocess`
+`python signalexe.py --configfile config.yaml --cores 1 postprocess`
 
 is equivalent to running
 
@@ -306,7 +322,7 @@ Related: because pipeline stages can fail, we run (and recommend running if using snakemake directly)
 
 Additionally, SIGNAL can prepare output and execute @jts' [ncov-tools](https://github.com/jts/ncov-tools) to generate phylogenies and alternative summaries.
 
-`python signal.py --configfile config.yaml --cores 1 ncov_tools`
+`python signalexe.py --configfile config.yaml --cores 1 ncov_tools`
 
 is equivalent to running
 
@@ -318,17 +334,19 @@ SIGNAL will then execute ncov-tools and the **output will be found within the SIGNAL results directory.**
 
 ### Multiple operations:
 
-Using `signal.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession.
+Using `signalexe.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession.
 
-`python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools`
+`python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools`
 
 In the above command, SIGNAL `all`, `postprocess`, and `ncov_tools` will run using the provided configuration file as input, which links to a sample table.
 
 **Note: Regardless of order for positional arguments, or placement of other parameter flags, SIGNAL will always run in the set order priority: `all` > `postprocess` > `ncov_tools`!**
 
+**Note: If `install` is provided as input, it will override all other positional arguments!**
+
 If no configuration file or sample table was generated for a run, you can provide `--directory` with the path to sequencing reads and SIGNAL will auto-generate both required inputs prior to running any rules.
 
-`python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools`
+`python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools`
 
 Overall, this simplifies executing SIGNAL to one line!
 
@@ -359,6 +377,44 @@ Then execute the pipeline:
 
 - To generate summaries of BreSeq among many samples, see [how to summarize BreSeq results using gdtools](resources/dev_scripts/summaries/README.md)
 
+### Convenient extraction script:
+
+SIGNAL produces several output files and directories on its own alongside the output for ncov-tools. Select files from the output can be copied or transferred for easier parsing using a provided convenience bash script:
+
+```
+bash scripts/get_signal_results.sh
+
+Usage:
+bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m] [-c]
+
+This script aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
+
+The following files will be transferred over to the specified destination directory (if found):
+SIGNAL 'all' & 'postprocess':
+-> signal-results/<sample>/<sample>_sample.txt
+-> signal-results/<sample>/core/<sample>.consensus.fa
+-> signal-results/<sample>/core/<sample>_ivar_variants.tsv
+-> signal-results/<sample>/freebayes/<sample>.consensus.fasta
+-> signal-results/<sample>/freebayes/<sample>.variants.norm.vcf
+
+SIGNAL 'ncov_tools':
+-> ncov-tools-results/qc_annotation/<sample>.ann.vcf
+-> ncov-tools-results/qc_reports/<run>_ambiguous_position_report.tsv
+-> ncov-tools-results/qc_reports/<run>_mixture_report.tsv
+-> ncov-tools-results/qc_reports/<run>_ncov_watch_variants.tsv
+-> ncov-tools-results/qc_reports/<run>_negative_control_report.tsv
+-> ncov-tools-results/qc_reports/<run>_summary_qc.tsv
+
+Flags:
+    -s  :  SIGNAL results directory
+    -d  :  Directory where summary will be outputted
+    -m  :  Invoke 'mv' move command instead of 'rsync' copying of results. Optional
+    -c  :  Invoke 'cp' copy command instead of 'rsync' copying of results. Optional
+
+```
+
+The script uses `rsync` to produce accurate copies of select output files, organized into `signal-results` and `ncov-tools-results` within a provided destination directory (which must already exist). If `-c` is provided, `cp` will be used instead of `rsync` to produce copies. Similarly, if `-m` is provided, `mv` will be used instead (**WARNING: Any interruption during `mv` could result in data loss.**)
+
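As a usage sketch (the run and destination names here are hypothetical, and the destination directory must already exist):

```
# copy (rsync) select outputs of a completed run into a summary location
bash scripts/get_signal_results.sh -s results_dir/my_run -d ~/signal_summaries

# move instead of copy (faster, but see the data-loss warning above)
bash scripts/get_signal_results.sh -s results_dir/my_run -d ~/signal_summaries -m
```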
 ## Pipeline details:
 
 For a step-by-step walkthrough of the pipeline, see [pipeline/README.md](PIPELINE.md).
diff --git a/Snakefile b/Snakefile
index 76a4e5d..d807d96 100644
--- a/Snakefile
+++ b/Snakefile
@@ -127,7 +127,9 @@ rule clean_reads:
         expand('{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz', sn=sample_names, r=[1,2])
 
 rule consensus:
-    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
+           'all_genomes.fa',
+#           'failed_samples.log'
 
 rule ivar_variants:
     input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names)
@@ -136,14 +138,15 @@ rule breseq:
     input: expand('{sn}/breseq/output/index.html', sn=sample_names)
 
 rule freebayes:
-    input:
+    input:
+        'all_freebayes_genomes.fa',
+#        'failed_samples.log',
         expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
         expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
         'freebayes_lineage_assignments.tsv',
         expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names),
         expand('{sn}/freebayes/{sn}_consensus_compare.vcf', sn=sample_names)
 
-
 rule coverage:
     input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)
 
@@ -239,7 +242,8 @@ rule ncov_tools:
         negative_control_prefix = config['negative_control_prefix'],
         freebayes_run = config['run_freebayes'],
         pangolin = versions['pangolin'],
-        mode = pango_speed
+        mode = pango_speed,
+        failed = 'failed_samples.log'
     input:
         consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
         primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names),
@@ -327,7 +331,7 @@ rule raw_reads_composite_reference_bwa_map:
     shell:
         '(bwa mem -t {threads} {params.composite_index} '
         '{input.raw_r1} {input.raw_r2} | '
-        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'
+        "{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log} || echo '' > {output}"
 
 rule get_host_removed_reads:
     threads: 2
@@ -375,7 +379,7 @@ rule run_trimgalore:
     shell:
         'trim_galore --quality {params.min_qual} --length {params.min_len} '
         ' -o {params.output_prefix} --cores {threads} --fastqc '
-        '--paired {input.raw_r1} {input.raw_r2} 2> {log} || touch {output}'
+        "--paired {input.raw_r1} {input.raw_r2} 2> {log} || (echo -e 'Total reads processed: 0\nReads written (passing filters): 0 (0.0%)\nTotal basepairs processed: 0 bp\nTotal written (filtered): 0 bp (0.0%)' >> {log}; touch {output})"
 
 rule run_filtering_of_residual_adapters:
     threads: 2
@@ -769,6 +773,32 @@ rule run_quast_freebayes:
         'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
         'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'
 
+rule collect_core_genomes:
+    output:
+        "all_genomes.fa"
+    input:
+        expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        count=''
+        echo "Samples that failed to assemble:" > failed_samples.log
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                sample=$(echo $line | cut -d'.' -f1 | cut -d'_' -f2)
+            else
+                count=$(echo $line | wc -c)
+                if [[ $count -eq 1 ]]; then
+                    echo $sample >> failed_samples.log
+                else
+                    continue
+                fi
+            fi
+        done < {output}
+        """
+
 rule run_lineage_assignment:
     threads: 4
     conda: 'conda_envs/assign_lineages.yaml'
@@ -777,7 +807,7 @@ rule run_lineage_assignment:
         nextclade_ver_out = 'input_nextclade_versions.txt',
         lin_out = 'lineage_assignments.tsv'
     input:
-        expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+        'all_genomes.fa'
    params:
         pangolin_ver = versions['pangolin'],
         pangolearn_ver = versions['pangolearn'],
@@ -794,8 +824,53 @@ rule run_lineage_assignment:
     shell:
         "echo -e 'pangolin: {params.pangolin_ver}\nconstellations: {params.constellations_ver}\nscorpio: {params.scorpio_ver}\npangolearn: {params.pangolearn_ver}\npango-designation: {params.designation_ver}\npangolin-data: {params.data_ver}' > {output.pango_ver_out} && "
         "echo -e 'nextclade: {params.nextclade_ver}\nnextclade-dataset: {params.nextclade_data}\nnextclade-include-recomb: {params.nextclade_recomb}' > {output.nextclade_ver_out} && "
-        'cat {input} > all_genomes.fa && '
-        '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
+        '{params.assignment_script_path} -i {input} -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
+
+rule collect_freebayes_genomes:
+    output:
+        "all_freebayes_genomes.fa"
+    input:
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+    shell:
+        """
+        cat {input} > {output}
+        sample=''
+        seq=''
+        count=''
+        out=''
+        if [[ -f 'failed_samples.log' ]]; then
+            out='.failed_freebayes_samples.tmp'
+            cat failed_samples.log | sed 1,1d > $out
+            echo "Samples that failed to assemble:" > failed_samples.log
+        else
+            out='failed_samples.log'
+            echo "Samples that failed to assemble:" > $out
+        fi
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                if [[ $(echo $seq | wc -c) -gt 1 ]]; then # a previous record has accumulated: evaluate it
+                    count=$(echo $seq | tr -d 'Nn' | wc -c) # bases remaining once Ns are removed (trailing newline counts as 1)
+                    if [[ $count -le 1 ]]; then # empty or all-N consensus = failed assembly
+                        echo $sample >> $out
+                    fi
+                fi
+                sample=$(echo $line | cut -d'>' -f2) # start new record
+                seq=''
+            else
+                seq+=$line # append seq
+            fi
+        done < {output}
+        # evaluate the final record of the file
+        if [[ $(echo $seq | wc -c) -gt 1 ]] && [[ $(echo $seq | tr -d 'Nn' | wc -c) -le 1 ]]; then
+            echo $sample >> $out
+        fi
+
+        if [[ ! $out == 'failed_samples.log' ]]; then
+            sort -b -d -f $out | uniq >> failed_samples.log
+            rm $out
+        fi
+        """
 
 rule run_lineage_assignment_freebayes:
     threads: 4
@@ -805,10 +880,9 @@ rule run_lineage_assignment_freebayes:
     input:
         p_vers = 'input_pangolin_versions.txt',
         n_vers = 'input_nextclade_versions.txt',
-        consensus = expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+        consensus = 'all_freebayes_genomes.fa'
     params:
         analysis_mode = pango_speed,
         assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py')
     shell:
-        'cat {input.consensus} > all_freebayes_genomes.fa && '
-        '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
+        '{params.assignment_script_path} -i {input.consensus} -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
diff --git a/conda_envs/assign_lineages.yaml b/conda_envs/assign_lineages.yaml
index a5eecb8..fc4e8fd 100644
--- a/conda_envs/assign_lineages.yaml
+++ b/conda_envs/assign_lineages.yaml
@@ -11,7 +11,6 @@ dependencies:
   - python>=3.7
   - snakemake-minimal
   - gofasta
-  - nodejs
   - usher
   - pandas
   - pysam==0.16.0.1
diff --git a/scripts/assign_lineages.py b/scripts/assign_lineages.py
index 43f2bfd..0ba5d3c 100755
--- a/scripts/assign_lineages.py
+++ b/scripts/assign_lineages.py
@@ -8,6 +8,7 @@ import shutil
 import os, sys
 from datetime import datetime
+import json
 
 
 def check_file(path: str) -> Path:
@@ -136,27 +137,53 @@ def update_nextclade_dataset(vers, skip):
     # If specific tag requested, attempt to install, otherwise install latest
     accession = 'MN908947'
+    current_tag = None
+    if os.path.exists(os.path.join(output_dir, 'tag.json')):
+        j = open(os.path.join(output_dir, 'tag.json'))
+        data = json.load(j)
+        current_tag = data['tag']
+        j.close()
     if requested is not None:
+        # check existing database, if found
+        if requested == current_tag:
+            print(f"Nextclade dataset {requested} already installed! Skipping update!")
+        else:
+            try:
+                print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+                subprocess.run(f"nextclade dataset get "
+                               f"--name '{dataset}' "
+                               f"--reference '{accession}' "
+                               f"--tag {requested} "
+                               f"--output-dir '{output_dir}'", shell=True, check=True)
+            except subprocess.CalledProcessError:
+                print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
+                try:
+                    subprocess.run(f"nextclade dataset get "
+                                   f"--name '{dataset}' "
+                                   f"--reference '{accession}' "
+                                   f"--output-dir '{output_dir}'", shell=True, check=True)
+                except subprocess.CalledProcessError:
+                    if current_tag is not None:
+                        print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                        requested = current_tag
+                    else:
+                        print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                        requested = "Unknown"
+    else:
         try:
-            print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+            print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
             subprocess.run(f"nextclade dataset get "
                            f"--name '{dataset}' "
                            f"--reference '{accession}' "
-                           f"--tag {requested} "
                            f"--output-dir '{output_dir}'", shell=True, check=True)
         except subprocess.CalledProcessError:
-            print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
-            subprocess.run(f"nextclade dataset get "
-                           f"--name '{dataset}' "
-                           f"--reference '{accession}' "
-                           f"--output-dir '{output_dir}'", shell=True, check=True)
-    else:
-        print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
-        subprocess.run(f"nextclade dataset get "
-                       f"--name '{dataset}' "
-                       f"--reference '{accession}' "
-                       f"--output-dir '{output_dir}'", shell=True, check=True)
-
+            if current_tag is not None:
+                print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+                requested = current_tag
+            else:
+                print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+                requested = "Unknown"
+
     # Obtain final version information for output
     nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
     if nextclade_version.startswith("nextclade"):
diff --git a/scripts/get_data_dependencies.sh b/scripts/get_data_dependencies.sh
index c525294..43bcf68 100755
--- a/scripts/get_data_dependencies.sh
+++ b/scripts/get_data_dependencies.sh
@@ -14,34 +14,42 @@ accession="MN908947.3"
 
 HELP="""
 Flags:
-    -d  :  Directory to configure database within (~10GB)
-    -a  :  Accession to use as viral reference (default=MN908947.3)
+        -d  :  Directory to configure database within (~10GB)
+        -a  :  Accession to use as viral reference (default=MN908947.3)
 """
 
 while getopts ":d:a:" option; do
-    case "${option}" in
-        d) database_dir=$OPTARG;;
-        a) accession=$OPTARG;;
-    esac
+        case "${option}" in
+                d) database_dir=$OPTARG;;
+                a) accession=$OPTARG;;
+        esac
 done
 
 if [ $database_dir = 0 ] ; then
-    echo "You must specify a data directory to install data dependencies."
-    echo "$HELP"
-    exit 1
+        echo "You must specify a data directory to install data dependencies."
+        echo "$HELP"
+        exit 1
 fi
 
 echo -e "Warning: \n - final databases require ~10GB of storage\n - building databases temporarily requires a peak of ~35GB of storage and ~4GB of memory \n - script takes up to ~1.5 hours (system depending)"
 
 # make database dir and get abspath to it
-mkdir -p $database_dir
+if [ ! -d $database_dir ]; then mkdir -p $database_dir; fi
 database_dir=$(realpath $database_dir)
 
 # use curl to grab "simple data dependencies"
-curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
-curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
-curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
-curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
+if [ ! -f $database_dir/'NexteraPE-PE.fa' ]; then
+	curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
+fi
+if [ ! -f $database_dir/${accession}.gbk ]; then
+	curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
+fi
+if [ ! -f $database_dir/${accession}.gff3 ]; then
+	curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
+fi
+if [ ! -f $database_dir/${accession}.fasta ]; then
+	curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
+fi
 
 # install and activate env for kraken/bwa to build their databases/index
 CONDA_BASE=$($CONDA_EXE info --base)
@@ -51,19 +59,37 @@ conda activate data_dependencies
 
 # get the GRCh38 human genome
 # as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
-curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
-gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+if [ ! -f $database_dir/"GRC38_no_alt_analysis_set.fna" ]; then
+	curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
+	gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+fi
 
 # create composite reference of human and virus for competitive bwt mapping
 # based host removal
+if [ ! -f $database_dir/'composite_human_viral_reference.fna' ]; then
 cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
-bwa index $database_dir/composite_human_viral_reference.fna
+fi
+for file in $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}; do
+	if [ ! -f $file ]; then
+		bwa index $database_dir/composite_human_viral_reference.fna
+		break
+	else
+		continue
+	fi
+done
 
 # get kraken2 viral db
 mkdir -p $database_dir/Kraken2/db
-curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
-cd $database_dir/Kraken2/db
-tar xvf k2_viral_20210517.tar.gz
+for file in $database_dir/Kraken2/db/{hash,opts,taxo}.k2d; do
+	if [ ! -f $file ]; then
+		curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
+		cd $database_dir/Kraken2/db
+		tar xvf k2_viral_20210517.tar.gz
+		break
+	else
+		continue
+	fi
+done
 
 # create blank fasta for 'phylo_include_seqs'
 touch $database_dir/blank.fasta
diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh
new file mode 100755
index 0000000..0936c39
--- /dev/null
+++ b/scripts/get_signal_results.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+
+shopt -s extglob
+
+source=0
+destination=0
+move='false'
+copy='false'
+
+HELP="""
+Usage:
+bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m] [-c]
+
+This script aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
+
+The following files will be transferred over to the specified destination directory (if found):
+SIGNAL 'all' & 'postprocess':
+-> signal-results/<sample>/<sample>_sample.txt
+-> signal-results/<sample>/core/<sample>.consensus.fa
+-> signal-results/<sample>/core/<sample>_ivar_variants.tsv
+-> signal-results/<sample>/freebayes/<sample>.consensus.fasta
+-> signal-results/<sample>/freebayes/<sample>.variants.norm.vcf
+
+SIGNAL 'ncov_tools':
+-> ncov-tools-results/qc_annotation/<sample>.ann.vcf
+-> ncov-tools-results/qc_reports/<run>_ambiguous_position_report.tsv
+-> ncov-tools-results/qc_reports/<run>_mixture_report.tsv
+-> ncov-tools-results/qc_reports/<run>_ncov_watch_variants.tsv
+-> ncov-tools-results/qc_reports/<run>_negative_control_report.tsv
+-> ncov-tools-results/qc_reports/<run>_summary_qc.tsv
+
+Flags:
+    -s  :  SIGNAL results directory
+    -d  :  Directory where summary will be outputted
+    -m  :  Invoke 'mv' move command instead of 'rsync' copying of results. Optional
+    -c  :  Invoke 'cp' copy command instead of 'rsync' copying of results. Optional
+"""
+
+while getopts ":s:d:mc" option; do
+    case "${option}" in
+        s) source=$OPTARG;;
+        d) destination=$OPTARG;;
+        m) move='true';;
+        c) copy='true';;
+    esac
+done
+
+
+if [ $source = 0 ] || [ $destination = 0 ] ; then
+    echo "You must specify both source and destination locations."
+    echo "$HELP"
+    exit 1
+fi
+
+if [ ! -d $destination ]; then
+    echo "Invalid destination directory!"
+    exit 1
+fi
+
+if [ ! -f $source/summary.html ] && [ ! -f $source/summary.zip ]; then
+    echo "Invalid SIGNAL directory! Make sure you've run SIGNAL 'all' and 'postprocess'!"
+    exit 1
+else
+    run_name=$(basename $source)
+    final_dir=${destination}/${run_name}
+    mkdir -p $final_dir/signal-results
+fi
+
+if [ ${move} = true ] && [ ${copy} = true ]; then
+    echo -e "Only pick one of '-m' or '-c' depending on whether you wish to move or copy files, respectively"
+    exit 1
+elif [ ${move} = true ] && [ ${copy} = false ]; then
+    cmd='mv'
+elif [ ${move} = false ] && [ ${copy} = true ]; then
+    cmd='cp'
+else
+    cmd='rsync -avh'
+fi
+
+echo -e "We will use ${cmd} for your files!"
+
+### SIGNAL results_dir
+for file in $source/*; do
+    if [ -d $file ]; then # results_dir/sample
+        sample=$(basename $file) # sample name; this directory contains our files
+        sample_dest=${final_dir}/'signal-results'/${sample}
+        if [[ ! $sample == 'ncov-tools-results' ]]; then
+            mkdir -p $sample_dest
+        fi
+        for d in $file/*; do
+            name=$(basename $d)
+            if [ -d $d ] && [[ $name == 'core' ]]; then
+                mkdir -p $sample_dest/core
+                $cmd ${d}/${sample}.consensus.fa $sample_dest/core/${sample}.consensus.fa
+                $cmd ${d}/${sample}_ivar_variants.tsv $sample_dest/core/${sample}_ivar_variants.tsv
+            elif [ -d $d ] && [[ $name == 'freebayes' ]]; then
+                mkdir -p $sample_dest/freebayes
+                $cmd ${d}/${sample}.consensus.fasta $sample_dest/freebayes/${sample}.consensus.fasta
+                $cmd ${d}/${sample}.variants.norm.vcf $sample_dest/freebayes/${sample}.variants.norm.vcf
+            elif [ -f $d ] && [[ $name =~ '_sample.txt' ]]; then
+                $cmd ${d} $sample_dest/$(basename $d)
+            else
+                continue
+            fi
+        done
+    fi
+done
+
+echo "Files from SIGNAL transferred!"
+
+### NCOV-TOOLS
+if [ ! -d $source/ncov-tools-results ]; then
+    echo "No ncov-tools-results directory found!"
+else
+    ncov_dest=${final_dir}/ncov-tools-results
+    mkdir -p $ncov_dest/qc_{annotation,reports}
+    $cmd $source/ncov-tools-results/qc_reports/* $ncov_dest/qc_reports
+    $cmd $source/ncov-tools-results/qc_annotation/*.ann.vcf $ncov_dest/qc_annotation
+
+    echo "Files from ncov-tools transferred!"
+fi
\ No newline at end of file
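For reference, the layout this script produces under the destination directory should look roughly as follows (run and sample names are illustrative):

```
<destination>/<run_name>/
├── signal-results/
│   └── <sample>/
│       ├── <sample>_sample.txt
│       ├── core/        # <sample>.consensus.fa, <sample>_ivar_variants.tsv
│       └── freebayes/   # <sample>.consensus.fasta, <sample>.variants.norm.vcf
└── ncov-tools-results/
    ├── qc_annotation/
    └── qc_reports/
```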
diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py
index 76e5201..16cea46 100755
--- a/scripts/ncov-tools.py
+++ b/scripts/ncov-tools.py
@@ -6,17 +6,21 @@ import fileinput
 import glob
 
 
-def link_ivar(root, replace=False):
+def link_ivar(root, neg, failed, replace=False):
     print("Linking iVar files to ncov-tools!")
     for variants in snakemake.input['variants']:
         sample = variants.split('/')[0]
+        if (sample in failed) and (sample not in neg):
+            continue
         ln_path = f"{root}/{sample}.variants.tsv"
         if (not os.path.exists(ln_path)) or (replace is True):
             os.link(variants, ln_path)
 
     for consensus in snakemake.input['consensus']:
         sample = consensus.split('/')[0]
+        if (sample in failed) and (sample not in neg):
+            continue
         ln_path = f"{root}/{sample}.consensus.fasta"
         if (not os.path.exists(ln_path)) or (replace is True):
             os.link(consensus, ln_path)
@@ -30,15 +34,17 @@ def link_ivar(root, replace=False):
 
 # take sample name from iVar results, redirect to where corresponding FreeBayes should be
 # if FreeBayes file cannot be found, break from loop, replace all with iVar
-def link_freebayes(root):
+def link_freebayes(root, neg, failed):
     print("Linking FreeBayes files to ncov-tools!")
     for variants in snakemake.input['variants']:
         sample = variants.split('/')[0]
+        if (sample in failed) and (sample not in neg):
+            continue
         expected_path = os.path.join(sample, 'freebayes', sample+'.variants.norm.vcf')
         if not os.path.exists(expected_path):
             print("Missing FreeBayes variant file! Switching to iVar input!")
-            link_ivar(root, True)
+            link_ivar(root, neg, failed, replace=True)
             break
         else:
             ln_path = f"{root}/{sample}.variants.vcf"
@@ -47,10 +53,12 @@ def link_freebayes(root):
 
     for consensus in snakemake.input['consensus']:
         sample = consensus.split('/')[0]
+        if (sample in failed) and (sample not in neg):
+            continue
         expected_path = os.path.join(sample, 'freebayes', sample+'.consensus.fasta')
         if not os.path.exists(expected_path):
             print("Missing FreeBayes variant file! Switching to iVar input!")
-            link_ivar(root, True)
+            link_ivar(root, neg, failed, replace=True)
             break
         else:
             ln_path = f"{root}/{sample}.consensus.fasta"
@@ -75,10 +83,26 @@ def set_up():
     try:
         assert pangolin == "3" or pangolin == "4" # directly supported versions
     except AssertionError:
-        import urllib.request as web
-        commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl()
-        pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v")
+        # import urllib.request as web
+        # commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl()
+        # pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v")
         # latest version (should ensure temporary compatibility)
+        installed_versions = subprocess.run(["pangolin", "--all-versions"],
+                                            check=True,
+                                            stdout=subprocess.PIPE)
+        installed_versions = installed_versions.stdout.decode('utf-8')
+        installed_ver_dict = {}
+        for dep_ver in map(str.strip, installed_versions.split('\n')):
+            # skip empty line at end
+            if len(dep_ver) == 0:
+                continue
+            try:
+                dependency, version = dep_ver.split(': ')
+            except ValueError:
+                continue
+            if dependency == 'pangolin':
+                pangolin = str(version).split(".",1)[0].strip('v')
+                break
 
 ### Create data directory within ncov-tools
     data_root = os.path.abspath(os.path.join(exec_dir, 'ncov-tools', "%s" %(result_dir)))
@@ -99,6 +123,13 @@ def set_up():
         neg_list = list(neg_samples)
     print("Negative control samples found include: %s" %(neg_list))
 
+### Pull failed samples (SIGNAL log file: failed_samples.log)
+    if os.path.exists(snakemake.params['failed']):
+        with open(snakemake.params['failed']) as fail:
+            failed_list = [i.strip() for i in fail.readlines()[1:]]
+    else:
+        failed_list = []
+    print("Failed samples found include: %s" %(failed_list))
 
 ### config.yaml parameters
     config = {'data_root': f"'{data_root}'",
@@ -126,27 +157,32 @@ def set_up():
     print("Linking alignment BAMs to ncov-tools!")
     for bam in snakemake.input['bams']:
         sample = bam.split('/')[0]
+        # if sample failed and not a negative, skip linking
+        if (sample in failed_list) and (sample not in neg_list):
+            continue
         ln_path = f"{data_root}/{sample}.bam"
-        if (not os.path.exists(ln_path)) or (replace is True):
+        if not os.path.exists(ln_path):
             os.link(bam, ln_path)
 
     for primer_trimmed_bam in snakemake.input['primertrimmed_bams']:
         sample = primer_trimmed_bam.split('/')[0]
+        if (sample in failed_list) and (sample not in neg_list):
+            continue
         ln_path = f"{data_root}/{sample}.mapped.primertrimmed.sorted.bam"
-        if (not os.path.exists(ln_path)) or (replace is True):
+        if not os.path.exists(ln_path):
             os.link(primer_trimmed_bam, ln_path)
 
     if snakemake.params['freebayes_run']:
-        link_freebayes(data_root)
+        link_freebayes(data_root, neg_list, failed_list)
         config['variants_pattern'] = "'{data_root}/{sample}.variants.vcf'"
     else:
-        link_ivar(data_root)
+        link_ivar(data_root, neg_list, failed_list, replace=False)
 
     with open(os.path.join(exec_dir, 'ncov-tools', 'config.yaml'), 'w') as fh:
         for key, value in config.items():
             fh.write(f"{key}: {value}\n")
 
-    return exec_dir, result_dir
+    return exec_dir, result_dir, data_root
 
 def run_all():
     os.system(f"snakemake -s workflow/Snakefile --cores {snakemake.threads} all")
@@ -185,12 +221,14 @@ def move(cwd, dest, prefix):
         print("Missing ncov-tools 'qc_analysis' directory")
 
 if __name__ == '__main__':
-    exec_dir, result_dir = set_up()
+    exec_dir, result_dir, data_root = set_up()
     run_script = os.path.join(exec_dir, 'scripts', 'run_ncov_tools.sh')
     #print("Don't forget to update the config.yaml file as needed prior to running ncov-tools.")
     print("Running ncov-tools using %s cores!" %(snakemake.threads))
     subprocess.run([run_script, '-c', str(snakemake.threads), '-s', str(result_dir)])
-    
+
+    # clean up
+    shutil.rmtree(data_root)
     #run_all()
     #move(exec_dir, result_root, result_dir)
diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh
index 2a5f098..5a68aad 100755
--- a/scripts/run_ncov_tools.sh
+++ b/scripts/run_ncov_tools.sh
@@ -35,9 +35,9 @@ if [ $1 = 'help' ]; then
 fi
 
 if [ $SIGNAL = 0 ] ; then
-    echo "You must specify the name of the directory holding SIGNAL results."
-    echo "$HELP"
-    exit 1
+        echo "You must specify the name of the directory holding SIGNAL results."
+        echo "$HELP"
+        exit 1
 fi
 
 # Start point for executing from ncov-tools.py is SIGNAL results directory
@@ -48,9 +48,9 @@ RESULTS=$PWD
 cd ../ncov-tools
 
 # run ncov-tools
-snakemake -s workflow/Snakefile --cores ${CORES} all
+snakemake -k -s workflow/Snakefile --cores ${CORES} all
 
-# move ncovresults to SIGNAL results directory
+# move ncovresults to SIGNAL results directory and clean up
 mv ${SIGNAL}'_ncovresults' ${RESULTS}/ncov-tools-results
 
 # return success
diff --git a/scripts/signal_postprocess.py b/scripts/signal_postprocess.py
index a0a0aba..20fbcde 100755
--- a/scripts/signal_postprocess.py
+++ b/scripts/signal_postprocess.py
@@ -17,7 +17,7 @@ assert long_git_id.startswith('$Id: ')
 
 #short_git_id = long_git_id[5:12]
-short_git_id = "v1.5.9"
+short_git_id = "v1.6.0"
 
 # Suppresses matplotlib warning (https://github.com/jaleezyy/covid-19-signal/issues/59)
 # Creates a small memory leak, but it's nontrivial to fix, and won't be a practical concern!
diff --git a/signal.py b/signalexe.py
similarity index 64%
rename from signal.py
rename to signalexe.py
index 00764cd..1d10df6 100755
--- a/signal.py
+++ b/signalexe.py
@@ -1,23 +1,31 @@
 #!/usr/bin/env python
 
-# v1.5.0+
-# signal.py assumes Snakefile is in current working directory (i.e., SIGNAL root)
+# v1.6.0+
+# signalexe.py assumes Snakefile is in current working directory (i.e., SIGNAL root)
 
+import signal
 import argparse
 import subprocess, os, sys
 import re
 from pathlib import Path
+import platform
+
+# for compatibility between platforms
+if platform.system() != 'Linux':
+    signal.SIGHUP = 1
 
 def create_parser():
-    allowed = {'all': False, 'postprocess': False, 'ncov_tools': False}
+    allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False}
 
-    parser = argparse.ArgumentParser(prog='signal.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).")
-    parser.add_argument('all', nargs='*', help="Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--directory' required. The latter will automatically generate a configuration file and sample table. If both provided, then '--configfile' will take priority")
+    parser = argparse.ArgumentParser(prog='signalexe.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).")
+    parser.add_argument('all', nargs='*', help="Run SIGNAL with all associated assembly rules. Does not include postprocessing. '--configfile' or '--directory' required. The latter will automatically generate a configuration file and sample table. If both provided, then '--configfile' will take priority")
     parser.add_argument('postprocess', nargs='*', help="Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided")
     parser.add_argument('ncov_tools', nargs='*', help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--directory' is provided")
+    parser.add_argument('install', nargs='*',
+                        help="Install individual rule environments and ensure SIGNAL is functional. The only operable parameter will be '--data'. Will override other operations!")
     parser.add_argument('-c', '--configfile', type=check_file, default=None, help="Configuration file (i.e., config.yaml) for SIGNAL analysis")
     parser.add_argument('-d', '--directory', type=check_directory, default=None,
@@ -26,18 +34,22 @@ def create_parser():
     parser.add_argument('--config-only', action='store_true', help="Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required")
     parser.add_argument('--remove-freebayes', action='store_false', help="Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)")
     parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)")
-    parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
-    parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)")
+    parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
+    parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)")
+    parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependencies. If '--dependencies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. (Upcoming feature): When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. Default = 'data'")
+    #parser.add_argument('--enable-test', action='store_true', help='SIGNAL install parameter. Add SIGNAL testing after environment installation using curated test data')
     parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run")
+    parser.add_argument('-ii', '--ignore-incomplete', action='store_true', help='Snakemake parameter. Do not check for incomplete output files')
     parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run")
     parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output')
     parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done')
+    parser.add_argument('--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run', it will just display a summary of the DAG of jobs")
     parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output")
     parser.add_argument('-v', '--version', action='store_true', help="Display version number")
     args, unknown = parser.parse_known_args()
 
     provided = []
-    for opt in allowed: # ['all', 'postprocess', 'ncov_tools']
+    for opt in allowed: # ['install', 'all', 'postprocess', 'ncov_tools']
         if len(getattr(args, opt)) > 0:
             provided = provided + getattr(args, opt)
             getattr(args, opt).clear()
@@ -49,18 +61,16 @@ def create_parser():
             allowed[val.lower()] = True
         else:
             print(f"Ignoring unknown command: {val}")
-    
-    # Unknown
-    # for x in unknown:
-        # filter out unknown options (like -b or --b or alll)
-        # exit with error
-        # if x.startswith(('-', '--')):
-            # parser.error(f"unknown argument {x}")
-        # identify what belongs where
-        # getattr(result, 'provided').append(x)
-    
+
     return args, allowed
-    
+
+def check_frontend():
+    try:
+        subprocess.check_call(['mamba', 'list'], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+        return 'mamba'
+    except (subprocess.CalledProcessError, FileNotFoundError): # mamba broken, or not installed at all
+        return 'conda'
+
 def check_directory(path: str) -> Path:
     """
     Check an input directory exists and is readable
@@ -109,8 +119,12 @@ def check_submodule(exec_dir):
         try:
             print("Updating ncov-tools!")
             subprocess.run(['git', 'submodule', 'update', '--init', '--recursive'])
+            return True
         except subprocess.CalledProcessError:
-            exit("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!")
+            print("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!")
+            sys.exit(1)
+    else:
+        return False
 
 def write_sample_table(sample_data, output_table):
     """
@@ -122,8 +136,7 @@ def write_sample_table(sample_data, output_table):
         for sample in sample_data:
             out_fh.write(",".join(sample) + '\n')
 
-def download_dependences():
-    dir_name = 'data'
+def download_dependencies(dir_name):
     script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh')
     subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3'])
 
@@ -135,7 +148,7 @@ def download_dependences():
     out_table = project_name + "_sample_table.csv"
     subprocess.run(['bash', script, '-d', project_directory, '-n', out_table])
 
-def write_config_file(run_name, config_file, opt_tasks):
+def write_config_file(run_name, config_file, data_directory, opt_tasks):
     ### opt_tasks = [args.breseq, args.freebayes, [args.neg_prefix]] - latter only applies to SIGNAL v1.5.8 and earlier
     config = f"""# This file contains a high-level summary of pipeline configuration and inputs.
@@ -157,26 +170,26 @@ def write_config_file(run_name, config_file, opt_tasks):
 scheme_bed: 'resources/primer_schemes/artic_v3/nCoV-2019.bed'
 
 # Path from snakemake dir to bwa indexed human + viral reference genome
-composite_reference: 'data/composite_human_viral_reference.fna'
+composite_reference: "{data_directory}/composite_human_viral_reference.fna"
 
 # Used as bwa reference genome when removing host sequences.
 # Also used as 'ivar' reference genome in variant detection + consensus.
 # Used as -r,-g arguments to 'quast'
 # contig needed for hostremoval filtering script
 viral_reference_contig_name: 'MN908947.3'
-viral_reference_genome: 'data/MN908947.3.fasta'
-viral_reference_feature_coords: 'data/MN908947.3.gff3'
+viral_reference_genome: "{data_directory}/MN908947.3.fasta"
+viral_reference_feature_coords: "{data_directory}/MN908947.3.gff3"
 
 # breseq_reference must be defined if run_breseq == True
 run_breseq: {opt_tasks[0]}
 # Used as --reference argument to 'breseq'
-breseq_reference: 'data/MN908947.3.gbk'
+breseq_reference: "{data_directory}/MN908947.3.gbk"
 
 # run freebayes for variant and consensus calling (as well as ivar)
 run_freebayes: {opt_tasks[1]}
 
 # Used as --db argument to 'kraken2'
-kraken2_db: 'data/Kraken2/db'
+kraken2_db: "{data_directory}/Kraken2/db"
 
 # For Ivar's amplicon filter
 # https://github.com/andersen-lab/ivar/commit/7027563fd75581c78dabc6040ebffdee2b24abe6
@@ -226,7 +239,7 @@ def write_config_file(run_name, config_file, opt_tasks):
 amplicon_loc_bed: 'resources/primer_schemes/artic_v3/ncov-qc_V3.scheme.bed'
 
 # fasta of sequences to include with pangolin phylogeny
-phylo_include_seqs: "data/blank.fasta"
+phylo_include_seqs: "{data_directory}/blank.fasta"
 
 # List of negative control sample names or prefixes (i.e., ['Blank'] will cover Blank1, Blank2, etc.)
 negative_control_prefix: {opt_tasks[2]}"""
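To illustrate the effect of the templated paths above: generating a configuration with `--data /databases/signal-data` (a hypothetical location) should produce entries along these lines:

```
composite_reference: "/databases/signal-data/composite_human_viral_reference.fna"
viral_reference_genome: "/databases/signal-data/MN908947.3.fasta"
viral_reference_feature_coords: "/databases/signal-data/MN908947.3.gff3"
breseq_reference: "/databases/signal-data/MN908947.3.gbk"
kraken2_db: "/databases/signal-data/Kraken2/db"
phylo_include_seqs: "/databases/signal-data/blank.fasta"
```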
@@ -234,21 +247,47 @@ def write_config_file(run_name, config_file, opt_tasks):
     with open(config_file, 'w') as fh:
         fh.write(config)
 
+def install_signal(frontend, data='data'):
+    """
+    Install SIGNAL dependencies per rule and test using a sample dataset, if desired
+    """
+    dep_snakefile = os.path.join(script_path, 'resources', 'dependencies')
+    assert os.path.exists(dep_snakefile)
+    try:
+        subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda", shell=True, check=True)
+    except subprocess.CalledProcessError:
+        print("Installation of environments failed!")
+        sys.exit(1)
+
+    ### TODO: Test SIGNAL with curated data
+    if os.path.exists(data):
+        pass
+
 if __name__ == '__main__':
     # note: add root_dir to determine the root directory of SIGNAL
     script_path = os.path.join(os.path.abspath(sys.argv[0]).rsplit("/",1)[0])
     args, allowed = create_parser()
-    version = 'v1.5.9'
+    version = 'v1.6.0'
     alt_options = []
 
     if args.version:
-        exit(f"{version}")
+        print(f"{version}")
+        sys.exit(0)
+
+    conda_frontend = check_frontend() # 'mamba' or 'conda'
+
+    if allowed['install']:
+        install_signal(conda_frontend, args.data)
+        print("Installation of environments completed successfully!")
+        sys.exit(0)
 
     if args.dependencies:
         print("Downloading necessary reference and dependency files!")
-        download_dependences()
-        exit("Download complete!")
-    
+        download_dependencies(args.data)
+        print("Complete!")
+        sys.exit(0)
+
     if args.configfile is None:
         assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
         run_name = args.directory.name
@@ -258,35 +297,53 @@ def write_config_file(run_name, config_file, opt_tasks):
             neg = [pre.replace(" ","") for pre in args.neg_prefix.split(",")]
         else:
             neg = [args.neg_prefix]
-        write_config_file(run_name, config_file, [args.add_breseq, args.remove_freebayes, neg])
+        write_config_file(run_name, config_file, args.data, [args.add_breseq, args.remove_freebayes, neg])
         if args.config_only:
-            exit("Configuration file and sample table generated!")
+            print("Configuration file and sample table generated!")
+            sys.exit(0)
     else:
         config_file = args.configfile
 
     if not any([allowed[x] for x in allowed]):
-        exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signal.py -h' for details!")
+        print("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!")
+        sys.exit(1)
     else:
         if args.verbose: alt_options.append('--verbose')
+        if args.quiet: alt_options.append('--quiet')
         if args.unlock: alt_options.append('--unlock')
         if args.forceall: alt_options.append('--forceall')
         if args.dry_run: alt_options.append('--dry-run')
         if args.rerun_incomplete: alt_options.append('--rerun-incomplete')
+        if args.ignore_incomplete: alt_options.append('--ignore-incomplete')
         opt = " ".join(alt_options)
 
     for task in allowed:
-        if allowed[task] is True:
+        if (allowed[task] is True) and (task != 'install'):
             print(f"Running SIGNAL {task}!")
             try:
-                subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
-            except subprocess.CalledProcessError: # likely missing mamba
+                subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
+            except subprocess.CalledProcessError:
                 if task == "ncov_tools":
-                    check_submodule(os.getcwd())
-                    if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
-                        opt = " ".join(opt.split(" ")[:-1])
-                    try:
-                        print("Retrying...")
-                        subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
-                    except subprocess.CalledProcessError:
-                        exit(f"Something went wrong running SIGNAL {task}! Check input and try again!")
+                    mod = check_submodule(os.getcwd())
+                    if mod:
+                        if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
+                            opt = " ".join(opt.split(" ")[:-1])
+                        try:
+                            print("Retrying...ncov-tools!")
+                            subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
+                        except subprocess.CalledProcessError:
+                            print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
+                            continue
+                    else:
+                        print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
+                        continue
+                elif task == 'all':
+                    print(f"Some jobs failed while running SIGNAL {task}! This does NOT necessarily mean your run was erroneous! Samples that failed assembly can be found in 'failed_samples.log'! If no such file exists or it is blank, check your inputs and logs and try again!")
+                    continue
+                elif task == 'postprocess':
+                    print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and logs and try again!")
+                    continue
+                else:
+                    print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs, logs, and results and try again!")
 
-    exit("SIGNAL completed successfully!")
+    print("SIGNAL run complete! Check corresponding snakemake logs for any details!")
+    sys.exit(0)
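Taken together, a typical first-time workflow with the renamed wrapper might look like the following sketch (read paths and core counts are illustrative):

```
# one-time setup: per-rule conda environments and ~10GB of reference data
python signalexe.py install
python signalexe.py --data data --dependencies

# preview, then run assembly, postprocessing, and ncov-tools QC in one line
python signalexe.py --directory /path/to/reads --cores 8 --dry-run all
python signalexe.py --directory /path/to/reads --cores 8 all postprocess ncov_tools
```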