From 3a5524ff42dd464d0af6c34975e9f8038a1d25fe Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 10:18:43 +0200 Subject: [PATCH 1/6] fixed small issues in docs --- CITATIONS.md | 8 ++++++++ README.md | 5 +++-- docs/usage.md | 8 ++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 2b68927..1decfe5 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -22,10 +22,18 @@ > Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2015). “Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data.” Bioinformatics, btv566 +- [BAMTOOLS](https://github.com/pezmaster31/bamtools) + + > Barnett DW, Garrison EK, Quinlan AR, Strömberg MP, Marth GT. BamTools: a C++ API and toolkit for analyzing and managing BAM files. Bioinformatics. 2011 Jun 15;27(12):1691-2. doi: 10.1093/bioinformatics/btr174. Epub 2011 Apr 14. PubMed PMID: 21493652; PubMed Central PMCID: PMC3106182. + - [BWA](http://bio-bwa.sourceforge.net) > Li, H. (2013). Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv preprint arXiv:1303.3997. +- [BWAmem2](https://github.com/bwa-mem2/bwa-mem2) + + > Vasimuddin Md, Sanchit Misra, Heng Li, Srinivas Aluru. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. + - [SAMtools](https://www.htslib.org) > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PMID: 19505943; PMCID: PMC2723002. diff --git a/README.md b/README.md index bb8d950..973e988 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ ## Introduction -**nf-core/hgtseq** is a bioinformatics best-practice analysis pipeline built to investigate horizontal gene transfer from NGS data. 
hgtseq inputs can be either fastq files, which are then mapped to the proper host reference with BWA, or mapped BAM files. Unmapped reads are then extracted with SAMtools view based on their SAM flag classification: two separate files are generated, depending on whether both mates are unmapped, or just one is. Taxonomic classification is then performed with Kraken2. A rich set of visualisations completes the -pipeline, accompanying the results with interactive Krona plots as well as Circos-like plots generated with R, aimed at better annotating potential integration sites in the host genome +**nf-core/hgtseq** is a bioinformatics best-practice analysis pipeline built to investigate horizontal gene transfer from NGS data. + +The pipeline uses metagenomic classification of paired-read alignments against a reference genome to identify the presence of non-host microbial sequences within read pairs, and to infer potential integration sites into the host genome. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! 
diff --git a/docs/usage.md b/docs/usage.md index b4d9271..93a0d7a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -101,19 +101,19 @@ This version number will be logged in reports when you run the pipeline, so that Please note that, in addition to the classic parameters such as `--input` and `--outdir`, the pipeline requires other specific parameters. -### `--genome` +### --genome The user must specify the genome of interest. A list of genomes is available in the pipeline under the folder conf/igenomes.config, that contains illumina iGenomes reference file paths. This follows [nf-core guidelines](https://nf-co.re/usage/reference_genomes) for reference management, and sets all necessary parameters (like fasta, gtf, bwa). The user is recommended to primarily use the _genome_ parameter, and can follow instructions at [this](https://nf-co.re/usage/reference_genomes#adding-paths-to-a-config-file) page to add genomes not currently included in the repository. All parameters set automatically as a consequence, though hidden, can be accessed by the user at command line should they wish a finer control. -### `--taxonomy_id` +### --taxonomy_id Since the code in the report is executed differently based on the taxonomy id of the analyzed species, the user must enter it in the command line (must be taken from the Taxonomy Database of NCBI). -### `--krakendb` +### --krakendb User must provide a Kraken2 database in order to perform the classification. -### `--kronadb` +### --kronadb User must also provide a Krona database in order to generate interactive pie charts with Kronatools. 
From d6118521cb1b9505d47f78ba686e156dbf1d82a0 Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 10:23:17 +0200 Subject: [PATCH 2/6] improved database path description in json schema --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 95a43e3..fc56dfc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -114,12 +114,12 @@ "krakendb": { "type": "string", "default": "None", - "description": "Either a local path or a URL to compressed kraken database folder" + "description": "A local path to kraken database folder or compressed database file, or a URL to a compressed database file, in tar.gz format" }, "kronadb": { "type": "string", "default": "None", - "description": "Either a local path or a URL to compressed .tab krona taxonomy file" + "description": "A local path or a URL to a .tab krona taxonomy file; it can also receive a compressed .tab file in tar.gz format" }, "gff": { "type": "string", From 458be5114df8a36e6e39b2fab8f413114096c0aa Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 10:26:08 +0200 Subject: [PATCH 3/6] improved database path description in json schema --- nextflow_schema.json | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fc56dfc..b093b23 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir", "taxonomy_id"], + "required": [ + "input", + "outdir", + "taxonomy_id" + ], "properties": { "input": { "type": "string", @@ -142,7 +146,10 @@ "hidden": true } }, - "required": ["krakendb", "kronadb"] + "required": [ + "krakendb", + "kronadb" + ] }, "institutional_config_options": { "title": "Institutional config options", @@ -246,7 +253,14 @@ 
"description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { From 3866dda21a86991c104db141368bdf6ff0d5cc0b Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 10:27:30 +0200 Subject: [PATCH 4/6] improved database path description in json schema --- nextflow_schema.json | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b093b23..fc56dfc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,11 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir", - "taxonomy_id" - ], + "required": ["input", "outdir", "taxonomy_id"], "properties": { "input": { "type": "string", @@ -146,10 +142,7 @@ "hidden": true } }, - "required": [ - "krakendb", - "kronadb" - ] + "required": ["krakendb", "kronadb"] }, "institutional_config_options": { "title": "Institutional config options", @@ -253,14 +246,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { From 5977ae568f76e455be9256e052c9a6ff7881f8e1 Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 13:31:11 +0200 Subject: [PATCH 5/6] committed last suggestions --- assets/samplesheet_fastq.csv | 2 +- docs/usage.md | 2 +- nextflow_schema.json | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/assets/samplesheet_fastq.csv b/assets/samplesheet_fastq.csv index 0f2844e..f3f521e 100644 --- a/assets/samplesheet_fastq.csv +++ b/assets/samplesheet_fastq.csv @@ -1,3 +1,3 @@ sample,input1,input2 SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001_1.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001_2.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq, + diff --git a/docs/usage.md b/docs/usage.md index 93a0d7a..374dbbf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -16,7 +16,7 @@ The second category, i.e. unmapped reads whose mate is mapped, provide the oppor ## Input Formats -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the input file. This file can have at least two or three columns according to the format of reads used, i.e. two columns for BAM files and three for FASTQ files (as defined in the tables below). +The input file can have at least two or three columns according to the format of reads used, i.e. two columns for BAM files and three for FASTQ files (as defined in the tables below). 
### FASTQ diff --git a/nextflow_schema.json b/nextflow_schema.json index fc56dfc..ae2376a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,7 +64,8 @@ "aligner": { "type": "string", "default": "bwa-mem", - "description": "Choose if aligner should be bwa-mem or bwa-mem2" + "description": "Choose if aligner should be bwa-mem or bwa-mem2", + "enum": ["bwa-mem", "bwa-mem2"] }, "multiqc_runkraken": { "type": "boolean", From 11141ea383308f73b0f3091d7afff1b426953c5e Mon Sep 17 00:00:00 2001 From: Carpanzano Date: Mon, 24 Oct 2022 14:10:53 +0200 Subject: [PATCH 6/6] added last two suggestions --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 374dbbf..283db66 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -111,11 +111,11 @@ Since the code in the report is executed differently based on the taxonomy id of ### --krakendb -User must provide a Kraken2 database in order to perform the classification. +User must provide a Kraken2 database in order to perform the classification. The database can optionally be provided as a `.tar.gz` archive. ### --kronadb -User must also provide a Krona database in order to generate interactive pie charts with Kronatools. +User must also provide a Krona database in order to generate interactive pie charts with Kronatools. The database can optionally be provided as a `.tar.gz` archive. ## Core Nextflow arguments