diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index e3fd940..ae5bb47 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -26,10 +26,10 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests -You have the option to test your changes locally by running the pipeline. For receiving warnings about process selectors and other `debug` information, it is recommended to use the debug profile. Execute all the tests with the following command: +You have the option to test your changes locally by running the pipeline. For receiving warnings about process selectors and other `debug` information, it is recommended to use the debug profile. Execute the main functionality test with the following command: ```bash -nf-test test --profile debug,test,docker --verbose +nextflow run . -profile debug,test,docker --outdir <OUTDIR> ``` When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 5068d7f..890a6fe 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -18,7 +18,6 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/deta - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/detaxizer/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/detaxizer _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. 
diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc8..78a3fd2 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,5 @@ repository_type: pipeline +lint: + files_unchanged: + - .github/CONTRIBUTING.md + - .github/PULL_REQUEST_TEMPLATE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index bb6a40c..70fbb35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.0 - [2024-03-08] +## v1.0.0 - Kobbfarbad - [2024-03-20] Initial release of nf-core/detaxizer, created with the [nf-core](https://nf-co.re/) template. diff --git a/README.md b/README.md index 049c71a..cb0a80b 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ First, prepare a samplesheet with your input data that looks as follows: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2,fastq_3 +sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,AEG588A1_S1_L002_R3_001.fastq.gz ``` diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index ced1eca..fcb0266 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fastq_1,fastq_2,fastq_3 +sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1 test_paired-end_plus_long-reads,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R3_001.fastq.gz test_paired-end,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz, test_single-end_short,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,,, diff --git a/assets/schema_input.json b/assets/schema_input.json index ee88757..d7e71f0 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,27 +13,27 @@ "errorMessage": "Sample name must 
be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "short_reads_fastq_1": { "type": "string", "format": "file-path", "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for short reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, - "fastq_2": { + "short_reads_fastq_2": { "type": "string", "format": "file-path", "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for short reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, - "fastq_3": { + "long_reads_fastq_1": { "type": "string", "format": "file-path", "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 3 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for long reads cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" } } }, "required": ["sample"], - "anyOf": [{ "required": ["fastq_1"] }, { "required": ["fastq_3"] }] + "anyOf": [{ "required": ["short_reads_fastq_1"] }, { "required": ["long_reads_fastq_1"] }] } diff --git a/bin/rename_fastq_headers_pre.py b/bin/rename_fastq_headers_pre.py index 4bbb4c0..45fe10d 100755 --- a/bin/rename_fastq_headers_pre.py +++ b/bin/rename_fastq_headers_pre.py @@ -5,7 +5,6 @@ from Bio import SeqIO, bgzf import gzip import sys -import json import argparse import re @@ -37,15 +36,16 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple: if read_fw.endswith("/1"): read_fw_stripped = read_fw[:-2] else: - sys.exit("Please provide the forward reads in fastq_1 (where the headers are as follows: 'example.1/1').") + raise ValueError("Please provide the forward reads in short_reads_fastq_1 (where the 
headers are as follows: 'example.1/1').") if read_rv.endswith("/2"): read_rv_stripped = read_rv[:-2] else: - sys.exit("Please provide the reverse reads in fastq_2 (where the headers are as follows: 'example.1/2').") + raise ValueError("Please provide the reverse reads in short_reads_fastq_2 (where the headers are as follows: 'example.1/2').") if read_fw_stripped != read_rv_stripped: - sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.") + msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}." + raise ValueError(msg) else: read_dict[read_fw_stripped] = [read_fw, read_rv] read_renamed = [read_fw_stripped,read_rv_stripped] @@ -55,15 +55,16 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple: if read_fw_split.endswith("/1"): read_fw_stripped = read_fw_split[:-2] else: - sys.exit("Please provide the forward reads in fastq_1 (where the headers are as follows: 'example.1/1 additionalInformation').") + raise ValueError("Please provide the forward reads in short_reads_fastq_1 (where the headers are as follows: 'example.1/1 additionalInformation').") if read_rv_split.endswith("/2"): read_rv_stripped = read_rv_split[:-2] else: - sys.exit("Please provide the reverse reads in fastq_2 (where the headers are as follows: 'example.1/2 additionalInformation').") + raise ValueError("Please provide the reverse reads in short_reads_fastq_2 (where the headers are as follows: 'example.1/2 additionalInformation').") if read_fw_stripped != read_rv_stripped: - sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.") + msg = f"Read IDs were not matching! Please provide matching IDs in the headers. 
The problematic reads were {read_fw} and {read_rv} in the files {filenames}." + raise ValueError(msg) else: read_dict[read_fw_stripped] = [read_fw, read_rv] read_renamed = [read_fw_stripped,read_rv_stripped] @@ -72,7 +73,8 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple: read_rv_split = read_rv.split(" ")[0] if read_fw_split != read_rv_split: - sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.") + msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}." + raise ValueError(msg) else: read_dict[read_fw_split] = [read_fw, read_rv] read_renamed = [read_fw_split,read_rv_split] @@ -81,18 +83,21 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple: read_rv_split = read_rv.split(" ")[0] if read_fw_split != read_rv_split: - sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.") + msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}." + raise ValueError(msg) else: read_dict[read_fw_split] = [read_fw, read_rv] read_renamed = [read_fw_split,read_rv_split] elif bool(re.match(pattern5,read_fw)) and bool(re.match(pattern5,read_rv)): if read_fw != read_rv: - sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.") + msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}." 
+ raise ValueError(msg) else: read_dict[read_fw] = [read_fw, read_rv] read_renamed = [read_fw,read_rv] else: - sys.exit(f"The provided files, {filenames}, contained reads with headers not supported by the pipeline.\n Please use one of the formats:\n example.1/1\n example.1/1 additionalInformation\n readID1 additionalTechnicalInformation\n readID1 additionalTechnicalInformation additionalInformation\n readID1\nAny other format is not supported.") + msg = f"The provided files, {filenames}, contained reads with headers not supported by the pipeline.\n Please use one of the formats:\n example.1/1\n example.1/1 additionalInformation\n readID1 additionalTechnicalInformation\n readID1 additionalTechnicalInformation additionalInformation\n readID1\nAny other format is not supported." + raise ValueError(msg) return (read_dict,read_renamed) def renameReadSingle(read: str, filename: str) -> tuple: @@ -128,7 +133,8 @@ def renameReadSingle(read: str, filename: str) -> tuple: read_dict[read] = [read] read_renamed = [read] else: - sys.exit(f"The provided file, {filename}, contained reads with headers not supported by the pipeline.\n Please use one of the formats:\n example.1/1\n example.1/1 additionalInformation\n readID1 additionalTechnicalInformation\n readID1 additionalTechnicalInformation additionalInformation\n readID1\nAny other format is not supported.") + msg = f"The provided file, {filename}, contained reads with headers not supported by the pipeline.\n Please use one of the formats:\n example.1/1\n example.1/1 additionalInformation\n readID1 additionalTechnicalInformation\n readID1 additionalTechnicalInformation additionalInformation\n readID1\nAny other format is not supported." 
+ raise ValueError(msg) return (read_dict,read_renamed) def main(): diff --git a/conf/modules.config b/conf/modules.config index f67bb1a..33912c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -101,7 +101,7 @@ process { withName: FILTER_BLASTN_IDENTCOV { publishDir = [ - path: { "${params.outdir}/blast/filteredIdentCov" }, + path: { "${params.outdir}/blast/filtered_ident_cov" }, mode: params.publish_dir_mode, pattern: '*identcov.txt', enabled: params.save_intermediates diff --git a/docs/output.md b/docs/output.md index 921975b..27ccd97 100644 --- a/docs/output.md +++ b/docs/output.md @@ -72,7 +72,7 @@ blastn can validate the reads classified by kraken2 as the taxon/taxa to be asse Output files - `blast/` - - `filteredIdentCov/`: The read ids and statistics of the reads which were validated by blastn to be the taxon/taxa to assess/to filter. + - `filtered_ident_cov/`: The read ids and statistics of the reads which were validated by blastn to be the taxon/taxa to assess/to filter. - `_R1.identcov.txt`: File is present for single-end and paired-end short reads. - `_R2.identcov.txt`: File is present for paired-end short reads. - `_longReads.identcov.txt`: File is present for long reads. diff --git a/docs/usage.md b/docs/usage.md index d9a450b..60948d6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,12 +18,12 @@ You will need to create a samplesheet with information about the samples you wou ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. For single-end short reads use the second column, for long reads use the forth column. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. 
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. For single-end short reads use the column `short_reads_fastq_1`, for long reads use the column `long_reads_fastq_1`. A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 5 samples, showing all possible combinations of short and long reads. ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2,fastq_3 +sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1 SINGLE_END_SHORT,AEG588A1_S1_L002_R1_001.fastq.gz,, PAIRED_END_SHORT,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz, SINGLE_END_LONG,,,AEG588A3_001.fastq.gz @@ -31,12 +31,12 @@ SINGLE_END_SHORT_LONG,AEG588A4_S1_L002_R1_001.fastq.gz,,AEG588A4_001.fastq.gz PAIRED_END_PLUS_LONG,AEG588A5_S1_L002_R1_001.fastq.gz,AEG588A5_S1_L002_R2_001.fastq.gz,AEG588A5_001.fastq.gz ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional, if `fastq_3` is also provided. | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional. Only used for paired-end files. | -| `fastq_3` | Full path to FastQ file for long reads. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional. Use only for long reads. 
| +| Column | Description | +| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `short_reads_fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional, if `long_reads_fastq_1` is also provided. | +| `short_reads_fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional. Only used for paired-end files. | +| `long_reads_fastq_1` | Full path to FastQ file for long reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional. Use only for long reads. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. diff --git a/workflows/detaxizer.nf b/workflows/detaxizer.nf index 9367882..ee5f2f2 100644 --- a/workflows/detaxizer.nf +++ b/workflows/detaxizer.nf @@ -56,16 +56,6 @@ workflow DETAXIZER { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // check whether the sample sheet is correctly formated - ch_samplesheet.map { - meta, fastq_1, fastq_2, fastq_3 -> - if (!fastq_1 && !fastq_3){ - error("Please provide at least one single end file as input in the sample sheet for ${meta.id}.") - } else if (!fastq_1 && fastq_2 && fastq_3){ - error("Please provide single end reads in following format in the sample sheet: base name, fastq_1,,fastq_3. fastq_1 is the short read file, fastq_3 the long read file. 
The wrongly formated entry is ${meta.id}.") - } - } - ch_samplesheet.branch { shortReads: it[1] }.set { @@ -73,18 +63,18 @@ workflow DETAXIZER { } ch_short.shortReads.map{ - meta, fastq_1, fastq_2, fastq_3 -> - if (fastq_2){ + meta, short_reads_fastq_1, short_reads_fastq_2, long_reads_fastq_1 -> + if (short_reads_fastq_2){ def newMeta = meta.clone() newMeta.single_end = false newMeta.long_reads = false - return [newMeta, [fastq_1, fastq_2]] + return [newMeta, [short_reads_fastq_1, short_reads_fastq_2]] } else { def newMeta = meta.clone() newMeta.id = "${newMeta.id}_R1" newMeta.single_end = true newMeta.long_reads = false - return [newMeta, fastq_1] + return [newMeta, short_reads_fastq_1] } }.set{ ch_short @@ -97,12 +87,12 @@ workflow DETAXIZER { } ch_long.longReads.map { - meta, fastq_1, fastq_2, fastq_3 -> + meta, short_reads_fastq_1, short_reads_fastq_2, long_reads_fastq_1 -> def newMeta = meta.clone() newMeta.id = "${newMeta.id}_longReads" newMeta.single_end = true newMeta.long_reads = true - return [newMeta, fastq_3] + return [newMeta, long_reads_fastq_1] }.set { ch_long }