Merge pull request #24 from jannikseidelQBiC/corrections_release1.0.0

Corrections for release 1.0.0
nf-core · Mar 20, 2024 · ec4259d · ec4259d
2 parents cf315bf + c1e736f
commit ec4259d
Show file tree

Hide file tree

Showing 12 changed files with 50 additions and 51 deletions.
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -26,10 +26,10 @@ If you're not used to this workflow with git, you can start with some [docs from
 
 ## Tests
 
-You have the option to test your changes locally by running the pipeline. For receiving warnings about process selectors and other `debug` information, it is recommended to use the debug profile. Execute all the tests with the following command:
+You have the option to test your changes locally by running the pipeline. For receiving warnings about process selectors and other `debug` information, it is recommended to use the debug profile. Execute the main functionality test with the following command:
 
 ```bash
-nf-test test --profile debug,test,docker --verbose
+nextflow run . -profile debug,test,docker --outdir <OUTDIR>
 ```
 
 When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests.

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -18,7 +18,6 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/deta
 - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/detaxizer/tree/master/.github/CONTRIBUTING.md)
 - [ ] If necessary, also make a PR on the nf-core/detaxizer _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
 - [ ] Make sure your code lints (`nf-core lint`).
-- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`).
 - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`).
 - [ ] Usage Documentation in `docs/usage.md` is updated.
 - [ ] Output Documentation in `docs/output.md` is updated.

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1 +1,5 @@
 repository_type: pipeline
+lint:
+  files_unchanged:
+    - .github/CONTRIBUTING.md
+    - .github/PULL_REQUEST_TEMPLATE.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v1.0.0 - [2024-03-08]
+## v1.0.0 - Kobbfarbad - [2024-03-20]
 
 Initial release of nf-core/detaxizer, created with the [nf-core](https://nf-co.re/) template.
 

diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@
 First, prepare a samplesheet with your input data that looks as follows:
 
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2,fastq_3
+sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1
 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,AEG588A1_S1_L002_R3_001.fastq.gz
 ```
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,4 +1,4 @@
-sample,fastq_1,fastq_2,fastq_3
+sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1
 test_paired-end_plus_long-reads,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R3_001.fastq.gz
 test_paired-end,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,
 test_single-end_short,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,,

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -13,27 +13,27 @@
                 "errorMessage": "Sample name must be provided and cannot contain spaces",
                 "meta": ["id"]
             },
-            "fastq_1": {
+            "short_reads_fastq_1": {
                 "type": "string",
                 "format": "file-path",
                 "exists": true,
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "errorMessage": "FastQ file for short reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
-            "fastq_2": {
+            "short_reads_fastq_2": {
                 "type": "string",
                 "format": "file-path",
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "errorMessage": "FastQ file for short reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
-            "fastq_3": {
+            "long_reads_fastq_1": {
                 "type": "string",
                 "format": "file-path",
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 3 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "errorMessage": "FastQ file for long reads cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             }
         }
     },
     "required": ["sample"],
-    "anyOf": [{ "required": ["fastq_1"] }, { "required": ["fastq_3"] }]
+    "anyOf": [{ "required": ["short_reads_fastq_1"] }, { "required": ["long_reads_fastq_1"] }]
 }
diff --git a/bin/rename_fastq_headers_pre.py b/bin/rename_fastq_headers_pre.py
@@ -5,7 +5,6 @@
 from Bio import SeqIO, bgzf
 import gzip
 import sys
-import json
 import argparse
 import re
 
@@ -37,15 +36,16 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple:
         if read_fw.endswith("/1"):
             read_fw_stripped = read_fw[:-2]
         else:
-            sys.exit("Please provide the forward reads in fastq_1 (where the headers are as follows: 'example.1/1').")
+            raise ValueError("Please provide the forward reads in short_reads_fastq_1 (where the headers are as follows: 'example.1/1').")
 
         if read_rv.endswith("/2"):
             read_rv_stripped = read_rv[:-2]
         else:
-            sys.exit("Please provide the reverse reads in fastq_2 (where the headers are as follows: 'example.1/2').")
+            raise ValueError("Please provide the reverse reads in short_reads_fastq_2 (where the headers are as follows: 'example.1/2').")
 
         if read_fw_stripped != read_rv_stripped:
-            sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.")
+            msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}."
+            raise ValueError(msg)
         else:
             read_dict[read_fw_stripped] = [read_fw, read_rv]
             read_renamed = [read_fw_stripped,read_rv_stripped]
@@ -55,15 +55,16 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple:
         if read_fw_split.endswith("/1"):
             read_fw_stripped = read_fw_split[:-2]
         else:
-            sys.exit("Please provide the forward reads in fastq_1 (where the headers are as follows: 'example.1/1 additionalInformation').")
+            raise ValueError("Please provide the forward reads in short_reads_fastq_1 (where the headers are as follows: 'example.1/1 additionalInformation').")
 
         if read_rv_split.endswith("/2"):
             read_rv_stripped = read_rv_split[:-2]
         else:
-            sys.exit("Please provide the reverse reads in fastq_2 (where the headers are as follows: 'example.1/2 additionalInformation').")
+            raise ValueError("Please provide the reverse reads in short_reads_fastq_2 (where the headers are as follows: 'example.1/2 additionalInformation').")
 
         if read_fw_stripped != read_rv_stripped:
-            sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.")
+            msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}."
+            raise ValueError(msg)
         else:
             read_dict[read_fw_stripped] = [read_fw, read_rv]
             read_renamed = [read_fw_stripped,read_rv_stripped]
@@ -72,7 +73,8 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple:
         read_rv_split = read_rv.split(" ")[0]
 
         if read_fw_split != read_rv_split:
-            sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.")
+            msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}."
+            raise ValueError(msg)
         else:
             read_dict[read_fw_split] = [read_fw, read_rv]
             read_renamed = [read_fw_split,read_rv_split]
@@ -81,18 +83,21 @@ def renameReadsPaired(reads: tuple, filenames: str) -> tuple:
         read_rv_split = read_rv.split(" ")[0]
 
         if read_fw_split != read_rv_split:
-            sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.")
+            msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}."
+            raise ValueError(msg)
         else:
             read_dict[read_fw_split] = [read_fw, read_rv]
             read_renamed = [read_fw_split,read_rv_split]
     elif bool(re.match(pattern5,read_fw)) and bool(re.match(pattern5,read_rv)):
         if read_fw != read_rv:
-            sys.exit(f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}.")
+            msg = f"Read IDs were not matching! Please provide matching IDs in the headers. The problematic reads were {read_fw} and {read_rv} in the files {filenames}."
+            raise ValueError(msg)
         else:
             read_dict[read_fw] = [read_fw, read_rv]
             read_renamed = [read_fw,read_rv]
     else:
-        sys.exit(f"The provided files, {filenames}, contained reads with headers not supported by the pipeline.\n  Please use one of the formats:\n    example.1/1\n    example.1/1 additionalInformation\n    readID1 additionalTechnicalInformation\n    readID1 additionalTechnicalInformation additionalInformation\n    readID1\nAny other format is not supported.")
+        msg = f"The provided files, {filenames}, contained reads with headers not supported by the pipeline.\n  Please use one of the formats:\n    example.1/1\n    example.1/1 additionalInformation\n    readID1 additionalTechnicalInformation\n    readID1 additionalTechnicalInformation additionalInformation\n    readID1\nAny other format is not supported."
+        raise ValueError(msg)
     return (read_dict,read_renamed)
 
 def renameReadSingle(read: str, filename: str) -> tuple:
@@ -128,7 +133,8 @@ def renameReadSingle(read: str, filename: str) -> tuple:
             read_dict[read] = [read]
             read_renamed = [read]
     else:
-        sys.exit(f"The provided file, {filename}, contained reads with headers not supported by the pipeline.\n  Please use one of the formats:\n    example.1/1\n    example.1/1 additionalInformation\n    readID1 additionalTechnicalInformation\n    readID1 additionalTechnicalInformation additionalInformation\n    readID1\nAny other format is not supported.")
+        msg = f"The provided file, {filename}, contained reads with headers not supported by the pipeline.\n  Please use one of the formats:\n    example.1/1\n    example.1/1 additionalInformation\n    readID1 additionalTechnicalInformation\n    readID1 additionalTechnicalInformation additionalInformation\n    readID1\nAny other format is not supported."
+        raise ValueError(msg)
     return (read_dict,read_renamed)
 
 def main():

diff --git a/conf/modules.config b/conf/modules.config
@@ -101,7 +101,7 @@ process {
 
     withName: FILTER_BLASTN_IDENTCOV {
         publishDir = [
-            path: { "${params.outdir}/blast/filteredIdentCov" },
+            path: { "${params.outdir}/blast/filtered_ident_cov" },
             mode: params.publish_dir_mode,
             pattern: '*identcov.txt',
             enabled: params.save_intermediates

diff --git a/docs/output.md b/docs/output.md
@@ -72,7 +72,7 @@ blastn can validate the reads classified by kraken2 as the taxon/taxa to be asse
 <summary>Output files</summary>
 
 - `blast/`
-  - `filteredIdentCov/`: The read ids and statistics of the reads which were validated by blastn to be the taxon/taxa to assess/to filter.
+  - `filtered_ident_cov/`: The read ids and statistics of the reads which were validated by blastn to be the taxon/taxa to assess/to filter.
     - `<sample>_R1.identcov.txt`: File is present for single-end and paired-end short reads.
     - `<sample>_R2.identcov.txt`: File is present for paired-end short reads.
     - `<sample>_longReads.identcov.txt`: File is present for long reads.

diff --git a/docs/usage.md b/docs/usage.md
@@ -18,25 +18,25 @@ You will need to create a samplesheet with information about the samples you wou
 
 ### Full samplesheet
 
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. For single-end short reads use the second column, for long reads use the forth column.
+The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. For single-end short reads use the column `short_reads_fastq_1`, for long reads use the column `long_reads_fastq_1`.
 
 A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 5 samples, showing all possible combinations of short and long reads.
 
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2,fastq_3
+sample,short_reads_fastq_1,short_reads_fastq_2,long_reads_fastq_1
 SINGLE_END_SHORT,AEG588A1_S1_L002_R1_001.fastq.gz,,
 PAIRED_END_SHORT,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz,
 SINGLE_END_LONG,,,AEG588A3_001.fastq.gz
 SINGLE_END_SHORT_LONG,AEG588A4_S1_L002_R1_001.fastq.gz,,AEG588A4_001.fastq.gz
 PAIRED_END_PLUS_LONG,AEG588A5_S1_L002_R1_001.fastq.gz,AEG588A5_S1_L002_R2_001.fastq.gz,AEG588A5_001.fastq.gz
 ```
 
-| Column    | Description                                                                                                                                                                            |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional, if `fastq_3` is also provided.                     |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional. Only used for paired-end files.                    |
-| `fastq_3` | Full path to FastQ file for long reads. File has to be gzipped and have the extension ".fastq.gz" or "fq.gz". Optional. Use only for long reads.                                       |
+| Column                | Description                                                                                                                                                                            |
+| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`              | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `short_reads_fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional, if `long_reads_fastq_1` is also provided.         |
+| `short_reads_fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional. Only used for paired-end files.                   |
+| `long_reads_fastq_1`  | Full path to FastQ file for long reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". Optional. Use only for long reads.                                      |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

diff --git a/workflows/detaxizer.nf b/workflows/detaxizer.nf
@@ -56,35 +56,25 @@ workflow DETAXIZER {
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
-    // check whether the sample sheet is correctly formated
-    ch_samplesheet.map {
-            meta, fastq_1, fastq_2, fastq_3 ->
-                if (!fastq_1 && !fastq_3){
-                    error("Please provide at least one single end file as input in the sample sheet for ${meta.id}.")
-                } else if (!fastq_1 && fastq_2 && fastq_3){
-                    error("Please provide single end reads in following format in the sample sheet: base name, fastq_1,,fastq_3. fastq_1 is the short read file, fastq_3 the long read file. The wrongly formated entry is ${meta.id}.")
-            }
-        }
-
     ch_samplesheet.branch {
         shortReads: it[1]
         }.set {
             ch_short
         }
 
     ch_short.shortReads.map{
-        meta, fastq_1, fastq_2, fastq_3 ->
-            if (fastq_2){
+        meta, short_reads_fastq_1, short_reads_fastq_2, long_reads_fastq_1 ->
+            if (short_reads_fastq_2){
                 def newMeta = meta.clone()
                 newMeta.single_end = false
                 newMeta.long_reads = false
-                return [newMeta, [fastq_1, fastq_2]]
+                return [newMeta, [short_reads_fastq_1, short_reads_fastq_2]]
             } else {
                 def newMeta = meta.clone()
                 newMeta.id = "${newMeta.id}_R1"
                 newMeta.single_end = true
                 newMeta.long_reads = false
-                return [newMeta, fastq_1]
+                return [newMeta, short_reads_fastq_1]
             }
     }.set{
         ch_short
@@ -97,12 +87,12 @@ workflow DETAXIZER {
     }
 
     ch_long.longReads.map {
-        meta, fastq_1, fastq_2, fastq_3 ->
+        meta, short_reads_fastq_1, short_reads_fastq_2, long_reads_fastq_1 ->
             def newMeta = meta.clone()
             newMeta.id = "${newMeta.id}_longReads"
             newMeta.single_end = true
             newMeta.long_reads = true
-            return [newMeta, fastq_3]
+            return [newMeta, long_reads_fastq_1]
     }.set {
         ch_long
     }