
Commit

Merge pull request #4 from genomic-medicine-sweden/input
structure the samplesheet
LilyAnderssonLee authored Dec 1, 2023
2 parents 11b263c + 6022888 commit 7c8e671
Showing 2 changed files with 72 additions and 52 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -5,7 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v1.0dev - [date]

Initial release of nf-core/metaval, created with the [nf-core](https://nf-co.re/) template.
Initial release of metaval, created with the [nf-core](https://nf-co.re/) template.

### `Added`

122 changes: 71 additions & 51 deletions bin/check_samplesheet.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python


"""Provide a command line tool to validate and transform tabular samplesheets."""


import argparse
import csv
import logging
@@ -13,7 +9,6 @@

logger = logging.getLogger()


class RowChecker:
"""
Define a service that can validate and transform each given row.
@@ -24,17 +19,22 @@ class RowChecker:
"""

VALID_FORMATS = (
".fq.gz",
".fastq.gz",
)
VALID_FORMATS = (".fq.gz", ".fastq.gz")

def __init__(
self,
sample_col="sample",
first_col="fastq_1",
second_col="fastq_2",
single_col="single_end",
run_accession_col="run_accession",
instrument_platform_col="instrument_platform",
reads_type_col="reads_type",
fastq_1_col="fastq_1",
fastq_2_col="fastq_2",
fasta_col="fasta",
kraken2_report_col="kraken2_report",
kraken2_classifiedout_col="kraken2_classifiedout",
centrifuge_out_col="centrifuge_out",
centrifuge_result_col="centrifuge_result",
diamond_col="diamond",
**kwargs,
):
"""
@@ -43,20 +43,43 @@ def __init__(
Args:
sample_col (str): The name of the column that contains the sample name
(default "sample").
first_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1").
second_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2").
single_col (str): The name of the new column that will be inserted and
records whether the sample contains single- or paired-end sequencing
reads (default "single_end").
run_accession_col (str): The name of the column that contains the run accession
(default "run_accession").
instrument_platform_col (str): The name of the column that contains the instrument platform
(default "instrument_platform").
reads_type_col (str): The name of the column that contains the reads type, shortread or longread
(default "reads_type").
fastq_1_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1") from bowtie2 unmapped read 1 against the human genome.
fastq_2_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2") from bowtie2 unmapped read 2 against the human genome.
fasta_col (str): The name of the column that contains the FASTA information
(default "fasta") from minimap2 unmapped read against the human genome.
kraken2_report_col (str): The name of the column that contains the kraken2 report
(default "kraken2_report").
kraken2_classifiedout_col (str): The name of the column that contains the kraken2 classifiedout
(default "kraken2_classifiedout") with the format extension "kraken2.kraken2.classifiedreads.txt".
centrifuge_out_col (str): The name of the column that contains the centrifuge out (kraken2-like report)
(default "centrifuge_out").
centrifuge_result_col (str): The name of the column that contains the centrifuge result
(default "centrifuge_result") with the format extension "centrifuge.results.txt".
diamond_col (str): The name of the column that contains the diamond information
(default "diamond") with the format extension ".csv".
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._first_col = first_col
self._second_col = second_col
self._single_col = single_col
self._run_accession_col = run_accession_col
self._instrument_platform_col = instrument_platform_col
self._reads_type_col = reads_type_col
self._fastq_1_col = fastq_1_col
self._fastq_2_col = fastq_2_col
self._fasta_col = fasta_col
self._kraken2_report_col = kraken2_report_col
self._kraken2_classifiedout_col = kraken2_classifiedout_col
self._centrifuge_out_col = centrifuge_out_col
self._centrifuge_result_col = centrifuge_result_col
self._diamond_col = diamond_col
self._seen = set()
self.modified = []
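
For reference, a row matching the column names documented above could look like the following sketch (purely illustrative values; the file names and platform are not taken from the repository):

row = {
    "sample": "sample1",
    "run_accession": "run1",
    "instrument_platform": "ILLUMINA",
    "reads_type": "shortread",
    "fastq_1": "sample1_R1.fastq.gz",
    "fastq_2": "sample1_R2.fastq.gz",
    "fasta": "",
    "kraken2_report": "sample1.kraken2.report.txt",
    "kraken2_classifiedout": "sample1.kraken2.kraken2.classifiedreads.txt",
    "centrifuge_out": "sample1.centrifuge.txt",
    "centrifuge_result": "sample1.centrifuge.results.txt",
    "diamond": "sample1.diamond.csv",
}
RowChecker().validate_and_transform(row)  # passes the per-row checks below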

@@ -73,7 +96,7 @@ def validate_and_transform(self, row):
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col]))
self._seen.add((row[self._sample_col], row[self._fastq_1_col]))
self.modified.append(row)

def _validate_sample(self, row):
@@ -85,25 +108,22 @@ def _validate_sample(self, row):

def _validate_first(self, row):
"""Assert that the first FASTQ entry is non-empty and has the right format."""
if len(row[self._first_col]) <= 0:
if len(row[self._fastq_1_col]) <= 0:
raise AssertionError("At least the first FASTQ file is required.")
self._validate_fastq_format(row[self._first_col])
self._validate_fastq_format(row[self._fastq_1_col])

def _validate_second(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
if len(row[self._second_col]) > 0:
self._validate_fastq_format(row[self._second_col])
if len(row[self._fastq_2_col]) > 0:
self._validate_fastq_format(row[self._fastq_2_col])

def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
if row[self._first_col] and row[self._second_col]:
row[self._single_col] = False
first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
if row[self._fastq_1_col] and row[self._fastq_2_col]:
first_col_suffix = Path(row[self._fastq_1_col]).suffixes[-1]
second_col_suffix = Path(row[self._fastq_2_col]).suffixes[-1]
if first_col_suffix != second_col_suffix:
raise AssertionError("FASTQ pairs must have the same file extensions.")
else:
row[self._single_col] = True
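
Note the switch from suffixes[-2:] to suffixes[-1]: only the final extension is compared now, so for example a "_R1.fastq.gz" / "_R2.fq.gz" pair passes the check because both end in ".gz". A small illustration of the underlying pathlib behaviour (file names invented):

from pathlib import Path

Path("sample_R1.fastq.gz").suffixes       # ['.fastq', '.gz']
Path("sample_R1.fastq.gz").suffixes[-2:]  # ['.fastq', '.gz']  (previous comparison)
Path("sample_R2.fq.gz").suffixes[-1]      # '.gz'              (new comparison)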

def _validate_fastq_format(self, filename):
"""Assert that a given filename has one of the expected FASTQ extensions."""
@@ -118,7 +138,7 @@ def validate_unique_samples(self):
Assert that the combination of sample name and FASTQ filename is unique.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
number of times the same sample exists, but with different FASTQ files, e.g., multiple runs per experiment.
"""
if len(self._seen) != len(self.modified):
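
For example, if "sample1" appears in two rows with different FASTQ files (e.g. two runs), validate_unique_samples() rewrites the names to "sample1_T1" and "sample1_T2", keeping the sample/FASTQ combination unique downstream.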
@@ -164,7 +184,7 @@ def sniff_format(handle):

def check_samplesheet(file_in, file_out):
"""
Check that the tabular samplesheet has the structure expected by nf-core pipelines.
Check that the tabular samplesheet has the structure expected by metaval.
Validate the general shape of the table, expected columns, and each row. Also add
an additional column which records whether one or two FASTQ reads were found.
@@ -174,26 +194,26 @@ def check_samplesheet(file_in, file_out):
CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
file_out (pathlib.Path): Where the validated and transformed samplesheet should
be created; always in CSV format.
Example:
This function checks that the samplesheet follows the following structure,
see also the `viral recon samplesheet`_::
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
.. _viral recon samplesheet:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
"""
required_columns = {"sample", "fastq_1", "fastq_2"}
required_columns = [
"sample",
"run_accession",
"instrument_platform",
"reads_type",
"fastq_1",
"fastq_2",
"fasta",
"kraken2_report",
"kraken2_classifiedout",
"centrifuge_out",
"centrifuge_result",
"diamond",
]
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
# Validate the existence of the expected header columns.
if not required_columns.issubset(reader.fieldnames):
if not required_columns == reader.fieldnames:
req_cols = ", ".join(required_columns)
logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.")
sys.exit(1)
@@ -207,7 +227,7 @@ def check_samplesheet(file_in, file_out):
sys.exit(1)
checker.validate_unique_samples()
header = list(reader.fieldnames)
header.insert(1, "single_end")
# header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
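
A minimal usage sketch of the updated checker, assuming the script is importable as the module check_samplesheet (import path and file names below are illustrative, not part of this diff):

from pathlib import Path

from check_samplesheet import check_samplesheet  # assumes bin/ is on PYTHONPATH

# Validate the samplesheet and write the transformed copy in CSV format
check_samplesheet(Path("samplesheet.csv"), Path("samplesheet.valid.csv"))

In the nf-core template the same function is typically also exposed on the command line, roughly as: python bin/check_samplesheet.py samplesheet.csv samplesheet.valid.csv (the argument parsing is outside the lines shown in this diff).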
