From 51ac8fbb56523ee5bf645922aa3e2107b4a3aa8d Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Fri, 1 Dec 2023 08:58:40 +0100 Subject: [PATCH 1/2] structure the samplesheet --- bin/check_samplesheet.py | 385 +++++++++++++++------------------------ 1 file changed, 147 insertions(+), 238 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4a758fe..7f639da 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,259 +1,168 @@ #!/usr/bin/env python - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging +import os import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. 
Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. +import errno +import argparse - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" +def parse_args(args=None): + Description = "Reformat and check the contents of the samplesheet." -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) + Epilog = "Example usage: python check_samplesheet.py " + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) -def sniff_format(handle): - """ - Detect the tabular format. - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). +def make_dir(path): + if len(path) > 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise exception - Returns: - csv.Dialect: The detected tabular format. - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + print(error_str) + sys.exit(1) def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. 
- - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - + This function checks that the samplesheet follows the structure specified in the provided CSV. """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) + FQ_EXTENSIONS = (".fq.gz", ".fastq.gz") + + sample_mapping_dict = {} + with open(file_in, "r") as fin: + ## Check header + MIN_COLS = 7 + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "reads_type", + "fastq_1", + "fastq_2", + "fasta", + "kraken2_report", + "kraken2_classifiedout", + "centrifuge_out", + "centrifuge_result", + "diamond", + ] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + + ## Check for missing mandatory columns + missing_columns = list(set(HEADER) - set(header)) + if len(missing_columns) > 0: + print( + "ERROR: Missing required column header -> {}. 
Note some columns can otherwise be empty.".format( + ",".join(missing_columns) + ) + ) + sys.exit(1) -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) + ## Find locations of mandatory columns + header_locs = {} + for i in HEADER: + header_locs[i] = header.index(i) + + ## Check sample entries + for line in fin: + ## Pull out only relevant columns for downstream checking + line_parsed = [x.strip().strip('"') for x in line.strip().split(",")] + + # Check valid number of columns per row + if len(line_parsed) < MIN_COLS: + print_error( + f"Invalid number of columns (minimum = {MIN_COLS})!", + "Line", + line, + ) + + lspl = [line_parsed[i] for i in header_locs.values()] + + ## Check sample name entries + + ( + sample, + run_accession, + instrument_platform, + reads_type, + fastq_1, + fastq_2, + fasta, + kraken2_report, + kraken2_classifiedout, + centrifuge_out, + centrifuge_result, + diamond, + ) = lspl + + # Additional checks specific to your CSV format can be added here + + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, ... ] } + if sample not in sample_mapping_dict: + sample_mapping_dict[sample] = [ + run_accession, + instrument_platform, + reads_type, + fastq_1, + fastq_2, + fasta, + kraken2_report, + kraken2_classifiedout, + centrifuge_out, + centrifuge_result, + diamond, + ] + else: + print_error("Samplesheet contains duplicate rows!", "Line", line) + + # Check instrument_platform + # (You can add more checks here based on your specific requirements) + + ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "reads_type", + "fastq_1", + "fastq_2", + "fasta", + "kraken2_report", + "kraken2_classifiedout", + "centrifuge_out", + "centrifuge_result", + "diamond", + ] + if len(sample_mapping_dict) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write("\t".join(HEADER_OUT) + "\n") + for sample in sorted(sample_mapping_dict.keys()): + fout.write(f"{sample}\t{','.join(sample_mapping_dict[sample])}\n") + else: + print_error("No entries to process!", "Samplesheet: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + check_samplesheet(args.FILE_IN, args.FILE_OUT) if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) \ No newline at end of file From 60228883c148f031e7c10e32e42115b046ae7655 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Fri, 1 Dec 2023 11:04:06 +0100 Subject: [PATCH 2/2] update samplesheet check --- CHANGELOG.md | 2 +- bin/check_samplesheet.py | 377 +++++++++++++++++++++++++-------------- 2 files changed, 245 insertions(+), 134 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a89c478..25f377a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.0dev - [date] -Initial release of nf-core/metaval, created with the [nf-core](https://nf-co.re/) template. +Initial release of metaval, created with the [nf-core](https://nf-co.re/) template. 
 ### `Added`
 
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 7f639da..d4edb74 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -1,140 +1,201 @@
 #!/usr/bin/env python
-import os
-import sys
-import errno
 import argparse
+import csv
+import logging
+import sys
+from collections import Counter
+from pathlib import Path
 
+logger = logging.getLogger()
 
-def parse_args(args=None):
-    Description = "Reformat and check the contents of the samplesheet."
+class RowChecker:
+    """
+    Define a service that can validate and transform each given row.
 
-    Epilog = "Example usage: python check_samplesheet.py "
+    Attributes:
+        modified (list): A list of dicts, where each dict corresponds to a previously
+            validated and transformed row. The order of rows is maintained.
 
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
-    parser.add_argument("FILE_OUT", help="Output file.")
-    return parser.parse_args(args)
+    """
 
+    VALID_FORMATS = (".fq.gz", ".fastq.gz")
 
-def make_dir(path):
-    if len(path) > 0:
-        try:
-            os.makedirs(path)
-        except OSError as exception:
-            if exception.errno != errno.EEXIST:
-                raise exception
+    def __init__(
+        self,
+        sample_col="sample",
+        run_accession_col="run_accession",
+        instrument_platform_col="instrument_platform",
+        reads_type_col="reads_type",
+        fastq_1_col="fastq_1",
+        fastq_2_col="fastq_2",
+        fasta_col="fasta",
+        kraken2_report_col="kraken2_report",
+        kraken2_classifiedout_col="kraken2_classifiedout",
+        centrifuge_out_col="centrifuge_out",
+        centrifuge_result_col="centrifuge_result",
+        diamond_col="diamond",
+        **kwargs,
+    ):
+        """
+        Initialize the row checker with the expected column names.
 
+        Args:
+            sample_col (str): The name of the column that contains the sample name
+                (default "sample").
+            run_accession_col (str): The name of the column that contains the run accession
+                (default "run_accession").
+            instrument_platform_col (str): The name of the column that contains the instrument platform
+                (default "instrument_platform").
+            reads_type_col (str): The name of the column that contains the read type, either
+                "shortread" or "longread" (default "reads_type").
+            fastq_1_col (str): The name of the column that contains the first (or only)
+                FASTQ file path (default "fastq_1"), i.e. the read 1 file left unmapped by
+                bowtie2 against the human genome.
+            fastq_2_col (str): The name of the column that contains the second (if any)
+                FASTQ file path (default "fastq_2"), i.e. the read 2 file left unmapped by
+                bowtie2 against the human genome.
+            fasta_col (str): The name of the column that contains the FASTA file path
+                (default "fasta") for reads left unmapped by minimap2 against the human genome.
+            kraken2_report_col (str): The name of the column that contains the kraken2 report
+                (default "kraken2_report").
+            kraken2_classifiedout_col (str): The name of the column that contains the kraken2
+                classified-reads output (default "kraken2_classifiedout"), with the file name
+                ending "kraken2.kraken2.classifiedreads.txt".
+            centrifuge_out_col (str): The name of the column that contains the centrifuge output,
+                a kraken2-style report (default "centrifuge_out").
+            centrifuge_result_col (str): The name of the column that contains the centrifuge result
+                (default "centrifuge_result"), with the file name ending "centrifuge.results.txt".
+            diamond_col (str): The name of the column that contains the diamond output
+                (default "diamond"), with the file extension ".csv".
-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
-    if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
-    print(error_str)
-    sys.exit(1)
+        """
+        super().__init__(**kwargs)
+        self._sample_col = sample_col
+        self._run_accession_col = run_accession_col
+        self._instrument_platform_col = instrument_platform_col
+        self._reads_type_col = reads_type_col
+        self._fastq_1_col = fastq_1_col
+        self._fastq_2_col = fastq_2_col
+        self._fasta_col = fasta_col
+        self._kraken2_report_col = kraken2_report_col
+        self._kraken2_classifiedout_col = kraken2_classifiedout_col
+        self._centrifuge_out_col = centrifuge_out_col
+        self._centrifuge_result_col = centrifuge_result_col
+        self._diamond_col = diamond_col
+        self._seen = set()
+        self.modified = []
 
+    def validate_and_transform(self, row):
+        """
+        Perform all validations on the given row.
 
-def check_samplesheet(file_in, file_out):
+        Args:
+            row (dict): A mapping from column headers (keys) to elements of that row
+                (values).
+
+        """
+        self._validate_sample(row)
+        self._validate_first(row)
+        self._validate_second(row)
+        self._validate_pair(row)
+        self._seen.add((row[self._sample_col], row[self._fastq_1_col]))
+        self.modified.append(row)
+
+    def _validate_sample(self, row):
+        """Assert that the sample name exists and convert spaces to underscores."""
+        if len(row[self._sample_col]) <= 0:
+            raise AssertionError("Sample input is required.")
+        # Sanitize samples slightly.
+        row[self._sample_col] = row[self._sample_col].replace(" ", "_")
+
+    def _validate_first(self, row):
+        """Assert that the first FASTQ entry is non-empty and has the right format."""
+        if len(row[self._fastq_1_col]) <= 0:
+            raise AssertionError("At least the first FASTQ file is required.")
+        self._validate_fastq_format(row[self._fastq_1_col])
+
+    def _validate_second(self, row):
+        """Assert that the second FASTQ entry has the right format if it exists."""
+        if len(row[self._fastq_2_col]) > 0:
+            self._validate_fastq_format(row[self._fastq_2_col])
+
+    def _validate_pair(self, row):
+        """Assert that read pairs have the same file extension."""
+        if row[self._fastq_1_col] and row[self._fastq_2_col]:
+            first_col_suffix = Path(row[self._fastq_1_col]).suffixes[-2:]
+            second_col_suffix = Path(row[self._fastq_2_col]).suffixes[-2:]
+            if first_col_suffix != second_col_suffix:
+                raise AssertionError("FASTQ pairs must have the same file extensions.")
+
+    def _validate_fastq_format(self, filename):
+        """Assert that a given filename has one of the expected FASTQ extensions."""
+        if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
+            raise AssertionError(
+                f"The FASTQ file has an unrecognized extension: {filename}\n"
+                f"It should be one of: {', '.join(self.VALID_FORMATS)}"
+            )
+
+    def validate_unique_samples(self):
+        """
+        Assert that the combination of sample name and FASTQ filename is unique.
+
+        In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
+        number of times the same sample exists, but with different FASTQ files, e.g., multiple runs per experiment.
+
+        """
+        if len(self._seen) != len(self.modified):
+            raise AssertionError("The pair of sample name and FASTQ must be unique.")
+        seen = Counter()
+        for row in self.modified:
+            sample = row[self._sample_col]
+            seen[sample] += 1
+            row[self._sample_col] = f"{sample}_T{seen[sample]}"
+
+
+def read_head(handle, num_lines=10):
+    """Read the specified number of lines from the current position in the file."""
+    lines = []
+    for idx, line in enumerate(handle):
+        if idx == num_lines:
+            break
+        lines.append(line)
+    return "".join(lines)
+
+
+def sniff_format(handle):
     """
-    This function checks that the samplesheet follows the structure specified in the provided CSV.
+    Detect the tabular format.
+
+    Args:
+        handle (text file): A handle to a `text file`_ object. The read position is
+            expected to be at the beginning (index 0).
+
+    Returns:
+        csv.Dialect: The detected tabular format.
+
+    .. _text file:
+        https://docs.python.org/3/glossary.html#term-text-file
+
     """
+    peek = read_head(handle)
+    handle.seek(0)
+    sniffer = csv.Sniffer()
+    dialect = sniffer.sniff(peek)
+    return dialect
-
-    FQ_EXTENSIONS = (".fq.gz", ".fastq.gz")
-
-    sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
-        ## Check header
-        MIN_COLS = 7
-        HEADER = [
-            "sample",
-            "run_accession",
-            "instrument_platform",
-            "reads_type",
-            "fastq_1",
-            "fastq_2",
-            "fasta",
-            "kraken2_report",
-            "kraken2_classifiedout",
-            "centrifuge_out",
-            "centrifuge_result",
-            "diamond",
-        ]
-        header = [x.strip('"') for x in fin.readline().strip().split(",")]
-
-        ## Check for missing mandatory columns
-        missing_columns = list(set(HEADER) - set(header))
-        if len(missing_columns) > 0:
-            print(
-                "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty.".format(
-                    ",".join(missing_columns)
-                )
-            )
-            sys.exit(1)
-
-        ## Find locations of mandatory columns
-        header_locs = {}
-        for i in HEADER:
-            header_locs[i] = header.index(i)
-
-        ## Check sample entries
-        for line in fin:
-            ## Pull out only relevant columns for downstream checking
-            line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]
-
-            # Check valid number of columns per row
-            if len(line_parsed) < MIN_COLS:
-                print_error(
-                    f"Invalid number of columns (minimum = {MIN_COLS})!",
-                    "Line",
-                    line,
-                )
-
-            lspl = [line_parsed[i] for i in header_locs.values()]
-
-            ## Check sample name entries
-
-            (
-                sample,
-                run_accession,
-                instrument_platform,
-                reads_type,
-                fastq_1,
-                fastq_2,
-                fasta,
-                kraken2_report,
-                kraken2_classifiedout,
-                centrifuge_out,
-                centrifuge_result,
-                diamond,
-            ) = lspl
-
-            # Additional checks specific to your CSV format can be added here
-
-            ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, ... ] }
-            if sample not in sample_mapping_dict:
-                sample_mapping_dict[sample] = [
-                    run_accession,
-                    instrument_platform,
-                    reads_type,
-                    fastq_1,
-                    fastq_2,
-                    fasta,
-                    kraken2_report,
-                    kraken2_classifiedout,
-                    centrifuge_out,
-                    centrifuge_result,
-                    diamond,
-                ]
-            else:
-                print_error("Samplesheet contains duplicate rows!", "Line", line)
-
-            # Check instrument_platform
-            # (You can add more checks here based on your specific requirements)
-
-    ## Write validated samplesheet with appropriate columns
-    HEADER_OUT = [
 
+def check_samplesheet(file_in, file_out):
+    """
+    Check that the tabular samplesheet has the structure expected by metaval.
+
+    Validate the general shape of the table, the expected columns, and each row.
+
+    Args:
+        file_in (pathlib.Path): The given tabular samplesheet. The format can be either
+            CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
+        file_out (pathlib.Path): Where the validated and transformed samplesheet should
+            be created; always in CSV format.
+
+    """
+    required_columns = [
         "sample",
         "run_accession",
         "instrument_platform",
@@ -148,21 +209,71 @@ def check_samplesheet(file_in, file_out):
         "centrifuge_result",
         "diamond",
     ]
-    if len(sample_mapping_dict) > 0:
-        out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write("\t".join(HEADER_OUT) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-                fout.write(f"{sample}\t{','.join(sample_mapping_dict[sample])}\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+    # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
+    with file_in.open(newline="") as in_handle:
+        reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
+        # Validate the existence and order of the expected header columns.
+        if required_columns != reader.fieldnames:
+            req_cols = ", ".join(required_columns)
+            logger.critical(f"The sample sheet **must** contain exactly these column headers, in this order: {req_cols}.")
+            sys.exit(1)
+        # Validate each row.
+        checker = RowChecker()
+        for i, row in enumerate(reader):
+            try:
+                checker.validate_and_transform(row)
+            except AssertionError as error:
+                logger.critical(f"{str(error)} On line {i + 2}.")
+                sys.exit(1)
+        checker.validate_unique_samples()
+        header = list(reader.fieldnames)
+        # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
+        with file_out.open(mode="w", newline="") as out_handle:
+            writer = csv.DictWriter(out_handle, header, delimiter=",")
+            writer.writeheader()
+            for row in checker.modified:
+                writer.writerow(row)
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Validate and transform a tabular samplesheet.",
+        epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
+    )
+    parser.add_argument(
+        "file_in",
+        metavar="FILE_IN",
+        type=Path,
+        help="Tabular input samplesheet in CSV or TSV format.",
+    )
+    parser.add_argument(
+        "file_out",
+        metavar="FILE_OUT",
+        type=Path,
+        help="Transformed output samplesheet in CSV format.",
+    )
+    parser.add_argument(
+        "-l",
+        "--log-level",
+        help="The desired log level (default WARNING).",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
+        default="WARNING",
+    )
+    return parser.parse_args(argv)
 
-def main(args=None):
-    args = parse_args(args)
-    check_samplesheet(args.FILE_IN, args.FILE_OUT)
+
+def main(argv=None):
+    """Coordinate argument parsing and program execution."""
+    args = parse_args(argv)
+    logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
+    if not args.file_in.is_file():
+        logger.error(f"The given input file {args.file_in} was not found!")
+        sys.exit(2)
+    args.file_out.parent.mkdir(parents=True, exist_ok=True)
+    check_samplesheet(args.file_in, args.file_out)
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
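
A quick usage sketch of the validator as it stands after the second commit (this note is not part of the patch itself). The column set, column order, and the ".fq.gz"/".fastq.gz" extension rule come from the final check_samplesheet.py above; the file names and row values below are hypothetical:

    # Build a minimal samplesheet and validate it (hypothetical file names).
    from pathlib import Path

    header = (
        "sample,run_accession,instrument_platform,reads_type,fastq_1,fastq_2,fasta,"
        "kraken2_report,kraken2_classifiedout,centrifuge_out,centrifuge_result,diamond"
    )
    row = (
        "SAMPLE1,RUN1,ILLUMINA,shortread,s1_R1.fastq.gz,s1_R2.fastq.gz,,"
        "s1.kraken2.report.txt,s1.kraken2.kraken2.classifiedreads.txt,"
        "s1.centrifuge.out.txt,s1.centrifuge.results.txt,s1.diamond.csv"
    )
    Path("samplesheet.csv").write_text(header + "\n" + row + "\n")

    # Then, from the shell:
    #   python check_samplesheet.py samplesheet.csv samplesheet.valid.csv
    #
    # csv.Sniffer should detect the comma dialect; the header must match
    # required_columns exactly, including order, or the script exits with
    # an error. validate_unique_samples() renames SAMPLE1 to SAMPLE1_T1
    # (run suffixing), and the validated sheet is written as CSV.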