From 51ac8fbb56523ee5bf645922aa3e2107b4a3aa8d Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Fri, 1 Dec 2023 08:58:40 +0100 Subject: [PATCH 1/2] structure the samplesheet --- bin/check_samplesheet.py | 385 +++++++++++++++------------------------ 1 file changed, 147 insertions(+), 238 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4a758fe..7f639da 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,259 +1,168 @@ #!/usr/bin/env python - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging +import os import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. 
Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. +import errno +import argparse - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" +def parse_args(args=None): + Description = "Reformat and check the contents of the samplesheet." -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) + Epilog = "Example usage: python check_samplesheet.py " + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) -def sniff_format(handle): - """ - Detect the tabular format. - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). +def make_dir(path): + if len(path) > 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise exception - Returns: - csv.Dialect: The detected tabular format. - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + print(error_str) + sys.exit(1) def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. 
- - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - + This function checks that the samplesheet follows the structure specified in the provided CSV. """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) + FQ_EXTENSIONS = (".fq.gz", ".fastq.gz") + + sample_mapping_dict = {} + with open(file_in, "r") as fin: + ## Check header + MIN_COLS = 7 + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "reads_type", + "fastq_1", + "fastq_2", + "fasta", + "kraken2_report", + "kraken2_classifiedout", + "centrifuge_out", + "centrifuge_result", + "diamond", + ] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + + ## Check for missing mandatory columns + missing_columns = list(set(HEADER) - set(header)) + if len(missing_columns) > 0: + print( + "ERROR: Missing required column header -> {}. 
Note some columns can otherwise be empty.".format( + ",".join(missing_columns) + ) + ) + sys.exit(1) -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) + ## Find locations of mandatory columns + header_locs = {} + for i in HEADER: + header_locs[i] = header.index(i) + + ## Check sample entries + for line in fin: + ## Pull out only relevant columns for downstream checking + line_parsed = [x.strip().strip('"') for x in line.strip().split(",")] + + # Check valid number of columns per row + if len(line_parsed) < MIN_COLS: + print_error( + f"Invalid number of columns (minimum = {MIN_COLS})!", + "Line", + line, + ) + + lspl = [line_parsed[i] for i in header_locs.values()] + + ## Check sample name entries + + ( + sample, + run_accession, + instrument_platform, + reads_type, + fastq_1, + fastq_2, + fasta, + kraken2_report, + kraken2_classifiedout, + centrifuge_out, + centrifuge_result, + diamond, + ) = lspl + + # Additional checks specific to your CSV format can be added here + + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, ... ] } + if sample not in sample_mapping_dict: + sample_mapping_dict[sample] = [ + run_accession, + instrument_platform, + reads_type, + fastq_1, + fastq_2, + fasta, + kraken2_report, + kraken2_classifiedout, + centrifuge_out, + centrifuge_result, + diamond, + ] + else: + print_error("Samplesheet contains duplicate rows!", "Line", line) + + # Check instrument_platform + # (You can add more checks here based on your specific requirements) + + ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "reads_type", + "fastq_1", + "fastq_2", + "fasta", + "kraken2_report", + "kraken2_classifiedout", + "centrifuge_out", + "centrifuge_result", + "diamond", + ] + if len(sample_mapping_dict) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write("\t".join(HEADER_OUT) + "\n") + for sample in sorted(sample_mapping_dict.keys()): + fout.write(f"{sample}\t{','.join(sample_mapping_dict[sample])}\n") + else: + print_error("No entries to process!", "Samplesheet: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + check_samplesheet(args.FILE_IN, args.FILE_OUT) if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) \ No newline at end of file From 60228883c148f031e7c10e32e42115b046ae7655 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Fri, 1 Dec 2023 11:04:06 +0100 Subject: [PATCH 2/2] update samplesheet check --- CHANGELOG.md | 2 +- bin/check_samplesheet.py | 377 +++++++++++++++++++++++++-------------- 2 files changed, 245 insertions(+), 134 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a89c478..25f377a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.0dev - [date] -Initial release of nf-core/metaval, created with the [nf-core](https://nf-co.re/) template. +Initial release of metaval, created with the [nf-core](https://nf-co.re/) template. 
 ### `Added`
 
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 7f639da..d4edb74 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -1,140 +1,201 @@
 #!/usr/bin/env python
-import os
-import sys
-import errno
 import argparse
+import csv
+import logging
+import sys
+from collections import Counter
+from pathlib import Path
 
+logger = logging.getLogger()
 
-def parse_args(args=None):
-    Description = "Reformat and check the contents of the samplesheet."
+class RowChecker:
+    """
+    Define a service that can validate and transform each given row.
 
-    Epilog = "Example usage: python check_samplesheet.py "
+    Attributes:
+        modified (list): A list of dicts, where each dict corresponds to a previously
+            validated and transformed row. The order of rows is maintained.
 
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
-    parser.add_argument("FILE_OUT", help="Output file.")
-    return parser.parse_args(args)
+    """
 
+    VALID_FORMATS = (".fq.gz", ".fastq.gz")
 
-def make_dir(path):
-    if len(path) > 0:
-        try:
-            os.makedirs(path)
-        except OSError as exception:
-            if exception.errno != errno.EEXIST:
-                raise exception
+    def __init__(
+        self,
+        sample_col="sample",
+        run_accession_col="run_accession",
+        instrument_platform_col="instrument_platform",
+        reads_type_col="reads_type",
+        fastq_1_col="fastq_1",
+        fastq_2_col="fastq_2",
+        fasta_col="fasta",
+        kraken2_report_col="kraken2_report",
+        kraken2_classifiedout_col="kraken2_classifiedout",
+        centrifuge_out_col="centrifuge_out",
+        centrifuge_result_col="centrifuge_result",
+        diamond_col="diamond",
+        **kwargs,
+    ):
+        """
+        Initialize the row checker with the expected column names.
 
+        Args:
+            sample_col (str): The name of the column that contains the sample name
+                (default "sample").
+            run_accession_col (str): The name of the column that contains the run accession
+                (default "run_accession").
+            instrument_platform_col (str): The name of the column that contains the instrument platform
+                (default "instrument_platform").
+            reads_type_col (str): The name of the column that contains the read type, either
+                "shortread" or "longread" (default "reads_type").
+            fastq_1_col (str): The name of the column that contains the first (or only)
+                FASTQ file path (default "fastq_1"), i.e. the read 1 file left unmapped by
+                bowtie2 against the human genome.
+            fastq_2_col (str): The name of the column that contains the second (if any)
+                FASTQ file path (default "fastq_2"), i.e. the read 2 file left unmapped by
+                bowtie2 against the human genome.
+            fasta_col (str): The name of the column that contains the FASTA file path
+                (default "fasta") for reads left unmapped by minimap2 against the human genome.
+            kraken2_report_col (str): The name of the column that contains the kraken2 report
+                (default "kraken2_report").
+            kraken2_classifiedout_col (str): The name of the column that contains the kraken2
+                classified-reads output (default "kraken2_classifiedout"), with the file name
+                ending "kraken2.kraken2.classifiedreads.txt".
+            centrifuge_out_col (str): The name of the column that contains the centrifuge output,
+                a kraken2-style report (default "centrifuge_out").
+            centrifuge_result_col (str): The name of the column that contains the centrifuge result
+                (default "centrifuge_result"), with the file name ending "centrifuge.results.txt".
+            diamond_col (str): The name of the column that contains the diamond output
+                (default "diamond"), with the file extension ".csv".
-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
-    if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
-    print(error_str)
-    sys.exit(1)
+        """
+        super().__init__(**kwargs)
+        self._sample_col = sample_col
+        self._run_accession_col = run_accession_col
+        self._instrument_platform_col = instrument_platform_col
+        self._reads_type_col = reads_type_col
+        self._fastq_1_col = fastq_1_col
+        self._fastq_2_col = fastq_2_col
+        self._fasta_col = fasta_col
+        self._kraken2_report_col = kraken2_report_col
+        self._kraken2_classifiedout_col = kraken2_classifiedout_col
+        self._centrifuge_out_col = centrifuge_out_col
+        self._centrifuge_result_col = centrifuge_result_col
+        self._diamond_col = diamond_col
+        self._seen = set()
+        self.modified = []
 
+    def validate_and_transform(self, row):
+        """
+        Perform all validations on the given row.
 
-def check_samplesheet(file_in, file_out):
+        Args:
+            row (dict): A mapping from column headers (keys) to elements of that row
+                (values).
+
+        """
+        self._validate_sample(row)
+        self._validate_first(row)
+        self._validate_second(row)
+        self._validate_pair(row)
+        self._seen.add((row[self._sample_col], row[self._fastq_1_col]))
+        self.modified.append(row)
+
+    def _validate_sample(self, row):
+        """Assert that the sample name exists and convert spaces to underscores."""
+        if len(row[self._sample_col]) <= 0:
+            raise AssertionError("Sample input is required.")
+        # Sanitize samples slightly.
+        row[self._sample_col] = row[self._sample_col].replace(" ", "_")
+
+    def _validate_first(self, row):
+        """Assert that the first FASTQ entry is non-empty and has the right format."""
+        if len(row[self._fastq_1_col]) <= 0:
+            raise AssertionError("At least the first FASTQ file is required.")
+        self._validate_fastq_format(row[self._fastq_1_col])
+
+    def _validate_second(self, row):
+        """Assert that the second FASTQ entry has the right format if it exists."""
+        if len(row[self._fastq_2_col]) > 0:
+            self._validate_fastq_format(row[self._fastq_2_col])
+
+    def _validate_pair(self, row):
+        """Assert that read pairs have the same file extension."""
+        if row[self._fastq_1_col] and row[self._fastq_2_col]:
+            first_col_suffix = Path(row[self._fastq_1_col]).suffixes[-2:]
+            second_col_suffix = Path(row[self._fastq_2_col]).suffixes[-2:]
+            if first_col_suffix != second_col_suffix:
+                raise AssertionError("FASTQ pairs must have the same file extensions.")
+
+    def _validate_fastq_format(self, filename):
+        """Assert that a given filename has one of the expected FASTQ extensions."""
+        if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
+            raise AssertionError(
+                f"The FASTQ file has an unrecognized extension: {filename}\n"
+                f"It should be one of: {', '.join(self.VALID_FORMATS)}"
+            )
+
+    def validate_unique_samples(self):
+        """
+        Assert that the combination of sample name and FASTQ filename is unique.
+
+        In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
+        number of times the same sample exists, but with different FASTQ files, e.g., multiple runs per experiment.
+
+        """
+        if len(self._seen) != len(self.modified):
+            raise AssertionError("The pair of sample name and FASTQ must be unique.")
+        seen = Counter()
+        for row in self.modified:
+            sample = row[self._sample_col]
+            seen[sample] += 1
+            row[self._sample_col] = f"{sample}_T{seen[sample]}"
+
+
+def read_head(handle, num_lines=10):
+    """Read the specified number of lines from the current position in the file."""
+    lines = []
+    for idx, line in enumerate(handle):
+        if idx == num_lines:
+            break
+        lines.append(line)
+    return "".join(lines)
+
+
+def sniff_format(handle):
     """
-    This function checks that the samplesheet follows the structure specified in the provided CSV.
+    Detect the tabular format.
+
+    Args:
+        handle (text file): A handle to a `text file`_ object. The read position is
+            expected to be at the beginning (index 0).
+
+    Returns:
+        csv.Dialect: The detected tabular format.
+
+    .. _text file:
+        https://docs.python.org/3/glossary.html#term-text-file
+
     """
+    peek = read_head(handle)
+    handle.seek(0)
+    sniffer = csv.Sniffer()
+    dialect = sniffer.sniff(peek)
+    return dialect
-
-    FQ_EXTENSIONS = (".fq.gz", ".fastq.gz")
-
-    sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
-        ## Check header
-        MIN_COLS = 7
-        HEADER = [
-            "sample",
-            "run_accession",
-            "instrument_platform",
-            "reads_type",
-            "fastq_1",
-            "fastq_2",
-            "fasta",
-            "kraken2_report",
-            "kraken2_classifiedout",
-            "centrifuge_out",
-            "centrifuge_result",
-            "diamond",
-        ]
-        header = [x.strip('"') for x in fin.readline().strip().split(",")]
-
-        ## Check for missing mandatory columns
-        missing_columns = list(set(HEADER) - set(header))
-        if len(missing_columns) > 0:
-            print(
-                "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty.".format(
-                    ",".join(missing_columns)
-                )
-            )
-            sys.exit(1)
-
-        ## Find locations of mandatory columns
-        header_locs = {}
-        for i in HEADER:
-            header_locs[i] = header.index(i)
-
-        ## Check sample entries
-        for line in fin:
-            ## Pull out only relevant columns for downstream checking
-            line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]
-
-            # Check valid number of columns per row
-            if len(line_parsed) < MIN_COLS:
-                print_error(
-                    f"Invalid number of columns (minimum = {MIN_COLS})!",
-                    "Line",
-                    line,
-                )
-
-            lspl = [line_parsed[i] for i in header_locs.values()]
-
-            ## Check sample name entries
-
-            (
-                sample,
-                run_accession,
-                instrument_platform,
-                reads_type,
-                fastq_1,
-                fastq_2,
-                fasta,
-                kraken2_report,
-                kraken2_classifiedout,
-                centrifuge_out,
-                centrifuge_result,
-                diamond,
-            ) = lspl
-
-            # Additional checks specific to your CSV format can be added here
-
-            ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, ... ] }
-            if sample not in sample_mapping_dict:
-                sample_mapping_dict[sample] = [
-                    run_accession,
-                    instrument_platform,
-                    reads_type,
-                    fastq_1,
-                    fastq_2,
-                    fasta,
-                    kraken2_report,
-                    kraken2_classifiedout,
-                    centrifuge_out,
-                    centrifuge_result,
-                    diamond,
-                ]
-            else:
-                print_error("Samplesheet contains duplicate rows!", "Line", line)
-
-            # Check instrument_platform
-            # (You can add more checks here based on your specific requirements)
-
-    ## Write validated samplesheet with appropriate columns
-    HEADER_OUT = [
 
+def check_samplesheet(file_in, file_out):
+    """
+    Check that the tabular samplesheet has the structure expected by metaval.
+
+    Validate the general shape of the table, the expected columns, and each row.
+
+    Args:
+        file_in (pathlib.Path): The given tabular samplesheet. The format can be either
+            CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
+        file_out (pathlib.Path): Where the validated and transformed samplesheet should
+            be created; always in CSV format.
+
+    """
+    required_columns = [
         "sample",
         "run_accession",
         "instrument_platform",
@@ -148,21 +209,71 @@ def check_samplesheet(file_in, file_out):
         "centrifuge_result",
         "diamond",
     ]
-    if len(sample_mapping_dict) > 0:
-        out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write("\t".join(HEADER_OUT) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-                fout.write(f"{sample}\t{','.join(sample_mapping_dict[sample])}\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+    # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
+    with file_in.open(newline="") as in_handle:
+        reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
+        # Validate the existence and order of the expected header columns.
+        if required_columns != reader.fieldnames:
+            req_cols = ", ".join(required_columns)
+            logger.critical(f"The sample sheet **must** contain exactly these column headers, in this order: {req_cols}.")
+            sys.exit(1)
+        # Validate each row.
+        checker = RowChecker()
+        for i, row in enumerate(reader):
+            try:
+                checker.validate_and_transform(row)
+            except AssertionError as error:
+                logger.critical(f"{str(error)} On line {i + 2}.")
+                sys.exit(1)
+        checker.validate_unique_samples()
+        header = list(reader.fieldnames)
+        # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
+        with file_out.open(mode="w", newline="") as out_handle:
+            writer = csv.DictWriter(out_handle, header, delimiter=",")
+            writer.writeheader()
+            for row in checker.modified:
+                writer.writerow(row)
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Validate and transform a tabular samplesheet.",
+        epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
+    )
+    parser.add_argument(
+        "file_in",
+        metavar="FILE_IN",
+        type=Path,
+        help="Tabular input samplesheet in CSV or TSV format.",
+    )
+    parser.add_argument(
+        "file_out",
+        metavar="FILE_OUT",
+        type=Path,
+        help="Transformed output samplesheet in CSV format.",
+    )
+    parser.add_argument(
+        "-l",
+        "--log-level",
+        help="The desired log level (default WARNING).",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
+        default="WARNING",
+    )
+    return parser.parse_args(argv)
 
-def main(args=None):
-    args = parse_args(args)
-    check_samplesheet(args.FILE_IN, args.FILE_OUT)
+
+def main(argv=None):
+    """Coordinate argument parsing and program execution."""
+    args = parse_args(argv)
+    logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
+    if not args.file_in.is_file():
+        logger.error(f"The given input file {args.file_in} was not found!")
+        sys.exit(2)
+    args.file_out.parent.mkdir(parents=True, exist_ok=True)
+    check_samplesheet(args.file_in, args.file_out)
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
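
A quick usage sketch of the validator as it stands after the second commit (this note is not part of the patch itself). The column set, column order, and the ".fq.gz"/".fastq.gz" extension rule come from the final check_samplesheet.py above; the file names and row values below are hypothetical:

    # Build a minimal samplesheet and validate it (hypothetical file names).
    from pathlib import Path

    header = (
        "sample,run_accession,instrument_platform,reads_type,fastq_1,fastq_2,fasta,"
        "kraken2_report,kraken2_classifiedout,centrifuge_out,centrifuge_result,diamond"
    )
    row = (
        "SAMPLE1,RUN1,ILLUMINA,shortread,s1_R1.fastq.gz,s1_R2.fastq.gz,,"
        "s1.kraken2.report.txt,s1.kraken2.kraken2.classifiedreads.txt,"
        "s1.centrifuge.out.txt,s1.centrifuge.results.txt,s1.diamond.csv"
    )
    Path("samplesheet.csv").write_text(header + "\n" + row + "\n")

    # Then, from the shell:
    #   python check_samplesheet.py samplesheet.csv samplesheet.valid.csv
    #
    # csv.Sniffer should detect the comma dialect; the header must match
    # required_columns exactly, including order, or the script exits with
    # an error. validate_unique_samples() renames SAMPLE1 to SAMPLE1_T1
    # (run suffixing), and the validated sheet is written as CSV.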