diff --git a/altamisa/isatab/parse_assay_study.py b/altamisa/isatab/parse_assay_study.py index de2239d..b75ec0b 100644 --- a/altamisa/isatab/parse_assay_study.py +++ b/altamisa/isatab/parse_assay_study.py @@ -817,7 +817,7 @@ def from_stream(cls, study_id: str, input_file: TextIO, filename: Optional[str] def __init__(self, study_id: str, input_file: TextIO, filename: Optional[str]): self.study_id = study_id self.input_file = input_file - self.filename = filename or getattr(input_file, "name", "") + self._filename = filename or getattr(input_file, "name", "") self.unique_rows = set() self.duplicate_rows = [] self._reader = csv.reader(input_file, delimiter="\t", quotechar='"') @@ -848,6 +848,9 @@ def _read_next_line(self): self.unique_rows.add("\t".join(self._line)) except StopIteration: self._line = None + except UnicodeDecodeError as e: # pragma: no cover + msg = f"Invalid encoding of study file '{self._filename}' (use Unicode/UTF-8)." + raise ParseIsatabException(msg) from e return prev_line def read(self): @@ -856,7 +859,7 @@ def read(self): :returns: Nodes per row of the study file """ - builder = _StudyRowBuilder(self.header, self.filename, self.study_id) + builder = _StudyRowBuilder(self.header, self._filename, self.study_id) while True: line = self._read_next_line() if line: @@ -939,7 +942,7 @@ def __init__(self, study_id: str, assay_id: str, input_file: TextIO, filename: O self.study_id = study_id self.assay_id = assay_id self.input_file = input_file - self.filename = filename or getattr(input_file, "name", "") + self._filename = filename or getattr(input_file, "name", "") self.unique_rows = set() self.duplicate_rows = [] self._reader = csv.reader(input_file, delimiter="\t", quotechar='"') @@ -970,6 +973,9 @@ def _read_next_line(self): self.unique_rows.add("\t".join(self._line)) except StopIteration: self._line = None + except UnicodeDecodeError as e: # pragma: no cover + msg = f"Invalid encoding of assay file '{self._filename}' (use Unicode/UTF-8)." + raise ParseIsatabException(msg) from e return prev_line def read(self): @@ -978,7 +984,7 @@ def read(self): :return: Nodes per row of the assay file """ - builder = _AssayRowBuilder(self.header, self.filename, self.study_id, self.assay_id) + builder = _AssayRowBuilder(self.header, self._filename, self.study_id, self.assay_id) while True: line = self._read_next_line() if line: diff --git a/altamisa/isatab/parse_investigation.py b/altamisa/isatab/parse_investigation.py index 6e9fb64..6c40cbd 100644 --- a/altamisa/isatab/parse_investigation.py +++ b/altamisa/isatab/parse_investigation.py @@ -128,6 +128,9 @@ def _read_next_line(self) -> Optional[List[str]]: self._line = list_strip(next(self._reader)) except StopIteration: self._line = None + except UnicodeDecodeError as e: # pragma: no cover + msg = f"Invalid encoding of investigation file '{self._filename}' (use Unicode/UTF-8)." + raise ParseIsatabException(msg) from e return prev_line def _next_line_startswith_comment(self): @@ -366,7 +369,7 @@ def _read_studies(self) -> Iterator[models.StudyInfo]: line = self._read_next_line() if not line or not line[0] == investigation_headers.STUDY: # pragma: no cover tpl = "Expected {} but got {}" - msg = tpl.format(investigation_headers.INVESTIGATION, line) + msg = tpl.format(investigation_headers.STUDY, line) raise ParseIsatabException(msg) # Read the other lines in this section. section, comment_keys = self._read_single_column_section( diff --git a/requirements/test.txt b/requirements/test.txt index 42f9503..4b19f51 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -18,3 +18,5 @@ flake8 >=3.5.0 isort pyright + +syrupy diff --git a/tests/__snapshots__/test_apps.ambr b/tests/__snapshots__/test_apps.ambr new file mode 100644 index 0000000..85ec357 --- /dev/null +++ b/tests/__snapshots__/test_apps.ambr @@ -0,0 +1,65 @@ +# serializer version: 1 +# name: test_isatab2isatab + list([ + ''' + Investigation with only one study contains metadata: + ID: i_minimal + Title: Minimal Investigation + Path: i_minimal.txt + Submission Date: + Public Release Date: None + Prefer recording metadata in the study section. + ''', + ''' + Assay without platform: + Path: a_minimal.txt + Measurement Type: exome sequencing assay + Technology Type: nucleotide sequencing + Technology Platform: + ''', + 'No reference headers available for section INVESTIGATION PUBLICATIONS. Applying default order.', + 'No reference headers available for section INVESTIGATION CONTACTS. Applying default order.', + 'No reference headers available for section STUDY DESIGN DESCRIPTORS. Applying default order.', + 'No reference headers available for section STUDY PUBLICATIONS. Applying default order.', + 'No reference headers available for section STUDY FACTORS. Applying default order.', + 'No reference headers available for section STUDY CONTACTS. Applying default order.', + ]) +# --- +# name: test_isatab2isatab_input_is_output + '' +# --- +# name: test_isatab_validate + list([ + 'Incomplete ontology source; found: , Incomplete 1, 1, Incomplete 1, ()', + 'Incomplete ontology source; found: Incomplete 2, , 2, Incomplete 2, ()', + 'Ontology source name including whitespace(s); found: Incomplete 2, , 2, Incomplete 2, ()', + 'Incomplete ontology source; found: Incomplete 3, Incomplete 3, , Incomplete 3, ()', + 'Ontology source name including whitespace(s); found: Incomplete 3, Incomplete 3, , Incomplete 3, ()', + 'Incomplete ontology source; found: Incomplete 4, Incomplete 4, 4, , ()', + 'Ontology source name including whitespace(s); found: Incomplete 4, Incomplete 4, 4, , ()', + ''' + Investigation with only one study contains metadata: + ID: i_warnings + Title: Investigation with Warnings + Path: i_warnings.txt + Submission Date: + Public Release Date: None + Prefer recording metadata in the study section. + ''', + 'Invalid mail address: invalid_mail', + 'Invalid phone/fax number: CALL-ME', + 'Invalid phone/fax number: FAX-ME', + 'Invalid pubmed_id string: not-pubmed', + 'Invalid doi string: not-a-doi', + ''' + Assay without platform: + Path: a_warnings.txt + Measurement Type: exome sequencing assay + Technology Type: nucleotide sequencing + Technology Platform: + ''', + 'Assay path used more than once: a_warnings.txt', + "Found samples in assay 'a_warnings.txt' but not in parent study 's_warnings.txt':\\n0815-N2", + "Found samples in assay 'a_warnings.txt' but not in parent study 's_warnings.txt':\\n0815-N2", + ]) +# --- diff --git a/tests/test_apps.py b/tests/test_apps.py index 638a726..aa8b234 100644 --- a/tests/test_apps.py +++ b/tests/test_apps.py @@ -4,6 +4,7 @@ import os.path import pytest +from syrupy.assertion import SnapshotAssertion from typer.testing import CliRunner from altamisa.apps import isatab2dot, isatab2isatab, isatab_validate @@ -12,7 +13,7 @@ runner = CliRunner() -def test_isatab_validate(): +def test_isatab_validate(snapshot: SnapshotAssertion): i_file = os.path.join(os.path.dirname(__file__), "data", "i_warnings", "i_warnings.txt") argv = ["--input-investigation-file", i_file, "--show-duplicate-warnings"] @@ -20,10 +21,10 @@ def test_isatab_validate(): result = runner.invoke(isatab_validate.app, argv) assert result.exit_code == 0 - assert 17 == len(record) + assert snapshot == [str(r.message) for r in record] -def test_isatab2isatab(tmpdir): +def test_isatab2isatab(tmpdir, snapshot: SnapshotAssertion): i_file = os.path.join(os.path.dirname(__file__), "data", "i_minimal", "i_minimal.txt") argv = [ "--input-investigation-file", @@ -38,10 +39,10 @@ def test_isatab2isatab(tmpdir): result = runner.invoke(isatab2isatab.app, argv) assert result.exit_code == 0 - assert 8 == len(record) + assert snapshot == [str(r.message) for r in record] -def test_isatab2isatab_input_is_output(tmpdir): +def test_isatab2isatab_input_is_output(tmpdir, snapshot: SnapshotAssertion): i_file = os.path.join(os.path.dirname(__file__), "data", "i_minimal", "i_minimal.txt") argv = [ "--input-investigation-file", @@ -54,7 +55,9 @@ def test_isatab2isatab_input_is_output(tmpdir): result = runner.invoke(isatab2isatab.app, argv) assert result.exit_code == 1 - assert "Can't output ISA-tab files to same directory as as input" in str(result) + assert snapshot == str(result).replace( + os.path.dirname(__file__), "/home/runner/work/altamisa/tests" + ) def test_isatab2dot(tmpdir):