diff --git a/pyproject.toml b/pyproject.toml index 74d050a05..99e65ef5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "networkx", "pandas", "pyteomics", + "pyyaml", "rich", "scipy", "sortedcontainers", diff --git a/src/nplinker/metabolomics/gnps/gnps_format.py b/src/nplinker/metabolomics/gnps/gnps_format.py index 9e963c815..2492627be 100644 --- a/src/nplinker/metabolomics/gnps/gnps_format.py +++ b/src/nplinker/metabolomics/gnps/gnps_format.py @@ -1,11 +1,13 @@ from __future__ import annotations import re +import tarfile import zipfile from enum import Enum from enum import unique from os import PathLike from pathlib import Path import httpx +import yaml from bs4 import BeautifulSoup @@ -72,17 +74,22 @@ def gnps_format_from_gnps1_task_id(task_id: str) -> GNPSFormat: return GNPSFormat.Unknown -def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat: - """Detect GNPS format from GNPS zip archive. +def gnps_format_from_archive(file: str | PathLike) -> GNPSFormat: + """Detect GNPS format or workflow from GNPS archive file. - The detection is based on the filename of the zip file and the names of the - files contained in the zip file. + GNPS archive files can be in two formats: GNPS1 (.zip) and GNPS2 (.tar). + + For GNPS1 data, the detection of workflow format is based on the filename of the zip archive and + the names of the files contained in the zip archive. + + For GNPS2 data, the workflow format is taken from the `submission_parameters.yaml` file in the + tar archive, which has a key `workflowname`. Args: - zip_file: Path to the GNPS zip file. + file: Path to the GNPS archive file. Returns: - The format identified in the GNPS zip file. + The format identified in the GNPS archive file. 
Examples: >>> gnps_format_from_archive("ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip") @@ -91,8 +98,22 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat: >>> gnps_format_from_archive("ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip") + >>> gnps_format_from_archive("206a7b40b7ed41c1ae6b4fbd2def3636.tar") + >>> gnps_format_from_archive("2014f321d72542afb5216c932e0d5079.tar") + """ - file = Path(zip_file) + file = Path(file) + suffix = file.suffix + if suffix == ".zip": + return _gnps_format_from_archive_gnps1(file) + if suffix == ".tar": + return _gnps_format_from_archive_gnps2(file) + return GNPSFormat.Unknown + + +def _gnps_format_from_archive_gnps1(file: PathLike) -> GNPSFormat: + """Detect GNPS format from GNPS1 archive file.""" # Guess the format from the filename of the zip file if GNPSFormat.FBMN.value in file.name: return GNPSFormat.FBMN @@ -116,6 +137,47 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat: return GNPSFormat.Unknown
+def _gnps_format_from_archive_gnps2(file: PathLike) -> GNPSFormat:
+    """Detect GNPS format from GNPS2 archive file.
+
+    The workflow name is read from the `workflowname` key of the
+    `submission_parameters.yaml` member of the tar archive and compared with the values of
+    `GNPSFormat.GNPS2FBMN` and `GNPSFormat.GNPS2CN`.
+
+    Args:
+        file: Path to the GNPS2 tar archive.
+
+    Returns:
+        The matching GNPS2 format, or `GNPSFormat.Unknown` if the parameters file is missing,
+        is not a regular file, cannot be parsed as a YAML mapping, or names another workflow.
+
+    Raises:
+        tarfile.ReadError: If the file cannot be opened as a tar archive.
+    """
+    with tarfile.open(file, "r") as tar:
+        try:
+            submission_file = tar.extractfile("submission_parameters.yaml")
+            if submission_file is None:
+                return GNPSFormat.Unknown
+            submission_params = yaml.safe_load(submission_file)
+        except (KeyError, yaml.YAMLError):
+            # KeyError: no such member in the archive; YAMLError: member is not valid YAML
+            return GNPSFormat.Unknown
+
+    # An empty or non-mapping YAML document loads as None, a list or a scalar, none of which
+    # has `.get`; treat it as an undetectable format instead of raising AttributeError.
+    if not isinstance(submission_params, dict):
+        return GNPSFormat.Unknown
+
+    workflow = submission_params.get("workflowname")
+
+    if workflow == GNPSFormat.GNPS2FBMN.value:
+        return GNPSFormat.GNPS2FBMN
+    if workflow == GNPSFormat.GNPS2CN.value:
+        return GNPSFormat.GNPS2CN
+    return GNPSFormat.Unknown
+
+
 def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat: """Detect GNPS format from the given file mapping file. 
diff --git a/tests/unit/data/gnps/2014f321d72542afb5216c932e0d5079.tar b/tests/unit/data/gnps/2014f321d72542afb5216c932e0d5079.tar new file mode 100644 index 000000000..178a664c1 Binary files /dev/null and b/tests/unit/data/gnps/2014f321d72542afb5216c932e0d5079.tar differ diff --git a/tests/unit/data/gnps/206a7b40b7ed41c1ae6b4fbd2def3636.tar b/tests/unit/data/gnps/206a7b40b7ed41c1ae6b4fbd2def3636.tar new file mode 100644 index 000000000..9bec8710c Binary files /dev/null and b/tests/unit/data/gnps/206a7b40b7ed41c1ae6b4fbd2def3636.tar differ diff --git a/tests/unit/data/gnps/gnps2_nnknown.tar b/tests/unit/data/gnps/gnps2_nnknown.tar new file mode 100644 index 000000000..dd416ee8e Binary files /dev/null and b/tests/unit/data/gnps/gnps2_nnknown.tar differ diff --git a/tests/unit/metabolomics/conftest.py b/tests/unit/metabolomics/conftest.py index 646695261..81637457b 100644 --- a/tests/unit/metabolomics/conftest.py +++ b/tests/unit/metabolomics/conftest.py @@ -6,6 +6,11 @@ from .. import GNPS_DATA_DIR +# +# Fixtures for GNPS1 +# + + @pytest.fixture(scope="session") def gnps_website_is_down(): """Check if the GNPS website is down.""" @@ -133,3 +138,26 @@ def gnps_annotations_files(tmp_gnps_dir) -> dict[GNPSFormat, PathLike]: / "DB_result" / "7dc5b46b50d94246a1de12ef485d0f75.tsv", } + + +# +# Fixtures for GNPS2 +# + + +@pytest.fixture(scope="session") +def gnps2_tar_files() -> dict[GNPSFormat, PathLike]: + """Get the paths of the GNPS2 tar archives as a dict. + + The dict keys are members of the GNPSFormat enum (GNPS2CN, GNPS2FBMN and Unknown). + The dict values are the paths to the corresponding tar archives under GNPS_DATA_DIR. 
+ + You can download the archives ("Download all results") from the following links: + - https://gnps2.org/status?task=2014f321d72542afb5216c932e0d5079 + - https://gnps2.org/status?task=206a7b40b7ed41c1ae6b4fbd2def3636 + """ + return { + GNPSFormat.GNPS2CN: GNPS_DATA_DIR / "206a7b40b7ed41c1ae6b4fbd2def3636.tar", + GNPSFormat.GNPS2FBMN: GNPS_DATA_DIR / "2014f321d72542afb5216c932e0d5079.tar", + GNPSFormat.Unknown: GNPS_DATA_DIR / "gnps2_nnknown.tar", + } diff --git a/tests/unit/metabolomics/test_gnps_format.py b/tests/unit/metabolomics/test_gnps_format.py index 69ba5bc18..fc267f91f 100644 --- a/tests/unit/metabolomics/test_gnps_format.py +++ b/tests/unit/metabolomics/test_gnps_format.py @@ -5,6 +5,9 @@ from nplinker.metabolomics.gnps import gnps_format_from_gnps1_task_id +# +# Test GNPS1 formats +# @pytest.mark.parametrize( "task_id, expected", [ @@ -24,7 +27,7 @@ def test_gnps_format_from_gnps1_task_id(task_id: str, expected: GNPSFormat, gnps @pytest.mark.parametrize( "workflow", [GNPSFormat.FBMN, GNPSFormat.SNETS, GNPSFormat.SNETSV2, GNPSFormat.Unknown] ) -def test_gnps_format_from_archive(workflow: str, gnps_zip_files): +def test_gnps_format_from_archive_gnps1(workflow: str, gnps_zip_files): actual = gnps_format_from_archive(gnps_zip_files[workflow]) assert actual is workflow @@ -33,3 +36,12 @@ def test_gnps_format_from_archive(workflow: str, gnps_zip_files): def test_gnps_format_from_file_mapping(workflow: str, gnps_file_mappings_files): actual = gnps_format_from_file_mapping(gnps_file_mappings_files[workflow]) assert actual is workflow + + +# +# Test GNPS2 formats +# +@pytest.mark.parametrize("workflow", [GNPSFormat.GNPS2CN, GNPSFormat.GNPS2FBMN, GNPSFormat.Unknown]) +def test_gnps_format_from_archive_gnps2(workflow: str, gnps2_tar_files): + actual = gnps_format_from_archive(gnps2_tar_files[workflow]) + assert actual is workflow