Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update gnpsformat detection function to support gnps2 #293

Open
wants to merge 5 commits into
base: update_GNPSFormat_for_gnps2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
"networkx",
"pandas",
"pyteomics",
"pyyaml",
"rich",
"scipy",
"sortedcontainers",
Expand Down
55 changes: 48 additions & 7 deletions src/nplinker/metabolomics/gnps/gnps_format.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations
import re
import tarfile
import zipfile
from enum import Enum
from enum import unique
from os import PathLike
from pathlib import Path
import httpx
import yaml
from bs4 import BeautifulSoup


Expand Down Expand Up @@ -72,17 +74,22 @@ def gnps_format_from_gnps1_task_id(task_id: str) -> GNPSFormat:
return GNPSFormat.Unknown


def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format from GNPS zip archive.
def gnps_format_from_archive(file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format or workflow from GNPS archive file.

The detection is based on the filename of the zip file and the names of the
files contained in the zip file.
GNPS archive files can be in two formats: GNPS1 (.zip) and GNPS2 (.tar).

For GNPS1 data, the detection of workflow format is based on the filename of the zip archive and
the names of the files contained in the zip archive.

For GNPS2 data, the workflow format is taken from the `submission_parameters.yaml` file in the
tar archive, which has a key `workflowname`.

Args:
zip_file: Path to the GNPS zip file.
file: Path to the GNPS archive file.

Returns:
The format identified in the GNPS zip file.
The format identified in the GNPS archive file.

Examples:
>>> gnps_format_from_archive("ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip")
Expand All @@ -91,8 +98,22 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
<GNPSFormat.SNETSV2: 'METABOLOMICS-SNETS-V2'>
>>> gnps_format_from_archive("ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip")
<GNPSFormat.FBMN: 'FEATURE-BASED-MOLECULAR-NETWORKING'>
>>> gnps_format_from_archive("206a7b40b7ed41c1ae6b4fbd2def3636.tar")
<GNPSFormat.GNPS2CN: 'classical_networking_workflow'>
>>> gnps_format_from_archive("2014f321d72542afb5216c932e0d5079.tar")
<GNPSFormat.GNPS2FBMN: 'feature_based_molecular_networking_workflow'>
"""
file = Path(zip_file)
file = Path(file)
suffix = file.suffix
if suffix == ".zip":
return _gnps_format_from_archive_gnps1(file)
if suffix == ".tar":
return _gnps_format_from_archive_gnps2(file)
return GNPSFormat.Unknown


def _gnps_format_from_archive_gnps1(file: PathLike) -> GNPSFormat:
"""Detect GNPS format from GNPS1 archive file."""
# Guess the format from the filename of the zip file
if GNPSFormat.FBMN.value in file.name:
return GNPSFormat.FBMN
Expand All @@ -116,6 +137,26 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
return GNPSFormat.Unknown


def _gnps_format_from_archive_gnps2(file: str | PathLike) -> GNPSFormat:
    """Detect GNPS format from GNPS2 archive file.

    The workflow is read from the `workflowname` key of the `submission_parameters.yaml`
    member of the tar archive.

    Args:
        file: Path to the GNPS2 tar archive.

    Returns:
        `GNPSFormat.GNPS2FBMN` or `GNPSFormat.GNPS2CN` when `workflowname` equals the value of
        that enum member. `GNPSFormat.Unknown` when the member is missing, cannot be extracted
        as a file, cannot be parsed as YAML, does not contain a mapping, or names any other
        workflow.
    """
    with tarfile.open(file, "r") as tar:
        try:
            submission_file = tar.extractfile("submission_parameters.yaml")
            if submission_file is None:
                return GNPSFormat.Unknown
            # Parse while the archive is still open: the member is read lazily from the tar
            submission_params = yaml.safe_load(submission_file)
        except (KeyError, yaml.YAMLError):
            # KeyError: the member is not in the archive; YAMLError: the content is not valid YAML
            return GNPSFormat.Unknown

    # An empty document loads as None, and a top-level list or scalar has no `.get`;
    # neither can carry a `workflowname`, so treat them as an unknown format.
    if not isinstance(submission_params, dict):
        return GNPSFormat.Unknown

    workflow = submission_params.get("workflowname")

    if workflow == GNPSFormat.GNPS2FBMN.value:
        return GNPSFormat.GNPS2FBMN
    if workflow == GNPSFormat.GNPS2CN.value:
        return GNPSFormat.GNPS2CN
    return GNPSFormat.Unknown


def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format from the given file mapping file.

Expand Down
Binary file not shown.
Binary file not shown.
Binary file added tests/unit/data/gnps/gnps2_nnknown.tar
Binary file not shown.
28 changes: 28 additions & 0 deletions tests/unit/metabolomics/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from .. import GNPS_DATA_DIR


#
# Fixtures for GNPS1
#


@pytest.fixture(scope="session")
def gnps_website_is_down():
"""Check if the GNPS website is down."""
Expand Down Expand Up @@ -133,3 +138,26 @@ def gnps_annotations_files(tmp_gnps_dir) -> dict[GNPSFormat, PathLike]:
/ "DB_result"
/ "7dc5b46b50d94246a1de12ef485d0f75.tsv",
}


#
# Fixtures for GNPS2
#


@pytest.fixture(scope="session")
def gnps2_tar_files() -> dict[GNPSFormat, PathLike]:
    """Map GNPS2 formats to the paths of their tar archives in the test data directory.

    The dict keys are members of the GNPSFormat enum.
    The dict values are the paths to the tar archives, located in `GNPS_DATA_DIR`.

    You can download the archives ("Download all results") from the following links:
        - https://gnps2.org/status?task=2014f321d72542afb5216c932e0d5079
        - https://gnps2.org/status?task=206a7b40b7ed41c1ae6b4fbd2def3636
    """
    # File name of the archive used for each format, relative to the GNPS test data directory
    archive_names = {
        GNPSFormat.GNPS2CN: "206a7b40b7ed41c1ae6b4fbd2def3636.tar",
        GNPSFormat.GNPS2FBMN: "2014f321d72542afb5216c932e0d5079.tar",
        GNPSFormat.Unknown: "gnps2_nnknown.tar",
    }
    return {gnps_format: GNPS_DATA_DIR / name for gnps_format, name in archive_names.items()}
14 changes: 13 additions & 1 deletion tests/unit/metabolomics/test_gnps_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from nplinker.metabolomics.gnps import gnps_format_from_gnps1_task_id


#
# Test GNPS1 formats
#
@pytest.mark.parametrize(
"task_id, expected",
[
Expand All @@ -24,7 +27,7 @@ def test_gnps_format_from_gnps1_task_id(task_id: str, expected: GNPSFormat, gnps
@pytest.mark.parametrize(
"workflow", [GNPSFormat.FBMN, GNPSFormat.SNETS, GNPSFormat.SNETSV2, GNPSFormat.Unknown]
)
def test_gnps_format_from_archive(workflow: str, gnps_zip_files):
def test_gnps_format_from_archive_gnps1(workflow: str, gnps_zip_files):
actual = gnps_format_from_archive(gnps_zip_files[workflow])
assert actual is workflow

Expand All @@ -33,3 +36,12 @@ def test_gnps_format_from_archive(workflow: str, gnps_zip_files):
def test_gnps_format_from_file_mapping(workflow: str, gnps_file_mappings_files):
actual = gnps_format_from_file_mapping(gnps_file_mappings_files[workflow])
assert actual is workflow


#
# Test GNPS2 formats
#
@pytest.mark.parametrize("workflow", [GNPSFormat.GNPS2CN, GNPSFormat.GNPS2FBMN, GNPSFormat.Unknown])
def test_gnps_format_from_archive_gnps2(workflow: GNPSFormat, gnps2_tar_files):
    """Check that each GNPS2 tar archive is detected as the format it is keyed by."""
    # `workflow` is a GNPSFormat member (see the parametrize list), not a string, which is
    # why the result is compared by identity rather than by equality.
    actual = gnps_format_from_archive(gnps2_tar_files[workflow])
    assert actual is workflow
Loading