Skip to content

Commit

Permalink
Re-implement reconciliation code with curies (#426)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored Sep 26, 2023
1 parent 2eba644 commit 11fde9d
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 244 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/qc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install poetry dependencies
run: python -m pip install -U pip setuptools

#----------------------------------------------
# install & configure poetry
#----------------------------------------------
Expand Down
355 changes: 180 additions & 175 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8"
click = ">=8.1.6"
curies = ">=0.6.2"
curies = ">=0.6.4"
linkml-runtime = ">=1.5.5"
importlib-metadata = ">=6.8.0"
pandas = ">=2.0.3"
Expand Down
79 changes: 24 additions & 55 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,65 +1104,34 @@ def reconcile_prefix_and_data(
:param msdf: Mapping Set DataFrame.
:param prefix_reconciliation: Prefix reconciliation dictionary from a YAML file
:return: Mapping Set DataFrame with reconciled prefix_map and data.
This method is built on :func:`curies.remap_curie_prefixes` and
:func:`curies.rewire`. Note that if you want to overwrite a CURIE prefix in the Bioregistry
extended prefix map, you need to provide a place for the old one to go as in
``{"geo": "ncbi.geo", "geogeo": "geo"}``.
Just doing ``{"geogeo": "geo"}`` would not work since `geo` already exists.
"""
# Discussion about this found here:
# https://github.com/mapping-commons/sssom-py/issues/216#issue-1171701052
converter = msdf.converter
converter = curies.remap_curie_prefixes(converter, prefix_reconciliation["prefix_synonyms"])
converter = curies.rewire(converter, prefix_reconciliation["prefix_expansion_reconciliation"])

# TODO make this standardization code directly part of msdf after
# switching to native converter
def _upgrade(curie_or_iri: str) -> str:
if not is_iri(curie_or_iri) and is_curie(curie_or_iri):
return converter.standardize_curie(curie_or_iri) or curie_or_iri
return curie_or_iri

for column, values in _get_sssom_schema_object().dict["slots"].items():
if values["range"] != "EntityReference":
continue
if column not in msdf.df.columns:
continue
msdf.df[column] = msdf.df[column].map(_upgrade)

prefix_map = msdf.prefix_map
df: pd.DataFrame = msdf.df
data_switch_dict = dict()

prefix_synonyms = prefix_reconciliation["prefix_synonyms"]
prefix_expansion = prefix_reconciliation["prefix_expansion_reconciliation"]

# The prefix exists but the expansion needs to be updated.
expansion_replace = {
k: v for k, v in prefix_expansion.items() if k in prefix_map.keys() and v != prefix_map[k]
}

# Updates expansions in prefix_map
prefix_map.update(expansion_replace)

# Prefixes that need to be replaced
# IF condition:
# 1. Key OR Value in prefix_synonyms are keys in prefix_map
# e.g.: ICD10: ICD10CM - either should be present within
# the prefix_map.
# AND
# 2. Value in prefix_synonyms is NOT a value in expansion_replace.
# In other words, the existing expansion do not match the YAML.

prefix_replace = [
k
for k, v in prefix_synonyms.items()
if (k in prefix_map.keys() or v in prefix_map.keys()) and v not in expansion_replace.keys()
]

if len(prefix_replace) > 0:
for pr in prefix_replace:
correct_prefix = prefix_synonyms[pr]
correct_expansion = prefix_expansion[correct_prefix]
prefix_map[correct_prefix] = correct_expansion
logging.info(f"Adding prefix_map {correct_prefix}: {correct_expansion}")
if pr in prefix_map.keys():
prefix_map.pop(pr, None)
data_switch_dict[pr] = correct_prefix

logging.warning(f"Replacing prefix {pr} with {correct_prefix}")

# Data editing
if len(data_switch_dict) > 0:
# Read schema file
slots = _get_sssom_schema_object().dict["slots"]
entity_reference_columns = [k for k, v in slots.items() if v["range"] == "EntityReference"]
update_columns = [c for c in df.columns if c in entity_reference_columns]
for k, v in data_switch_dict.items():
df[update_columns] = df[update_columns].replace(k + ":", v + ":", regex=True)

msdf.df = df
msdf.prefix_map = prefix_map

# TODO: When expansion of 2 prefixes in the prefix_map are the same.
msdf.prefix_map = dict(converter.bimap)
return msdf


Expand Down
2 changes: 0 additions & 2 deletions tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,3 @@

test_out_dir = cwd / "tmp"
test_out_dir.mkdir(parents=True, exist_ok=True)

prefix_recon_yaml = data_dir / "prefix_reconciliation.yaml"
59 changes: 48 additions & 11 deletions tests/test_collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import unittest

import yaml

from sssom.parsers import parse_sssom_table
from sssom.util import (
collapse,
Expand All @@ -14,7 +12,7 @@
parse,
reconcile_prefix_and_data,
)
from tests.constants import data_dir, prefix_recon_yaml
from tests.constants import data_dir


class TestCollapse(unittest.TestCase):
Expand Down Expand Up @@ -76,13 +74,52 @@ def test_reconcile_prefix(self):
"""Test curie reconciliation is performing as expected."""
msdf = parse_sssom_table(data_dir / "basic3.tsv")

with open(prefix_recon_yaml) as pref_rec:
prefix_reconciliation = yaml.safe_load(pref_rec)
self.assertEqual(
{
"a": "http://example.org/a/",
"b": "http://example.org/b/",
"c": "http://example.org/c/",
"d": "http://example.org/d/",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"owl": "http://www.w3.org/2002/07/owl#",
"orcid": "https://orcid.org/",
"semapv": "https://w3id.org/semapv/vocab/",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"skos": "http://www.w3.org/2004/02/skos/core#",
"sssom": "https://w3id.org/sssom/",
},
msdf.prefix_map,
)
prefix_reconciliation = {
"prefix_synonyms": {
"a": "c",
"c": "c2",
"b": "bravo",
"r": "rdfs", # does not do anything, since "r" is not already in the prefix map
"o": "owl", # does not do anything, since "o" is not already in the prefix map
},
"prefix_expansion_reconciliation": {
"c": "http://test.owl/c/",
"bravo": "http://test.owl/bravo",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#", # matches what's already there
"owl": "http://www.w3.org/2002/07/owl#", # matches what's already there
},
}

recon_msdf = reconcile_prefix_and_data(msdf, prefix_reconciliation)

prefix_expansion = prefix_reconciliation["prefix_expansion_reconciliation"]

for pfx, exp in prefix_expansion.items():
if pfx in recon_msdf.prefix_map.keys():
self.assertEqual(recon_msdf.prefix_map[pfx], exp)
self.assertEqual(
{
"bravo": "http://test.owl/bravo",
"c": "http://test.owl/c/",
"c2": "http://example.org/c/",
"d": "http://example.org/d/",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"owl": "http://www.w3.org/2002/07/owl#",
"orcid": "https://orcid.org/",
"semapv": "https://w3id.org/semapv/vocab/",
"skos": "http://www.w3.org/2004/02/skos/core#",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"sssom": "https://w3id.org/sssom/",
},
recon_msdf.prefix_map,
)

0 comments on commit 11fde9d

Please sign in to comment.