Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend parse_sssom_table to report wrong prefixes and metadata #565

Merged
merged 5 commits into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,97 @@ def _get_seperator_symbol_from_file_path(file):
return None


def _is_check_valid_extension_slot(slot_name):
logging.warning(
f"'{slot_name}' could be a valid extension slot "
f"(https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots), "
f"but the validator does not check that yet."
)
return False


def _check_irregular_metadata(sssom_metadata, meta):
fail_metadata = False
for m in [sssom_metadata, meta]:
for key in m:
if (key not in _get_sssom_schema_object().mapping_set_slots) and (
not _is_check_valid_extension_slot(key)
):
logging.warning(
f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field."
)
fail_metadata = True
return fail_metadata


def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):

# There are three ways in which prefixes can be communicated, so we will check all of them
# This is a bit overly draconian, as in the end, only the highest priority one gets picked
# But since this only constitues a (logging) warning, I think its worth reporting
builtin_converter = _get_built_in_prefix_map()
sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
meta_converter = _get_converter_pop_replace_curie_map(meta)
prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
is_valid_prefixes = True

for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
if builtin_prefix in converter.bimap:
if builtin_uri != converter.bimap[builtin_prefix]:
logging.warning(
f"A built-in prefix ({builtin_prefix}) was provided, "
f"but the provided URI expansion ({converter.bimap[builtin_prefix]}) does not correspond "
f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
)
is_valid_prefixes = False
reverse_bimap = {value: key for key, value in builtin_converter.bimap.items()}
if builtin_uri in reverse_bimap:
if builtin_prefix != reverse_bimap[builtin_uri]:
logging.warning(
f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
f"standard prefix: {builtin_prefix}. The prefix will be ignored."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’ll just note that this case (standard prefix associated to a different prefix name than the one listed in the spec, e.g. som: https://w3id.org/sssom/) is not forbidden by the spec. The spec only forbids associated a builtin prefix name to a different prefix (e.g., sssom: https://not-the-sssom-namespace.example.org/).

The spec says:

By exception, prefix names listed in the table found in the IRI prefixes section are considered “built-in”. As such, they MAY be omitted from the curie_map. If they are not omitted, they MUST point to the same IRI prefixes as in the aforementioned table.

There is nothing here that forbids using another prefix name than a builtin one to refer to one of the known namespaces.

Not requesting any change here because SSSOM-Py is free to be stricter than the spec if it wants to.

)
is_valid_prefixes = False
return is_valid_prefixes


def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
report = ""
if not is_valid_built_in_prefixes:
report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.+\n"
if not is_valid_metadata:
report += (
"STRONG WARNING: The metadata provided contains non-standard and undefined metadata.+\n"
)

if report:
raise ValueError(report)


def _get_converter_pop_replace_curie_map(sssom_metadata):
"""
Pop CURIE_MAP from sssom_metadata, process it, and restore it if it existed.

Args:
sssom_metadata (dict): The metadata dictionary.

Returns:
Converter: A Converter object created from the CURIE_MAP.
"""
curie_map = sssom_metadata.pop(CURIE_MAP, {})

# Process the popped value
sssom_metadata_converter = Converter.from_prefix_map(curie_map)

# Reinsert CURIE_MAP if it was present
if curie_map:
sssom_metadata[CURIE_MAP] = curie_map

return sssom_metadata_converter


def parse_sssom_table(
file_path: Union[str, Path, TextIO],
prefix_map: ConverterHint = None,
Expand All @@ -197,6 +288,12 @@ def parse_sssom_table(
if meta is None:
meta = {}

is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
is_valid_metadata = _check_irregular_metadata(sssom_metadata, meta)

if kwargs.get("strict"):
_fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)

# The priority order for combining prefix maps are:
# 1. Built-in prefix map
# 2. Internal prefix map inside the document
Expand Down
15 changes: 15 additions & 0 deletions tests/data/basic_strict_fail.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# curie_map:
# HP: http://purl.obolibrary.org/obo/HP_
# MP: http://purl.obolibrary.org/obo/MP_
# owl: http://www.w3.org/2002/07/owl#
# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
# rdfs: http://www.w3.org/2000/01/rdf-schema_fail#
# semapv: https://w3id.org/semapv/vocab/
# skos: http://www.w3.org/2004/02/skos/core#
# sssom: https://w3id.org/sssom/
# license_fail: https://creativecommons.org/publicdomain/zero/1.0/
# mapping_provider: http://purl.obolibrary.org/obo/upheno.owl
# mapping_set_id: https://w3id.org/sssom/mappings/27f85fe9-8a72-4e76-909b-7ba4244d9ede
subject_id subject_label predicate_id object_id object_label mapping_fail_justification
HP:0000175 Cleft palate skos:exactMatch MP:0000111 cleft palate semapv:LexicalMatching
HP:0000252 Microcephaly skos:exactMatch MP:0000433 microcephaly semapv:LexicalMatching
14 changes: 14 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,3 +447,17 @@ def test_round_trip_rdf(self):
def test_round_trip_tsv(self):
"""Test writing then reading TSV."""
self._basic_round_trip("tsv")

def test_strict_parsing(self):
"""Test Strict parsing mode."""
input_path = f"{test_data_dir}/basic_strict_fail.tsv"
with open(input_path, "r") as file:
input_string = file.read()
stream = io.StringIO(input_string)

with self.assertRaises(ValueError):
parse_sssom_table(stream, strict=True)

# Make sure it parses in non-strict mode
msdf = parse_sssom_table(stream)
self.assertEqual(len(msdf.df), 2)
Loading