From 695ecaba8731c128d80b45fc93be4c5c92dd4fb2 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Sat, 13 Jan 2024 18:37:40 -0500
Subject: [PATCH] Extended prefix maps usage updates

- Bugfix: clean_prefix_map(): Was ignoring prefix aliases. This was
  causing an error, as there was a perceived mismatch between the
  prefixes of the mapping set, and the prefix_map.
- Bugfix: get_metadata_and_prefix_map(): Was not utilizing extended
  prefix maps. This manifested in an issue where prefix aliases were not
  incorporated. This meant that (a) if we tried to fix by removing the
  alias from the plain prefix_map, these CURIEs could not be resolved,
  (b) if we included the alias in the plain prefix_map, there would be a
  duplicate URI prefix, which would result in an error.
- Add: convert_plain_prefix_map_to_extended()
---
Note: the blob hashes on the "index" lines below are carried over from the
original revision of this patch and do not reflect the revised hunks.

 src/sssom/io.py   | 26 +++++++++++++++++++++++++-
 src/sssom/util.py | 13 ++++++++++---
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/sssom/io.py b/src/sssom/io.py
index 8f4c947e..d0807e41 100644
--- a/src/sssom/io.py
+++ b/src/sssom/io.py
@@ -130,6 +130,28 @@ def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
     write_tables(splitted, output_directory)
 
 
+def convert_plain_prefix_map_to_extended(prefix_map):
+    """Convert a plain prefix map to the extended prefix map format.
+
+    :param prefix_map: A dictionary from CURIE prefixes to URI prefixes.
+    :returns: A list of extended prefix map records, one per unique URI prefix. The first
+        prefix seen for a URI prefix is the preferred prefix; any later prefixes that share
+        the URI prefix are recorded as its synonyms.
+    """
+    by_uri_prefix = {}
+    for prefix, uri_prefix in prefix_map.items():
+        if uri_prefix in by_uri_prefix:
+            by_uri_prefix[uri_prefix]["prefix_synonyms"].append(prefix)
+            continue
+        by_uri_prefix[uri_prefix] = {
+            "prefix": prefix,
+            "prefix_synonyms": [],
+            "uri_prefix": uri_prefix,
+            "uri_prefix_synonyms": [],
+        }
+    return list(by_uri_prefix.values())
+
+
 def get_metadata_and_prefix_map(
     metadata_path: Union[None, str, Path] = None, *, prefix_map_mode: Optional[MergeMode] = None
 ) -> Tuple[Converter, MetadataType]:
@@ -147,7 +169,9 @@ def get_metadata_and_prefix_map(
         metadata = yaml.safe_load(file)
 
     metadata = dict(ChainMap(metadata, get_default_metadata()))
-    converter = Converter.from_prefix_map(metadata.pop(CURIE_MAP, {}))
+    prefix_map = metadata.pop(CURIE_MAP, {})
+    epm = convert_plain_prefix_map_to_extended(prefix_map)
+    converter = Converter.from_extended_prefix_map(epm)
     converter = _merge_converter(converter, prefix_map_mode=prefix_map_mode)
     return converter, metadata
 
diff --git a/src/sssom/util.py b/src/sssom/util.py
index 64db60ed..233f5b2c 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -234,7 +234,7 @@ def __str__(self) -> str:  # noqa:D105
         description += self.df.tail().to_string() + "\n"
         return description
 
-    def clean_prefix_map(self, strict: bool = True) -> None:
+    def clean_prefix_map(self, strict: bool = True, merge: bool = False) -> None:
         """
         Remove unused prefixes from the internal prefix map based on the internal dataframe.
 
@@ -246,7 +246,14 @@ def clean_prefix_map(self, strict: bool = True) -> None:
         if self.metadata:
             prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata))
 
-        missing_prefixes = prefixes_in_table - self.converter.get_prefixes()
+        records = self.converter.records
+        converter_prefixes = {record.prefix for record in records}
+        # Prefix synonyms (aliases) are valid in CURIEs, so they must not be reported as missing
+        converter_prefixes.update(
+            alias for record in records for alias in record.prefix_synonyms if alias
+        )
+        missing_prefixes = prefixes_in_table - converter_prefixes
+
         if missing_prefixes and strict:
             raise ValueError(
                 f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map"
@@ -254,7 +261,7 @@ def clean_prefix_map(self, strict: bool = True) -> None:
 
         subconverter = self.converter.get_subconverter(prefixes_in_table)
         for prefix in missing_prefixes:
-            subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/")
+            subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/", merge=merge)
         self.converter = subconverter
 
 