Skip to content

Commit

Permalink
Extended prefix maps usage updates
Browse files Browse the repository at this point in the history
- Bugfix: clean_prefix_map(): Was ignoring prefix aliases. This was causing an error, as there was a perceived mismatch between the prefixes of the mapping set, and the prefix_map.
- Bugfix: get_metadata_and_prefix_map(): Was not utilizing extended prefix maps. This manifested in an issue where prefix aliases were not incorporated. This meant that (a) if we tried to fix by removing the alias from the plain prefix_map, these CURIEs could not be resolved, (b) if we included the alias in the plain prefix_map, there would be a duplicate URI prefix, which would result in an error.
- Add: convert_plain_prefix_map_to_extended()
  • Loading branch information
joeflack4 committed Jan 13, 2024
1 parent f40a1c8 commit 695ecab
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 4 deletions.
21 changes: 20 additions & 1 deletion src/sssom/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,23 @@ def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
write_tables(splitted, output_directory)


def convert_plain_prefix_map_to_extended(prefix_map: Optional[dict]) -> list:
    """Convert a plain prefix map into extended prefix map records.

    A plain prefix map is a ``{prefix: uri_prefix}`` dictionary. Prefixes that
    share the same URI prefix are collapsed into one record: the first prefix
    encountered becomes the record's ``prefix`` and every later one is appended
    to that record's ``prefix_synonyms``.

    :param prefix_map: Mapping from prefix to URI prefix. ``None`` or an empty
        mapping produces an empty list.
    :return: A list of record dictionaries, one per distinct URI prefix, in
        order of first appearance. Each record has the keys ``prefix``,
        ``prefix_synonyms``, ``uri_prefix`` and ``uri_prefix_synonyms``.
    """
    # A YAML key that is present but has no value loads as None; treat that
    # the same as an empty map instead of failing on ``None.items()``.
    if not prefix_map:
        return []

    # Keyed by URI prefix so that a repeated URI prefix lands on the record
    # that was created for its first occurrence.
    by_uri_prefix = {}
    for prefix, uri_prefix in prefix_map.items():
        record = by_uri_prefix.get(uri_prefix)
        if record is None:
            by_uri_prefix[uri_prefix] = {
                "prefix": prefix,
                "prefix_synonyms": [],
                "uri_prefix": uri_prefix,
                # A plain map only ever carries one URI prefix per record.
                "uri_prefix_synonyms": [],
            }
        else:
            record["prefix_synonyms"].append(prefix)

    return list(by_uri_prefix.values())


def get_metadata_and_prefix_map(
metadata_path: Union[None, str, Path] = None, *, prefix_map_mode: Optional[MergeMode] = None
) -> Tuple[Converter, MetadataType]:
Expand All @@ -147,7 +164,9 @@ def get_metadata_and_prefix_map(
metadata = yaml.safe_load(file)

metadata = dict(ChainMap(metadata, get_default_metadata()))
converter = Converter.from_prefix_map(metadata.pop(CURIE_MAP, {}))
prefix_map = metadata.pop(CURIE_MAP, {})
epm = convert_plain_prefix_map_to_extended(prefix_map)
converter = Converter.from_extended_prefix_map(epm)
converter = _merge_converter(converter, prefix_map_mode=prefix_map_mode)
return converter, metadata

Expand Down
10 changes: 7 additions & 3 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def __str__(self) -> str: # noqa:D105
description += self.df.tail().to_string() + "\n"
return description

def clean_prefix_map(self, strict: bool = True) -> None:
def clean_prefix_map(self, strict: bool = True, merge=False) -> None:
"""
Remove unused prefixes from the internal prefix map based on the internal dataframe.
Expand All @@ -246,15 +246,19 @@ def clean_prefix_map(self, strict: bool = True) -> None:
if self.metadata:
prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata))

missing_prefixes = prefixes_in_table - self.converter.get_prefixes()
prefixes = {record.prefix for record in self.converter.records}
aliases = {p for p in itt.chain(*[record.prefix_synonyms for record in self.converter.records]) if p}
converter_prefixes = prefixes | aliases
missing_prefixes = prefixes_in_table - converter_prefixes

if missing_prefixes and strict:
raise ValueError(
f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map"
)

subconverter = self.converter.get_subconverter(prefixes_in_table)
for prefix in missing_prefixes:
subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/")
subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/", merge=merge)

self.converter = subconverter

Expand Down

0 comments on commit 695ecab

Please sign in to comment.