def convert_plain_prefix_map_to_extended(prefix_map: Optional[dict]) -> list:
    """Convert a plain prefix map into extended prefix map records.

    A plain prefix map has one entry per CURIE prefix, so several prefixes
    may point at the same URI prefix. The extended format has one record per
    URI prefix: the first prefix seen for a URI prefix becomes the record's
    ``prefix`` and every later prefix sharing that URI prefix is appended to
    its ``prefix_synonyms``. Records are returned in the order in which their
    URI prefix first appears in ``prefix_map``. The input is not modified.

    :param prefix_map: Mapping of CURIE prefix to URI prefix. ``None`` is
        treated like an empty mapping.
    :return: A list of dictionaries, each with the keys ``prefix``,
        ``prefix_synonyms``, ``uri_prefix`` and ``uri_prefix_synonyms``
        (the last one is always an empty list).
    """
    # A YAML key that is present but has no value loads as None, so the
    # caller's ``metadata.pop(CURIE_MAP, {})`` default does not protect
    # against it; treat that case as "no prefixes" instead of crashing.
    if not prefix_map:
        return []

    # Keyed by URI prefix so that duplicates collapse into a single record;
    # dict insertion order keeps the output deterministic.
    records_by_uri_prefix: dict = {}
    for prefix, uri_prefix in prefix_map.items():
        record = records_by_uri_prefix.get(uri_prefix)
        if record is None:
            records_by_uri_prefix[uri_prefix] = {
                "prefix": prefix,
                "prefix_synonyms": [],
                "uri_prefix": uri_prefix,
                "uri_prefix_synonyms": [],
            }
        else:
            record["prefix_synonyms"].append(prefix)
    return list(records_by_uri_prefix.values())
@@ -246,7 +246,11 @@ def clean_prefix_map(self, strict: bool = True) -> None: if self.metadata: prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata)) - missing_prefixes = prefixes_in_table - self.converter.get_prefixes() + prefixes = {record.prefix for record in self.converter.records} + aliases = {p for p in itt.chain(*[record.prefix_synonyms for record in self.converter.records]) if p} + converter_prefixes = prefixes | aliases + missing_prefixes = prefixes_in_table - converter_prefixes + if missing_prefixes and strict: raise ValueError( f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map" @@ -254,7 +258,7 @@ def clean_prefix_map(self, strict: bool = True) -> None: subconverter = self.converter.get_subconverter(prefixes_in_table) for prefix in missing_prefixes: - subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/") + subconverter.add_prefix(prefix, f"{UNKNOWN_IRI}{prefix.lower()}/", merge=merge) self.converter = subconverter