diff --git a/poetry.lock b/poetry.lock index ab5c3602..f32a7321 100644 --- a/poetry.lock +++ b/poetry.lock @@ -238,13 +238,13 @@ files = [ [[package]] name = "curies" -version = "0.7.3" +version = "0.7.4" description = "Idiomatic conversion between URIs and compact URIs (CURIEs)." optional = false python-versions = ">=3.8" files = [ - {file = "curies-0.7.3-py3-none-any.whl", hash = "sha256:011d0695f45ae52e51a4d611235e444a1acae7ea3ba95f73ec8e039fbed08004"}, - {file = "curies-0.7.3.tar.gz", hash = "sha256:1cb13d04d63410ea7068a6c9e006e3d40e849003a34f397420b58c03ac594d5a"}, + {file = "curies-0.7.4-py3-none-any.whl", hash = "sha256:478f1818345988933d8bc6060f80a985401331f856ff8cf9bd98fa00d178ad39"}, + {file = "curies-0.7.4.tar.gz", hash = "sha256:d3aaf16644b26ac2605ff83c565ec7df0ba0b5f7425516047666e609ec5fb718"}, ] [package.dependencies] @@ -2454,4 +2454,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "972153808613ca75c4312c68d918a1994adde178b3d3ec258c6ac6a193765bc1" +content-hash = "640bd51021bffca9663cb0562c7e8d897bdd56735ee885c4a6720075964a7276" diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 84ecd18b..9f8a09c0 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -3,8 +3,8 @@ import pathlib import uuid from enum import Enum -from functools import lru_cache -from typing import Any, Dict, List, Literal +from functools import cached_property, lru_cache +from typing import Any, Dict, List, Literal, Set import pkg_resources import yaml @@ -213,48 +213,56 @@ class SSSOMSchemaView(object): Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323 """ - _view = None - _dict = None - def __new__(cls): """Create a instance of the SSSOM schema view if non-existent.""" if not hasattr(cls, "instance"): cls.instance = super(SSSOMSchemaView, cls).__new__(cls) - return cls.instance + return cls.instance - @property + @cached_property def view(self) -> SchemaView: """Return SchemaView object.""" - if self._view is None: - self._view = SchemaView(SCHEMA_YAML) - return self._view + return SchemaView(SCHEMA_YAML) - @property + @cached_property def dict(self) -> dict: """Return SchemaView as a dictionary.""" - if self._dict is None: - self._dict = schema_as_dict(self.view.schema) - return self._dict + return schema_as_dict(self.view.schema) - @property + @cached_property def mapping_slots(self) -> List[str]: """Return list of mapping slots.""" return self.view.get_class("mapping").slots - @property + @cached_property def mapping_set_slots(self) -> List[str]: """Return list of mapping set slots.""" return self.view.get_class("mapping set").slots - @property - def multivalued_slots(self) -> List[str]: - """Return list of multivalued slots.""" - return [c for c in self.view.all_slots() if self.view.get_slot(c).multivalued] - - @property - def entity_reference_slots(self) -> List[str]: - """Return list of entity reference slots.""" - return [c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE] + @cached_property + def multivalued_slots(self) -> Set[str]: + """Return set of multivalued slots.""" + return {c for c in self.view.all_slots() if self.view.get_slot(c).multivalued} + + @cached_property + def entity_reference_slots(self) -> Set[str]: + """Return set of entity reference slots.""" + return {c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE} + + @cached_property + def mapping_enum_keys(self) -> Set[str]: + """Return a set of mapping enum keys.""" + return set(_get_sssom_schema_object().dict["enums"].keys()) + + @cached_property + def slots(self) -> Dict[str, str]: + """Return the slots for SSSOMSchemaView object.""" + return self.dict["slots"] + + @cached_property + def double_slots(self) -> Set[str]: + """Return the slot names for SSSOMSchemaView object.""" + return {k for k, v in self.dict["slots"].items() if v["range"] == "double"} @lru_cache(1) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index cda463ce..87ad603d 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -313,7 +313,7 @@ def _get_prefix_map_and_metadata( def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]: - if is_multivalued_slot(k) and v is not None and isinstance(v, str): + if isinstance(v, str) and is_multivalued_slot(k): # IF k is multivalued, then v = List[values] return [s.strip() for s in v.split("|")] else: @@ -329,20 +329,28 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet: return mapping_set +MAPPING_SLOTS = set(_get_sssom_schema_object().mapping_slots) + + def _get_mapping_dict(row: pd.Series, bad_attrs: Counter) -> Dict[str, Any]: - mdict = {} - sssom_schema_object = _get_sssom_schema_object() - for k, v in row.items(): - if not v or pd.isna(v): - continue - k = cast(str, k) - if k in sssom_schema_object.mapping_slots: - mdict[k] = _address_multivalued_slot(k, v) - else: - # There's the possibility that the key is in - # sssom_schema_object.mapping_set_slots, but - # this is skipped for now - bad_attrs[k] += 1 + """Generate a mapping dictionary from a given row of data. + + It also updates the 'bad_attrs' counter for keys that are not present + in the sssom_schema_object's mapping_slots. + """ + # Populate the mapping dictionary with key-value pairs from the row, + # only if the value exists, is not NaN, and the key is in the schema's mapping slots. + # The value could be a string or a list and is handled accordingly via _address_multivalued_slot(). + mdict = { + k: _address_multivalued_slot(k, v) + for k, v in row.items() + if v and pd.notna(v) and k in MAPPING_SLOTS + } + + # Update bad_attrs for keys not in mapping_slots + bad_keys = set(row.keys()) - MAPPING_SLOTS + for bad_key in bad_keys: + bad_attrs[bad_key] += 1 return mdict @@ -795,9 +803,14 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument: def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet: mapping_set = _init_mapping_set(meta) bad_attrs: Counter = Counter() - for _, row in df.iterrows(): - mapping_dict = _get_mapping_dict(row, bad_attrs) - _add_valid_mapping_to_list(mapping_dict, mapping_set.mappings) + + df.apply( + lambda row: _add_valid_mapping_to_list( + _get_mapping_dict(row, bad_attrs), mapping_set.mappings + ), + axis=1, + ) + for k, v in bad_attrs.items(): logging.warning(f"No attr for {k} [{v} instances]") return mapping_set diff --git a/src/sssom/util.py b/src/sssom/util.py index cb7762dc..64db60ed 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -156,14 +156,13 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr df.replace("", np.nan, inplace=True) df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s. + slots = _get_sssom_schema_object().dict["slots"] slots_with_double_as_range = { - slot - for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items() - if slot_metadata["range"] == "double" + slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double" } non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)] - non_double_cols = non_double_cols.replace(np.nan, "") - df[non_double_cols.columns] = non_double_cols + non_double_cols.replace(np.nan, "", inplace=True) + df.update(non_double_cols) df = sort_df_rows_columns(df) return cls.with_converter(df=df, converter=doc.converter, metadata=meta) @@ -1044,46 +1043,35 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> :return: Dictionary """ map_dict = {} - slots_with_double_as_range = [ - s - for s in _get_sssom_schema_object().dict["slots"].keys() - if _get_sssom_schema_object().dict["slots"][s]["range"] == "double" - ] + sssom_schema_object = _get_sssom_schema_object() for property in map_obj: - if map_obj[property] is not None: - if isinstance(map_obj[property], list): - # IF object is an enum - if ( - _get_sssom_schema_object().dict["slots"][property]["range"] - in _get_sssom_schema_object().dict["enums"].keys() - ): - # IF object is a multivalued enum - if _get_sssom_schema_object().dict["slots"][property]["multivalued"]: - map_dict[property] = "|".join( - enum_value.code.text for enum_value in map_obj[property] - ) - # If object is NOT multivalued BUT an enum. - else: - map_dict[property] = map_obj[property].code.text - # IF object is NOT an enum but a list - else: - map_dict[property] = "|".join(enum_value for enum_value in map_obj[property]) - # IF object NOT a list + mapping_property = map_obj[property] + if mapping_property is None: + map_dict[property] = np.nan if property in sssom_schema_object.double_slots else "" + continue + + slot_of_interest = sssom_schema_object.slots[property] + is_enum = slot_of_interest["range"] in sssom_schema_object.mapping_enum_keys # type:ignore + + # Check if the mapping_property is a list + if isinstance(mapping_property, list): + # If the property is an enumeration and it allows multiple values + if is_enum and slot_of_interest["multivalued"]: # type:ignore + # Join all the enum values into a string separated by '|' + map_dict[property] = "|".join( + enum_value.code.text for enum_value in mapping_property + ) else: - # IF object is an enum - if ( - _get_sssom_schema_object().dict["slots"][property]["range"] - in _get_sssom_schema_object().dict["enums"].keys() - ): - map_dict[property] = map_obj[property].code.text - else: - map_dict[property] = map_obj[property] + # If the property is not an enumeration or doesn't allow multiple values, + # join all the values into a string separated by '|' + map_dict[property] = "|".join(enum_value for enum_value in mapping_property) + elif is_enum: + # Assign the text of the enumeration code to the property in the dictionary + map_dict[property] = mapping_property.code.text else: - # IF map_obj[property] is None: - if property in slots_with_double_as_range: - map_dict[property] = np.nan - else: - map_dict[property] = "" + # If the mapping_property is neither a list nor an enumeration, + # assign the value directly to the property in the dictionary + map_dict[property] = mapping_property return map_dict @@ -1139,18 +1127,21 @@ def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[st prefixes = set(SSSOM_BUILT_IN_PREFIXES) if df.empty: return prefixes - for col in _get_sssom_schema_object().entity_reference_slots: - if col not in df.columns: - continue - prefixes.update( - converter.parse_curie(row).prefix - for row in df[col] - # we don't use the converter here since get_prefixes_used_in_table - # is often used to identify prefixes that are not properly registered - # in the converter - if not _is_iri(row) and _is_curie(row) - ) - return set(prefixes) + sssom_schema_object = _get_sssom_schema_object() + entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns) + new_prefixes = { + converter.parse_curie(row).prefix + for col in entity_reference_slots + for row in df[col] + if not _is_iri(row) and _is_curie(row) + # we don't use the converter here since get_prefixes_used_in_table + # is often used to identify prefixes that are not properly registered + # in the converter + } + + prefixes.update(new_prefixes) + + return prefixes def get_prefixes_used_in_metadata(meta: MetadataType) -> Set[str]: diff --git a/tests/test_utils.py b/tests/test_utils.py index 4e414d0a..fb943115 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,11 +2,21 @@ import unittest +import numpy as np import pandas as pd import yaml from curies import Converter, Record - -from sssom.constants import OBJECT_ID, SUBJECT_ID +from sssom_schema import Mapping as SSSOM_Mapping + +from sssom.constants import ( + CREATOR_ID, + OBJECT_ID, + OBJECT_LABEL, + PREDICATE_ID, + SEMAPV, + SUBJECT_ID, + SUBJECT_LABEL, +) from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter from sssom.io import extract_iris from sssom.parsers import parse_sssom_table @@ -14,6 +24,7 @@ MappingSetDataFrame, filter_out_prefixes, filter_prefixes, + get_dict_from_mapping, get_prefixes_used_in_table, inject_metadata_into_df, invert_mappings, @@ -30,6 +41,7 @@ def setUp(self) -> None: self.msdf = parse_sssom_table(f"{data_dir}/basic.tsv") self.msdf2 = parse_sssom_table(f"{data_dir}/basic7.tsv") self.features = [SUBJECT_ID, OBJECT_ID] + self.mapping_justification = SEMAPV.ManualMappingCuration.value def test_broken_predicate_list(self): """Test merging of multiple msdfs.""" @@ -314,18 +326,18 @@ def test_msdf_from_mappings(self): "skos:exactMatch", "UMLS:C1863204", "ADULT SYNDROME", - "semapv:ManualMappingCuration", + SEMAPV.ManualMappingCuration.value, "orcid:0000-0003-4423-4370", ) ] columns = [ - "subject_id", - "subject_label", - "predicate_id", - "object_id", - "object_label", - "mapping_justification", - "creator_id", + SUBJECT_ID, + SUBJECT_LABEL, + PREDICATE_ID, + OBJECT_ID, + OBJECT_LABEL, + SEMAPV.ManualMappingCuration.value, + CREATOR_ID, ] df = pd.DataFrame(rows, columns=columns) msdf = MappingSetDataFrame(df=df, converter=ensure_converter()) @@ -345,3 +357,76 @@ def test_msdf_from_mappings(self): self.assertEqual(1, len(new_msdf.df.index)) self.assertEqual(rows[0], tuple(msdf.df.iloc[0])) self.assertEqual(new_msdf.metadata, msdf.metadata) + + def test_get_dict_from_mapping(self): + """Test getting dict from a SSSOM mapping object or a dictionary.""" + mapping_obj = SSSOM_Mapping( + subject_id="DOID:0050601", + predicate_id="skos:exactMatch", + object_id="UMLS:C1863204", + mapping_justification=SEMAPV.ManualMappingCuration.value, + author_id=["orcid:0000-0002-2411-565X", "orcid:0000-0002-7356-1779"], + confidence=0.5, + ) + mapping_dict = mapping_obj.__dict__ + + expected_result = { + "subject_id": "DOID:0050601", + "predicate_id": "skos:exactMatch", + "object_id": "UMLS:C1863204", + "mapping_justification": "semapv:ManualMappingCuration", + "subject_label": "", + "subject_category": "", + "predicate_label": "", + "predicate_modifier": "", + "object_label": "", + "object_category": "", + "author_id": "orcid:0000-0002-2411-565X|orcid:0000-0002-7356-1779", + "author_label": "", + "reviewer_id": "", + "reviewer_label": "", + "creator_id": "", + "creator_label": "", + "license": "", + "subject_type": "", + "subject_source": "", + "subject_source_version": "", + "object_type": "", + "object_source": "", + "object_source_version": "", + "mapping_provider": "", + "mapping_source": "", + "mapping_cardinality": "", + "mapping_tool": "", + "mapping_tool_version": "", + "mapping_date": "", + "publication_date": "", + "confidence": 0.5, + "curation_rule": "", + "curation_rule_text": "", + "subject_match_field": "", + "object_match_field": "", + "match_string": "", + "subject_preprocessing": "", + "object_preprocessing": "", + "semantic_similarity_score": np.nan, + "semantic_similarity_measure": "", + "see_also": "", + "issue_tracker_item": "", + "other": "", + "comment": "", + } + + result_with_mapping_object = get_dict_from_mapping(mapping_obj) + result_with_dict = get_dict_from_mapping(mapping_dict) + self.assertEqual(result_with_mapping_object, result_with_dict) + + # Assert that every attribute value in expected_result + # equals the corresponding key in result_with_mapping_object (except lists) + for key, value in expected_result.items(): + if value is None or value == [] or value is np.nan: + self.assertIn(result_with_mapping_object[key], [np.nan, ""]) + self.assertIn(result_with_dict[key], [np.nan, ""]) + else: + self.assertEqual(value, result_with_mapping_object[key]) + self.assertEqual(value, result_with_dict[key])