Optimization of some functions (#462)

Addresses #202

- [x] Ran `poetry update`.
- [x] Call `_get_sssom_schema_object()` once in `get_dict_from_mapping()` instead of repeatedly inside a for loop, which was inefficient.
- [x] Use `pandas.DataFrame.apply()` instead of `pandas.DataFrame.iterrows()` in `_get_mapping_set_from_df()`.
- [x] Use dict/list comprehensions instead of for loops.
- [x] Use sets instead of lists where lookups are done and the order of the elements doesn't matter.
- [x] Improve `SchemaView` object instantiation and persistence.
- [x] Use `@cached_property` (thank you @cthoyt).

---------

Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
Co-authored-by: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>
mapping-commons · Nov 20, 2023 · 20a00be · 20a00be
1 parent da2a250
commit 20a00be
Show file tree

Hide file tree

Showing 5 changed files with 207 additions and 110 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/sssom/constants.py b/src/sssom/constants.py
@@ -3,8 +3,8 @@
 import pathlib
 import uuid
 from enum import Enum
-from functools import lru_cache
-from typing import Any, Dict, List, Literal
+from functools import cached_property, lru_cache
+from typing import Any, Dict, List, Literal, Set
 
 import pkg_resources
 import yaml
@@ -213,48 +213,56 @@ class SSSOMSchemaView(object):
     Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323
     """
 
-    _view = None
-    _dict = None
-
     def __new__(cls):
         """Create a instance of the SSSOM schema view if non-existent."""
         if not hasattr(cls, "instance"):
             cls.instance = super(SSSOMSchemaView, cls).__new__(cls)
-            return cls.instance
+        return cls.instance
 
-    @property
+    @cached_property
     def view(self) -> SchemaView:
         """Return SchemaView object."""
-        if self._view is None:
-            self._view = SchemaView(SCHEMA_YAML)
-        return self._view
+        return SchemaView(SCHEMA_YAML)
 
-    @property
+    @cached_property
     def dict(self) -> dict:
         """Return SchemaView as a dictionary."""
-        if self._dict is None:
-            self._dict = schema_as_dict(self.view.schema)
-        return self._dict
+        return schema_as_dict(self.view.schema)
 
-    @property
+    @cached_property
     def mapping_slots(self) -> List[str]:
         """Return list of mapping slots."""
         return self.view.get_class("mapping").slots
 
-    @property
+    @cached_property
     def mapping_set_slots(self) -> List[str]:
         """Return list of mapping set slots."""
         return self.view.get_class("mapping set").slots
 
-    @property
-    def multivalued_slots(self) -> List[str]:
-        """Return list of multivalued slots."""
-        return [c for c in self.view.all_slots() if self.view.get_slot(c).multivalued]
-
-    @property
-    def entity_reference_slots(self) -> List[str]:
-        """Return list of entity reference slots."""
-        return [c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE]
+    @cached_property
+    def multivalued_slots(self) -> Set[str]:
+        """Return set of multivalued slots."""
+        return {c for c in self.view.all_slots() if self.view.get_slot(c).multivalued}
+
+    @cached_property
+    def entity_reference_slots(self) -> Set[str]:
+        """Return set of entity reference slots."""
+        return {c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE}
+
+    @cached_property
+    def mapping_enum_keys(self) -> Set[str]:
+        """Return a set of mapping enum keys."""
+        return set(_get_sssom_schema_object().dict["enums"].keys())
+
+    @cached_property
+    def slots(self) -> Dict[str, str]:
+        """Return the slots for SSSOMSchemaView object."""
+        return self.dict["slots"]
+
+    @cached_property
+    def double_slots(self) -> Set[str]:
+        """Return the slot names for SSSOMSchemaView object."""
+        return {k for k, v in self.dict["slots"].items() if v["range"] == "double"}
 
 
 @lru_cache(1)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -313,7 +313,7 @@ def _get_prefix_map_and_metadata(
 
 
 def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
-    if is_multivalued_slot(k) and v is not None and isinstance(v, str):
+    if isinstance(v, str) and is_multivalued_slot(k):
         # IF k is multivalued, then v = List[values]
         return [s.strip() for s in v.split("|")]
     else:
@@ -329,20 +329,28 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
     return mapping_set
 
 
+MAPPING_SLOTS = set(_get_sssom_schema_object().mapping_slots)
+
+
 def _get_mapping_dict(row: pd.Series, bad_attrs: Counter) -> Dict[str, Any]:
-    mdict = {}
-    sssom_schema_object = _get_sssom_schema_object()
-    for k, v in row.items():
-        if not v or pd.isna(v):
-            continue
-        k = cast(str, k)
-        if k in sssom_schema_object.mapping_slots:
-            mdict[k] = _address_multivalued_slot(k, v)
-        else:
-            # There's the possibility that the key is in
-            # sssom_schema_object.mapping_set_slots, but
-            # this is skipped for now
-            bad_attrs[k] += 1
+    """Generate a mapping dictionary from a given row of data.
+
+    It also updates the 'bad_attrs' counter for keys that are not present
+    in the sssom_schema_object's mapping_slots.
+    """
+    # Populate the mapping dictionary with key-value pairs from the row,
+    # only if the value exists, is not NaN, and the key is in the schema's mapping slots.
+    # The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
+    mdict = {
+        k: _address_multivalued_slot(k, v)
+        for k, v in row.items()
+        if v and pd.notna(v) and k in MAPPING_SLOTS
+    }
+
+    # Update bad_attrs for keys not in mapping_slots
+    bad_keys = set(row.keys()) - MAPPING_SLOTS
+    for bad_key in bad_keys:
+        bad_attrs[bad_key] += 1
     return mdict
 
 
@@ -795,9 +803,14 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
 def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
     mapping_set = _init_mapping_set(meta)
     bad_attrs: Counter = Counter()
-    for _, row in df.iterrows():
-        mapping_dict = _get_mapping_dict(row, bad_attrs)
-        _add_valid_mapping_to_list(mapping_dict, mapping_set.mappings)
+
+    df.apply(
+        lambda row: _add_valid_mapping_to_list(
+            _get_mapping_dict(row, bad_attrs), mapping_set.mappings
+        ),
+        axis=1,
+    )
+
     for k, v in bad_attrs.items():
         logging.warning(f"No attr for {k} [{v} instances]")
     return mapping_set

diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -156,14 +156,13 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
         df.replace("", np.nan, inplace=True)
         df.dropna(axis=1, how="all", inplace=True)  # remove columns with all row = 'None'-s.
 
+        slots = _get_sssom_schema_object().dict["slots"]
         slots_with_double_as_range = {
-            slot
-            for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items()
-            if slot_metadata["range"] == "double"
+            slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double"
         }
         non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
-        non_double_cols = non_double_cols.replace(np.nan, "")
-        df[non_double_cols.columns] = non_double_cols
+        non_double_cols.replace(np.nan, "", inplace=True)
+        df.update(non_double_cols)
 
         df = sort_df_rows_columns(df)
         return cls.with_converter(df=df, converter=doc.converter, metadata=meta)
@@ -1044,46 +1043,35 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
     :return: Dictionary
     """
     map_dict = {}
-    slots_with_double_as_range = [
-        s
-        for s in _get_sssom_schema_object().dict["slots"].keys()
-        if _get_sssom_schema_object().dict["slots"][s]["range"] == "double"
-    ]
+    sssom_schema_object = _get_sssom_schema_object()
     for property in map_obj:
-        if map_obj[property] is not None:
-            if isinstance(map_obj[property], list):
-                # IF object is an enum
-                if (
-                    _get_sssom_schema_object().dict["slots"][property]["range"]
-                    in _get_sssom_schema_object().dict["enums"].keys()
-                ):
-                    # IF object is a multivalued enum
-                    if _get_sssom_schema_object().dict["slots"][property]["multivalued"]:
-                        map_dict[property] = "|".join(
-                            enum_value.code.text for enum_value in map_obj[property]
-                        )
-                    # If object is NOT multivalued BUT an enum.
-                    else:
-                        map_dict[property] = map_obj[property].code.text
-                # IF object is NOT an enum but a list
-                else:
-                    map_dict[property] = "|".join(enum_value for enum_value in map_obj[property])
-            # IF object NOT a list
+        mapping_property = map_obj[property]
+        if mapping_property is None:
+            map_dict[property] = np.nan if property in sssom_schema_object.double_slots else ""
+            continue
+
+        slot_of_interest = sssom_schema_object.slots[property]
+        is_enum = slot_of_interest["range"] in sssom_schema_object.mapping_enum_keys  # type:ignore
+
+        # Check if the mapping_property is a list
+        if isinstance(mapping_property, list):
+            # If the property is an enumeration and it allows multiple values
+            if is_enum and slot_of_interest["multivalued"]:  # type:ignore
+                # Join all the enum values into a string separated by '|'
+                map_dict[property] = "|".join(
+                    enum_value.code.text for enum_value in mapping_property
+                )
             else:
-                # IF object is an enum
-                if (
-                    _get_sssom_schema_object().dict["slots"][property]["range"]
-                    in _get_sssom_schema_object().dict["enums"].keys()
-                ):
-                    map_dict[property] = map_obj[property].code.text
-                else:
-                    map_dict[property] = map_obj[property]
+                # If the property is not an enumeration or doesn't allow multiple values,
+                # join all the values into a string separated by '|'
+                map_dict[property] = "|".join(enum_value for enum_value in mapping_property)
+        elif is_enum:
+            # Assign the text of the enumeration code to the property in the dictionary
+            map_dict[property] = mapping_property.code.text
         else:
-            # IF map_obj[property] is None:
-            if property in slots_with_double_as_range:
-                map_dict[property] = np.nan
-            else:
-                map_dict[property] = ""
+            # If the mapping_property is neither a list nor an enumeration,
+            # assign the value directly to the property in the dictionary
+            map_dict[property] = mapping_property
 
     return map_dict
 
@@ -1139,18 +1127,21 @@ def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[st
     prefixes = set(SSSOM_BUILT_IN_PREFIXES)
     if df.empty:
         return prefixes
-    for col in _get_sssom_schema_object().entity_reference_slots:
-        if col not in df.columns:
-            continue
-        prefixes.update(
-            converter.parse_curie(row).prefix
-            for row in df[col]
-            # we don't use the converter here since get_prefixes_used_in_table
-            # is often used to identify prefixes that are not properly registered
-            # in the converter
-            if not _is_iri(row) and _is_curie(row)
-        )
-    return set(prefixes)
+    sssom_schema_object = _get_sssom_schema_object()
+    entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns)
+    new_prefixes = {
+        converter.parse_curie(row).prefix
+        for col in entity_reference_slots
+        for row in df[col]
+        if not _is_iri(row) and _is_curie(row)
+        # we don't use the converter here since get_prefixes_used_in_table
+        # is often used to identify prefixes that are not properly registered
+        # in the converter
+    }
+
+    prefixes.update(new_prefixes)
+
+    return prefixes
 
 
 def get_prefixes_used_in_metadata(meta: MetadataType) -> Set[str]: