Skip to content

Commit

Permalink
Optimization of some functions (#462)
Browse files Browse the repository at this point in the history
Addresses #202 
 - [x] Ran `poetry update`
- [x] Call `_get_sssom_schema_object()` once in the function
`get_dict_from_mapping()` rather than repeatedly inside a for loop, which
is inefficient.
- [x] Use `pandas.DataFrame.apply()` instead of `pandas.DataFrame.iterrows()` in
`_get_mapping_set_from_df()`
 - [x] Use dict/list comprehensions instead of for loops
- [x] Use sets instead of lists where lookups are done and the order of
elements doesn't matter.
 - [x] Improve `SchemaView` object instantiation and persistence
   - [x] Use `@cached_property` (thank you, @cthoyt)

---------

Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
Co-authored-by: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>
  • Loading branch information
3 people authored Nov 20, 2023
1 parent da2a250 commit 20a00be
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 110 deletions.
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 33 additions & 25 deletions src/sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import pathlib
import uuid
from enum import Enum
from functools import lru_cache
from typing import Any, Dict, List, Literal
from functools import cached_property, lru_cache
from typing import Any, Dict, List, Literal, Set

import pkg_resources
import yaml
Expand Down Expand Up @@ -213,48 +213,56 @@ class SSSOMSchemaView(object):
Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323
"""

_view = None
_dict = None

def __new__(cls):
"""Create a instance of the SSSOM schema view if non-existent."""
if not hasattr(cls, "instance"):
cls.instance = super(SSSOMSchemaView, cls).__new__(cls)
return cls.instance
return cls.instance

@property
@cached_property
def view(self) -> SchemaView:
"""Return SchemaView object."""
if self._view is None:
self._view = SchemaView(SCHEMA_YAML)
return self._view
return SchemaView(SCHEMA_YAML)

@property
@cached_property
def dict(self) -> dict:
"""Return SchemaView as a dictionary."""
if self._dict is None:
self._dict = schema_as_dict(self.view.schema)
return self._dict
return schema_as_dict(self.view.schema)

@property
@cached_property
def mapping_slots(self) -> List[str]:
"""Return list of mapping slots."""
return self.view.get_class("mapping").slots

@property
@cached_property
def mapping_set_slots(self) -> List[str]:
"""Return list of mapping set slots."""
return self.view.get_class("mapping set").slots

@property
def multivalued_slots(self) -> List[str]:
"""Return list of multivalued slots."""
return [c for c in self.view.all_slots() if self.view.get_slot(c).multivalued]

@property
def entity_reference_slots(self) -> List[str]:
"""Return list of entity reference slots."""
return [c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE]
@cached_property
def multivalued_slots(self) -> Set[str]:
"""Return set of multivalued slots."""
return {c for c in self.view.all_slots() if self.view.get_slot(c).multivalued}

@cached_property
def entity_reference_slots(self) -> Set[str]:
"""Return set of entity reference slots."""
return {c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE}

@cached_property
def mapping_enum_keys(self) -> Set[str]:
    """Return the names of all enums declared in the schema dictionary.

    The names are the keys of the ``"enums"`` section of ``self.dict``,
    returned as a set so callers can do fast membership tests.
    """
    # Read from ``self.dict`` like the sibling ``slots`` / ``double_slots``
    # properties rather than going back through the module-level
    # ``_get_sssom_schema_object()`` accessor (which presumably returns this
    # same singleton -- confirm against its definition).
    return set(self.dict["enums"])

@cached_property
def slots(self) -> Dict[str, Any]:
    """Return the ``"slots"`` section of the schema dictionary.

    The mapping is keyed by slot name. Each value is that slot's metadata
    (for example, ``double_slots`` reads its ``"range"`` entry), which is
    why the values are annotated as ``Any`` rather than ``str``.
    """
    return self.dict["slots"]

@cached_property
def double_slots(self) -> Set[str]:
    """Return the names of the slots whose range is ``double``.

    A slot whose metadata has no ``"range"`` entry is treated as not being
    a double slot instead of raising ``KeyError``.
    """
    return {
        name
        for name, metadata in self.dict["slots"].items()
        if metadata.get("range") == "double"
    }


@lru_cache(1)
Expand Down
47 changes: 30 additions & 17 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def _get_prefix_map_and_metadata(


def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
if is_multivalued_slot(k) and v is not None and isinstance(v, str):
if isinstance(v, str) and is_multivalued_slot(k):
# IF k is multivalued, then v = List[values]
return [s.strip() for s in v.split("|")]
else:
Expand All @@ -329,20 +329,28 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
return mapping_set


MAPPING_SLOTS = set(_get_sssom_schema_object().mapping_slots)


def _get_mapping_dict(row: pd.Series, bad_attrs: Counter) -> Dict[str, Any]:
mdict = {}
sssom_schema_object = _get_sssom_schema_object()
for k, v in row.items():
if not v or pd.isna(v):
continue
k = cast(str, k)
if k in sssom_schema_object.mapping_slots:
mdict[k] = _address_multivalued_slot(k, v)
else:
# There's the possibility that the key is in
# sssom_schema_object.mapping_set_slots, but
# this is skipped for now
bad_attrs[k] += 1
"""Generate a mapping dictionary from a given row of data.
It also updates the 'bad_attrs' counter for keys that are not present
in the sssom_schema_object's mapping_slots.
"""
# Populate the mapping dictionary with key-value pairs from the row,
# only if the value exists, is not NaN, and the key is in the schema's mapping slots.
# The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
mdict = {
k: _address_multivalued_slot(k, v)
for k, v in row.items()
if v and pd.notna(v) and k in MAPPING_SLOTS
}

# Update bad_attrs for keys not in mapping_slots
bad_keys = set(row.keys()) - MAPPING_SLOTS
for bad_key in bad_keys:
bad_attrs[bad_key] += 1
return mdict


Expand Down Expand Up @@ -795,9 +803,14 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
mapping_set = _init_mapping_set(meta)
bad_attrs: Counter = Counter()
for _, row in df.iterrows():
mapping_dict = _get_mapping_dict(row, bad_attrs)
_add_valid_mapping_to_list(mapping_dict, mapping_set.mappings)

df.apply(
lambda row: _add_valid_mapping_to_list(
_get_mapping_dict(row, bad_attrs), mapping_set.mappings
),
axis=1,
)

for k, v in bad_attrs.items():
logging.warning(f"No attr for {k} [{v} instances]")
return mapping_set
Expand Down
99 changes: 45 additions & 54 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,13 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
df.replace("", np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s.

slots = _get_sssom_schema_object().dict["slots"]
slots_with_double_as_range = {
slot
for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items()
if slot_metadata["range"] == "double"
slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double"
}
non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
non_double_cols = non_double_cols.replace(np.nan, "")
df[non_double_cols.columns] = non_double_cols
non_double_cols.replace(np.nan, "", inplace=True)
df.update(non_double_cols)

df = sort_df_rows_columns(df)
return cls.with_converter(df=df, converter=doc.converter, metadata=meta)
Expand Down Expand Up @@ -1044,46 +1043,35 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
:return: Dictionary
"""
map_dict = {}
slots_with_double_as_range = [
s
for s in _get_sssom_schema_object().dict["slots"].keys()
if _get_sssom_schema_object().dict["slots"][s]["range"] == "double"
]
sssom_schema_object = _get_sssom_schema_object()
for property in map_obj:
if map_obj[property] is not None:
if isinstance(map_obj[property], list):
# IF object is an enum
if (
_get_sssom_schema_object().dict["slots"][property]["range"]
in _get_sssom_schema_object().dict["enums"].keys()
):
# IF object is a multivalued enum
if _get_sssom_schema_object().dict["slots"][property]["multivalued"]:
map_dict[property] = "|".join(
enum_value.code.text for enum_value in map_obj[property]
)
# If object is NOT multivalued BUT an enum.
else:
map_dict[property] = map_obj[property].code.text
# IF object is NOT an enum but a list
else:
map_dict[property] = "|".join(enum_value for enum_value in map_obj[property])
# IF object NOT a list
mapping_property = map_obj[property]
if mapping_property is None:
map_dict[property] = np.nan if property in sssom_schema_object.double_slots else ""
continue

slot_of_interest = sssom_schema_object.slots[property]
is_enum = slot_of_interest["range"] in sssom_schema_object.mapping_enum_keys # type:ignore

# Check if the mapping_property is a list
if isinstance(mapping_property, list):
# If the property is an enumeration and it allows multiple values
if is_enum and slot_of_interest["multivalued"]: # type:ignore
# Join all the enum values into a string separated by '|'
map_dict[property] = "|".join(
enum_value.code.text for enum_value in mapping_property
)
else:
# IF object is an enum
if (
_get_sssom_schema_object().dict["slots"][property]["range"]
in _get_sssom_schema_object().dict["enums"].keys()
):
map_dict[property] = map_obj[property].code.text
else:
map_dict[property] = map_obj[property]
# If the property is not an enumeration or doesn't allow multiple values,
# join all the values into a string separated by '|'
map_dict[property] = "|".join(enum_value for enum_value in mapping_property)
elif is_enum:
# Assign the text of the enumeration code to the property in the dictionary
map_dict[property] = mapping_property.code.text
else:
# IF map_obj[property] is None:
if property in slots_with_double_as_range:
map_dict[property] = np.nan
else:
map_dict[property] = ""
# If the mapping_property is neither a list nor an enumeration,
# assign the value directly to the property in the dictionary
map_dict[property] = mapping_property

return map_dict

Expand Down Expand Up @@ -1139,18 +1127,21 @@ def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[st
prefixes = set(SSSOM_BUILT_IN_PREFIXES)
if df.empty:
return prefixes
for col in _get_sssom_schema_object().entity_reference_slots:
if col not in df.columns:
continue
prefixes.update(
converter.parse_curie(row).prefix
for row in df[col]
# we don't use the converter here since get_prefixes_used_in_table
# is often used to identify prefixes that are not properly registered
# in the converter
if not _is_iri(row) and _is_curie(row)
)
return set(prefixes)
sssom_schema_object = _get_sssom_schema_object()
entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns)
new_prefixes = {
converter.parse_curie(row).prefix
for col in entity_reference_slots
for row in df[col]
if not _is_iri(row) and _is_curie(row)
# we don't use the converter here since get_prefixes_used_in_table
# is often used to identify prefixes that are not properly registered
# in the converter
}

prefixes.update(new_prefixes)

return prefixes


def get_prefixes_used_in_metadata(meta: MetadataType) -> Set[str]:
Expand Down
Loading

0 comments on commit 20a00be

Please sign in to comment.