From 40b97ac1426bd749f65bcae5f7ced5143398bc66 Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Sat, 9 Nov 2024 17:04:25 +0200 Subject: [PATCH 1/2] Handle two warnings that pollute the output of sssom-py CLI (#561) See commits for a more detailed description of the changes; This PR should not introduce any functional changes. --- src/sssom/cli.py | 4 ++++ src/sssom/parsers.py | 4 +--- src/sssom/util.py | 16 ++++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/sssom/cli.py b/src/sssom/cli.py index 9762d700..3bd8b72a 100644 --- a/src/sssom/cli.py +++ b/src/sssom/cli.py @@ -54,6 +54,7 @@ filter_redundant_rows, invert_mappings, merge_msdf, + pandas_set_no_silent_downcasting, reconcile_prefix_and_data, remove_unmatched, sort_df_rows_columns, @@ -126,6 +127,9 @@ def main(verbose: int, quiet: bool): """Run the SSSOM CLI.""" logger = _logging.getLogger() + + pandas_set_no_silent_downcasting() + if verbose >= 2: logger.setLevel(level=_logging.DEBUG) elif verbose == 1: diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index c2348fe3..28f96b43 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -424,9 +424,7 @@ def from_sssom_dataframe( # Need to revisit this solution. 
# This is to address: A value is trying to be set on a copy of a slice from a DataFrame if CONFIDENCE in df.columns: - df2 = df.copy() - df2[CONFIDENCE].replace(r"^\s*$", np.nan, regex=True, inplace=True) - df = df2 + df.replace({CONFIDENCE: r"^\s*$"}, np.nan, regex=True, inplace=True) mapping_set = _get_mapping_set_from_df(df=df, meta=meta) doc = MappingSetDocument(mapping_set=mapping_set, converter=converter) diff --git a/src/sssom/util.py b/src/sssom/util.py index f506c44a..6684f9a1 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -158,14 +158,9 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr df = pd.DataFrame(get_dict_from_mapping(mapping) for mapping in doc.mapping_set.mappings) meta = _extract_global_metadata(doc) - if pandas_version >= (2, 0, 0): - # For pandas >= 2.0.0, use the 'copy' parameter - df = df.infer_objects(copy=False) - else: - # For pandas < 2.0.0, call 'infer_objects()' without any parameters - df = df.infer_objects() # remove columns where all values are blank. df.replace("", np.nan, inplace=True) + df = df.infer_objects() df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s. slots = _get_sssom_schema_object().dict["slots"] @@ -1493,3 +1488,12 @@ def safe_compress(uri: str, converter: Converter) -> str: :return: A CURIE """ return converter.compress_or_standardize(uri, strict=True) + + +def pandas_set_no_silent_downcasting(no_silent_downcasting=True): + """Set pandas future.no_silent_downcasting option. 
Context https://github.com/pandas-dev/pandas/issues/57734.""" + try: + pd.set_option("future.no_silent_downcasting", no_silent_downcasting) + except KeyError: + # Option does not exist in this version of pandas + pass From 53d134576ae40cebc157ddba2c889724dbfba96d Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Sat, 9 Nov 2024 17:43:07 +0200 Subject: [PATCH 2/2] park stashed semra compatibility tests --- tests/test_semra_compatibility.py | 46 +++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/test_semra_compatibility.py diff --git a/tests/test_semra_compatibility.py b/tests/test_semra_compatibility.py new file mode 100644 index 00000000..123afd25 --- /dev/null +++ b/tests/test_semra_compatibility.py @@ -0,0 +1,46 @@ +"""Test for merging MappingSetDataFrames.""" + +import unittest + +from sssom_schema import Mapping + +from sssom.context import get_converter +from sssom.parsers import parse_sssom_table +from sssom.util import MappingSetDataFrame +from sssom.writers import write_table + + +class TestSemraCompatibility(unittest.TestCase): + """A test case for making sure the model works as intended.""" + + def test_basic_inference(self): + """Test if instantiating Mapping() fails when required elements are missing.""" + mdict_missing = dict( + subject_id="ID:123" + ) # This is missing object_id, predicate_id, mapping_justification + + import io + + import pandas as pd + from semra.api import infer_chains, infer_reversible + from semra.io import from_sssom_df, get_sssom_df + + data = [ + ["UBERON:1", "skos:exactMatch", "FBbt:9"], + ["UBERON:1", "skos:exactMatch", "WBbt:6"], + ] + + df = pd.DataFrame(data=data, columns=["subject_id", "predicate_id", "object_id"]) + + mappings = from_sssom_df(df, mapping_set_name="test") + mappings = infer_reversible(mappings, progress=False) + mappings = infer_chains(mappings, progress=False) + + df = get_sssom_df(mappings) + print(df) + msdf = MappingSetDataFrame(df=df, 
converter=get_converter()) + print(msdf.df) + msdf.standardize_references() + msdf.clean_prefix_map() + with open("testout.sssom.tsv", "w", encoding="utf-8") as file: + write_table(msdf=msdf, file=file)