From f76c562fca03498b415be8839e872d38953440c6 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 12 May 2023 11:10:31 -0400 Subject: [PATCH] Fixes Min Score Bug Fixes a bug where the min_score argument in the mapper was ignored when not using TFIDF --- test/simple-test.py | 12 ++++++------ text2term/config.py | 2 +- text2term/t2t.py | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/test/simple-test.py b/test/simple-test.py index 0145ca7..7143c1f 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -6,12 +6,12 @@ def main(): pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" # print(bioregistry.get_owl_download("eFo")) - # if not text2term.cache_exists("EFO"): - # cached_onto = text2term.cache_ontology("EFO") - # # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") - # print("Cache exists:", cached_onto.cache_exists()) - caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") - df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") + if not text2term.cache_exists("EFO"): + cached_onto = text2term.cache_ontology("EFO") + # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") + print("Cache exists:", cached_onto.cache_exists()) + # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") + df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") print(df.to_string()) diff --git a/text2term/config.py b/text2term/config.py index 9b332ca..388faa9 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.3.1" \ No newline at end of file +VERSION = "2.3.2" \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index 711c3c0..12cc402 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -219,18 +219,27 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): if mapper == Mapper.TFIDF: term_mapper = TFIDFMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + mappings_df = term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) elif mapper == Mapper.ZOOMA: term_mapper = ZoomaMapper() - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper == Mapper.BIOPORTAL: term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: term_mapper = SyntacticMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) else: raise ValueError("Unsupported mapper: " + mapper) + df = _filter_mappings(mappings_df, min_score) + return df + +def _filter_mappings(mappings_df, min_score): + new_df = pd.DataFrame(columns=mappings_df.columns) + for index, row in mappings_df.iterrows(): + if row['Mapping Score'] >= min_score: + new_df.loc[len(new_df.index)] = row + return new_df def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type): if os.path.dirname(output_file): # create output directories if needed