From d4fd48e6b16cd8b5facdf847f46c686b27223e81 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 11 Jan 2024 20:53:56 -0500 Subject: [PATCH] Bugfixes: match-mondo-sources-all-lexical.py - Bugfix: AttributeError: 'tuple' object has no attribute 'pop': wrong datatype for metadata was being passed to lexical_index_to_sssom() - Bugfix: Several other bugs in mondo-ingest, and upgrading OAK/sssom-py/curies to fix other bugs related to prefix maps. - Update: mondo.sssom.config.yml: Commented out duplicate prefix 'oio' - Update: prefixes.csv: Removed duplicate prefix oio - Update: Python requirements: Upgraded curies for bugfix involving get_prefixes(include_synonyms) --- python-requirements-apple-silicon.txt | 2 +- python-requirements.txt | 2 +- src/ontology/config/prefixes.csv | 2 +- src/ontology/metadata/mondo.sssom.config.yml | 2 +- .../match-mondo-sources-all-lexical.py | 45 +++++++++++++------ 5 files changed, 36 insertions(+), 17 deletions(-) diff --git a/python-requirements-apple-silicon.txt b/python-requirements-apple-silicon.txt index 7af503b98..53f694ff7 100644 --- a/python-requirements-apple-silicon.txt +++ b/python-requirements-apple-silicon.txt @@ -15,7 +15,7 @@ charset-normalizer==3.3.2 class-resolver==0.4.2 click==8.1.7 colorama==0.4.6 -curies==0.7.4 +curies==0.7.6 Deprecated==1.2.14 deprecation==2.1.0 distlib==0.3.7 diff --git a/python-requirements.txt b/python-requirements.txt index 4731bf75f..ca771da21 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -22,7 +22,7 @@ class-resolver==0.4.2 click==8.1.7 colorama==0.4.6 commonmark==0.9.1 -curies==0.6.4 +curies==0.7.6 decorator==5.1.1 Deprecated==1.2.13 deprecation==2.1.0 diff --git a/src/ontology/config/prefixes.csv b/src/ontology/config/prefixes.csv index f29f5f415..0efab0571 100644 --- a/src/ontology/config/prefixes.csv +++ b/src/ontology/config/prefixes.csv @@ -3,7 +3,7 @@ rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns# rdfs,http://www.w3.org/2000/01/rdf-schema# xsd,http://www.w3.org/2001/XMLSchema# owl,http://www.w3.org/2002/07/owl# -oio,http://www.geneontology.org/formats/oboInOwl# +oboInOwl,http://www.geneontology.org/formats/oboInOwl# dce,http://purl.org/dc/elements/1.1/ dct,http://purl.org/dc/terms/ foaf,http://xmlns.com/foaf/0.1/ diff --git a/src/ontology/metadata/mondo.sssom.config.yml b/src/ontology/metadata/mondo.sssom.config.yml index 6875e9cfb..6fb52f357 100644 --- a/src/ontology/metadata/mondo.sssom.config.yml +++ b/src/ontology/metadata/mondo.sssom.config.yml @@ -34,7 +34,7 @@ curie_map: semapv: https://w3id.org/semapv/vocab/ rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# sssom: https://w3id.org/sssom/ - oio: http://www.geneontology.org/formats/oboInOwl# +# oio: http://www.geneontology.org/formats/oboInOwl# GTR: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/" NCI: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NCI/" NIFSTD: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NIFSTD/" diff --git a/src/scripts/match-mondo-sources-all-lexical.py b/src/scripts/match-mondo-sources-all-lexical.py index 1da046468..8ed877620 100644 --- a/src/scripts/match-mondo-sources-all-lexical.py +++ b/src/scripts/match-mondo-sources-all-lexical.py @@ -11,7 +11,10 @@ # Use oak.mapping() pipeline import logging +from datetime import datetime from pathlib import Path + +from curies import Converter from oaklib.resource import OntologyResource from oaklib.implementations.sqldb.sql_implementation import SqlImplementation from oaklib.utilities.lexical.lexical_indexer import ( @@ -25,11 +28,11 @@ import yaml import pandas as pd -from sssom.constants import SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER +from sssom.constants import SUBJECT_ID, OBJECT_ID from sssom.util import filter_prefixes, is_curie, is_iri from sssom.parsers import parse_sssom_table from sssom.writers import write_table -from sssom.io import get_metadata_and_prefix_map, filter_file +from sssom.io import filter_file from bioregistry import curie_from_iri SRC = Path(__file__).resolve().parents[1] @@ -83,11 +86,11 @@ def main(verbose: int, quiet: bool): ) @output_option def run(input: str, config: str, rules: str, rejects: str, output: str): - # Implemented `meta` param in `lexical_index_to_sssom` - - meta = get_metadata_and_prefix_map(config) + t0 = datetime.now() # todo: temp + # Get metadata config with open(config, "r") as f: yml = yaml.safe_load(f) + converter = Converter.from_extended_prefix_map(yml.pop('extended_prefix_map', {})) # Get mondo.sssom.tsv mapping_msdf = parse_sssom_table(SSSOM_MAP_FILE) @@ -108,19 +111,31 @@ def run(input: str, config: str, rules: str, rejects: str, output: str): # .reset_index(drop=True) # ) - prefix_of_interest = yml["subject_prefixes"] - resource = OntologyResource(slug=f"sqlite:///{Path(input).absolute()}") oi = SqlImplementation(resource=resource) ruleset = load_mapping_rules(rules) # syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer] - lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset) - save_lexical_index(lexical_index, OUT_INDEX_DB) - + # TODO: uncomment + # t0_2 = datetime.now() # todo: temp + # lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset) + # save_lexical_index(lexical_index, OUT_INDEX_DB) + # t1_2 = datetime.now() # todo: temp + # print('lexical_index & save_lexical_index complete in seconds:', (t1_2 - t0_2).seconds) # todo temp + # TODO: /uncomment + + # TODO temp delete after + import pickle + pp = '/Users/joeflack4/projects/mondo-ingest/cache/issues/lexmatch/mondo-ingest/lexical_index.pickle/b4_after_remove_oio_prefixes_csv/after/lexical_index.pickle' + # pickle.dump(lexical_index, open(pp, "wb")) + lexical_index = pickle.load(open(pp, "rb")) + + t0_3 = datetime.now() # todo: temp if rules: - msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, meta=meta) + msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, prefix_map=converter) else: - msdf = lexical_index_to_sssom(oi, lexical_index, meta=meta) + msdf = lexical_index_to_sssom(oi, lexical_index) + t1_3 = datetime.now() # todo: temp + print('lexical_index_to_sssom complete in seconds:', (t1_3 - t0_3).seconds) # todo temp # msdf.prefix_map = sssom_yaml['curie_map'] # msdf.metadata = sssom_yaml['global_metadata'] @@ -131,8 +146,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str): # msdf.df[OBJECT_ID] = msdf.df[OBJECT_ID].apply( # lambda x: iri_to_curie(x) if x.startswith("