From 71fd56a9c5c7ba733d9999a06123e8918b7f281b Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 11 Jan 2024 20:53:56 -0500 Subject: [PATCH] Bugfixes: match-mondo-sources-all-lexical.py - Bugfix: AttributeError: 'tuple' object has no attribute 'pop': wrong datatype for metadata was being passed to lexical_index_to_sssom() - Bugfix: Several other bugs in mondo-ingest, and upgrading OAK/sssom-py/curies to fix other bugs related to prefix maps. - Update: mondo.sssom.config.yaml: Added extended prefix map --- src/ontology/metadata/mondo.sssom.config.yml | 298 ++++++++++++++++++ .../match-mondo-sources-all-lexical.py | 74 ++++- 2 files changed, 360 insertions(+), 12 deletions(-) diff --git a/src/ontology/metadata/mondo.sssom.config.yml b/src/ontology/metadata/mondo.sssom.config.yml index 2afeff320..a59983ff1 100644 --- a/src/ontology/metadata/mondo.sssom.config.yml +++ b/src/ontology/metadata/mondo.sssom.config.yml @@ -75,6 +75,304 @@ curie_map: GC_ID: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GC_ID/" SNOMEDCT_2010_1_31: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/SNOMEDCT_2010_1_31/" +extended_prefix_map: + - prefix: MONDO + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/MONDO_ + uri_prefix_synonyms: [] + - prefix: UMLS + prefix_synonyms: [] + uri_prefix: http://linkedlifedata.com/resource/umls/id/ + uri_prefix_synonyms: [] + - prefix: NCIT + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/NCIT_ + uri_prefix_synonyms: [] + - prefix: DOID + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/DOID_ + uri_prefix_synonyms: [] + - prefix: EFO + prefix_synonyms: [] + uri_prefix: http://www.ebi.ac.uk/efo/EFO_ + uri_prefix_synonyms: [] + - prefix: HGNC + prefix_synonyms: [] + uri_prefix: http://identifiers.org/hgnc/ + uri_prefix_synonyms: [] + - prefix: HGNC__2 + prefix_synonyms: [] + uri_prefix: 'https://identifiers.org/hgnc:' + uri_prefix_synonyms: [] + - prefix: HGNC__3 + prefix_synonyms: [] + uri_prefix: 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:' + uri_prefix_synonyms: [] + - prefix: HGNC_symbol + prefix_synonyms: [] + uri_prefix: https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/ + uri_prefix_synonyms: [] + - prefix: HP + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/HP_ + uri_prefix_synonyms: [] + - prefix: SCTID + prefix_synonyms: [] + uri_prefix: http://identifiers.org/snomedct/ + uri_prefix_synonyms: [] + - prefix: SCTID__2 + prefix_synonyms: [] + uri_prefix: http://snomed.info/id/ + uri_prefix_synonyms: [] + - prefix: OMIM + prefix_synonyms: [] + uri_prefix: https://omim.org/entry/ + uri_prefix_synonyms: [] + - prefix: OMIM__2 + prefix_synonyms: [] + uri_prefix: http://identifiers.org/omim/ + uri_prefix_synonyms: [] + - prefix: OMIM__3 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/OMIM_ + uri_prefix_synonyms: [] + - prefix: OMIM__4 + prefix_synonyms: [] + uri_prefix: http://omim.org/entry/ + uri_prefix_synonyms: [] + - prefix: MESH + prefix_synonyms: [] + uri_prefix: http://identifiers.org/mesh/ + uri_prefix_synonyms: [] + - prefix: Orphanet + prefix_synonyms: [] + uri_prefix: http://www.orpha.net/ORDO/Orphanet_ + uri_prefix_synonyms: [] + - prefix: Orphanet__2 + prefix_synonyms: [] + uri_prefix: https://www.orpha.net/ORDO/Orphanet_ + uri_prefix_synonyms: [] + - prefix: Orphanet__3 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/Orphanet_ + uri_prefix_synonyms: [] + - prefix: oboInOwl + prefix_synonyms: + - oio + uri_prefix: http://www.geneontology.org/formats/oboInOwl# + uri_prefix_synonyms: [] + - prefix: NCBITaxon + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/NCBITaxon_ + uri_prefix_synonyms: [] + - prefix: skos + prefix_synonyms: [] + uri_prefix: http://www.w3.org/2004/02/skos/core# + uri_prefix_synonyms: [] + - prefix: ICD10CM + prefix_synonyms: [] + uri_prefix: http://purl.bioontology.org/ontology/ICD10CM/ + uri_prefix_synonyms: [] + - prefix: ICD10CM__2 + prefix_synonyms: [] + uri_prefix: https://icd.codes/icd10cm/ + uri_prefix_synonyms: [] + - prefix: ICD10WHO + prefix_synonyms: [] + uri_prefix: https://icd.who.int/browse10/2019/en#/ + uri_prefix_synonyms: [] + - prefix: ICD10WHO__2 + prefix_synonyms: [] + uri_prefix: http://apps.who.int/classifications/icd10/browse/2010/en#/ + uri_prefix_synonyms: [] + - prefix: OMIMPS + prefix_synonyms: [] + uri_prefix: https://omim.org/phenotypicSeries/PS + uri_prefix_synonyms: [] + - prefix: MEDGEN + prefix_synonyms: [] + uri_prefix: http://identifiers.org/medgen/ + uri_prefix_synonyms: [] + - prefix: MedDRA + prefix_synonyms: [] + uri_prefix: http://identifiers.org/meddra/ + uri_prefix_synonyms: [] + - prefix: rdfs + prefix_synonyms: [] + uri_prefix: http://www.w3.org/2000/01/rdf-schema# + uri_prefix_synonyms: [] + - prefix: owl + prefix_synonyms: [] + uri_prefix: http://www.w3.org/2002/07/owl# + uri_prefix_synonyms: [] + - prefix: semapv + prefix_synonyms: [] + uri_prefix: https://w3id.org/semapv/vocab/ + uri_prefix_synonyms: [] + - prefix: rdf + prefix_synonyms: [] + uri_prefix: http://www.w3.org/1999/02/22-rdf-syntax-ns# + uri_prefix_synonyms: [] + - prefix: sssom + prefix_synonyms: [] + uri_prefix: https://w3id.org/sssom/ + uri_prefix_synonyms: [] + - prefix: GTR + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/ + uri_prefix_synonyms: [] + - prefix: NCI + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NCI/ + uri_prefix_synonyms: [] + - prefix: NIFSTD + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NIFSTD/ + uri_prefix_synonyms: [] + - prefix: PO_GIT + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/PO_GIT/ + uri_prefix_synonyms: [] + - prefix: CALOHA + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/CALOHA/ + uri_prefix_synonyms: [] + - prefix: Reactome + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/Reactome/ + uri_prefix_synonyms: [] + - prefix: MTH + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/MTH/ + uri_prefix_synonyms: [] + - prefix: IMDRF + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/IMDRF/ + uri_prefix_synonyms: [] + - prefix: LOINC + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/LOINC/ + uri_prefix_synonyms: [] + - prefix: MEDDRA + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/MEDDRA/ + uri_prefix_synonyms: [] + - prefix: ncithesaurus + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ncithesaurus/ + uri_prefix_synonyms: [] + - prefix: COHD + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/COHD/ + uri_prefix_synonyms: [] + - prefix: ONCOTREE + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ONCOTREE/ + uri_prefix_synonyms: [] + - prefix: ICD9 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD9/ + uri_prefix_synonyms: [] + - prefix: NDFRT + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NDFRT/ + uri_prefix_synonyms: [] + - prefix: ICD9CM + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD9CM/ + uri_prefix_synonyms: [] + - prefix: SUBSET_SIREN + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/SUBSET_SIREN/ + uri_prefix_synonyms: [] + - prefix: ICDO + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICDO/ + uri_prefix_synonyms: [] + - prefix: Wikidata + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/Wikidata/ + uri_prefix_synonyms: [] + - prefix: IEDB + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/IEDB/ + uri_prefix_synonyms: [] + - prefix: PMID + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/PMID/ + uri_prefix_synonyms: [] + - prefix: KEGG + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/KEGG/ + uri_prefix_synonyms: [] + - prefix: ICD11 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD11/ + uri_prefix_synonyms: [] + - prefix: DECIPHER + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/DECIPHER/ + uri_prefix_synonyms: [] + - prefix: CSP + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/CSP/ + uri_prefix_synonyms: [] + - prefix: Wikipedia + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/Wikipedia/ + uri_prefix_synonyms: [] + - prefix: Fyler + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/Fyler/ + uri_prefix_synonyms: [] + - prefix: EPCC + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/EPCC/ + uri_prefix_synonyms: [] + - prefix: UMLS_CUI + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/UMLS_CUI/ + uri_prefix_synonyms: [] + - prefix: KUPO + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/KUPO/ + uri_prefix_synonyms: [] + - prefix: OMOP + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/OMOP/ + uri_prefix_synonyms: [] + - prefix: ICD10 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD10/ + uri_prefix_synonyms: [] + - prefix: ICD10EXP + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD10EXP/ + uri_prefix_synonyms: [] + - prefix: DERMO + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/DERMO/ + uri_prefix_synonyms: [] + - prefix: GARD + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GARD/ + uri_prefix_synonyms: [] + - prefix: SNOMEDCT_US + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/SNOMEDCT_US/ + uri_prefix_synonyms: [] + - prefix: MSH + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/MSH/ + uri_prefix_synonyms: [] + - prefix: GC_ID + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GC_ID/ + uri_prefix_synonyms: [] + - prefix: SNOMEDCT_2010_1_31 + prefix_synonyms: [] + uri_prefix: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/SNOMEDCT_2010_1_31/ + uri_prefix_synonyms: [] subject_prefixes: - MONDO diff --git a/src/scripts/match-mondo-sources-all-lexical.py b/src/scripts/match-mondo-sources-all-lexical.py index 1da046468..464475346 100644 --- a/src/scripts/match-mondo-sources-all-lexical.py +++ b/src/scripts/match-mondo-sources-all-lexical.py @@ -11,7 +11,12 @@ # Use oak.mapping() pipeline import logging +from collections import ChainMap +from datetime import datetime from pathlib import Path +from typing import List, Tuple, Union + +from curies import Converter from oaklib.resource import OntologyResource from oaklib.implementations.sqldb.sql_implementation import SqlImplementation from oaklib.utilities.lexical.lexical_indexer import ( @@ -25,11 +30,12 @@ import yaml import pandas as pd -from sssom.constants import SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER +from sssom.constants import MetadataType, SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER, get_default_metadata +from sssom.context import get_converter from sssom.util import filter_prefixes, is_curie, is_iri from sssom.parsers import parse_sssom_table from sssom.writers import write_table -from sssom.io import get_metadata_and_prefix_map, filter_file +from sssom.io import _merge_converter, get_metadata_and_prefix_map, filter_file from bioregistry import curie_from_iri SRC = Path(__file__).resolve().parents[1] @@ -49,6 +55,26 @@ ) +# todo: Harshad initially set up to have this return metadata but I don't see it being used. Should just instantiate +# simple converter? maybe even not do as a function? +def get_converter_and_metadata(metadata_path: Union[None, str, Path] = None) -> Tuple[Converter, MetadataType]: + """ + Load SSSOM metadata from a YAML file, and then augment it with default prefixes. + + :param metadata_path: The metadata file in YAML format + """ + if metadata_path is None: + return get_converter(), get_default_metadata() + + with Path(metadata_path).resolve().open() as file: + metadata = yaml.safe_load(file) + + metadata = dict(ChainMap(metadata, get_default_metadata())) + converter = Converter.from_extended_prefix_map(metadata.pop('extended_prefix_map', {})) + converter = _merge_converter(converter) + return converter, metadata + + @click.group() @click.option("-v", "--verbose", count=True) @click.option("-q", "--quiet") @@ -83,18 +109,32 @@ def main(verbose: int, quiet: bool): ) @output_option def run(input: str, config: str, rules: str, rejects: str, output: str): - # Implemented `meta` param in `lexical_index_to_sssom` - - meta = get_metadata_and_prefix_map(config) + t0 = datetime.now() # todo: temp + # TODO 01/17: this will be a private method. can i simply create converter (and meta if needed) here? + # old code + # converter, meta = get_metadata_and_prefix_map(config) + # new code + converter, meta = get_converter_and_metadata(config) + + # todo's <01/17 + # todo temp: if lexical_index_to_sssom() is actually in need of passing 'meta', pass msdf_meta? But causes error: + # ValueError: Unknown argument: curie_map = ... + # msdf_meta = {'curie_map': converter.prefix_map} + # todo: wanna add 'Records' this but get + # ModuleNotFoundError: No module named 'defusedxml' + # from curies.mapping_service.utils import Records + epm: List = converter.records with open(config, "r") as f: yml = yaml.safe_load(f) # Get mondo.sssom.tsv + # TODO: # mapping_msdf.df = ( # pd.merge( # mapping_msdf.df, @@ -108,19 +148,25 @@ def run(input: str, config: str, rules: str, rejects: str, output: str): # .reset_index(drop=True) # ) - prefix_of_interest = yml["subject_prefixes"] - resource = OntologyResource(slug=f"sqlite:///{Path(input).absolute()}") oi = SqlImplementation(resource=resource) ruleset = load_mapping_rules(rules) # syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer] - lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset) - save_lexical_index(lexical_index, OUT_INDEX_DB) + # TODO: uncomment + # lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset) + # save_lexical_index(lexical_index, OUT_INDEX_DB) + # TODO: /uncomment + + # TODO temp delete after + import pickle + pp = '/Users/joeflack4/projects/mondo-ingest/cache/issues/lexmatch/lexical_index.pickle' + # pickle.dump(lexical_index, open(pp, "wb")) + lexical_index = pickle.load(open(pp, "rb")) if rules: - msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, meta=meta) + msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, extended_prefix_map=epm) else: - msdf = lexical_index_to_sssom(oi, lexical_index, meta=meta) + msdf = lexical_index_to_sssom(oi, lexical_index) # msdf.prefix_map = sssom_yaml['curie_map'] # msdf.metadata = sssom_yaml['global_metadata'] @@ -131,8 +177,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str): # msdf.df[OBJECT_ID] = msdf.df[OBJECT_ID].apply( # lambda x: iri_to_curie(x) if x.startswith("