Skip to content

Commit

Permalink
Bugfixes: match-mondo-sources-all-lexical.py
Browse files Browse the repository at this point in the history
- Bugfix: AttributeError: 'tuple' object has no attribute 'pop': wrong datatype for metadata was being passed to lexical_index_to_sssom()
- Bugfix: Fixed several other bugs in mondo-ingest, and upgraded OAK/sssom-py/curies to fix other bugs related to prefix maps.
- Update: mondo.sssom.config.yml: Commented out duplicate prefix 'oio'
- Update: prefixes.csv: Removed duplicate prefix oio
- Update: Python requirements: Upgraded curies for bugfix involving get_prefixes(include_synonyms)
  • Loading branch information
joeflack4 committed Jan 20, 2024
1 parent 466b83c commit d4fd48e
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 17 deletions.
2 changes: 1 addition & 1 deletion python-requirements-apple-silicon.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ charset-normalizer==3.3.2
class-resolver==0.4.2
click==8.1.7
colorama==0.4.6
curies==0.7.4
curies==0.7.6
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.7
Expand Down
2 changes: 1 addition & 1 deletion python-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class-resolver==0.4.2
click==8.1.7
colorama==0.4.6
commonmark==0.9.1
curies==0.6.4
curies==0.7.6
decorator==5.1.1
Deprecated==1.2.13
deprecation==2.1.0
Expand Down
2 changes: 1 addition & 1 deletion src/ontology/config/prefixes.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs,http://www.w3.org/2000/01/rdf-schema#
xsd,http://www.w3.org/2001/XMLSchema#
owl,http://www.w3.org/2002/07/owl#
oio,http://www.geneontology.org/formats/oboInOwl#
oboInOwl,http://www.geneontology.org/formats/oboInOwl#
dce,http://purl.org/dc/elements/1.1/
dct,http://purl.org/dc/terms/
foaf,http://xmlns.com/foaf/0.1/
Expand Down
2 changes: 1 addition & 1 deletion src/ontology/metadata/mondo.sssom.config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ curie_map:
semapv: https://w3id.org/semapv/vocab/
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
sssom: https://w3id.org/sssom/
oio: http://www.geneontology.org/formats/oboInOwl#
# oio: http://www.geneontology.org/formats/oboInOwl#
GTR: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/"
NCI: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NCI/"
NIFSTD: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NIFSTD/"
Expand Down
45 changes: 32 additions & 13 deletions src/scripts/match-mondo-sources-all-lexical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
# Use oak.mapping() pipeline

import logging
from datetime import datetime
from pathlib import Path

from curies import Converter
from oaklib.resource import OntologyResource
from oaklib.implementations.sqldb.sql_implementation import SqlImplementation
from oaklib.utilities.lexical.lexical_indexer import (
Expand All @@ -25,11 +28,11 @@
import yaml
import pandas as pd

from sssom.constants import SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER
from sssom.constants import SUBJECT_ID, OBJECT_ID
from sssom.util import filter_prefixes, is_curie, is_iri
from sssom.parsers import parse_sssom_table
from sssom.writers import write_table
from sssom.io import get_metadata_and_prefix_map, filter_file
from sssom.io import filter_file
from bioregistry import curie_from_iri

SRC = Path(__file__).resolve().parents[1]
Expand Down Expand Up @@ -83,11 +86,11 @@ def main(verbose: int, quiet: bool):
)
@output_option
def run(input: str, config: str, rules: str, rejects: str, output: str):
# Implemented `meta` param in `lexical_index_to_sssom`

meta = get_metadata_and_prefix_map(config)
t0 = datetime.now() # todo: temp
# Get metadata config
with open(config, "r") as f:
yml = yaml.safe_load(f)
converter = Converter.from_extended_prefix_map(yml.pop('extended_prefix_map', {}))

# Get mondo.sssom.tsv
mapping_msdf = parse_sssom_table(SSSOM_MAP_FILE)
Expand All @@ -108,19 +111,31 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
# .reset_index(drop=True)
# )

prefix_of_interest = yml["subject_prefixes"]

resource = OntologyResource(slug=f"sqlite:///{Path(input).absolute()}")
oi = SqlImplementation(resource=resource)
ruleset = load_mapping_rules(rules)
# syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer]
lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset)
save_lexical_index(lexical_index, OUT_INDEX_DB)

# TODO: uncomment
# t0_2 = datetime.now() # todo: temp
# lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset)
# save_lexical_index(lexical_index, OUT_INDEX_DB)
# t1_2 = datetime.now() # todo: temp
# print('lexical_index & save_lexical_index complete in seconds:', (t1_2 - t0_2).seconds) # todo temp
# TODO: /uncomment

# TODO temp delete after
import pickle
pp = '/Users/joeflack4/projects/mondo-ingest/cache/issues/lexmatch/mondo-ingest/lexical_index.pickle/b4_after_remove_oio_prefixes_csv/after/lexical_index.pickle'
# pickle.dump(lexical_index, open(pp, "wb"))
lexical_index = pickle.load(open(pp, "rb"))

t0_3 = datetime.now() # todo: temp
if rules:
msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, meta=meta)
msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, prefix_map=converter)
else:
msdf = lexical_index_to_sssom(oi, lexical_index, meta=meta)
msdf = lexical_index_to_sssom(oi, lexical_index)
t1_3 = datetime.now() # todo: temp
print('lexical_index_to_sssom complete in seconds:', (t1_3 - t0_3).seconds) # todo temp

# msdf.prefix_map = sssom_yaml['curie_map']
# msdf.metadata = sssom_yaml['global_metadata']
Expand All @@ -131,8 +146,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
# msdf.df[OBJECT_ID] = msdf.df[OBJECT_ID].apply(
# lambda x: iri_to_curie(x) if x.startswith("<http") else x
# )
prefixes_of_interest = yml["subject_prefixes"]
msdf.df = filter_prefixes(
df=msdf.df, filter_prefixes=prefix_of_interest, features=[SUBJECT_ID, OBJECT_ID]
df=msdf.df, filter_prefixes=prefixes_of_interest, features=[SUBJECT_ID, OBJECT_ID]
)
msdf.remove_mappings(mapping_msdf)

Expand All @@ -145,6 +161,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
kwargs = {"subject_id": ("MONDO:%",), "object_id": prefix_args}
with open(str(Path(output.replace("lexical", "lexical-2"))), "w") as f:
filter_file(input=str(Path(output)), output=f, **kwargs)
t1 = datetime.now() # todo: temp
print('match-mondo-sources-all-lexical complete in seconds:', (t1 - t0).seconds) # todo temp
print()


def iri_to_curie(item):
Expand Down

0 comments on commit d4fd48e

Please sign in to comment.