Skip to content

Commit

Permalink
adding OHD-Carolina
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Jan 21, 2025
1 parent 20f9aa8 commit 4e599a2
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 11 deletions.
6 changes: 6 additions & 0 deletions Common/biolink_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
KNOWLEDGE_LEVEL = 'knowledge_level'
MAX_RESEARCH_PHASE = 'max_research_phase'
HAS_SUPPORTING_STUDY_RESULT = 'has_supporting_study_result'
LOG_ODDS_RATIO = 'log_odds_ratio'
LOG_ODDS_RATIO_95_CI = 'log_odds_ratio_95_ci'
TOTAL_SAMPLE_SIZE = 'total_sample_size'

# enums for knowledge level
KNOWLEDGE_ASSERTION = 'knowledge_assertion'
Expand Down Expand Up @@ -153,6 +156,9 @@
MECHANISM_OF_ACTION,
MAX_RESEARCH_PHASE,
HAS_SUPPORTING_STUDY_RESULT,
LOG_ODDS_RATIO,
LOG_ODDS_RATIO_95_CI,
TOTAL_SAMPLE_SIZE,
# qualifiers
ANATOMICAL_CONTEXT_QUALIFIER,
CAUSAL_MECHANISM_QUALIFIER,
Expand Down
2 changes: 2 additions & 0 deletions Common/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
MOLEPRO = 'MolePro'
MONARCH_KG = 'MonarchKG'
MONDO_PROPS = 'MONDOProps'
OHD_CAROLINA = 'OHD-Carolina'
ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy'
PANTHER = 'PANTHER'
PHAROS = 'PHAROS'
Expand Down Expand Up @@ -82,6 +83,7 @@
MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"),
MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
OHD_CAROLINA: ("parsers.ohd_carolina.src.loadOHD", "OHDLoader"),
ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"),
PANTHER: ("parsers.panther.src.loadPanther", "PLoader"),
PHAROS: ("parsers.PHAROS.src.loadPHAROS", "PHAROSLoader"),
Expand Down
10 changes: 0 additions & 10 deletions graph_specs/cohd-graph-spec.yaml

This file was deleted.

1 change: 0 additions & 1 deletion parsers/cohd/src/loadCOHD.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import yaml

from Common.loader_interface import SourceDataLoader
from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE
from Common.utils import GetData, quick_jsonl_file_iterator


Expand Down
119 changes: 119 additions & 0 deletions parsers/ohd_carolina/src/loadOHD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@

import os
import requests
import yaml
import enum
import orjson

from io import TextIOWrapper
from zipfile import ZipFile
from Common.extractor import Extractor
from Common.loader_interface import SourceDataLoader
from Common.biolink_constants import *
from Common.utils import GetData


class EDGESDATACOLS(enum.IntEnum):
    """Zero-based column positions used to index a row of the OHD edges CSV."""

    # subject node columns
    SUBJECT_ID = 0
    SUBJECT_NAME = 1

    # object node columns
    OBJECT_ID = 2
    OBJECT_NAME = 3

    # edge predicate
    PREDICATE = 4

    # edge statistics and provenance columns
    CHI_SQUARED_P_VALUE = 5
    LOG_ODDS_RATIO = 6
    LOG_ODDS_RATIO_95_CI = 7
    SCORE = 8
    TOTAL_SAMPLE_SIZE = 9
    PRIMARY_KS = 10


##############
# Class: OHD source loader
#
# Desc: Class that loads/parses the Open Health Data @ Carolina data.
##############
class OHDLoader(SourceDataLoader):
    """Source loader for the Open Health Data @ Carolina (OHD-Carolina) edges.

    Downloads a zip archive from ``data_url``, reads the edges CSV it
    contains, and hands each row to an ``Extractor`` to produce nodes/edges.
    """

    source_id: str = 'OHD-Carolina'
    provenance_id: str = 'infores:openhealthdata-carolina'
    parsing_version: str = '1.0'

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        :param test_mode - sets the run into test mode
        :param source_data_dir - the specific storage directory to save files in
        """
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        self.data_url = 'https://stars.renci.org/var/data_services/ohd/'
        self.version_file = 'ohd.yaml'
        self.ohd_archive_file = 'unc_omop_2018_2022_kg.zip'
        self.ohd_edges_file = 'unc_omop_2018_2022_kg.csv'
        self.data_files = [self.ohd_archive_file]

    def get_latest_source_version(self) -> str:
        """
        Fetches the version file from the data server and returns its 'build' value.

        :return: the 'build' entry of the version yaml, converted to a string
        :raises requests.HTTPError: if the server responds with an error status
        """
        version_file_url = f"{self.data_url}{self.version_file}"
        # a timeout keeps a stalled server from hanging the pipeline indefinitely
        r = requests.get(version_file_url, timeout=60)
        # raise_for_status() is a no-op on success, so no need to check r.ok first
        r.raise_for_status()
        # safe_load: the content is fetched over the network, so do not allow the
        # yaml loader to construct anything beyond plain yaml types
        version_yaml = yaml.safe_load(r.text)
        return str(version_yaml['build'])

    def get_data(self) -> bool:
        """
        Downloads every file in self.data_files from self.data_url into self.data_path.

        :return: True after all downloads have been requested
        """
        data_puller = GetData()
        for data_file in self.data_files:
            source_data_url = f'{self.data_url}{data_file}'
            data_puller.pull_via_http(source_data_url, self.data_path)
        return True

    def parse_data(self) -> dict:
        """
        Parses the data file for graph nodes/edges
        :return: ret_val: load_metadata
        """
        extractor = Extractor(file_writer=self.output_file_writer)

        ohd_archive_file_path: str = os.path.join(self.data_path, self.ohd_archive_file)
        with ZipFile(ohd_archive_file_path) as ohd_archive:
            with ohd_archive.open(self.ohd_edges_file, "r") as fp:
                # decode explicitly rather than relying on the platform's locale default
                # NOTE(review): assumes the edges file is UTF-8 encoded - confirm
                extractor.csv_extract(TextIOWrapper(fp, encoding='utf-8'),
                                      lambda line: line[EDGESDATACOLS.SUBJECT_ID.value],  # subject id
                                      lambda line: line[EDGESDATACOLS.OBJECT_ID.value],  # object id
                                      lambda line: line[EDGESDATACOLS.PREDICATE.value],  # predicate extractor
                                      lambda line: {NAME: line[EDGESDATACOLS.SUBJECT_NAME.value]},  # subject props
                                      lambda line: {NAME: line[EDGESDATACOLS.OBJECT_NAME.value]},  # object props
                                      self.get_edge_properties,  # edgeprops
                                      comment_character=None,
                                      delim=',',
                                      has_header_row=True)

        return extractor.load_metadata

    @staticmethod
    def get_edge_properties(line):
        """
        Builds the edge property dictionary for one row of the edges file.

        :param line: a row of the edges file, indexable by EDGESDATACOLS positions
        :return: dict of edge properties for the row
        """
        return {
            AGENT_TYPE: DATA_PIPELINE,
            KNOWLEDGE_LEVEL: STATISTICAL_ASSOCIATION,
            # NOTE(review): score is passed through unconverted, unlike the other
            # statistics below - confirm whether it should be numeric
            'score': line[EDGESDATACOLS.SCORE.value],
            PRIMARY_KNOWLEDGE_SOURCE: line[EDGESDATACOLS.PRIMARY_KS.value],
            P_VALUE: float(line[EDGESDATACOLS.CHI_SQUARED_P_VALUE.value]),
            LOG_ODDS_RATIO: float(line[EDGESDATACOLS.LOG_ODDS_RATIO.value]),
            # the confidence interval column holds a JSON-encoded value
            LOG_ODDS_RATIO_95_CI: orjson.loads(line[EDGESDATACOLS.LOG_ODDS_RATIO_95_CI.value]),
            TOTAL_SAMPLE_SIZE: int(line[EDGESDATACOLS.TOTAL_SAMPLE_SIZE.value])
        }

"""
# this should probably be something like this instead to match COHD,
because merged edges won't be able to handle conflicting attributes across multiple supporting studies
'attributes': [orjson.dumps({
HAS_SUPPORTING_STUDY_RESULT: [{
P_VALUE: float(line[EDGESDATACOLS.CHI_SQUARED_P_VALUE.value]),
LOG_ODDS_RATIO: float(line[EDGESDATACOLS.LOG_ODDS_RATIO.value]),
LOG_ODDS_RATIO_95_CI: orjson.loads(line[EDGESDATACOLS.LOG_ODDS_RATIO_95_CI.value]),
TOTAL_SAMPLE_SIZE: int(line[EDGESDATACOLS.TOTAL_SAMPLE_SIZE.value])
}]
}).decode('utf-8')]
"""

0 comments on commit 4e599a2

Please sign in to comment.