Skip to content

Commit

Permalink
fixing parser see commit description
Browse files Browse the repository at this point in the history
fixing several bugs and broken variable names
- fixing source data download location
- source data delimiter is comma not tab
- cleaning up properties (they need to be a dictionary, but node props were unnecessary anyway)

simplifying/fixing qualifier handling
- using predicates like RO:0002212 includes directionality and will normalize to the qualified version; the old implementation didn't work anyway, so this is better
  • Loading branch information
EvanDietzMorris committed Dec 2, 2024
1 parent d2bead1 commit 3491e3f
Showing 1 changed file with 20 additions and 33 deletions.
53 changes: 20 additions & 33 deletions parsers/LINCS/src/loadLINCS.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,22 @@
from Common.prefixes import PUBCHEM_COMPOUND
from Common.utils import GetData


class GENERICDATACOLS(enum.IntEnum):
    """Column positions in a row of the LINCS edge file.

    The parser indexes each parsed row with these members (via ``.value``)
    to pull out the subject, object and predicate fields.
    """

    SOURCE_ID = 2
    SOURCE_LABEL = 3
    TARGET_ID = 5
    TARGET_LABEL = 6
    PREDICATE = 7


# Maps the predicate label found in the LINCS edge file to the predicate
# identifier written to the output edges. Each value is a plain string so the
# lookup result can be used directly as the edge predicate.
# The regulation predicates use the directional RO terms rather than a generic
# predicate plus a separate direction qualifier; per the commit description,
# these carry directionality and normalize to the qualified form downstream.
PREDICATE_MAPPING = {
    "in_similarity_relationship_with": "biolink:chemically_similar_to",
    "negatively_regulates": "RO:0002212",
    "positively_regulates": "RO:0002213",
}



##############
# Class: LINCS loader
#
Expand All @@ -36,8 +31,6 @@ class GENERICDATACOLS(enum.IntEnum):
# Desc: Class that loads/parses the data in Library of Integrated Network-Based Cellular Signatures.
#
##############


class LINCSLoader(SourceDataLoader):

source_id: str = 'LINCS'
Expand All @@ -51,20 +44,17 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

self.lincs_url = 'https://stars.renci.org/var/data_services/LINCS/'
self.data_url = 'https://stars.renci.org/var/data_services/LINCS/'
self.edge_file = "LINCS.lookup.edges.csv"
self.data_files = [self.edge_file]

def get_latest_source_version(self) -> str:
    """Return the version string for the LINCS source data.

    The KG was generated from the Data Distillery KG, which defined no
    version, so a fixed placeholder is returned instead of querying the
    source.

    :return: the constant version label ``'v1.0'``
    """
    return 'v1.0'

def get_data(self) -> bool:
    """Fetch the LINCS edge file and save it under ``self.data_path``.

    The download URL is built from ``self.data_url`` and ``self.edge_file``.
    The stale line that read ``self.example_url`` is dropped: that attribute
    is never set in the visible constructor lines.

    :return: True once the pull call has returned
    """
    # NOTE(review): an earlier comment here said this was "not used for LINCS
    # so far" - confirm whether the pipeline actually invokes it.
    source_data_url = f'{self.data_url}{self.edge_file}'
    data_puller = GetData()
    data_puller.pull_via_http(source_data_url, self.data_path)
    return True
Expand All @@ -76,33 +66,30 @@ def parse_data(self) -> dict:
:return: ret_val: load_metadata
"""
extractor = Extractor(file_writer=self.output_file_writer)
lincs_file: str = os.path.join(self.lincs_url, self.edge_file)
lincs_file: str = os.path.join(self.data_path, self.edge_file)
with open(lincs_file, 'rt') as fp:
extractor.csv_extract(fp,
lambda line: self.resolve_id(line[GENERICDATACOLS.SOURCE_ID.value]), # source id
lambda line: self.resolve_id(line[GENERICDATACOLS.TARGET_ID.value]), # target id
lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]].key, # predicate extractor
lambda line: {line[GENERICDATACOLS.SOURCE_LABEL.value]}, # subject properties
lambda line: {line[GENERICDATACOLS.TARGET_LABEL.value]}, # object properties
lambda line: self.format_edge_properties(line[GENERICDATACOLS.PREDICATE.value]), # edge properties
lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]], # predicate extractor
lambda line: {}, # subject properties
lambda line: {}, # object properties
lambda line: self.get_edge_properties(), # edge properties
comment_character='#',
delim='\t',
delim=',',
has_header_row=True)
return extractor.load_metadata

def resolve_id(self, idstring: str):
@staticmethod
def resolve_id(idstring: str):
if idstring.startswith("PUBCHEM"):
return f"{PUBCHEM_COMPOUND}{idstring.replace("PUBCHEM","")}"
elif idstring.startswith("HGNC"):
return idstring

def format_edge_properties(self, predicate: str):
properties = PREDICATE_MAPPING[predicate].value
return idstring.replace("PUBCHEM", PUBCHEM_COMPOUND)
return idstring

def get_edge_properties(self):
    """Build the property dictionary attached to every LINCS edge.

    The properties do not depend on the row being parsed: they record this
    loader's ``provenance_id`` as the primary knowledge source, together
    with a fixed knowledge level and agent type.

    :return: a new dict of edge properties
    """
    properties = {
        PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id,
        KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
        AGENT_TYPE: DATA_PIPELINE,
    }
    return properties

0 comments on commit 3491e3f

Please sign in to comment.