Fixing the LINCS parser (see commit description)

Fixing several bugs and broken variable names: - fixing the source data download location - the source data delimiter is a comma, not a tab - cleaning up properties (they need to be a dictionary, but node props were unnecessary anyway) - simplifying/fixing qualifier handling: using predicates like RO:0002212 includes directionality and will normalize to the qualified version; the old implementation didn't work anyway, so this is better
RobokopU24 · Dec 2, 2024 · 3491e3f · 3491e3f
1 parent d2bead1
commit 3491e3f
Showing 1 changed file with 20 additions and 33 deletions.
diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py
@@ -7,27 +7,22 @@
 from Common.prefixes import PUBCHEM_COMPOUND
 from Common.utils import GetData
 
+
 class GENERICDATACOLS(enum.IntEnum):
     SOURCE_ID = 2
     SOURCE_LABEL = 3
     TARGET_ID = 5
     TARGET_LABEL = 6
     PREDICATE = 7
 
+
 PREDICATE_MAPPING = {
     "in_similarity_relationship_with": "biolink:chemically_similar_to",
-    "negatively_regulates": {
-        "RO:0002448": {
-            OBJECT_DIRECTION_QUALIFIER: "downregulated"}
-    },
-    "positively_regulates": {
-        "RO:0002448": {
-            OBJECT_DIRECTION_QUALIFIER: "upregulated"}
-    }
+    "negatively_regulates": "RO:0002212",
+    "positively_regulates": "RO:0002213"
 }
 
 
-
 ##############
 # Class: LINCS loader
 #
@@ -36,8 +31,6 @@ class GENERICDATACOLS(enum.IntEnum):
 # Desc: Class that loads/parses the data in Library of Integrated Network-Based Cellular Signatures.
 # 
 ##############
-
-
 class LINCSLoader(SourceDataLoader):
 
     source_id: str = 'LINCS'
@@ -51,20 +44,17 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
         super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
 
-        self.lincs_url = 'https://stars.renci.org/var/data_services/LINCS/'
+        self.data_url = 'https://stars.renci.org/var/data_services/LINCS/'
         self.edge_file = "LINCS.lookup.edges.csv"
         self.data_files = [self.edge_file]
 
     def get_latest_source_version(self) -> str:
-        # if possible go to the source and retrieve a string that is the latest version of the source data
         # The KG was generated from Data Distillery KG. There was no version defined.
         latest_version = 'v1.0'
         return latest_version
 
     def get_data(self) -> bool:
-        # get_data is responsible for fetching the files in self.data_files and saving them to self.data_path
-        # Not used for LINCS so far.
-        source_data_url = f'{self.example_url}{self.edge_file}'
+        source_data_url = f'{self.data_url}{self.edge_file}'
         data_puller = GetData()
         data_puller.pull_via_http(source_data_url, self.data_path)
         return True
@@ -76,33 +66,30 @@ def parse_data(self) -> dict:
         :return: ret_val: load_metadata
         """
         extractor = Extractor(file_writer=self.output_file_writer)
-        lincs_file: str = os.path.join(self.lincs_url, self.edge_file)
+        lincs_file: str = os.path.join(self.data_path, self.edge_file)
         with open(lincs_file, 'rt') as fp:
             extractor.csv_extract(fp,
                                   lambda line: self.resolve_id(line[GENERICDATACOLS.SOURCE_ID.value]),  # source id
                                   lambda line: self.resolve_id(line[GENERICDATACOLS.TARGET_ID.value]),  # target id
-                                  lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]].key,  # predicate extractor
-                                  lambda line: {line[GENERICDATACOLS.SOURCE_LABEL.value]},  # subject properties
-                                  lambda line: {line[GENERICDATACOLS.TARGET_LABEL.value]},  # object properties
-                                  lambda line: self.format_edge_properties(line[GENERICDATACOLS.PREDICATE.value]),  # edge properties
+                                  lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]],  # predicate extractor
+                                  lambda line: {},  # subject properties
+                                  lambda line: {},  # object properties
+                                  lambda line: self.get_edge_properties(),  # edge properties
                                   comment_character='#',
-                                  delim='\t',
+                                  delim=',',
                                   has_header_row=True)
         return extractor.load_metadata
 
-    def resolve_id(self, idstring: str):
+    @staticmethod
+    def resolve_id(idstring: str):
         if idstring.startswith("PUBCHEM"):
-            return f"{PUBCHEM_COMPOUND}{idstring.replace("PUBCHEM","")}"
-        elif idstring.startswith("HGNC"):
-            return idstring
-
-    def format_edge_properties(self, predicate: str):
-        properties = PREDICATE_MAPPING[predicate].value
+            return idstring.replace("PUBCHEM", PUBCHEM_COMPOUND)
+        return idstring
 
-        properties.update({
+    def get_edge_properties(self):
+        properties = {
             PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id,
             KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
             AGENT_TYPE: DATA_PIPELINE
-        })
-
-        return properties
+        }
+        return properties