improved csv conversion property handling and efficiency

- made property ignore lists more flexible and efficient, and made robokop_variant_id the default ignored property for nodes (instead of for edges)
- optimized how the csv generation script converts properties
RobokopU24 · Jan 22, 2025 · e3f15bd · e3f15bd
1 parent 9cecd1f
commit e3f15bd
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 16 deletions.
diff --git a/Common/build_manager.py b/Common/build_manager.py
@@ -25,7 +25,6 @@
 EDGES_FILENAME = 'edges.jsonl'
 REDUNDANT_EDGES_FILENAME = 'redundant_edges.jsonl'
 COLLAPSED_QUALIFIERS_FILENAME = 'collapsed_qualifier_edges.jsonl'
-DEFAULT_EDGE_PROPERTY_IGNORE_LIST = ['robokop_variant_id']
 
 
 class GraphBuilder:
@@ -118,7 +117,11 @@ def build_graph(self, graph_id: str):
         output_formats = graph_spec.graph_output_format.lower().split('+') if graph_spec.graph_output_format else []
         nodes_filepath = os.path.join(graph_output_dir, NODES_FILENAME)
         edges_filepath = os.path.join(graph_output_dir, EDGES_FILENAME)
-
+
+        # TODO allow these to be specified in the graph spec
+        node_property_ignore_list = {'robokop_variant_id'}
+        edge_property_ignore_list = None
+
         if 'redundant_jsonl' in output_formats:
             self.logger.info(f'Generating redundant edge KG for {graph_id}...')
             redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME)
@@ -134,7 +137,8 @@ def build_graph(self, graph_id: str):
                                              output_directory=graph_output_dir,
                                              graph_id=graph_id,
                                              graph_version=graph_version,
-                                             edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
+                                             node_property_ignore_list=node_property_ignore_list,
+                                             edge_property_ignore_list=edge_property_ignore_list,
                                              logger=self.logger)
 
             if dump_success:
@@ -156,7 +160,8 @@ def build_graph(self, graph_id: str):
                                              output_directory=graph_output_dir,
                                              graph_id=graph_id,
                                              graph_version=graph_version,
-                                             edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
+                                             node_property_ignore_list=node_property_ignore_list,
+                                             edge_property_ignore_list=edge_property_ignore_list,
                                              logger=self.logger)
 
             if dump_success:
@@ -170,7 +175,8 @@ def build_graph(self, graph_id: str):
                                              output_directory=graph_output_dir,
                                              graph_id=graph_id,
                                              graph_version=graph_version,
-                                             edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
+                                             node_property_ignore_list=node_property_ignore_list,
+                                             edge_property_ignore_list=edge_property_ignore_list,
                                              logger=self.logger)
 
             if dump_success:

diff --git a/Common/kgx_file_converter.py b/Common/kgx_file_converter.py
@@ -184,9 +184,33 @@ def __convert_to_csv(input_file: str,
                      properties: dict,  # dictionary of { node/edge property: property_type }
                      array_delimiter: str,
                      output_delimiter: str,
-                     property_ignore_list=None):
+                     property_ignore_list: set = None):
+
+    # generate the headers which for neo4j include the property name and the type
+    # for example:
+    # id:ID	name:string	category:LABEL	equivalent_identifiers:string[]	information_content:float
     headers = {prop: f'{prop.removeprefix("biolink:")}:{prop_type}'
-               for prop, prop_type in properties.items() if prop not in property_ignore_list}
+               for prop, prop_type in properties.items()}
+
+    # if there is a property_ignore_list, remove them from the headers
+    # also filter the list to include only properties that are actually present
+    if property_ignore_list:
+        ignored_props_present = set()
+        for ignored_prop in property_ignore_list:
+            if headers.pop(ignored_prop, 'PROP_NOT_FOUND') != 'PROP_NOT_FOUND':
+                ignored_props_present.add(ignored_prop)
+        if not ignored_props_present:
+            property_ignore_list = None
+        else:
+            property_ignore_list = ignored_props_present
+            print(f'Properties that should be ignored were found, ignoring: {property_ignore_list}')
+
+    properties_that_are_lists = {prop for prop in headers if properties[prop] in {'LABEL',
+                                                                                  'string[]',
+                                                                                  'float[]',
+                                                                                  'int[]'}}
+    properties_that_are_boolean = {prop for prop in headers if properties[prop] == 'boolean'}
+
     with open(output_file, 'w', newline='') as output_file_handler:
         csv_file_writer = csv.DictWriter(output_file_handler,
                                          delimiter=output_delimiter,
@@ -205,15 +229,11 @@ def __convert_to_csv(input_file: str,
                 elif property_ignore_list and key in property_ignore_list:
                     del item[key]
                 else:
-                    prop_type = properties[key]
-                    # convert lists into strings with an array delimiter
-                    if prop_type == 'LABEL' or \
-                            prop_type == 'string[]' or \
-                            prop_type == 'float[]' or \
-                            prop_type == 'int[]':
+                    if key in properties_that_are_lists:
+                        # convert lists into strings with an array delimiter
                         if isinstance(item[key], list):  # need to doublecheck for cases of properties with mixed types
                             item[key] = array_delimiter.join(str(value) for value in item[key])
-                    elif prop_type == 'boolean':
+                    elif key in properties_that_are_boolean:
                         # neo4j handles boolean with string 'true' being true and everything else false
                         item[key] = 'true' if item[key] is True else 'false'
             csv_file_writer.writerow(item)

diff --git a/Common/neo4j_tools.py b/Common/neo4j_tools.py
@@ -224,8 +224,8 @@ def create_neo4j_dump(nodes_filepath: str,
                       output_directory: str,
                       graph_id: str = 'graph',
                       graph_version: str = '',
-                      node_property_ignore_list: list = None,
-                      edge_property_ignore_list: list = None,
+                      node_property_ignore_list: set = None,
+                      edge_property_ignore_list: set = None,
                       logger=None):
     nodes_csv_filename = 'nodes.temp_csv'
     edges_csv_filename = 'edges.temp_csv'