Skip to content

Commit

Permalink
improved csv conversion property handling and efficiency
Browse files Browse the repository at this point in the history
- made property ignore lists more flexible and efficient, and made robokop_variant_id the default ignored property for nodes
- optimized how the csv generation script handles converting properties
  • Loading branch information
EvanDietzMorris committed Jan 22, 2025
1 parent 9cecd1f commit e3f15bd
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 16 deletions.
16 changes: 11 additions & 5 deletions Common/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
EDGES_FILENAME = 'edges.jsonl'
REDUNDANT_EDGES_FILENAME = 'redundant_edges.jsonl'
COLLAPSED_QUALIFIERS_FILENAME = 'collapsed_qualifier_edges.jsonl'
DEFAULT_EDGE_PROPERTY_IGNORE_LIST = ['robokop_variant_id']


class GraphBuilder:
Expand Down Expand Up @@ -118,7 +117,11 @@ def build_graph(self, graph_id: str):
output_formats = graph_spec.graph_output_format.lower().split('+') if graph_spec.graph_output_format else []
nodes_filepath = os.path.join(graph_output_dir, NODES_FILENAME)
edges_filepath = os.path.join(graph_output_dir, EDGES_FILENAME)


# TODO allow these to be specified in the graph spec
node_property_ignore_list = {'robokop_variant_id'}
edge_property_ignore_list = None

if 'redundant_jsonl' in output_formats:
self.logger.info(f'Generating redundant edge KG for {graph_id}...')
redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME)
Expand All @@ -134,7 +137,8 @@ def build_graph(self, graph_id: str):
output_directory=graph_output_dir,
graph_id=graph_id,
graph_version=graph_version,
edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
node_property_ignore_list=node_property_ignore_list,
edge_property_ignore_list=edge_property_ignore_list,
logger=self.logger)

if dump_success:
Expand All @@ -156,7 +160,8 @@ def build_graph(self, graph_id: str):
output_directory=graph_output_dir,
graph_id=graph_id,
graph_version=graph_version,
edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
node_property_ignore_list=node_property_ignore_list,
edge_property_ignore_list=edge_property_ignore_list,
logger=self.logger)

if dump_success:
Expand All @@ -170,7 +175,8 @@ def build_graph(self, graph_id: str):
output_directory=graph_output_dir,
graph_id=graph_id,
graph_version=graph_version,
edge_property_ignore_list=DEFAULT_EDGE_PROPERTY_IGNORE_LIST,
node_property_ignore_list=node_property_ignore_list,
edge_property_ignore_list=edge_property_ignore_list,
logger=self.logger)

if dump_success:
Expand Down
38 changes: 29 additions & 9 deletions Common/kgx_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,33 @@ def __convert_to_csv(input_file: str,
properties: dict, # dictionary of { node/edge property: property_type }
array_delimiter: str,
output_delimiter: str,
property_ignore_list=None):
property_ignore_list: set = None):

# generate the headers which for neo4j include the property name and the type
# for example:
# id:ID name:string category:LABEL equivalent_identifiers:string[] information_content:float
headers = {prop: f'{prop.removeprefix("biolink:")}:{prop_type}'
for prop, prop_type in properties.items() if prop not in property_ignore_list}
for prop, prop_type in properties.items()}

# if there is a property_ignore_list, remove them from the headers
# also filter the list to include only properties that are actually present
if property_ignore_list:
ignored_props_present = set()
for ignored_prop in property_ignore_list:
if headers.pop(ignored_prop, 'PROP_NOT_FOUND') != 'PROP_NOT_FOUND':
ignored_props_present.add(ignored_prop)
if not ignored_props_present:
property_ignore_list = None
else:
property_ignore_list = ignored_props_present
print(f'Properties that should be ignored were found, ignoring: {property_ignore_list}')

properties_that_are_lists = {prop for prop in headers if properties[prop] in {'LABEL',
'string[]',
'float[]',
'int[]'}}
properties_that_are_boolean = {prop for prop in headers if properties[prop] == 'boolean'}

with open(output_file, 'w', newline='') as output_file_handler:
csv_file_writer = csv.DictWriter(output_file_handler,
delimiter=output_delimiter,
Expand All @@ -205,15 +229,11 @@ def __convert_to_csv(input_file: str,
elif property_ignore_list and key in property_ignore_list:
del item[key]
else:
prop_type = properties[key]
# convert lists into strings with an array delimiter
if prop_type == 'LABEL' or \
prop_type == 'string[]' or \
prop_type == 'float[]' or \
prop_type == 'int[]':
if key in properties_that_are_lists:
# convert lists into strings with an array delimiter
if isinstance(item[key], list): # need to doublecheck for cases of properties with mixed types
item[key] = array_delimiter.join(str(value) for value in item[key])
elif prop_type == 'boolean':
elif key in properties_that_are_boolean:
# neo4j handles boolean with string 'true' being true and everything else false
item[key] = 'true' if item[key] is True else 'false'
csv_file_writer.writerow(item)
Expand Down
4 changes: 2 additions & 2 deletions Common/neo4j_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,8 @@ def create_neo4j_dump(nodes_filepath: str,
output_directory: str,
graph_id: str = 'graph',
graph_version: str = '',
node_property_ignore_list: list = None,
edge_property_ignore_list: list = None,
node_property_ignore_list: set = None,
edge_property_ignore_list: set = None,
logger=None):
nodes_csv_filename = 'nodes.temp_csv'
edges_csv_filename = 'edges.temp_csv'
Expand Down

0 comments on commit e3f15bd

Please sign in to comment.