From bbe7ed7c1c1a1cf80bb7bf5f1b5dbcd5902fb8a6 Mon Sep 17 00:00:00 2001
From: mlupei
Date: Wed, 10 Jul 2024 13:46:45 -0400
Subject: [PATCH] test

---
 1.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2.py |  7 +++++++
 2 files changed, 60 insertions(+)
 create mode 100644 1.py
 create mode 100644 2.py

diff --git a/1.py b/1.py
new file mode 100644
index 0000000..c42fd68
--- /dev/null
+++ b/1.py
@@ -0,0 +1,53 @@
+import json
+import csv
+import ast  # Import the ast module
+
+# File paths
+edges_file_path = 'kg2c-2.8.4-edges.jsonl'
+nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
+output_csv_path = 'output_filtered.csv'
+
+# Load all nodes into a dictionary for quick access by ID
+nodes = {}
+with open(nodes_file_path, 'r') as nodes_file:
+    for line in nodes_file:
+        node_data = json.loads(line)
+        # Get the name or the first alternative name if the primary name is absent
+        name = node_data.get('name') or (node_data.get('all_names')[0] if 'all_names' in node_data and node_data['all_names'] else "Unknown")
+        nodes[node_data['id']] = name
+
+# Process edges that meet the criteria
+with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
+    csv_writer = csv.writer(output_file)
+    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])
+
+    for line in edges_file:
+        edge = json.loads(line)
+
+        # Filter based on the knowledge source
+        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
+            publications_info_raw = edge.get('publications_info', '{}')
+            try:
+                # Safely evaluate the string to a Python dictionary
+                publications_info = ast.literal_eval(publications_info_raw)
+            except (SyntaxError, ValueError) as e:
+                # Handle errors in literal evaluation
+                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
+                publications_info = {}
+
+            # Extract the first available sentence from publications_info
+            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')
+
+            # Use names instead of IDs where possible
+            subject_name = nodes.get(edge['subject'], edge['subject'])
+            predicate_name = nodes.get(edge['predicate'], edge['predicate'])
+            object_name = nodes.get(edge['object'], edge['object'])
+
+            # Construct the Fact
+            fact = f"{subject_name} {predicate_name} {object_name}"
+
+            # Write to CSV
+            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])
+            # print([edge['id'], fact, sentence, '', '', ''])
+
+print("Filtered CSV file has been generated successfully.")
diff --git a/2.py b/2.py
new file mode 100644
index 0000000..32cb6f4
--- /dev/null
+++ b/2.py
@@ -0,0 +1,7 @@
+with open('data/RTX-KG2.8.4c_sentence_data.csv', 'r') as fin:
+    data = fin.read().splitlines(True)
+with open('data/RTX-KG2.8.4c_sentence_data.csv', 'w') as fout:
+    fout.writelines(data[1:])
+    #data/RTX-KG2.8.4c_labeled_records.csv
+    #data/RTX-KG2.8.4c_sentence_data.csv
+    #data/RTX-KG2.8.4c_triple_data.csv
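
Reviewer note: below is a minimal standalone sketch of the publications_info parsing done in 1.py. The sample value is hypothetical (not taken from the data); the edges file appears to serialize this field as a Python-literal string rather than strict JSON, which is presumably why ast.literal_eval is used instead of json.loads.

    import ast

    # Hypothetical sample value for publications_info (assumed format, for illustration only)
    publications_info_raw = "{'PMID:12345678': {'sentence': 'Aspirin treats headache.'}}"

    try:
        # Same parsing strategy as 1.py: safely evaluate the Python-literal string
        publications_info = ast.literal_eval(publications_info_raw)
    except (SyntaxError, ValueError) as e:
        print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
        publications_info = {}

    # Take the first available sentence, mirroring the extraction in 1.py
    sentence = next((info.get('sentence', '') for info in publications_info.values()), '')
    print(sentence)  # Aspirin treats headache.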