From 8b0949a308cde101db166e1ce252272f3b500812 Mon Sep 17 00:00:00 2001
From: Maksym
Date: Mon, 29 Apr 2024 16:16:45 -0400
Subject: [PATCH] LLMFactCheck

Add process.py: filter KG2c edges whose primary knowledge source is
SemMedDB, resolve subject/predicate/object names from the nodes file,
and write the facts, sentences, and triples out as CSVs for LLM-based
fact checking.
---
 process.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 process.py

diff --git a/process.py b/process.py
new file mode 100644
index 0000000..b28caa6
--- /dev/null
+++ b/process.py
@@ -0,0 +1,133 @@
+import json
+import csv
+import ast  # Import the ast module for safely evaluating strings as Python expressions
+import pandas as pd
+
+# File paths
+edges_file_path = 'kg2c-2.8.4-edges.jsonl'
+nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
+output_csv_path = 'output_filtered.csv'
+output_prefix = 'kg2c-2.8.4'  # Prefix for the labeled/sentence/triple CSVs written at the end
+
+# Load all nodes into a dictionary for quick access by ID
+nodes = {}
+equivalent_curies_map = {}  # For mapping equivalent curies to names
+with open(nodes_file_path, 'r') as nodes_file:
+    for line in nodes_file:
+        node_data = json.loads(line)
+        # Get the name or the first alternative name if the primary name is absent
+        name = node_data.get('name') or (node_data.get('all_names')[0] if 'all_names' in node_data and node_data['all_names'] else "Unknown")
+        nodes[node_data['id']] = name
+        # Map equivalent curies to the primary name
+        for curie in node_data.get('equivalent_curies', []):
+            equivalent_curies_map[curie] = name
+
+# Process edges that meet the criteria
+with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
+    csv_writer = csv.writer(output_file)
+    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])
+
+    # Column layouts for the three output tables (trailing 'Column' entries are unused placeholders)
+    sentence_columns = ["SENTENCE_ID", "PMID", "TYPE", "NUMBER", "SENT_START_INDEX", "SENTENCE",
+                        "SECTION_HEADER", "NORMALIZED_SECTION_HEADER", "Column", "Column"]
+    sentence_records = []
+
+    labeled_columns = ["Predicate ID", "Triple", "Sentence ID", "Sentence", "Question", "Label",
+                       "Reference"]
+    labeled_records = []
+    # Predicate ID,Triple,Sentence ID,Sentence,Question,Label,Reference
+
+    triple_columns = ["PREDICATION_ID", "SENTENCE_ID", "PMID", "PREDICATE",
+                      "SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE", "SUBJECT_NOVELTY",
+                      "OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE", "OBJECT_NOVELTY",
+                      "Column", "Column", "Column"]
+    triple_records = []
+
+    sentence_id = 0
+    for line in edges_file:
+        edge = json.loads(line)
+
+        # Filter based on the knowledge source and retrieve the sentence
+        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
+            publications_info_raw = edge.get('publications_info', '{}')
+            try:
+                # Use ast.literal_eval to safely evaluate the string as a Python dictionary
+                publications_info = ast.literal_eval(publications_info_raw)
+            except (ValueError, SyntaxError) as e:
+                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
+                publications_info = {}
+
+            # Extract the first available sentence from publications_info
+            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')
+
+            # Look up names, falling back to equivalent curies and finally the raw curie
+            subject_name = nodes.get(edge['subject'], equivalent_curies_map.get(edge['subject'], edge['subject']))
+            object_name = nodes.get(edge['object'], equivalent_curies_map.get(edge['object'], edge['object']))
+
+            # Handle the predicate the same way, checking equivalent curies
+            predicate_name = nodes.get(edge['predicate'], equivalent_curies_map.get(edge['predicate'], edge['predicate']))
+
+            # Construct the fact as a subject-predicate-object string
+            fact = f"{subject_name} {predicate_name} {object_name}"
+
+            # Write the fact row; the source sentence goes in the 'Source' column
+            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])
+
+            labeled_records.append({
+                "Predicate ID": edge['id'],
+                "Triple": f"{subject_name} {predicate_name} {object_name}",
+                "Sentence ID": sentence_id,
+                "Sentence": sentence,
+                "Question": f"Is the triple \"{subject_name} {predicate_name} {object_name}\" supported by the sentence: \"{sentence}\"?",
+                "Label": None,
+                "Reference": None
+            })
+            sentence_records.append({
+                "SENTENCE_ID": sentence_id,
+                "PMID": None,
+                "TYPE": None,
+                "NUMBER": None,
+                "SENT_START_INDEX": None,
+                "SENTENCE": sentence,
+                "SECTION_HEADER": None,
+                "NORMALIZED_SECTION_HEADER": None,
+                "Column": None,
+                "Column": None
+            })
+
+            triple_records.append({
+                "PREDICATION_ID": edge['id'],
+                "SENTENCE_ID": sentence_id,
+                "PMID": None,
+                "PREDICATE": predicate_name,
+                "SUBJECT_CUI": None,
+                "SUBJECT_NAME": subject_name,
+                "SUBJECT_SEMTYPE": None,
+                "SUBJECT_NOVELTY": None,
+                "OBJECT_CUI": None,
+                "OBJECT_NAME": object_name,
+                "OBJECT_SEMTYPE": None,
+                "OBJECT_NOVELTY": None,
+                "Column": None,
+                "Column": None,
+                "Column": None
+            })
+            sentence_id += 1
+
+    # print(labeled_records)
+    # print("__")
+    # print(sentence_records)
+    # print("__")
+    # print(triple_records)
+    # print("__")
+
+    # Build the DataFrames once, after all edges have been processed, and write them out
+    labeled_records_df = pd.DataFrame(labeled_records, columns=labeled_columns)
+    sentence_df = pd.DataFrame(sentence_records, columns=sentence_columns)
+    triple_df = pd.DataFrame(triple_records, columns=triple_columns)
+
+    labeled_records_df.to_csv(f"{output_prefix}_labeled_records.csv", index=False)
+    sentence_df.to_csv(f"{output_prefix}_sentence_data.csv", index=False)
+    triple_df.to_csv(f"{output_prefix}_triple_data.csv", index=False)
+
+print("Filtered CSV file has been generated successfully.")
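
Note (not part of the patch): a minimal sketch of how the generated CSVs could be sanity-checked after running process.py. The file names assume the 'kg2c-2.8.4' output_prefix used in the script above; everything here is illustrative.

    import pandas as pd

    # Load the three CSVs produced by process.py (prefix is an assumption taken from the script)
    labeled = pd.read_csv("kg2c-2.8.4_labeled_records.csv")
    sentences = pd.read_csv("kg2c-2.8.4_sentence_data.csv")
    triples = pd.read_csv("kg2c-2.8.4_triple_data.csv")

    # Every labeled record should line up with exactly one sentence row and one triple row
    assert len(labeled) == len(sentences) == len(triples)
    print(labeled[["Predicate ID", "Triple", "Question"]].head())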