Updating the latest code
mlupei committed Jul 25, 2024
1 parent 86679c3 commit 1db4762
Showing 10 changed files with 479 additions and 8 deletions.
53 changes: 53 additions & 0 deletions 1.py
@@ -0,0 +1,53 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'

# Load all nodes into a dictionary for quick access by ID
nodes = {}
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    for line in edges_file:
        edge = json.loads(line)

        # Keep only edges whose primary knowledge source is SemMedDB
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Safely evaluate the string to a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                # Fall back to an empty dict if the field cannot be parsed
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Use names instead of IDs where possible
            subject_name = nodes.get(edge['subject'], edge['subject'])
            predicate_name = nodes.get(edge['predicate'], edge['predicate'])
            object_name = nodes.get(edge['object'], edge['object'])

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])

print("Filtered CSV file has been generated successfully.")
7 changes: 7 additions & 0 deletions 2.py
@@ -0,0 +1,7 @@
# Drop the header row from the sentence data CSV in place
with open('data/RTX-KG2.8.4c_sentence_data.csv', 'r') as fin:
    data = fin.read().splitlines(True)
with open('data/RTX-KG2.8.4c_sentence_data.csv', 'w') as fout:
    fout.writelines(data[1:])
# Related generated files:
# data/RTX-KG2.8.4c_labeled_records.csv
# data/RTX-KG2.8.4c_sentence_data.csv
# data/RTX-KG2.8.4c_triple_data.csv
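2.py reads the whole file into memory just to drop the header row; for large CSVs a streaming variant is gentler. A sketch with the same in-place intent, assuming a temporary file on the same filesystem (the .tmp path is illustrative):

import os
import shutil

src = 'data/RTX-KG2.8.4c_sentence_data.csv'
tmp = src + '.tmp'

with open(src, 'r') as fin, open(tmp, 'w') as fout:
    next(fin)                      # skip the header row
    shutil.copyfileobj(fin, fout)  # stream the rest unchanged

os.replace(tmp, src)               # atomically swap into place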
2 changes: 1 addition & 1 deletion main.py
@@ -27,7 +27,7 @@ def main() -> None:
None
"""
parser = argparse.ArgumentParser(description="Fact-Checking App")
parser.add_argument("--model", required=True, choices=['llama', 'gpt_4_0', 'gpt_3_5_turbo'], help="Model to use")
parser.add_argument("--model", required=True, choices=['mixtral1','mixtral2','mixtral3','mixtral4','mixtral5','llama', 'gpt_4_0', 'gpt_3_5_turbo'], help="Model to use")
parser.add_argument("--icl", action='store_true', help="Use In-Context Learning")
parser.add_argument("--triple_file", required=True, help="Path to the SemMedDB triple file")
parser.add_argument("--sentence_file", required=True, help="Path to the SemMedDB sentence file")
122 changes: 122 additions & 0 deletions process.py
@@ -0,0 +1,122 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts
import pandas as pd

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'
output_prefix = 'RTX-KG2.8.4c'  # Assumed prefix for the output CSVs, matching the file names referenced in 2.py

# Load all nodes into a dictionary for quick access by ID
nodes = {}
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    # Trailing "Column" entries are unnamed placeholders mirroring the raw SemMedDB CSV layout
    sentence_columns = ["SENTENCE_ID", "PMID", "TYPE", "NUMBER", "SENT_START_INDEX", "SENTENCE",
                        "SECTION_HEADER", "NORMALIZED_SECTION_HEADER", "Column", "Column"]
    sentence_records = []

    labeled_columns = ["Predicate ID", "Triple", "Sentence ID", "Sentence", "Question", "Label",
                       "Reference"]
    labeled_records = []

    triple_columns = ["PREDICATION_ID", "SENTENCE_ID", "PMID", "PREDICATE",
                      "SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE", "SUBJECT_NOVELTY",
                      "OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE", "OBJECT_NOVELTY",
                      "Column", "Column", "Column"]
    triple_records = []
    sentence_id = 0
    for line in edges_file:
        edge = json.loads(line)

        # Filter based on the knowledge source and retrieve the sentence
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Use ast.literal_eval to safely evaluate the string as a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Use names instead of IDs where possible
            subject_name = nodes.get(edge['subject'], edge['subject'])
            predicate_name = nodes.get(edge['predicate'], edge['predicate'])
            object_name = nodes.get(edge['object'], edge['object'])

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])


            labeled_records.append({
                "Predicate ID": edge['id'],
                "Triple": f"{subject_name} {predicate_name} {object_name}",
                "Sentence ID": sentence_id,
                "Sentence": sentence,
                "Question": f"Is the triple \"{subject_name} {predicate_name} {object_name}\" supported by the sentence: \"{sentence}\"?",
                "Label": None,
                "Reference": None
            })
            # Fields not present in KG2c are left as None; the unnamed
            # placeholder columns are padded by the columns list at write time
            sentence_records.append({
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "TYPE": None,
                "NUMBER": None,
                "SENT_START_INDEX": None,
                "SENTENCE": sentence,
                "SECTION_HEADER": None,
                "NORMALIZED_SECTION_HEADER": None,
                "Column": None
            })

            triple_records.append({
                "PREDICATION_ID": edge['id'],
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "PREDICATE": predicate_name,
                "SUBJECT_CUI": None,
                "SUBJECT_NAME": subject_name,
                "SUBJECT_SEMTYPE": None,
                "SUBJECT_NOVELTY": None,
                "OBJECT_CUI": None,
                "OBJECT_NAME": object_name,
                "OBJECT_SEMTYPE": None,
                "OBJECT_NOVELTY": None,
                "Column": None
            })
            sentence_id += 1
# Build DataFrames from the accumulated records
labeled_records_df = pd.DataFrame(labeled_records, columns=labeled_columns)
sentence_df = pd.DataFrame(sentence_records, columns=sentence_columns)
triple_df = pd.DataFrame(triple_records, columns=triple_columns)

# Write the three output CSVs using the configured prefix
labeled_records_df.to_csv(f"{output_prefix}_labeled_records.csv", index=False)
sentence_df.to_csv(f"{output_prefix}_sentence_data.csv", index=False)
triple_df.to_csv(f"{output_prefix}_triple_data.csv", index=False)

print("Filtered CSV file has been generated successfully.")
129 changes: 129 additions & 0 deletions process1.py
@@ -0,0 +1,129 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts
import pandas as pd

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'
output_prefix = 'RTX-KG2.8.4c'  # Assumed prefix for the output CSVs, matching the file names referenced in 2.py

# Load all nodes into a dictionary for quick access by ID
nodes = {}
equivalent_curies_map = {}  # Maps equivalent curies to the primary name
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name
        # Map each equivalent curie to the primary name as well
        for curie in node_data.get('equivalent_curies', []):
            equivalent_curies_map[curie] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    # Trailing "Column" entries are unnamed placeholders mirroring the raw SemMedDB CSV layout
    sentence_columns = ["SENTENCE_ID", "PMID", "TYPE", "NUMBER", "SENT_START_INDEX", "SENTENCE",
                        "SECTION_HEADER", "NORMALIZED_SECTION_HEADER", "Column", "Column"]
    sentence_records = []

    labeled_columns = ["Predicate ID", "Triple", "Sentence ID", "Sentence", "Question", "Label",
                       "Reference"]
    labeled_records = []

    triple_columns = ["PREDICATION_ID", "SENTENCE_ID", "PMID", "PREDICATE",
                      "SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE", "SUBJECT_NOVELTY",
                      "OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE", "OBJECT_NOVELTY",
                      "Column", "Column", "Column"]
    triple_records = []
    sentence_id = 0
    for line in edges_file:
        edge = json.loads(line)

        # Filter based on the knowledge source and retrieve the sentence
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Use ast.literal_eval to safely evaluate the string as a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Look up names, falling back to equivalent curies, then the raw ID
            subject_name = nodes.get(edge['subject'], equivalent_curies_map.get(edge['subject'], edge['subject']))
            object_name = nodes.get(edge['object'], equivalent_curies_map.get(edge['object'], edge['object']))
            predicate_name = nodes.get(edge['predicate'], equivalent_curies_map.get(edge['predicate'], edge['predicate']))

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])


            labeled_records.append({
                "Predicate ID": edge['id'],
                "Triple": f"{subject_name} {predicate_name} {object_name}",
                "Sentence ID": sentence_id,
                "Sentence": sentence,
                "Question": f"Is the triple \"{subject_name} {predicate_name} {object_name}\" supported by the sentence: \"{sentence}\"?",
                "Label": None,
                "Reference": None
            })
            # Fields not present in KG2c are left as None; the unnamed
            # placeholder columns are padded by the columns list at write time
            sentence_records.append({
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "TYPE": None,
                "NUMBER": None,
                "SENT_START_INDEX": None,
                "SENTENCE": sentence,
                "SECTION_HEADER": None,
                "NORMALIZED_SECTION_HEADER": None,
                "Column": None
            })

            triple_records.append({
                "PREDICATION_ID": edge['id'],
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "PREDICATE": predicate_name,
                "SUBJECT_CUI": None,
                "SUBJECT_NAME": subject_name,
                "SUBJECT_SEMTYPE": None,
                "SUBJECT_NOVELTY": None,
                "OBJECT_CUI": None,
                "OBJECT_NAME": object_name,
                "OBJECT_SEMTYPE": None,
                "OBJECT_NOVELTY": None,
                "Column": None
            })
            sentence_id += 1
# Build DataFrames from the accumulated records
labeled_records_df = pd.DataFrame(labeled_records, columns=labeled_columns)
sentence_df = pd.DataFrame(sentence_records, columns=sentence_columns)
triple_df = pd.DataFrame(triple_records, columns=triple_columns)

# Write the three output CSVs using the configured prefix
labeled_records_df.to_csv(f"{output_prefix}_labeled_records.csv", index=False)
sentence_df.to_csv(f"{output_prefix}_sentence_data.csv", index=False)
triple_df.to_csv(f"{output_prefix}_triple_data.csv", index=False)

print("Filtered CSV file has been generated successfully.")
32 changes: 29 additions & 3 deletions src/get_result.py
@@ -12,7 +12,7 @@ def get_result(model_info, prompt, model_type):
str: The generated result text.
"""
if model_type.startswith('llama'):
if model_type.startswith('mixtral') or model_type.startswith('llama'):
# If using a Mixtral or Llama model

if isinstance(model_info, tuple):
Expand All @@ -25,7 +25,33 @@ def get_result(model_info, prompt, model_type):
model = model_info
full_prompt = prompt
prompt = full_prompt
prompt_template=f'''SYSTEM: You are a computational biologist tasked with evaluating scientific claims. Your role requires you to apply critical thinking and your expertise to interpret data and research findings accurately. When responding, please start with 'Yes' or 'No' to directly address the query posed. Follow this with a comprehensive justification of your decision, integrating relevant scientific knowledge, the specifics of the case at hand, and any potential implications or nuances that may influence the interpretation of the evidence provided.
prompt_template=f'''
Context:
USER: ('\'Is the triple "Phase related to Follicle stimulating hormone measurement" directly or indirectly supported by the sentence: "In pre-menopause healthy females, blood was sampled weekly during one menstruation cycle and menstruation phases (follicular, ovulatory, luteal) were determined by FSH/LH levels."?',)
ASSISTANT: Yes
USER: ('\'Is the triple "Phase related to Sodium measurement" directly or indirectly supported by the sentence: "Based on a biophysical photoreceptor model, the Na(+)- and Ca(2+)-currents and concentration changes were determined from the first transient depolarization phase of the photoreceptor response."?',)
ASSISTANT: Yes
USER: ('\'Is the triple "Phase related to Bronchoalveolar Lavage" directly or indirectly supported by the sentence: "Challenge of the airways of sensitized guinea pigs with aerosolized ovalbumin resulted in an early phase of microvascular protein leakage and a delayed phase of eosinophil accumulation in the airway lumen, as measured using bronchoalveolar lavage (BAL)."?',)
ASSISTANT: Yes
USER: ('\'Does the phrase "Ciprofloxacin related to DNA Gyrase" receive at least indirect support from the statement: "Effect of ranolazine in preventing postoperative atrial fibrillation in patients undergoing coronary revascularization surgery."?',)
ASSISTANT: No
USER: ('\'Does the phrase "Ciprofloxacin related to Crohn disease" receive at least indirect support from the statement: "Recent evidence of beneficial effects of ranolazine (RAN) in type II diabetes motivates interest in the role of the late sodium current (INaL) in glucose-stimulated insulin secretion."?',)
ASSISTANT: No
USER: ('\'Does the phrase "Ciprofloxacin related to endophthalmitis" receive at least indirect support from the statement: "Furthermore, the activated Akt/mTOR signaling pathway induced by AF was further activated by ranolazine."?',)
ASSISTANT: No
SYSTEM: You are a computational biologist. Answer yes or no fast.
USER: {prompt}
Expand All @@ -39,7 +65,7 @@ def get_result(model_info, prompt, model_type):

print(chunk)
try:
response = model(prompt=chunk, max_tokens=1024, temperature=0.8,
response = model(prompt=chunk, max_tokens=1, temperature=0.8,
top_p=0.95, repeat_penalty=1.2, top_k=150, echo=False)
result_text += response["choices"][0]["text"]
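Dropping max_tokens from 1024 to 1, together with the few-shot Yes/No context above, effectively turns each call into a binary classifier that can emit only a single token. A minimal sketch of the calling pattern, assuming the llama-cpp-python binding implied by the call signature above (the model path is hypothetical):

from llama_cpp import Llama  # assumed binding, matching the call signature above

model = Llama(model_path="mixtral-8x7b-instruct.Q4_K_M.gguf")  # hypothetical path

prompt = 'USER: Is the triple "X related to Y" supported by the sentence: "..."?\nASSISTANT:'
response = model(prompt=prompt, max_tokens=1, temperature=0.8,
                 top_p=0.95, repeat_penalty=1.2, top_k=150, echo=False)
answer = response["choices"][0]["text"].strip()  # expected "Yes" or "No" given the priming
print(answer)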
