Updating the latest code
mlupei committed Jul 25, 2024
1 parent 86679c3 commit 1db4762
Showing 10 changed files with 479 additions and 8 deletions.
53 changes: 53 additions & 0 deletions 1.py
@@ -0,0 +1,53 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'

# Load all nodes into a dictionary for quick access by ID
nodes = {}
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    for line in edges_file:
        edge = json.loads(line)

        # Keep only edges whose primary knowledge source is SemMedDB
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Safely evaluate the string to a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                # Fall back to an empty dict if the field cannot be parsed
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Use names instead of IDs where possible
            subject_name = nodes.get(edge['subject'], edge['subject'])
            predicate_name = nodes.get(edge['predicate'], edge['predicate'])
            object_name = nodes.get(edge['object'], edge['object'])

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])

print("Filtered CSV file has been generated successfully.")
7 changes: 7 additions & 0 deletions 2.py
@@ -0,0 +1,7 @@
# Drop the header row from the sentence data CSV in place
with open('data/RTX-KG2.8.4c_sentence_data.csv', 'r') as fin:
    data = fin.read().splitlines(True)
with open('data/RTX-KG2.8.4c_sentence_data.csv', 'w') as fout:
    fout.writelines(data[1:])
# Related generated files:
# data/RTX-KG2.8.4c_labeled_records.csv
# data/RTX-KG2.8.4c_sentence_data.csv
# data/RTX-KG2.8.4c_triple_data.csv
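2.py reads the whole file into memory just to drop the header row; for large CSVs a streaming variant is gentler. A sketch with the same in-place intent, assuming a temporary file on the same filesystem (the .tmp path is illustrative):

import os
import shutil

src = 'data/RTX-KG2.8.4c_sentence_data.csv'
tmp = src + '.tmp'

with open(src, 'r') as fin, open(tmp, 'w') as fout:
    next(fin)                      # skip the header row
    shutil.copyfileobj(fin, fout)  # stream the rest unchanged

os.replace(tmp, src)               # atomically swap into place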
2 changes: 1 addition & 1 deletion main.py
@@ -27,7 +27,7 @@ def main() -> None:
None
"""
parser = argparse.ArgumentParser(description="Fact-Checking App")
parser.add_argument("--model", required=True, choices=['llama', 'gpt_4_0', 'gpt_3_5_turbo'], help="Model to use")
parser.add_argument("--model", required=True, choices=['mixtral1','mixtral2','mixtral3','mixtral4','mixtral5','llama', 'gpt_4_0', 'gpt_3_5_turbo'], help="Model to use")
parser.add_argument("--icl", action='store_true', help="Use In-Context Learning")
parser.add_argument("--triple_file", required=True, help="Path to the SemMedDB triple file")
parser.add_argument("--sentence_file", required=True, help="Path to the SemMedDB sentence file")
122 changes: 122 additions & 0 deletions process.py
@@ -0,0 +1,122 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts
import pandas as pd

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'
output_prefix = 'RTX-KG2.8.4c'  # Assumed prefix for the output CSVs, matching the file names referenced in 2.py

# Load all nodes into a dictionary for quick access by ID
nodes = {}
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    # Trailing "Column" entries are unnamed placeholders mirroring the raw SemMedDB CSV layout
    sentence_columns = ["SENTENCE_ID", "PMID", "TYPE", "NUMBER", "SENT_START_INDEX", "SENTENCE",
                        "SECTION_HEADER", "NORMALIZED_SECTION_HEADER", "Column", "Column"]
    sentence_records = []

    labeled_columns = ["Predicate ID", "Triple", "Sentence ID", "Sentence", "Question", "Label",
                       "Reference"]
    labeled_records = []

    triple_columns = ["PREDICATION_ID", "SENTENCE_ID", "PMID", "PREDICATE",
                      "SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE", "SUBJECT_NOVELTY",
                      "OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE", "OBJECT_NOVELTY",
                      "Column", "Column", "Column"]
    triple_records = []
    sentence_id = 0
    for line in edges_file:
        edge = json.loads(line)

        # Filter based on the knowledge source and retrieve the sentence
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Use ast.literal_eval to safely evaluate the string as a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Use names instead of IDs where possible
            subject_name = nodes.get(edge['subject'], edge['subject'])
            predicate_name = nodes.get(edge['predicate'], edge['predicate'])
            object_name = nodes.get(edge['object'], edge['object'])

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])


            labeled_records.append({
                "Predicate ID": edge['id'],
                "Triple": f"{subject_name} {predicate_name} {object_name}",
                "Sentence ID": sentence_id,
                "Sentence": sentence,
                "Question": f"Is the triple \"{subject_name} {predicate_name} {object_name}\" supported by the sentence: \"{sentence}\"?",
                "Label": None,
                "Reference": None
            })
            # Fields not present in KG2c are left as None; the unnamed
            # placeholder columns are padded by the columns list at write time
            sentence_records.append({
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "TYPE": None,
                "NUMBER": None,
                "SENT_START_INDEX": None,
                "SENTENCE": sentence,
                "SECTION_HEADER": None,
                "NORMALIZED_SECTION_HEADER": None,
                "Column": None
            })

            triple_records.append({
                "PREDICATION_ID": edge['id'],
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "PREDICATE": predicate_name,
                "SUBJECT_CUI": None,
                "SUBJECT_NAME": subject_name,
                "SUBJECT_SEMTYPE": None,
                "SUBJECT_NOVELTY": None,
                "OBJECT_CUI": None,
                "OBJECT_NAME": object_name,
                "OBJECT_SEMTYPE": None,
                "OBJECT_NOVELTY": None,
                "Column": None
            })
            sentence_id += 1
# Build DataFrames from the accumulated records
labeled_records_df = pd.DataFrame(labeled_records, columns=labeled_columns)
sentence_df = pd.DataFrame(sentence_records, columns=sentence_columns)
triple_df = pd.DataFrame(triple_records, columns=triple_columns)

# Write the three output CSVs using the configured prefix
labeled_records_df.to_csv(f"{output_prefix}_labeled_records.csv", index=False)
sentence_df.to_csv(f"{output_prefix}_sentence_data.csv", index=False)
triple_df.to_csv(f"{output_prefix}_triple_data.csv", index=False)

print("Filtered CSV file has been generated successfully.")
129 changes: 129 additions & 0 deletions process1.py
@@ -0,0 +1,129 @@
import json
import csv
import ast  # For safely parsing stringified Python dicts
import pandas as pd

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'
output_csv_path = 'output_filtered.csv'
output_prefix = 'RTX-KG2.8.4c'  # Assumed prefix for the output CSVs, matching the file names referenced in 2.py

# Load all nodes into a dictionary for quick access by ID
nodes = {}
equivalent_curies_map = {}  # Maps equivalent curies to the primary name
with open(nodes_file_path, 'r') as nodes_file:
    for line in nodes_file:
        node_data = json.loads(line)
        # Use the primary name, else the first alternative name, else "Unknown"
        all_names = node_data.get('all_names') or []
        name = node_data.get('name') or (all_names[0] if all_names else "Unknown")
        nodes[node_data['id']] = name
        # Map each equivalent curie to the primary name as well
        for curie in node_data.get('equivalent_curies', []):
            equivalent_curies_map[curie] = name

# Process edges that meet the criteria
with open(edges_file_path, 'r') as edges_file, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(['ID', 'Fact', 'Source', 'Template', 'Reference', 'Name'])

    # Trailing "Column" entries are unnamed placeholders mirroring the raw SemMedDB CSV layout
    sentence_columns = ["SENTENCE_ID", "PMID", "TYPE", "NUMBER", "SENT_START_INDEX", "SENTENCE",
                        "SECTION_HEADER", "NORMALIZED_SECTION_HEADER", "Column", "Column"]
    sentence_records = []

    labeled_columns = ["Predicate ID", "Triple", "Sentence ID", "Sentence", "Question", "Label",
                       "Reference"]
    labeled_records = []

    triple_columns = ["PREDICATION_ID", "SENTENCE_ID", "PMID", "PREDICATE",
                      "SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE", "SUBJECT_NOVELTY",
                      "OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE", "OBJECT_NOVELTY",
                      "Column", "Column", "Column"]
    triple_records = []
    sentence_id = 0
    for line in edges_file:
        edge = json.loads(line)

        # Filter based on the knowledge source and retrieve the sentence
        if edge.get('primary_knowledge_source') == 'infores:semmeddb':
            publications_info_raw = edge.get('publications_info', '{}')
            try:
                # Use ast.literal_eval to safely evaluate the string as a Python dictionary
                publications_info = ast.literal_eval(publications_info_raw)
            except (SyntaxError, ValueError) as e:
                print(f"Error parsing publications_info: {publications_info_raw} with error: {e}")
                publications_info = {}

            # Extract the first available sentence from publications_info
            sentence = next((info.get('sentence', '') for info in publications_info.values()), '')

            # Look up names, falling back to equivalent curies, then the raw ID
            subject_name = nodes.get(edge['subject'], equivalent_curies_map.get(edge['subject'], edge['subject']))
            object_name = nodes.get(edge['object'], equivalent_curies_map.get(edge['object'], edge['object']))
            predicate_name = nodes.get(edge['predicate'], equivalent_curies_map.get(edge['predicate'], edge['predicate']))

            # Construct the fact as a subject-predicate-object string
            fact = f"{subject_name} {predicate_name} {object_name}"

            # Write to CSV
            csv_writer.writerow([edge['id'], fact, sentence, '', '', ''])


            labeled_records.append({
                "Predicate ID": edge['id'],
                "Triple": f"{subject_name} {predicate_name} {object_name}",
                "Sentence ID": sentence_id,
                "Sentence": sentence,
                "Question": f"Is the triple \"{subject_name} {predicate_name} {object_name}\" supported by the sentence: \"{sentence}\"?",
                "Label": None,
                "Reference": None
            })
            # Fields not present in KG2c are left as None; the unnamed
            # placeholder columns are padded by the columns list at write time
            sentence_records.append({
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "TYPE": None,
                "NUMBER": None,
                "SENT_START_INDEX": None,
                "SENTENCE": sentence,
                "SECTION_HEADER": None,
                "NORMALIZED_SECTION_HEADER": None,
                "Column": None
            })

            triple_records.append({
                "PREDICATION_ID": edge['id'],
                "SENTENCE_ID": sentence_id,
                "PMID": None,
                "PREDICATE": predicate_name,
                "SUBJECT_CUI": None,
                "SUBJECT_NAME": subject_name,
                "SUBJECT_SEMTYPE": None,
                "SUBJECT_NOVELTY": None,
                "OBJECT_CUI": None,
                "OBJECT_NAME": object_name,
                "OBJECT_SEMTYPE": None,
                "OBJECT_NOVELTY": None,
                "Column": None
            })
            sentence_id += 1
# Build DataFrames from the accumulated records
labeled_records_df = pd.DataFrame(labeled_records, columns=labeled_columns)
sentence_df = pd.DataFrame(sentence_records, columns=sentence_columns)
triple_df = pd.DataFrame(triple_records, columns=triple_columns)

# Write the three output CSVs using the configured prefix
labeled_records_df.to_csv(f"{output_prefix}_labeled_records.csv", index=False)
sentence_df.to_csv(f"{output_prefix}_sentence_data.csv", index=False)
triple_df.to_csv(f"{output_prefix}_triple_data.csv", index=False)

print("Filtered CSV file has been generated successfully.")
32 changes: 29 additions & 3 deletions src/get_result.py
@@ -12,7 +12,7 @@ def get_result(model_info, prompt, model_type):
str: The generated result text.
"""
if model_type.startswith('llama'):
if model_type.startswith('mixtral') or model_type.startswith('llama'):
# If using a Mixtral or Llama model

if isinstance(model_info, tuple):
Expand All @@ -25,7 +25,33 @@ def get_result(model_info, prompt, model_type):
model = model_info
full_prompt = prompt
prompt = full_prompt
prompt_template=f'''SYSTEM: You are a computational biologist tasked with evaluating scientific claims. Your role requires you to apply critical thinking and your expertise to interpret data and research findings accurately. When responding, please start with 'Yes' or 'No' to directly address the query posed. Follow this with a comprehensive justification of your decision, integrating relevant scientific knowledge, the specifics of the case at hand, and any potential implications or nuances that may influence the interpretation of the evidence provided.
prompt_template=f'''
Context:
USER: ('\'Is the triple "Phase related to Follicle stimulating hormone measurement" directly or indirectly supported by the sentence: "In pre-menopause healthy females, blood was sampled weekly during one menstruation cycle and menstruation phases (follicular, ovulatory, luteal) were determined by FSH/LH levels."?',)
ASSISTANT: Yes
USER: ('\'Is the triple "Phase related to Sodium measurement" directly or indirectly supported by the sentence: "Based on a biophysical photoreceptor model, the Na(+)- and Ca(2+)-currents and concentration changes were determined from the first transient depolarization phase of the photoreceptor response."?',)
ASSISTANT: Yes
USER: ('\'Is the triple "Phase related to Bronchoalveolar Lavage" directly or indirectly supported by the sentence: "Challenge of the airways of sensitized guinea pigs with aerosolized ovalbumin resulted in an early phase of microvascular protein leakage and a delayed phase of eosinophil accumulation in the airway lumen, as measured using bronchoalveolar lavage (BAL)."?',)
ASSISTANT: Yes
USER: ('\'Does the phrase "Ciprofloxacin related to DNA Gyrase" receive at least indirect support from the statement: "Effect of ranolazine in preventing postoperative atrial fibrillation in patients undergoing coronary revascularization surgery."?',)
ASSISTANT: No
USER: ('\'Does the phrase "Ciprofloxacin related to Crohn disease" receive at least indirect support from the statement: "Recent evidence of beneficial effects of ranolazine (RAN) in type II diabetes motivates interest in the role of the late sodium current (INaL) in glucose-stimulated insulin secretion."?',)
ASSISTANT: No
USER: ('\'Does the phrase "Ciprofloxacin related to endophthalmitis" receive at least indirect support from the statement: "Furthermore, the activated Akt/mTOR signaling pathway induced by AF was further activated by ranolazine."?',)
ASSISTANT: No
SYSTEM: You are a computational biologist. Answer yes or no fast.
USER: {prompt}
Expand All @@ -39,7 +65,7 @@ def get_result(model_info, prompt, model_type):

print(chunk)
try:
response = model(prompt=chunk, max_tokens=1024, temperature=0.8,
response = model(prompt=chunk, max_tokens=1, temperature=0.8,
top_p=0.95, repeat_penalty=1.2, top_k=150, echo=False)
result_text += response["choices"][0]["text"]
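Dropping max_tokens from 1024 to 1, together with the few-shot Yes/No context above, effectively turns each call into a binary classifier that can emit only a single token. A minimal sketch of the calling pattern, assuming the llama-cpp-python binding implied by the call signature above (the model path is hypothetical):

from llama_cpp import Llama  # assumed binding, matching the call signature above

model = Llama(model_path="mixtral-8x7b-instruct.Q4_K_M.gguf")  # hypothetical path

prompt = 'USER: Is the triple "X related to Y" supported by the sentence: "..."?\nASSISTANT:'
response = model(prompt=prompt, max_tokens=1, temperature=0.8,
                 top_p=0.95, repeat_penalty=1.2, top_k=150, echo=False)
answer = response["choices"][0]["text"].strip()  # expected "Yes" or "No" given the priming
print(answer)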
