
Commit

Refactored source code to use PostgreSQL
mfl15 authored Oct 11, 2024
1 parent ffed0f9 commit 82f6793
Showing 1 changed file with 135 additions and 0 deletions.
process.py (135 changes: 135 additions & 0 deletions)
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
"""Process
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Q1UxrmNHSKvNRqlksj7Lf0sObE5Zeq74
"""

import psycopg2
import json
import os
from dotenv import load_dotenv
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Constants
MODEL_NAME = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
MODEL_PATH = hf_hub_download(repo_id=MODEL_NAME, filename="mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf")
BATCH_SIZE = 10 # Number of records to process in each batch
TIMEOUT = 5000 # Statement timeout in milliseconds
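
# Expected .env contents (illustrative sketch; the keys are exactly the ones
# read via os.getenv below, the values are placeholders):
#   DB_NAME=...
#   DB_USER=...
#   DB_PASSWORD=...
#   DB_HOST=...
#   DB_PORT=...
#   STARTING_ID=0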

# Load Llama model
model = Llama(
    model_path=MODEL_PATH,
    n_threads=64,
    n_batch=128,
    n_ctx=4096,
    n_gpu_layers=96,
    use_mlock=True,   # llama-cpp-python keyword is use_mlock (keeps weights resident in RAM)
    logits_all=True   # required so per-token logprobs can be returned at completion time
)

# Load environment variables
load_dotenv('.env')

# Establish database connection
def create_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("DB_NAME"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        connect_timeout=10  # Connection timeout in seconds
    )

# Set statement timeout for long-running queries
def set_statement_timeout(cursor, timeout=TIMEOUT):
    cursor.execute(f"SET statement_timeout = {timeout};")

# Read starting ID from environment
def read_starting_id():
    return int(os.getenv("STARTING_ID", 0))

# Save updated ID to the .env file
def save_updated_id(id_value):
    with open('.env', 'r') as file:
        lines = file.readlines()

    updated_lines = [
        f"STARTING_ID={id_value}\n" if line.startswith("STARTING_ID=") else line
        for line in lines
    ]

    if not any(line.startswith("STARTING_ID=") for line in lines):
        updated_lines.append(f"STARTING_ID={id_value}\n")

    with open('.env', 'w') as file:
        file.writelines(updated_lines)

# Process a triple and sentence using Llama model
def process_triple(triple, sentence):
    prompt = f"""
SYSTEM: You are a computational biologist tasked with evaluating scientific claims.
Your role requires you to apply critical thinking and your expertise to interpret data and research findings accurately.
Answer 'Yes' or 'No' to directly address the query posed.
USER: Does the phrase "{triple}" receive at least indirect support from the statement: "{sentence}"?
ASSISTANT:
"""

    # logprobs takes an int (top-N per token) in llama-cpp-python, not a bool
    response = model(prompt=prompt, max_tokens=1, temperature=0, echo=False, logprobs=1)
    return response["choices"][0]["text"], json.dumps(response["choices"][0]["logprobs"])
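
# Note: llama-cpp-python returns OpenAI-style completion output, so the
# "logprobs" entry serialized above should be a dict with keys such as
# "tokens", "token_logprobs" and "top_logprobs" (assumed format).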

# Process batches of records from the database
def process_batches(conn, cursor, starting_id):
    while True:
        cursor.execute("""
            SELECT "id", "triple", "sentence"
            FROM public."tblBiomedicalFactcheck"
            WHERE "id" >= %s
            ORDER BY "id"
            LIMIT %s
        """, (starting_id, BATCH_SIZE))

        records = cursor.fetchall()
        if not records:
            print("No more records to process.")
            break

        for record in records:
            record_id, triple, sentence = record
            answer, logprops_json = process_triple(triple, sentence)
            try:
                cursor.execute("""
                    UPDATE public."tblBiomedicalFactcheck"
                    SET "answer" = %s, "logprops" = %s
                    WHERE "id" = %s
                """, (answer, logprops_json, record_id))
            except Exception as e:
                print(f"Error updating record ID {record_id}: {e}")
                # Rollback clears the aborted transaction; note that it also
                # discards any uncommitted updates from earlier in this batch.
                conn.rollback()

        conn.commit()
        # Advance past the last ID actually fetched; IDs may have gaps, so
        # stepping by BATCH_SIZE could skip or re-process records.
        starting_id = records[-1][0] + 1
        save_updated_id(starting_id)
        print(f"Processed up to ID: {starting_id}")

# Main processing function
def main():
    starting_id = read_starting_id()
    conn = create_db_connection()
    cursor = conn.cursor()

    try:
        set_statement_timeout(cursor)
        process_batches(conn, cursor, starting_id)
        print("Data processing completed successfully.")
    finally:
        cursor.close()
        conn.close()

if __name__ == "__main__":
main()
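
# Assumed layout of the target table (sketch only; the actual DDL is not part
# of this commit). Column names match the queries above; types are guesses:
#   CREATE TABLE public."tblBiomedicalFactcheck" (
#       "id"       integer PRIMARY KEY,
#       "triple"   text,
#       "sentence" text,
#       "answer"   text,
#       "logprops" text  -- JSON string produced by json.dumps
#   );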
