Adding Source Finder Functionality #5

Open
wants to merge 9 commits into main
9 changes: 9 additions & 0 deletions .gitignore
@@ -40,3 +40,12 @@ Test_Results/results_test_1-6-24.csv
Test_Results/results_test_1-3-24_log.txt
Test_Results/results_test_1-3-24.csv
Test_Results/results_test_1-2-24_log.txt

venv/
__pycache__/
__pycache__/*.pyc

data/
images.txt
src/*.pdf
src/*.txt
76 changes: 0 additions & 76 deletions TreeHugger_Exam_ans.csv

This file was deleted.

Binary file added data/chroma_db/chroma.sqlite3
Binary file not shown.
40 changes: 40 additions & 0 deletions src/app.py
@@ -0,0 +1,40 @@
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# import sqlite3

import streamlit as st
from components.frontend.chat import Chat_UI
from components.frontend.sidebar import Sidebar
from components.backend.pipeline.pipeline import Pipeline
import os
import uuid


st.set_page_config(layout='wide')


@st.cache_resource
def initialize():
    # Build the pipeline and the UI components that wrap it once per process;
    # st.cache_resource hands back the same instances on every rerun.
    pipeline = Pipeline()
    return pipeline, Sidebar(pipeline), Chat_UI(pipeline)

class UI:
    def __init__(self):
        self._pipeline, self.sidebar, self.chat = initialize()
        st.session_state['documents'] = False
        st.session_state['user_id'] = str(uuid.uuid4())
        # Read the key from the environment rather than hardcoding a secret in source.
        st.session_state['api_key'] = os.environ.get("OPENAI_API_KEY", "")

        if 'messages' not in st.session_state:
            st.session_state['messages'] = []

    def render(self):
        self.sidebar()
        self.chat()

def main():
    UI().render()

if __name__ == "__main__":
    main()
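For context, a minimal standalone sketch of the caching pattern used above (not from this PR): `st.cache_resource` constructs a heavyweight object once and reuses the same instance across script reruns. `ExpensiveResource` here is a hypothetical stand-in for `Pipeline`.

```python
# Standalone sketch of the st.cache_resource pattern used in app.py.
import streamlit as st

class ExpensiveResource:          # hypothetical stand-in for Pipeline
    def __init__(self):
        print("constructed once per process")

@st.cache_resource
def get_resource():
    return ExpensiveResource()    # cached: later reruns reuse this instance

resource = get_resource()
st.write(id(resource))            # same id on every rerun of the script
```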
Binary file added src/assets/eugenie.png
64 changes: 64 additions & 0 deletions src/components/backend/pipeline/document_handler.py
@@ -0,0 +1,64 @@
import numpy as np
import fitz
import requests
import time
import uuid
import streamlit as st
import base64

class Document_Handler:
    def __init__(self):
        pass

    def __call__(self, bytes_array):
        return self.extract_and_chunk(bytes_array)

    def semantic_chunking(self, text, chunk_size=200, overlap=50):
        # Accumulate words until roughly chunk_size characters, then cut at the last
        # sentence boundary and carry `overlap` characters into the next chunk.
        chunks = []
        current_chunk = ""
        words = text.split()

        for word in words:
            current_chunk += (word + " ")
            if len(current_chunk) >= chunk_size:
                period_pos = current_chunk.rfind('. ')
                if period_pos != -1 and period_pos + 1 < len(current_chunk):
                    chunks.append(current_chunk[:period_pos + 1])
                    current_chunk = current_chunk[max(period_pos + 1 - overlap, 0):]
                else:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

        if len(current_chunk) > chunk_size // 2:
            chunks.append(current_chunk.strip())

        return chunks

    def extract_and_chunk(self, file_name):
        # file_name is whatever fitz.open accepts (a path here). Each text block keeps
        # its page number and bounding box so answers can point back to their source.
        doc = fitz.open(file_name)

        text_blocks = []
        id = file_name

        for page_num in range(len(doc)):
            page = doc[page_num]
            blocks = page.get_text("dict")["blocks"]

            for b in blocks:
                if "lines" in b:
                    bbox = fitz.Rect(b["bbox"])
                    text = " ".join([" ".join([span["text"] for span in line["spans"]]) for line in b["lines"]])

                    # Long blocks are split further; short ones are kept whole.
                    if len(text.split()) > 100:
                        chunks = self.semantic_chunking(text)
                    else:
                        chunks = [text]

                    for chunk in chunks:
                        text_blocks.append((id, page_num, bbox.x0, bbox.y0, bbox.x1, bbox.y1, chunk))

        doc.close()
        return text_blocks
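A minimal sketch of how the chunker behaves on a plain string; the sample text, chunk_size, and overlap values are illustrative, not from the PR.

```python
# Illustrative only: exercise semantic_chunking on a sample string.
from components.backend.pipeline.document_handler import Document_Handler

handler = Document_Handler()
sample = ("Tree rings record growth. Each ring maps to one season. "
          "Ring width varies with rainfall and temperature. " * 5)
chunks = handler.semantic_chunking(sample, chunk_size=120, overlap=20)
for i, chunk in enumerate(chunks):
    print(i, len(chunk), repr(chunk[:60]))   # chunks end at sentence boundaries where possible
```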

9 changes: 9 additions & 0 deletions src/components/backend/pipeline/llm.py
@@ -0,0 +1,9 @@
from langchain_openai import ChatOpenAI

import os, re, json

class LLM:
    def __init__(self, temperature=0.0001):
        # A near-zero temperature keeps responses as close to deterministic as the API allows.
        self.llm = ChatOpenAI(model_name='gpt-4', temperature=temperature)
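A quick usage sketch, assuming OPENAI_API_KEY is set in the environment; the prompt is illustrative and `invoke` is the standard single-call entry point on ChatOpenAI.

```python
# Illustrative: the wrapper simply proxies to a low-temperature ChatOpenAI instance.
from components.backend.pipeline.llm import LLM

llm = LLM(temperature=0.0001)
reply = llm.llm.invoke("Summarize retrieval-augmented generation in one sentence.")
print(reply.content)
```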


50 changes: 50 additions & 0 deletions src/components/backend/pipeline/pipeline.py
@@ -0,0 +1,50 @@
from components.backend.pipeline.vectorstore import VectorStore
from components.backend.pipeline.llm import LLM
from components.backend.pipeline.document_handler import Document_Handler

import os, io

from components.backend.tools.python_interpreter import PythonInterpreter
from components.backend.tools.arxiv_search import ArxivSearch
from components.backend.tools.calculator import Calculator
from components.backend.tools.web_search import WebSearch
from components.backend.tools.rag import RAG

from langchain.agents import initialize_agent

os.environ["OPENAI_API_KEY"] = "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"
os.environ['PINECONE_API_KEY'] = "204755b4-f7d8-4aa4-b16b-764e66796cc3"
os.environ["GOOGLE_API_KEY"] = "AIzaSyDKxAadUfBZ9oAMDlRjRe0jlp3N0oZKqvg"
os.environ["GOOGLE_CSE_ID"] = "57d010b1a25ce48c0"

class Pipeline:
    def __init__(self, max_iterations=5):
        self.document_handler = Document_Handler()
        self.llm = LLM()
        self.vectorstore = VectorStore()
        self.rag = RAG(llm=self.llm.llm, vectorstore=self.vectorstore.vectorstore)
        self.tools = [
            PythonInterpreter(llm=self.llm.llm).initialize(),
            ArxivSearch().initialize(),
            Calculator(llm=self.llm.llm).initialize(),
            WebSearch(llm=self.llm.llm, vectorstore_public=self.vectorstore.vectorstore).initialize(),
        ]

        self.agent = initialize_agent(self.tools,
                                      self.llm.llm,
                                      agent="chat-conversational-react-description",
                                      verbose=True,
                                      handle_parsing_errors=True,
                                      max_iterations=max_iterations
                                      )

    def run(self, query, chat_history):
        return self.agent.invoke({'input': query.strip(), 'chat_history': chat_history})

    def add(self, pdf):
        # Chunk the uploaded PDF and push the (source, page, bbox, text) tuples into the vectorstore.
        self.vectorstore.add(self.document_handler(pdf))
        return 1

    def get_sources(self, query):
        return self.vectorstore.get_sources(query)
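A minimal end-to-end usage sketch, assuming the API keys above are provided and the supporting components (VectorStore, RAG, the tools) import as in this PR; the PDF path and query are hypothetical.

```python
# Illustrative only: wire the pipeline end to end with a sample PDF and query.
from components.backend.pipeline.pipeline import Pipeline

pipeline = Pipeline(max_iterations=5)
pipeline.add("data/sample_paper.pdf")     # hypothetical path: chunk and index one PDF

question = "What dataset does the paper use?"
result = pipeline.run(question, chat_history=[])
print(result["output"])                   # agent answer from AgentExecutor.invoke
print(pipeline.get_sources(question))     # matching source chunks for citation
```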