Adding Source Finder Functionality #5

Open
wants to merge 9 commits into main
9 changes: 9 additions & 0 deletions .gitignore
@@ -40,3 +40,12 @@ Test_Results/results_test_1-6-24.csv
Test_Results/results_test_1-3-24_log.txt
Test_Results/results_test_1-3-24.csv
Test_Results/results_test_1-2-24_log.txt

venv/
__pycache__/
__pycache__/*.pyc

data/
images.txt
src/*.pdf
src/*.txt
76 changes: 0 additions & 76 deletions TreeHugger_Exam_ans.csv

This file was deleted.

Binary file added data/chroma_db/chroma.sqlite3
Binary file not shown.
40 changes: 40 additions & 0 deletions src/app.py
@@ -0,0 +1,40 @@
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# import sqlite3

import streamlit as st
from components.frontend.chat import Chat_UI
from components.frontend.sidebar import Sidebar
from components.backend.pipeline.pipeline import Pipeline
import os
import uuid


st.set_page_config(layout='wide')


@st.cache_resource
def initialize():
    # Build the pipeline and the UI components that wrap it once per process;
    # st.cache_resource hands back the same instances on every rerun.
    pipeline = Pipeline()
    return pipeline, Sidebar(pipeline), Chat_UI(pipeline)

class UI:
    def __init__(self):
        self._pipeline, self.sidebar, self.chat = initialize()
        st.session_state['documents'] = False
        st.session_state['user_id'] = str(uuid.uuid4())
        # Read the key from the environment rather than hardcoding a secret in source.
        st.session_state['api_key'] = os.environ.get("OPENAI_API_KEY", "")

        if 'messages' not in st.session_state:
            st.session_state['messages'] = []

    def render(self):
        self.sidebar()
        self.chat()

def main():
    UI().render()

if __name__ == "__main__":
    main()
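For context, a minimal standalone sketch of the caching pattern used above (not from this PR): `st.cache_resource` constructs a heavyweight object once and reuses the same instance across script reruns. `ExpensiveResource` here is a hypothetical stand-in for `Pipeline`.

```python
# Standalone sketch of the st.cache_resource pattern used in app.py.
import streamlit as st

class ExpensiveResource:          # hypothetical stand-in for Pipeline
    def __init__(self):
        print("constructed once per process")

@st.cache_resource
def get_resource():
    return ExpensiveResource()    # cached: later reruns reuse this instance

resource = get_resource()
st.write(id(resource))            # same id on every rerun of the script
```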
Binary file added src/assets/eugenie.png
64 changes: 64 additions & 0 deletions src/components/backend/pipeline/document_handler.py
@@ -0,0 +1,64 @@
import numpy as np
import fitz
import requests
import time
import uuid
import streamlit as st
import base64

class Document_Handler:
    def __init__(self):
        pass

    def __call__(self, bytes_array):
        return self.extract_and_chunk(bytes_array)

    def semantic_chunking(self, text, chunk_size=200, overlap=50):
        # Accumulate words until roughly chunk_size characters, then cut at the last
        # sentence boundary and carry `overlap` characters into the next chunk.
        chunks = []
        current_chunk = ""
        words = text.split()

        for word in words:
            current_chunk += (word + " ")
            if len(current_chunk) >= chunk_size:
                period_pos = current_chunk.rfind('. ')
                if period_pos != -1 and period_pos + 1 < len(current_chunk):
                    chunks.append(current_chunk[:period_pos + 1])
                    current_chunk = current_chunk[max(period_pos + 1 - overlap, 0):]
                else:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

        if len(current_chunk) > chunk_size // 2:
            chunks.append(current_chunk.strip())

        return chunks

    def extract_and_chunk(self, file_name):
        # file_name is whatever fitz.open accepts (a path here). Each text block keeps
        # its page number and bounding box so answers can point back to their source.
        doc = fitz.open(file_name)

        text_blocks = []
        id = file_name

        for page_num in range(len(doc)):
            page = doc[page_num]
            blocks = page.get_text("dict")["blocks"]

            for b in blocks:
                if "lines" in b:
                    bbox = fitz.Rect(b["bbox"])
                    text = " ".join([" ".join([span["text"] for span in line["spans"]]) for line in b["lines"]])

                    # Long blocks are split further; short ones are kept whole.
                    if len(text.split()) > 100:
                        chunks = self.semantic_chunking(text)
                    else:
                        chunks = [text]

                    for chunk in chunks:
                        text_blocks.append((id, page_num, bbox.x0, bbox.y0, bbox.x1, bbox.y1, chunk))

        doc.close()
        return text_blocks
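A minimal sketch of how the chunker behaves on a plain string; the sample text, chunk_size, and overlap values are illustrative, not from the PR.

```python
# Illustrative only: exercise semantic_chunking on a sample string.
from components.backend.pipeline.document_handler import Document_Handler

handler = Document_Handler()
sample = ("Tree rings record growth. Each ring maps to one season. "
          "Ring width varies with rainfall and temperature. " * 5)
chunks = handler.semantic_chunking(sample, chunk_size=120, overlap=20)
for i, chunk in enumerate(chunks):
    print(i, len(chunk), repr(chunk[:60]))   # chunks end at sentence boundaries where possible
```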

9 changes: 9 additions & 0 deletions src/components/backend/pipeline/llm.py
@@ -0,0 +1,9 @@
from langchain_openai import ChatOpenAI

import os, re, json

class LLM:
    def __init__(self, temperature=0.0001):
        # A near-zero temperature keeps responses as close to deterministic as the API allows.
        self.llm = ChatOpenAI(model_name='gpt-4', temperature=temperature)
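A quick usage sketch, assuming OPENAI_API_KEY is set in the environment; the prompt is illustrative and `invoke` is the standard single-call entry point on ChatOpenAI.

```python
# Illustrative: the wrapper simply proxies to a low-temperature ChatOpenAI instance.
from components.backend.pipeline.llm import LLM

llm = LLM(temperature=0.0001)
reply = llm.llm.invoke("Summarize retrieval-augmented generation in one sentence.")
print(reply.content)
```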


50 changes: 50 additions & 0 deletions src/components/backend/pipeline/pipeline.py
@@ -0,0 +1,50 @@
from components.backend.pipeline.vectorstore import VectorStore
from components.backend.pipeline.llm import LLM
from components.backend.pipeline.document_handler import Document_Handler

import os, io

from components.backend.tools.python_interpreter import PythonInterpreter
from components.backend.tools.arxiv_search import ArxivSearch
from components.backend.tools.calculator import Calculator
from components.backend.tools.web_search import WebSearch
from components.backend.tools.rag import RAG

from langchain.agents import initialize_agent

os.environ["OPENAI_API_KEY"] = "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"
os.environ['PINECONE_API_KEY'] = "204755b4-f7d8-4aa4-b16b-764e66796cc3"
os.environ["GOOGLE_API_KEY"] = "AIzaSyDKxAadUfBZ9oAMDlRjRe0jlp3N0oZKqvg"
os.environ["GOOGLE_CSE_ID"] = "57d010b1a25ce48c0"

class Pipeline:
    def __init__(self, max_iterations=5):
        self.document_handler = Document_Handler()
        self.llm = LLM()
        self.vectorstore = VectorStore()
        self.rag = RAG(llm=self.llm.llm, vectorstore=self.vectorstore.vectorstore)
        self.tools = [
            PythonInterpreter(llm=self.llm.llm).initialize(),
            ArxivSearch().initialize(),
            Calculator(llm=self.llm.llm).initialize(),
            WebSearch(llm=self.llm.llm, vectorstore_public=self.vectorstore.vectorstore).initialize(),
        ]

        self.agent = initialize_agent(self.tools,
                                      self.llm.llm,
                                      agent="chat-conversational-react-description",
                                      verbose=True,
                                      handle_parsing_errors=True,
                                      max_iterations=max_iterations
                                      )

    def run(self, query, chat_history):
        return self.agent.invoke({'input': query.strip(), 'chat_history': chat_history})

    def add(self, pdf):
        # Chunk the uploaded PDF and push the (source, page, bbox, text) tuples into the vectorstore.
        self.vectorstore.add(self.document_handler(pdf))
        return 1

    def get_sources(self, query):
        return self.vectorstore.get_sources(query)
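A minimal end-to-end usage sketch, assuming the API keys above are provided and the supporting components (VectorStore, RAG, the tools) import as in this PR; the PDF path and query are hypothetical.

```python
# Illustrative only: wire the pipeline end to end with a sample PDF and query.
from components.backend.pipeline.pipeline import Pipeline

pipeline = Pipeline(max_iterations=5)
pipeline.add("data/sample_paper.pdf")     # hypothetical path: chunk and index one PDF

question = "What dataset does the paper use?"
result = pipeline.run(question, chat_history=[])
print(result["output"])                   # agent answer from AgentExecutor.invoke
print(pipeline.get_sources(question))     # matching source chunks for citation
```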