# main.py
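# QA Bot with RAG: upload a PDF, chunk and embed its text into Pinecone,
# retrieve the chunks most relevant to a question, and have Cohere generate
# an answer from them. Run with: streamlit run main.py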
import streamlit as st
from PyPDF2 import PdfReader
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import cohere
import numpy as np
st.set_page_config(page_icon="📄", layout="wide", page_title="QA Bot with RAG")
def icon(emoji: str):
    st.write(f'<span style="font-size: 78px; line-height: 1">{emoji}</span>', unsafe_allow_html=True)
icon("🤖")
st.subheader("Ask Questions Based on Your Document")
# The serverless client takes only the API key; cloud/region are set via
# ServerlessSpec below. Please replace with your Pinecone API key.
pinecone = Pinecone(api_key='your-apikey')
index_name = "qa-bot"
# Create the index only if it doesn't already exist (checking for this
# index's name, not merely whether any index exists)
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        dimension=768,  # all-mpnet-base-v2 produces 768-dimensional vectors
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pinecone.Index(index_name)
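# Note: a freshly created serverless index can take a few seconds to become
# ready; if upserts fail right after creation, polling
# pinecone.describe_index(index_name) until it reports ready should help.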
cohere_client = cohere.Client(api_key='your-apikey')  # please replace it with your Cohere API key
# Use all-mpnet-base-v2, a strong general-purpose sentence-embedding model (768-dim)
embedding_model = SentenceTransformer('all-mpnet-base-v2')
def normalize_embedding(embedding):
    # Scale to unit length so cosine similarity depends only on direction.
    # Assumes a non-zero vector; a zero vector would divide by zero.
    return embedding / np.linalg.norm(embedding)
def create_embeddings(text_chunks):
    # Encode all chunks in one batch, then L2-normalize each vector
    embeddings = embedding_model.encode(text_chunks)
    normalized_embeddings = [normalize_embedding(emb) for emb in embeddings]
    return [embedding.tolist() for embedding in normalized_embeddings]
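# For example, create_embeddings(["first chunk.", "second chunk."]) returns
# two 768-element lists, ready to be upserted as Pinecone vector values.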
def store_embeddings_in_pinecone(text_chunks):
    embeddings = create_embeddings(text_chunks)
    vectors = [
        {"id": str(i), "values": embedding, "metadata": {"text": chunk}}
        for i, (embedding, chunk) in enumerate(zip(embeddings, text_chunks))
    ]
    index.upsert(vectors=vectors, namespace="document-namespace")
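# The raw chunk text is stored as metadata so query results can return it
# directly, without a separate lookup keyed by vector id.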
def chunk_text(text, max_chunk_size=512):
    sentences = text.split('. ')
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
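# Note: splitting on '. ' is a naive sentence-boundary heuristic; it also
# splits on abbreviations such as "e.g. " and misses sentences ending in
# '?' or '!'. A sentence tokenizer (e.g. nltk.sent_tokenize) would be a
# more robust drop-in replacement here.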
def query_pinecone(query_text):
    query_embedding = normalize_embedding(embedding_model.encode([query_text])[0])
    query_results = index.query(
        vector=query_embedding.tolist(),
        top_k=5,
        include_metadata=True,
        namespace="document-namespace"
    )
    return [match['metadata']['text'] for match in query_results['matches']]
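# Because both stored and query vectors are unit-normalized and the index
# uses the cosine metric, match scores fall in [-1, 1]; top_k=5 keeps the
# five most similar chunks as context for answer generation.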
def generate_answer_from_chunks(query, retrieved_chunks):
    # "Stuff" the retrieved chunks into the prompt as grounding context
    combined_text = " ".join(retrieved_chunks)
    prompt = f"{combined_text}\n\nAnswer the following question: {query}"
    response = cohere_client.generate(
        model='c4ai-aya-23-35b',
        prompt=prompt,
        max_tokens=300
    )
    return response.generations[0].text
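# Note: Cohere's Generate endpoint is a legacy API; newer SDK versions
# recommend the Chat endpoint (cohere_client.chat) for new applications.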
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    reader = PdfReader(uploaded_file)
    document_text = ''
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        document_text += page.extract_text() or ''
    st.write("Extracted Document Text")
    text_chunks = chunk_text(document_text)
    store_embeddings_in_pinecone(text_chunks)
    st.success("Embeddings stored successfully!")
    query = st.text_input("Ask a question based on the document")
    if query:
        retrieved_chunks = query_pinecone(query)
        generated_answer = generate_answer_from_chunks(query, retrieved_chunks)
        st.write("Generated Answer")
        st.write(generated_answer)
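# Caveat: Streamlit reruns the script on every interaction, so each rerun
# re-embeds and re-upserts the uploaded PDF. Since vector ids restart at "0",
# uploading a new document overwrites the previous one's vectors in the
# namespace; caching (e.g. with st.cache_data) could avoid the repeated work.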