Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run black/flake8 against code base #243

Merged
merged 6 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions code/admin/Admin.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import streamlit as st
import os
import logging
import sys
from dotenv import load_dotenv
load_dotenv()

import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)


st.set_page_config(page_title="Admin", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
st.set_page_config(
page_title="Admin",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)

mod_page_style = """
<style>
Expand All @@ -23,14 +30,16 @@
st.markdown(mod_page_style, unsafe_allow_html=True)


col1, col2, col3 = st.columns([1,2,1])
col1, col2, col3 = st.columns([1, 2, 1])
with col1:
st.image(os.path.join('images','logo.png'))
st.image(os.path.join("images", "logo.png"))

st.write("# Chat with your data Solution Accelerator")

st.write("""
st.write(
"""
* If you want to ingest data (pdf, websites, etc.), then use the `Ingest Data` tab
* If you want to explore how your data was chunked, check the `Explore Data` tab
* If you want to adapt the underlying prompts, logging settings and others, use the `Configuration` tab
""")
"""
)
156 changes: 109 additions & 47 deletions code/admin/pages/01_Ingest_Data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import streamlit as st
import os, json
import os
from typing import Optional
import mimetypes
import traceback
Expand All @@ -10,13 +10,22 @@
from azure.storage.blob import BlobServiceClient, generate_blob_sas, ContentSettings
import urllib.parse
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utilities.helpers.ConfigHelper import ConfigHelper
from dotenv import load_dotenv

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
st.set_page_config(page_title="Ingest Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)
st.set_page_config(
page_title="Ingest Data",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)
mod_page_style = """
<style>
#MainMenu {visibility: hidden;}
Expand All @@ -25,92 +34,145 @@
</style>
"""
st.markdown(mod_page_style, unsafe_allow_html=True)



def remote_convert_files_and_add_embeddings(process_all=False):
backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/BatchStartProcessing")
backend_url = urllib.parse.urljoin(
os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/BatchStartProcessing"
)
params = {}
if os.getenv('FUNCTION_KEY') != None:
params['code'] = os.getenv('FUNCTION_KEY')
params['clientId'] = "clientKey"
if os.getenv("FUNCTION_KEY") is not None:
params["code"] = os.getenv("FUNCTION_KEY")
params["clientId"] = "clientKey"
if process_all:
params['process_all'] = "true"
params["process_all"] = "true"
try:
response = requests.post(backend_url, params=params)
if response.status_code == 200:
st.success(f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete.")
st.success(
f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete."
)
else:
st.error(f"Error: {response.text}")
except Exception as e:
except Exception:
st.error(traceback.format_exc())


def add_urls():
params = {}
if os.getenv('FUNCTION_KEY') != None:
params['code'] = os.getenv('FUNCTION_KEY')
params['clientId'] = "clientKey"
urls = st.session_state['urls'].split('\n')
if os.getenv("FUNCTION_KEY") is not None:
params["code"] = os.getenv("FUNCTION_KEY")
params["clientId"] = "clientKey"
urls = st.session_state["urls"].split("\n")
for url in urls:
body = {
"url": url
}
backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/AddURLEmbeddings")
body = {"url": url}
backend_url = urllib.parse.urljoin(
os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/AddURLEmbeddings"
)
r = requests.post(url=backend_url, params=params, json=body)
if not r.ok:
raise ValueError(f'Error {r.status_code}: {r.text}')
raise ValueError(f"Error {r.status_code}: {r.text}")
else:
st.success(f'Embeddings added successfully for {url}')
st.success(f"Embeddings added successfully for {url}")


def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None):
def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None):
# Upload a new file
st.session_state['filename'] = file_name
if content_type == None:
st.session_state["filename"] = file_name
if content_type is None:
content_type = mimetypes.MimeTypes().guess_type(file_name)[0]
charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else ''
content_type = content_type if content_type != None else 'text/plain'
account_name = os.getenv('AZURE_BLOB_ACCOUNT_NAME')
account_key = os.getenv('AZURE_BLOB_ACCOUNT_KEY')
container_name = os.getenv('AZURE_BLOB_CONTAINER_NAME')
if account_name == None or account_key == None or container_name == None:
raise ValueError("Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME")
charset = (
f"; charset={chardet.detect(bytes_data)['encoding']}"
if content_type == "text/plain"
else ""
)
content_type = content_type if content_type is not None else "text/plain"
account_name = os.getenv("AZURE_BLOB_ACCOUNT_NAME")
account_key = os.getenv("AZURE_BLOB_ACCOUNT_KEY")
container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME")
if account_name is None or account_key is None or container_name is None:
raise ValueError(
"Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME"
)
connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client : BlobServiceClient = BlobServiceClient.from_connection_string(connect_str)
blob_service_client: BlobServiceClient = BlobServiceClient.from_connection_string(
connect_str
)
# Create a blob client using the local file name as the name for the blob
blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
blob_client = blob_service_client.get_blob_client(
container=container_name, blob=file_name
)
# Upload the created file
blob_client.upload_blob(bytes_data, overwrite=True, content_settings=ContentSettings(content_type=content_type+charset))
blob_client.upload_blob(
bytes_data,
overwrite=True,
content_settings=ContentSettings(content_type=content_type + charset),
)
# Generate a SAS URL to the blob and return it
st.session_state['file_url'] = blob_client.url + '?' + generate_blob_sas(account_name, container_name, file_name,account_key=account_key, permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
st.session_state["file_url"] = (
blob_client.url
+ "?"
+ generate_blob_sas(
account_name,
container_name,
file_name,
account_key=account_key,
permission="r",
expiry=datetime.utcnow() + timedelta(hours=3),
)
)


try:
with st.expander("Add documents in Batch", expanded=True):
config = ConfigHelper.get_active_config_or_default()
file_type = [processor.document_type for processor in config.document_processors]
uploaded_files = st.file_uploader("Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.", type=file_type, accept_multiple_files=True)
file_type = [
processor.document_type for processor in config.document_processors
]
uploaded_files = st.file_uploader(
"Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.",
type=file_type,
accept_multiple_files=True,
)
if uploaded_files is not None:
for up in uploaded_files:
# To read file as bytes:
bytes_data = up.getvalue()
if st.session_state.get('filename', '') != up.name:
if st.session_state.get("filename", "") != up.name:
# Upload a new file
upload_file(bytes_data, up.name)
if len(uploaded_files) > 0:
st.success(f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs.")
st.success(
f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs."
)

col1, col2, col3 = st.columns([2,1,2])
col1, col2, col3 = st.columns([2, 1, 2])
# with col1:
# st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings)
with col3:
st.button("Reprocess all documents in the Azure Storage account", on_click=remote_convert_files_and_add_embeddings, args=(True,))
st.button(
"Reprocess all documents in the Azure Storage account",
on_click=remote_convert_files_and_add_embeddings,
args=(True,),
)

with st.expander("Add URLs to the knowledge base", expanded=True):
col1, col2 = st.columns([3,1])
with col1:
st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100, key="urls")
col1, col2 = st.columns([3, 1])
with col1:
st.text_area(
"Add a URLs and than click on 'Compute Embeddings'",
placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE",
height=100,
key="urls",
)

with col2:
st.selectbox('Embeddings models', [os.getenv('AZURE_OPENAI_EMBEDDING_MODEL')], disabled=True)
st.selectbox(
"Embeddings models",
[os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")],
disabled=True,
)
st.button("Process and ingest web pages", on_click=add_urls, key="add_url")

except Exception as e:
except Exception:
st.error(traceback.format_exc())
42 changes: 28 additions & 14 deletions code/admin/pages/02_Explore_Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,22 @@
import logging
import pandas as pd
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utilities.helpers.AzureSearchHelper import AzureSearchHelper
from dotenv import load_dotenv

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
st.set_page_config(page_title="Explore Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)
st.set_page_config(
page_title="Explore Data",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)
mod_page_style = """
<style>
#MainMenu {visibility: hidden;}
Expand All @@ -34,20 +43,25 @@


try:
vector_store_helper : AzureSearchHelper = AzureSearchHelper()
vector_store_helper: AzureSearchHelper = AzureSearchHelper()
search_client = vector_store_helper.get_vector_store().client
# get unique document names by getting facets for title field
results = search_client.search("*", facets=["title"])
unique_files = [filename['value'] for filename in results.get_facets()["title"]]
filename = st.selectbox('Select your file:', unique_files)
st.write('Showing chunks for:', filename)

results = search_client.search("*", select="title, content, metadata", filter=f"title eq '{filename}'")

data = [[json.loads(result['metadata'])['chunk'], result['content']] for result in results]
df = pd.DataFrame(data, columns=('Chunk', 'Content')).sort_values(by=['Chunk'])
unique_files = [filename["value"] for filename in results.get_facets()["title"]]
filename = st.selectbox("Select your file:", unique_files)
st.write("Showing chunks for:", filename)

results = search_client.search(
"*", select="title, content, metadata", filter=f"title eq '{filename}'"
)

data = [
[json.loads(result["metadata"])["chunk"], result["content"]]
for result in results
]
df = pd.DataFrame(data, columns=("Chunk", "Content")).sort_values(by=["Chunk"])
st.table(df)


except Exception as e:

except Exception:
st.error(traceback.format_exc())
Loading