Skip to content

Commit

Permalink
Merge branch 'main' into ross/dev-requirements
Browse files Browse the repository at this point in the history
  • Loading branch information
ross-p-smith authored Jan 31, 2024
2 parents 1078fe6 + 30051be commit 148ed64
Show file tree
Hide file tree
Showing 61 changed files with 1,822 additions and 1,044 deletions.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203, E501
29 changes: 19 additions & 10 deletions code/admin/Admin.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import streamlit as st
import os
import logging
import sys
from dotenv import load_dotenv
load_dotenv()

import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)


st.set_page_config(page_title="Admin", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
st.set_page_config(
page_title="Admin",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)

mod_page_style = """
<style>
Expand All @@ -23,14 +30,16 @@
st.markdown(mod_page_style, unsafe_allow_html=True)


col1, col2, col3 = st.columns([1,2,1])
col1, col2, col3 = st.columns([1, 2, 1])
with col1:
st.image(os.path.join('images','logo.png'))
st.image(os.path.join("images", "logo.png"))

st.write("# Chat with your data Solution Accelerator")

st.write("""
st.write(
"""
* If you want to ingest data (pdf, websites, etc.), then use the `Ingest Data` tab
* If you want to explore how your data was chunked, check the `Explore Data` tab
* If you want to adapt the underlying prompts, logging settings and others, use the `Configuration` tab
""")
"""
)
156 changes: 109 additions & 47 deletions code/admin/pages/01_Ingest_Data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import streamlit as st
import os, json
import os
from typing import Optional
import mimetypes
import traceback
Expand All @@ -10,13 +10,22 @@
from azure.storage.blob import BlobServiceClient, generate_blob_sas, ContentSettings
import urllib.parse
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utilities.helpers.ConfigHelper import ConfigHelper
from dotenv import load_dotenv

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
st.set_page_config(page_title="Ingest Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)
st.set_page_config(
page_title="Ingest Data",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)
mod_page_style = """
<style>
#MainMenu {visibility: hidden;}
Expand All @@ -25,92 +34,145 @@
</style>
"""
st.markdown(mod_page_style, unsafe_allow_html=True)



def remote_convert_files_and_add_embeddings(process_all=False):
    """Ask the backend to start batch processing and show the outcome in the UI.

    POSTs to ``/api/BatchStartProcessing`` on the host given by ``BACKEND_URL``
    (default ``http://localhost:7071``). When ``FUNCTION_KEY`` is set it is sent
    as the ``code`` query parameter together with ``clientId=clientKey``.

    Args:
        process_all: When truthy, ``process_all=true`` is added to the query
            string.

    Returns:
        None. A 200 response is reported with ``st.success``; any other status
        code, or any exception raised while calling the backend, is reported
        with ``st.error``.
    """
    # Without a timeout ``requests`` waits indefinitely on an unresponsive
    # backend, which would freeze this click handler.
    # NOTE(review): 240 s is a deliberately generous ceiling - tune it to the
    # backend's real worst case.
    request_timeout_seconds = 240

    backend_url = urllib.parse.urljoin(
        os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/BatchStartProcessing"
    )

    # Read the key once so the check and the value sent cannot disagree.
    function_key = os.getenv("FUNCTION_KEY")
    params = {}
    if function_key is not None:
        params["code"] = function_key
        params["clientId"] = "clientKey"
    if process_all:
        params["process_all"] = "true"

    try:
        response = requests.post(
            backend_url, params=params, timeout=request_timeout_seconds
        )
        if response.status_code == 200:
            st.success(
                f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete."
            )
        else:
            st.error(f"Error: {response.text}")
    except Exception:
        # Top-level UI boundary: surface the full traceback (timeouts included)
        # on the page instead of crashing the callback.
        st.error(traceback.format_exc())


def add_urls():
    """Submit every URL typed into the ``urls`` text area to the backend.

    Reads ``st.session_state["urls"]``, treats each non-blank line as one URL
    and POSTs it as ``{"url": <line>}`` to ``/api/AddURLEmbeddings`` on the host
    given by ``BACKEND_URL`` (default ``http://localhost:7071``). When
    ``FUNCTION_KEY`` is set it is sent as the ``code`` query parameter together
    with ``clientId=clientKey``. Each accepted URL is confirmed with
    ``st.success``.

    Raises:
        ValueError: If the backend answers a URL with a non-OK status. URLs
            listed after the failing one are not submitted.
    """
    # Without a timeout ``requests`` waits indefinitely on an unresponsive
    # backend.
    # NOTE(review): 240 s is a deliberately generous ceiling - tune it to the
    # backend's real worst case.
    request_timeout_seconds = 240

    function_key = os.getenv("FUNCTION_KEY")
    params = {}
    if function_key is not None:
        params["code"] = function_key
        params["clientId"] = "clientKey"

    # Drop blank lines and surrounding whitespace (including a stray "\r") so
    # that an empty or padded text area never posts a bogus URL.
    urls = [
        line.strip()
        for line in st.session_state["urls"].splitlines()
        if line.strip()
    ]
    if not urls:
        st.warning("Please enter at least one URL.")
        return

    # The endpoint does not depend on the URL being processed; build it once.
    backend_url = urllib.parse.urljoin(
        os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/AddURLEmbeddings"
    )

    for url in urls:
        r = requests.post(
            url=backend_url,
            params=params,
            json={"url": url},
            timeout=request_timeout_seconds,
        )
        if not r.ok:
            raise ValueError(f"Error {r.status_code}: {r.text}")
        st.success(f"Embeddings added successfully for {url}")


def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None):
    """Upload a file to blob storage and record a read-only SAS URL for it.

    Side effects on ``st.session_state``: ``"filename"`` is set to
    ``file_name`` before the upload starts, and ``"file_url"`` is set to the
    blob URL plus a read-only SAS token after the upload.

    Args:
        bytes_data: Raw content to store in the blob.
        file_name: Name used for the blob; also used to guess the content type.
        content_type: MIME type to store with the blob. When ``None`` it is
            guessed from ``file_name``; if that guess fails, ``text/plain`` is
            used.

    Raises:
        ValueError: If ``AZURE_BLOB_ACCOUNT_NAME``, ``AZURE_BLOB_ACCOUNT_KEY``
            or ``AZURE_BLOB_CONTAINER_NAME`` is not set.
    """
    # Imported locally so this function does not depend on how the module's
    # top-level ``datetime`` import is written.
    from datetime import timezone

    st.session_state["filename"] = file_name

    if content_type is None:
        content_type = mimetypes.MimeTypes().guess_type(file_name)[0]

    # Always bind ``charset`` so the upload below can never hit an unbound
    # name. A charset suffix is only added for text/plain, and only when an
    # encoding was actually detected - otherwise the stored content type would
    # read "charset=None".
    charset = ""
    if content_type == "text/plain":
        detected_encoding = chardet.detect(bytes_data)["encoding"]
        if detected_encoding:
            charset = f"; charset={detected_encoding}"
    if content_type is None:
        # Unknown extension: fall back to text/plain, without a charset.
        content_type = "text/plain"

    account_name = os.getenv("AZURE_BLOB_ACCOUNT_NAME")
    account_key = os.getenv("AZURE_BLOB_ACCOUNT_KEY")
    container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME")
    if account_name is None or account_key is None or container_name is None:
        raise ValueError(
            "Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME"
        )

    connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
    blob_service_client: BlobServiceClient = BlobServiceClient.from_connection_string(
        connect_str
    )
    # The blob is named after the uploaded file.
    blob_client = blob_service_client.get_blob_client(
        container=container_name, blob=file_name
    )
    # An existing blob with the same name is replaced.
    blob_client.upload_blob(
        bytes_data,
        overwrite=True,
        content_settings=ContentSettings(content_type=content_type + charset),
    )

    # Read-only SAS token valid for three hours. An aware UTC timestamp
    # replaces the deprecated ``datetime.utcnow()`` and denotes the same
    # instant.
    sas_token = generate_blob_sas(
        account_name,
        container_name,
        file_name,
        account_key=account_key,
        permission="r",
        expiry=datetime.now(timezone.utc) + timedelta(hours=3),
    )
    st.session_state["file_url"] = blob_client.url + "?" + sas_token


# Page body: one expander for batch document upload and reprocessing, one for
# adding web pages by URL. Any exception is shown on the page as a traceback.
# NOTE(review): widget nesting below was reconstructed from a diff view that
# carried no indentation - confirm the layout against the running page.
try:
    with st.expander("Add documents in Batch", expanded=True):
        config = ConfigHelper.get_active_config_or_default()
        # Restrict the uploader to the document types that have a configured
        # processor.
        file_type = [
            processor.document_type for processor in config.document_processors
        ]
        uploaded_files = st.file_uploader(
            "Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.",
            type=file_type,
            accept_multiple_files=True,
        )
        if uploaded_files:
            for up in uploaded_files:
                # Skip the file whose name was recorded by the most recent
                # upload.
                if st.session_state.get("filename", "") != up.name:
                    upload_file(up.getvalue(), up.name)
            st.success(
                f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs."
            )

        col1, col2, col3 = st.columns([2, 1, 2])
        # with col1:
        #     st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings)
        with col3:
            st.button(
                "Reprocess all documents in the Azure Storage account",
                on_click=remote_convert_files_and_add_embeddings,
                args=(True,),
            )

    with st.expander("Add URLs to the knowledge base", expanded=True):
        col1, col2 = st.columns([3, 1])
        with col1:
            # The label names the button actually rendered in this expander.
            st.text_area(
                "Add URLs and then click on 'Process and ingest web pages'",
                placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE",
                height=100,
                key="urls",
            )

        with col2:
            # Read-only display of the configured embedding model.
            st.selectbox(
                "Embeddings models",
                [os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")],
                disabled=True,
            )
            st.button("Process and ingest web pages", on_click=add_urls, key="add_url")

except Exception:
    st.error(traceback.format_exc())
42 changes: 28 additions & 14 deletions code/admin/pages/02_Explore_Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,22 @@
import logging
import pandas as pd
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utilities.helpers.AzureSearchHelper import AzureSearchHelper
from dotenv import load_dotenv

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

load_dotenv()

logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
st.set_page_config(page_title="Explore Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING
)
st.set_page_config(
page_title="Explore Data",
page_icon=os.path.join("images", "favicon.ico"),
layout="wide",
menu_items=None,
)
mod_page_style = """
<style>
#MainMenu {visibility: hidden;}
Expand All @@ -34,20 +43,25 @@


# Page body: let the user pick an indexed document by title and list its
# chunks. Any exception is shown on the page as a traceback.
try:
    vector_store_helper: AzureSearchHelper = AzureSearchHelper()
    search_client = vector_store_helper.get_vector_store().client

    # Get unique document names from the facets of the title field.
    # NOTE(review): facet queries may cap the number of values returned by
    # default - confirm every title is listed for large indexes.
    results = search_client.search("*", facets=["title"])
    unique_files = [facet["value"] for facet in results.get_facets()["title"]]
    filename = st.selectbox("Select your file:", unique_files)

    if filename is None:
        # Nothing to select; avoid querying for the literal title 'None'.
        st.info("No documents found in the search index.")
    else:
        st.write("Showing chunks for:", filename)

        # OData string literals escape a single quote by doubling it. Without
        # this, a title containing "'" produces an invalid filter expression.
        escaped_title = filename.replace("'", "''")
        results = search_client.search(
            "*",
            select="title, content, metadata",
            filter=f"title eq '{escaped_title}'",
        )

        # One row per search hit: chunk number taken from the JSON metadata,
        # plus the chunk text.
        data = [
            [json.loads(result["metadata"])["chunk"], result["content"]]
            for result in results
        ]
        df = pd.DataFrame(data, columns=("Chunk", "Content")).sort_values(
            by=["Chunk"]
        )
        st.table(df)

except Exception:
    st.error(traceback.format_exc())
Loading

0 comments on commit 148ed64

Please sign in to comment.