Run black/flake8 against code base #243

Merged · 6 commits · Jan 31, 2024
.flake8 (3 additions, 0 deletions)

```diff
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E501
```
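For context, these two ignores are the standard black-compatibility settings: black sometimes puts a space before the colon when a slice bound is a compound expression (tripping E203, "whitespace before ':'"), and E501 ("line too long") is redundant because black already enforces the 88-column limit configured above. A minimal illustration of the E203 case:

```python
# Black's preferred slice formatting puts symmetric spaces around the colon
# when the bounds are expressions, which flake8's default E203
# ("whitespace before ':'") would flag without the ignore.
ham = list(range(10))
lower, upper, offset = 2, 8, 1
chunk = ham[lower + offset : upper + offset]
print(chunk)  # [3, 4, 5, 6, 7, 8]
```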
code/admin/Admin.py (19 additions, 10 deletions)

```diff
@@ -1,17 +1,24 @@
 import streamlit as st
 import os
 import logging
+import sys
 from dotenv import load_dotenv
-load_dotenv()
 
-import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+load_dotenv()
 
-logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 
+logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
+    logging.WARNING
+)
 
 
-st.set_page_config(page_title="Admin", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
+st.set_page_config(
+    page_title="Admin",
+    page_icon=os.path.join("images", "favicon.ico"),
+    layout="wide",
+    menu_items=None,
+)
 
 mod_page_style = """
     <style>
@@ -23,14 +30,16 @@
 st.markdown(mod_page_style, unsafe_allow_html=True)
 
 
-col1, col2, col3 = st.columns([1,2,1])
+col1, col2, col3 = st.columns([1, 2, 1])
 with col1:
-    st.image(os.path.join('images','logo.png'))
+    st.image(os.path.join("images", "logo.png"))
 
 st.write("# Chat with your data Solution Accelerator")
 
-st.write("""
+st.write(
+    """
 * If you want to ingest data (pdf, websites, etc.), then use the `Ingest Data` tab
 * If you want to explore how your data was chunked, check the `Explore Data` tab
 * If you want to adapt the underlying prompts, logging settings and others, use the `Configuration` tab
-""")
+"""
+)
```
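Most of the hunks above are mechanical black rewrites: a call that no longer fits in 88 columns is exploded to one argument per line, with a trailing comma. A small sketch reproducing this with black's Python API (assumes `black` is installed; `format_str` is the library entry point behind the CLI):

```python
import black

# The exact call black reformatted in Admin.py above.
src = (
    'st.set_page_config(page_title="Admin", '
    'page_icon=os.path.join("images", "favicon.ico"), '
    'layout="wide", menu_items=None)\n'
)
# format_str applies the same rewrites as the black CLI; line_length matches
# the max-line-length = 88 configured for flake8 in this PR.
print(black.format_str(src, mode=black.Mode(line_length=88)))
# st.set_page_config(
#     page_title="Admin",
#     page_icon=os.path.join("images", "favicon.ico"),
#     layout="wide",
#     menu_items=None,
# )
```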
code/admin/pages/01_Ingest_Data.py (109 additions, 47 deletions)

```diff
@@ -1,5 +1,5 @@
 import streamlit as st
-import os, json
+import os
 from typing import Optional
 import mimetypes
 import traceback
@@ -10,13 +10,22 @@
 from azure.storage.blob import BlobServiceClient, generate_blob_sas, ContentSettings
 import urllib.parse
 import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from utilities.helpers.ConfigHelper import ConfigHelper
 from dotenv import load_dotenv
 
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
 load_dotenv()
 
-logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
-st.set_page_config(page_title="Ingest Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
+logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
+    logging.WARNING
+)
+st.set_page_config(
+    page_title="Ingest Data",
+    page_icon=os.path.join("images", "favicon.ico"),
+    layout="wide",
+    menu_items=None,
+)
 mod_page_style = """
     <style>
         #MainMenu {visibility: hidden;}
@@ -25,92 +34,145 @@
     </style>
 """
 st.markdown(mod_page_style, unsafe_allow_html=True)
 
 
-
 def remote_convert_files_and_add_embeddings(process_all=False):
-    backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/BatchStartProcessing")
+    backend_url = urllib.parse.urljoin(
+        os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/BatchStartProcessing"
+    )
     params = {}
-    if os.getenv('FUNCTION_KEY') != None:
-        params['code'] = os.getenv('FUNCTION_KEY')
-        params['clientId'] = "clientKey"
+    if os.getenv("FUNCTION_KEY") is not None:
+        params["code"] = os.getenv("FUNCTION_KEY")
+        params["clientId"] = "clientKey"
     if process_all:
-        params['process_all'] = "true"
+        params["process_all"] = "true"
     try:
        response = requests.post(backend_url, params=params)
        if response.status_code == 200:
-            st.success(f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete.")
+            st.success(
+                f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete."
+            )
        else:
            st.error(f"Error: {response.text}")
-    except Exception as e:
+    except Exception:
        st.error(traceback.format_exc())
 
 
 def add_urls():
     params = {}
-    if os.getenv('FUNCTION_KEY') != None:
-        params['code'] = os.getenv('FUNCTION_KEY')
-        params['clientId'] = "clientKey"
-    urls = st.session_state['urls'].split('\n')
+    if os.getenv("FUNCTION_KEY") is not None:
+        params["code"] = os.getenv("FUNCTION_KEY")
+        params["clientId"] = "clientKey"
+    urls = st.session_state["urls"].split("\n")
     for url in urls:
-        body = {
-            "url": url
-        }
-        backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/AddURLEmbeddings")
+        body = {"url": url}
+        backend_url = urllib.parse.urljoin(
+            os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/AddURLEmbeddings"
+        )
         r = requests.post(url=backend_url, params=params, json=body)
         if not r.ok:
-            raise ValueError(f'Error {r.status_code}: {r.text}')
+            raise ValueError(f"Error {r.status_code}: {r.text}")
         else:
-            st.success(f'Embeddings added successfully for {url}')
+            st.success(f"Embeddings added successfully for {url}")
 
 
 def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None):
     # Upload a new file
-    st.session_state['filename'] = file_name
-    if content_type == None:
+    st.session_state["filename"] = file_name
+    if content_type is None:
         content_type = mimetypes.MimeTypes().guess_type(file_name)[0]
-    charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else ''
-    content_type = content_type if content_type != None else 'text/plain'
-    account_name = os.getenv('AZURE_BLOB_ACCOUNT_NAME')
-    account_key = os.getenv('AZURE_BLOB_ACCOUNT_KEY')
-    container_name = os.getenv('AZURE_BLOB_CONTAINER_NAME')
-    if account_name == None or account_key == None or container_name == None:
-        raise ValueError("Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME")
+    charset = (
+        f"; charset={chardet.detect(bytes_data)['encoding']}"
+        if content_type == "text/plain"
+        else ""
+    )
+    content_type = content_type if content_type is not None else "text/plain"
+    account_name = os.getenv("AZURE_BLOB_ACCOUNT_NAME")
+    account_key = os.getenv("AZURE_BLOB_ACCOUNT_KEY")
+    container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME")
+    if account_name is None or account_key is None or container_name is None:
+        raise ValueError(
+            "Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME"
+        )
     connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
-    blob_service_client : BlobServiceClient = BlobServiceClient.from_connection_string(connect_str)
+    blob_service_client: BlobServiceClient = BlobServiceClient.from_connection_string(
+        connect_str
+    )
     # Create a blob client using the local file name as the name for the blob
-    blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
+    blob_client = blob_service_client.get_blob_client(
+        container=container_name, blob=file_name
+    )
     # Upload the created file
-    blob_client.upload_blob(bytes_data, overwrite=True, content_settings=ContentSettings(content_type=content_type+charset))
+    blob_client.upload_blob(
+        bytes_data,
+        overwrite=True,
+        content_settings=ContentSettings(content_type=content_type + charset),
+    )
     # Generate a SAS URL to the blob and return it
-    st.session_state['file_url'] = blob_client.url + '?' + generate_blob_sas(account_name, container_name, file_name,account_key=account_key, permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
+    st.session_state["file_url"] = (
+        blob_client.url
+        + "?"
+        + generate_blob_sas(
+            account_name,
+            container_name,
+            file_name,
+            account_key=account_key,
+            permission="r",
+            expiry=datetime.utcnow() + timedelta(hours=3),
+        )
+    )
 
 
 try:
     with st.expander("Add documents in Batch", expanded=True):
         config = ConfigHelper.get_active_config_or_default()
-        file_type = [processor.document_type for processor in config.document_processors]
-        uploaded_files = st.file_uploader("Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.", type=file_type, accept_multiple_files=True)
+        file_type = [
+            processor.document_type for processor in config.document_processors
+        ]
+        uploaded_files = st.file_uploader(
+            "Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.",
+            type=file_type,
+            accept_multiple_files=True,
+        )
         if uploaded_files is not None:
             for up in uploaded_files:
                 # To read file as bytes:
                 bytes_data = up.getvalue()
-                if st.session_state.get('filename', '') != up.name:
+                if st.session_state.get("filename", "") != up.name:
                     # Upload a new file
                     upload_file(bytes_data, up.name)
             if len(uploaded_files) > 0:
-                st.success(f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs.")
+                st.success(
+                    f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs."
+                )
 
-        col1, col2, col3 = st.columns([2,1,2])
+        col1, col2, col3 = st.columns([2, 1, 2])
         # with col1:
         #     st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings)
         with col3:
-            st.button("Reprocess all documents in the Azure Storage account", on_click=remote_convert_files_and_add_embeddings, args=(True,))
+            st.button(
+                "Reprocess all documents in the Azure Storage account",
+                on_click=remote_convert_files_and_add_embeddings,
+                args=(True,),
+            )
 
     with st.expander("Add URLs to the knowledge base", expanded=True):
-        col1, col2 = st.columns([3,1])
-        with col1:
-            st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100, key="urls")
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            st.text_area(
+                "Add a URLs and than click on 'Compute Embeddings'",
+                placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE",
+                height=100,
+                key="urls",
+            )
 
         with col2:
-            st.selectbox('Embeddings models', [os.getenv('AZURE_OPENAI_EMBEDDING_MODEL')], disabled=True)
+            st.selectbox(
+                "Embeddings models",
+                [os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")],
+                disabled=True,
+            )
             st.button("Process and ingest web pages", on_click=add_urls, key="add_url")
 
-except Exception as e:
+except Exception:
     st.error(traceback.format_exc())
```
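Beyond line wrapping, two recurring fixes in this file are flake8-driven: `!= None` and `== None` comparisons become identity checks (E711), and the unused binding in `except Exception as e:` is dropped (F841). A minimal sketch of the E711 pattern used for the function-key params:

```python
import os

params = {}
function_key = os.getenv("FUNCTION_KEY")  # None when the variable is unset

# flake8 E711 flags "!= None"; since None is a singleton, the idiomatic
# (and E711-clean) test is an identity check.
if function_key is not None:
    params["code"] = function_key
    params["clientId"] = "clientKey"
print(params)
```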
code/admin/pages/02_Explore_Data.py (28 additions, 14 deletions)

```diff
@@ -5,13 +5,22 @@
 import logging
 import pandas as pd
 import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from utilities.helpers.AzureSearchHelper import AzureSearchHelper
 from dotenv import load_dotenv
 
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
 load_dotenv()
 
-logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
-st.set_page_config(page_title="Explore Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None)
+logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
+    logging.WARNING
+)
+st.set_page_config(
+    page_title="Explore Data",
+    page_icon=os.path.join("images", "favicon.ico"),
+    layout="wide",
+    menu_items=None,
+)
 mod_page_style = """
     <style>
         #MainMenu {visibility: hidden;}
@@ -34,20 +43,25 @@
 
 
 try:
-    vector_store_helper : AzureSearchHelper = AzureSearchHelper()
+    vector_store_helper: AzureSearchHelper = AzureSearchHelper()
     search_client = vector_store_helper.get_vector_store().client
     # get unique document names by getting facets for title field
     results = search_client.search("*", facets=["title"])
-    unique_files = [filename['value'] for filename in results.get_facets()["title"]]
-    filename = st.selectbox('Select your file:', unique_files)
-    st.write('Showing chunks for:', filename)
-
-    results = search_client.search("*", select="title, content, metadata", filter=f"title eq '{filename}'")
-
-    data = [[json.loads(result['metadata'])['chunk'], result['content']] for result in results]
-    df = pd.DataFrame(data, columns=('Chunk', 'Content')).sort_values(by=['Chunk'])
+    unique_files = [filename["value"] for filename in results.get_facets()["title"]]
+    filename = st.selectbox("Select your file:", unique_files)
+    st.write("Showing chunks for:", filename)
+
+    results = search_client.search(
+        "*", select="title, content, metadata", filter=f"title eq '{filename}'"
+    )
+
+    data = [
+        [json.loads(result["metadata"])["chunk"], result["content"]]
+        for result in results
+    ]
+    df = pd.DataFrame(data, columns=("Chunk", "Content")).sort_values(by=["Chunk"])
     st.table(df)
 
 
-except Exception as e:
-
+except Exception:
     st.error(traceback.format_exc())
```
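For readers unfamiliar with the facet trick in 02_Explore_Data.py: requesting facets on the `title` field returns one bucket per distinct title together with its document count, which is how the page builds its unique-file dropdown without scanning every document. A rough sketch against a hypothetical index, with placeholder endpoint, index name and key (assumes the `azure-search-documents` package and a `title` field marked facetable):

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Placeholder endpoint, index name and key; substitute your own.
client = SearchClient(
    endpoint="https://<service>.search.windows.net",
    index_name="<index>",
    credential=AzureKeyCredential("<admin-key>"),
)

# Faceting on "title" returns one bucket per distinct title, e.g.
# {"title": [{"value": "report.pdf", "count": 12}, ...]}
results = client.search(search_text="*", facets=["title"])
unique_files = [bucket["value"] for bucket in results.get_facets()["title"]]
print(unique_files)
```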