From 30051be9e92b8f1f7f608441d843c521c499f802 Mon Sep 17 00:00:00 2001 From: Ross Smith Date: Wed, 31 Jan 2024 23:09:46 +0000 Subject: [PATCH] Run black/flake8 against code base (#243) * Run black against code base * Amending imports * Add flake8 config --- .flake8 | 3 + code/admin/Admin.py | 29 +- code/admin/pages/01_Ingest_Data.py | 156 +++++++---- code/admin/pages/02_Explore_Data.py | 42 ++- code/admin/pages/03_Delete_Data.py | 35 ++- code/admin/pages/04_Configuration.py | 226 ++++++++++------ code/app/app.py | 247 +++++++++++------- code/batch/AddURLEmbeddings.py | 39 +-- code/batch/BatchPushResults.py | 37 ++- code/batch/BatchStartProcessing.py | 38 ++- code/batch/function_app.py | 4 +- code/utilities/common/Answer.py | 41 +-- code/utilities/common/SourceDocument.py | 119 +++++---- .../document_chunking/DocumentChunkingBase.py | 9 +- .../document_chunking/FixedSizeOverlap.py | 18 +- code/utilities/document_chunking/Layout.py | 20 +- code/utilities/document_chunking/Page.py | 29 +- code/utilities/document_chunking/Paragraph.py | 7 +- .../utilities/document_chunking/Strategies.py | 23 +- code/utilities/document_chunking/__init__.py | 8 +- .../document_loading/DocumentLoadingBase.py | 5 +- code/utilities/document_loading/Layout.py | 20 +- code/utilities/document_loading/Read.py | 14 +- code/utilities/document_loading/Strategies.py | 17 +- code/utilities/document_loading/Web.py | 14 +- .../document_loading/WordDocument.py | 22 +- code/utilities/document_loading/__init__.py | 12 +- .../helpers/AzureBlobStorageHelper.py | 154 ++++++++--- .../helpers/AzureFormRecognizerHelper.py | 90 +++++-- code/utilities/helpers/AzureSearchHelper.py | 51 ++-- code/utilities/helpers/ConfigHelper.py | 135 +++++----- .../helpers/DocumentChunkingHelper.py | 20 +- .../helpers/DocumentLoadingHelper.py | 12 +- .../helpers/DocumentProcessorHelper.py | 16 +- code/utilities/helpers/EnvHelper.py | 142 ++++++---- code/utilities/helpers/LLMHelper.py | 44 +++- code/utilities/helpers/OrchestratorHelper.py | 23 +- code/utilities/loggers/ConversationLogger.py | 48 ++-- code/utilities/loggers/TokenLogger.py | 16 +- code/utilities/orchestrator/LangChainAgent.py | 84 ++++-- .../utilities/orchestrator/OpenAIFunctions.py | 109 +++++--- .../orchestrator/OrchestratorBase.py | 55 ++-- code/utilities/orchestrator/Strategies.py | 11 +- code/utilities/orchestrator/__init__.py | 13 +- code/utilities/parser/OutputParserTool.py | 53 ++-- code/utilities/parser/ParserBase.py | 12 +- code/utilities/parser/__init__.py | 6 +- code/utilities/tools/AnswerProcessingBase.py | 7 +- code/utilities/tools/AnsweringToolBase.py | 9 +- code/utilities/tools/ContentSafetyChecker.py | 43 +-- code/utilities/tools/PostPromptTool.py | 70 +++-- code/utilities/tools/QuestionAnswerTool.py | 49 ++-- code/utilities/tools/TextProcessingTool.py | 37 +-- extensions/backend/http_cwyod/__init__.py | 43 ++- tests/test_AzureBlobStorage.py | 3 +- tests/test_ContentSafetyChecker.py | 8 +- tests/test_DocumentChunking.py | 65 ++--- tests/test_DocumentLoading.py | 14 +- tests/test_DocumentProcessor.py | 42 ++- tests/test_Orchestrator.py | 32 ++- tests/test_OutputParserTool.py | 116 ++++---- 61 files changed, 1822 insertions(+), 1044 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..73646ac28 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203, E501 \ No newline at end of file diff --git a/code/admin/Admin.py b/code/admin/Admin.py index e0112ac9a..8e4ba8b27 100644 --- a/code/admin/Admin.py +++ b/code/admin/Admin.py @@ -1,17 +1,24 @@ import streamlit as st import os import logging +import sys from dotenv import load_dotenv -load_dotenv() -import sys -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +load_dotenv() -logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( + logging.WARNING +) -st.set_page_config(page_title="Admin", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None) +st.set_page_config( + page_title="Admin", + page_icon=os.path.join("images", "favicon.ico"), + layout="wide", + menu_items=None, +) mod_page_style = """ """ st.markdown(mod_page_style, unsafe_allow_html=True) - + + def remote_convert_files_and_add_embeddings(process_all=False): - backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/BatchStartProcessing") + backend_url = urllib.parse.urljoin( + os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/BatchStartProcessing" + ) params = {} - if os.getenv('FUNCTION_KEY') != None: - params['code'] = os.getenv('FUNCTION_KEY') - params['clientId'] = "clientKey" + if os.getenv("FUNCTION_KEY") is not None: + params["code"] = os.getenv("FUNCTION_KEY") + params["clientId"] = "clientKey" if process_all: - params['process_all'] = "true" + params["process_all"] = "true" try: response = requests.post(backend_url, params=params) if response.status_code == 200: - st.success(f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete.") + st.success( + f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete." + ) else: st.error(f"Error: {response.text}") - except Exception as e: + except Exception: st.error(traceback.format_exc()) + def add_urls(): params = {} - if os.getenv('FUNCTION_KEY') != None: - params['code'] = os.getenv('FUNCTION_KEY') - params['clientId'] = "clientKey" - urls = st.session_state['urls'].split('\n') + if os.getenv("FUNCTION_KEY") is not None: + params["code"] = os.getenv("FUNCTION_KEY") + params["clientId"] = "clientKey" + urls = st.session_state["urls"].split("\n") for url in urls: - body = { - "url": url - } - backend_url = urllib.parse.urljoin(os.getenv('BACKEND_URL','http://localhost:7071'), "/api/AddURLEmbeddings") + body = {"url": url} + backend_url = urllib.parse.urljoin( + os.getenv("BACKEND_URL", "http://localhost:7071"), "/api/AddURLEmbeddings" + ) r = requests.post(url=backend_url, params=params, json=body) if not r.ok: - raise ValueError(f'Error {r.status_code}: {r.text}') + raise ValueError(f"Error {r.status_code}: {r.text}") else: - st.success(f'Embeddings added successfully for {url}') + st.success(f"Embeddings added successfully for {url}") -def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None): +def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] = None): # Upload a new file - st.session_state['filename'] = file_name - if content_type == None: + st.session_state["filename"] = file_name + if content_type is None: content_type = mimetypes.MimeTypes().guess_type(file_name)[0] - charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else '' - content_type = content_type if content_type != None else 'text/plain' - account_name = os.getenv('AZURE_BLOB_ACCOUNT_NAME') - account_key = os.getenv('AZURE_BLOB_ACCOUNT_KEY') - container_name = os.getenv('AZURE_BLOB_CONTAINER_NAME') - if account_name == None or account_key == None or container_name == None: - raise ValueError("Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME") + charset = ( + f"; charset={chardet.detect(bytes_data)['encoding']}" + if content_type == "text/plain" + else "" + ) + content_type = content_type if content_type is not None else "text/plain" + account_name = os.getenv("AZURE_BLOB_ACCOUNT_NAME") + account_key = os.getenv("AZURE_BLOB_ACCOUNT_KEY") + container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME") + if account_name is None or account_key is None or container_name is None: + raise ValueError( + "Please provide values for AZURE_BLOB_ACCOUNT_NAME, AZURE_BLOB_ACCOUNT_KEY and AZURE_BLOB_CONTAINER_NAME" + ) connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net" - blob_service_client : BlobServiceClient = BlobServiceClient.from_connection_string(connect_str) + blob_service_client: BlobServiceClient = BlobServiceClient.from_connection_string( + connect_str + ) # Create a blob client using the local file name as the name for the blob - blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name) + blob_client = blob_service_client.get_blob_client( + container=container_name, blob=file_name + ) # Upload the created file - blob_client.upload_blob(bytes_data, overwrite=True, content_settings=ContentSettings(content_type=content_type+charset)) + blob_client.upload_blob( + bytes_data, + overwrite=True, + content_settings=ContentSettings(content_type=content_type + charset), + ) # Generate a SAS URL to the blob and return it - st.session_state['file_url'] = blob_client.url + '?' + generate_blob_sas(account_name, container_name, file_name,account_key=account_key, permission="r", expiry=datetime.utcnow() + timedelta(hours=3)) + st.session_state["file_url"] = ( + blob_client.url + + "?" + + generate_blob_sas( + account_name, + container_name, + file_name, + account_key=account_key, + permission="r", + expiry=datetime.utcnow() + timedelta(hours=3), + ) + ) + try: with st.expander("Add documents in Batch", expanded=True): config = ConfigHelper.get_active_config_or_default() - file_type = [processor.document_type for processor in config.document_processors] - uploaded_files = st.file_uploader("Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.", type=file_type, accept_multiple_files=True) + file_type = [ + processor.document_type for processor in config.document_processors + ] + uploaded_files = st.file_uploader( + "Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.", + type=file_type, + accept_multiple_files=True, + ) if uploaded_files is not None: for up in uploaded_files: # To read file as bytes: bytes_data = up.getvalue() - if st.session_state.get('filename', '') != up.name: + if st.session_state.get("filename", "") != up.name: # Upload a new file upload_file(bytes_data, up.name) if len(uploaded_files) > 0: - st.success(f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs.") + st.success( + f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs." + ) - col1, col2, col3 = st.columns([2,1,2]) + col1, col2, col3 = st.columns([2, 1, 2]) # with col1: # st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings) with col3: - st.button("Reprocess all documents in the Azure Storage account", on_click=remote_convert_files_and_add_embeddings, args=(True,)) + st.button( + "Reprocess all documents in the Azure Storage account", + on_click=remote_convert_files_and_add_embeddings, + args=(True,), + ) with st.expander("Add URLs to the knowledge base", expanded=True): - col1, col2 = st.columns([3,1]) - with col1: - st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100, key="urls") + col1, col2 = st.columns([3, 1]) + with col1: + st.text_area( + "Add a URLs and than click on 'Compute Embeddings'", + placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", + height=100, + key="urls", + ) with col2: - st.selectbox('Embeddings models', [os.getenv('AZURE_OPENAI_EMBEDDING_MODEL')], disabled=True) + st.selectbox( + "Embeddings models", + [os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")], + disabled=True, + ) st.button("Process and ingest web pages", on_click=add_urls, key="add_url") -except Exception as e: +except Exception: st.error(traceback.format_exc()) diff --git a/code/admin/pages/02_Explore_Data.py b/code/admin/pages/02_Explore_Data.py index 68952c396..9231371dc 100644 --- a/code/admin/pages/02_Explore_Data.py +++ b/code/admin/pages/02_Explore_Data.py @@ -5,13 +5,22 @@ import logging import pandas as pd import sys -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utilities.helpers.AzureSearchHelper import AzureSearchHelper from dotenv import load_dotenv + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + load_dotenv() -logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) -st.set_page_config(page_title="Explore Data", page_icon=os.path.join('images','favicon.ico'), layout="wide", menu_items=None) +logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( + logging.WARNING +) +st.set_page_config( + page_title="Explore Data", + page_icon=os.path.join("images", "favicon.ico"), + layout="wide", + menu_items=None, +) mod_page_style = """