From 8b0c345f3f5a24147c84456f14b02ffde1896d2f Mon Sep 17 00:00:00 2001 From: Bhavana Hindupur Date: Thu, 9 May 2024 14:22:07 +0100 Subject: [PATCH] refactor: Simplify url embeddings logic in Ingest admin app page (#799) --- code/backend/batch/AddURLEmbeddings.py | 73 ++++++++++---- code/backend/batch/BatchStartProcessing.py | 7 +- code/backend/pages/01_Ingest_Data.py | 37 +------ code/tests/test_AddURLEmbeddings.py | 111 +++++++++++++++++++-- code/tests/test_BatchStartProcessing.py | 46 ++------- 5 files changed, 170 insertions(+), 104 deletions(-) diff --git a/code/backend/batch/AddURLEmbeddings.py b/code/backend/batch/AddURLEmbeddings.py index 369a65a72..617c4f082 100644 --- a/code/backend/batch/AddURLEmbeddings.py +++ b/code/backend/batch/AddURLEmbeddings.py @@ -1,9 +1,13 @@ +import io import os import logging import traceback import azure.functions as func -from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory +import requests +from bs4 import BeautifulSoup from utilities.helpers.EnvHelper import EnvHelper +from utilities.helpers.AzureBlobStorageClient import AzureBlobStorageClient +from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory bp_add_url_embeddings = func.Blueprint() logger = logging.getLogger(__name__) @@ -16,30 +20,57 @@ def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse: logger.info("Python HTTP trigger function processed a request.") # Get Url from request - url = req.params.get("url") - if not url: - try: - req_body = req.get_json() - except ValueError: - pass - else: - url = req_body.get("url") - # Check if url is present, compute embeddings and add them to VectorStore - if url: - try: - embedder = EmbedderFactory.create(env_helper) - embedder.embed_file(url, ".url") - except Exception: - return func.HttpResponse( - f"Error: {traceback.format_exc()}", status_code=500 - ) + url = None + try: + url = req.get_json().get("url") + except Exception: + url = None + if not url: return func.HttpResponse( - f"Embeddings added successfully for {url}", status_code=200 + "Please pass a URL on the query string or in the request body", + status_code=400, ) + env_helper: EnvHelper = EnvHelper() + if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION: + return download_url_and_upload_to_blob(url) else: + return process_url_contents_directly(url, env_helper) + + +def process_url_contents_directly(url: str, env_helper: EnvHelper): + try: + embedder = EmbedderFactory.create(env_helper) + embedder.embed_file(url, ".url") + except Exception: + logger.error( + f"Error while processing contents of URL {url}: {traceback.format_exc()}" + ) return func.HttpResponse( - "Please pass a url on the query string or in the request body", - status_code=400, + f"Unexpected error occurred while processing the contents of the URL {url}", + status_code=500, + ) + + return func.HttpResponse( + f"Embeddings added successfully for {url}", status_code=200 + ) + + +def download_url_and_upload_to_blob(url: str): + try: + response = requests.get(url) + parsed_data = BeautifulSoup(response.content, "html.parser") + with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream: + blob_client = AzureBlobStorageClient() + blob_client.upload_file(stream, url, metadata={"title": url}) + return func.HttpResponse(f"URL {url} added to knowledge base", status_code=200) + + except Exception: + logger.error( + f"Error while adding URL {url} to the knowledge base: {traceback.format_exc()}" + ) + return func.HttpResponse( + f"Error occurred while adding {url} to the knowledge base.", + status_code=500, ) diff --git a/code/backend/batch/BatchStartProcessing.py b/code/backend/batch/BatchStartProcessing.py index 5d0884989..79f0aa7dc 100644 --- a/code/backend/batch/BatchStartProcessing.py +++ b/code/backend/batch/BatchStartProcessing.py @@ -20,12 +20,7 @@ def batch_start_processing(req: func.HttpRequest) -> func.HttpResponse: azure_blob_storage_client = AzureBlobStorageClient() # Get all files from Blob Storage files_data = azure_blob_storage_client.get_all_files() - # Filter out files that have already been processed - files_data = ( - list(filter(lambda x: not x["embeddings_added"], files_data)) - if req.params.get("process_all") != "true" - else files_data - ) + files_data = list(map(lambda x: {"filename": x["filename"]}, files_data)) # Send a message to the queue for each file diff --git a/code/backend/pages/01_Ingest_Data.py b/code/backend/pages/01_Ingest_Data.py index 1455bb608..5851b3363 100644 --- a/code/backend/pages/01_Ingest_Data.py +++ b/code/backend/pages/01_Ingest_Data.py @@ -1,6 +1,4 @@ -import io from os import path -from bs4 import BeautifulSoup import streamlit as st import traceback import requests @@ -31,7 +29,7 @@ st.markdown(mod_page_style, unsafe_allow_html=True) -def remote_convert_files_and_add_embeddings(process_all=False): +def remote_convert_files_and_add_embeddings(): backend_url = urllib.parse.urljoin( env_helper.BACKEND_URL, "/api/BatchStartProcessing" ) @@ -39,8 +37,7 @@ def remote_convert_files_and_add_embeddings(process_all=False): if env_helper.FUNCTION_KEY is not None: params["code"] = env_helper.FUNCTION_KEY params["clientId"] = "clientKey" - if process_all: - params["process_all"] = "true" + try: response = requests.post(backend_url, params=params) if response.status_code == 200: @@ -53,30 +50,9 @@ def remote_convert_files_and_add_embeddings(process_all=False): st.error(traceback.format_exc()) -def add_urls(blob_client: AzureBlobStorageClient): +def add_urls(): urls = st.session_state["urls"].split("\n") - if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION: - download_url_and_upload_to_blob(blob_client, urls) - else: - add_url_embeddings(urls) - - -def download_url_and_upload_to_blob( - blob_client: AzureBlobStorageClient, urls: list[str] -): - for url in urls: - try: - response = requests.get(url) - parsed_data = BeautifulSoup(response.content, "html.parser") - with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream: - st.session_state["filename"] = url - st.session_state["file_url"] = blob_client.upload_file( - stream, url, metadata={"title": url} - ) - st.success(f"Url {url} added to knowledge base") - except Exception: - logger.error(traceback.format_exc()) - st.error(f"Exception occurred while adding {url} to the knowledge base.") + add_url_embeddings(urls) def add_url_embeddings(urls: list[str]): @@ -124,13 +100,10 @@ def add_url_embeddings(urls: list[str]): ) col1, col2, col3 = st.columns([2, 1, 2]) - # with col1: - # st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings) with col3: st.button( "Reprocess all documents in the Azure Storage account", on_click=remote_convert_files_and_add_embeddings, - args=(True,), ) with st.expander("Add URLs to the knowledge base", expanded=True): @@ -151,7 +124,7 @@ def add_url_embeddings(urls: list[str]): ) st.button( "Process and ingest web pages", - on_click=lambda: add_urls(blob_client), + on_click=add_urls, key="add_url", ) diff --git a/code/tests/test_AddURLEmbeddings.py b/code/tests/test_AddURLEmbeddings.py index 538f49058..614c28604 100644 --- a/code/tests/test_AddURLEmbeddings.py +++ b/code/tests/test_AddURLEmbeddings.py @@ -1,6 +1,6 @@ import sys import os -from unittest.mock import patch +from unittest.mock import ANY, MagicMock, patch import azure.functions as func @@ -10,43 +10,134 @@ @patch("backend.batch.AddURLEmbeddings.EmbedderFactory") -def test_add_url_embeddings_when_url_set_in_body(_): +def test_add_url_embeddings(mock_embedder_factory: MagicMock): + # given fake_request = func.HttpRequest( method="POST", url="", body=b'{"url": "https://example.com"}', headers={"Content-Type": "application/json"}, ) + mock_embedder_instance = mock_embedder_factory.create.return_value + # when response = add_url_embeddings.build().get_user_function()(fake_request) + # then assert response.status_code == 200 + mock_embedder_instance.embed_file.assert_called_once_with( + "https://example.com", ".url" + ) -@patch("backend.batch.AddURLEmbeddings.EmbedderFactory") -def test_add_url_embeddings_when_url_set_in_param(_): +def test_add_url_embeddings_returns_400_when_url_not_set(): + # given fake_request = func.HttpRequest( method="POST", url="", body=b"", + params={}, + ) + + # when + response = add_url_embeddings.build().get_user_function()(fake_request) + + # then + assert response.status_code == 400 + + +@patch("backend.batch.AddURLEmbeddings.EmbedderFactory") +def test_add_url_embeddings_returns_500_when_exception_occurs( + mock_embedder_factory: MagicMock, +): + # given + fake_request = func.HttpRequest( + method="POST", + url="", + body=b'{"url": "https://example.com"}', + headers={"Content-Type": "application/json"}, + ) + mock_embedder_instance = mock_embedder_factory.create.return_value + mock_embedder_instance.embed_file.side_effect = Exception("Test exception") + + # when + response = add_url_embeddings.build().get_user_function()(fake_request) + + # then + assert response.status_code == 500 + assert ( + b"Unexpected error occurred while processing the contents of the URL https://example.com" + in response.get_body() + ) + + +@patch("backend.batch.AddURLEmbeddings.EnvHelper") +@patch("backend.batch.AddURLEmbeddings.AzureBlobStorageClient") +@patch("backend.batch.AddURLEmbeddings.requests") +def test_add_url_embeddings_integrated_vectorization( + mock_requests: MagicMock, + mock_blob_storage_client: MagicMock, + mock_env_helper: MagicMock, +): + # given + url = "https://example.com" + fake_request = func.HttpRequest( + method="POST", + url="", + body=b'{"url":"' + url.encode("utf-8") + b'"}', headers={"Content-Type": "application/json"}, - params={"url": "https://example.com"}, ) + mock_env_helper_instance = mock_env_helper.return_value + mock_env_helper_instance.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True + + mock_get = mock_requests.get + mock_get.return_value.content = "url data" + mock_blob_storage_client_instance = mock_blob_storage_client.return_value + + # when response = add_url_embeddings.build().get_user_function()(fake_request) + # then assert response.status_code == 200 + mock_blob_storage_client_instance.upload_file.assert_called_once_with( + ANY, url, metadata={"title": url} + ) -@patch("backend.batch.AddURLEmbeddings.EmbedderFactory") -def test_add_url_embeddings_returns_400_when_url_not_set(_): +@patch("backend.batch.AddURLEmbeddings.EnvHelper") +@patch("backend.batch.AddURLEmbeddings.AzureBlobStorageClient") +@patch("backend.batch.AddURLEmbeddings.requests") +def test_add_url_embeddings_integrated_vectorization_returns_500_when_exception_occurs( + mock_requests: MagicMock, + mock_blob_storage_client: MagicMock, + mock_env_helper: MagicMock, +): + # given + url = "https://example.com" fake_request = func.HttpRequest( method="POST", url="", - body=b"", - params={}, + body=b'{"url":"' + url.encode("utf-8") + b'"}', + headers={"Content-Type": "application/json"}, ) + mock_env_helper_instance = mock_env_helper.return_value + mock_env_helper_instance.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True + + mock_get = mock_requests.get + mock_get.return_value.content = "url data" + mock_blob_storage_client_instance = mock_blob_storage_client.return_value + mock_blob_storage_client_instance.upload_file.side_effect = Exception( + "Test exception" + ) + + # when response = add_url_embeddings.build().get_user_function()(fake_request) - assert response.status_code == 400 + # then + assert response.status_code == 500 + assert ( + b"Error occurred while adding https://example.com to the knowledge base." + in response.get_body() + ) diff --git a/code/tests/test_BatchStartProcessing.py b/code/tests/test_BatchStartProcessing.py index 413c13e23..9b5fbe75e 100644 --- a/code/tests/test_BatchStartProcessing.py +++ b/code/tests/test_BatchStartProcessing.py @@ -1,6 +1,6 @@ import sys import os -from unittest.mock import patch, Mock +from unittest.mock import call, patch, Mock sys.path.append(os.path.join(os.path.dirname(sys.path[0]), "backend", "batch")) @@ -12,49 +12,25 @@ def test_batch_start_processing_processes_all( mock_blob_storage_client, mock_create_queue_client ): + # given mock_http_request = Mock() mock_http_request.params = dict() - mock_http_request.params["process_all"] = "true" mock_queue_client = Mock() mock_create_queue_client.return_value = mock_queue_client - mock_blob_storage_client.return_value.get_all_files.return_value = [ - {"filename": "file_name_one", "embeddings_added": False} + {"filename": "file_name_one", "embeddings_added": False}, + {"filename": "file_name_two", "embeddings_added": True}, ] + # when response = batch_start_processing.build().get_user_function()(mock_http_request) + # then assert response.status_code == 200 + assert response.get_body() == b"Conversion started successfully for 2 documents." - mock_queue_client.send_message.assert_called_once_with( - b'{"filename": "file_name_one"}', - ) - - -@patch("backend.batch.BatchStartProcessing.create_queue_client") -@patch("backend.batch.BatchStartProcessing.AzureBlobStorageClient") -def test_batch_start_processing_filters_filter_no_embeddings( - mock_blob_storage_client, mock_create_queue_client -): - mock_http_request = Mock() - mock_http_request.params = dict() - mock_http_request.params["process_all"] = "false" - - mock_queue_client = Mock() - mock_create_queue_client.return_value = mock_queue_client - - mock_blob_storage_client.return_value.get_all_files.return_value = [ - { - "filename": "file_name_one", - "embeddings_added": True, # will get filtered out - }, - {"filename": "file_name_two", "embeddings_added": False}, - ] - response = batch_start_processing.build().get_user_function()(mock_http_request) - - assert response.status_code == 200 - - mock_queue_client.send_message.assert_called_once_with( - b'{"filename": "file_name_two"}', - ) + send_message_calls = mock_queue_client.send_message.call_args_list + assert len(send_message_calls) == 2 + assert send_message_calls[0] == call(b'{"filename": "file_name_one"}') + assert send_message_calls[1] == call(b'{"filename": "file_name_two"}')