Skip to content

Commit

Permalink
refactor: Simplify url embeddings logic in Ingest admin app page (#799)
Browse files Browse the repository at this point in the history
  • Loading branch information
superhindupur authored May 9, 2024
1 parent 4836e6b commit 8b0c345
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 104 deletions.
73 changes: 52 additions & 21 deletions code/backend/batch/AddURLEmbeddings.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import io
import os
import logging
import traceback
import azure.functions as func
from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory
import requests
from bs4 import BeautifulSoup
from utilities.helpers.EnvHelper import EnvHelper
from utilities.helpers.AzureBlobStorageClient import AzureBlobStorageClient
from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory

bp_add_url_embeddings = func.Blueprint()
logger = logging.getLogger(__name__)
Expand All @@ -16,30 +20,57 @@ def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse:
logger.info("Python HTTP trigger function processed a request.")

# Get Url from request
url = req.params.get("url")
if not url:
try:
req_body = req.get_json()
except ValueError:
pass
else:
url = req_body.get("url")
# Check if url is present, compute embeddings and add them to VectorStore
if url:
try:
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(url, ".url")
except Exception:
return func.HttpResponse(
f"Error: {traceback.format_exc()}", status_code=500
)
url = None
try:
url = req.get_json().get("url")
except Exception:
url = None

if not url:
return func.HttpResponse(
f"Embeddings added successfully for {url}", status_code=200
"Please pass a URL on the query string or in the request body",
status_code=400,
)

env_helper: EnvHelper = EnvHelper()
if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
return download_url_and_upload_to_blob(url)
else:
return process_url_contents_directly(url, env_helper)


def process_url_contents_directly(url: str, env_helper: EnvHelper):
try:
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(url, ".url")
except Exception:
logger.error(
f"Error while processing contents of URL {url}: {traceback.format_exc()}"
)
return func.HttpResponse(
"Please pass a url on the query string or in the request body",
status_code=400,
f"Unexpected error occurred while processing the contents of the URL {url}",
status_code=500,
)

return func.HttpResponse(
f"Embeddings added successfully for {url}", status_code=200
)


def download_url_and_upload_to_blob(
    url: str, timeout: float = 30.0
) -> func.HttpResponse:
    """Download a web page, extract its text and upload it to blob storage.

    The page is fetched with ``requests``, reduced to plain text with
    BeautifulSoup's ``html.parser`` and uploaded through
    ``AzureBlobStorageClient.upload_file`` using the URL both as the blob
    name and as the ``title`` metadata value.

    Args:
        url: Address of the page to add to the knowledge base.
        timeout: Seconds to wait for the remote server before giving up.

    Returns:
        A 200 response when the upload succeeds, or a 500 response when the
        download, parsing or upload raises.
    """
    # NOTE(review): `url` is fetched server-side exactly as supplied by the
    # caller; consider restricting schemes/hosts if callers are untrusted.
    try:
        # Without a timeout an unresponsive host would block indefinitely.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures so an error page is never stored as content.
        response.raise_for_status()

        page_text = BeautifulSoup(response.content, "html.parser").get_text()
        with io.BytesIO(page_text.encode("utf-8")) as stream:
            blob_client = AzureBlobStorageClient()
            blob_client.upload_file(stream, url, metadata={"title": url})
    except Exception:
        # logger.exception records the active traceback alongside the message.
        logger.exception("Error while adding URL %s to the knowledge base", url)
        return func.HttpResponse(
            f"Error occurred while adding {url} to the knowledge base.",
            status_code=500,
        )

    return func.HttpResponse(f"URL {url} added to knowledge base", status_code=200)
7 changes: 1 addition & 6 deletions code/backend/batch/BatchStartProcessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@ def batch_start_processing(req: func.HttpRequest) -> func.HttpResponse:
azure_blob_storage_client = AzureBlobStorageClient()
# Get all files from Blob Storage
files_data = azure_blob_storage_client.get_all_files()
# Filter out files that have already been processed
files_data = (
list(filter(lambda x: not x["embeddings_added"], files_data))
if req.params.get("process_all") != "true"
else files_data
)

files_data = list(map(lambda x: {"filename": x["filename"]}, files_data))

# Send a message to the queue for each file
Expand Down
37 changes: 5 additions & 32 deletions code/backend/pages/01_Ingest_Data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import io
from os import path
from bs4 import BeautifulSoup
import streamlit as st
import traceback
import requests
Expand Down Expand Up @@ -31,16 +29,15 @@
st.markdown(mod_page_style, unsafe_allow_html=True)


def remote_convert_files_and_add_embeddings(process_all=False):
def remote_convert_files_and_add_embeddings():
backend_url = urllib.parse.urljoin(
env_helper.BACKEND_URL, "/api/BatchStartProcessing"
)
params = {}
if env_helper.FUNCTION_KEY is not None:
params["code"] = env_helper.FUNCTION_KEY
params["clientId"] = "clientKey"
if process_all:
params["process_all"] = "true"

try:
response = requests.post(backend_url, params=params)
if response.status_code == 200:
Expand All @@ -53,30 +50,9 @@ def remote_convert_files_and_add_embeddings(process_all=False):
st.error(traceback.format_exc())


def add_urls(blob_client: AzureBlobStorageClient):
def add_urls():
urls = st.session_state["urls"].split("\n")
if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
download_url_and_upload_to_blob(blob_client, urls)
else:
add_url_embeddings(urls)


def download_url_and_upload_to_blob(
blob_client: AzureBlobStorageClient, urls: list[str]
):
for url in urls:
try:
response = requests.get(url)
parsed_data = BeautifulSoup(response.content, "html.parser")
with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream:
st.session_state["filename"] = url
st.session_state["file_url"] = blob_client.upload_file(
stream, url, metadata={"title": url}
)
st.success(f"Url {url} added to knowledge base")
except Exception:
logger.error(traceback.format_exc())
st.error(f"Exception occurred while adding {url} to the knowledge base.")
add_url_embeddings(urls)


def add_url_embeddings(urls: list[str]):
Expand Down Expand Up @@ -124,13 +100,10 @@ def add_url_embeddings(urls: list[str]):
)

col1, col2, col3 = st.columns([2, 1, 2])
# with col1:
# st.button("Process and ingest new files", on_click=remote_convert_files_and_add_embeddings)
with col3:
st.button(
"Reprocess all documents in the Azure Storage account",
on_click=remote_convert_files_and_add_embeddings,
args=(True,),
)

with st.expander("Add URLs to the knowledge base", expanded=True):
Expand All @@ -151,7 +124,7 @@ def add_url_embeddings(urls: list[str]):
)
st.button(
"Process and ingest web pages",
on_click=lambda: add_urls(blob_client),
on_click=add_urls,
key="add_url",
)

Expand Down
111 changes: 101 additions & 10 deletions code/tests/test_AddURLEmbeddings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
import os
from unittest.mock import patch
from unittest.mock import ANY, MagicMock, patch
import azure.functions as func


Expand All @@ -10,43 +10,134 @@


@patch("backend.batch.AddURLEmbeddings.EmbedderFactory")
def test_add_url_embeddings_when_url_set_in_body(_):
def test_add_url_embeddings(mock_embedder_factory: MagicMock):
# given
fake_request = func.HttpRequest(
method="POST",
url="",
body=b'{"url": "https://example.com"}',
headers={"Content-Type": "application/json"},
)
mock_embedder_instance = mock_embedder_factory.create.return_value

# when
response = add_url_embeddings.build().get_user_function()(fake_request)

# then
assert response.status_code == 200
mock_embedder_instance.embed_file.assert_called_once_with(
"https://example.com", ".url"
)


@patch("backend.batch.AddURLEmbeddings.EmbedderFactory")
def test_add_url_embeddings_when_url_set_in_param(_):
def test_add_url_embeddings_returns_400_when_url_not_set():
# given
fake_request = func.HttpRequest(
method="POST",
url="",
body=b"",
params={},
)

# when
response = add_url_embeddings.build().get_user_function()(fake_request)

# then
assert response.status_code == 400


@patch("backend.batch.AddURLEmbeddings.EmbedderFactory")
def test_add_url_embeddings_returns_500_when_exception_occurs(
    mock_embedder_factory: MagicMock,
):
    """A failure inside ``embed_file`` must surface as an HTTP 500 response."""
    # given: a JSON request carrying a URL, and an embedder that always fails
    mock_embedder_factory.create.return_value.embed_file.side_effect = Exception(
        "Test exception"
    )
    request = func.HttpRequest(
        method="POST",
        url="",
        body=b'{"url": "https://example.com"}',
        headers={"Content-Type": "application/json"},
    )
    handler = add_url_embeddings.build().get_user_function()

    # when
    response = handler(request)

    # then: the status is 500 and the body carries the generic error message
    expected_message = b"Unexpected error occurred while processing the contents of the URL https://example.com"
    assert response.status_code == 500
    assert expected_message in response.get_body()


@patch("backend.batch.AddURLEmbeddings.EnvHelper")
@patch("backend.batch.AddURLEmbeddings.AzureBlobStorageClient")
@patch("backend.batch.AddURLEmbeddings.requests")
def test_add_url_embeddings_integrated_vectorization(
mock_requests: MagicMock,
mock_blob_storage_client: MagicMock,
mock_env_helper: MagicMock,
):
# given
url = "https://example.com"
fake_request = func.HttpRequest(
method="POST",
url="",
body=b'{"url":"' + url.encode("utf-8") + b'"}',
headers={"Content-Type": "application/json"},
params={"url": "https://example.com"},
)
mock_env_helper_instance = mock_env_helper.return_value
mock_env_helper_instance.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True

mock_get = mock_requests.get
mock_get.return_value.content = "url data"

mock_blob_storage_client_instance = mock_blob_storage_client.return_value

# when
response = add_url_embeddings.build().get_user_function()(fake_request)

# then
assert response.status_code == 200
mock_blob_storage_client_instance.upload_file.assert_called_once_with(
ANY, url, metadata={"title": url}
)


@patch("backend.batch.AddURLEmbeddings.EmbedderFactory")
def test_add_url_embeddings_returns_400_when_url_not_set(_):
@patch("backend.batch.AddURLEmbeddings.EnvHelper")
@patch("backend.batch.AddURLEmbeddings.AzureBlobStorageClient")
@patch("backend.batch.AddURLEmbeddings.requests")
def test_add_url_embeddings_integrated_vectorization_returns_500_when_exception_occurs(
mock_requests: MagicMock,
mock_blob_storage_client: MagicMock,
mock_env_helper: MagicMock,
):
# given
url = "https://example.com"
fake_request = func.HttpRequest(
method="POST",
url="",
body=b"",
params={},
body=b'{"url":"' + url.encode("utf-8") + b'"}',
headers={"Content-Type": "application/json"},
)
mock_env_helper_instance = mock_env_helper.return_value
mock_env_helper_instance.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True

mock_get = mock_requests.get
mock_get.return_value.content = "url data"

mock_blob_storage_client_instance = mock_blob_storage_client.return_value
mock_blob_storage_client_instance.upload_file.side_effect = Exception(
"Test exception"
)

# when
response = add_url_embeddings.build().get_user_function()(fake_request)

assert response.status_code == 400
# then
assert response.status_code == 500
assert (
b"Error occurred while adding https://example.com to the knowledge base."
in response.get_body()
)
Loading

0 comments on commit 8b0c345

Please sign in to comment.