diff --git a/code/backend/batch/utilities/common/SourceDocument.py b/code/backend/batch/utilities/common/SourceDocument.py index bb1639708..296dc90eb 100644 --- a/code/backend/batch/utilities/common/SourceDocument.py +++ b/code/backend/batch/utilities/common/SourceDocument.py @@ -3,6 +3,7 @@ import json from urllib.parse import urlparse, quote from ..helpers.AzureBlobStorageClient import AzureBlobStorageClient +from langchain.docstore.document import Document class SourceDocument: @@ -80,8 +81,6 @@ def from_metadata( ) def convert_to_langchain_document(self): - from langchain.docstore.document import Document - return Document( page_content=self.content, metadata={ diff --git a/code/backend/batch/utilities/document_chunking/ChunkingStrategy.py b/code/backend/batch/utilities/document_chunking/ChunkingStrategy.py new file mode 100644 index 000000000..dfe10749d --- /dev/null +++ b/code/backend/batch/utilities/document_chunking/ChunkingStrategy.py @@ -0,0 +1,25 @@ +from enum import Enum + + +class ChunkingStrategy(Enum): + LAYOUT = "layout" + PAGE = "page" + FIXED_SIZE_OVERLAP = "fixed_size_overlap" + PARAGRAPH = "paragraph" + + +class ChunkingSettings: + def __init__(self, chunking: dict): + self.chunking_strategy = ChunkingStrategy(chunking["strategy"]) + self.chunk_size = chunking["size"] + self.chunk_overlap = chunking["overlap"] + + def __eq__(self, other: object) -> bool: + if isinstance(self, other.__class__): + return ( + self.chunking_strategy == other.chunking_strategy + and self.chunk_size == other.chunk_size + and self.chunk_overlap == other.chunk_overlap + ) + else: + return False diff --git a/code/backend/batch/utilities/document_chunking/DocumentChunkingBase.py b/code/backend/batch/utilities/document_chunking/DocumentChunkingBase.py index c2640de63..e2e33bfd2 100644 --- a/code/backend/batch/utilities/document_chunking/DocumentChunkingBase.py +++ b/code/backend/batch/utilities/document_chunking/DocumentChunkingBase.py @@ -2,7 +2,7 @@ from typing import List from abc import ABC, abstractmethod from ..common.SourceDocument import SourceDocument -from .Strategies import ChunkingSettings +from .ChunkingStrategy import ChunkingSettings class DocumentChunkingBase(ABC): diff --git a/code/backend/batch/utilities/document_chunking/FixedSizeOverlap.py b/code/backend/batch/utilities/document_chunking/FixedSizeOverlap.py index 5bb1d01d3..eb7b7618f 100644 --- a/code/backend/batch/utilities/document_chunking/FixedSizeOverlap.py +++ b/code/backend/batch/utilities/document_chunking/FixedSizeOverlap.py @@ -1,7 +1,7 @@ from typing import List from .DocumentChunkingBase import DocumentChunkingBase from langchain.text_splitter import TokenTextSplitter -from .Strategies import ChunkingSettings +from .ChunkingStrategy import ChunkingSettings from ..common.SourceDocument import SourceDocument diff --git a/code/backend/batch/utilities/document_chunking/Layout.py b/code/backend/batch/utilities/document_chunking/Layout.py index df563420c..b9c876b60 100644 --- a/code/backend/batch/utilities/document_chunking/Layout.py +++ b/code/backend/batch/utilities/document_chunking/Layout.py @@ -1,7 +1,7 @@ from typing import List from .DocumentChunkingBase import DocumentChunkingBase from langchain.text_splitter import MarkdownTextSplitter -from .Strategies import ChunkingSettings +from .ChunkingStrategy import ChunkingSettings from ..common.SourceDocument import SourceDocument diff --git a/code/backend/batch/utilities/document_chunking/Page.py b/code/backend/batch/utilities/document_chunking/Page.py index 92ff78903..7c82a14f5 100644 --- a/code/backend/batch/utilities/document_chunking/Page.py +++ b/code/backend/batch/utilities/document_chunking/Page.py @@ -1,7 +1,7 @@ from typing import List from .DocumentChunkingBase import DocumentChunkingBase from langchain.text_splitter import MarkdownTextSplitter -from .Strategies import ChunkingSettings +from .ChunkingStrategy import ChunkingSettings from ..common.SourceDocument import SourceDocument diff --git a/code/backend/batch/utilities/document_chunking/Paragraph.py b/code/backend/batch/utilities/document_chunking/Paragraph.py index 8b2378331..2f13c8f8d 100644 --- a/code/backend/batch/utilities/document_chunking/Paragraph.py +++ b/code/backend/batch/utilities/document_chunking/Paragraph.py @@ -1,6 +1,6 @@ from typing import List from .DocumentChunkingBase import DocumentChunkingBase -from .Strategies import ChunkingSettings +from .ChunkingStrategy import ChunkingSettings from ..common.SourceDocument import SourceDocument diff --git a/code/backend/batch/utilities/document_chunking/Strategies.py b/code/backend/batch/utilities/document_chunking/Strategies.py index 2525f2072..61b0093a8 100644 --- a/code/backend/batch/utilities/document_chunking/Strategies.py +++ b/code/backend/batch/utilities/document_chunking/Strategies.py @@ -1,46 +1,18 @@ -from enum import Enum - - -class ChunkingStrategy(Enum): - LAYOUT = "layout" - PAGE = "page" - FIXED_SIZE_OVERLAP = "fixed_size_overlap" - PARAGRAPH = "paragraph" +from .ChunkingStrategy import ChunkingStrategy +from .Layout import LayoutDocumentChunking +from .Page import PageDocumentChunking +from .FixedSizeOverlap import FixedSizeOverlapDocumentChunking +from .Paragraph import ParagraphDocumentChunking def get_document_chunker(chunking_strategy: str): if chunking_strategy == ChunkingStrategy.LAYOUT.value: - from .Layout import LayoutDocumentChunking - return LayoutDocumentChunking() elif chunking_strategy == ChunkingStrategy.PAGE.value: - from .Page import PageDocumentChunking - return PageDocumentChunking() elif chunking_strategy == ChunkingStrategy.FIXED_SIZE_OVERLAP.value: - from .FixedSizeOverlap import FixedSizeOverlapDocumentChunking - return FixedSizeOverlapDocumentChunking() elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value: - from .Paragraph import ParagraphDocumentChunking - return ParagraphDocumentChunking() else: raise Exception(f"Unknown chunking strategy: {chunking_strategy}") - - -class ChunkingSettings: - def __init__(self, chunking: dict): - self.chunking_strategy = ChunkingStrategy(chunking["strategy"]) - self.chunk_size = chunking["size"] - self.chunk_overlap = chunking["overlap"] - - def __eq__(self, other: object) -> bool: - if isinstance(self, other.__class__): - return ( - self.chunking_strategy == other.chunking_strategy - and self.chunk_size == other.chunk_size - and self.chunk_overlap == other.chunk_overlap - ) - else: - return False diff --git a/code/backend/batch/utilities/document_loading/Strategies.py b/code/backend/batch/utilities/document_loading/Strategies.py index fe58b13d9..83f6568fd 100644 --- a/code/backend/batch/utilities/document_loading/Strategies.py +++ b/code/backend/batch/utilities/document_loading/Strategies.py @@ -1,4 +1,8 @@ from enum import Enum +from .Layout import LayoutDocumentLoading +from .Read import ReadDocumentLoading +from .Web import WebDocumentLoading +from .WordDocument import WordDocumentLoading class LoadingStrategy(Enum): @@ -10,20 +14,12 @@ class LoadingStrategy(Enum): def get_document_loader(loader_strategy: str): if loader_strategy == LoadingStrategy.LAYOUT.value: - from .Layout import LayoutDocumentLoading - return LayoutDocumentLoading() elif loader_strategy == LoadingStrategy.READ.value: - from .Read import ReadDocumentLoading - return ReadDocumentLoading() elif loader_strategy == LoadingStrategy.WEB.value: - from .Web import WebDocumentLoading - return WebDocumentLoading() elif loader_strategy == LoadingStrategy.DOCX.value: - from .WordDocument import WordDocumentLoading - return WordDocumentLoading() else: raise Exception(f"Unknown loader strategy: {loader_strategy}") diff --git a/code/backend/batch/utilities/helpers/DocumentChunkingHelper.py b/code/backend/batch/utilities/helpers/DocumentChunkingHelper.py index 10132afcb..ef40238ff 100644 --- a/code/backend/batch/utilities/helpers/DocumentChunkingHelper.py +++ b/code/backend/batch/utilities/helpers/DocumentChunkingHelper.py @@ -1,10 +1,8 @@ from typing import List from langchain.docstore.document import Document -from ..document_chunking.Strategies import ( - get_document_chunker, - ChunkingSettings, - ChunkingStrategy, -) + +from ..document_chunking.ChunkingStrategy import ChunkingSettings, ChunkingStrategy +from ..document_chunking.Strategies import get_document_chunker __all__ = ["ChunkingStrategy"] diff --git a/code/backend/batch/utilities/helpers/OrchestratorHelper.py b/code/backend/batch/utilities/helpers/OrchestratorHelper.py index ed1dfa80d..3c2ffc93f 100644 --- a/code/backend/batch/utilities/helpers/OrchestratorHelper.py +++ b/code/backend/batch/utilities/helpers/OrchestratorHelper.py @@ -1,6 +1,8 @@ from typing import List + +from ..orchestrator.OrchestrationStrategy import OrchestrationStrategy from ..orchestrator import OrchestrationSettings -from ..orchestrator.Strategies import get_orchestrator, OrchestrationStrategy +from ..orchestrator.Strategies import get_orchestrator __all__ = ["OrchestrationStrategy"] diff --git a/code/backend/batch/utilities/helpers/config/ConfigHelper.py b/code/backend/batch/utilities/helpers/config/ConfigHelper.py index 48a655308..5b14c5cdc 100644 --- a/code/backend/batch/utilities/helpers/config/ConfigHelper.py +++ b/code/backend/batch/utilities/helpers/config/ConfigHelper.py @@ -2,14 +2,13 @@ import json import logging from string import Template + from ..AzureBlobStorageClient import AzureBlobStorageClient -from ...document_chunking.Strategies import ChunkingSettings, ChunkingStrategy +from ...document_chunking.ChunkingStrategy import ChunkingStrategy, ChunkingSettings from ...document_loading import LoadingSettings, LoadingStrategy from .EmbeddingConfig import EmbeddingConfig -from ..OrchestratorHelper import ( - OrchestrationSettings, - OrchestrationStrategy, -) +from ...orchestrator.OrchestrationStrategy import OrchestrationStrategy +from ...orchestrator import OrchestrationSettings from ..EnvHelper import EnvHelper CONFIG_CONTAINER_NAME = "config" diff --git a/code/backend/batch/utilities/orchestrator/OrchestrationStrategy.py b/code/backend/batch/utilities/orchestrator/OrchestrationStrategy.py new file mode 100644 index 000000000..c1b15b8e5 --- /dev/null +++ b/code/backend/batch/utilities/orchestrator/OrchestrationStrategy.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class OrchestrationStrategy(Enum): + OPENAI_FUNCTION = "openai_function" + LANGCHAIN = "langchain" + SEMANTIC_KERNEL = "semantic_kernel" diff --git a/code/backend/batch/utilities/orchestrator/Strategies.py b/code/backend/batch/utilities/orchestrator/Strategies.py index ec57f2034..df47363a3 100644 --- a/code/backend/batch/utilities/orchestrator/Strategies.py +++ b/code/backend/batch/utilities/orchestrator/Strategies.py @@ -1,24 +1,15 @@ -from enum import Enum - - -class OrchestrationStrategy(Enum): - OPENAI_FUNCTION = "openai_function" - LANGCHAIN = "langchain" - SEMANTIC_KERNEL = "semantic_kernel" +from .OrchestrationStrategy import OrchestrationStrategy +from .OpenAIFunctions import OpenAIFunctionsOrchestrator +from .LangChainAgent import LangChainAgent +from .SemanticKernel import SemanticKernelOrchestrator def get_orchestrator(orchestration_strategy: str): if orchestration_strategy == OrchestrationStrategy.OPENAI_FUNCTION.value: - from .OpenAIFunctions import OpenAIFunctionsOrchestrator - return OpenAIFunctionsOrchestrator() elif orchestration_strategy == OrchestrationStrategy.LANGCHAIN.value: - from .LangChainAgent import LangChainAgent - return LangChainAgent() elif orchestration_strategy == OrchestrationStrategy.SEMANTIC_KERNEL.value: - from .SemanticKernel import SemanticKernelOrchestrator - return SemanticKernelOrchestrator() else: raise Exception(f"Unknown orchestration strategy: {orchestration_strategy}") diff --git a/code/backend/batch/utilities/orchestrator/__init__.py b/code/backend/batch/utilities/orchestrator/__init__.py index da087d844..98e33770e 100644 --- a/code/backend/batch/utilities/orchestrator/__init__.py +++ b/code/backend/batch/utilities/orchestrator/__init__.py @@ -2,7 +2,7 @@ from typing import List import os.path import pkgutil -from .Strategies import OrchestrationStrategy +from .OrchestrationStrategy import OrchestrationStrategy class OrchestrationSettings: diff --git a/code/create_app.py b/code/create_app.py index cfbc7effe..a707c7575 100644 --- a/code/create_app.py +++ b/code/create_app.py @@ -9,6 +9,8 @@ import sys import functools from backend.batch.utilities.helpers.EnvHelper import EnvHelper +from backend.batch.utilities.helpers.OrchestratorHelper import Orchestrator +from backend.batch.utilities.helpers.config.ConfigHelper import ConfigHelper from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient from azure.identity import DefaultAzureCredential @@ -216,14 +218,10 @@ def stream_without_data(response): def get_message_orchestrator(): - from backend.batch.utilities.helpers.OrchestratorHelper import Orchestrator - return Orchestrator() def get_orchestrator_config(): - from backend.batch.utilities.helpers.config.ConfigHelper import ConfigHelper - return ConfigHelper.get_active_config_or_default().orchestrator diff --git a/code/tests/utilities/helpers/test_ConfigHelper.py b/code/tests/utilities/helpers/test_ConfigHelper.py index c8d8d1e86..aa7af44cf 100644 --- a/code/tests/utilities/helpers/test_ConfigHelper.py +++ b/code/tests/utilities/helpers/test_ConfigHelper.py @@ -3,7 +3,7 @@ from unittest.mock import patch, MagicMock from backend.batch.utilities.helpers.config.ConfigHelper import ConfigHelper, Config from backend.batch.utilities.helpers.config.EmbeddingConfig import EmbeddingConfig -from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings +from backend.batch.utilities.document_chunking.ChunkingStrategy import ChunkingSettings from backend.batch.utilities.document_loading import LoadingSettings diff --git a/code/tests/utilities/helpers/test_DocumentChunkingHelper.py b/code/tests/utilities/helpers/test_DocumentChunkingHelper.py index 256ced2de..e7b8245bd 100644 --- a/code/tests/utilities/helpers/test_DocumentChunkingHelper.py +++ b/code/tests/utilities/helpers/test_DocumentChunkingHelper.py @@ -1,8 +1,8 @@ from backend.batch.utilities.common.SourceDocument import SourceDocument -from backend.batch.utilities.helpers.DocumentChunkingHelper import ( - DocumentChunking, - ChunkingSettings, +from backend.batch.utilities.helpers.DocumentChunkingHelper import DocumentChunking +from backend.batch.utilities.document_chunking.ChunkingStrategy import ( ChunkingStrategy, + ChunkingSettings, ) # Create a sample document