diff --git a/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py b/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py
index 632cdbd38..9be9fb858 100644
--- a/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py
+++ b/code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py
@@ -27,11 +27,26 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
             skillset_name=skillset_name,
             target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
             data_source_name=self.env_helper.AZURE_SEARCH_DATASOURCE_NAME,
+            parameters={
+                "configuration": {
+                    "dataToExtract": "contentAndMetadata",
+                    "parsingMode": "default",
+                    "imageAction": "generateNormalizedImages",
+                }
+            },
             field_mappings=[
                 FieldMapping(
                     source_field_name="metadata_storage_path",
                     target_field_name="source",
                 ),
+                FieldMapping(
+                    source_field_name="/document/normalized_images/*/text",
+                    target_field_name="text",
+                ),
+                FieldMapping(
+                    source_field_name="/document/normalized_images/*/layoutText",
+                    target_field_name="layoutText",
+                ),
             ],
         )
         indexer_result = self.indexer_client.create_or_update_indexer(indexer)
diff --git a/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py b/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py
index f80b3e6d1..622fa3152 100644
--- a/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py
+++ b/code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py
@@ -4,6 +4,8 @@
     InputFieldMappingEntry,
     OutputFieldMappingEntry,
     AzureOpenAIEmbeddingSkill,
+    OcrSkill,
+    MergeSkill,
     SearchIndexerIndexProjections,
     SearchIndexerIndexProjectionSelector,
     SearchIndexerIndexProjectionsParameters,
@@ -39,6 +41,38 @@ def __init__(
     def create_skillset(self):
         skillset_name = f"{self.env_helper.AZURE_SEARCH_INDEX}-skillset"
 
+        ocr_skill = OcrSkill(
+            description="Extract text (plain and structured) from image",
+            context="/document/normalized_images/*",
+            inputs=[
+                InputFieldMappingEntry(
+                    name="image",
+                    source="/document/normalized_images/*",
+                )
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="text", target_name="text"),
+                OutputFieldMappingEntry(name="layoutText", target_name="layoutText"),
+            ],
+        )
+
+        merge_skill = MergeSkill(
+            description="Merge text from OCR and text from document",
+            context="/document",
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/content"),
+                InputFieldMappingEntry(
+                    name="itemsToInsert", source="/document/normalized_images/*/text"
+                ),
+                InputFieldMappingEntry(
+                    name="offsets", source="/document/normalized_images/*/contentOffset"
+                ),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
+            ],
+        )
+
         split_skill = SplitSkill(
             description="Split skill to chunk documents",
             text_split_mode="pages",
@@ -46,7 +80,7 @@ def create_skillset(self):
             maximum_page_length=self.integrated_vectorization_config.max_page_length,
             page_overlap_length=self.integrated_vectorization_config.page_overlap_length,
             inputs=[
-                InputFieldMappingEntry(name="text", source="/document/content"),
+                InputFieldMappingEntry(name="text", source="/document/merged_content"),
             ],
             outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
         )
@@ -98,7 +132,7 @@ def create_skillset(self):
         skillset = SearchIndexerSkillset(
             name=skillset_name,
             description="Skillset to chunk documents and generating embeddings",
-            skills=[split_skill, embedding_skill],
+            skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
             index_projections=index_projections,
         )
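
A note on the new indexer configuration above: imageAction "generateNormalizedImages" is what makes the service emit /document/normalized_images/* for the OcrSkill to consume. For intuition, here is the same configuration dict on its own, with two optional image-sizing settings shown commented out; those two settings are an assumption drawn from the service's documented indexer configuration surface and are NOT part of this change (service defaults apply when omitted):

    # Sketch of the indexer configuration used in this change. The commented
    # size limits are optional service-side settings, shown only for context.
    parameters = {
        "configuration": {
            "dataToExtract": "contentAndMetadata",
            "parsingMode": "default",
            "imageAction": "generateNormalizedImages",
            # "normalizedImageMaxWidth": 2000,   # assumption: optional, service default
            # "normalizedImageMaxHeight": 2000,  # assumption: optional, service default
        }
    }
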
diff --git a/code/tests/functional/conftest.py b/code/tests/functional/conftest.py
index 752ea49ac..f416d9c65 100644
--- a/code/tests/functional/conftest.py
+++ b/code/tests/functional/conftest.py
@@ -143,23 +143,83 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
     ).respond_with_json(
         {
             "name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
-            "description": "Extract entities, detect language and extract key-phrases",
+            "description": "Skillset to chunk documents and generating embeddings",
             "skills": [
                 {
-                    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
-                    "name": "#3",
-                    "description": None,
-                    "context": None,
+                    "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
+                    "description": "Extract text (plain and structured) from image",
+                    "context": "/document/normalized_images/*",
+                    "inputs": [
+                        {"name": "image", "source": "/document/normalized_images/*"}
+                    ],
+                    "outputs": [
+                        {"name": "text", "targetName": "text"},
+                        {"name": "layoutText", "targetName": "layoutText"},
+                    ],
+                    "detectOrientation": False,
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
+                    "description": "Merge text from OCR and text from document",
+                    "context": "/document",
                     "inputs": [
                         {"name": "text", "source": "/document/content"},
-                        {"name": "languageCode", "source": "/document/languageCode"},
+                        {
+                            "name": "itemsToInsert",
+                            "source": "/document/normalized_images/*/text",
+                        },
+                        {
+                            "name": "offsets",
+                            "source": "/document/normalized_images/*/contentOffset",
+                        },
                     ],
+                    "outputs": [{"name": "mergedText", "targetName": "merged_content"}],
+                    "insertPreTag": " ",
+                    "insertPostTag": " ",
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
+                    "description": "Split skill to chunk documents",
+                    "context": "/document",
+                    "inputs": [{"name": "text", "source": "/document/merged_content"}],
                     "outputs": [{"name": "textItems", "targetName": "pages"}],
-                    "defaultLanguageCode": None,
                     "textSplitMode": "pages",
-                    "maximumPageLength": 4000,
+                    "maximumPageLength": 800,
+                    "pageOverlapLength": 100,
+                },
+                {
+                    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
+                    "description": "Skill to generate embeddings via Azure OpenAI",
+                    "context": "/document/pages/*",
+                    "inputs": [{"name": "text", "source": "/document/pages/*"}],
+                    "outputs": [{"name": "embedding", "targetName": "content_vector"}],
+                    "resourceUri": f"https://localhost:{httpserver.port}/",
+                    "deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
+                    "apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
                 },
             ],
+            "indexProjections": {
+                "selectors": [
+                    {
+                        "targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
+                        "parentKeyFieldName": "id",
+                        "sourceContext": "/document/pages/*",
+                        "mappings": [
+                            {"name": "content", "source": "/document/pages/*"},
+                            {
+                                "name": "content_vector",
+                                "source": "/document/pages/*/content_vector",
+                            },
+                            {"name": "title", "source": "/document/title"},
+                            {
+                                "name": "source",
+                                "source": "/document/metadata_storage_path",
+                            },
+                        ],
+                    }
+                ],
+                "parameters": {"projectionMode": "skipIndexingParentDocuments"},
+            },
         },
         status=201,
     )
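
For intuition about the MergeSkill mock above: the skill splices each OCR text fragment into the parent document's content at its contentOffset, wrapping each fragment with insertPreTag/insertPostTag (a single space here). The real merge is performed by the search service; the following is only a rough pure-Python approximation of that behavior:

    def merge_content(content, items_to_insert, offsets, pre=" ", post=" "):
        # Splice each OCR fragment into the document text at its offset,
        # approximating the output shape of #Microsoft.Skills.Text.MergeSkill.
        out, last = [], 0
        for text, offset in sorted(zip(items_to_insert, offsets), key=lambda p: p[1]):
            out.append(content[last:offset])
            out.append(pre + text + post)
            last = offset
        out.append(content[last:])
        return "".join(out)
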
diff --git a/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py b/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py
index 21e252be1..30ea6c9ed 100644
--- a/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py
+++ b/code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py
@@ -284,6 +284,92 @@ def test_integrated_vectorization_skillset_created(
             method="PUT",
             query_string="api-version=2023-10-01-Preview",
             times=1,
+            json={
+                "name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
+                "description": "Skillset to chunk documents and generating embeddings",
+                "skills": [
+                    {
+                        "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
+                        "description": "Extract text (plain and structured) from image",
+                        "context": "/document/normalized_images/*",
+                        "inputs": [
+                            {"name": "image", "source": "/document/normalized_images/*"}
+                        ],
+                        "outputs": [
+                            {"name": "text", "targetName": "text"},
+                            {"name": "layoutText", "targetName": "layoutText"},
+                        ],
+                        "detectOrientation": False,
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
+                        "description": "Merge text from OCR and text from document",
+                        "context": "/document",
+                        "inputs": [
+                            {"name": "text", "source": "/document/content"},
+                            {
+                                "name": "itemsToInsert",
+                                "source": "/document/normalized_images/*/text",
+                            },
+                            {
+                                "name": "offsets",
+                                "source": "/document/normalized_images/*/contentOffset",
+                            },
+                        ],
+                        "outputs": [
+                            {"name": "mergedText", "targetName": "merged_content"}
+                        ],
+                        "insertPreTag": " ",
+                        "insertPostTag": " ",
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
+                        "description": "Split skill to chunk documents",
+                        "context": "/document",
+                        "inputs": [
+                            {"name": "text", "source": "/document/merged_content"}
+                        ],
+                        "outputs": [{"name": "textItems", "targetName": "pages"}],
+                        "textSplitMode": "pages",
+                        "maximumPageLength": 800,
+                        "pageOverlapLength": 100,
+                    },
+                    {
+                        "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
+                        "description": "Skill to generate embeddings via Azure OpenAI",
+                        "context": "/document/pages/*",
+                        "inputs": [{"name": "text", "source": "/document/pages/*"}],
+                        "outputs": [
+                            {"name": "embedding", "targetName": "content_vector"}
+                        ],
+                        "resourceUri": f"https://localhost:{httpserver.port}/",
+                        "deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
+                        "apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
+                    },
+                ],
+                "indexProjections": {
+                    "selectors": [
+                        {
+                            "targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
+                            "parentKeyFieldName": "id",
+                            "sourceContext": "/document/pages/*",
+                            "mappings": [
+                                {"name": "content", "source": "/document/pages/*"},
+                                {
+                                    "name": "content_vector",
+                                    "source": "/document/pages/*/content_vector",
+                                },
+                                {"name": "title", "source": "/document/title"},
+                                {
+                                    "name": "source",
+                                    "source": "/document/metadata_storage_path",
+                                },
+                            ],
+                        }
+                    ],
+                    "parameters": {"projectionMode": "skipIndexingParentDocuments"},
+                },
+            },
         ),
     )
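
These functional tests assert the request shape sent to the search service. Beyond that, a quick end-to-end sanity check against a real service is to run the indexer and inspect its status once the resources exist. A minimal sketch, assuming RBAC auth and placeholder endpoint/indexer names:

    from azure.identity import DefaultAzureCredential
    from azure.search.documents.indexes import SearchIndexerClient

    # Placeholder endpoint and indexer name, for illustration only.
    client = SearchIndexerClient(
        endpoint="https://<search-service>.search.windows.net",
        credential=DefaultAzureCredential(),
    )
    client.run_indexer("<index>-indexer")
    print(client.get_indexer_status("<index>-indexer").last_result.status)
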
"default", + "imageAction": "generateNormalizedImages", + } + }, field_mappings=ANY, ) @@ -88,6 +95,13 @@ def test_create_or_update_indexer_rbac( skillset_name="skillset_name", target_index_name=env_helper_mock.AZURE_SEARCH_INDEX, data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME, + parameters={ + "configuration": { + "dataToExtract": "contentAndMetadata", + "parsingMode": "default", + "imageAction": "generateNormalizedImages", + } + }, field_mappings=ANY, ) diff --git a/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py b/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py index 19150b948..95b453047 100644 --- a/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py +++ b/code/tests/utilities/integrated_vectorization/test_azure_search_skillset.py @@ -6,6 +6,8 @@ from azure.search.documents.indexes.models import ( SearchIndexerSkillset, SplitSkill, + OcrSkill, + MergeSkill, AzureOpenAIEmbeddingSkill, SearchIndexerIndexProjections, ) @@ -43,7 +45,7 @@ def search_indexer_client_mock(): indexer_client.create_or_update_skillset.return_value = SearchIndexerSkillset( name="skillset_name", description="Skillset to chunk documents and generating embeddings", - skills=[SplitSkill, AzureOpenAIEmbeddingSkill], + skills=[OcrSkill, MergeSkill, SplitSkill, AzureOpenAIEmbeddingSkill], index_projections=SearchIndexerIndexProjections, ) yield mock @@ -62,7 +64,7 @@ def test_create_skillset_keys( # then assert create_or_update_skillset.name == "skillset_name" - assert len(create_or_update_skillset.skills) == 2 + assert len(create_or_update_skillset.skills) == 4 assert create_or_update_skillset.index_projections is not None search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once() @@ -82,6 +84,6 @@ def test_create_skillset_rbac( # then assert create_or_update_skillset.name == "skillset_name" - assert len(create_or_update_skillset.skills) == 2 + assert len(create_or_update_skillset.skills) == 4 assert create_or_update_skillset.index_projections is not None search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()