Skip to content

Commit

Permalink
feat: Integrated Vectorization - adding OCR skill (#1021)
Browse files: browse the repository at this point in the history
  • Loading branch information
komalg1 authored Jun 20, 2024
1 parent f61045e commit 30440a8
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,26 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
skillset_name=skillset_name,
target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
data_source_name=self.env_helper.AZURE_SEARCH_DATASOURCE_NAME,
parameters={
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages",
}
},
field_mappings=[
FieldMapping(
source_field_name="metadata_storage_path",
target_field_name="source",
),
FieldMapping(
source_field_name="/document/normalized_images/*/text",
target_field_name="text",
),
FieldMapping(
source_field_name="/document/normalized_images/*/layoutText",
target_field_name="layoutText",
),
],
)
indexer_result = self.indexer_client.create_or_update_indexer(indexer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
InputFieldMappingEntry,
OutputFieldMappingEntry,
AzureOpenAIEmbeddingSkill,
OcrSkill,
MergeSkill,
SearchIndexerIndexProjections,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
Expand Down Expand Up @@ -39,14 +41,46 @@ def __init__(
def create_skillset(self):
skillset_name = f"{self.env_helper.AZURE_SEARCH_INDEX}-skillset"

ocr_skill = OcrSkill(
description="Extract text (plain and structured) from image",
context="/document/normalized_images/*",
inputs=[
InputFieldMappingEntry(
name="image",
source="/document/normalized_images/*",
)
],
outputs=[
OutputFieldMappingEntry(name="text", target_name="text"),
OutputFieldMappingEntry(name="layoutText", target_name="layoutText"),
],
)

merge_skill = MergeSkill(
description="Merge text from OCR and text from document",
context="/document",
inputs=[
InputFieldMappingEntry(name="text", source="/document/content"),
InputFieldMappingEntry(
name="itemsToInsert", source="/document/normalized_images/*/text"
),
InputFieldMappingEntry(
name="offsets", source="/document/normalized_images/*/contentOffset"
),
],
outputs=[
OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
],
)

split_skill = SplitSkill(
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
maximum_page_length=self.integrated_vectorization_config.max_page_length,
page_overlap_length=self.integrated_vectorization_config.page_overlap_length,
inputs=[
InputFieldMappingEntry(name="text", source="/document/content"),
InputFieldMappingEntry(name="text", source="/document/merged_content"),
],
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)
Expand Down Expand Up @@ -98,7 +132,7 @@ def create_skillset(self):
skillset = SearchIndexerSkillset(
name=skillset_name,
description="Skillset to chunk documents and generating embeddings",
skills=[split_skill, embedding_skill],
skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
index_projections=index_projections,
)

Expand Down
76 changes: 68 additions & 8 deletions code/tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,23 +143,83 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
).respond_with_json(
{
"name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
"description": "Extract entities, detect language and extract key-phrases",
"description": "Skillset to chunk documents and generating embeddings",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"name": "#3",
"description": None,
"context": None,
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"description": "Extract text (plain and structured) from image",
"context": "/document/normalized_images/*",
"inputs": [
{"name": "image", "source": "/document/normalized_images/*"}
],
"outputs": [
{"name": "text", "targetName": "text"},
{"name": "layoutText", "targetName": "layoutText"},
],
"detectOrientation": False,
},
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"description": "Merge text from OCR and text from document",
"context": "/document",
"inputs": [
{"name": "text", "source": "/document/content"},
{"name": "languageCode", "source": "/document/languageCode"},
{
"name": "itemsToInsert",
"source": "/document/normalized_images/*/text",
},
{
"name": "offsets",
"source": "/document/normalized_images/*/contentOffset",
},
],
"outputs": [{"name": "mergedText", "targetName": "merged_content"}],
"insertPreTag": " ",
"insertPostTag": " ",
},
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"description": "Split skill to chunk documents",
"context": "/document",
"inputs": [{"name": "text", "source": "/document/merged_content"}],
"outputs": [{"name": "textItems", "targetName": "pages"}],
"defaultLanguageCode": None,
"textSplitMode": "pages",
"maximumPageLength": 4000,
"maximumPageLength": 800,
"pageOverlapLength": 100,
},
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"description": "Skill to generate embeddings via Azure OpenAI",
"context": "/document/pages/*",
"inputs": [{"name": "text", "source": "/document/pages/*"}],
"outputs": [{"name": "embedding", "targetName": "content_vector"}],
"resourceUri": f"https://localhost:{httpserver.port}/",
"deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
"apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
},
],
"indexProjections": {
"selectors": [
{
"targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
"parentKeyFieldName": "id",
"sourceContext": "/document/pages/*",
"mappings": [
{"name": "content", "source": "/document/pages/*"},
{
"name": "content_vector",
"source": "/document/pages/*/content_vector",
},
{"name": "title", "source": "/document/title"},
{
"name": "source",
"source": "/document/metadata_storage_path",
},
],
}
],
"parameters": {"projectionMode": "skipIndexingParentDocuments"},
},
},
status=201,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,92 @@ def test_integrated_vectorization_skillset_created(
method="PUT",
query_string="api-version=2023-10-01-Preview",
times=1,
json={
"name": f"{app_config.get('AZURE_SEARCH_INDEX')}-skillset",
"description": "Skillset to chunk documents and generating embeddings",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"description": "Extract text (plain and structured) from image",
"context": "/document/normalized_images/*",
"inputs": [
{"name": "image", "source": "/document/normalized_images/*"}
],
"outputs": [
{"name": "text", "targetName": "text"},
{"name": "layoutText", "targetName": "layoutText"},
],
"detectOrientation": False,
},
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"description": "Merge text from OCR and text from document",
"context": "/document",
"inputs": [
{"name": "text", "source": "/document/content"},
{
"name": "itemsToInsert",
"source": "/document/normalized_images/*/text",
},
{
"name": "offsets",
"source": "/document/normalized_images/*/contentOffset",
},
],
"outputs": [
{"name": "mergedText", "targetName": "merged_content"}
],
"insertPreTag": " ",
"insertPostTag": " ",
},
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"description": "Split skill to chunk documents",
"context": "/document",
"inputs": [
{"name": "text", "source": "/document/merged_content"}
],
"outputs": [{"name": "textItems", "targetName": "pages"}],
"textSplitMode": "pages",
"maximumPageLength": 800,
"pageOverlapLength": 100,
},
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"description": "Skill to generate embeddings via Azure OpenAI",
"context": "/document/pages/*",
"inputs": [{"name": "text", "source": "/document/pages/*"}],
"outputs": [
{"name": "embedding", "targetName": "content_vector"}
],
"resourceUri": f"https://localhost:{httpserver.port}/",
"deploymentId": f"{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}",
"apiKey": f"{app_config.get('AZURE_OPENAI_API_KEY')}",
},
],
"indexProjections": {
"selectors": [
{
"targetIndexName": f"{app_config.get('AZURE_SEARCH_INDEX')}",
"parentKeyFieldName": "id",
"sourceContext": "/document/pages/*",
"mappings": [
{"name": "content", "source": "/document/pages/*"},
{
"name": "content_vector",
"source": "/document/pages/*/content_vector",
},
{"name": "title", "source": "/document/title"},
{
"name": "source",
"source": "/document/metadata_storage_path",
},
],
}
],
"parameters": {"projectionMode": "skipIndexingParentDocuments"},
},
},
),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ def test_create_or_update_indexer_keys(
skillset_name="skillset_name",
target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
parameters={
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages",
}
},
field_mappings=ANY,
)

Expand Down Expand Up @@ -88,6 +95,13 @@ def test_create_or_update_indexer_rbac(
skillset_name="skillset_name",
target_index_name=env_helper_mock.AZURE_SEARCH_INDEX,
data_source_name=env_helper_mock.AZURE_SEARCH_DATASOURCE_NAME,
parameters={
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages",
}
},
field_mappings=ANY,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from azure.search.documents.indexes.models import (
SearchIndexerSkillset,
SplitSkill,
OcrSkill,
MergeSkill,
AzureOpenAIEmbeddingSkill,
SearchIndexerIndexProjections,
)
Expand Down Expand Up @@ -43,7 +45,7 @@ def search_indexer_client_mock():
indexer_client.create_or_update_skillset.return_value = SearchIndexerSkillset(
name="skillset_name",
description="Skillset to chunk documents and generating embeddings",
skills=[SplitSkill, AzureOpenAIEmbeddingSkill],
skills=[OcrSkill, MergeSkill, SplitSkill, AzureOpenAIEmbeddingSkill],
index_projections=SearchIndexerIndexProjections,
)
yield mock
Expand All @@ -62,7 +64,7 @@ def test_create_skillset_keys(

# then
assert create_or_update_skillset.name == "skillset_name"
assert len(create_or_update_skillset.skills) == 2
assert len(create_or_update_skillset.skills) == 4
assert create_or_update_skillset.index_projections is not None
search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()

Expand All @@ -82,6 +84,6 @@ def test_create_skillset_rbac(

# then
assert create_or_update_skillset.name == "skillset_name"
assert len(create_or_update_skillset.skills) == 2
assert len(create_or_update_skillset.skills) == 4
assert create_or_update_skillset.index_projections is not None
search_indexer_client_mock.return_value.create_or_update_skillset.assert_called_once()

0 comments on commit 30440a8

Please sign in to comment.