diff --git a/workers/python/poetry.lock b/workers/python/poetry.lock index 540da1c..aee6791 100644 --- a/workers/python/poetry.lock +++ b/workers/python/poetry.lock @@ -6255,4 +6255,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "b7c5999bf7bb5549653212da4a48285ca331b4aa8606c3cd7a5b9669e7ccf44d" +content-hash = "3481ba83425bf124a20d55ab7919488f419ebceb0f076b6b9b1ef91f36c4451f" diff --git a/workers/python/pyproject.toml b/workers/python/pyproject.toml index 359d9b5..f2c8943 100644 --- a/workers/python/pyproject.toml +++ b/workers/python/pyproject.toml @@ -19,7 +19,7 @@ opentelemetry-instrumentation-redis = "^0.49b2" optional = true [tool.poetry.group.docling.dependencies] -docling = "^2.9.0" +docling = "^2.10.0" [tool.poetry.group.unstructured] optional = true diff --git a/workers/python/python/config.py b/workers/python/python/config.py index a445f9a..3d2979e 100644 --- a/workers/python/python/config.py +++ b/workers/python/python/config.py @@ -52,5 +52,9 @@ def run_bullmq_workers(self) -> list[str]: otel_sdk_disabled: bool = False + docling_do_table_structure: bool = True + docling_pdf_do_ocr: bool = True + docling_advanced_chunker: bool = True + config = Config() diff --git a/workers/python/python/extraction/docling.py b/workers/python/python/extraction/docling.py index f5a844c..27c2b1d 100644 --- a/workers/python/python/extraction/docling.py +++ b/workers/python/python/extraction/docling.py @@ -17,8 +17,12 @@ import json import aioboto3 -from docling.document_converter import DocumentConverter +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker from docling_core.transforms.chunker.hybrid_chunker import HybridChunker +from docling.datamodel.pipeline_options import PdfPipelineOptions + from config import config from database import database @@ -27,6 +31,16 @@ S3_URL = f"s3://{config.s3_bucket_file_storage}" +converter = DocumentConverter(format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=PdfPipelineOptions( + do_table_structure=config.docling_do_table_structure, + do_ocr=config.docling_pdf_do_ocr) + ) +}) +chunker = HybridChunker( + tokenizer="BAAI/bge-small-en-v1.5") if config.docling_advanced_chunker else HierarchicalChunker() + async def docling_extraction(file): storage_id = file["storageId"] @@ -40,18 +54,17 @@ async def docling_extraction(file): aws_session_token=None, ) as s3: with tempfile.TemporaryDirectory() as tmp_dir: - # Use file_name to support file type discrimination. + # Use file_name to support file type discrimination. source_doc = f"{tmp_dir}/{file_name}" await s3.meta.client.download_file(config.s3_bucket_file_storage, storage_id, source_doc) - converter = DocumentConverter() result = await asyncio.to_thread(converter.convert, source_doc, max_num_pages=100, max_file_size=20971520) doc = result.document dict = doc.export_to_dict() markdown = doc.export_to_markdown() chunks = [{"text": c.text} - for c in list(HybridChunker(tokenizer="BAAI/bge-small-en-v1.5").chunk(doc))] + for c in list(await asyncio.to_thread(chunker.chunk, doc))] document_storage_id = f"{EXTRACTION_DIR}/{storage_id}/document.json" await s3.meta.client.put_object(