chore(docling): upgrade to v2.10.0 (#129)

Signed-off-by: Tomas Pilar <tomas.pilar@apoco.com>
i-am-bee · Dec 12, 2024 · 9f99d92
1 parent 0c0d9ff
commit 9f99d92
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 6 deletions.
diff --git a/workers/python/poetry.lock b/workers/python/poetry.lock
diff --git a/workers/python/pyproject.toml b/workers/python/pyproject.toml
@@ -19,7 +19,7 @@ opentelemetry-instrumentation-redis = "^0.49b2"
 optional = true
 
 [tool.poetry.group.docling.dependencies]
-docling = "^2.9.0"
+docling = "^2.10.0"
 
 [tool.poetry.group.unstructured]
 optional = true

diff --git a/workers/python/python/config.py b/workers/python/python/config.py
@@ -52,5 +52,9 @@ def run_bullmq_workers(self) -> list[str]:
 
     otel_sdk_disabled: bool = False
 
+    docling_do_table_structure: bool = True
+    docling_pdf_do_ocr: bool = True
+    docling_advanced_chunker: bool = True
+
 
 config = Config()
diff --git a/workers/python/python/extraction/docling.py b/workers/python/python/extraction/docling.py
@@ -17,8 +17,12 @@
 import json
 import aioboto3
 
-from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+
 
 from config import config
 from database import database
@@ -27,6 +31,16 @@
 
 S3_URL = f"s3://{config.s3_bucket_file_storage}"
 
+converter = DocumentConverter(format_options={
+    InputFormat.PDF: PdfFormatOption(
+        pipeline_options=PdfPipelineOptions(
+            do_table_structure=config.docling_do_table_structure,
+            do_ocr=config.docling_pdf_do_ocr)
+    )
+})
+chunker = HybridChunker(
+    tokenizer="BAAI/bge-small-en-v1.5") if config.docling_advanced_chunker else HierarchicalChunker()
+
 
 async def docling_extraction(file):
     storage_id = file["storageId"]
@@ -40,18 +54,17 @@ async def docling_extraction(file):
                                 aws_session_token=None,
                                 ) as s3:
         with tempfile.TemporaryDirectory() as tmp_dir:
-            # Use file_name to support file type discrimination.
+           # Use file_name to support file type discrimination.
             source_doc = f"{tmp_dir}/{file_name}"
 
             await s3.meta.client.download_file(config.s3_bucket_file_storage, storage_id, source_doc)
 
-            converter = DocumentConverter()
             result = await asyncio.to_thread(converter.convert, source_doc, max_num_pages=100, max_file_size=20971520)
             doc = result.document
             dict = doc.export_to_dict()
             markdown = doc.export_to_markdown()
             chunks = [{"text": c.text}
-                      for c in list(HybridChunker(tokenizer="BAAI/bge-small-en-v1.5").chunk(doc))]
+                      for c in list(await asyncio.to_thread(chunker.chunk, doc))]
 
             document_storage_id = f"{EXTRACTION_DIR}/{storage_id}/document.json"
             await s3.meta.client.put_object(