From 9f99d921d4849d141fdc4bc95913ea85ca0f7874 Mon Sep 17 00:00:00 2001
From: Tomas Pilar <tomas.pilar@apoco.com>
Date: Thu, 12 Dec 2024 11:15:13 +0100
Subject: [PATCH] chore(docling): upgrade to v2.10.0 (#129)

Signed-off-by: Tomas Pilar <tomas.pilar@apoco.com>
---
 workers/python/poetry.lock                  |  2 +-
 workers/python/pyproject.toml               |  2 +-
 workers/python/python/config.py             |  4 ++++
 workers/python/python/extraction/docling.py | 21 +++++++++++++++++----
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/workers/python/poetry.lock b/workers/python/poetry.lock
index 540da1c..aee6791 100644
--- a/workers/python/poetry.lock
+++ b/workers/python/poetry.lock
@@ -6255,4 +6255,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "~3.11"
-content-hash = "b7c5999bf7bb5549653212da4a48285ca331b4aa8606c3cd7a5b9669e7ccf44d"
+content-hash = "3481ba83425bf124a20d55ab7919488f419ebceb0f076b6b9b1ef91f36c4451f"
diff --git a/workers/python/pyproject.toml b/workers/python/pyproject.toml
index 359d9b5..f2c8943 100644
--- a/workers/python/pyproject.toml
+++ b/workers/python/pyproject.toml
@@ -19,7 +19,7 @@ opentelemetry-instrumentation-redis = "^0.49b2"
 optional = true
 
 [tool.poetry.group.docling.dependencies]
-docling = "^2.9.0"
+docling = "^2.10.0"
 
 [tool.poetry.group.unstructured]
 optional = true
diff --git a/workers/python/python/config.py b/workers/python/python/config.py
index a445f9a..3d2979e 100644
--- a/workers/python/python/config.py
+++ b/workers/python/python/config.py
@@ -52,5 +52,9 @@ def run_bullmq_workers(self) -> list[str]:
 
     otel_sdk_disabled: bool = False
 
+    docling_do_table_structure: bool = True  # forwarded to PdfPipelineOptions(do_table_structure=...)
+    docling_pdf_do_ocr: bool = True  # forwarded to PdfPipelineOptions(do_ocr=...) for PDF input
+    docling_advanced_chunker: bool = True  # True -> HybridChunker, False -> HierarchicalChunker
+
 
 config = Config()
diff --git a/workers/python/python/extraction/docling.py b/workers/python/python/extraction/docling.py
index f5a844c..27c2b1d 100644
--- a/workers/python/python/extraction/docling.py
+++ b/workers/python/python/extraction/docling.py
@@ -17,8 +17,12 @@
 import json
 import aioboto3
 
-from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+
 
 from config import config
 from database import database
@@ -27,6 +31,16 @@
 
 S3_URL = f"s3://{config.s3_bucket_file_storage}"
 
+# Built once at import time and shared by every docling_extraction() call.
+converter = DocumentConverter(format_options={
+    InputFormat.PDF: PdfFormatOption(pipeline_options=PdfPipelineOptions(
+        do_table_structure=config.docling_do_table_structure,
+        do_ocr=config.docling_pdf_do_ocr,
+    )),
+})
+chunker = (HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")
+           if config.docling_advanced_chunker else HierarchicalChunker())
+
 
 async def docling_extraction(file):
     storage_id = file["storageId"]
@@ -40,18 +54,17 @@ async def docling_extraction(file):
                                 aws_session_token=None,
                                 ) as s3:
         with tempfile.TemporaryDirectory() as tmp_dir:
-            # Use file_name to support file type discrimination.
+            # Use file_name to support file type discrimination.
             source_doc = f"{tmp_dir}/{file_name}"
 
             await s3.meta.client.download_file(config.s3_bucket_file_storage, storage_id, source_doc)
 
-            converter = DocumentConverter()
             result = await asyncio.to_thread(converter.convert, source_doc, max_num_pages=100, max_file_size=20971520)
             doc = result.document
             dict = doc.export_to_dict()
             markdown = doc.export_to_markdown()
             chunks = [{"text": c.text}
-                      for c in list(HybridChunker(tokenizer="BAAI/bge-small-en-v1.5").chunk(doc))]
+                      for c in await asyncio.to_thread(lambda: list(chunker.chunk(doc)))]  # drain the iterator in the worker thread
 
             document_storage_id = f"{EXTRACTION_DIR}/{storage_id}/document.json"
             await s3.meta.client.put_object(