Skip to content

Commit

Permalink
chore(docling): upgrade to v2.10.0 (#129)
Browse files Browse the repository at this point in the history
Signed-off-by: Tomas Pilar <tomas.pilar@apoco.com>
  • Loading branch information
pilartomas authored Dec 12, 2024
1 parent 0c0d9ff commit 9f99d92
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 6 deletions.
2 changes: 1 addition & 1 deletion workers/python/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion workers/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ opentelemetry-instrumentation-redis = "^0.49b2"
optional = true

[tool.poetry.group.docling.dependencies]
docling = "^2.9.0"
docling = "^2.10.0"

[tool.poetry.group.unstructured]
optional = true
Expand Down
4 changes: 4 additions & 0 deletions workers/python/python/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,9 @@ def run_bullmq_workers(self) -> list[str]:

otel_sdk_disabled: bool = False

docling_do_table_structure: bool = True
docling_pdf_do_ocr: bool = True
docling_advanced_chunker: bool = True


config = Config()
21 changes: 17 additions & 4 deletions workers/python/python/extraction/docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
import json
import aioboto3

from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling.datamodel.pipeline_options import PdfPipelineOptions


from config import config
from database import database
Expand All @@ -27,6 +31,16 @@

S3_URL = f"s3://{config.s3_bucket_file_storage}"

# Converter built once at module import. Only the PDF input format gets
# explicit options: table-structure recognition and OCR are switched by the
# corresponding config flags.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=PdfPipelineOptions(
                do_table_structure=config.docling_do_table_structure,
                do_ocr=config.docling_pdf_do_ocr,
            ),
        ),
    },
)

# Chunker is likewise chosen once at import: the hybrid chunker when
# config.docling_advanced_chunker is set, otherwise the hierarchical one.
if config.docling_advanced_chunker:
    chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")
else:
    chunker = HierarchicalChunker()


async def docling_extraction(file):
storage_id = file["storageId"]
Expand All @@ -40,18 +54,17 @@ async def docling_extraction(file):
aws_session_token=None,
) as s3:
with tempfile.TemporaryDirectory() as tmp_dir:
# Use file_name to support file type discrimination.
# Use file_name to support file type discrimination.
source_doc = f"{tmp_dir}/{file_name}"

await s3.meta.client.download_file(config.s3_bucket_file_storage, storage_id, source_doc)

converter = DocumentConverter()
result = await asyncio.to_thread(converter.convert, source_doc, max_num_pages=100, max_file_size=20971520)
doc = result.document
dict = doc.export_to_dict()
markdown = doc.export_to_markdown()
chunks = [{"text": c.text}
for c in list(HybridChunker(tokenizer="BAAI/bge-small-en-v1.5").chunk(doc))]
for c in list(await asyncio.to_thread(chunker.chunk, doc))]

document_storage_id = f"{EXTRACTION_DIR}/{storage_id}/document.json"
await s3.meta.client.put_object(
Expand Down

0 comments on commit 9f99d92

Please sign in to comment.