-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy patheng_tesseract.py
154 lines (137 loc) · 6.87 KB
/
eng_tesseract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Example integration for (Py)Tesseract as a custom OCR engine
"""
# Python Built-Ins:
from logging import getLogger
import os
from statistics import mean
from tempfile import TemporaryDirectory
from typing import Iterable, List, Optional
# External Dependencies:
import pandas as pd
import pytesseract
# Local Dependencies:
from .base import BaseOCREngine, generate_response_json, OCRGeometry, OCRLine, OCRPage, OCRWord
from ..image_utils import Document
logger = getLogger("eng_tesseract")

# Point Tesseract at its language data files, unless the environment already specifies a
# location. `setdefault` only writes the value when the variable is not set at all, so any
# user-provided TESSDATA_PREFIX is left untouched.
os.environ.setdefault("TESSDATA_PREFIX", "/opt/conda/share/tessdata")
class TesseractEngine(BaseOCREngine):
    """Tesseract-based engine for custom SageMaker OCR endpoint option"""

    engine_name = "tesseract"

    def process(self, raw_doc: Document, languages: Optional[Iterable[str]] = None) -> dict:
        """Run Tesseract over every page of a document and serialize the combined result

        Args:
            raw_doc: Document to process. Its pages are materialized into a temporary workspace
                folder which is deleted once OCR has finished.
            languages: Tesseract language codes to use, joined with "+" for the `lang` argument.
                When None, `self.default_languages` is used instead (not defined in this class -
                presumably provided by BaseOCREngine; TODO confirm).

        Returns:
            The result of `generate_response_json()` for all detected pages and this engine's
            name.
        """
        ocr_pages = []
        lang = "+".join(self.default_languages if languages is None else languages)

        with TemporaryDirectory() as tmpdir:
            raw_doc.set_workspace(tmpdir)
            for ixpage, page in enumerate(raw_doc.get_pages()):
                logger.debug("Serializing page %s", ixpage + 1)
                page_ocr = pytesseract.image_to_data(
                    page.file_path,
                    output_type=pytesseract.Output.DATAFRAME,
                    lang=lang,
                    pandas_config={
                        # Need this explicit override or else pages containing only a single number
                        # can sometimes have text column interpreted as numeric type:
                        "dtype": {"text": str},
                    },
                )
                ocr_pages.extend(self.dataframe_to_ocrpages(page_ocr))

        # The OCRPage objects are built purely from the in-memory DataFrames, so the response can
        # be generated after the temporary workspace has been cleaned up.
        return generate_response_json(ocr_pages, self.engine_name)

    @staticmethod
    def _build_line(words: list) -> OCRLine:
        """Create an OCRLine from a non-empty list of OCRWords

        Tesseract LINE records (level 4) don't have a confidence (equals -1), so we use the
        average over the included WORDs as a heuristic. They *do* have T/L/H/W geometry info, but
        we ignore that for the sake of code simplicity and let OCRLine infer it from the words.
        """
        return OCRLine(mean(w.confidence for w in words), words)

    @classmethod
    def dataframe_to_ocrpages(cls, ocr_df: pd.DataFrame) -> List[OCRPage]:
        """Convert a Tesseract DataFrame to a list of OCRPage ready for Textract-like serialization

        Tesseract TSVs / PyTesseract DataFrames group detections by multiple levels: Page, block,
        paragraph, line, word. Columns are: level, page_num, block_num, par_num, line_num, word_num,
        left, top, width, height, conf, text.

        Each level is introduced by a record, so for example there will be an initial record with
        (level=1, page_num=1, block_num=0, par_num=0, line_num=0, word_num=0)... And then several
        others before finally getting down to the first WORD record (level=5, page_num=1,
        block_num=1, par_num=1, line_num=1, word_num=1). Records are assumed to be sorted in order,
        as indeed they are direct from Tesseract.

        Args:
            ocr_df: DataFrame with the columns listed above. The "text" column must be of string
                type (as `process()` requests from PyTesseract).

        Returns:
            One OCRPage per level=1 (page) record, ordered by page_num. Pages on which no
            non-whitespace words were detected are returned with no lines.

        Raises:
            ValueError: If any page_num has more than one level=1 record.
            KeyError: If a WORD record refers to a page_num that has no level=1 record.
        """
        # First construct an indexable table of page geometries, as we'll need these to normalize
        # other entity coordinates from absolute pixel values to 0-1 range:
        # (Note: In fact this function will often be called with only one page_num at a time)
        page_recs = ocr_df[ocr_df["level"] == 1]
        page_dims = page_recs.groupby("page_num").agg(
            {
                "left": "min",
                "top": "min",
                "width": "max",
                "height": "max",
                "page_num": "count",
            }
        )

        # There should be exactly one level=1 record per page in the dataframe. After checking
        # this, we can dispose the "page_num" count column.
        is_duplicated = page_dims["page_num"] > 1
        if is_duplicated.any():
            raise ValueError(
                "Tesseract DataFrame had duplicate entries for these page_nums at level 1: %s"
                % (page_dims.index[is_duplicated].values[:20],)
            )
        page_dims = page_dims.drop(columns="page_num")

        # We need to collapse the {block, paragraph} levels of Tesseract hierarchy to preserve only
        # PAGE, LINE and WORD for consistency with Textract. Here we'll assume the DataFrame is in
        # its original Tesseract sort order, allowing iteration through the records to correctly
        # roll the entities up. Although iterating through large DataFrames isn't generally a
        # performant practice, this could always be balanced with specific parallelism if wanted:
        # E.g. processing multiple pages at once.
        pages = {
            num: OCRPage([])  # Initialise all pages first with no text
            for num in sorted(page_recs["page_num"].unique())
        }

        # Keep only WORD records, ignoring whitespace-only ones that Tesseract likes to yield.
        # (Missing/NaN text yields a NaN length, which also fails the `> 0` filter)
        words_df = ocr_df[ocr_df["level"] == 5].copy()
        words_df["text"] = words_df["text"].str.strip()
        words_df = words_df[words_df["text"].str.len() > 0]

        cur_page_num = None
        cur_page_dim = None
        cur_line_id = None
        page_lines = []
        line_words = []

        for rec in words_df.itertuples(index=False):
            page_num = rec.page_num
            # block/par/line numbers restart on every page, so the page number must be part of
            # the line identity - otherwise the last line of one page and the first line of the
            # next could be merged together when their numbers happen to match.
            line_id = (page_num, rec.block_num, rec.par_num, rec.line_num)

            if line_id != cur_line_id:
                # Start of new LINE - add previous one (if any) to the current page's lines:
                if line_words:
                    page_lines.append(cls._build_line(line_words))
                cur_line_id = line_id
                line_words = []

            if page_num != cur_page_num:
                # Start of new PAGE - add previous one (if any) to result:
                if cur_page_num is not None:
                    pages[cur_page_num].add_lines(page_lines)
                cur_page_num = page_num
                page_lines = []
                # Page dimensions only change when the page does, so look them up once here:
                cur_page_dim = page_dims.loc[page_num]

            # Parse this record into a WORD:
            line_words.append(
                OCRWord(
                    rec.text,
                    rec.conf,
                    OCRGeometry.from_bbox(
                        # Word geometries, too, need normalizing by page dimensions.
                        (rec.top - cur_page_dim["top"]) / cur_page_dim["height"],
                        (rec.left - cur_page_dim["left"]) / cur_page_dim["width"],
                        rec.height / cur_page_dim["height"],
                        rec.width / cur_page_dim["width"],
                    ),
                )
            )

        # End of last line and last page: Add any remaining content.
        if line_words:
            page_lines.append(cls._build_line(line_words))
        if page_lines:
            pages[cur_page_num].add_lines(page_lines)

        return list(pages.values())