Skip to content

Commit

Permalink
Markup <table/> and <td/> (#84)
Browse files Browse the repository at this point in the history
  • Loading branch information
Hiromu Hota authored Oct 9, 2020
1 parent f631efc commit 6ff4a7c
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 95 deletions.
113 changes: 72 additions & 41 deletions pdftotree/TreeExtract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os
from functools import cmp_to_key
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Optional, Tuple
from xml.dom.minidom import Document, Element

import numpy as np
Expand All @@ -29,14 +29,15 @@
from pdftotree.utils.pdf.pdf_utils import CustomPDFPageAggregator, PDFElems
from pdftotree.utils.pdf.vector_utils import column_order, reading_order

logger = logging.getLogger(__name__)


class TreeExtractor(object):
"""
Object to extract tree structure from pdf files
"""

def __init__(self, pdf_file):
self.log = logging.getLogger(__name__)
self.pdf_file = pdf_file
self.elems: Dict[int, PDFElems] = {} # key represents page_num
self.font_stats: Dict[int, Any] = {} # key represents page_num
Expand Down Expand Up @@ -165,7 +166,7 @@ def get_candidates_and_features_page_num(self, page_num):

boxes = alignments_bboxes
if len(boxes) == 0:
self.log.info("No boxes were found on page {}.".format(page_num))
logger.info("No boxes were found on page {}.".format(page_num))
return [], []

lines_features = get_lines_features(boxes, elems)
Expand Down Expand Up @@ -197,7 +198,7 @@ def get_candidates_alignments(self, page_num, elems):
try:
nodes, features = parse_layout(elems, font_stat)
except Exception as e:
self.log.exception(e)
logger.exception(e)
nodes, features = [], []
return (
[
Expand Down Expand Up @@ -348,7 +349,7 @@ def get_word_boundaries(
char_idx += 1
continue
if word[len_idx] != mention_chars[char_idx][0]:
self.log.warning(
logger.warning(
"Out of order ({}, {})".format(word, mention_chars[char_idx][0])
)
curr_word[1] = min(curr_word[1], mention_chars[char_idx][1])
Expand Down Expand Up @@ -402,42 +403,72 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
word_element.appendChild(self.doc.createTextNode(text))
return element

def get_html_table(self, table, page_num) -> Element:
table_str = [str(i) for i in table]
def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
"""Recognize a table using tabula and return a DOM element.
:param table: bbox for a table (top,left,bottom,right)
:param page_num: 1-based page number
:return: DOM element for a table
"""
logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
table_json = tabula.read_pdf(
self.pdf_file, pages=page_num, area=table_str, output_format="json"
self.pdf_file, pages=page_num, area=table, output_format="json"
)
if len(table_json) > 0:
table_element = self.doc.createElement("table")
for i, row in enumerate(table_json[0]["data"]):
row_element = self.doc.createElement("tr")
table_element.appendChild(row_element)
for j, column in enumerate(row):
col_element = self.doc.createElement("td")
row_element.appendChild(col_element)
box = [
column["top"],
column["left"],
column["top"] + column["height"],
column["left"] + column["width"],
]
elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
elems.sort(key=cmp_to_key(reading_order))
for elem in elems:
words = self.get_word_boundaries(elem)
for word in words:
top = int(word[1])
left = int(word[2])
bottom = int(word[3])
right = int(word[4])
# escape special HTML chars
text = html.escape(word[0])

word_element = self.doc.createElement("span")
col_element.appendChild(word_element)
word_element.setAttribute("class", "ocrx_word")
word_element.setAttribute(
"title", f"bbox {left} {top} {right} {bottom}"
)
word_element.appendChild(self.doc.createTextNode(text))
logger.debug(f"Tabula recognized {len(table_json)} table(s).")
if len(table_json) == 0:
return None
table_element = self.doc.createElement("table")
table_element.setAttribute("class", "ocr_table")
top = int(table_json[0]["top"])
left = int(table_json[0]["left"])
bottom = int(table_json[0]["bottom"])
right = int(table_json[0]["right"])
table_element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")
for i, row in enumerate(table_json[0]["data"]):
row_element = self.doc.createElement("tr")
table_element.appendChild(row_element)
for j, cell in enumerate(row):
# It is not explicitly documented, but tabula's cell bbox seems to
# describe the cell itself rather than the text inside it.
# Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text inside.
box: List[float] = [
cell["top"],
cell["left"],
cell["top"] + cell["height"],
cell["left"] + cell["width"],
]
cell_element = self.doc.createElement("td")
row_element.appendChild(cell_element)
elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
if len(elems) == 0:
continue
cell_element.setAttribute(
"title",
f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}",
)
elems.sort(key=cmp_to_key(reading_order))
for elem in elems:
line_element = self.doc.createElement("span")
cell_element.appendChild(line_element)
line_element.setAttribute("class", "ocrx_line")
line_element.setAttribute(
"title",
" ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]),
)
words = self.get_word_boundaries(elem)
for word in words:
top = int(word[1])
left = int(word[2])
bottom = int(word[3])
right = int(word[4])
# escape special HTML chars
text = html.escape(word[0])

word_element = self.doc.createElement("span")
line_element.appendChild(word_element)
word_element.setAttribute("class", "ocrx_word")
word_element.setAttribute(
"title", f"bbox {left} {top} {right} {bottom}"
)
word_element.appendChild(self.doc.createTextNode(text))
return table_element
16 changes: 8 additions & 8 deletions pdftotree/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,18 @@
from pdftotree.TreeExtract import TreeExtractor
from pdftotree.TreeVisualizer import TreeVisualizer

logger = logging.getLogger(__name__)


def load_model(model_type, model_path):
log = logging.getLogger(__name__)
log.info("Loading pretrained {} model for table detection".format(model_type))
logger.info("Loading pretrained {} model for table detection".format(model_type))
if model_type == "ml":
model = pickle.load(open(model_path, "rb"))
else:
from keras.models import load_model as load_vision_model

model = load_vision_model(model_path)
log.info("Model loaded!")
logger.info("Model loaded!")
return model


Expand All @@ -51,20 +52,19 @@ def parse(
model_path=None,
visualize=False,
):
log = logging.getLogger(__name__)
model = None
if model_type is not None and model_path is not None:
model = load_model(model_type, model_path)
extractor = TreeExtractor(pdf_file)
if extractor.is_scanned():
log.warning("Document looks scanned, the result may be far from expected.")
logger.warning("Document looks scanned, the result may be far from expected.")
else:
log.info("Digitized PDF detected, building tree structure...")
logger.info("Digitized PDF detected, building tree structure...")

pdf_tree = extractor.get_tree_structure(model_type, model)
log.info("Tree structure built, creating html...")
logger.info("Tree structure built, creating html...")
pdf_html = extractor.get_html_tree()
log.info("HTML created.")
logger.info("HTML created.")
# TODO: what is the following substitution for and is it required?
# pdf_html = re.sub(r"[\x00-\x1F]+", "", pdf_html)

Expand Down
11 changes: 6 additions & 5 deletions pdftotree/ml/TableExtractML.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
from pdftotree.utils.pdf.pdf_parsers import parse_layout
from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf

logger = logging.getLogger(__name__)


class TableExtractorML(object):
"""
Object to extract tables regions from pdf files
"""

def __init__(self, pdf_file):
self.log = logging.getLogger(__name__)
self.pdf_file = pdf_file
self.elems = {}
self.font_stats = {}
Expand Down Expand Up @@ -97,7 +98,7 @@ def parse(self):
and round(fig.bbox[2]) == round(elems.layout.width)
and round(fig.bbox[3]) == round(elems.layout.height)
):
self.log.debug(
logger.debug(
"{} is scanned because of full-page figure.".format(
self.pdf_file
)
Expand All @@ -111,7 +112,7 @@ def parse(self):
)
# doc is scanned if any page is scanned
if page_scanned:
self.log.debug(
logger.debug(
"{} is scanned one of its pages is scanned.".format(self.pdf_file)
)
is_scanned = True
Expand Down Expand Up @@ -139,7 +140,7 @@ def get_candidates(self):
def get_candidates_and_features(self):
self.parse()
if self.scanned:
self.log.info("{} is scanned.".format(self.pdf_file))
logger.info("{} is scanned.".format(self.pdf_file))
return [], [], self.scanned
for page_num in list(self.elems.keys()):
page_boxes, page_features = self.get_candidates_and_features_page_num(
Expand All @@ -161,7 +162,7 @@ def get_candidates_and_features_page_num(self, page_num):
alignments_bboxes, alignment_features = self.get_candidates_alignments(
page_num, elems
)
self.log.info(
logger.info(
"Page Num: {}, Line bboxes: {}, Alignment bboxes: {}".format(
page_num, len(lines_bboxes), len(alignments_bboxes)
)
Expand Down
12 changes: 10 additions & 2 deletions pdftotree/ml/features.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import string
from builtins import str
from collections import defaultdict
from typing import List
from typing import Any, List

from pdfminer.layout import LTTextLine

Expand Down Expand Up @@ -35,7 +35,15 @@ def get_height_coverage(bbox):
# ******************* Text Coverage Features *************************************


def get_mentions_within_bbox(bbox, mentions) -> List[LTTextLine]:
def get_mentions_within_bbox(
bbox: List[Any], mentions: List[LTTextLine]
) -> List[LTTextLine]:
"""Get textlines within bbox.
:param bbox: a list whose last 4 elements are (top, left, bottom, right)
:param mentions: a list of textlines
:return: a list of textlines within the given bbox
"""
mentions_within_bbox = []
for mention in mentions:
bbox_mention = (
Expand Down
7 changes: 4 additions & 3 deletions pdftotree/utils/pdf/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

from pdftotree.utils.pdf.vector_utils import inside, reading_order

logger = logging.getLogger(__name__)


class Cell(object):
"""Represents a cell with no visual dividers inside"""
Expand Down Expand Up @@ -117,7 +119,6 @@ def get_normalized_grid(self):
"""
Analyzes subcell structure
"""
log = logging.getLogger(__name__)
# Resolve multirow mentions, TODO: validate against all PDFs
# subcol_count = 0
mega_rows = []
Expand All @@ -127,12 +128,12 @@ def get_normalized_grid(self):
for col_id, cell in enumerate(row):
# Keep cell text in reading order
cell.texts.sort(key=cmp_to_key(reading_order))
log.debug("=" * 50)
logger.debug("=" * 50)
for m in cell.texts:
subrow_across_cell[m.yc_grid].append(m)
# prev = m

log.debug(pformat(dict(subrow_across_cell)))
logger.debug(pformat(dict(subrow_across_cell)))

mega_rows.append(subrow_across_cell)

Expand Down
Loading

0 comments on commit 6ff4a7c

Please sign in to comment.