Markup <table/> and <td/> (#84)

HazyResearch · Oct 9, 2020 · commit 6ff4a7c
1 parent f631efc

Showing 8 changed files with 146 additions and 95 deletions.
diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py
@@ -2,7 +2,7 @@
 import logging
 import os
 from functools import cmp_to_key
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 from xml.dom.minidom import Document, Element
 
 import numpy as np
@@ -29,14 +29,15 @@
 from pdftotree.utils.pdf.pdf_utils import CustomPDFPageAggregator, PDFElems
 from pdftotree.utils.pdf.vector_utils import column_order, reading_order
 
+logger = logging.getLogger(__name__)
+
 
 class TreeExtractor(object):
     """
     Object to extract tree structure from pdf files
     """
 
     def __init__(self, pdf_file):
-        self.log = logging.getLogger(__name__)
         self.pdf_file = pdf_file
         self.elems: Dict[int, PDFElems] = {}  # key represents page_num
         self.font_stats: Dict[int, Any] = {}  # key represents page_num
@@ -165,7 +166,7 @@ def get_candidates_and_features_page_num(self, page_num):
 
         boxes = alignments_bboxes
         if len(boxes) == 0:
-            self.log.info("No boxes were found on page {}.".format(page_num))
+            logger.info("No boxes were found on page {}.".format(page_num))
             return [], []
 
         lines_features = get_lines_features(boxes, elems)
@@ -197,7 +198,7 @@ def get_candidates_alignments(self, page_num, elems):
         try:
             nodes, features = parse_layout(elems, font_stat)
         except Exception as e:
-            self.log.exception(e)
+            logger.exception(e)
             nodes, features = [], []
         return (
             [
@@ -348,7 +349,7 @@ def get_word_boundaries(
                     char_idx += 1
                     continue
                 if word[len_idx] != mention_chars[char_idx][0]:
-                    self.log.warning(
+                    logger.warning(
                         "Out of order ({}, {})".format(word, mention_chars[char_idx][0])
                     )
                 curr_word[1] = min(curr_word[1], mention_chars[char_idx][1])
@@ -402,42 +403,72 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
                 word_element.appendChild(self.doc.createTextNode(text))
         return element
 
-    def get_html_table(self, table, page_num) -> Element:
-        table_str = [str(i) for i in table]
+    def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
+        """Recognize a table using tabula and return a DOM element.
+
+        :param table: bbox for a table (top,left,bottom,right)
+        :param page_num: 1-based page number
+        :return: DOM element for a table
+        """
+        logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
         table_json = tabula.read_pdf(
-            self.pdf_file, pages=page_num, area=table_str, output_format="json"
+            self.pdf_file, pages=page_num, area=table, output_format="json"
         )
-        if len(table_json) > 0:
-            table_element = self.doc.createElement("table")
-            for i, row in enumerate(table_json[0]["data"]):
-                row_element = self.doc.createElement("tr")
-                table_element.appendChild(row_element)
-                for j, column in enumerate(row):
-                    col_element = self.doc.createElement("td")
-                    row_element.appendChild(col_element)
-                    box = [
-                        column["top"],
-                        column["left"],
-                        column["top"] + column["height"],
-                        column["left"] + column["width"],
-                    ]
-                    elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
-                    elems.sort(key=cmp_to_key(reading_order))
-                    for elem in elems:
-                        words = self.get_word_boundaries(elem)
-                        for word in words:
-                            top = int(word[1])
-                            left = int(word[2])
-                            bottom = int(word[3])
-                            right = int(word[4])
-                            # escape special HTML chars
-                            text = html.escape(word[0])
-
-                            word_element = self.doc.createElement("span")
-                            col_element.appendChild(word_element)
-                            word_element.setAttribute("class", "ocrx_word")
-                            word_element.setAttribute(
-                                "title", f"bbox {left} {top} {right} {bottom}"
-                            )
-                            word_element.appendChild(self.doc.createTextNode(text))
+        logger.debug(f"Tabula recognized {len(table_json)} table(s).")
+        if len(table_json) == 0:
+            return None
+        table_element = self.doc.createElement("table")
+        table_element.setAttribute("class", "ocr_table")
+        top = int(table_json[0]["top"])
+        left = int(table_json[0]["left"])
+        bottom = int(table_json[0]["bottom"])
+        right = int(table_json[0]["right"])
+        table_element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")
+        for i, row in enumerate(table_json[0]["data"]):
+            row_element = self.doc.createElement("tr")
+            table_element.appendChild(row_element)
+            for j, cell in enumerate(row):
+                # It is not explicitly stated anywhere but tabula seems to use the cell
+                # bbox to represent that of cell itself rather than that of text inside.
+                # Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text inside.
+                box: List[float] = [
+                    cell["top"],
+                    cell["left"],
+                    cell["top"] + cell["height"],
+                    cell["left"] + cell["width"],
+                ]
+                cell_element = self.doc.createElement("td")
+                row_element.appendChild(cell_element)
+                elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
+                if len(elems) == 0:
+                    continue
+                cell_element.setAttribute(
+                    "title",
+                    f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}",
+                )
+                elems.sort(key=cmp_to_key(reading_order))
+                for elem in elems:
+                    line_element = self.doc.createElement("span")
+                    cell_element.appendChild(line_element)
+                    line_element.setAttribute("class", "ocrx_line")
+                    line_element.setAttribute(
+                        "title",
+                        " ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]),
+                    )
+                    words = self.get_word_boundaries(elem)
+                    for word in words:
+                        top = int(word[1])
+                        left = int(word[2])
+                        bottom = int(word[3])
+                        right = int(word[4])
+                        # escape special HTML chars
+                        text = html.escape(word[0])
+
+                        word_element = self.doc.createElement("span")
+                        line_element.appendChild(word_element)
+                        word_element.setAttribute("class", "ocrx_word")
+                        word_element.setAttribute(
+                            "title", f"bbox {left} {top} {right} {bottom}"
+                        )
+                        word_element.appendChild(self.doc.createTextNode(text))
         return table_element
diff --git a/pdftotree/core.py b/pdftotree/core.py
@@ -24,17 +24,18 @@
 from pdftotree.TreeExtract import TreeExtractor
 from pdftotree.TreeVisualizer import TreeVisualizer
 
+logger = logging.getLogger(__name__)
+
 
 def load_model(model_type, model_path):
-    log = logging.getLogger(__name__)
-    log.info("Loading pretrained {} model for table detection".format(model_type))
+    logger.info("Loading pretrained {} model for table detection".format(model_type))
     if model_type == "ml":
         model = pickle.load(open(model_path, "rb"))
     else:
         from keras.models import load_model as load_vision_model
 
         model = load_vision_model(model_path)
-    log.info("Model loaded!")
+    logger.info("Model loaded!")
     return model
 
 
@@ -51,20 +52,19 @@ def parse(
     model_path=None,
     visualize=False,
 ):
-    log = logging.getLogger(__name__)
     model = None
     if model_type is not None and model_path is not None:
         model = load_model(model_type, model_path)
     extractor = TreeExtractor(pdf_file)
     if extractor.is_scanned():
-        log.warning("Document looks scanned, the result may be far from expected.")
+        logger.warning("Document looks scanned, the result may be far from expected.")
     else:
-        log.info("Digitized PDF detected, building tree structure...")
+        logger.info("Digitized PDF detected, building tree structure...")
 
     pdf_tree = extractor.get_tree_structure(model_type, model)
-    log.info("Tree structure built, creating html...")
+    logger.info("Tree structure built, creating html...")
     pdf_html = extractor.get_html_tree()
-    log.info("HTML created.")
+    logger.info("HTML created.")
     # TODO: what is the following substition for and is it required?
     # pdf_html = re.sub(r"[\x00-\x1F]+", "", pdf_html)
 

diff --git a/pdftotree/ml/TableExtractML.py b/pdftotree/ml/TableExtractML.py
@@ -20,14 +20,15 @@
 from pdftotree.utils.pdf.pdf_parsers import parse_layout
 from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf
 
+logger = logging.getLogger(__name__)
+
 
 class TableExtractorML(object):
     """
     Object to extract tables regions from pdf files
     """
 
     def __init__(self, pdf_file):
-        self.log = logging.getLogger(__name__)
         self.pdf_file = pdf_file
         self.elems = {}
         self.font_stats = {}
@@ -97,7 +98,7 @@ def parse(self):
                     and round(fig.bbox[2]) == round(elems.layout.width)
                     and round(fig.bbox[3]) == round(elems.layout.height)
                 ):
-                    self.log.debug(
+                    logger.debug(
                         "{} is scanned because of full-page figure.".format(
                             self.pdf_file
                         )
@@ -111,7 +112,7 @@ def parse(self):
             )
             # doc is scanned if any page is scanned
             if page_scanned:
-                self.log.debug(
+                logger.debug(
                     "{} is scanned one of its pages is scanned.".format(self.pdf_file)
                 )
                 is_scanned = True
@@ -139,7 +140,7 @@ def get_candidates(self):
     def get_candidates_and_features(self):
         self.parse()
         if self.scanned:
-            self.log.info("{} is scanned.".format(self.pdf_file))
+            logger.info("{} is scanned.".format(self.pdf_file))
             return [], [], self.scanned
         for page_num in list(self.elems.keys()):
             page_boxes, page_features = self.get_candidates_and_features_page_num(
@@ -161,7 +162,7 @@ def get_candidates_and_features_page_num(self, page_num):
         alignments_bboxes, alignment_features = self.get_candidates_alignments(
             page_num, elems
         )
-        self.log.info(
+        logger.info(
             "Page Num: {}, Line bboxes: {}, Alignment bboxes: {}".format(
                 page_num, len(lines_bboxes), len(alignments_bboxes)
             )

diff --git a/pdftotree/ml/features.py b/pdftotree/ml/features.py
@@ -1,7 +1,7 @@
 import string
 from builtins import str
 from collections import defaultdict
-from typing import List
+from typing import Any, List
 
 from pdfminer.layout import LTTextLine
 
@@ -35,7 +35,15 @@ def get_height_coverage(bbox):
 # ******************* Text Coverage Features *************************************
 
 
-def get_mentions_within_bbox(bbox, mentions) -> List[LTTextLine]:
+def get_mentions_within_bbox(
+    bbox: List[Any], mentions: List[LTTextLine]
+) -> List[LTTextLine]:
+    """Get textlines within bbox.
+
+    :param bbox: a list whose last 4 elements are (top, left, bottom, right)
+    :param mentions: a list of textlines
+    :return: a list of textlines within the given bbox
+    """
     mentions_within_bbox = []
     for mention in mentions:
         bbox_mention = (

diff --git a/pdftotree/utils/pdf/grid.py b/pdftotree/utils/pdf/grid.py
@@ -16,6 +16,8 @@
 
 from pdftotree.utils.pdf.vector_utils import inside, reading_order
 
+logger = logging.getLogger(__name__)
+
 
 class Cell(object):
     """Represents a cell with no visual dividers inside"""
@@ -117,7 +119,6 @@ def get_normalized_grid(self):
         """
         Analyzes subcell structure
         """
-        log = logging.getLogger(__name__)
         # Resolve multirow mentions, TODO: validate against all PDFs
         #  subcol_count = 0
         mega_rows = []
@@ -127,12 +128,12 @@ def get_normalized_grid(self):
             for col_id, cell in enumerate(row):
                 # Keep cell text in reading order
                 cell.texts.sort(key=cmp_to_key(reading_order))
-                log.debug("=" * 50)
+                logger.debug("=" * 50)
                 for m in cell.texts:
                     subrow_across_cell[m.yc_grid].append(m)
                     #  prev = m
 
-            log.debug(pformat(dict(subrow_across_cell)))
+            logger.debug(pformat(dict(subrow_across_cell)))
 
             mega_rows.append(subrow_across_cell)