diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py index 7588337..9ae9ce3 100644 --- a/pdftotree/TreeExtract.py +++ b/pdftotree/TreeExtract.py @@ -2,7 +2,7 @@ import logging import os from functools import cmp_to_key -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple from xml.dom.minidom import Document, Element import numpy as np @@ -29,6 +29,8 @@ from pdftotree.utils.pdf.pdf_utils import CustomPDFPageAggregator, PDFElems from pdftotree.utils.pdf.vector_utils import column_order, reading_order +logger = logging.getLogger(__name__) + class TreeExtractor(object): """ @@ -36,7 +38,6 @@ class TreeExtractor(object): """ def __init__(self, pdf_file): - self.log = logging.getLogger(__name__) self.pdf_file = pdf_file self.elems: Dict[int, PDFElems] = {} # key represents page_num self.font_stats: Dict[int, Any] = {} # key represents page_num @@ -165,7 +166,7 @@ def get_candidates_and_features_page_num(self, page_num): boxes = alignments_bboxes if len(boxes) == 0: - self.log.info("No boxes were found on page {}.".format(page_num)) + logger.info("No boxes were found on page {}.".format(page_num)) return [], [] lines_features = get_lines_features(boxes, elems) @@ -197,7 +198,7 @@ def get_candidates_alignments(self, page_num, elems): try: nodes, features = parse_layout(elems, font_stat) except Exception as e: - self.log.exception(e) + logger.exception(e) nodes, features = [], [] return ( [ @@ -348,7 +349,7 @@ def get_word_boundaries( char_idx += 1 continue if word[len_idx] != mention_chars[char_idx][0]: - self.log.warning( + logger.warning( "Out of order ({}, {})".format(word, mention_chars[char_idx][0]) ) curr_word[1] = min(curr_word[1], mention_chars[char_idx][1]) @@ -402,42 +403,72 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element: word_element.appendChild(self.doc.createTextNode(text)) return element - def get_html_table(self, table, page_num) -> Element: - table_str = [str(i) for i in table] + def get_html_table(self, table: List[float], page_num) -> Optional[Element]: + """Recognize a table using tabula and return a DOM element. + + :param table: bbox for a table (top,left,bottom,right) + :param page_num: 1-based page number + :return: DOM element for a table + """ + logger.debug(f"Calling tabula at page: {page_num} and area: {table}.") table_json = tabula.read_pdf( - self.pdf_file, pages=page_num, area=table_str, output_format="json" + self.pdf_file, pages=page_num, area=table, output_format="json" ) - if len(table_json) > 0: - table_element = self.doc.createElement("table") - for i, row in enumerate(table_json[0]["data"]): - row_element = self.doc.createElement("tr") - table_element.appendChild(row_element) - for j, column in enumerate(row): - col_element = self.doc.createElement("td") - row_element.appendChild(col_element) - box = [ - column["top"], - column["left"], - column["top"] + column["height"], - column["left"] + column["width"], - ] - elems = get_mentions_within_bbox(box, self.elems[page_num].mentions) - elems.sort(key=cmp_to_key(reading_order)) - for elem in elems: - words = self.get_word_boundaries(elem) - for word in words: - top = int(word[1]) - left = int(word[2]) - bottom = int(word[3]) - right = int(word[4]) - # escape special HTML chars - text = html.escape(word[0]) - - word_element = self.doc.createElement("span") - col_element.appendChild(word_element) - word_element.setAttribute("class", "ocrx_word") - word_element.setAttribute( - "title", f"bbox {left} {top} {right} {bottom}" - ) - word_element.appendChild(self.doc.createTextNode(text)) + logger.debug(f"Tabula recognized {len(table_json)} table(s).") + if len(table_json) == 0: + return None + table_element = self.doc.createElement("table") + table_element.setAttribute("class", "ocr_table") + top = int(table_json[0]["top"]) + left = int(table_json[0]["left"]) + bottom = int(table_json[0]["bottom"]) + right = int(table_json[0]["right"]) + table_element.setAttribute("title", f"bbox {left} {top} {right} {bottom}") + for i, row in enumerate(table_json[0]["data"]): + row_element = self.doc.createElement("tr") + table_element.appendChild(row_element) + for j, cell in enumerate(row): + # It is not explicitly stated anywhere but tabula seems to use the cell + # bbox to represent that of cell itself rather than that of text inside. + # Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text inside. + box: List[float] = [ + cell["top"], + cell["left"], + cell["top"] + cell["height"], + cell["left"] + cell["width"], + ] + cell_element = self.doc.createElement("td") + row_element.appendChild(cell_element) + elems = get_mentions_within_bbox(box, self.elems[page_num].mentions) + if len(elems) == 0: + continue + cell_element.setAttribute( + "title", + f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}", + ) + elems.sort(key=cmp_to_key(reading_order)) + for elem in elems: + line_element = self.doc.createElement("span") + cell_element.appendChild(line_element) + line_element.setAttribute("class", "ocrx_line") + line_element.setAttribute( + "title", + " ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]), + ) + words = self.get_word_boundaries(elem) + for word in words: + top = int(word[1]) + left = int(word[2]) + bottom = int(word[3]) + right = int(word[4]) + # escape special HTML chars + text = html.escape(word[0]) + + word_element = self.doc.createElement("span") + line_element.appendChild(word_element) + word_element.setAttribute("class", "ocrx_word") + word_element.setAttribute( + "title", f"bbox {left} {top} {right} {bottom}" + ) + word_element.appendChild(self.doc.createTextNode(text)) return table_element diff --git a/pdftotree/core.py b/pdftotree/core.py index 29d77b7..77a3036 100644 --- a/pdftotree/core.py +++ b/pdftotree/core.py @@ -24,17 +24,18 @@ from pdftotree.TreeExtract import TreeExtractor from pdftotree.TreeVisualizer import TreeVisualizer +logger = logging.getLogger(__name__) + def load_model(model_type, model_path): - log = logging.getLogger(__name__) - log.info("Loading pretrained {} model for table detection".format(model_type)) + logger.info("Loading pretrained {} model for table detection".format(model_type)) if model_type == "ml": model = pickle.load(open(model_path, "rb")) else: from keras.models import load_model as load_vision_model model = load_vision_model(model_path) - log.info("Model loaded!") + logger.info("Model loaded!") return model @@ -51,20 +52,19 @@ def parse( model_path=None, visualize=False, ): - log = logging.getLogger(__name__) model = None if model_type is not None and model_path is not None: model = load_model(model_type, model_path) extractor = TreeExtractor(pdf_file) if extractor.is_scanned(): - log.warning("Document looks scanned, the result may be far from expected.") + logger.warning("Document looks scanned, the result may be far from expected.") else: - log.info("Digitized PDF detected, building tree structure...") + logger.info("Digitized PDF detected, building tree structure...") pdf_tree = extractor.get_tree_structure(model_type, model) - log.info("Tree structure built, creating html...") + logger.info("Tree structure built, creating html...") pdf_html = extractor.get_html_tree() - log.info("HTML created.") + logger.info("HTML created.") # TODO: what is the following substition for and is it required? # pdf_html = re.sub(r"[\x00-\x1F]+", "", pdf_html) diff --git a/pdftotree/ml/TableExtractML.py b/pdftotree/ml/TableExtractML.py index b9b6b53..57272e7 100644 --- a/pdftotree/ml/TableExtractML.py +++ b/pdftotree/ml/TableExtractML.py @@ -20,6 +20,8 @@ from pdftotree.utils.pdf.pdf_parsers import parse_layout from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf +logger = logging.getLogger(__name__) + class TableExtractorML(object): """ @@ -27,7 +29,6 @@ class TableExtractorML(object): """ def __init__(self, pdf_file): - self.log = logging.getLogger(__name__) self.pdf_file = pdf_file self.elems = {} self.font_stats = {} @@ -97,7 +98,7 @@ def parse(self): and round(fig.bbox[2]) == round(elems.layout.width) and round(fig.bbox[3]) == round(elems.layout.height) ): - self.log.debug( + logger.debug( "{} is scanned because of full-page figure.".format( self.pdf_file ) @@ -111,7 +112,7 @@ def parse(self): ) # doc is scanned if any page is scanned if page_scanned: - self.log.debug( + logger.debug( "{} is scanned one of its pages is scanned.".format(self.pdf_file) ) is_scanned = True @@ -139,7 +140,7 @@ def get_candidates(self): def get_candidates_and_features(self): self.parse() if self.scanned: - self.log.info("{} is scanned.".format(self.pdf_file)) + logger.info("{} is scanned.".format(self.pdf_file)) return [], [], self.scanned for page_num in list(self.elems.keys()): page_boxes, page_features = self.get_candidates_and_features_page_num( @@ -161,7 +162,7 @@ def get_candidates_and_features_page_num(self, page_num): alignments_bboxes, alignment_features = self.get_candidates_alignments( page_num, elems ) - self.log.info( + logger.info( "Page Num: {}, Line bboxes: {}, Alignment bboxes: {}".format( page_num, len(lines_bboxes), len(alignments_bboxes) ) diff --git a/pdftotree/ml/features.py b/pdftotree/ml/features.py index 5927457..78167db 100644 --- a/pdftotree/ml/features.py +++ b/pdftotree/ml/features.py @@ -1,7 +1,7 @@ import string from builtins import str from collections import defaultdict -from typing import List +from typing import Any, List from pdfminer.layout import LTTextLine @@ -35,7 +35,15 @@ def get_height_coverage(bbox): # ******************* Text Coverage Features ************************************* -def get_mentions_within_bbox(bbox, mentions) -> List[LTTextLine]: +def get_mentions_within_bbox( + bbox: List[Any], mentions: List[LTTextLine] +) -> List[LTTextLine]: + """Get textlines within bbox. + + :param bbox: a list containing (top, left, bottom, right) in the last 4 digits + :param mentions: a list of textlines + :return: a list of textlines within the given bbox + """ mentions_within_bbox = [] for mention in mentions: bbox_mention = ( diff --git a/pdftotree/utils/pdf/grid.py b/pdftotree/utils/pdf/grid.py index c2f18d9..6f2d257 100644 --- a/pdftotree/utils/pdf/grid.py +++ b/pdftotree/utils/pdf/grid.py @@ -16,6 +16,8 @@ from pdftotree.utils.pdf.vector_utils import inside, reading_order +logger = logging.getLogger(__name__) + class Cell(object): """Represents a cell with no visual dividers inside""" @@ -117,7 +119,6 @@ def get_normalized_grid(self): """ Analyzes subcell structure """ - log = logging.getLogger(__name__) # Resolve multirow mentions, TODO: validate against all PDFs # subcol_count = 0 mega_rows = [] @@ -127,12 +128,12 @@ def get_normalized_grid(self): for col_id, cell in enumerate(row): # Keep cell text in reading order cell.texts.sort(key=cmp_to_key(reading_order)) - log.debug("=" * 50) + logger.debug("=" * 50) for m in cell.texts: subrow_across_cell[m.yc_grid].append(m) # prev = m - log.debug(pformat(dict(subrow_across_cell))) + logger.debug(pformat(dict(subrow_across_cell))) mega_rows.append(subrow_across_cell) diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 7b4b912..1e4cd62 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -19,6 +19,8 @@ from pdftotree.utils.pdf.pdf_utils import PDFElems from pdftotree.utils.pdf.vector_utils import center, intersect, l1, xy_reading_order +logger = logging.getLogger(__name__) + def parse_layout(elems, font_stat, combine=False): """ @@ -75,7 +77,6 @@ def cluster_vertically_aligned_boxes( page_width, combine, ): - log = logging.getLogger(__name__) # Filter out boxes with zero width or height filtered_boxes = [] for bbox in boxes: @@ -85,10 +86,10 @@ def cluster_vertically_aligned_boxes( # Too many "." in the Table of Content pages if len(boxes) == 0: - log.warning("No boxes were found to cluster.") + logger.warning("No boxes were found to cluster.") return [], [] elif len(boxes) > 3500: - log.warning("Too many '.' in the Table of Content pages?") + logger.warning("Too many '.' in the Table of Content pages?") return [], [] plane = Plane(page_bbox) @@ -810,7 +811,6 @@ def extract_text_candidates( page_width, page_height, ) -> Tuple[Dict[str, List], bool]: - log = logging.getLogger(__name__) # Filter out boxes with zero width or height filtered_boxes = [] for bbox in boxes: @@ -1052,7 +1052,7 @@ def extract_text_candidates( min_y_page = min(min_y_page, box.bbox[1]) if page_num == -1: # handle title, authors and abstract here - log.error("TODO: no way to handle title authors abstract yet.") + logger.error("TODO: no way to handle title authors abstract yet.") else: # eliminate header, footer, page number # sort other text and classify as header/paragraph @@ -1180,7 +1180,6 @@ def extract_text_candidates( def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_height): - log = logging.getLogger(__name__) # Filter out boxes with zero width or height filtered_boxes = [] for bbox in boxes: @@ -1189,7 +1188,7 @@ def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_heig boxes = filtered_boxes if len(boxes) == 0: - log.warning("No boxes to get figures from on page {}.".format(page_num)) + logger.warning("No boxes to get figures from on page {}.".format(page_num)) return [] plane = Plane(page_bbox) @@ -1255,7 +1254,6 @@ def get_most_common_font_pts(mentions, font_stat): """ font_stat: Counter object of font sizes """ - log = logging.getLogger(__name__) try: # default min font size of 1 pt in case no font present most_common_font_size = font_stat.most_common(1)[0][0] @@ -1269,7 +1267,7 @@ def get_most_common_font_pts(mentions, font_stat): return height_sum / count except IndexError: - log.info("No text found on page. Default most_common_font_pts to 2.0") + logger.info("No text found on page. Default most_common_font_pts to 2.0") return 2.0 @@ -1284,7 +1282,6 @@ def get_page_width(boxes): def get_char_width(boxes: List[LTTextLine]) -> float: - log = logging.getLogger(__name__) box_len_sum = 0 num_char_sum = 0 for i, b in enumerate(boxes): @@ -1293,5 +1290,5 @@ def get_char_width(boxes: List[LTTextLine]) -> float: try: return box_len_sum / num_char_sum except ZeroDivisionError: - log.warning("No text found. Defaulting to char_width = 2.0.") + logger.warning("No text found. Defaulting to char_width = 2.0.") return 2.0 diff --git a/pdftotree/utils/pdf/render.py b/pdftotree/utils/pdf/render.py index cf806ef..d195f72 100644 --- a/pdftotree/utils/pdf/render.py +++ b/pdftotree/utils/pdf/render.py @@ -11,6 +11,8 @@ import numpy as np from pdf.vector_utils import x0, x1, y0, y1 +logger = logging.getLogger(__name__) + class Renderer(object): """ @@ -32,7 +34,6 @@ def __init__(self, elems, scaler=1): scaler so we can map original coordinates into the new grid map. """ - self.log = logging.getLogger(__name__) self.scaler = scaler layout = elems.layout width = int(np.ceil(scaler * layout.width)) @@ -41,7 +42,7 @@ def __init__(self, elems, scaler=1): self.grid = np.zeros((width, height), dtype=np.int8) # Estimates the grid size in megabytes - self.log.info(self.grid.nbytes / float(1048576)) + logger.info(self.grid.nbytes / float(1048576)) for line in elems.segments: if line.height < 0.1: # Horizontal lines self.draw_rect(line.bbox, self.horizontal_line) diff --git a/tests/test_basic.py b/tests/test_basic.py index 8f9fd0d..92c9463 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -10,6 +10,27 @@ import pdftotree +# Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check +def get_prop(node: Tag, name: str) -> Optional[str]: + title = node.get("title") + if not title: + return None + props = title.split(";") + for prop in props: + (key, args) = prop.split(None, 1) + if key == name: + return args + return None + + +# Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check +def get_bbox(node: Tag) -> box: + bbox = get_prop(node, "bbox") + if not bbox: + return None + return box(*[int(x) for x in bbox.split()]) + + def test_heuristic_completion(): """Simply test that parse runs to completion without errors.""" output = pdftotree.parse("tests/input/paleo.pdf") @@ -48,25 +69,6 @@ def test_looks_scanned(): assert len(soup.find_all(class_="ocrx_word")) >= 1000 assert len(soup.find_all("figure")) == 3 - # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check - def get_prop(node: Tag, name: str) -> Optional[str]: - title = node.get("title") - if not title: - return None - props = title.split(";") - for prop in props: - (key, args) = prop.split(None, 1) - if key == name: - return args - return None - - # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check - def get_bbox(node: Tag) -> box: - bbox = get_prop(node, "bbox") - if not bbox: - return None - return box(*[int(x) for x in bbox.split()]) - # Check if words are extracted even though they are overlapped by a figure (#77). page = soup.find(class_="ocr_page") # checking only 1st page is good enough. words = [get_bbox(word) for word in page.find_all(class_="ocrx_word")] @@ -74,10 +76,12 @@ def get_bbox(node: Tag) -> box: assert all([figure.contains(word) for word in words]) -def test_LTChar_under_LTFigure(): +def test_LTChar_under_LTFigure(tmp_path): """Test on a PDF where LTChar(s) are children of LTFigure.""" - output = pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf") - soup = BeautifulSoup(output, "lxml") + html_path = os.path.join(tmp_path, "paleo.html") + pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf", html_path) + with open(html_path) as f: + soup = BeautifulSoup(f, "lxml") line: Tag = soup.find(class_="ocrx_line") assert [word.text for word in line.find_all(class_="ocrx_word")] == [ "Small", @@ -87,8 +91,16 @@ def test_LTChar_under_LTFigure(): # The table in the 1st page should contain 18 columns page = soup.find(class_="ocr_page") - table = page.find("table") + table = page.find(class_="ocr_table") assert len(table.find("tr").find_all("td")) == 18 + assert get_bbox(table) is not None + + # Find a cell containing one or more of ocrx_word and check if it has bbox + cell = table.find(class_="ocrx_word").parent.parent + assert get_bbox(cell) is not None + + with Popen(["hocr-check", html_path], stderr=PIPE) as proc: + assert all([line.decode("utf-8").startswith("ok") for line in proc.stderr]) def test_ml_completion():