Extract table spanning multiple pages and columns into 1 markdown table #1019
Replies: 2 comments 6 replies
-
This is the PDF received Life_Cycle_Assessment_of_Cow_Tanned_Leather_Produc.pdf I see 2 tables in this PDF, the one that spans 3 pages has 10 columns. This "lines" in the tables seem to actually be many small lines that are "nearly parallel" bunched together so that they look like a single continuous line, which is somewhat awkward. I'm not sure if there is a simple way to parse this, but this was the first idea that came to mind:
Perhaps it's a useful start point to build upon. import itertools
import math
import pdfplumber
import re
from operator import itemgetter
from pprint import pp
TABLE_HEADER_RE = r"(?<=\n)(Table \d.*)\n(.+)"
def is_bold_text(chars):
return chars and all("bold" in char["fontname"].casefold() for char in chars)
def group_horizontal_lines(page, tolerance=1):
horizontal_lines = []
line_groups = pdfplumber.utils.cluster_objects(
page.horizontal_edges,
itemgetter("bottom"),
tolerance,
)
for lines in line_groups:
left = min(lines, key=itemgetter("x0"))["x0"]
right = max(lines, key=itemgetter("x1"))["x1"]
width = right - left
# multiple line fragments and "almost" full width of page
if len(lines) > 2 and page.width / width <= 1.5:
horizontal_lines.append(lines[0]["bottom"])
return sorted(horizontal_lines)
pdf = pdfplumber.open("Downloads/Life_Cycle_Assessment_of_Cow_Tanned_Leather_Produc.pdf")
db = {}
height = 0
for page_num, page in enumerate(pdf.pages):
horizontal_lines = group_horizontal_lines(page)
for top, bottom, in itertools.pairwise(horizontal_lines):
crop = page.crop((0, top, page.width, bottom))
db.setdefault("crops", []).append({
"content": crop,
"top": top + height,
"bottom": bottom + height
})
# Search for "Table N" strings
for lines in page.search(TABLE_HEADER_RE):
# Is "Table" Bold
if is_bold_text(lines["chars"][:5]):
name = lines["groups"][0]
header = re.escape(lines["groups"][1])
# is Header line all Bold
if is_bold_text(page.search(header)[0]["chars"]):
top = round(lines["top"]) + height
db.setdefault("tables", []).append({"name": name, "top": top})
height += page.height
pp(sorted(db["crops"] + db["tables"], key=itemgetter("top"))) With the data sorted, you can group them by name: [{'name': 'Table 1. Inputs and Outputs in the tanning industry', 'top': 2943},
{'content': <Page:4>, 'top': 2953.70001, 'bottom': 2963.5000099999997},
{'content': <Page:4>, 'top': 2963.5000099999997, 'bottom': 3286.17498},
{'content': <Page:5>, 'top': 3481.4300000000003, 'bottom': 4126.17498},
{'content': <Page:6>, 'top': 4323.43, 'bottom': 4471.25001},
{'content': <Page:6>, 'top': 4471.25001, 'bottom': 4572.27998},
{'name': 'Table 2. Summary of Life Cycle Impact Assessment UD. PKX',
'top': 4561},
{'content': <Page:6>, 'top': 4572.27998, 'bottom': 4581.87001},
{'content': <Page:6>, 'top': 4581.87001, 'bottom': 4827.54998}] The 2nd Page:6 entry is the section in between the 2 tables on the page and would need some futher processing to identify it as a false match. For extracting the tables, it looks like the text strategy is needed. (further tweaking may be needed, I didn't check all the data) import pandas as pd
ts = {"horizontal_strategy": "text", "vertical_strategy": "text"}
print(
pd.DataFrame(
db["crops"][1]["content"].extract_table(ts)
)
.head(10).to_markdown()
)
Some logic could also be added to extract the column names. |
Beta Was this translation helpful? Give feedback.
-
Taking another look at this, I think it is possible to make the approach a bit smarter. We can assume the first crop after the table name is the column header area, and we can attempt to extract the column names. For the "false positive" area in this specific PDF, we can filter it out based on "positional" information, but I'm assuming there is a better way to detect them in a more general case. import itertools
import pdfplumber
import re
import pandas as pd
from operator import itemgetter
TABLE_HEADER_RE = r"(?<=\n)(Table \d.*)\n(.+)"
def is_bold_text(chars):
return chars and all("bold" in char["fontname"].casefold() for char in chars)
def extract_columns(page, tolerance=10):
def column_names_to_patterns(columns):
columns = list(map(re.escape, columns))
num_cols = len(columns)
# is there an iterools method that simplifies this?
for n, col in enumerate(columns):
preceeding = ""
if n > 0:
preceeding = f"(?<={' '.join(columns[:n])} )"
following = "\s*$"
if n + 1 < num_cols:
following = f" {' '.join(columns[n + 1:])}"
yield f"{preceeding}({col})(?={following})"
char_groups = pdfplumber.utils.cluster_objects(
page.chars,
itemgetter("x0"),
tolerance
)
columns = [
"".join(char["text"] for char in chars).strip()
for chars in char_groups
]
# x0 assumes the columns are "left-aligned"
# could add param + logic to allow switching?
explicit_vertical_lines = [
page.search(pattern, return_chars=False)[0]["x0"]
for pattern in column_names_to_patterns(columns)
]
return {"names": columns, "explicit_vertical_lines": explicit_vertical_lines}
def merge_tables(result):
# remove sections in between tables
for n in range(len(result) - 1, -1, -1):
item = result[n]
if "name" in item:
table = item
table["columns"] = extract_columns(result[n + 1]["content"])
for prev in range(n - 1, -1, -1):
if "name" in result[prev]:
break
if result[prev]["bottom"] > table["top"]:
del result[prev]
# remove the column crops
for n in range(len(result) - 1, -1, -1):
item = result[n]
if "name" in item:
del result[n + 1]
merged_tables = {}
table = None
for item in result:
if "name" in item:
table = item
else:
crop = item["content"]
name = table["name"]
default = {"content": [], "columns": table["columns"]["names"]}
merged_tables.setdefault(name, default)
# use right-most char as position for last vertical line
explicit_vertical_lines = table["columns"]["explicit_vertical_lines"]
right = max(crop.chars, key=itemgetter("x1"))["x1"]
explicit_vertical_lines.append(right)
merged_tables[name]["content"].extend(
crop.extract_table({
"explicit_vertical_lines": explicit_vertical_lines,
"horizontal_strategy": "text",
})
)
return merged_tables
pdf = pdfplumber.open("Downloads/Life_Cycle_Assessment_of_Cow_Tanned_Leather_Produc.pdf")
db = {}
height = 0
for page in pdf.pages:
horizontal_lines = []
line_groups = pdfplumber.utils.cluster_objects(
page.horizontal_edges,
itemgetter("bottom"),
tolerance = 1
)
for lines in line_groups:
left = min(lines, key=itemgetter("x0"))["x0"]
right = max(lines, key=itemgetter("x1"))["x1"]
width = right - left
# multiple line fragments and "almost" full width of page
if len(lines) > 2 and page.width / width <= 1.5:
horizontal_lines.append(lines[0]["bottom"])
horizontal_lines = sorted(horizontal_lines)
for top, bottom, in itertools.pairwise(horizontal_lines):
crop = page.crop((0, top, page.width, bottom))
db.setdefault("crops", []).append({
"content": crop,
"top": top + height,
"bottom": bottom + height
})
# Search for "Table N" strings
for lines in page.search(TABLE_HEADER_RE):
# Is "Table" Bold
if is_bold_text(lines["chars"][:5]):
name = lines["groups"][0]
header = re.escape(lines["groups"][1])
# is Header line all Bold
if is_bold_text(page.search(header)[0]["chars"]):
top = round(lines["top"]) + height
db.setdefault("tables", []).append({
"name": name,
"top": top,
"page_number": page.page_number
})
height += page.height
result = sorted(db["crops"] + db["tables"], key=itemgetter("top"))
for name, table in merge_tables(result).items():
print(f"{name=}")
df = pd.DataFrame(table["content"], columns=table["columns"])
print(df.shape)
print(df.head().to_markdown())
print(df.tail().to_markdown()) Outputsname='Table 1. Inputs and Outputs in the tanning industry'
(140, 10)
name='Table 2. Summary of Life Cycle Impact Assessment UD. PKX'
(50, 4)
|
Beta Was this translation helpful? Give feedback.
-
Similiar to #1012 discussion, I have a specific PDF that contains a PDF that has a table of 2 columns spanning pages to be extracted out. The file https://drive.google.com/file/d/167Y6KKW5cv0-7r8FV830iofWvYZWP4b6/view?usp=sharing is an interesting one that when I use extract_table() it simply returns alot of None, can I run a customized script over it to convert it to markdown formatted table and mark it with page numbers?
Beta Was this translation helpful? Give feedback.
All reactions