Skip to content

Commit

Permalink
Revert "Feature: Warn on unicode decoding errors in PDF annotations"
Browse files Browse the repository at this point in the history
  • Loading branch information
jsvine authored Dec 9, 2024
1 parent 871770a commit fa923cb
Show file tree
Hide file tree
Showing 4 changed files with 1 addition and 39 deletions.
11 changes: 1 addition & 10 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
Union,
)
from unicodedata import normalize as normalize_unicode
from warnings import warn

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
Expand Down Expand Up @@ -307,15 +306,7 @@ def parse(annot: T_obj) -> T_obj:
try:
extras[k] = v.decode("utf-8")
except UnicodeDecodeError:
try:
extras[k] = v.decode("utf-16")
except UnicodeDecodeError:
if self.pdf.raise_unicode_errors:
raise
warn(
f"Could not decode {k} of annotation."
f" {k} will be missing."
)
extras[k] = v.decode("utf-16")

parsed = {
"page_number": self.page_number,
Expand Down
4 changes: 0 additions & 4 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def __init__(
password: Optional[str] = None,
strict_metadata: bool = False,
unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
raise_unicode_errors: bool = True,
):
self.stream = stream
self.stream_is_external = stream_is_external
Expand All @@ -44,7 +43,6 @@ def __init__(
self.laparams = None if laparams is None else LAParams(**laparams)
self.password = password
self.unicode_norm = unicode_norm
self.raise_unicode_errors = raise_unicode_errors

self.doc = PDFDocument(PDFParser(stream), password=password or "")
self.rsrcmgr = PDFResourceManager()
Expand Down Expand Up @@ -78,7 +76,6 @@ def open(
repair: bool = False,
gs_path: Optional[Union[str, pathlib.Path]] = None,
repair_setting: T_repair_setting = "default",
raise_unicode_errors: bool = True,
) -> "PDF":

stream: Union[BufferedReader, BytesIO]
Expand Down Expand Up @@ -110,7 +107,6 @@ def open(
strict_metadata=strict_metadata,
unicode_norm=unicode_norm,
stream_is_external=stream_is_external,
raise_unicode_errors=raise_unicode_errors,
)

except PSException:
Expand Down
Binary file removed tests/pdfs/annotations-unicode-issues.pdf
Binary file not shown.
25 changes: 0 additions & 25 deletions tests/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
resource = None
import unittest

import pytest

import pdfplumber

logging.disable(logging.ERROR)
Expand Down Expand Up @@ -334,26 +332,3 @@ def test_issue_1181(self):
["Bar10", "Bar11", "Bar12"],
["", "", ""],
]

def test_pr_1195(self):
"""
In certain scenarios, annotations may include invalid or extraneous
data that can obstruct the annotation processing workflow. To mitigate
this, the raise_unicode_errors parameter in the PDF initializer and the
.open() method provides a configurable option to bypass these errors
and generate warnings instead, ensuring smoother handling of such
anomalies.
The following test verifies the functionality of the
raise_unicode_errors parameter.
"""
path = os.path.join(HERE, "pdfs/annotations-unicode-issues.pdf")
with pdfplumber.open(path) as pdf, pytest.raises(UnicodeDecodeError):
for _ in pdf.annots:
pass

with pdfplumber.open(path, raise_unicode_errors=False) as pdf, pytest.warns(
UserWarning
):
for _ in pdf.annots:
pass

0 comments on commit fa923cb

Please sign in to comment.