jsvine · ennamarie19 · Jan 5, 2025 · Jan 18, 2025 · Jan 18, 2025 · Jan 18, 2025
diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
@@ -0,0 +1,41 @@
+name: Sync Fork with Upstream
+
+on:
+  schedule:
+    - cron: '0 0 * * *' # Run daily at midnight UTC
+  workflow_dispatch: # Allow manual triggering
+
+# The final step pushes to the fork, which requires write access to contents.
+permissions:
+  contents: write
+
+jobs:
+  sync:
+    # Syncing only makes sense in a fork; skip runs in the upstream repository.
+    if: github.repository != 'jsvine/pdfplumber'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Fork
+        uses: actions/checkout@v4
+        with:
+          ref: stable
+          # Fetch full history so the merge below can find a common ancestor.
+          fetch-depth: 0
+          # Credentials stay persisted (the action's default): the later
+          # `git push` relies on them, since git does not read GITHUB_TOKEN
+          # from the environment.
+
+      - name: Set Upstream
+        run: |
+          # A non-fast-forward merge creates a commit, which needs an identity.
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git remote add upstream https://github.com/jsvine/pdfplumber
+          git fetch upstream
+          git checkout stable
+          git merge upstream/stable --no-edit --allow-unrelated-histories
+          git status
+
+      - name: Push Changes to Fork
+        run: |
+          git push origin stable
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
 - Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
 - Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
 - Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
+- Add necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1245](https://github.com/jsvine/pdfplumber/pull/1245))
 
 ### Fixed
 

diff --git a/README.md b/README.md
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [@wodny](https://github.com/wodny)
 - [Michal Stolarczyk](https://github.com/stolarczyk)
 - [Brandon Roberts](https://github.com/brandonrobertz)
+- [@ennamarie19](https://github.com/ennamarie19/)
 
 ## Contributing
 

diff --git a/fuzz/build.sh b/fuzz/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash -eu
+#
+# Build script for the OSS-Fuzz integration: installs pdfplumber, compiles each
+# fuzz harness, and zips the repository's PDFs into a seed corpus.
+# NOTE(review): $SRC, $OUT and compile_python_fuzzer are not defined here;
+# presumably the OSS-Fuzz build environment provides them - confirm.
+
+cd "$SRC"/pdfplumber
+pip3 install .
+
+# Build fuzzers in $OUT. Paths are read NUL-delimited so that file names
+# containing whitespace are not split into several words; the list arrives on
+# descriptor 3 so the compile step cannot swallow it by reading stdin.
+while IFS= read -r -d '' fuzzer <&3; do
+  compile_python_fuzzer "$fuzzer"
+done 3< <(find fuzz -name '*_fuzzer.py' -print0)
+
+# Seed corpus: every PDF in the repository, flattened into fuzz/corpus. The
+# corpus directory itself is excluded so files are never copied onto themselves.
+mkdir -p fuzz/corpus
+find . -name "*.pdf" -not -path "./fuzz/corpus/*" -exec cp "{}" fuzz/corpus \;
+zip -q "$OUT"/pdf_load_fuzzer_seed_corpus.zip fuzz/corpus/*
diff --git a/fuzz/fuzz_helpers.py b/fuzz/fuzz_helpers.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python3
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+"""Convenience extensions to ``atheris.FuzzedDataProvider`` for fuzz harnesses."""
+import contextlib
+import io
+import tempfile
+from enum import IntEnum
+from typing import Iterator, Protocol, Type, TypeVar, Union
+
+import atheris
+
+
+class HasMax(Protocol):
+    """Structural type for objects exposing an integer ``MAX`` attribute."""
+
+    MAX: int
+
+
+# Type variable for the enum classes accepted by ``ConsumeEnum``.
+T = TypeVar("T", bound=IntEnum)
+
+
+class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
+    """``FuzzedDataProvider`` with extra consumers for blobs, files and enums."""
+
+    def ConsumeRandomBytes(self) -> bytes:
+        """Consume a fuzzer-chosen number of bytes (possibly zero)."""
+        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRandomString(self) -> str:
+        """Consume a surrogate-free string of fuzzer-chosen requested length."""
+        return self.ConsumeUnicodeNoSurrogates(
+            self.ConsumeIntInRange(0, self.remaining_bytes())
+        )
+
+    def ConsumeRemainingString(self) -> str:
+        """Consume the remaining input as a surrogate-free string."""
+        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        """Consume the remaining input as bytes."""
+        return self.ConsumeBytes(self.remaining_bytes())
+
+    def _ConsumeFileData(self, all_data: bool, as_bytes: bool) -> Union[bytes, str]:
+        """Consume the payload shared by the two file-producing helpers."""
+        if all_data:
+            if as_bytes:
+                return self.ConsumeRemainingBytes()
+            return self.ConsumeRemainingString()
+        if as_bytes:
+            return self.ConsumeRandomBytes()
+        return self.ConsumeRandomString()
+
+    @contextlib.contextmanager
+    def ConsumeMemoryFile(
+        self, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[Union[io.BytesIO, io.StringIO]]:
+        """Yield an in-memory file holding fuzzer-provided data.
+
+        Args:
+            all_data: Consume all remaining input when true; otherwise a
+                fuzzer-chosen amount.
+            as_bytes: Yield an ``io.BytesIO`` when true, else an ``io.StringIO``.
+
+        The file is closed when the ``with`` block exits, even if it raises.
+        """
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)
+        # Entering the file as a context manager closes it on every exit path,
+        # including an exception thrown into the generator at the yield.
+        with file:
+            yield file
+
+    @contextlib.contextmanager
+    def ConsumeTemporaryFile(
+        self, suffix: str, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[str]:
+        """Yield the path of a temporary file holding fuzzer-provided data.
+
+        Args:
+            suffix: File-name suffix passed to ``NamedTemporaryFile``.
+            all_data: Consume all remaining input when true; otherwise a
+                fuzzer-chosen amount.
+            as_bytes: Write bytes (mode ``w+b``) when true, else text (``w+``).
+
+        The file is closed, and thereby deleted, when the ``with`` block exits,
+        even if it raises.
+        """
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        mode = "w+b" if as_bytes else "w+"
+        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
+            tfile.write(file_data)
+            tfile.seek(0)
+            # Flush so the data is visible to code that opens the path.
+            tfile.flush()
+            yield tfile.name
+
+    def ConsumeEnum(self, enum_type: Type[T]) -> T:
+        """Return the ``enum_type`` member for a fuzzer-chosen integer.
+
+        The integer is drawn from ``0`` to ``enum_type.MAX`` inclusive, so
+        ``enum_type`` must define ``MAX``, and the ``MAX`` member can be returned.
+        """
+        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))
diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py
@@ -0,0 +1,79 @@
+"""Atheris fuzz harness: open fuzzer-provided bytes with pdfplumber and export."""
+import sys
+from enum import IntEnum
+
+import atheris
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports(include=["pdfplumber"]):
+    from pdfminer.pdftypes import PDFException
+    from pdfminer.psparser import PSException
+
+    import pdfplumber
+
+
+class CastType(IntEnum):
+    """Export path to exercise after the PDF has been opened."""
+
+    CSV = 0
+    IMAGE = 1
+    JSON = 2
+    DICT = 3
+    # Passed to ``ConsumeEnum`` as ``CastType.MAX``; no export branch handles it.
+    MAX = 4
+
+
+def TestOneInput(data: bytes):
+    """Open fuzzer input as a PDF and run one fuzzer-selected export.
+
+    Args:
+        data: Raw input supplied by the fuzzing engine.
+
+    Returns:
+        ``-1`` when processing fails with one of the exceptions this harness
+        treats as expected for malformed input, ``None`` otherwise.
+
+    Raises:
+        ValueError, TypeError: When the message is not one of the messages
+            matched below. Any other uncaught exception also propagates.
+    """
+    fdp = EnhancedFuzzedDataProvider(data)
+
+    try:
+        with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f:
+            # Entering the PDF as a context manager closes it even when an
+            # export raises.
+            with pdfplumber.open(f) as pdf:
+                # Test casting
+                cast_ty = fdp.ConsumeEnum(CastType)
+
+                if cast_ty is CastType.CSV:
+                    pdf.to_csv()
+                elif cast_ty is CastType.IMAGE and pdf.pages:
+                    pdf.pages[0].to_image()
+                elif cast_ty is CastType.JSON:
+                    pdf.to_json()
+                elif cast_ty is CastType.DICT:
+                    pdf.to_dict()
+
+    except (PDFException, PSException, AssertionError):
+        return -1
+    except ValueError as e:
+        if "invalid literal for int" in str(e):
+            return -1
+        # Bare ``raise`` re-raises the active exception unchanged.
+        raise
+    except TypeError as e:
+        if "argument must be a string" in str(e):
+            return -1
+        raise
+
+
+def main():
+    """Register ``TestOneInput`` with Atheris and start fuzzing."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()