Add initial test/docs for format --text (#1235)

jsvine · Dec 16, 2024 · 69d010a
1 parent e0ee254
commit 69d010a
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ The output will be a CSV containing info about every character, line, and rectan
 
 | Argument | Description |
 |----------|-------------|
-|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
+|`--format [format]`| `csv`, `json`, or `text`. The `csv` and `json` formats return information about each object. Of those two, the `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes. The `text` option returns a plain-text representation of the PDF, using `Page.extract_text(layout=True)`.|
 |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
 |`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`, et cetera. Defaults to all available.|
 |`--laparams`| A JSON-formatted string (e.g., `'{"detect_vertical": true}'`) to pass to `pdfplumber.open(..., laparams=...)`.|

diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -292,6 +292,23 @@ def test_cli_csv_include(self):
 
         assert res.decode("utf-8").split("\r\n")[9] == ("char,1")
 
+    def test_cli_text(self):
+        path = os.path.join(HERE, "pdfs/scotus-transcript-p1.pdf")
+        res = run(
+            [
+                sys.executable,
+                "-m",
+                "pdfplumber.cli",
+                path,
+                "--format",
+                "text",
+            ]
+        )
+        # `--format text` output, decoded as UTF-8, must equal the stored reference text.
+        target_path = os.path.join(HERE, "comparisons/scotus-transcript-p1.txt")
+        with open(target_path, encoding="utf-8") as f:  # close handle; match decode
+            assert res.decode("utf-8") == f.read()
+
     def test_page_to_dict(self):
         x = self.pdf.pages[0].to_dict(object_types=["char"])
         assert len(x["chars"]) == len(self.pdf.pages[0].chars)