Skip to content

Commit

Permalink
adapted code to the latest version of PyPDF2 (>= 3.0.0)
Browse files Browse the repository at this point in the history
fixes the following errors with PyPDF2 >= 3.0.0:
- failed to process samples/scans/007-captain-future-galaxy-drift-1.desc (PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.)
- failed to process samples/scans/007-captain-future-galaxy-drift-1.desc (reader.numPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead.)
- failed to process samples/scans/007-captain-future-galaxy-drift-1.desc (reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead.)
- failed to process samples/scans/007-captain-future-galaxy-drift-1.desc (PdfFileWriter is deprecated and was removed in PyPDF2 3.0.0. Use PdfWriter instead.)
- failed to process samples/scans/007-captain-future-galaxy-drift-1.desc (addPage is deprecated and was removed in PyPDF2 3.0.0. Use add_page instead.)
- PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.
- outlines is deprecated and was removed in PyPDF2 3.0.0. Use outline instead.
- 'PdfReader' object has no attribute 'resolvedObjects'
- module 'PyPDF2' has no attribute 'pdf'
  • Loading branch information
g-raffy committed Nov 7, 2023
1 parent 6cf46a3 commit ab94c5f
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 53 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
PyPDF2
PyPDF2 >= 3.0.0 # the syntax has changed between PyPDF2 2.x and PyPDF2 3.x
pillow
opencv-python
20 changes: 11 additions & 9 deletions src/pymusco/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
https://automatetheboringstuff.com/chapter13/
https://github.com/RussellLuo/pdfbookmarker/blob/master/add_bookmarks.py
"""
from typing import Dict, Any
import abc
# sudo port install py27-pypdf2
import PyPDF2
# from PyPDF2 import PdfFileMerger, PdfFileReader
# from PyPDF2 import PdfFileMerger, PdfReader
from PIL import Image
import json

Expand Down Expand Up @@ -516,16 +517,16 @@ def get_stub_tracks(src_stub_file_path, orchestra):
table_of_contents : TableOfContents
"""
# note: this function was implemented using trial & error. There must be cleaner and easier ways to do this.
def find_page_number(page_contents_id, pdf_reader):
def find_page_number(page_contents_id: int, pdf_reader: PyPDF2.PdfReader):
"""Finds in the input pdf the page number for the given page contents id
:param int page_contents_id:
:param PyPDF2.PdfFileReader pdf_reader: the input pdf file
:param PyPDF2.PdfReader pdf_reader: the input pdf file
"""
# print('looking for page with id %d' % page_contents_id)
for page_index in range(len(pdf_reader.pages)):
page_object = pdf_reader.pages[page_index]
assert isinstance(page_object, PyPDF2.pdf.PageObject)
assert isinstance(page_object, PyPDF2._page.PageObject)
# at this point, a page_object of the table of contents (with 23 links) looks like :
# {'/Contents': IndirectObject(196, 0), '/Parent': IndirectObject(203, 0), '/Type': '/Page', '/Resources': IndirectObject(195, 0), '/MediaBox': [0, 0, 612, 792], '/Annots': [IndirectObject(171, 0), IndirectObject(172, 0), IndirectObject(173, 0), IndirectObject(174, 0), IndirectObject(175, 0), IndirectObject(176, 0), IndirectObject(177, 0), IndirectObject(178, 0), IndirectObject(179, 0), IndirectObject(180, 0), IndirectObject(181, 0), IndirectObject(182, 0), IndirectObject(183, 0), IndirectObject(184, 0), IndirectObject(185, 0), IndirectObject(186, 0), IndirectObject(187, 0), IndirectObject(188, 0), IndirectObject(189, 0), IndirectObject(190, 0), IndirectObject(191, 0), IndirectObject(192, 0), IndirectObject(193, 0)]}
# while a normal page_object looks like
Expand All @@ -540,7 +541,7 @@ def find_page_number(page_contents_id, pdf_reader):

assert False, "failed to find in the given input pdf file, a page whose contents id is %s" % page_contents_id

def get_pdf_toc_item_page(pdf_toc_item, pdf_reader):
def get_pdf_toc_item_page(pdf_toc_item: Dict[str, Any], pdf_reader: PyPDF2.PdfReader) -> int:
"""
:param dict(str, object) pdf_toc_item: an item of the pdf table of contents, such as
{
Expand All @@ -551,7 +552,7 @@ def get_pdf_toc_item_page(pdf_toc_item, pdf_reader):
'/Zoom': <PyPDF2.generic.NullObject object at 0x1110b1a90>,
'/Page': IndirectObject(228, 0)
}
:param PyPDF2.PdfFileReader pdf_reader:
:param PyPDF2.PdfReader pdf_reader:
:return int: the page number which is the target of the given pdf toc item.
"""
Expand All @@ -563,7 +564,8 @@ def get_pdf_toc_item_page(pdf_toc_item, pdf_reader):

# at this point, linked_page_indirect_object is of type PyPDF2.generic.IndirectObject, with a value such as:
# IndirectObject(228, 0)
linked_page_object = pdf_reader.resolvedObjects[(0, linked_page_indirect_object.idnum)]
print(dir(pdf_reader))
linked_page_object = pdf_reader.resolved_objects[(0, linked_page_indirect_object.idnum)]
# at this point, linked_page_object is of type PyPDF2.generic.DictionaryObject with a value such as :
# {
# '/Contents': IndirectObject(229, 0),
Expand All @@ -576,8 +578,8 @@ def get_pdf_toc_item_page(pdf_toc_item, pdf_reader):
return find_page_number(linked_page_content_id, pdf_reader)

with open(src_stub_file_path, 'rb') as stub_file:
reader = PyPDF2.PdfFileReader(stub_file)
pdf_toc = reader.outlines
reader = PyPDF2.PdfReader(stub_file)
pdf_toc = reader.outline
# pdf_toc is a list of toc items like the following example :
# [
# {'/Title': u'c piccolo', '/Left': 155.354, '/Type': '/XYZ', '/Top': 669.191, '/Zoom': <PyPDF2.generic.NullObject object at 0x1110b1a90>, '/Page': IndirectObject(29, 0)},
Expand Down
42 changes: 21 additions & 21 deletions src/pymusco/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,12 +338,12 @@ def scan_to_stub(src_scanned_pdf_file_path, dst_stub_pdf_file_path, toc, title,

scanned_image_file_paths = []
with open(src_scanned_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
# pdfReader.numPages
# 19
for page_index in range(pdf_reader.numPages):
for page_index in range(len(pdf_reader.pages)):
print('page_index = %d' % page_index)
page = pdf_reader.getPage(page_index)
page = pdf_reader.pages[page_index]
# image_file_path = extract_pdf_page_main_image(page, image_dir=tmp_dir, image_name=('page%03d' % page_index))
image_file_path = extract_pdf_page(page, image_dir=tmp_dir, image_name=('page%03d' % page_index))

Expand All @@ -370,10 +370,10 @@ def stub_to_print(src_stub_file_path, dst_print_file_path, track_selector, orche
dst_print_file_path.parent.mkdir(parents=True, exist_ok=True)

with open(dst_print_file_path, 'wb') as print_file, open(dst_print_file_path.with_suffix('.log'), 'wt') as log_file:
print_pdf = PyPDF2.PdfFileWriter()
print_pdf = PyPDF2.PdfWriter()
log_file.write("contents of print file %s :\n\n" % dst_print_file_path)
with open(src_stub_file_path, 'rb') as stub_file:
stub_pdf = PyPDF2.PdfFileReader(stub_file)
stub_pdf = PyPDF2.PdfReader(stub_file)

sorted_tracks = [Track(track_id, orchestra) for track_id in track_to_print_count.keys()]
sorted_tracks.sort()
Expand All @@ -386,10 +386,10 @@ def stub_to_print(src_stub_file_path, dst_print_file_path, track_selector, orche
num_copies = track_to_print_count[track.id]
if num_copies > 0:
first_page_index = stub_toc.get_tracks_first_page_index([track])
last_page_index = stub_toc.get_tracks_last_page_index([track], stub_pdf.getNumPages())
last_page_index = stub_toc.get_tracks_last_page_index([track], len(stub_pdf.pages))
print('adding %d copies of %s (pages %d-%d)' % (num_copies, track.id, first_page_index, last_page_index))
assert first_page_index <= last_page_index
assert last_page_index <= stub_pdf.getNumPages()
assert last_page_index <= len(stub_pdf.pages)
page_range = (first_page_index, last_page_index)
if page_range in ranges:
# this page range has already been encountered. This can happen when multiple tracks share the same pages (e.g. crash cymbals are on the same pages as suspended cymbal)
Expand All @@ -415,9 +415,9 @@ def stub_to_print(src_stub_file_path, dst_print_file_path, track_selector, orche
# print(page_range, num_copies)
for copy_index in range(num_copies): # @UnusedVariable pylint: disable=unused-variable
for page_index in range(first_page_index, last_page_index + 1):
track_page = stub_pdf.getPage(page_index - 1) # -1 to convert 1-based index into 0-based index
track_page = stub_pdf.pages[page_index - 1] # -1 to convert 1-based index into 0-based index
# print('adding page %d' % page_index)
print_pdf.addPage(track_page)
print_pdf.add_page(track_page)

log_file.write("\nunprinted tracks :\n\n")
for label in stub_toc.get_track_ids():
Expand All @@ -443,10 +443,10 @@ def split_double_pages(src_scanned_pdf_file_path, dst_scanned_pdf_file_path, spl
tmp_dir.mkdir(parents=True, exist_ok=True)
scanned_image_file_paths = []
with open(src_scanned_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
for page_index in range(pdf_reader.numPages):
print('page_index = %d' % page_index)
double_page = pdf_reader.getPage(page_index)
double_page = pdf_reader.pages[page_index]
image_name = ('page%03d' % page_index)
double_image_file_path = extract_pdf_page_main_image(double_page, image_dir=tmp_dir, image_name=image_name)
double_png_file_path = "%s.png" % double_image_file_path
Expand Down Expand Up @@ -492,10 +492,10 @@ def crop_pdf(src_scanned_pdf_file_path, dst_scanned_pdf_file_path, x_scale, y_sc
tmp_dir.mkdir(parents=True, exist_ok=True)
scanned_image_file_paths = []
with open(src_scanned_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
for page_index in range(pdf_reader.numPages):
print('page_index = %d' % page_index)
page = pdf_reader.getPage(page_index)
page = pdf_reader.pages[page_index]
image_name = ('page%03d' % page_index)
image_file_path = extract_pdf_page_main_image(page, image_dir=tmp_dir, image_name=image_name)
png_file_path = "%s.png" % image_file_path
Expand All @@ -519,8 +519,8 @@ def crop_pdf(src_scanned_pdf_file_path, dst_scanned_pdf_file_path, x_scale, y_sc
def pdf_is_readable_by_pypdf2(src_pdf_path):
with open(src_pdf_path, 'rb') as src_pdf_file:
try:
src_pdf = PyPDF2.PdfFileReader(src_pdf_file)
num_pages = src_pdf.getNumPages() # noqa:F841
src_pdf = PyPDF2.PdfReader(src_pdf_file)
num_pages = len(src_pdf.pages) # noqa:F841
return True
except NotImplementedError as error:
if error.message == "only algorithm code 1 and 2 are supported":
Expand All @@ -538,7 +538,7 @@ def merge_pdf(dst_pdf_path, src_pdf_paths):
dst_pdf_path.parent.mkdir(parents=True, exist_ok=True)

with open(dst_pdf_path, 'wb') as dst_pdf_file:
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf = PyPDF2.PdfWriter()
for src_pdf_path in src_pdf_paths:
print(src_pdf_path)
if not pdf_is_readable_by_pypdf2(src_pdf_path):
Expand All @@ -547,10 +547,10 @@ def merge_pdf(dst_pdf_path, src_pdf_paths):
remove_unneeded_pdf_password(src_pdf_path, fixed_pdf_path)
src_pdf_path = fixed_pdf_path
with open(src_pdf_path, 'rb') as src_pdf_file:
src_pdf = PyPDF2.PdfFileReader(src_pdf_file)
for page_index in range(src_pdf.getNumPages()):
src_page = src_pdf.getPage(page_index)
dst_pdf.addPage(src_page)
src_pdf = PyPDF2.PdfReader(src_pdf_file)
for page_index in range(len(src_pdf.pages)):
src_page = src_pdf.pages[page_index]
dst_pdf.add_page(src_page)
dst_pdf.write(dst_pdf_file)


Expand All @@ -569,7 +569,7 @@ def remove_unneeded_pdf_password(src_pdf_path, dst_pdf_path):
You did not supply this password. Please respect any copyright.
This causes pypdf2 to fail retrieving the number of pages :
File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 1147, in getNumPages
File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 1147, in pages
self.decrypt('')
File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 1987, in decrypt
return self._decrypt(password)
Expand Down
30 changes: 15 additions & 15 deletions src/pymusco/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,8 @@ def extract_pdf_page_main_image(pdf_page, image_dir, image_name):
# this page doesn't contain a raster image, so we keep it in its original vectorial form
saved_image_file_path = "%s/%s.pdf" % (image_dir, image_name)
with open(saved_image_file_path, 'wb') as pdf_file:
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf.addPage(pdf_page)
dst_pdf = PyPDF2.PdfWriter()
dst_pdf.add_page(pdf_page)
dst_pdf.write(pdf_file)
return saved_image_file_path

Expand All @@ -233,8 +233,8 @@ def extract_pdf_page(pdf_page, image_dir, image_name):
"""
saved_image_file_path = "%s/%s.pdf" % (image_dir, image_name)
with open(saved_image_file_path, 'wb') as pdf_file:
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf.addPage(pdf_page)
dst_pdf = PyPDF2.PdfWriter()
dst_pdf.add_page(pdf_page)
dst_pdf.write(pdf_file)
return saved_image_file_path

Expand All @@ -259,8 +259,8 @@ def pdf_page_to_png(pdf_page, resolution=72):
"""
:param pdf_page:
"""
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf.addPage(pdf_page)
dst_pdf = PyPDF2.PdfWriter()
dst_pdf.add_page(pdf_page)

tmp_dir = Path('/tmp/pymusco')
tmp_dir.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -294,7 +294,7 @@ def add_bookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename=None):
pdf_out.append(inputStream, import_bookmarks=False)

# copy/preserve existing metainfo
pdf_in = PyPDF2.PdfFileReader(pdf_in_filename)
pdf_in = PyPDF2.PdfReader(pdf_in_filename)
metaInfo = pdf_in.getDocumentInfo()
if metaInfo:
pdf_out.addMetadata(metaInfo)
Expand Down Expand Up @@ -325,8 +325,8 @@ def add_stamp(src_pdf_file_path, dst_pdf_file_path, stamp_file_path, scale=1.0,
:param str stamp_file_path: location of the pdf file containing the stamp used
"""
pdf_watermark_reader = PyPDF2.PdfFileReader(open(stamp_file_path, 'rb'))
watermark = pdf_watermark_reader.getPage(0)
pdf_watermark_reader = PyPDF2.PdfReader(open(stamp_file_path, 'rb'))
watermark = pdf_watermark_reader.pages[0]

use_tmp_output_file = False
if dst_pdf_file_path == src_pdf_file_path:
Expand All @@ -336,18 +336,18 @@ def add_stamp(src_pdf_file_path, dst_pdf_file_path, stamp_file_path, scale=1.0,
else:
tmp_dst_pdf_file_path = dst_pdf_file_path

pdf_writer = PyPDF2.PdfFileWriter()
pdf_writer = PyPDF2.PdfWriter()
with open(src_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
# pdfReader.numPages
# 19
for page_index in range(pdf_reader.numPages):
page = pdf_reader.getPage(page_index)
page = pdf_reader.pages[page_index]
# page.mergePage(watermark)
page.mergeScaledTranslatedPage(watermark, scale=scale, tx=tx, ty=ty)
# pdf_writer.addBookmark(title='toto %s' % page_index, pagenum=page_index, parent=None, color=None, bold=False, italic=False, fit='/Fit')

pdf_writer.addPage(page)
pdf_writer.add_page(page)
# pdf_writer.addBookmark('Hello, World Bookmark', 0, parent=None)
# pdf_writer.addBookmark(title='toto', pagenum=2, parent=None, color=None, bold=False, italic=False, fit='/Fit')
# pdf_writer.setPageMode("/UseOutlines")
Expand All @@ -368,10 +368,10 @@ def check_pdf(src_pdf_file_path):
please note that not all malformations are detected yet
"""
with open(src_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
for page_index in range(pdf_reader.numPages):
print('page_index = %d' % page_index)
pdf_page = pdf_reader.getPage(page_index)
pdf_page = pdf_reader.pages[page_index]
if '/XObject' in pdf_page['/Resources']:
xObject = pdf_page['/Resources']['/XObject'].getObject()
for obj in xObject:
Expand Down
4 changes: 2 additions & 2 deletions src/pymusco/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ def extract_pdf_text(src_pdf_file_path):
os.environ["TESSDATA_PREFIX"] = '/opt/local/share' # this is required otherwise tesseract complains about file permissions

with open(src_pdf_file_path, 'rb') as src_pdf_file:
pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
pdf_reader = PyPDF2.PdfReader(src_pdf_file)
# pdfReader.numPages
# 19
for page_index in range(pdf_reader.numPages):
page = pdf_reader.getPage(page_index)
page = pdf_reader.pages[page_index]

image = pdf_page_to_png(page, resolution=72)
# extract_pdf_page_images(page)
Expand Down
10 changes: 5 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""
# sudo port install py27-pypdf2
import PyPDF2
from PyPDF2 import PdfFileMerger, PdfFileReader
from PyPDF2 import PdfFileMerger, PdfReader

import tesseract

Expand Down Expand Up @@ -39,7 +39,7 @@ def process_neonlight_serenade(src_pdf_file_path, dst_pdf_file_path):
pymusco.add_stamp(src_pdf_file_path, tmp_pdf_file_path, os.getenv('HOME')+'/data/Perso/MeltingNotes_work.git/partitions/mno-stamp.pdf')
#https://github.com/RussellLuo/pdfbookmarker/blob/master/add_bookmarks.py

# print(reader.outlines)
# print(reader.outline)
# [{'/Title': '1 Introduction', '/Left': 99.213, '/Type': '/XYZ', '/Top': 742.911, '/Zoom': ..., '/Page': IndirectObject(513, 0)},
# {'/Title': '2 Convolutional Neural Networks', '/Left': 99.213, '/Type': '/XYZ', '/Top': 742.911, '/Zoom': ..., '/Page': IndirectObject(554, 0)}, [{'/Title': '2.1 Linear Image Filters', '/Left': 99.213, '/Type': '/XYZ', '/Top': 486.791, '/Zoom': ..., '/Page': IndirectObject(554, 0)},
# {'/Title': '2.2 CNN Layer Types', '/Left': 70.866, '/Type': '/XYZ', '/Top': 316.852, '/Zoom': ..., '/Page': IndirectObject(580, 0)},
Expand Down Expand Up @@ -115,9 +115,9 @@ def process_neonlight_serenade(src_pdf_file_path, dst_pdf_file_path):


def test(src_pdf_file_path, dst_pdf_file_path):
output = PyPDF2.PdfFileWriter() # open output
input = PyPDF2.PdfFileReader(open(src_pdf_file_path, 'rb')) # open input
output.addPage(input.getPage(0)) # insert page
output = PyPDF2.PdfWriter() # open output
input = PyPDF2.PdfReader(open(src_pdf_file_path, 'rb')) # open input
output.add_page(input.pages[0]) # insert page
output.addBookmark('Hello, World Bookmark', 0, parent=None) # add bookmark
outputStream = open(dst_pdf_file_path,'wb') #creating result pdf JCT
output.write(outputStream) #writing to result pdf JCT
Expand Down

0 comments on commit ab94c5f

Please sign in to comment.