This repository has been archived by the owner on Jul 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
121 lines (80 loc) · 3.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
import requests
from xml.dom.minidom import parseString
from io import BytesIO, SEEK_SET, SEEK_END
import PyPDF2
# File-like wrapper that buffers a response's byte chunks in a BytesIO so the PDF can be read and seeked
# TBH I stole this one from somewhere and I have no idea how it works
class ResponseStream(object):
    """Seekable, read-only file-like object over an iterator of byte chunks.

    Chunks are pulled from the iterator only when a read or seek needs them
    and are appended to an in-memory ``BytesIO`` buffer, so data that has
    already been consumed can be revisited with ``seek``.
    """

    def __init__(self, request_iterator):
        """Wrap *request_iterator*, an iterable yielding ``bytes`` chunks."""
        self._bytes = BytesIO()
        # iter() lets plain iterables (e.g. a list of chunks) work with next().
        self._iterator = iter(request_iterator)

    def _load_all(self):
        """Append every remaining chunk of the iterator to the buffer."""
        self._bytes.seek(0, SEEK_END)
        for chunk in self._iterator:
            self._bytes.write(chunk)

    def _load_until(self, goal_position):
        """Append chunks until the buffer holds at least *goal_position* bytes.

        Stops early, without error, when the iterator is exhausted.
        """
        current_position = self._bytes.seek(0, SEEK_END)
        while current_position < goal_position:
            try:
                # write() returns the number of bytes written, not the new
                # offset, so it must be accumulated onto the running total.
                current_position += self._bytes.write(next(self._iterator))
            except StopIteration:
                break

    def tell(self):
        """Return the current read position within the buffer."""
        return self._bytes.tell()

    def read(self, size=None):
        """Read up to *size* bytes from the current position.

        With ``size`` of ``None`` or a negative value, everything up to the
        end of the stream is returned.
        """
        left_off_at = self._bytes.tell()
        if size is None or size < 0:
            self._load_all()
        else:
            self._load_until(left_off_at + size)
        # Loading moves the cursor to the end of the buffer; restore it.
        self._bytes.seek(left_off_at)
        return self._bytes.read(size)

    def seek(self, position, whence=SEEK_SET):
        """Move the read position and return the new absolute offset.

        Seeking relative to the end first loads the whole stream, because the
        end offset is unknown until the iterator is exhausted.
        """
        if whence == SEEK_END:
            self._load_all()
        return self._bytes.seek(position, whence)
# Interactive script: asks for a session cookie and an ISBN, downloads the PDF
# of every unit listed in the book's index, and merges them into one file.

if sys.platform == 'darwin':
    # macOS workaround: the cookie is read from cookies.txt instead of stdin.
    # NOTE(review): presumably because long pasted lines are cut off by the
    # terminal -- confirm.
    input("Paste the cookie in the cookies.txt and then press enter")
    with open("cookies.txt", "rb") as cookie_file:
        # Strip the trailing newline editors add; a header value must not
        # contain line breaks.
        cookie = cookie_file.read().strip()
else:
    cookie = input("Paste the cookies: ").encode('latin-1', 'replace')

isbn = input("Input the ISBN of the book you want to download: ").strip()

base_url = 'https://web-booktab.zanichelli.it/api/v1/resources_web/'
headers = {'Cookie': cookie}

print("Gethering information about the volume...")
# The book index is served as spine.xml; volume.xml is tried as a fallback.
for index_name in ('spine.xml', 'volume.xml'):
    spine = requests.get(base_url + isbn + '/' + index_name,
                         allow_redirects=False, headers=headers)
    if spine.status_code == 302:
        # The request was redirected instead of served: the script treats
        # this as a rejected session cookie.
        print("Invalid shibsession cookie, please try again.")
        sys.exit()
    if spine.status_code == 200:
        break
else:
    # Neither index file could be fetched.
    print("Invalid ISBN, please try again.")
    sys.exit()

print("Extracting chapters...")
spine = parseString(spine.text)
parts = spine.getElementsByTagName("unit")
merger = PyPDF2.PdfFileMerger()

print("Downloading all parts...")
for part in parts:
    if part.getAttribute("features") == 'flash':
        continue

    btbid = part.getAttribute("btbid")
    part_url = base_url + isbn + '/' + btbid + '/'

    partInfo = requests.get(part_url + 'config.xml', headers=headers)
    if partInfo.status_code != 200:
        continue

    partXML = parseString(partInfo.text)
    key = partXML.getElementsByTagName("content")[0].firstChild.nodeValue

    # The <entry> whose key is "<content>.pdf" holds the PDF's file name
    # (without extension) on the server.
    pdfUrl = next(
        (entry.firstChild.nodeValue + ".pdf"
         for entry in partXML.getElementsByTagName("entry")
         if entry.getAttribute("key") == key + ".pdf"),
        '',
    )
    if not pdfUrl:
        # Without a matching entry there is no PDF to fetch; requesting the
        # bare part URL would hand a non-PDF body to the merger.
        print("No PDF entry found for part " + btbid + ", skipping.")
        continue

    pdf = requests.get(part_url + pdfUrl, headers=headers)
    if pdf.status_code != 200:
        # An error page is not a PDF and would make the reader fail.
        print("Could not download the PDF of part " + btbid + ", skipping.")
        continue

    merger.append(PyPDF2.PdfFileReader(ResponseStream(pdf.iter_content(64))))

merger.write(input("Input a title for the file: ") + ".pdf")
merger.close()