-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_extractor.py
59 lines (51 loc) · 1.97 KB
/
text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from io import BytesIO
import requests, logging
from fitz import open as fitzopen, Pixmap
from models import Paper
# Largest advertised download size (1 GiB, in bytes) for which the raw PDF
# is still kept as a blob.
MAX_PDF_SIZE = 1024 ** 3
# Cap on the amount of extracted text kept per PDF, measured in characters
# (1024 * 1024 of them).
MAX_TEXT_SIZE = 1024 ** 2
def fetch_and_extract_text_from_pdf(url: str, *, timeout: float = 30.0) -> Paper:
    """Download the PDF at *url* and extract its text and a first-page image.

    A HEAD request is issued first to read the advertised ``Content-Length``.
    If that exceeds ``MAX_PDF_SIZE`` the PDF is still downloaded and parsed,
    but its raw bytes are not attached to the result and the status becomes
    ``"pdf_too_large"``. Text is collected page by page and truncated to at
    most ``MAX_TEXT_SIZE`` characters.

    Args:
        url: Location of the PDF to fetch.
        timeout: Timeout in seconds passed to each HTTP request, so an
            unresponsive server cannot block the call indefinitely.

    Returns:
        A ``Paper`` whose ``status`` is one of:

        * ``"success"`` - text, blob and first-page pixmap are populated.
        * ``"pdf_too_large"`` - as above, but ``blob`` is ``None``.
        * ``"unable_to_fetch"`` - an HTTP/network error occurred; ``text``
          and ``blob`` are ``None``.
        * ``"processing_failed"`` - any other error (e.g. the PDF could not
          be parsed); ``text`` and ``blob`` are ``None``.

    This function does not raise; every exception is logged and mapped to
    one of the failure statuses above.
    """
    try:
        # requests.head() does not follow redirects by default while
        # requests.get() does; follow them here so the size check inspects
        # the same resource that is downloaded below.
        head_response = requests.head(url, allow_redirects=True, timeout=timeout)
        try:
            content_length = int(head_response.headers.get("Content-Length", 0))
        except ValueError:
            # A malformed header tells us nothing about the size; treat it
            # as unknown instead of failing the whole extraction.
            content_length = 0

        store_blob = content_length <= MAX_PDF_SIZE
        status = "success"
        if not store_blob:
            logging.info(
                f"PDF is too large (> {MAX_PDF_SIZE / 1024 / 1024}MB), skipping blob storage for {url}"
            )
            status = "pdf_too_large"

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Stays None if the document yields no pages, so the return below
        # never references an unbound name.
        pic: Pixmap | None = None
        text_parts = []
        remaining = MAX_TEXT_SIZE  # characters still allowed in the output
        with fitzopen(stream=BytesIO(response.content), filetype="pdf") as doc:
            for page in doc:
                if page.number == 0:
                    # Render the first page as the paper's preview image.
                    pic = page.get_pixmap()
                if remaining <= 0:
                    break
                page_text = page.get_text()
                if len(page_text) > remaining:
                    # Keep only what still fits and stop reading pages.
                    text_parts.append(page_text[:remaining])
                    break
                text_parts.append(page_text)
                remaining -= len(page_text)
        text = "".join(text_parts)

        return Paper(
            url=url,
            status=status,
            text=text,
            blob=response.content if store_blob else None,
            pic=pic,
        )
    except requests.exceptions.RequestException as e:
        logging.info(f"request failed - {url}: {e}")
        logging.exception(e)
        return Paper(url=url, status="unable_to_fetch", text=None, blob=None)
    except Exception as e:
        # Top-level boundary: anything else (including PDF parsing errors)
        # is logged and reported through the status field.
        logging.info(f"Error processing PDF {url}: {e}")
        logging.exception(e)
        return Paper(url=url, status="processing_failed", text=None, blob=None)