-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathiauploader.py
164 lines (147 loc) · 7.3 KB
/
iauploader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
"""
Retrieve and disseminate files and metadata to Internet Archive
"""
import logging
import sys
from internetarchive import get_item, upload, exceptions as ia_except
from io import BytesIO
from requests import exceptions as req_except
from errors import DisseminationError
from uploader import Uploader, Location
class IAUploader(Uploader):
    """Dissemination logic for Internet Archive"""

    def upload_to_platform(self):
        """Upload work in required format to Internet Archive.

        Uploads the work's PDF together with a JSON file containing its
        Thoth metadata, using the Thoth work ID as the Internet Archive
        identifier. Every failure path logs an error and terminates the
        process via `sys.exit(1)`.

        Returns:
            A single-element list containing the `Location` built from the
            publication ID, the platform name, the landing page URL and
            the full-text PDF URL of the new upload.
        """
        # Fast-fail if credentials for upload are missing
        try:
            access_key = self.get_variable_from_env(
                'ia_s3_access', 'Internet Archive')
            secret_key = self.get_variable_from_env(
                'ia_s3_secret', 'Internet Archive')
        except DisseminationError as error:
            logging.error(error)
            sys.exit(1)

        # Use Thoth ID as unique identifier
        # (URL will be in format `archive.org/details/[identifier]`)
        filename = self.work_id

        # Ensure that this identifier is available in the Archive
        # (this will also check that the identifier is in a valid format,
        # although all Thoth IDs should be acceptable)
        if not get_item(filename).identifier_available():
            logging.error(
                'Cannot upload to Internet Archive: an item with this identifier already exists')
            sys.exit(1)

        # Include full work metadata file in JSON format,
        # as a supplement to filling out Internet Archive metadata fields
        metadata_bytes = self.get_formatted_metadata('json::thoth')

        # Can't continue if no PDF file is present
        try:
            publication = self.get_publication_details('PDF')
            pdf_bytes = publication.bytes
        except DisseminationError as error:
            logging.error(error)
            sys.exit(1)

        # Convert Thoth work metadata into Internet Archive format
        # (not expected to fail, as "required" metadata is minimal)
        ia_metadata = self.parse_metadata()

        try:
            responses = upload(
                identifier=filename,
                files={
                    '{}.pdf'.format(filename): BytesIO(pdf_bytes),
                    '{}.json'.format(filename): BytesIO(metadata_bytes),
                },
                metadata=ia_metadata,
                access_key=access_key,
                secret_key=secret_key,
                retries=2,
                retries_sleep=30,
                verify=True,
            )
        # Empty access_key and/or secret_key triggers an AuthenticationError.
        # Incorrect access_key and/or secret_key triggers an HTTPError.
        except ia_except.AuthenticationError:
            # The fast-fail above ought to prevent us from hitting this
            logging.error(
                'Error uploading to Internet Archive: credentials missing')
            sys.exit(1)
        except req_except.HTTPError:
            # internetarchive module outputs its own ERROR log before we catch
            # this exception, so no need to repeat the error text. As a future
            # enhancement, we could filter out these third-party logs (along
            # with the INFO logs which are output during the upload process)
            # and update this message.
            logging.error(
                'Error uploading to Internet Archive: credentials may be incorrect')
            sys.exit(1)

        if not responses:
            logging.error(
                'Error uploading to Internet Archive: no response received from server')
            sys.exit(1)

        # One response is checked per entry returned by upload();
        # any non-200 status aborts the dissemination
        for response in responses:
            if response.status_code != 200:
                logging.error(
                    'Error uploading to Internet Archive: %s', response.text)
                sys.exit(1)

        landing_page = 'https://archive.org/details/{}'.format(filename)
        full_text_url = 'https://archive.org/download/{}/{}.pdf'.format(
            filename, filename)
        location_platform = 'INTERNET_ARCHIVE'

        logging.info(
            'Successfully uploaded to Internet Archive at %s', landing_page)

        # Return details of created upload to be entered as a Thoth Location
        return [Location(publication.id, location_platform, landing_page,
                         full_text_url)]

    def parse_metadata(self):
        """Convert work metadata into Internet Archive format.

        Reads the work record from `self.metadata['data']['work']` and maps
        it onto Internet Archive metadata field names.

        Returns:
            A dict of Internet Archive metadata fields. Scalar fields may be
            None and repeatable fields may be empty lists when the work has
            no corresponding data.
        """
        work_metadata = self.metadata.get('data').get('work')

        # Treat a missing or null collection as empty rather than failing
        contributions = work_metadata.get('contributions') or []
        publications = work_metadata.get('publications') or []
        work_subjects = work_metadata.get('subjects') or []
        work_languages = work_metadata.get('languages') or []
        issues = work_metadata.get('issues') or []

        # Repeatable fields such as 'creator', 'isbn', 'subject'
        # can be set by submitting a list of values
        creators = [n.get('fullName')
                    for n in contributions
                    if n.get('mainContribution') is True]
        # IA metadata schema suggests hyphens should be omitted,
        # although including them does not cause any errors
        isbns = [n.get('isbn').replace('-', '')
                 for n in publications
                 if n.get('isbn') is not None]
        # We may want to mark BIC, BISAC, Thema etc subject codes as such;
        # IA doesn't set a standard so representations vary across the archive
        subjects = [n.get('subjectCode') for n in work_subjects]
        languages = [n.get('languageCode') for n in work_languages]

        # A series may have only one of the two ISSNs (or neither):
        # skip absent values instead of submitting None entries
        issns = []
        for issue in issues:
            series = issue.get('series') or {}
            for key in ('issnPrint', 'issnDigital'):
                issn = series.get(key)
                if issn is not None:
                    issns.append(issn)

        # IA only accepts a single volume number: use the first one present.
        # Checking for None first avoids submitting the literal string 'None'.
        volume = next(
            (str(issue.get('issueOrdinal'))
             for issue in issues
             if issue.get('issueOrdinal') is not None),
            None)

        ia_metadata = {
            # All fields are non-mandatory
            # Any None values or empty lists are ignored by IA on ingest
            'collection': 'thoth-archiving-network',
            'title': work_metadata.get('fullTitle'),
            'publisher': self.get_publisher_name(),
            'creator': creators,
            # IA requires date in YYYY-MM-DD format, as output by Thoth
            'date': work_metadata.get('publicationDate'),
            'description': work_metadata.get('longAbstract'),
            # Field name is misleading; displayed in IA as 'Pages'
            'imagecount': work_metadata.get('pageCount'),
            'isbn': isbns,
            'lccn': work_metadata.get('lccn'),
            'licenseurl': work_metadata.get('license'),
            'mediatype': 'texts',
            'oclc-id': work_metadata.get('oclc'),
            # IA has no dedicated DOI field but 'source' is
            # "[u]sed to signify where a piece of media originated"
            'source': work_metadata.get('doi'),
            # https://help.archive.org/help/uploading-a-basic-guide/ requests
            # no more than 10 subject tags, but additional tags appear to be
            # accepted without error
            'subject': subjects,
            'language': languages,
            'issn': issns,
            'volume': volume,
            # Custom field: this data should already be included in the
            # formatted metadata file, but including it here may be
            # beneficial for searching
            'thoth-work-id': self.work_id,
            # Custom field helping future users determine what logic was used
            # to create an upload
            'thoth-dissemination-service': self.version,
        }

        return ia_metadata