-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharchive_downloader.py
154 lines (129 loc) · 5.37 KB
/
archive_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from multiprocessing import Pool
import internetarchive
import numpy as np
class ArchiveDownloader:
    '''
    Internet Archive download utility utilizing multiprocessing.

    The file names of one archive item are listed once, split into contiguous
    chunks, and each non-empty chunk is downloaded by its own worker process.
    '''

    def __init__(self, archive_identifier : str, output_loc : str, glob_str : str = None, process_num : int = 4) -> None:
        '''
        Initializes utility for downloading archive with given parameters.

        Parameters
        ----------
        archive_identifier : str
            identifier of archive item to download from
        output_loc : str
            output directory for downloads on local disk
        glob_str : str
            glob pattern to filter items to download within archive
        process_num : int
            number of simultaneous download processes to run
        '''
        # Single source of truth for parameter assignment.
        self.set_params(archive_identifier, output_loc, glob_str, process_num)

    def set_params(self, archive_identifier : str, output_loc : str, glob_str : str = None, process_num : int = 4) -> None:
        '''
        Sets parameters for an archive download.

        Parameters
        ----------
        archive_identifier : str
            identifier of archive item to download from
        output_loc : str
            output directory for downloads on local disk
        glob_str : str
            glob pattern to filter items to download within archive
        process_num : int
            number of simultaneous download processes to run

        Returns
        -------
        None
        '''
        self.archive_identifier = archive_identifier
        self.output_loc = output_loc
        self.glob_str = glob_str
        self.process_num = process_num

    @staticmethod
    def _split_tasks(names : list, process_num : int) -> list:
        '''
        Splits file names into at most `process_num` contiguous, non-empty chunks.

        Chunk sizes differ by at most one, with the larger chunks first (the
        same sizes `numpy.array_split` produces), but chunks that would be
        empty are dropped and the elements are left as they were passed in.

        Parameters
        ----------
        names : list
            names of the files to distribute
        process_num : int
            maximum number of chunks to produce

        Returns
        -------
        list
            list of lists of names; empty when `names` is empty

        Raises
        ------
        ValueError
            if `process_num` is smaller than 1
        '''
        if process_num < 1:
            raise ValueError("process_num must be at least 1.")
        names = list(names)
        base, extra = divmod(len(names), process_num)
        chunks = []
        start = 0
        for index in range(process_num):
            # the first `extra` chunks absorb the remainder, one name each
            stop = start + base + (1 if index < extra else 0)
            if stop > start:
                chunks.append(names[start:stop])
            start = stop
        return chunks

    def download_files(self, files : list) -> None:
        '''
        Downloads the given files.

        Parameters
        ----------
        files : list
            list of the files to download from the archive

        Returns
        -------
        None
        '''
        # NOTE(review): internetarchive appears to treat an empty `files` value
        # as "no filter" (download everything) - confirm against the library
        # docs before passing an empty list here.
        internetarchive.download(self.archive_identifier, destdir=self.output_loc, verbose=True, ignore_existing=True, files=files)

    def download_archive(self) -> None:
        '''
        Runs a download task. Splits downloading of an archive over given number of processes.

        No worker is started for an empty share: when fewer files match than
        `process_num`, only as many processes as there are files are used, and
        when nothing matches the method returns without downloading.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            if `process_num` is smaller than 1
        '''
        # names of files to download
        names = [f.name for f in internetarchive.get_files(self.archive_identifier, glob_pattern=self.glob_str)]
        # split files among processes, never producing an empty task
        task_lists = self._split_tasks(names, self.process_num)
        if not task_lists:
            return
        # the context manager releases the worker processes once map() returns
        with Pool(processes=len(task_lists)) as pool:
            # starts downloads: one call to download_files per task list
            pool.map(self.download_files, task_lists)
# Driver code
def parse_args(argv=None):
    '''
    Parses and validates the command line arguments of the download driver.

    Command Line Arguments
    ----------------------
    -i, --identifier
        type=str
        Required
        Identifier of archive item to download from.
    -o, --output_loc
        type=str
        Required
        Output location to download items to.
    -g, --glob_str
        type=str
        Default: None
        Glob string to filter items within archive for download.
    -p, --process_num
        type=int
        Default: 4
        Number of simultaneous download processes to run.

    Parameters
    ----------
    argv : list
        argument strings to parse; None makes argparse read sys.argv[1:]

    Returns
    -------
    argparse.Namespace
        namespace with identifier, output_loc, glob_str and process_num

    Raises
    ------
    SystemExit
        raised by argparse (exit status 2, usage printed) when a required
        argument is missing or empty, or process_num is smaller than 1
    '''
    import argparse

    parser = argparse.ArgumentParser(description="Downloads items from an archive on the InternetArchive. Must specify archive identifier and output location.")
    parser.add_argument("-i", "--identifier", type=str, required=True, help="Identifier of archive item to download from.")
    parser.add_argument("-o", "--output_loc", type=str, required=True, help="Output location to download items to.")
    parser.add_argument("-g", "--glob_str", type=str, default=None, help="Glob string to filter items within archive for download. Default: None.")
    parser.add_argument("-p", "--process_num", type=int, default=4, help="Number of simultaneous download processes to run. Default: 4.")
    args = parser.parse_args(argv)

    # required=True only guarantees presence; still reject explicitly empty values
    if not args.identifier:
        parser.error("Must specify archive identifier with -i <identifier>.")
    if not args.output_loc:
        parser.error("Must specify output location with -o <output_loc>.")
    # an empty glob string means "no filter"
    if not args.glob_str:
        args.glob_str = None
    # fail early with a usage message instead of deep inside the process pool
    if args.process_num < 1:
        parser.error("Number of processes given with -p <process_num> must be at least 1.")
    return args


def main(argv=None):
    '''
    Driver for running archive download task from command line.

    Parameters
    ----------
    argv : list
        argument strings to parse; None makes argparse read sys.argv[1:]

    Returns
    -------
    None
    '''
    args = parse_args(argv)
    ## Run download task ##
    print("Beginning Download Task of", args.identifier, "to", args.output_loc, "with filter", args.glob_str, "using", args.process_num, "processes.")
    downloader = ArchiveDownloader(args.identifier, args.output_loc, args.glob_str, args.process_num)
    downloader.download_archive()


if __name__ == '__main__':
    main()