# image_scraper.py
from bs4 import BeautifulSoup
import urllib.request
import shutil
import requests
from urllib.parse import urljoin
import sys, os
import time
# Basic source found here: https://pythonprogramming.altervista.org/how-to-download-images-from-the-web-with-python/?doing_wp_cron=1588099753.4233078956604003906250
def make_soup(url, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved (this is the
            error ``urlopen`` raises; ``HTTPError`` is a subclass of it).
    """
    # A custom User-Agent is sent with the request -- presumably because some
    # sites refuse urllib's default agent string (not verifiable from here).
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    # The context manager closes the HTTP response once parsing has consumed
    # it; BeautifulSoup reads the whole stream inside its constructor.
    with urllib.request.urlopen(req, timeout=timeout) as html:
        return BeautifulSoup(html, 'html.parser')
def _resolve_image_source(page_url, raw_src):
    """Return ``(absolute_url, filename)`` for an <img> src, or None if unusable."""
    # Function-scope import: the module's top-level imports only provide urljoin.
    from urllib.parse import urlsplit

    if not raw_src or not raw_src.strip():
        # <img> tag without a src attribute (or with a blank one).
        return None
    try:
        absolute = urljoin(page_url, raw_src.strip())
        parts = urlsplit(absolute)
    except ValueError:
        # Malformed URL (for example a broken IPv6 literal).
        return None
    # Taking the name from the path component keeps query strings and
    # fragments out of the file name.
    filename = os.path.basename(parts.path)
    if parts.scheme not in ('http', 'https') or not filename:
        # data: URIs and directory-style URLs cannot be saved as a named file.
        return None
    return absolute, filename


def get_images(url, timeout=30):
    """Download every <img> referenced by the page at *url*.

    Images are written into a directory named after the third '/'-separated
    component of *url* (the host for an ``http(s)://host/...`` address),
    created inside the current working directory if it does not exist yet.
    Sources that are missing, malformed, not HTTP(S), or that fail to
    download are reported and skipped. Images sharing a file name overwrite
    one another.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait on each image request before giving up.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Create a directory to store the incoming images in
    dir_name = url.split('/')[2]
    target_dir = os.path.join(os.getcwd(), dir_name)
    os.makedirs(target_dir, exist_ok=True)

    # Begin process of identifying and extracting the images
    soup = make_soup(url)
    images = soup.find_all('img')
    print(str(len(images)) + " images found.")
    print('Downloading images to ' + target_dir)

    for each in (img.get('src') for img in images):
        print(each)
        resolved = _resolve_image_source(url, each)
        if resolved is None:
            print('An error occured. Continuing.')
            continue
        src, filename = resolved
        print('Getting: ' + filename)
        # Write straight into the target directory: no temporary copy in the
        # working directory, and nothing left to move if the download fails.
        destination = os.path.join(target_dir, filename)
        try:
            with requests.get(src, stream=True, timeout=timeout) as response:
                # Do not save HTTP error pages (404, 403, ...) as images.
                response.raise_for_status()
                with open(destination, 'wb') as out_file:
                    # iter_content undoes gzip/deflate content encoding;
                    # copying response.raw would store the still-compressed
                    # bytes and produce corrupted image files.
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        out_file.write(chunk)
        except (requests.RequestException, OSError):
            print('An error occured. Continuing.')
    print('Done.')
def main(argv):
    """Scrape images from the URLs given on the command line.

    Each downloaded page gets its own directory, named by ``get_images``
    from the URL itself.

    Args:
        argv: Argument vector in ``sys.argv`` form. ``argv[0]`` is ignored
            and every following item is treated as a page URL. When no URL
            is supplied, the default page below is scraped.
    """
    # Other pages this script has been pointed at:
    #   https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Accueil_principal
    #   https://www.victoria.ca/EN/main/residents/archives/photo-gallery/140-years-of-victoria-city-hall.html
    default_url = 'https://www.reddit.com/r/OldPhotosInRealLife/'
    urls = argv[1:] or [default_url]
    for url in urls:
        get_images(url)
# Script entry point: hand the command-line arguments to main() only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main(sys.argv)