-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathscraper.py
169 lines (150 loc) · 5.27 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#test
import urllib.parse
import requests
from bs4 import BeautifulSoup
def makeResponseData(status, name, data=None):
    """Wrap a scraped payload in the response envelope.

    The envelope holds ``status``, a ``results`` count, and the payload
    stored under the caller-chosen key ``name``. When ``data`` is falsy
    (``None``, an empty list, ...) both ``results`` and the payload are
    ``None``.
    """
    has_payload = bool(data)
    return {
        'status': status,
        'results': len(data) if has_payload else None,
        name: data if has_payload else None,
    }
def getDestinations(region, timeout=10):
    """Scrape the Atlas Obscura destinations page.

    Args:
        region: Hyphen-separated region name (hyphens are replaced with
            spaces before the lookup), or a falsy value to get every region.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The list of country link texts for ``region``; or, when ``region``
        is falsy, a dict mapping each region heading (the stripped ``<h2>``
        text) to its list of country link texts.

    Raises:
        KeyError: ``region`` does not match any heading found on the page.
        requests.exceptions.Timeout: no response within ``timeout`` seconds.
    """
    url = 'https://www.atlasobscura.com/destinations'
    # requests applies no timeout by default; without one a stalled server
    # would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    destinations = {}
    for item in soup.find_all('li', class_='global-region-item'):
        continent_soup = item.div
        continent_name = continent_soup.h2.text.strip()
        destinations[continent_name] = [
            country.text
            for country in continent_soup.find_all(
                'a', class_='detail-md non-decorated-link')
        ]

    # region parameter
    if region:
        return destinations[region.replace('-', ' ')]
    return destinations
def getAttractions(country, city, state, sort, limit, offset, timeout=10):
    """Scrape attraction cards from an Atlas Obscura "things to do" listing.

    Args:
        country: Country slug used in the URL path.
        city: Optional city slug; when truthy it is prefixed to the place
            slug as ``<city>-<place>``.
        state: Optional state slug; when truthy it replaces ``country`` in
            the URL path.
        sort: ``'recent'`` adds ``sort=recent`` to the query; any other
            value adds no sort parameter.
        limit: Maximum number of cards to return (applied as a slice).
        offset: Zero-based page index. A ``page`` parameter of
            ``offset + 1`` is sent only for ``0 < offset <= 16``; otherwise
            no page parameter is sent.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``location``, ``name``,
        ``description``, ``coordinates`` (``[lat, lng]`` floats), ``img``
        and ``path``.

    Raises:
        requests.exceptions.Timeout: no response within ``timeout`` seconds.
        AttributeError, KeyError, TypeError: a card lacks one of the
            elements or attributes read below.
    """
    # The path segment is "<city>-<place>/places", where <place> is the
    # state when one is given and the country otherwise.
    place = state if state else country
    url = 'https://www.atlasobscura.com/things-to-do/'
    if city:
        url += city + '-'
    url += place + '/places'

    # Let requests assemble the query string instead of hand-concatenating
    # "?&page=...", which produced an empty leading parameter.
    params = {}
    if 0 < offset <= 16:
        params['page'] = offset + 1
    if sort == 'recent':
        params['sort'] = 'recent'

    # requests applies no timeout by default; without one a stalled server
    # would block the caller indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    cards = soup.find_all(
        'a',
        class_='Card --content-card-v2 --content-card-item Card--flat')

    attractions = []
    for card in cards[:limit]:
        heading = card.find(
            'h3', class_='Card__heading --content-card-v2-title')
        attractions.append({
            # Location (city, town)
            'location': card.find('div', class_='Card__hat --place').text,
            'name': heading.find("span").text,
            'description': card.find(
                'div', class_='Card__content js-subtitle-content').text,
            # Coordinates are carried as data attributes on the card link.
            'coordinates': [float(card["data-lat"]),
                            float(card["data-lng"])],
            # Image thumbnail
            'img': card.find('img')['data-src'],
            'path': card['href'],
        })
    return attractions
def getFoodandDrink(country, offset, limit, region=None, timeout=10):
    """Scrape food and drink cards from Atlas Obscura.

    Args:
        country: Optional country slug appended to the URL path.
        offset: Zero-based page index. For ``offset > 0`` a ``page``
            parameter of ``offset + 1`` is sent. Non-positive values are
            treated as the first page.
        limit: Maximum number of cards to return.
        region: Optional hyphen-separated region name. It is appended to
            the URL as a fragment and causes each result's ``region`` to be
            read from the card's parent ``data-region`` attribute.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``category``, ``name``,
        ``description``, ``img`` and ``path``, plus ``region`` when either
        ``region`` or ``country`` was given.

    Raises:
        requests.exceptions.Timeout: no response within ``timeout`` seconds.
        AttributeError, KeyError, TypeError: a card lacks one of the
            elements or attributes read below.
    """
    url = 'https://www.atlasobscura.com/unique-food-drink'
    if country:
        url += '/' + country

    # Only positive offsets select a page. The previous `offset != 0` check
    # let negative offsets request page <= 0 and slice from a negative index.
    page_index = offset if offset > 0 else 0
    params = {}
    if page_index:
        params['page'] = page_index + 1

    if region:
        # NOTE(review): URL fragments are not transmitted in the HTTP
        # request, so this does not filter the server response -- confirm
        # whether server-side region filtering was intended.
        url += '#' + region.replace('-', '%20')

    # requests applies no timeout by default; without one a stalled server
    # would block the caller indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    cards = soup.find_all('a', class_='content-card food-card')

    if country:
        start = 0
    else:
        # Without a country the window is skipped forward by 16 cards per
        # page -- presumably the page size; TODO confirm against the site.
        start = page_index * 16
    end = start + limit

    foods = []
    for card in cards[start:end]:
        curr_food = {}
        curr_food['category'] = card.find(
            'div',
            class_='detail-sm food-card-label food-card-supertag').text
        curr_food['name'] = card.find(
            'h3', class_='title-md content-card-title').text.strip()
        if region:
            # Drops the first and last two characters of the attribute --
            # presumably the brackets/quotes of a one-element list; TODO
            # confirm the attribute format.
            curr_food['region'] = card.parent['data-region'][2:-2]
        elif country:
            curr_food['region'] = country
        curr_food['description'] = card.find(
            'div',
            class_='content-card-subtitle content-card-subtitle-food'
                   ' js-subtitle-content').text
        # Image thumbnail
        curr_food['img'] = card.find('img')['data-src']
        curr_food['path'] = card['href']
        foods.append(curr_food)
    return foods
def getGastroPlaces(offset, limit, timeout=10):
    """Scrape place cards from the Atlas Obscura gastro places listing.

    Args:
        offset: Zero-based page index. A ``page`` parameter of
            ``offset + 1`` is sent only for ``0 < offset <= 18``; otherwise
            no page parameter is sent.
        limit: Maximum number of cards to return (applied as a slice).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``location``, ``name``,
        ``description``, ``coordinates`` (list of floats parsed from the
        card's comma-separated ``lat-lng`` text), ``img`` and ``path``.

    Raises:
        requests.exceptions.Timeout: no response within ``timeout`` seconds.
        AttributeError, KeyError, TypeError, ValueError: a card lacks one
            of the elements or attributes read below, or its coordinates
            are not numeric.
    """
    url = 'https://www.atlasobscura.com/gastro/places'

    # Let requests assemble the query string instead of hand-concatenating
    # "?&page=...", which produced an empty leading parameter.
    params = {}
    if 0 < offset <= 18:
        params['page'] = offset + 1

    # requests applies no timeout by default; without one a stalled server
    # would block the caller indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    cards = soup.find_all('a', class_='content-card content-card-place')

    places = []
    for card in cards[:limit]:
        lat_lng_text = card.find('div', class_='lat-lng').text
        places.append({
            # Location (city, town)
            'location': card.find(
                'div', class_='detail-sm place-card-location').text,
            'name': card.find(
                'span', class_='title-underline js-title-content').text,
            'description': card.find(
                'div',
                class_='subtitle-sm content-card-subtitle'
                       ' js-subtitle-content').text,
            'coordinates': [float(c) for c in lat_lng_text.split(',')],
            # Image thumbnail
            'img': card.find('img')['data-src'],
            'path': card['href'],
        })
    return places
# TEST CODE HERE (uncomment to try the scraper manually; requires `import json`)
#response = getAttractions('germany', 'berlin', None, None, 10, 0)
#print(json.dumps(response, separators=(',',':'), indent=3))