-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
62 lines (55 loc) · 2.01 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from bs4 import BeautifulSoup
from pprint import pprint
import sys
import csv
import time
import urllib.request
# Id of the first property page to request; the scrape loop further down
# increments it by one after every page.
# To take the starting id from the command line, use int(sys.argv[1]) --
# sys.argv[0] is the script's own path, not the first argument.
property_id = 7344
# Id of the last property page to request (the loop bound is inclusive).
highest_id = 9050
def fetch_tds(url):
    """Download a page and return the <td> cells of its unit-list table.

    Args:
        url: Address of the page to fetch.

    Returns:
        The result of ``find_all('td')`` on the table whose id is
        ``ctl00_Col2ContentPlaceholder_ucRentBoardUnitList_gvRentBoardUnitList``.

    Raises:
        AttributeError: If the page contains no table with that id.
    """
    # Use the response as a context manager so the connection is released as
    # soon as the body has been read, instead of whenever it is collected.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find(
        'table',
        attrs={'id': 'ctl00_Col2ContentPlaceholder_ucRentBoardUnitList_gvRentBoardUnitList'},
    )
    if table is None:
        # Callers detect a missing table by catching AttributeError (what
        # calling find_all on None used to raise), so keep that exception type.
        raise AttributeError('unit list table not found at ' + url)
    return table.find_all('td')
def output(name, values):
    """Write rows to a CSV file, replacing any existing file of that name.

    Args:
        name: Path of the CSV file to create or overwrite.
        values: Iterable of rows; each row is an iterable of cell values.
    """
    # The csv module expects files opened with newline='' so that it alone
    # controls line endings; without it, platforms that translate '\n' on
    # write produce '\r\r\n' row terminators.
    with open(name, 'w', newline='') as f:
        csv.writer(f).writerows(values)
def _unit_address(cell):
    """Return the address text of a unit-address cell, or None if unreadable."""
    classes = cell.get('class') or []
    if not classes or classes[0] != 'gridItemUnitFullAddress':
        return None
    single_unit = cell.find('span', attrs={'class': 'singleUnitMode'})
    if single_unit is not None:
        return single_unit.get_text()
    nested = cell.findChildren()
    return nested[0].get_text() if nested else None


def _cell_text(cell):
    """Return the text of the first tag nested in a cell, else the cell's own text."""
    nested = cell.findChildren()
    return (nested[0] if nested else cell).get_text()


# Scrape every property page from property_id through highest_id (inclusive)
# and collect one six-column row per unit; the first row is the CSV header.
values = [['Unit Address', 'Unit Status', 'Tenancy Start', 'Rent Ceiling', 'Housing Services', 'Other']]
current = []
sleepCounter = 0
while property_id <= highest_id:
    print(property_id)
    url = 'http://www.cityofberkeley.info/RentBoardUnitSearch.aspx?propertyId=' + str(property_id)
    try:
        table_data = fetch_tds(url)
    except AttributeError:
        # The page has no unit table: save what has been collected so far to
        # a checkpoint file and parse nothing for this id. Without the empty
        # list the loop below would re-read the previous page's cells (or
        # fail with NameError on the very first id).
        output('broken' + str(property_id) + '.csv', values)
        table_data = []

    # Start every property with an empty row so a leftover partial row from
    # the previous page cannot be prepended to this page's first unit.
    current = []
    for i, data_tag in enumerate(table_data):
        column = i % 6  # cells arrive in groups of six: address + 5 fields
        if column == 0:
            address = _unit_address(data_tag)
            if address is None:
                # NOTE(review): the original indentation was lost; this
                # assumes the warning applies whenever no address can be
                # read from the first cell of a row -- confirm.
                print("Check unit addressing at this id: " + str(property_id))
                break
            current.append(address)
        else:
            current.append(_cell_text(data_tag))
        if column == 5:
            values.append(current)
            current = []

    # Throttle: pause for one second on the first request and then once
    # every 60 requests.
    if sleepCounter == 0:
        time.sleep(1)
    sleepCounter = (sleepCounter + 1) % 60
    property_id += 1

output('output.csv', values)