-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLoadLinkData.py
84 lines (62 loc) · 2.15 KB
/
LoadLinkData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import json
import csv
from urlparse import urlparse
from speedy.speedydb import SpeedyDb
from speedy.domainwapple import DomainWapple
#
# Returns the folder that holds all of a site's link data files.
#
def getSiteFolder(name):
    """Return the link-data folder for site *name*, or None if it is absent.

    The folder is ``data/links/<name>/`` under the directory that contains
    this script. The returned path keeps its trailing slash, so callers can
    append a file name to it directly.
    """
    script_dir = os.path.dirname(__file__)
    site_dir = os.path.join(script_dir, 'data/links/' + name + '/')
    return site_dir if os.path.exists(site_dir) else None
def getSiteInfo(name):
    """Load the crawl summary for site *name*.

    Reads ``<name>_info.txt`` from the site's link-data folder and parses it
    as JSON.

    Returns the parsed JSON object, or None when the folder or the info file
    is missing, or when the recorded page count is 1 or less.
    """
    folder = getSiteFolder(name)
    if folder is None:
        return None

    info_path = '{0}{1}_info.txt'.format(folder, name)
    if not os.path.exists(info_path):
        return None

    with open(info_path) as data_file:
        data = json.load(data_file)

    # Coerce before comparing: loaddata() runs these counters through int(),
    # so they may arrive as strings. Under Python 2 a string always compares
    # greater than an int, which would make this check silently never fire.
    if int(data['pages']) <= 1:
        return None
    return data
def getDomains(id, url, name, db):
    """Save the domain rows recorded for a site.

    Reads ``<name>_domain.txt`` (CSV) from the site's link-data folder and
    calls ``db.saveDomainInfo(id, col0, col1)`` for every row whose first
    column differs, case-insensitively, from the host part of *url*.

    Args:
        id: site identifier, passed through to ``db.saveDomainInfo``.
        url: the site's own URL; rows matching its host are skipped.
        name: site name, used to locate the folder and the CSV file.
        db: object providing ``saveDomainInfo``.

    Returns:
        None in every case. Nothing is saved when the folder or the domain
        file does not exist.
    """
    own_host = urlparse(url).netloc.lower()

    folder = getSiteFolder(name)
    if folder is None:
        return None

    domains_path = '{0}{1}_domain.txt'.format(folder, name)
    if not os.path.exists(domains_path):
        return None

    with open(domains_path) as data_file:
        for row in csv.reader(data_file):
            # csv.reader yields [] for blank lines; rows without both
            # columns are skipped instead of raising IndexError.
            if len(row) < 2:
                continue
            if row[0].lower() == own_host:
                continue
            # Single quotes are blanked out before saving -- presumably the
            # db layer cannot store them safely; verify against SpeedyDb.
            db.saveDomainInfo(id, row[0].replace("'", " "), row[1].replace("'", " "))
def loaddata():
db = SpeedyDb()
sites = db.getSites()
for site in sites:
name = site[1]
print name,
spider_ok = True
data = getSiteInfo(name)
if not (data is None):
print 'Loading....', site[0], name, data['pages'],
db.saveLinkInfo(site[0], int(data['pages']), int(data['docs']), int(data['broken']), int(data['queued']))
if int(data["pages"]) == 10000 or int(data['links']) == 20000 or int(data['broken']) == 1000 or int(data['queued']) > 0:
spider_ok = False
db.setSpiderStatus(site[0], spider_ok)
# print spider_ok
domains = getDomains(site[0], site[2], name, db)
if __name__ == '__main__':
    # Import the crawl results first, then run DomainWapple.process().
    loaddata()
    wappler = DomainWapple()
    wappler.process()
    # Manual single-URL checks, left disabled:
    # wappler.test('http://democracy.allerdale.gov.uk/ielistdocuments.aspx?cid=11&mid=3351')
    # wappler.test('https://democracy.basingstoke.gov.uk/mgfindmember.aspx')