-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpeedySpider.py
128 lines (94 loc) · 3.19 KB
/
SpeedySpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#
# the pagespeedy spider
#
#
#
#
#
import os
import datetime
import getopt
import sys
import math
from multiprocessing import Pool
from spider.SpeedyCrawler import SpeedyCrawler
from speedy.speedydb import SpeedyDb
#
# main spider call, will spider a site. using the crawler
#
def spiderSingleSite(site, page_limit=10000):
    """Crawl a single site with a fresh SpeedyCrawler.

    Args:
        site: Indexable site record. Only ``site[1]`` and ``site[2]`` are
            used; both are printed and handed to ``SpeedyCrawler.process``
            (presumably the site name and its start URL -- confirm against
            SpeedyDb).
        page_limit: Page limit given to the crawler. Defaults to 10000, the
            value that used to be hard-coded here, so one-argument callers
            such as ``Pool.map`` behave exactly as before.
    """
    # page_limit is how many pages we will max out on;
    # limit * 2 is the number of links we will try.
    folder = os.path.join(os.path.dirname(__file__), 'data/links/')
    spider = SpeedyCrawler(page_limit, folder)

    # Single-argument print calls are valid in both Python 2 and Python 3
    # and emit the same text as the old multi-item print statements.
    print('>>>>> Starting :  %s %s' % (site[1], site[2]))
    spider.process(site[1], site[2])
    print('<<<<< Done :  %s' % (site[1],))
#
# Parallel spider crawl: fires off multiple site crawls at once using a
# multiprocessing Pool of worker processes (each crawl is single threaded)
#
def SpiderSites(sites, threads):
    """Crawl several sites in parallel, one site per pool task.

    Args:
        sites: Sequence of site records; each one is passed to
            ``spiderSingleSite``.
        threads: Number of worker processes in the pool (despite the name,
            ``multiprocessing.Pool`` starts processes, not threads).

    Raises:
        Whatever ``Pool.map`` raises; the pool is shut down first.
    """
    if not sites:
        # Nothing to crawl: do not spawn worker processes for an empty batch.
        return

    pool = Pool(processes=threads)
    try:
        pool.map(spiderSingleSite, sites)
    except BaseException:
        # A crawl failed or the run was interrupted: stop the remaining
        # workers now instead of leaving them running behind the traceback.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is only legal after close() or terminate(); one of them has
        # always run by the time we get here.
        pool.join()
#
# helper functions
#
#
# The nightly spider is designed to crawl all sites over 30 days:
# every night it takes 14 sites from speedy and crawls them.
#
def nightlySpider(dayNum, threads, groupsize=14):
    """Crawl the slice of spiderable sites assigned to night ``dayNum``.

    The site list comes from ``SpeedyDb.getSpiderSites()``. Night ``dayNum``
    covers ``sites[(dayNum - 1) * groupsize : dayNum * groupsize]``; a slice
    that runs past the end of the list is simply shorter (or empty).

    Args:
        dayNum: 1-based number of the night to process.
        threads: Number of worker processes handed to ``SpiderSites``.
        groupsize: Number of sites crawled per night. Defaults to 14, the
            value that used to be hard-coded here.
    """
    db = SpeedyDb()
    sites = db.getSpiderSites()

    start = (dayNum - 1) * groupsize
    end = dayNum * groupsize

    # Single-argument print calls are valid in both Python 2 and Python 3
    # and emit the same text as the old multi-item print statements.
    print('')
    print('---------------------------------------------------------------------')
    print('processing:  %s to %s : %s threads' % (start, end, threads))
    print('---------------------------------------------------------------------')
    print('')

    SpiderSites(sites[start:end], threads)
#
# respiders the broken sites.
#
def respider(groupsize, threads):
    """Re-crawl one group of the sites that are currently in error.

    The sites returned by ``SpeedyDb.getSpiderSitesInError()`` are split into
    ``nights`` consecutive groups of at most ``groupsize`` sites. The group
    to crawl is picked from today's day of the month, so successive days walk
    through the groups in turn.

    NOTE(review): if more groups are needed than the month has days, the
    highest-numbered groups are not reached in that month.

    Args:
        groupsize: Maximum number of sites to re-crawl per run (>= 1).
        threads: Number of worker processes handed to ``SpiderSites``.

    Raises:
        ValueError: if ``groupsize`` is smaller than 1.
    """
    if groupsize < 1:
        raise ValueError('groupsize must be at least 1')

    db = SpeedyDb()
    sites = db.getSpiderSitesInError()
    siteCount = len(sites)

    # Single-argument print calls are valid in both Python 2 and Python 3.
    print(' Performing recrawl from sites in error ( currently %s )'
          % (siteCount,))

    if siteCount == 0:
        # With no sites ``nights`` would be 0 and the divisions below would
        # raise ZeroDivisionError; there is nothing to crawl anyway.
        print(' Nothing to recrawl')
        return

    nights = int(math.ceil(float(siteCount) / groupsize))
    # float() matters: on Python 2 ``siteCount / nights`` floors before
    # ceil() sees it, which makes the groups too small to cover every site.
    size = int(math.ceil(float(siteCount) / nights))

    day = datetime.datetime.today().day - 1
    group = (day % nights) + 1

    # ``group`` is 1-based, so group 1 must start at index 0. Multiplying by
    # ``group`` directly skipped the first slice and made the last group empty.
    start = (group - 1) * size
    end = min(start + size, siteCount)

    print(' Day: %s . Group: %s . Start: %s . End: %s'
          % (day, group, start, end))
    print('')
    print(r'------------------------------------------------------------------')

    SpiderSites(sites[start:end], threads)
#for site in sites[start:end]:
# print site[1], site[2]
# for site in sites[count:count+14]:
# print site[0], site[1]
if __name__ == '__main__':
    # Start-up banner, emitted one line at a time, followed by a blank line.
    banner = (
        r' _____ __ _____ _ __ ',
        r' / ___/____ ___ ___ ____/ /_ __/ ___/____ (_)___/ /__ _____',
        r' \__ \/ __ \/ _ \/ _ \/ __ / / / /\__ \/ __ \/ / __ / _ \/ ___/',
        r' ___/ / /_/ / __/ __/ /_/ / /_/ /___/ / /_/ / / /_/ / __/ / ',
        r'/____/ .___/\___/\___/\__,_/\__, //____/ .___/_/\__,_/\___/_/ ',
        r' /_/ /____/ /_/ site crawling thingy ',
        r'------------------------------------------------------------------',
    )
    for banner_line in banner:
        print(banner_line)
    print('')

    # Current job: re-crawl the sites in error, 14 per run, 7 workers.
    respider(14, 7)

    # Other jobs that have been run from here (kept for reference):
    #   nightlySpider(day, 8)
    #   nightlySpider(12, 8)
    #   spiderSingleSite(['1', 'liverpool', 'http://liverpool.gov.uk'])
    #   db = SpeedyDb()
    #   sites = db.getNewSites(31)
    #   SpiderSites(sites, 8)