run_crawler.py
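"""Run the amazon, ebay, shopclues and olx spiders for a user-supplied
search keyword, then merge the per-spider CSV output from the tmp/
directory into a single results.csv file."""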
import glob
import os
import shutil

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

curfilePath = os.path.abspath(__file__)
curDir = os.path.abspath(os.path.join(curfilePath, os.pardir))
tmpDir = os.path.abspath(os.path.join(curDir, 'tmp'))

# Remove old crawling data; ignore the error if the directory does not exist yet
shutil.rmtree(tmpDir, ignore_errors=True)

# Get the search keyword from the user
product = input("Enter Search Keyword (product or brand name to search): ")

# Configure logging
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

# Get the project settings
s = get_project_settings()
# Change the depth limit here
# s['DEPTH_LIMIT'] = 2

process = CrawlerProcess(s)

# Add spiders to crawl
process.crawl('amazon', product=product)
process.crawl('ebay', product=product)
process.crawl('shopclues', product=product)
process.crawl('olx', product=product)
process.start()

# Merge the per-spider CSV files into results.csv after crawling is complete
interesting_files = glob.glob(os.path.join(tmpDir, '*.csv'))
header_saved = False
with open('results.csv', 'w') as fout:
    for filename in interesting_files:
        if os.path.getsize(filename) > 0:
            with open(filename) as fin:
                # Write the CSV header row only once
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)

print('Crawling Completed')