middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from random import choice

from scrapy import signals
from scrapy.exceptions import NotConfigured

class DomainCrawlerSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self, user_agents):
        self.enabled = False
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")
        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for request in start_requests:
            # Rotate the User-Agent header only when the spider has opted in
            # (see spider_opened) and a list of user agents is configured.
            if self.enabled and self.user_agents:
                request.headers['user-agent'] = choice(self.user_agents)
            yield request

    def spider_opened(self, spider):
        # Enable rotation only for spiders that set rotate_user_agent = True.
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)
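
Usage sketch (not part of the file above): the middleware raises NotConfigured unless USER_AGENT_CHOICES is set, and it only rotates the header for spiders that opt in via rotate_user_agent. The module path 'myproject.middlewares' and the spider below are illustrative assumptions, not names taken from this repository.

# settings.py -- register the spider middleware and provide agents to rotate.
# 'myproject.middlewares' is a placeholder; use this project's module path.
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.DomainCrawlerSpiderMiddleware': 543,
}
USER_AGENT_CHOICES = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

# spiders/example.py -- opt in per spider via the rotate_user_agent attribute.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    rotate_user_agent = True  # read by spider_opened() above
    start_urls = ['http://example.com']

    def parse(self, response):
        # The User-Agent actually sent is available on the originating request.
        yield {
            'url': response.url,
            'user_agent': response.request.headers.get('user-agent'),
        }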