Requirement: after the program finishes some piece of work, I want it to emit a signal; in short, this post is about sending and receiving signals.
First, a look at the dispatcher Scrapy is built on (pydispatch).
from pydispatch import dispatcher
Output:
signal aa is sent by <sender>
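The snippet that produced that output did not survive the copy-paste, so here is a minimal sketch of what it would look like. The receiver name handle_signal, the signal name 'aa', and the sender string 'demo_sender' are assumptions made for illustration:

from pydispatch import dispatcher

# any hashable object can serve as a signal; 'aa' mirrors the output above
CUSTOM_SIGNAL = 'aa'

def handle_signal(sender, **kwargs):
    # called every time CUSTOM_SIGNAL is sent, by any sender
    print('signal aa is sent by %s' % sender)

# bind the receiver to the signal, accepting any sender
dispatcher.connect(handle_signal, signal=CUSTOM_SIGNAL, sender=dispatcher.Any)

# emit the signal; the dispatcher looks up the connected receivers and calls them
dispatcher.send(signal=CUSTOM_SIGNAL, sender='demo_sender')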
Scrapy signals
The official Scrapy docs show signal handlers being connected inside from_crawler(), with example code like this:
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)  # approach 1
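That line normally sits inside an extension's from_crawler() classmethod. A minimal sketch of the surrounding pattern (the class name MyExtension and the log messages are placeholders, not from the original post):

from scrapy import signals

class MyExtension(object):
    # placeholder extension; the point is how the signals get wired up
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # crawler.signals is the SignalManager; connect() binds a handler to a signal
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("spider %s opened" % spider.name)

    def spider_closed(self, spider):
        spider.logger.info("spider %s closed" % spider.name)

An extension like this still has to be enabled in the EXTENSIONS setting; the same connect() calls also work from a spider's own from_crawler().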
Both approaches can be used. The second one binds handlers through the dispatcher directly (note that newer Scrapy releases deprecate scrapy.xlib.pydispatch in favour of crawler.signals or importing pydispatch directly); the complete example further down uses it:
from scrapy.xlib.pydispatch import dispatcher  # approach 2
When the spider class is initialized, it binds handlers to two signals, spider_opened and spider_closed.
There are plenty of other signals; see the official docs for the full list.
Note that signals.spider_closed fires before signals.engine_stopped.
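To watch that order yourself, one option (not from the original post; the spider name and handler names below are made up) is to bind both signals with the dispatcher and print when each fires:

from scrapy import Spider, signals
from scrapy.xlib.pydispatch import dispatcher

class OrderDemoSpider(Spider):
    name = 'order_demo'
    start_urls = ['http://example.com']

    def __init__(self, *args, **kwargs):
        super(OrderDemoSpider, self).__init__(*args, **kwargs)
        # handlers run in the order the signals fire:
        # spider_closed first, engine_stopped after it
        dispatcher.connect(self.on_spider_closed, signals.spider_closed)
        dispatcher.connect(self.on_engine_stopped, signals.engine_stopped)

    def on_spider_closed(self, spider, reason):
        # spider_closed carries the spider and the close reason
        print('spider_closed fired (reason=%s)' % reason)

    def on_engine_stopped(self):
        # engine_stopped carries no arguments and fires afterwards
        print('engine_stopped fired')

    def parse(self, response):
        pass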
Now for a more complete example.
This is Python 2 code I pasted from somewhere online, but the code itself is not the point; what matters is the idea.
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from items import IpProxyPoolItem
from model.spider_running_log import SpiderCrawlLog
from model import loadSession
from datetime import datetime


class ProxySpiderSpider(CrawlSpider):
    name = 'MagicSpider'

    def __init__(self, rule):
        # bind the spider_opened signal to the spider_opened handler
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        # bind the spider_closed signal to the spider_closed handler
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []
        # add the rule for following the "next page" link
        if len(rule.next_page):
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),
            follow=True,
            callback='parse_item'))
        self.rules = tuple(rule_list)
        super(ProxySpiderSpider, self).__init__()

    # runs when the spider closes: stamp the end time on the crawl log
    def spider_closed(self, spider):
        print "spider is closed!"
        session = loadSession()
        log = session.query(SpiderCrawlLog).filter(SpiderCrawlLog.spiderID == self.rule.id,
                                                   SpiderCrawlLog.endTime == None
                                                   ).first()
        log.endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log.status = "closed"
        session.commit()

    # runs when the spider starts: open a new crawl log entry if none is pending
    def spider_opened(self, spider):
        print "spider is running!"
        item = SpiderCrawlLog(
            spiderID=self.rule.id,
            spiderName=self.rule.name,
            status="Running...",
            startTime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            endTime=None,
            pages=0,
            items=0
        )
        session = loadSession()
        log = session.query(SpiderCrawlLog).filter(
            SpiderCrawlLog.spiderID == self.rule.id,
            SpiderCrawlLog.endTime == None)
        # check whether this spider already has an unfinished log entry
        if log.count() == 0:
            session.add(item)
            session.commit()
        else:
            pass

    def parse_item(self, response):
        # print 'Hi, this is an item page! %s' % response.url
        # print response.body
        # record one more crawled page in the log
        session = loadSession()
        log = session.query(SpiderCrawlLog).filter(SpiderCrawlLog.spiderID == self.rule.id,
                                                   SpiderCrawlLog.endTime == None).first()
        log.pages = int(log.pages) + 1
        session.commit()
        item = IpProxyPoolItem()
        if len(self.rule.loop_xpath):
            # print 'Find %d items!' % len(response.xpath(self.rule.loop_xpath))
            for proxy in response.xpath(self.rule.loop_xpath):
                # extract each field with the rule's XPath, falling back to ""
                if len(self.rule.ip_xpath):
                    tmp_ip = proxy.xpath(self.rule.ip_xpath).extract_first()
                    ip = tmp_ip.strip() if tmp_ip is not None else ""
                else:
                    ip = ""
                if len(self.rule.port_xpath):
                    tmp_port = proxy.xpath(self.rule.port_xpath).extract_first()
                    port = tmp_port.strip() if tmp_port is not None else ""
                else:
                    port = ""
                if len(self.rule.location1_xpath):
                    tmp_location1 = proxy.xpath(self.rule.location1_xpath).extract_first()
                    location1 = tmp_location1.strip() if tmp_location1 is not None else ""
                else:
                    location1 = ""
                if len(self.rule.location2_xpath):
                    tmp_location2 = proxy.xpath(self.rule.location2_xpath).extract_first()
                    location2 = tmp_location2.strip() if tmp_location2 is not None else ""
                else:
                    location2 = ""
                if len(self.rule.lifetime_xpath):
                    tmp_lifetime = proxy.xpath(self.rule.lifetime_xpath).extract_first()
                    lifetime = tmp_lifetime.strip() if tmp_lifetime is not None else ""
                else:
                    lifetime = ""
                if len(self.rule.lastcheck_xpath):
                    tmp_lastcheck = proxy.xpath(self.rule.lastcheck_xpath).extract_first()
                    lastcheck = tmp_lastcheck.strip() if tmp_lastcheck is not None else ""
                else:
                    lastcheck = ""
                if len(self.rule.level_xpath):
                    tmp_level = proxy.xpath(self.rule.level_xpath).extract_first()
                    level = tmp_level.strip() if tmp_level is not None else ""
                else:
                    level = ""
                if len(self.rule.type_xpath):
                    tmp_type = proxy.xpath(self.rule.type_xpath).extract_first()
                    type = tmp_type.strip() if tmp_type is not None else ""
                else:
                    type = ""
                if len(self.rule.speed_xpath):
                    tmp_speed = proxy.xpath(self.rule.speed_xpath).extract_first()
                    speed = tmp_speed.strip() if tmp_speed is not None else ""
                else:
                    speed = ""
                item['ip_port'] = (":".join([ip, port])) if len(port) else ip
                item['type'] = type
                item['level'] = level
                item['location'] = (" ".join([location1, location2])) if location2 is not None and len(location2) else location1
                item['speed'] = speed
                item['lifetime'] = lifetime
                item['lastcheck'] = lastcheck
                item['rule_id'] = self.rule.id
                item['source'] = response.url
                yield item
Summary
Between these examples, it should be clear how to use Scrapy's signal mechanism.