-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdouban.py
163 lines (120 loc) · 5.01 KB
/
douban.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/python
# -*- coding: utf-8 -*-
import threading
import urllib2
import pystache
import requests
import re
import os
from datetime import *
import time
from pyquery import PyQuery as pq
import Queue
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
## print pystache.render('Hi {{person}}!', {'person': 'Mom'})
class RSS:
    """Scrape a listing page and turn the topics it links to into an RSS file.

    Typical use: ``filter_links()`` collects the topic links from ``url``,
    ``load_item_full_content()`` downloads every topic body with worker
    threads, ``generate_rss()`` renders the template and ``write_to_file()``
    saves the result.

    The template (``tpl``) and the output file (``output_file``) are resolved
    relative to the directory that contains this script.
    """

    # Seconds passed as ``timeout`` to ``requests.get`` for the listing page,
    # so a stalled connection cannot block the script forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, output_file, tpl, encode_code="utf-8", rss_title="unnamed rss"):
        """Store the feed configuration.

        url          -- listing page to scrape; also exposed to the template
                        as ``source_url``.
        output_file  -- name of the file written by ``write_to_file``.
        tpl          -- name of the template file read by ``generate_rss``.
        encode_code  -- encoding used to decode downloaded pages.
        rss_title    -- exposed to the template as ``rss_title``.
        """
        self.path = os.path.dirname(os.path.abspath(__file__))
        self.encode_code = encode_code
        self.url = url
        self.lock = threading.Lock()
        self.rss_title = rss_title
        self.output_file = output_file
        self.tpl = tpl
        # Every worker thread ever started. Their run() loops never return on
        # their own, so they are kept here (and started as daemons) to let the
        # process exit once the queue has been drained.
        self.workers = []

    def generate_rss(self, data):
        """Render the template with ``data`` and return the rendered feed.

        ``data['items']`` is a mapping (as returned by ``filter_links``); its
        values are handed to the template as a list under ``items``, together
        with ``rss_title`` and ``source_url``. The caller's ``data`` dict is
        left untouched.

        A rendering failure prints ``error`` and re-raises the original
        exception, instead of failing later on an unbound result.
        """
        # Read the template up front so the file is not held open while rendering.
        with open(os.path.join(self.path, self.tpl), "r") as template_file:
            rss_tpl = template_file.read()

        entries = []
        for key in data['items']:
            entries.append(data['items'][key])
            print(data['items'][key])

        # Work on a shallow copy so the caller's dict keeps its original shape.
        context = dict(data)
        context['items'] = entries
        context['rss_title'] = self.rss_title
        context['source_url'] = self.url

        # Render through the explicitly configured renderer so the encodings
        # set here are the ones actually applied.
        renderer = pystache.Renderer(file_encoding="utf-8", string_encoding="utf-8")
        try:
            return renderer.render(rss_tpl, context)
        except Exception:
            print("error")
            raise

    def write_to_file(self, content):
        """Write ``content`` to the output file, encoded as UTF-8."""
        print("write to " + self.output_file)
        # Binary mode: the text is encoded explicitly, so the handle must
        # accept bytes and must not translate line endings.
        with open(os.path.join(self.path, self.output_file), 'wb') as xml_file:
            xml_file.write(content.encode("utf-8"))

    def fetch_web_page(self):
        """Download ``self.url`` and return its body decoded with ``encode_code``."""
        print("fetch " + self.url)
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        return response.content.decode(self.encode_code)

    def load_item_full_content(self, items, threads_num=5):
        """Fill in the full content of every item using worker threads.

        ``items`` is modified in place by the ``FetchContent`` workers and is
        also returned. Blocks until every queued item has been processed.
        """
        if not items:
            # Nothing to download: do not spawn threads that would only idle.
            return items

        pending = Queue.Queue()
        for key in items:
            pending.put(items[key])

        for _ in range(threads_num):
            worker = FetchContent(queue=pending, items=items, lock=self.lock,
                                  encode_code=self.encode_code)
            # Daemon threads: the workers loop forever, so without this the
            # program would never terminate after the main thread finishes.
            worker.daemon = True
            worker.start()
            self.workers.append(worker)

        pending.join()
        return items

    def filter_links(self):
        """Return the topic links found on the listing page.

        The result maps each link to a dict with ``title``, ``link`` and an
        empty ``description``. Anchors without a usable ``href`` are skipped.
        """
        document = pq(self.fetch_web_page())
        items = {}
        for anchor in document(".channel-item h3 a"):
            node = document(anchor)
            href = node.attr("href")
            if href is None:
                # str(None) would otherwise be queued as the URL "None".
                continue
            link = str(href).strip()  # drop surrounding whitespace
            if not link:
                continue
            items[link] = {"title": node.text(), "link": link, "description": ""}
        return items
class FetchContent(threading.Thread):
    """Worker thread that downloads the pages of items taken from a queue.

    For every queued item the page at ``item['link']`` is fetched and the
    inner HTML of its first ``.topic-doc`` element is stored as the item's
    ``description`` in the shared ``items`` dict. An item that cannot be
    processed for any reason is removed from ``items``.
    """

    def __init__(self, queue, items, lock, encode_code="utf-8", timeout=30):
        """Create the worker.

        queue        -- queue of item dicts, each with a ``link`` key.
        items        -- shared dict keyed by link; updated in place.
        lock         -- lock held while an unhealthy item is removed.
        encode_code  -- encoding used to decode downloaded pages.
        timeout      -- seconds passed as ``timeout`` to ``requests.get``.
        """
        threading.Thread.__init__(self)
        print("%s init" % self.name)
        self.queue = queue
        self.items = items
        self.lock = lock
        self.encode_code = encode_code
        self.timeout = timeout
        # Lets another thread ask the loop to stop. It is only checked between
        # items, because queue.get() blocks while the queue is empty.
        self.running_flag = True

    def run(self):
        """Process queued items until ``running_flag`` is cleared."""
        while self.running_flag:
            item = self.queue.get()
            try:
                self._process_item(item)
            finally:
                # Always acknowledge the item: a missing task_done() would
                # leave queue.join() in the main thread blocked forever.
                self.queue.task_done()

    def _process_item(self, item):
        """Store the description for ``item``, or drop the item on failure."""
        link = item['link']
        try:
            self.items[link]['description'] = self._fetch_description(link)
        except Exception as error:
            # Thread boundary: one bad page (network error, bad encoding,
            # missing .topic-doc) must not kill the worker.
            with self.lock:
                print("remove unhealthy item, url : %s" % link)
                print("reason: %r" % (error,))
                self.items.pop(link, "default")

    def _fetch_description(self, link):
        """Return the inner HTML of the first ``.topic-doc`` element at ``link``."""
        body = requests.get(link, timeout=self.timeout).content
        print("%s is fetching url : %s , size: %s" % (self.name, link, len(body)))
        document = pq(body.decode(self.encode_code))
        # Indexing raises IndexError when the page has no .topic-doc element.
        return document(document('.topic-doc')[0]).html()
# Script driver: scrape the Douban group "explore" page, download every topic
# body, render the feed template and report how long the whole run took.
start = time.time()

rss = RSS(
    url="http://www.douban.com/group/explore",
    output_file="douban_huati.xml",
    tpl="tpl.py",
    encode_code="utf-8",
    rss_title=u"豆瓣话题精选",
)

# Collect the topic links, then let the worker threads fill in each body.
items = rss.load_item_full_content(rss.filter_links())

rss_content = rss.generate_rss({
    "items": items,
    "lastBuildDate": datetime.today(),
})
rss.write_to_file(rss_content)

print("Elapsed time: %s" % (time.time() - start))