# IMDb_crawler.py
import os
import requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
import pandas as pd
import pickle
import threading
import time
from movie_linking import split_title_and_year, linking
# In[Global variables and functions]:
main_page = 'https://www.imdb.com/title/'  # IMDb domain + /title/
header_example = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
default_poster = 'https://m.media-amazon.com/images/G/01/imdb/images/social/imdb_logo.png'
def RW_ClassObj(obj=None, write=True, dir_name='temp/', name='var', date='', batch=''):  # pickle write (write=True) / read helper
if date:
dir_name += date+'/'
if not os.path.exists(dir_name):
os.makedirs(dir_name)
if batch:
name += '_'+str(batch)
if write:
with open(dir_name+name, 'wb') as file:
pickle.dump(obj, file)
else:
with open(dir_name+name, 'rb') as file:
obj = pickle.load(file)
return obj
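# A minimal round-trip sketch of RW_ClassObj (the 'demo' name below is hypothetical,
# not part of the crawler's normal flow):
#     RW_ClassObj(obj=[1, 2, 3], write=True, name='demo', batch=1)   # dumps to temp/demo_1
#     restored = RW_ClassObj(write=False, name='demo', batch=1)      # restored == [1, 2, 3]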
# In[IMDb_crawler class]:
class IMDb_crawler:
def __init__(self, idlist):
self.imdbId_list = idlist
self.n = len(self.imdbId_list)
self.name_zhtw = ['']*self.n
self.years = ['']*self.n
self.genres = ['']*self.n
self.grades = ['']*self.n
        self.posters = ['']*self.n  # some movies may share a preview image (other than default_poster); this is an IMDb site issue, not a crawler bug
    def reload_var(self, num):
        self.name_zhtw = RW_ClassObj(write=False, name='name_zhtw', batch=num)
        self.years = RW_ClassObj(write=False, name='years', batch=num)
        self.genres = RW_ClassObj(write=False, name='genres', batch=num)
        self.grades = RW_ClassObj(write=False, name='grades', batch=num)
        self.posters = RW_ClassObj(write=False, name='posters', batch=num)
    def save_var(self, num):
        RW_ClassObj(obj=self.name_zhtw, write=True, name='name_zhtw', batch=num)
        RW_ClassObj(obj=self.years, write=True, name='years', batch=num)
        RW_ClassObj(obj=self.genres, write=True, name='genres', batch=num)
        RW_ClassObj(obj=self.grades, write=True, name='grades', batch=num)
        RW_ClassObj(obj=self.posters, write=True, name='posters', batch=num)
def isfloat(self, s):
try:
float(s)
return True
except ValueError:
return False
def idx_of_last_left_parenthesis(self, s):
for i in range(len(s)-1, -1, -1):
if s[i] == '(':
return i
return -1
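    # Note: idx_of_last_left_parenthesis(s) is equivalent to the built-in s.rfind('(').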
    def crawl_IMDb(self, i, show_text=False):  # worker function run on a child thread; results go on self, since a thread target cannot return
        url = main_page + self.imdbId_list[i]
        request = requests.get(url, headers=header_example, timeout=10)
        html = BeautifulSoup(request.text, 'html.parser')  # parse the HTML page
        links = html.find_all('meta')  # every <meta> element
        text, img = '', ''
        for link in links:
            if 'property' in link.attrs:
                if link.attrs['property'] == 'og:title':
                    text = link.attrs['content']  # string of the form "title (year) ⭐ rating | genres"
                if link.attrs['property'] == 'og:image':
                    img = link.attrs['content']  # URL of the movie poster
            if text and img:
                break
        """
        Alternative: pull the two tags directly and slice off the fixed-width wrapper
        (fragile if the tag layout changes):
        text = html.find('meta', attrs={'property': 'og:title'})  # "title (year) ⭐ rating | genres" string
        # <meta content="玩具總動員 (1995) ⭐ 8.3 | Animation, Adventure, Comedy" property="og:title"/>
        img = html.find('meta', attrs={'property': 'og:image'})  # URL of the movie poster
        # <meta content="url" property="og:image"/>
        text, img = str(text)[15:-23], str(img)[15:-23]
        """
if len(text) == 0:
return
if show_text:
            print(i+1, '-th fetched', sep='')
parts = text.split('|')
self.name_zhtw[i], self.years[i] = split_title_and_year(parts[0])
tail = -2 if parts[0][-1] == ' ' else -1
if self.isfloat(parts[0].split(' ')[tail]):
self.grades[i] = parts[0].split(' ')[tail]
else:
self.grades[i] = '0'
if len(parts) > 1:
self.genres[i] = parts[1].replace(' ', '').replace(',', '|')
else:
self.genres[i] = '(no genres listed)'
self.posters[i] = img
if show_text:
print(' ', self.name_zhtw[i], self.years[i], self.grades[i], self.genres[i], self.posters[i] != default_poster)
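    # Worked example (assuming split_title_and_year returns the (title, year) pair) for
    # the sample og:title "玩具總動員 (1995) ⭐ 8.3 | Animation, Adventure, Comedy":
    #     name_zhtw[i] = '玩具總動員', years[i] = '1995', grades[i] = '8.3',
    #     genres[i] = 'Animation|Adventure|Comedy'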
def threading_crawler(self, idxlist, show_text=False):
for i in idxlist:
self.crawl_IMDb(i, show_text)
            time.sleep(1)  # throttle: one-second pause between requests on this thread
    def crawl_threads(self, begin, batch_size, thread_count, target_idx=None, show_text=False):
        if not target_idx:
            target_idx = list(range(self.n))
        start_t = time.time()
        threads = []
        for i in range(thread_count):  # spawn thread_count worker threads
            start = begin + i*batch_size  # this thread's slice of the crawl range
            end = start + batch_size
            threads.append(threading.Thread(
                target=self.threading_crawler,  # worker function
                args=(target_idx[start:end], show_text))  # arguments for the worker function
            )
            threads[i].start()  # launch the worker thread
        for j in range(thread_count):  # wait for all worker threads to finish
            threads[j].join()
        end_t = time.time()
        print('Done, duration =', end_t - start_t, 'sec.')
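    # Partitioning sketch: thread i crawls target_idx[begin + i*batch_size : begin + (i+1)*batch_size],
    # so a single call covers at most thread_count*batch_size indices
    # (10*41 = 410, i.e. indices 0-409, with the values used in __main__ below).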
    def check_missing(self):  # find entries still missing a title or poster
return [i for i in range(self.n) if not self.name_zhtw[i] or not self.posters[i]]
def get_result(self):
return self.name_zhtw, self.years, self.genres, self.grades, self.posters
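# A hedged retry sketch (not part of the class's normal flow): re-crawl only the gaps
# reported by check_missing, e.g.
#     missing = crawler.check_missing()
#     if missing:
#         crawler.crawl_threads(begin=0, batch_size=len(missing)//10 + 1,
#                               thread_count=10, target_idx=missing)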
# In[main]:
if __name__ == "__main__":
    linking()  # merge movies.csv and links.csv
    movies = pd.read_csv('movies_linked.csv', sep=',')  # 62,423 movies
imdbId_list = list(movies['imdbId'])
crawler = IMDb_crawler(imdbId_list)
    crawler.crawl_threads(begin=0, batch_size=41, thread_count=10, show_text=True)
name_zhtw, years, genres, grades, posters = crawler.get_result()
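    # Optional checkpoint (the batch number 1 below is an arbitrary choice): persist this
    # run's results so a later session can call crawler.reload_var(1) instead of re-crawling:
    #     crawler.save_var(num=1)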
# In[]:
# movies['name_zhtw'] = name_zhtw
movies['year'] = years
movies['genres'] = genres
movies['grade'] = grades
movies['picture'] = posters
"""
未上映
tt7335008 (2024)
"""
movies[movies['imdbId'] == 'tt7335008']['year'] = '2024'
movies.to_csv('movies_extended.csv', index = False, header = True)