forked from simonseo/instagram-hashtag-crawler
crawler.py
129 lines (117 loc) · 4.06 KB
import json
import os
from collections import deque
from re import findall
from time import time, sleep
from util import randselect, byteify, file_to_list  # helpers from the repo's local util.py
import csv
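
# Overview: get_posts() pages through an Instagram hashtag feed via the API
# client, beautify_post() flattens each single-image post together with its
# author's profile into a dict, and visit_profile() writes both the raw feed
# and the processed posts as JSON files under config['profile_path'].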


def crawl(api, hashtag, config):
    """Crawl a single hashtag; the return value of visit_profile is currently unused."""
    # print('Crawling started at origin hashtag', origin['user']['username'], 'with ID', origin['user']['pk'])
    if visit_profile(api, hashtag, config):
        pass


def visit_profile(api, hashtag, config):
    """Fetch a hashtag feed and dump both the raw and the processed posts to JSON.

    Returns True on success, False if the feed yields too few usable posts or an error occurs.
    """
    while True:
        try:
            processed_tagfeed = {
                'posts': []
            }
            feed = get_posts(api, hashtag, config)
            # Create the output directory before the first write, otherwise the dump below fails.
            try:
                if not os.path.exists(config['profile_path'] + os.sep):
                    os.makedirs(config['profile_path'])
            except Exception as e:
                print('exception in profile path')
                raise e
            with open(config['profile_path'] + os.sep + str(hashtag) + '_rawfeed.json', 'w') as outfile:
                json.dump(feed, outfile, indent=2)
            profile_dic = {}  # cache of user_id -> profile to avoid repeated user_info calls
            posts = [beautify_post(api, post, profile_dic) for post in feed]
            posts = list(filter(lambda x: x is not None, posts))
            if len(posts) < config['min_collect_media']:
                return False
            else:
                processed_tagfeed['posts'] = posts[:config['max_collect_media']]
            try:
                with open(config['profile_path'] + os.sep + str(hashtag) + '.json', 'w') as outfile:
                    json.dump(processed_tagfeed, outfile, indent=2)
            except Exception as e:
                print('exception while dumping')
                raise e
        except Exception as e:
            print('exception while visiting profile', e)
            if str(e) == '-':
                raise e
            return False
        else:
            return True


def beautify_post(api, post, profile_dic):
    """Flatten a single-image post and its author's profile into one dict; return None for other media types."""
    try:
        if post['media_type'] != 1:  # If post is not a single image media
            return None
        keys = post.keys()
        # print(post)
        user_id = post['user']['pk']
        profile = profile_dic.get(user_id, False)
        # Retry until the author's profile has been fetched (or was already cached).
        while True:
            try:
                sleep(0.05)
                if not profile:
                    profile = api.user_info(user_id)
                    profile_dic[user_id] = profile
            except Exception as e:
                # print(post)
                print('exception in getting user_info from {} {}'.format(user_id, post['user']['username']), e)
                sleep(5)
                # raise e
            else:
                break
        # profile = api.username_info('simon_oncepiglet')
        # print(profile)
        # print('Visiting:', profile['user']['username'])
        processed_media = {
            'user_id': user_id,
            'username': profile['user']['username'],
            'full_name': profile['user']['full_name'],
            'profile_pic_url': profile['user']['profile_pic_url'],
            'media_count': profile['user']['media_count'],
            'follower_count': profile['user']['follower_count'],
            'following_count': profile['user']['following_count'],
            'date': post['taken_at'],
            'pic_url': post['image_versions2']['candidates'][0]['url'],
            'like_count': post['like_count'] if 'like_count' in keys else 0,
            'comment_count': post['comment_count'] if 'comment_count' in keys else 0,
            'caption': post['caption']['text'] if 'caption' in keys and post['caption'] is not None else ''
        }
        # Pull the hashtags back out of the caption text.
        processed_media['tags'] = findall(r'#[^#\s]*', processed_media['caption'])
        # print(processed_media['tags'])
        return processed_media
    except Exception as e:
        print('exception in beautify post')
        raise e
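
# For reference, each entry in the processed <hashtag>.json 'posts' list has the
# shape built by beautify_post() above (all values below are illustrative only):
# {
#     "user_id": 123456789,
#     "username": "example_user",
#     "full_name": "Example User",
#     "profile_pic_url": "https://...",
#     "media_count": 42,
#     "follower_count": 1000,
#     "following_count": 150,
#     "date": 1546300800,
#     "pic_url": "https://...",
#     "like_count": 12,
#     "comment_count": 3,
#     "caption": "sunrise #travel #nofilter",
#     "tags": ["#travel", "#nofilter"]
# }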


def get_posts(api, hashtag, config):
    """Page through the tag feed until max_collect_media posts are collected or the feed runs out."""
    try:
        feed = []
        try:
            uuid = api.generate_uuid(return_hex=False, seed='0')
            results = api.feed_tag(hashtag, rank_token=uuid, min_timestamp=config['min_timestamp'])
        except Exception as e:
            print('exception while getting feed1')
            raise e
        feed.extend(results.get('items', []))
        if config['min_timestamp'] is not None:
            return feed
        next_max_id = results.get('next_max_id')
        while next_max_id and len(feed) < config['max_collect_media']:
            print("next_max_id", next_max_id, "len(feed) < max_collect_media", len(feed) < config['max_collect_media'], len(feed))
            try:
                results = api.feed_tag(hashtag, rank_token=uuid, max_id=next_max_id)
            except Exception as e:
                print('exception while getting feed2')
                if str(e) == 'Bad Request: Please wait a few minutes before you try again.':
                    sleep(60)
                    continue  # retry the same page instead of re-appending the previous results
                else:
                    raise e
            feed.extend(results.get('items', []))
            next_max_id = results.get('next_max_id')
        return feed
    except Exception as e:
        print('exception while getting posts')
        raise e
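

# --- Usage sketch (not part of the original file) ---
# A minimal example of how crawl() might be driven. It assumes `api` is an
# instagram_private_api-style Client exposing generate_uuid(), feed_tag() and
# user_info() as used above; the config keys mirror the ones read by
# visit_profile()/get_posts(), and all values here are illustrative.
if __name__ == '__main__':
    from instagram_private_api import Client  # assumed client library

    api = Client('your_username', 'your_password')
    config = {
        'profile_path': 'output',    # directory where <hashtag>.json dumps are written
        'min_collect_media': 10,     # discard hashtags that yield fewer usable posts
        'max_collect_media': 100,    # stop paginating once this many posts are collected
        'min_timestamp': None,       # set a Unix timestamp to fetch only one filtered page
    }
    crawl(api, 'travel', config)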