fbpagescraper.py
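"""
Searches the Facebook Graph API for pages matching QUERY, walks each page's
/posts feed (following pagination), and writes posts and comments whose text
matches the terms in `terms` to the tab-separated file out_<QUERY>.csv.
Every request is logged to log.csv; unhandled content types are dumped to
trash.txt. A valid Graph API access token must be pasted into ACCESSTOKEN
before running, and the script prompts for confirmation before opening the
output file.

Usage (as read from the argument parsing in main()):
    python fbpagescraper.py                       # fresh run; overwrites out_<QUERY>.csv
    python fbpagescraper.py <pageId>              # resume from the FB page with this ID; appends
    python fbpagescraper.py <pageId> <postsUrl>   # resume that page from a specific posts paging URL; appends
"""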
import json,requests
import sys,csv,re,os
import time
from time import gmtime, strftime
v=False
#v=True
# Flag for verbose printing
outFile=None
# File to deposit filtered content
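# Rows written to outFile (by matchesQuery, parsePosts and main) take the forms:
#   ['PAGE', id, name, category]                          one per FB page
#   ['MATCH', matched_text]                               one per regex hit
#   ['POST'|'COMMENT', id, created_time, text, category]
#   ['VIDEO'|'STATUS'|'PHOTO'|'LINK', id, created_time, content]
#   ['PAGE TOTALS', nPages, nPosts, nMatches]
#   ['RUNNING PAGE TOTALS', p, nPostsTotal, nMatchesTotal]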
logFile=csv.writer(open('log.csv','a'),delimiter='\t')
# Log file for requests
trashFile=csv.writer(open('trash.txt','w'),delimiter='\t')
# File for writing weird bits of content I don't understand yet
ACCESSTOKEN='' # Insert access token here.
# This should be a long-lasting app access token
LIMIT='5000'
# 5000 is limit for pages
QUERY='sunil'
# Query to grab pages
terms=['sunil']
regexString='|'.join(terms)
matchRe=re.compile(regexString)
# Construct regex from terms
nMatches=0
nTrash=0
#################
def logQuery(url):
#################
    # Makes an entry in the log file: time, ID of the page queried, and the URL
    global logFile
    pageId=url.partition('graph.facebook.com/')[2]
    pageId=pageId.partition('/posts')[0]
    logFile.writerow([strftime("%H:%M:%S",time.localtime()),pageId,url])
#################
def matchesQuery(text,outFile):
#################
    # Searches each piece of content for a
    # single search term; returns True/False
    # The precompiled matchRe was built without flags, so search with UNICODE|IGNORECASE here
    returnVal=False
    res=re.search(regexString,text,re.UNICODE|re.IGNORECASE)
    if res:
        returnVal=True
        outFile.writerow(['MATCH',res.group().encode('utf-8')])
        # Log each match
    return returnVal
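# parsePosts below expects `rr` to be one decoded JSON page of /<pageId>/posts
# results: a dict with a u'data' list of post dicts (u'id', u'created_time',
# u'type', and optionally u'message', u'link', and u'comments' with its own
# u'data' list of comment dicts).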
#################
def parsePosts(rr,nPages,nPosts,category):
#################
    # Cycles through all posts from a given FB page;
    # posts/comments matching the keywords are written to file
    global outFile
    global logFile
    global nMatches
    global trashFile
    global nTrash
    for d,dd in enumerate(rr[u'data']):
        nPosts+=1
        try:
            if v:print '\tMESSAGE',dd[u'message'].encode('utf-8')
        except:
            z=0
        if u'message' in dd.keys():
            # Can we use 'story' entries?
            message=dd['message'].encode('utf-8')
            outLine=['POST',dd[u'id'],dd[u'created_time']]
            outLine.append(message.replace('\n',' | '))
            outLine.append(category)
            if v:print 'MATCHES?'
            if matchesQuery(dd['message'],outFile):
                outFile.writerow([o for o in outLine])
                nMatches+=1
        else:
            if v:print '!!! NO MESSAGE',dd.keys()
        if v: print 'COMMENTS?'
        if u'comments' in dd.keys():
            if dd[u'type'] in [u'photo',u'swf',u'link',u'status',u'video']:
                # print 'HAS COMMENTS',dd[u'type'],dd[u'link'],dd.keys()
                if u'link' in dd.keys():trashFile.writerow([dd[u'type'],dd[u'link'].encode('utf-8')])
                else:trashFile.writerow([dd[u'type']])
                # sys.exit(1)
                # Keep track of photos, links, statuses etc. which DO have comments (many don't)
            for c in dd['comments']['data']:
                if v:print '\t\tCOMMENT',c['message']
                message=c['message'].encode('utf-8')
                outLine=['COMMENT',c[u'id'],c[u'created_time']]
                outLine.append(re.sub('\n',' | ',message))
                outLine.append(category)
                if matchesQuery(outLine[-2],outFile):
                    outFile.writerow([o for o in outLine])
                    nMatches+=1
        ## If there are no comments, catch other possible types
        ## These all seem to consistently not have any comments
        ## Tried querying many different types across many different pages
        ## Seems not due to privacy settings, as comments are possible in the browser
        ## Also swf,question
        #################################
        elif dd[u'type']==u'video':
            outLine=['VIDEO',dd[u'id'],dd[u'created_time']]
            contentString=''
            if u'link' in dd.keys(): trashFile.writerow([dd[u'link'].encode('utf-8')])
            trashFile.writerow(dd.keys())
            for k in [u'description',u'message',u'caption']:
                if k in dd.keys():
                    contentString+='|'+dd[k].replace('\n','|')
            contentString=contentString.encode('utf-8')
            if matchesQuery(contentString,outFile):
                outLine.append(contentString)
                outFile.writerow(outLine)
                nMatches+=1
        #################################
        elif dd[u'type']==u'status':
            # message
            # likes
            outLine=['STATUS',dd[u'id'],dd[u'created_time']]
            contentString=''
            if u'link' in dd.keys(): trashFile.writerow([dd[u'link'].encode('utf-8')])
            trashFile.writerow(dd.keys())
            for k in [u'message']:
                if k in dd.keys():
                    contentString+='|'+dd[k].replace('\n','|')
            contentString=contentString.encode('utf-8')
            if matchesQuery(contentString,outFile):
                outLine.append(contentString)
                outFile.writerow(outLine)
                nMatches+=1
        #################################
        elif dd[u'type']==u'photo':
            # picture,message
            # likes
            outLine=['PHOTO',dd[u'id'],dd[u'created_time']]
            contentString=''
            if u'link' in dd.keys(): trashFile.writerow([dd[u'link'].encode('utf-8')])
            trashFile.writerow(dd.keys())
            for k in [u'picture',u'message']:
                if k in dd.keys():
                    contentString+='|'+dd[k].replace('\n','|')
            contentString=contentString.encode('utf-8')
            if matchesQuery(contentString,outFile):
                outLine.append(contentString)
                outFile.writerow(outLine)
                nMatches+=1
        #################################
        elif dd[u'type']==u'link':
            # description,message
            # likes
            outLine=['LINK',dd[u'id'],dd[u'created_time']]
            contentString=''
            if u'link' in dd.keys(): trashFile.writerow([dd[u'link'].encode('utf-8')])
            trashFile.writerow(dd.keys())
            for k in [u'description',u'message']:
                if k in dd.keys():
                    contentString+='|'+dd[k].replace('\n','|')
            contentString=contentString.encode('utf-8')
            if matchesQuery(contentString,outFile):
                outLine.append(contentString)
                outFile.writerow(outLine)
                nMatches+=1
        #################################
        else:
            if v:print '!!! NO COMMENTS',dd.keys()
            #print dd
            # nTrash+=1
            # print 'ADDING TO TRASH FILE',nTrash
            # print dd.keys(),
            # print 'TYPE',dd['type']
            # print dd[u'id']
            # if u'link' in dd.keys():print dd[u'link']
            # if 'message' in dd.keys():print 'MESSAGE',dd['message'].replace('\n','|').encode('utf-8'),dd['type']
            # json.dump(dd,trashFile,indent=2)
            # sys.exit(1)
        if v:print '+++++++++++++++++++'
        if v:print ''
    nPages+=1
    return nPages,nPosts
    '''
    except:
        print 'MISSING data KEY',rr.keys()
        print rr[u'error_code'],rr[u'error_msg']
        outFile.writerow(['MISSING DATA'])
        # sys.exit(1)
    '''
########################
def main():
########################
    global outFile
    restartOffset=0
    nPostsTotal=0
    # Counts the total number of unfiltered posts considered
    nMatchesTotal=0
    global nMatches
    startTime=time.localtime()
    ###################################
    # Parse args
    if len(sys.argv)==2:
        restartId=sys.argv[1]
        # outFile=csv.writer(open('out_'+QUERY.encode('utf-8')+'.csv','a'),delimiter='\t')
        skip=True
        commentsPageSkip=False
        print '******APPENDING TO FILE','out_'+QUERY+'.csv'
        print '******RESTARTING FROM PAGE',restartId
        raw_input('IS THIS OK?')
        outFile=csv.writer(open('out_'+QUERY+'.csv','a'),delimiter='\t')
        restartCommentsPage=None
    elif len(sys.argv)==3:
        restartId=sys.argv[1]
        restartCommentsPage=sys.argv[2]
        # outFile=csv.writer(open('out_'+QUERY.encode('utf-8')+'.csv','a'),delimiter='\t')
        skip=True
        commentsPageSkip=True
        print '******APPENDING TO FILE','out_'+QUERY+'.csv'
        print '******RESTARTING FROM POSTS PAGE',restartCommentsPage
        raw_input('IS THIS OK?')
        outFile=csv.writer(open('out_'+QUERY+'.csv','a'),delimiter='\t')
    else:
        print '******OPENING OUTFILE','out_'+QUERY+'.csv'
        if 'out_'+QUERY+'.csv' in os.listdir('.'):print '!!!!!WILL OVERWRITE'
        raw_input('IS THIS OK?')
        outFile=csv.writer(open('out_'+QUERY+'.csv','w'),delimiter='\t')
        skip=False
        commentsPageSkip=False
        restartCommentsPage=None
        restartId=-9999
    # restartId is the ID of the FB page to resume from
    # restartCommentsPage is the paging URL of the posts page to resume from
    # skip is a flag to skip FB pages until restartId is found
    # commentsPageSkip is a flag to skip pages of posts/comments on the
    # FB page matching restartId until restartCommentsPage is found
    ###################################
    tempUrl='https://graph.facebook.com/search?q='+QUERY+'&limit='+LIMIT+'&type=page&access_token='+ACCESSTOKEN
    r=requests.get(tempUrl).json()
    logQuery(tempUrl)
    # Get all pages matching QUERY
    # LIMIT=5000
    # Make limit higher once we start to look at comments not pages???
    if not 'data' in r.keys():
        print 'EXPIRED????',r
        sys.exit(1)
    ################################################
    for p,page in enumerate(r[u'data']):
        # Each page has 'category','name','id'
        errorSkip=False
        nError=0
        try:
            print 'PAGE #',p,'('+str(len(r[u'data']))+')',page[u'name'],page[u'category'],page[u'id'],strftime("%H:%M:%S", time.localtime())
        except:
            print '!!!!!!!PAGE ERROR'
        if page[u'id']==restartId:
            skip=False
            print 'RESTARTING....'
        if not skip:
            tempUrl='https://graph.facebook.com/'+page[u'id']+'/posts?'+'&limit='+LIMIT+'&access_token='+ACCESSTOKEN
            logQuery(tempUrl)
            rr=requests.get(tempUrl).json()
            # Try to get the posts
            while u'error' in rr.keys() or u'error_msg' in rr.keys():
                if (u'error' in rr.keys() and u'code' in rr[u'error'].keys() and rr[u'error'][u'code'] in [1,2]) or u'error_msg' in rr.keys():
                    # API error
                    print 'API ERROR: SLEEPING....'
                    print rr
                    time.sleep(60)
                    print 'RETRYING (1)'
                    nError+=1
                    if nError==10:
                        print nError,'ERRORS - SKIPPING'
                        errorSkip=True
                        break
                else:
                    # TOKEN ERROR
                    print '********ERROR',rr[u'error']
                    sys.exit(1)
                tempUrl='https://graph.facebook.com/'+page[u'id']+'/posts?'+'&limit='+LIMIT+'&access_token='+ACCESSTOKEN
                rrtemp=requests.get(tempUrl)
                print 'rrtemp',rrtemp,rrtemp.text
                rr=rrtemp.json()
                logQuery(tempUrl)
                # Try to get the posts again
            nPages=0
            nError=0
            nPosts=0
            nMatches=0
            outFile.writerow(['PAGE',page[u'id'],page[u'name'].encode('utf-8'),page[u'category'].encode('utf-8')])
            if not errorSkip and not commentsPageSkip:
                # Skip if the API has caused 10 errors in a row,
                # or if restarting from a later comments page
                errorSkip=False
                nPages,nPosts=parsePosts(rr,nPages,nPosts,page[u'category'].encode('utf-8'))
            while 'paging' in rr.keys() and not errorSkip and not commentsPageSkip:
                if v:print 'LOADING',rr[u'paging'][u'next']
                rrrRaw=requests.get(rr[u'paging'][u'next'])
                logQuery(rr[u'paging'][u'next'])
                if rr['paging']['next']==restartCommentsPage and restartCommentsPage:
                    commentsPageSkip=False
                    print '**********MATCHED RESTART PAGE - RESUMING PARSING COMMENTS'
                    # If we want to restart from the last page
                elif restartCommentsPage and restartId==page['id']:
                    print '**********DIDNT MATCH COMMENTS RESTART PAGE'
                    restartOffset+=1
                try:
                    rrr=rrrRaw.json()
                except:
                    print 'JSON ERROR', rrrRaw.status_code
                while u'error' in rrr.keys() or u'error_msg' in rrr.keys():
                    if u'error' in rrr.keys() or u'error_msg' in rrr.keys():
                        # API error
                        print 'API ERROR: SLEEPING....'
                        print rrr,rrrRaw,rrrRaw.status_code,rrrRaw.text
                        print rr[u'paging'][u'next']
                        time.sleep(10)
                        print 'RETRYING'
                        nError+=1
                        if nError==10:
                            print nError,'ERRORS - SKIPPING'
                            errorSkip=True
                            break
                    else:
                        # TOKEN ERROR ?
                        print '********ERROR',rrr
                        sys.exit(1)
                    rrr=requests.get(rr[u'paging'][u'next'])
                    # print 'rrr',rrr,rrr.text,rrr.status_code
                    # print
                    rrr=rrr.json()
                    logQuery(rr['paging']['next'])
                    # Try to get the posts again
                    # if the API doesn't respond
                if not commentsPageSkip:
                    if v:
                        print '# COMMENTS PAGES',nPages,'# POSTS',nPosts,'# MATCHES',nMatches,strftime("%H:%M:%S", time.localtime()),
                        if not restartOffset==0:
                            print '# OFFSET',restartOffset
                        else:
                            print ''
                if (not errorSkip and not commentsPageSkip) and not skip:
                    # Skip if the API has caused 10 errors in a row,
                    # or if restarting from a later page of comments,
                    # or if the restart page has not been found yet
                    nPages,nPosts=parsePosts(rrr,nPages,nPosts,page[u'category'].encode('utf-8'))
                else:
                    print '************NOT PARSING POSTS',errorSkip,commentsPageSkip
                    print 'BREAKING'
                    break
                rr=rrr
            print '# COMMENTS PAGES',nPages,'# POSTS',nPosts,'# MATCHES',nMatches,strftime("%H:%M:%S", time.localtime())
            if not restartOffset==0:
                print '# OFFSET',restartOffset
            outFile.writerow(['PAGE TOTALS',str(nPages),str(nPosts),str(nMatches)])
            outFile.writerow(['RUNNING PAGE TOTALS',p,str(nPostsTotal),str(nMatchesTotal)])
            nPostsTotal+=nPosts
            nMatchesTotal+=nMatches
            print 'TOTAL SO FAR #POSTS',nPostsTotal,'#MATCHES',nMatchesTotal
            print '-----------'
            restartOffset=0
        else:
            print 'SKIPPING.....',nPostsTotal
    print 'FINISHED',strftime("%H:%M:%S",startTime),'-',strftime("%H:%M:%S",time.localtime())
#####
if __name__=='__main__':
#####
    main()