-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtweets.py
46 lines (35 loc) · 1.37 KB
/
tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from pymongo import MongoClient
import re
import os
def clean(uncleanText):
    """Return a normalised, lower-case version of a tweet's text.

    The cleaning steps are applied in sequence, each one operating on the
    result of the previous step:

    1. Remove URLs (anything starting with ``http`` up to the next whitespace).
    2. Drop every non-ASCII character (code point >= 128).
    3. Remove a fixed set of special characters.
    4. Drop stand-alone ``&`` and ``RT`` tokens (tokens are split on a
       single space, so runs of spaces are preserved).
    5. Remove any remaining punctuation and underscores.
    6. Replace newlines with spaces and lower-case the result.

    Args:
        uncleanText: The raw tweet text (a ``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    # Strip URLs first so their fragments do not survive as junk tokens
    # once punctuation is removed further down.
    cleanTweet = re.sub(r'http\S+', '', uncleanText)
    # Keep ASCII characters only.
    cleanTweet = ''.join(char for char in cleanTweet if ord(char) < 128)
    cleanTweet = re.sub(r'[_()"*#/@<>{}`+=~|]', '', cleanTweet)
    # Filter on the already-cleaned text; rebuilding from ``uncleanText``
    # here would silently discard every step above.
    cleanTweet = ' '.join(
        word for word in cleanTweet.split(' ') if word not in ('&', 'RT')
    )
    cleanTweet = re.sub(r'([^\s\w]|_)+', '', cleanTweet)
    cleanTweet = cleanTweet.replace('\n', ' ')
    return cleanTweet.lower()
# ------------------------------ Extracting tweets from Mongo --------------------------------- #
# MongoClient() is called with no arguments, so pymongo's default connection
# settings are used.
client = MongoClient()
db = client.Assignment3Database
search = db.twitterSearchCollection
stream = db.twitterStreamCollection

# exist_ok=True avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs("tweets", exist_ok=True)

# One output file per tweet, numbered consecutively across both collections
# (search results first, then streamed tweets).
counter = 1
for collection in (search, stream):
    for tweet in collection.find():
        # Clean before opening the file so a failure here does not leave an
        # empty, unclosed file behind.
        text = clean(tweet["text"])
        # 'a' keeps the original append semantics; the context manager
        # guarantees the handle is closed even if the write fails.
        # NOTE(review): explicit UTF-8 so output does not depend on the
        # platform's default encoding - confirm downstream readers agree.
        with open("tweets/file_" + str(counter) + ".txt", 'a', encoding="utf-8") as f:
            f.write(text)
        counter += 1
# ------------------------------------------------------------------------------------------------ #