-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsentiment_build_allcomms.py
332 lines (289 loc) · 14.2 KB
/
sentiment_build_allcomms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#Sentiment Calculator
#Collect file locations of csv files in folder
#Loop through each file and
#Loop through each comment and
#Run sentiment analysis
#Save as new dataframe
#Store dataframe elsewhere
#Calculate a list of averages
#Append list to dictionary of Overall Stats
#Append Post ID to dictionary of Overall Stats
#Store Sentiment Dataframe in CSV
print("Importing modules....")
import csv #open csv files
import pandas as pd #data manipulation
import sys #csv files in folder
import datetime #converting timestamps
import requests #for sentiment
from nltk.tokenize import sent_tokenize, word_tokenize
import os #access filenames
import time #Track execution time
import string
from collections import Counter
from nltk.corpus import stopwords
stopWords = set(stopwords.words("english"))
#--------------------BUILD SENTIMENT ANALYSIS FUNCTIONS-----------------#
#Function removes punctuation and stopwords and returns list of words.
def refine(text):
filtered = []
nopunc = [char for char in text if char not in string.punctuation] #remove punctuation
nopunc = "".join(nopunc)
tokens = word_tokenize(nopunc) #split into list of words
wordsFiltered = [word for word in tokens if word.lower() not in stopWords]
#extra stopwords would go here
#('game', 612), ('like', 450), ('games', 343),
#('people', 223), ('dont', 220), ('really', 187), ('get', 178), ('one', 174), ('conference', 169), ('think', 161), ('Im', 154),
#[('game', 456), ('like', 355), ('games', 207), ('looks', 152), ('really', 143), ('world', 138), ('people', 138),('one', 120) ('think', 119), ('dont', 112), ('one', 110), ('would', 98), ('get', 96),
#('much', 95), ('even', 90), ('make', 88), ('way', 85), ('pretty', 82), ('see', 78), ('thing', 77), ('time', 75), ('actually', 75), ('going', 73)]
return wordsFiltered
#Function returns 30 most common words.
def most_common(text):
threadCounter = Counter()
refined = refine(text) #clean, remove stopwords
threadCounter.update(refined) #update thread counter when new words
return threadCounter.most_common(30)
#Function
def low_effort(text):
filtered = []
nopunc = [char for char in text if char not in string.punctuation] #remove punctuation
nopunc = "".join(nopunc)
tokens = word_tokenize(nopunc) #split into list of words
if len(tokens) < 6 or len(nopunc) < 140:
label = "low effort"
else:
label = "high effort"
return label
#Hu Liu Sentiment Build
#Return list of words within URL
def get_posnegWords(url):
words = requests.get(url).content.decode('latin-1')
word_list = words.split('\n')
index = 0
while index < len(word_list):
word = word_list[index]
if ';' in word or not word:
word_list.pop(index)
else:
index+=1
return word_list
#URLs for Positive and Negative words for Hu Liu Model
p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
positive_words = get_posnegWords(p_url)
negative_words = get_posnegWords(n_url)
print('Within Hu and Liu\'s sentiment analysis lexicon:')
print('There are ' + str(len(positive_words)) + ' positive words and ' + str(len(negative_words)) + ' negative words' )
#Track scores for no. of positive/negative words and Return final scores
def huLiu_sentiment(text):
posTrack = 0
negTrack = 0
sentiment_field = list()
for word in word_tokenize(text):
if word in positive_words:
posTrack+=1
if word in negative_words:
negTrack+=1
pos = posTrack/len(word_tokenize(text))
neg = negTrack/len(word_tokenize(text))
difference = (posTrack-negTrack)/len(word_tokenize(text))
sentiment_field.extend((pos,neg,difference))
return sentiment_field
#NRC Sentiment Build
#Builds NRC Word dictionary
def get_nrc_words():
#check location of this file
nrc = "\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
count=0
emotion_dict=dict()
with open(nrc,'r') as f:
all_lines = list()
for line in f:
if count < 46:
count+=1
continue
line = line.strip().split('\t')
if int(line[2]) == 1:
if emotion_dict.get(line[0]):
emotion_dict[line[0]].append(line[1])
else:
emotion_dict[line[0]] = [line[1]]
return emotion_dict
emotion_dict = get_nrc_words()
#Using NRC Emotion words analyse text within argument and return a dictionary of emotion scores
def emotion_analyzer(text,emotion_dict=emotion_dict):
emotions = {x for y in emotion_dict.values() for x in y} #return set based on the following for-loops
emotion_count = dict()
for emotion in emotions:
emotion_count[emotion] = 0 #sets each emotion e.g. anger, fear at 0
total_words = len(text.split()) #splits text into list then returns total number of words
for word in text.split(): #for each word in list of words
if emotion_dict.get(word): #if that word is listed in emotion_dict
for emotion in emotion_dict.get(word): #for each emotion in dictionary of emotions (all currently set to zero)
emotion_count[emotion] += 1/len(text.split())
#sort emotions into ascending order top to bottom
from collections import OrderedDict
sorted_emotions = OrderedDict(sorted(emotion_count.items(), key=lambda t: t[1], reverse=True))
return sorted_emotions
#Vader Comparison Build
def vader_comparison(texts):
from nltk import sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
headers = ['pos','neg','neu','compound']
analyzer = SentimentIntensityAnalyzer()
sentences = sent_tokenize(texts)
pos=compound=neu=neg=0
for sentence in sentences:
vs = analyzer.polarity_scores(sentence)
pos+=vs['pos']/(len(sentences)) #positive = total cumulative positive score / no. of sentences = average positive score
compound+=vs['compound']/(len(sentences))
neu+=vs['neu']/(len(sentences))
neg+=vs['neg']/(len(sentences))
vader_scores = {'v_positive':pos,'v_negative':neg,'v_neutral':neu,'v_compound':compound}
#sort emotions into ascending order top to bottom
from collections import OrderedDict
vader_scores = OrderedDict(sorted(vader_scores.items(), key=lambda t: t[0], reverse=True)) #changed to [0] to order by key alphabetically
return vader_scores
#Creates new dataframe of sentiments from file
def add_sentiment_rows(file, folder):
#Open File
df = pd.read_csv(file, encoding="ISO-8859-1") #Open CSV in read mode
#Define whether it's Top Comments or All Comments
if folder == "Comments":
post_id = file.replace("\\Comments\\","")
comments = df["All Comments"]
elif folder == "Top Comments":
post_id = file.replace("\\Top Comments\\","")
comments = df["Top Comment"]
#Assign Post ID to post_id
post_id = post_id.replace(".csv", "")
print(str(post_id) + " has been opened for " + folder)
#Create new dictionary to store Sentiments for each comment
new_df = {}
#alist, blist, clist, dlist, elist = ([] for i in range(5))
(new_df['v_positive'], new_df['v_negative'], new_df['v_neutral'], new_df['v_compound'], new_df['Hu_pos'], new_df['Hu_neg'], new_df['Hu_diff'],
new_df['nrc_positive'], new_df['nrc_negative'], new_df['nrc_fear'], new_df['nrc_trust'], new_df['nrc_anger'], new_df['nrc_joy'],
new_df['nrc_anticipation'], new_df['nrc_sadness'], new_df['nrc_disgust'], new_df['nrc_surprise']) = ([] for i in range(17))
#create a dictionary entry = common words
#add all words from each comment to total words
#run common words
#Run Sentiment Analysis on each comment
for comment in comments:
#remove low effort comments
if low_effort(comment) == "low effort":
continue
vader = vader_comparison(comment)
new_df['v_positive'].append(vader['v_positive'])
new_df['v_negative'].append(vader['v_negative'])
new_df['v_neutral'].append(vader['v_neutral'])
new_df['v_compound'].append(vader['v_compound'])
#huLiu_sentiment(text) returns sentiment_field.extend((pos,neg,difference))
huLiu = huLiu_sentiment(comment)
new_df['Hu_pos'].append(huLiu[0])
new_df['Hu_neg'].append(huLiu[1])
new_df['Hu_diff'].append(huLiu[2])
#('positive','negative','fear','trust','anger','joy','anticipation','sadness','disgust','surprise')
#emotion_analyzer(text,emotion_dict=emotion_dict)
nrc = emotion_analyzer(comment)
new_df['nrc_positive'].append(nrc['positive'])
new_df['nrc_negative'].append(nrc['negative'])
new_df['nrc_fear'].append(nrc['fear'])
new_df['nrc_trust'].append(nrc['trust'])
new_df['nrc_anger'].append(nrc['anger'])
new_df['nrc_joy'].append(nrc['joy'])
new_df['nrc_anticipation'].append(nrc['anticipation'])
new_df['nrc_sadness'].append(nrc['sadness'])
new_df['nrc_disgust'].append(nrc['disgust'])
new_df['nrc_surprise'].append(nrc['surprise'])
#Turn dictionary into a Dataframe
new_frame = pd.DataFrame.from_dict(new_df)
#Turn Comments from original series into Dataframe
comments_frame = comments.to_frame()
#Merge Dataframes of Comments and Sentiments
merged_frame = pd.concat([comments_frame, new_frame], axis=1)
return post_id, merged_frame
#Calculate Mean Averages of each Column in dataframe and return as list
def averages(data_frame):
full_text = ""
info = []
all_comments = data_frame['All Comments'].tolist()
for i in all_comments:
full_text+=i
common_words = most_common(full_text)
vpos_mean = data_frame['v_positive'].mean()
vneg_mean = data_frame['v_negative'].mean()
vneu_mean = data_frame['v_neutral'].mean()
vcomp_mean = data_frame['v_compound'].mean()
hpos_mean = data_frame['Hu_pos'].mean()
hneg_mean = data_frame['Hu_neg'].mean()
hdiff_mean = data_frame['Hu_diff'].mean()
npos_mean = data_frame['nrc_positive'].mean()
nneg_mean = data_frame['nrc_negative'].mean()
nfear_mean = data_frame['nrc_fear'].mean()
ntrust_mean = data_frame['nrc_trust'].mean()
nanger_mean = data_frame['nrc_anger'].mean()
njoy_mean = data_frame['nrc_joy'].mean()
nantic_mean = data_frame['nrc_anticipation'].mean()
nsad_mean = data_frame['nrc_sadness'].mean()
ndisg_mean = data_frame['nrc_disgust'].mean()
nsurpr_mean = data_frame['nrc_surprise'].mean()
info.extend((vpos_mean, vneg_mean, vneu_mean, vcomp_mean, hpos_mean, hneg_mean, hdiff_mean, npos_mean,
nneg_mean, nfear_mean, ntrust_mean, nanger_mean, njoy_mean, nantic_mean, nsad_mean, ndisg_mean, nsurpr_mean, common_words))
return info
def update_stats(post_id, sentiments):
posts_stats['Most Common Words'].append(sentiments[17])
posts_stats['Vader Positive'].append(sentiments[0])
posts_stats['Vader Negative'].append(sentiments[1])
posts_stats['Vader Neutral'].append(sentiments[2])
posts_stats['Vader Comp'].append(sentiments[3])
posts_stats['Hui Positive'].append(sentiments[4])
posts_stats['Hui Negative'].append(sentiments[5])
posts_stats['Hui Difference'].append(sentiments[6])
posts_stats['NRC Positive'].append(sentiments[7])
posts_stats['NRC Negative'].append(sentiments[8])
posts_stats['NRC Fear'].append(sentiments[9])
posts_stats['NRC Trust'].append(sentiments[10])
posts_stats['NRC Anger'].append(sentiments[11])
posts_stats['NRC Joy'].append(sentiments[12])
posts_stats['NRC Anticipation'].append(sentiments[13])
posts_stats['NRC Sad'].append(sentiments[14])
posts_stats['NRC Disgust'].append(sentiments[15])
posts_stats['NRC Surprise'].append(sentiments[16])
posts_stats['Post ID'].append(post_id)
print(str(post_id) + " sentiment averages have been calculated.")
#Create dictionary to store averages for each file
posts_stats = {}
#alist, blist, clist, dlist, elist = ([] for i in range(5))
(posts_stats['Vader Positive'], posts_stats['Vader Negative'], posts_stats['Vader Neutral'], posts_stats['Vader Comp'],
posts_stats['Hui Positive'], posts_stats['Hui Negative'], posts_stats['Hui Difference'], posts_stats['NRC Positive'], posts_stats['NRC Negative'],
posts_stats['NRC Fear'], posts_stats['NRC Trust'], posts_stats['NRC Anger'], posts_stats['NRC Joy'], posts_stats['NRC Anticipation'],
posts_stats['NRC Sad'], posts_stats['NRC Disgust'], posts_stats['NRC Surprise'], posts_stats['Post ID'],
posts_stats['Most Common Words']) = ([] for i in range(19))
#-------------------------Execute above code--------------------------------------
print("Here we go")
location = "\\Comments\\"
comms_list = os.listdir(location)
comms_file_list = []
#create list of file locations of comments
for filename in comms_list:
comms_file_list.append(location + filename)
print("List of file dictories have been created")
counter = 0
for file in comms_file_list:
start_point = time.time()
start_time = time.time()
#remove stopwords, 140 characters, add most common words
post_id, comms_sents = add_sentiment_rows(file, "Comments") #add_sentiment_rows(file, folder)
comms_sents.to_csv("\\Sentiments\\" + str(post_id) + "_sentiments.csv")
means = averages(comms_sents)
update_stats(post_id, means)
counter+=1
if counter % 10 == 0:
print("Currently on " + str(counter) + " with Post ID " + str(post_id))
print("--- %s minutes ---" % ((time.time() - start_time)*60)) #fix timings
ult_frame = pd.DataFrame.from_dict(posts_stats)
#---------------------------------------------------------------
print("See below shape of Ultimate Sentiments Dataframe")
print(ult_frame.shape) #check shape
print("Time to upload final sentiment file")
ult_frame.to_csv("\\Sentiment_All_Comments_Done.csv")
print("Program executed in --- %s minutes ---" % ((time.time() - start_point)/60))