# Ec_candidates_tweets_sentiment_an.py
# imports
import pandas as pd
import nltk
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
from keras import models, layers
from collections import Counter
# Function to build the corpus dictionary (top unique words) used for one-hot encoding
def build_corpus(tweets):
    # corpora downloads for nltk
    nltk.download('stopwords')
    # dictionary size
    dictionarySize = 7000
    # tokenize and count word frequencies ('@' is left out of the filters so that
    # mentions survive tokenization and can be dropped explicitly below)
    topUniqueWordsFiltered = []
    tok = Tokenizer(filters='!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
    tok.fit_on_texts(tweets)
    topUniqueWords = sorted(tok.word_counts.items(), key=lambda x: x[1], reverse=True)
    # precompute the stopword set once; calling stopwords.words() per word is very slow
    spanishStopwords = set(stopwords.words('spanish'))
    for word, _ in topUniqueWords:
        # keep words longer than 3 characters that are not mentions, URLs or stopwords
        if len(word) > 3 and '@' not in word and 'http' not in word and word not in spanishStopwords:
            topUniqueWordsFiltered.append(word)
    return topUniqueWordsFiltered[:dictionarySize]
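# Illustrative note (hypothetical data): build_corpus(['me gusta la propuesta',
# 'no me gusta']) would keep frequent filtered tokens such as 'gusta' and 'propuesta'
# while dropping stopwords and short words, capped at the first 7000 entries.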
# Function to represent tweets as numerical vectors, using the corpus built from the
# TASS and candidate datasets as reference
def buildWordVectorMatrix(tweetsVect, corpusW):
    # empty numpy matrix of the necessary size
    wordVectorMatrix = np.zeros((len(tweetsVect), len(corpusW)))
    # map each corpus word to its column index once (list.index() in the loop is O(n))
    wordIndex = {word: idx for idx, word in enumerate(corpusW)}
    # fill the matrix with a binary representation of the words
    for pos, tweetInPos in enumerate(tweetsVect):
        # split each tweet into a list of its words
        tweetWords = tweetInPos.lower().split()
        # assign a value of 1 at the matrix position corresponding to the current tweet
        # and the column of each of its contained words in the built dictionary
        for word in tweetWords:
            # only consider words that are part of the built dictionary
            # (use the corpusW parameter, not the global corpusDictionary)
            if word in wordIndex:
                wordVectorMatrix[pos, wordIndex[word]] = 1
    return wordVectorMatrix
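# Illustrative note (hypothetical corpus): with corpusW = ['hola', 'mundo'], the tweet
# 'hola a todos' becomes the row [1., 0.] -- a binary bag-of-words vector that records
# word presence but ignores frequency and order.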
# Load joined TASS dataset
tassDf = pd.read_csv("Datasets/ALL_TassDF.csv", encoding='utf8').reset_index(drop=True)[['Text', 'Tag']]
# Select tweets tagged as positive, negative or neutral
tassDf = tassDf.loc[(tassDf.Tag == 'P') | (tassDf.Tag == 'N') | (tassDf.Tag == 'NEU')]
# Verify the final dataset - 57454 tweets
print(tassDf, '\n\n', tassDf.shape)
print(tassDf.columns.values)
print(tassDf.values)
# Load replies to tweets from two Ecuadorian presidential candidates
candidDf = pd.read_csv( 'Datasets/ALL_candidates.csv', encoding='utf8')
# Merge the TASS and candidate datasets to create the corpus
# (pandas' Series.append is deprecated; pd.concat gives the same continuous index)
joinedDfTexts = pd.concat([candidDf['text'], tassDf['Text']], ignore_index=True)
print(joinedDfTexts, '\n\n', joinedDfTexts.shape)
print(joinedDfTexts.values)
# Build the one hot matrix considering TASS and candidate datasets
corpusDictionary = build_corpus(joinedDfTexts)
print(len(corpusDictionary), corpusDictionary)
# Observe the TASS dataset balance, to decide whether any balancing method is needed
# histogram of the class distribution (show it now so later plots get a fresh figure)
plt.hist(tassDf['Tag'])
plt.show()
# Transform target names (P, N, NEU) of TASS dataset into integer representation
tassDf.loc[tassDf['Tag'] == 'N', 'Tag'] = 0
tassDf.loc[tassDf['Tag'] == 'NEU', 'Tag'] = 1
tassDf.loc[tassDf['Tag'] == 'P', 'Tag'] = 2
# Visualize the transformation to numerical classes
plt.hist(tassDf['Tag'])
plt.show()
# Represent the TASS dataset as vectors using the corpus created before
X_data = np.array(buildWordVectorMatrix(tassDf['Text'], corpusDictionary))
# Select the y (target) of all data
y_data = np.array(tassDf['Tag'])
# Divide the TASS dataset into training (50%), validation (25%) and test (25%):
# the two successive 50/50 splits below yield those proportions
# Training vs. validation+test
X_train, X_other, y_train, y_other = train_test_split(X_data, y_data, test_size=0.50, random_state=1, stratify=y_data)
# divide into validation and test datasets
X_validation, X_test, y_validation, y_test = train_test_split(X_other, y_other, test_size=0.50, random_state=1, stratify=y_other)
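# Sanity check (optional): confirm the ~50/25/25 split sizes
print(X_train.shape, X_validation.shape, X_test.shape)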
# Convert tags to a binary (one-hot) representation using Keras' to_categorical function
y_train_bin = to_categorical(y_train, num_classes=3)
y_test_bin = to_categorical(y_test, num_classes=3)
y_validation_bin = to_categorical(y_validation, num_classes=3)
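# For reference: to_categorical(2, num_classes=3) yields [0., 0., 1.]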
# Create the neural network model with the Keras Sequential class and add layers with the given activation functions
model = models.Sequential()
model.add(layers.Dense(1200, activation='relu', input_shape=(len(corpusDictionary),)))
model.add(layers.Dense(300, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
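# Optional sanity check: print a layer-by-layer overview of the network
model.summary()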
# Compile the model, associating the optimizer, loss and metrics
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['acc', 'AUC'])
# Train the model with the processed data and the selected metrics
print('X_train row sums', X_train.sum(axis=1))
print('X_train', X_train, X_train.shape)
print('y_train_bin', y_train_bin)
train_log = model.fit(X_train, y_train_bin,
epochs=10, batch_size=512,
validation_data=(X_validation, y_validation_bin))
# Model evaluation: plot accuracy and loss for the training and validation datasets
acc = train_log.history['acc']
val_acc = train_log.history['val_acc']
loss = train_log.history['loss']
val_loss = train_log.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Evaluate on the test dataset (returns [loss, accuracy, AUC], per the compiled metrics)
test_metrics = model.evaluate(X_test, y_test_bin)
print(test_metrics)
# Import data from replies to tweets of the Ecuadorian candidates
lassoPath = 'Datasets/replies_random1000_lasso_Diciembre.csv'
arauzPath = 'Datasets/replies_random1000_arauz_Diciembre.csv'
lassoDf = pd.read_csv(lassoPath, encoding='utf8').reset_index(drop=True)
arauzDf = pd.read_csv(arauzPath, encoding='utf8').reset_index(drop=True)
# Transform candidate tweets into numerical vectors via one-hot encoding
X_lasso = np.array(buildWordVectorMatrix(lassoDf['text'], corpusDictionary))
X_arauz = np.array(buildWordVectorMatrix(arauzDf['text'], corpusDictionary))
# Predict the sentiment of the candidate tweets using our trained model
# (predict_classes is deprecated for removal; for a multiclass softmax output take the
# argmax of the predicted probabilities instead)
classResultsLasso = np.argmax(model.predict(X_lasso), axis=1)
lassoPredDistribution = Counter(classResultsLasso).most_common()
print(lassoPredDistribution)
classResultsArauz = np.argmax(model.predict(X_arauz), axis=1)
arauzPredDistribution = Counter(classResultsArauz).most_common()
print(arauzPredDistribution)
# Candidate results
# (0 negative, 1 neutral, 2 positive)
# most_common() is ordered by frequency, so look counts up by class id instead of
# relying on list position
lassoCounts = dict(lassoPredDistribution)
print('\nReplies to Lasso tweets results:')
print(f'NEGATIVES: {lassoCounts.get(0, 0)}')
print(f'NEUTRAL: {lassoCounts.get(1, 0)}')
print(f'POSITIVES: {lassoCounts.get(2, 0)}')
arauzCounts = dict(arauzPredDistribution)
print('\nReplies to Arauz tweets results:')
print(f'NEGATIVES: {arauzCounts.get(0, 0)}')
print(f'NEUTRAL: {arauzCounts.get(1, 0)}')
print(f'POSITIVES: {arauzCounts.get(2, 0)}')
# Pie charts
# derive each slice's label from its class id so the labels always match the frequencies
classNames = {0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}
# Lasso
valuesLasso = [freq for cls, freq in lassoPredDistribution]
labelsLasso = [classNames[cls] for cls, freq in lassoPredDistribution]
figLasso1, axLasso1 = plt.subplots()
axLasso1.set_title('Guillermo Lasso')
axLasso1.pie(valuesLasso, labels=labelsLasso, autopct='%1.1f%%', shadow=True, startangle=90)
plt.show()
# Arauz
valuesArauz = [freq for cls, freq in arauzPredDistribution]
labelsArauz = [classNames[cls] for cls, freq in arauzPredDistribution]
figArauz1, axArauz1 = plt.subplots()
axArauz1.set_title('Andres Arauz')
axArauz1.pie(valuesArauz, labels=labelsArauz, autopct='%1.1f%%', shadow=True, startangle=90)
plt.show()