-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
409 lines (308 loc) · 15 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 17 20:25:28 2022
@author: Сухас Дхолз
"""
import pandas as pd
import numpy as np
import re
# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
# Load the MBTI dataset. The code below reads two columns from it: `type`
# (a four-letter MBTI code) and `posts` (text that is split on '|||').
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# on the machine that runs this script.
data = pd.read_csv('C:/Users/GTekSD/Desktop/personality_prediction/mdataset.csv')
# Notebook-style previews: bare expressions, so they display nothing when the
# file is run as a script.
data.head(10)
[p.split('|||') for p in data.head(2).posts.values]

# Bar chart of the number of rows per MBTI type.
cnt_types = data['type'].value_counts()
plt.figure(figsize=(12,4))
# x and y are passed by keyword: seaborn 0.12 made them keyword-only, so the
# positional form raises TypeError on current releases.
sns.barplot(x=cnt_types.index, y=cnt_types.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Types', fontsize=12)
plt.show()
def get_types(row):
    """Split the MBTI code in ``row['type']`` into four binary indicators.

    Each position of the code is compared with the two letters of its axis.
    The first letter of the pair (I, N, T, J) maps to 1 and the second
    (E, S, F, P) maps to 0. Any other character prints a diagnostic for that
    axis and leaves the indicator at 0.

    Returns:
        pd.Series with the keys 'IE', 'NS', 'TF', 'JP', in that order.
    """
    code = row['type']
    # (series key, letter encoded as 1, letter encoded as 0, diagnostic)
    axes = (
        ('IE', 'I', 'E', 'I-E incorrect'),
        ('NS', 'N', 'S', 'N-S incorrect'),
        ('TF', 'T', 'F', 'T-F incorrect'),
        ('JP', 'J', 'P', 'J-P incorrect'),
    )
    indicators = {}
    for position, (key, one_letter, zero_letter, message) in enumerate(axes):
        letter = code[position]
        if letter == one_letter:
            indicators[key] = 1
            continue
        if letter != zero_letter:
            print(message)
        indicators[key] = 0
    return pd.Series(indicators)
# Append the four binary indicator columns (IE, NS, TF, JP) built by get_types.
data = data.join(data.apply(get_types, axis=1))
data.head(5)

indicator_cols = ['IE', 'NS', 'TF', 'JP']
# Count each class once per axis. Comparing and summing (instead of indexing
# value_counts() by label) also yields 0 when a class is absent.
count_of_zero = {col: int((data[col] == 0).sum()) for col in indicator_cols}
count_of_one = {col: int((data[col] == 1).sum()) for col in indicator_cols}

# get_types encodes the first-named pole of every axis (I, N, T, J) as 1, so the
# count printed first has to be the count of 1s, not the count of 0s.
print ("Introversion (I) / Extroversion (E):\t", count_of_one['IE'], " / ", count_of_zero['IE'])
print ("Intuition (N) – Sensing (S):\t\t", count_of_one['NS'], " / ", count_of_zero['NS'])
print ("Thinking (T) – Feeling (F):\t\t", count_of_one['TF'], " / ", count_of_zero['TF'])
print ("Judging (J) – Perceiving (P):\t\t", count_of_one['JP'], " / ", count_of_zero['JP'])

# Stacked bars per axis: the 0-class count at the bottom, the 1-class on top.
N = len(indicator_cols)
but = tuple(count_of_zero[col] for col in indicator_cols)
top = tuple(count_of_one[col] for col in indicator_cols)
ind = np.arange(N) # the x locations for the groups
width = 0.7 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, but, width)
p2 = plt.bar(ind, top, width, bottom=but)
plt.ylabel('Count')
plt.title('Distribution accoss types indicators')
plt.xticks(ind, ('I/E', 'N/S', 'T/F', 'J/P',))
plt.show()

# Pairwise Pearson correlation between the four indicators, as a heatmap.
cmap = plt.cm.RdBu
corr = data[indicator_cols].corr()
plt.figure(figsize=(12,10))
plt.title('Pearson Features Correlation', size=15)
# NOTE(review): there is no plt.show() after this call, so the heatmap is
# presumably only displayed by the next plt.show() in the script -- confirm
# that this is intended.
sns.heatmap(corr, cmap=cmap, annot=True, linewidths=1)
# Letter -> bit lookup used to binarize a four-letter MBTI code.
# NOTE: here the first-named pole (I, N, F, J) is 0, which is the opposite of
# the IE/NS/TF/JP columns produced by get_types (where I, N, T, J are 1).
b_Pers = {
    'I': 0, 'E': 1,
    'N': 0, 'S': 1,
    'F': 0, 'T': 1,
    'J': 0, 'P': 1,
}
# Bit -> letter lookup, one dict per position of the code.
b_Pers_list = [
    {0: 'I', 1: 'E'},
    {0: 'N', 1: 'S'},
    {0: 'F', 1: 'T'},
    {0: 'J', 1: 'P'},
]


def translate_personality(personality):
    """Return the list of bits for the letters of the MBTI code *personality*.

    Raises KeyError for a character that is not a key of ``b_Pers``.
    """
    return [b_Pers[letter] for letter in personality]


def translate_back(personality):
    """Return the MBTI string for a sequence of bits.

    The bit at position ``i`` is looked up in ``b_Pers_list[i]``; an empty
    sequence gives an empty string.
    """
    return "".join(
        b_Pers_list[position][bit] for position, bit in enumerate(personality)
    )
# Sanity check: binarize the labels of the first four rows and print them.
d = data.head(4)
list_personality_bin = np.array(list(map(translate_personality, d['type'])))
print("Binarize MBTI list: \n%s" % list_personality_bin)

# Lower-cased MBTI codes; pre_process_data removes these from the posts.
unique_type_list = [
    code.lower()
    for code in (
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
    )
]

# Stemmer and lemmatizer instances (pre_process_data uses the lemmatizer).
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# English stop words, fetched once and reused for every row.
cachedStopWords = stopwords.words("english")
def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
    """Clean the posts of every row and binarize its MBTI type.

    For each row, URLs are replaced by a space, every non-letter character is
    replaced by a space, runs of spaces are collapsed, the text is lower-cased
    and each space-separated token is lemmatized.

    Args:
        data: DataFrame with a ``posts`` column (text) and a ``type`` column
            (an MBTI code accepted by ``translate_personality``).
        remove_stop_words: when true, tokens found in ``cachedStopWords`` are
            dropped before lemmatization.
        remove_mbti_profiles: when true, every occurrence of the codes in
            ``unique_type_list`` is deleted from the cleaned text. This is a
            plain substring replacement, so a code embedded in a longer token
            is removed as well.

    Returns:
        Tuple ``(list_posts, list_personality)`` of NumPy arrays: the cleaned
        text of each row and the binarized label of each row, in row order.
    """
    # Compiled once instead of being looked up again for every row.
    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    non_letter_re = re.compile(r"[^a-zA-Z]")
    spaces_re = re.compile(r' +')
    # A set gives O(1) membership tests; the module-level list costs a linear
    # scan per token. An empty set disables the filter.
    stop_words = frozenset(cachedStopWords) if remove_stop_words else frozenset()

    list_personality = []
    list_posts = []
    len_data = len(data)
    for i, (_, row) in enumerate(data.iterrows(), start=1):
        # Progress report on the first row, every 500th row and the last row.
        if i % 500 == 0 or i == 1 or i == len_data:
            print("%s of %s rows" % (i, len_data))

        temp = url_re.sub(' ', row['posts'])
        temp = non_letter_re.sub(' ', temp)
        temp = spaces_re.sub(' ', temp).lower()
        # split(' ') rather than split(): it keeps the empty tokens at the
        # edges, so the joined text is byte-identical to the previous output.
        temp = " ".join(
            lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in stop_words
        )

        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t, "")

        list_personality.append(translate_personality(row['type']))
        list_posts.append(temp)

    return np.array(list_posts), np.array(list_personality)
# Clean all posts and binarize all labels.
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
print("Num posts and personalities: ", list_posts.shape, list_personality.shape)
# Notebook-style previews: bare expressions, no output when run as a script.
list_posts[0]
list_personality[0]

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Posts to a matrix of token counts. The vocabulary is capped at 1500 terms;
# max_df / min_df are floats, which scikit-learn documents as document-frequency
# proportions (terms in more than 70% or fewer than 10% of posts are dropped).
cntizer = CountVectorizer(analyzer="word",
                          max_features=1500,
                          tokenizer=None,
                          preprocessor=None,
                          stop_words=None,
                          max_df=0.7,
                          min_df=0.1)

# Learn the vocabulary dictionary and return term-document matrix
print("CountVectorizer...")
X_cnt = cntizer.fit_transform(list_posts)

# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()
print("Tf-idf...")
# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
X_tfidf = tfizer.fit_transform(X_cnt).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that do not have it yet.
if hasattr(cntizer, "get_feature_names_out"):
    vocabulary_terms = cntizer.get_feature_names_out()
else:
    vocabulary_terms = cntizer.get_feature_names()
feature_names = list(enumerate(map(str, vocabulary_terms)))
feature_names
X_tfidf.shape
print("X: Posts in tf-idf representation \n* 1st row:\n%s" % X_tfidf[0])

# One description per column of list_personality, in column order.
type_indicators = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
                    "FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)" ]
for indicator in type_indicators:
    print(indicator)
print("MBTI 1st row: %s" % translate_back(list_personality[0,:]))
print("Y: Binarized MBTI 1st row: %s" % list_personality[0,:])
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Feature matrix: the posts in tf-idf representation.
X = X_tfidf

# Baseline: one default XGBClassifier per MBTI axis, each trained on its own
# column of list_personality and scored on a held-out split.
seed = 7
test_size = 0.33
for l, indicator_name in enumerate(type_indicators):
    print("%s ..." % (indicator_name))
    Y = list_personality[:,l]

    # Same seed for every axis, so all four models see the same row split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Fit on the training part.
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Predict the held-out part and score it.
    y_pred = model.predict(X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator_name, accuracy * 100.0))
# Same per-axis training as the baseline, with early stopping on log loss.
seed = 7
test_size = 0.33
for l, indicator_name in enumerate(type_indicators):
    print("%s ..." % (indicator_name))
    Y = list_personality[:,l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # The early-stopping settings are given to the estimator: XGBoost 2.0
    # removed the early_stopping_rounds / eval_metric arguments of fit().
    model = XGBClassifier(early_stopping_rounds=10, eval_metric="logloss")
    # NOTE(review): the evaluation set is the test split, so the stopping point
    # is tuned on the same rows the accuracy is reported on and the figure
    # below is optimistic. Consider a separate validation split.
    eval_set = [(X_test, y_test)]
    model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

    # predict() already returns class labels, so no rounding is needed.
    y_pred = model.predict(X_test)
    predictions = y_pred

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator_name, accuracy * 100.0))
from xgboost import plot_importance
# Feature importance for the first indicator only (column 0 of the labels).
y = list_personality[:,0]

# Fit on the full data set: this model is inspected, not evaluated.
model = XGBClassifier()
model.fit(X, y)

# Plot the 25 most important features.
ax = plot_importance(model, max_num_features=25)
fig = ax.figure
fig.set_size_inches(15, 20)
plt.show()

# Resolve the vocabulary once, outside the loop. get_feature_names() was
# removed in scikit-learn 1.2, so prefer its replacement when it exists.
if hasattr(cntizer, "get_feature_names_out"):
    vocabulary_terms = cntizer.get_feature_names_out()
else:
    vocabulary_terms = cntizer.get_feature_names()

# (feature index, importance) pairs, most important first.
features = sorted(enumerate(model.feature_importances_), key=lambda x: x[1], reverse=True)
for f in features[0:25]:
    print("%d\t%f\t%s" % (f[0],f[1],vocabulary_terms[f[0]]))

# Save the default xgb params for later discussion.
default_get_xgb_params = model.get_xgb_params()
print (default_get_xgb_params)
# Hand-picked XGBoost settings shared by the four per-axis models below.
param = {
    'n_estimators': 200,
    'max_depth': 2,
    'nthread': 8,
    'learning_rate': 0.2,
}

# Train one classifier per type indicator with the settings above.
seed = 7
test_size = 0.33
for l, indicator_name in enumerate(type_indicators):
    print("%s ..." % (indicator_name))
    Y = list_personality[:,l]

    # Hold out a third of the rows; the seed is the same for every axis.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Fit on the training part.
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # Predict the held-out part and score it.
    y_pred = model.predict(X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator_name, accuracy * 100.0))
# from numpy import loadtxt
# from xgboost import XGBClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedKFold
# # Posts in tf-idf representation
# X = X_tfidf
# # setup parameters for xgboost
# param = {}
# param['n_estimators'] = 200
# param['max_depth'] = 2
# param['nthread'] = 8
# param['learning_rate'] = 0.2
# # Let's train type indicator individually
# for l in range(len(type_indicators)):
# print("%s ..." % (type_indicators[l]))
# Y = list_personality[:,l]
# model = XGBClassifier(**param)
# # learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
# # param_grid = dict(learning_rate=learning_rate)
# param_grid = {
# 'n_estimators' : [ 200, 300],
# 'learning_rate': [ 0.2, 0.3]
# # 'learning_rate': [ 0.01, 0.1, 0.2, 0.3],
# # 'max_depth': [2,3,4],
# }
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
# grid_result = grid_search.fit(X, Y)
# # summarize results
# print("* Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
# print("* %f (%f) with: %r" % (mean, stdev, param))
# my_posts = """Getting started with data science and applying machine learning has never been as simple as it is now. There are many free and paid online tutorials and courses out there to help you to get started. I’ve recently started to learn, play, and work on Data Science & Machine Learning on Kaggle.com. In this brief post, I’d like to share my experience with the Kaggle Python Docker image, which simplifies the Data Scientist’s life.
# Awesome #AWS monitoring introduction.
# HPE Software (now @MicroFocusSW) won the platinum reader's choice #ITAWARDS 2017 in the new category #CloudMonitoring
# Certified as AWS Certified Solutions Architect
# Hi, please have a look at my Udacity interview about online learning and machine learning,
# Very interesting to see the lessons learnt during the HP Operations Orchestration to CloudSlang journey. http://bit.ly/1Xo41ci
# I came across a post on devopsdigest.com and need your input: “70% DevOps organizations Unhappy with DevOps Monitoring Tools”
# In a similar investigation I found out that many DevOps organizations use several monitoring tools in parallel. Senu, Nagios, LogStach and SaaS offerings such as DataDog or SignalFX to name a few. However, one element is missing: Consolidation of alerts and status in a single pane of glass, which enables fast remediation of application and infrastructure uptime and performance issues.
# Sure, there are commercial tools on the market for exactly this use case but these tools are not necessarily optimized for DevOps.
# So, here my question to you: In your DevOps project, have you encountered that the lack of consolidation of alerts and status is a real issue? If yes, how did you approach the problem? Or is an ChatOps approach just right?
# You will probably hear more and more about ChatOps - at conferences, DevOps meet-ups or simply from your co-worker at the coffee station. ChatOps is a term and concept coined by GitHub. It's about the conversation-driven development, automation, and operations.
# Now the question is: why and how would I, as an ops-focused engineer, implement and use ChatOps in my organization? The next question then is: How to include my tools into the chat conversation?
# Let’s begin by having a look at a use case. The Closed Looped Incidents Process (CLIP) can be rejuvenated with ChatOps. The work from the incident detection runs through monitoring until the resolution of issues in your application or infrastructure can be accelerated with improved, cross-team communication and collaboration.
# In this blog post, I am going to describe and share my experience with deploying HP Operations Manager i 10.0 (OMi) on HP Helion Public Cloud. An Infrastructure as a Service platform such as HP Helion Public Cloud Compute is a great place to quickly spin-up a Linux server and install HP Operations Manager i for various use scenarios. An example of a good use case is monitoring workloads across public clouds such as AWS and Azure.
# """
# # The type is just a dummy so that the data prep function can be reused
# mydata = pd.DataFrame(data={'type': ['INFJ'], 'posts': [my_posts]})
# my_posts, dummy = pre_process_data(mydata, remove_stop_words=True)
# my_X_cnt = cntizer.transform(my_posts)
# my_X_tfidf = tfizer.transform(my_X_cnt).toarray()
# param = {}
# param['n_estimators'] = 200
# param['max_depth'] = 2
# param['nthread'] = 8
# param['learning_rate'] = 0.2
# result = []
# # Let's train type indicator individually
# for l in range(len(type_indicators)):
# print("%s ..." % (type_indicators[l]))
# Y = list_personality[:,l]
# # split data into train and test sets
# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# # fit model on training data
# model = XGBClassifier(**param)
# model.fit(X_train, y_train)
# # make predictions for my data
# y_pred = model.predict(my_X_tfidf)
# result.append(y_pred[0])
# print("The result is: ", translate_back(result))