# utils.py
import string, nltk
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report, roc_auc_score
import pickle, os, glob

# plot_roc_curve was removed in scikit-learn 1.2; fall back to RocCurveDisplay there.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# preprocess() below uses a module-level `stemmer` that the original file never
# defines; a Porter stemmer is assumed here so the module is importable.
# The stopword list also requires: nltk.download('stopwords')
stemmer = nltk.stem.PorterStemmer()

def preprocess(text):
    """Strip HTML, lowercase, drop punctuation and English stopwords, then stem."""
    # Remove <script>/<style> blocks and keep only the visible text.
    soup = BeautifulSoup(text, "html.parser")
    for data in soup(['script', 'style']):
        data.decompose()
    text = ' '.join(soup.stripped_strings)
    text = text.lower()
    # Drop punctuation characters.
    temp = ""
    for i in text:
        if i not in string.punctuation:
            temp += i
    # Remove stopwords and stem the remaining tokens.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    temp = [w for w in temp.split() if w not in stopwords]
    temp = [stemmer.stem(w) for w in temp]
    return ' '.join(temp)
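
# Example usage (illustrative; the sample review text is made up):
#   >>> preprocess("<p>This movie was <b>absolutely wonderful</b>!</p>")
#   'movi absolut wonder'
# HTML is stripped, the text lowercased, punctuation and stopwords removed, and
# the surviving tokens stemmed (exact stems depend on NLTK's Porter stemmer).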

def labelEncoder(dfcol):
    """Encode categorical labels (e.g. 'positive'/'negative') as integers."""
    le = LabelEncoder().fit(dfcol)
    return le.transform(dfcol)

def createSplit(df, printsize=False):
    """Shuffle df and split it 70/15/15 into train, validation and test sets."""
    total_size = len(df)
    random_permutation = np.random.permutation(total_size)
    train_size = int(total_size * 0.7)
    test_size = int((total_size - train_size) / 2)
    val_size = test_size
    X_train = df['review'][random_permutation[:train_size]]
    y_train = np.array(df['sentiment'][random_permutation[:train_size]])
    X_test = df['review'][random_permutation[train_size:train_size + test_size]]
    y_test = np.array(df['sentiment'][random_permutation[train_size:train_size + test_size]])
    X_val = df['review'][random_permutation[-test_size:]]
    y_val = np.array(df['sentiment'][random_permutation[-test_size:]])
    if printsize:
        print(f"Training set : Validation set : Test set = {len(X_train)} : {len(X_val)} : {len(X_test)}")
    return (X_train, y_train, X_val, y_val, X_test, y_test)

def trainClassifier(clf, ngram_lb, ngram_ub, train_X, train_y, val_X, val_y):
    """Fit a TF-IDF + classifier pipeline; report validation accuracy and 10-fold CV F1."""
    tfidf = TfidfVectorizer(ngram_range=(ngram_lb, ngram_ub), max_features=None, sublinear_tf=True)  # stop_words=nltk.corpus.stopwords.words('english')
    pipe = Pipeline([('tfidf', tfidf), ('classifier', clf)])
    pipe.fit(train_X.astype(str), train_y)
    # Plain accuracy on the held-out validation set.
    acc = accuracy_score(val_y, pipe.predict(val_X.astype(str)))
    # Mean F1 over 10-fold cross-validation, computed on the validation set only.
    f1acc = np.mean(cross_val_score(pipe, val_X.astype(str), val_y, scoring=make_scorer(f1_score), cv=10))
    return {
        'pipeline': pipe,
        'feature_extractor': tfidf,
        'acc': acc,
        'f1_cv': f1acc
    }

def saveModel(clf, model, path):
    """Pickle the fitted pipeline (.sav) and vectorizer (.pk); the CV F1 goes in the filename."""
    clf_name = f"{clf}-{model['f1_cv']*100:.5f}.sav"
    vect_name = f"{clf}-{model['f1_cv']*100:.5f}.pk"
    with open(os.path.join(path, clf_name), 'wb') as f:
        pickle.dump(model['pipeline'], f)
    with open(os.path.join(path, vect_name), 'wb') as f:
        pickle.dump(model['feature_extractor'], f)
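
# Example (a hedged sketch, not from the original file): train a logistic
# regression on unigrams + bigrams and pickle it. LogisticRegression, the
# 'models' directory and the variable names are assumptions for illustration.
#
#   from sklearn.linear_model import LogisticRegression
#   X_train, y_train, X_val, y_val, X_test, y_test = createSplit(df)
#   result = trainClassifier(LogisticRegression(max_iter=1000), 1, 2,
#                            X_train, y_train, X_val, y_val)
#   saveModel('logreg', result, 'models')   # -> models/logreg-<f1*100>.sav / .pk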

def printReport(X_test, y_test, model, roc=False):
    """Load the pickled pipeline at model['clf'] (a file path) and print test-set metrics."""
    # vect = pickle.load(open(model['vect'], 'rb'))
    # X_test = vect.transform(X_test)
    y_test = np.array(y_test)
    print(f"Prediction for {model['clf'].split('/')[-1].split('-')[0]} on Test Data")
    with open(model['clf'], 'rb') as f:
        pipe = pickle.load(f)
    y_pred = pipe.predict(X_test.astype(str))
    print(f"Accuracy Score : {accuracy_score(y_test, y_pred):.3f}\nF1 Score : {f1_score(y_test, y_pred):.3f}")
    print(classification_report(y_test, y_pred))
    if roc:
        plot_roc_curve(pipe, X_test, y_test)

def getBestModel(path, name, overwrite):
    """Return the highest-F1 pickled pipeline and vectorizer for `name`; optionally delete the rest."""
    # Filenames look like "<name>-<f1*100>.sav" / "<name>-<f1*100>.pk", so sorting
    # on the numeric suffix (extension stripped) puts the best model first.
    models = glob.glob(os.path.join(path, '*.sav'))
    models = [m for m in models if m.split('/')[-1].startswith(name)]
    models = sorted(models, key=lambda x: float(x.split('-')[-1][:-4]), reverse=True)
    vects = glob.glob(os.path.join(path, '*.pk'))
    vects = [m for m in vects if m.split('/')[-1].startswith(name)]
    vects = sorted(vects, key=lambda x: float(x.split('-')[-1][:-3]), reverse=True)
    if overwrite:
        # Keep only the best model/vectorizer on disk.
        for f in models[1:]:
            os.remove(f)
        for f in vects[1:]:
            os.remove(f)
    # Note: this returns the *loaded* objects, whereas printReport() above expects
    # a file path in model['clf']; pass the .sav path to printReport() directly.
    with open(models[0], 'rb') as fm, open(vects[0], 'rb') as fv:
        return {'clf': pickle.load(fm), 'vect': pickle.load(fv)}
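

# Minimal end-to-end sketch (not part of the original file). It assumes a CSV with
# 'review' and 'sentiment' columns (IMDB-style) at the path below and a writable
# 'models' directory; the file name, classifier and 'logreg' tag are assumptions.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    df = pd.read_csv("IMDB Dataset.csv")               # assumed input file
    df['review'] = df['review'].apply(preprocess)      # clean + stem every review
    df['sentiment'] = labelEncoder(df['sentiment'])    # e.g. 'negative'/'positive' -> 0/1

    X_train, y_train, X_val, y_val, X_test, y_test = createSplit(df, printsize=True)

    os.makedirs("models", exist_ok=True)
    result = trainClassifier(LogisticRegression(max_iter=1000), 1, 2,
                             X_train, y_train, X_val, y_val)
    saveModel("logreg", result, "models")

    # printReport() expects a path to a pickled pipeline, so pick the best .sav
    # by the same numeric-suffix key that getBestModel() uses.
    best_path = max(glob.glob(os.path.join("models", "logreg-*.sav")),
                    key=lambda p: float(p.split('-')[-1][:-4]))
    printReport(X_test, y_test, {'clf': best_path})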