-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw2v_lr.py
81 lines (61 loc) · 2.1 KB
/
w2v_lr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""Train a logistic-regression next-step classifier on word2vec history features.

Loads pickled (history, negative_samples) pairs, builds cumulative-sum history
features, fits LogisticRegressionCV, then reports accuracy and ROC/AUC on a
held-out last step per user.
"""
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import shuffle
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23+;
# import the standalone joblib package instead.
import joblib
from matplotlib import pyplot
import pickle

HISTORY_FROM = '04-01'
HISTORY_TO = '04-10'
train_data_path = 'wp_' + HISTORY_FROM + '_' + HISTORY_TO + '.pkl'

with open(train_data_path, 'rb') as f:
    data = pickle.load(f)
print('Number of Actual Users: ', len(data))

train_data = []
train_label = []
test_data = []
test_label = []

# history value is just a summation, not an average
for pair in data:
    hist = pair[0]          # per-step embedding vectors (100-dim, per hist_sum init)
    neg_samples = pair[1]   # negative samples aligned index-for-index with hist
    hist_sum = [0] * 100
    # Every step except the last two yields a (positive, negative) training
    # pair; the second-to-last step only updates the running history, and the
    # final step is held out for testing.
    for i in range(len(hist) - 2):
        hist_sum = [sum(x) for x in zip(hist_sum, hist[i])]
        train_data.append(hist_sum + hist[i + 1])           # positive: true next step
        train_label.append(1)
        train_data.append(hist_sum + neg_samples[i + 1])    # negative sample
        train_label.append(0)
    hist_sum = [sum(x) for x in zip(hist_sum, hist[-2])]
    test_data.append(hist_sum + hist[-1])
    test_label.append(1)
    test_data.append(hist_sum + neg_samples[-1])
    test_label.append(0)

assert len(train_data) == len(train_label)
assert len(test_data) == len(test_label)
print('train data length: ', len(train_data))

# need scaling for use of Stochastic Average Gradient descent solver ( much faster )
scaler = StandardScaler()
scaler.fit(train_data)
joblib.dump(scaler, 'scaler.pkl')
X = scaler.transform(train_data)
X_test = scaler.transform(test_data)

lr = LogisticRegressionCV(penalty='l2', n_jobs=-1, solver='sag')
lr.fit(X, train_label)
joblib.dump(lr, 'wplr.pkl')

score = lr.score(X_test, test_label)
print(score)
print('probability for a few results: \n')
# BUG FIX: predict on the scaled features (X_test), not raw test_data —
# the model was fitted on StandardScaler-transformed inputs.
print(lr.predict_proba(X_test[:10]))
print('original class of above data: \n')
print(test_label[:10])

predicted_probs = lr.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(test_label, predicted_probs, pos_label=1)
# BUG FIX: give the curve a label so legend(loc='best') has an entry to show
# (an unlabeled plot makes legend() warn and render nothing).
pyplot.plot(fpr, tpr, label='logistic regression')
pyplot.xlabel('False positive rate')
pyplot.ylabel('True positive rate')
pyplot.title('ROC curve')
pyplot.legend(loc='best')
auc_score = roc_auc_score(test_label, predicted_probs)
print('auc score: {:.4f}'.format(auc_score))
pyplot.show()