train.py
import logging
import pickle

import numba
import pandas as pd
import numpy as np
import xgboost as xgb

import doppelspeller.settings as s
import doppelspeller.constants as c
from doppelspeller.feature_engineering import FeatureEngineering

LOGGER = logging.getLogger(__name__)


@numba.njit(fastmath=True)
def fast_custom_error(prediction, actual):
    """
    Calculates the custom error, defined as:

        number_of_false_negatives + (number_of_false_positives * s.FALSE_POSITIVE_PENALTY_FACTOR)
    """
    predictions_negative_indexes = (prediction <= s.PREDICTION_PROBABILITY_THRESHOLD).nonzero()[0]
    predictions_positive_indexes = (prediction > s.PREDICTION_PROBABILITY_THRESHOLD).nonzero()[0]

    false_negative_cost = np.sum(actual[predictions_negative_indexes])
    false_positive_cost = np.sum(actual[predictions_positive_indexes] == 0) * s.FALSE_POSITIVE_PENALTY_FACTOR

    return false_negative_cost + false_positive_cost
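# A worked sketch of the cost above (the concrete threshold and penalty values
# are assumptions for illustration, not necessarily the values in
# doppelspeller.settings): with PREDICTION_PROBABILITY_THRESHOLD = 0.5 and
# FALSE_POSITIVE_PENALTY_FACTOR = 5,
#     fast_custom_error(np.array([0.9, 0.2]), np.array([0.0, 1.0]))
# counts one false positive (0.9 > 0.5 vs. actual 0 -> cost 5) plus one false
# negative (0.2 <= 0.5 vs. actual 1 -> cost 1), i.e. an error of 6.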


@numba.njit(fastmath=True)
def fast_weighted_log_loss(prediction, actual):
    """
    Gradient and hessian of the log loss with false positives weighted by
    s.FALSE_POSITIVE_PENALTY_FACTOR, for use as a custom XGBoost objective.
    """
    beta = s.FALSE_POSITIVE_PENALTY_FACTOR

    gradient = prediction * (beta + actual - beta * actual) - actual
    hessian = prediction * (1 - prediction) * (beta + actual - beta * actual)

    return gradient, hessian
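# Derivation sketch (assuming `prediction` is already a probability p; if
# XGBoost hands the objective raw margins instead, a sigmoid would need to be
# applied first):
#     L = -(y * log(p) + beta * (1 - y) * log(1 - p))
# with p = sigmoid(z) and dp/dz = p * (1 - p), so
#     dL/dz   = p * (beta + y - beta * y) - y
#     d2L/dz2 = p * (1 - p) * (beta + y - beta * y)
# which are exactly the `gradient` and `hessian` returned above; beta = 1
# recovers the standard logistic loss.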


def custom_error(predictions, train_data_object):
    return 'custom-error', fast_custom_error(predictions, train_data_object.get_label())


def weighted_log_loss(predictions, train_data_object):
    return fast_weighted_log_loss(predictions, train_data_object.get_label())


def get_xgb_feats_importance(model):
    features_score = model.get_fscore()

    features_importance = []
    for feature, score in features_score.items():
        features_importance.append({'feature': feature, 'importance': score})

    features_importance = pd.DataFrame(features_importance)
    features_importance = features_importance.sort_values(by='importance', ascending=False).reset_index(drop=True)
    features_importance['importance'] /= features_importance['importance'].sum()

    return features_importance
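# Example usage (a sketch; `model` stands for any trained xgboost.Booster):
#     importance = get_xgb_feats_importance(model)
#     LOGGER.info(f"\n{importance.head(10)}")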


def get_evaluation_error_matrix(model, evaluation_data, evaluation_target):
    predictions = model.predict(evaluation_data, ntree_limit=model.best_ntree_limit)
    predictions[predictions > s.PREDICTION_PROBABILITY_THRESHOLD] = 1
    predictions[predictions <= s.PREDICTION_PROBABILITY_THRESHOLD] = 0

    true_positives, true_negatives, false_positives, false_negatives = 0, 0, 0, 0
    for index, actual_value in enumerate(evaluation_target):
        prediction_value = predictions[index]
        if prediction_value == 0:
            if actual_value == prediction_value:
                true_negatives += 1
            else:
                false_negatives += 1
        else:
            if actual_value == prediction_value:
                true_positives += 1
            else:
                false_positives += 1

    return true_positives, true_negatives, false_positives, false_negatives
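# The usual summary metrics follow directly from these counts, e.g.:
#     precision = true_positives / (true_positives + false_positives)
#     recall    = true_positives / (true_positives + false_negatives)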


def train_model():
    LOGGER.info('Generating train and evaluation data-sets!')

    features = FeatureEngineering(c.DATA_TYPE_TRAIN)
    train, train_target, evaluation, evaluation_target = features.generate_train_and_evaluation_data_sets()

    d_train = xgb.DMatrix(train, label=train_target)
    d_evaluation = xgb.DMatrix(evaluation, label=evaluation_target)

    scale_pos_weight = sum(train_target == 0) / sum(train_target == 1)
    watch_list = [(d_train, 'train'), (d_evaluation, 'evaluation')]

    # TODO: Use the optimized parameters
    params = {
        'params': {
            'max_depth': 5,
            'eta': 0.1,
            'min_child_weight': 1,
            'eval_metric': 'auc',
            'objective': 'reg:logistic',
            'scale_pos_weight': scale_pos_weight,
            'subsample': 1,
        },
        'num_boost_round': 1000,
        'verbose_eval': True,
        'early_stopping_rounds': 50,
    }

    model = xgb.train(
        dtrain=d_train,
        evals=watch_list,
        feval=custom_error,
        obj=weighted_log_loss,
        maximize=False,
        **params
    )

    features_importance_data = get_xgb_feats_importance(model)

    true_positives, true_negatives, false_positives, false_negatives = get_evaluation_error_matrix(
        model, d_evaluation, evaluation_target)

    LOGGER.info(f"""\n\nEvaluation Data Error Matrix:\n
    True Positives      {true_positives}
    True Negatives      {true_negatives}
    False Positives     {false_positives}
    False Negatives     {false_negatives}\n
    """)

    with open(s.MODEL_DUMP_FILE, 'wb') as file_object:
        pickle.dump(model, file_object)

    return features_importance_data
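

# Minimal entry-point sketch (an assumption: the repository may instead invoke
# train_model() through its own CLI; the logging configuration below is only
# illustrative):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    train_model()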