discretize.py
from abc import ABCMeta, abstractmethod

import numpy as np
import sklearn.tree
from sklearn.utils import check_random_state


class BaseDiscretizer(metaclass=ABCMeta):
    """Abstract base class for discretizing continuous features.

    Subclasses must implement bins(), which returns an array of bin
    boundaries for every feature that is to be discretized.
    """

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        """Args:
            data: numpy 2d array of training data.
            categorical_features: column indices to leave untouched; every
                other column is discretized.
            feature_names: list of names, one per column of data.
            labels: optional array of labels (required by some subclasses).
            random_state: seed or numpy RandomState used when undiscretizing.
        """
        # Every column that is not categorical gets discretized.
        self.to_discretize = [x for x in range(data.shape[1])
                              if x not in categorical_features]
        self.names = {}    # feature -> human-readable bin descriptions
        self.lambdas = {}  # feature -> function mapping values to bin indices
        self.means = {}    # feature -> per-bin means (for undiscretize)
        self.stds = {}     # feature -> per-bin standard deviations
        self.mins = {}     # feature -> per-bin lower bounds
        self.maxs = {}     # feature -> per-bin upper bounds
        self.random_state = check_random_state(random_state)

        bins = self.bins(data, labels)
        bins = [np.unique(x) for x in bins]

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # n_bins boundaries give n_bins + 1 bins
            boundaries = np.min(data[:, feature]), np.max(data[:, feature])
            name = feature_names[feature]

            # Bin descriptions such as 'x <= 1.00', '1.00 < x <= 2.00', ...
            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # Map raw values to bin indices; qts is bound as a default
            # argument so every lambda keeps its own boundaries.
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
            discretized = self.lambdas[feature](data[:, feature])

            # Per-bin statistics, used to sample values in undiscretize().
            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                std += 0.00000000001  # avoid zero variance when sampling
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    @abstractmethod
    def bins(self, data, labels):
        """Returns, for each feature to discretize, an array of boundaries."""
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Maps continuous feature values to bin indices.

        Accepts a single 1d instance or a 2d array of instances and returns
        a copy with every discretized column replaced by bin indices.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def undiscretize(self, data):
        """Maps bin indices back to continuous values.

        For each discretized feature, a value is drawn from a normal
        distribution with that bin's mean and standard deviation, then
        clipped to the bin's [min, max] range.
        """
        ret = data.copy()
        for feature in self.means:
            mins = self.mins[feature]
            maxs = self.maxs[feature]
            means = self.means[feature]
            stds = self.stds[feature]

            def get_inverse(q):
                return max(mins[q],
                           min(self.random_state.normal(means[q], stds[q]),
                               maxs[q]))

            if len(data.shape) == 1:
                q = int(ret[feature])
                ret[feature] = get_inverse(q)
            else:
                ret[:, feature] = (
                    [get_inverse(int(x)) for x in ret[:, feature]])
        return ret


class QuartileDiscretizer(BaseDiscretizer):
    """Discretizes continuous features into quartiles."""

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            # 25th, 50th and 75th percentiles give four bins per feature.
            qts = np.array(np.percentile(data[:, feature], [25, 50, 75]))
            bins.append(qts)
        return bins


class DecileDiscretizer(BaseDiscretizer):
    """Discretizes continuous features into deciles."""

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            # 10th through 90th percentiles give ten bins per feature.
            qts = np.array(np.percentile(data[:, feature],
                                         [10, 20, 30, 40, 50,
                                          60, 70, 80, 90]))
            bins.append(qts)
        return bins


class EntropyDiscretizer(BaseDiscretizer):
    """Discretizes continuous features using entropy-based splits.

    Bin boundaries are the split thresholds of a depth-3 decision tree fit
    on each feature individually, so labels are required.
    """

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'EntropyDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            # Fit a shallow decision tree on this single feature; its split
            # thresholds become the bin boundaries.
            dt = sklearn.tree.DecisionTreeClassifier(
                criterion='entropy', max_depth=3,
                random_state=self.random_state)
            x = np.reshape(data[:, feature], (-1, 1))
            dt.fit(x, labels)
            qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]
            if qts.shape[0] == 0:
                # The tree made no splits: fall back to the median.
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)
            bins.append(qts)
        return bins
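
Below is a minimal usage sketch, not part of the original file: it assumes a small synthetic dataset (the names X, y, rng and the feature names are illustrative only) and shows how QuartileDiscretizer and EntropyDiscretizer might be instantiated and how data round-trips through discretize() and undiscretize().

if __name__ == '__main__':
    # Illustrative only: 3 continuous columns, binary labels; column 2 is
    # declared categorical and therefore left untouched.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 3))
    y = (X[:, 0] > 0).astype(int)
    names = ['f0', 'f1', 'f2']

    disc = QuartileDiscretizer(X, categorical_features=[2],
                               feature_names=names, random_state=0)
    X_disc = disc.discretize(X)         # continuous columns -> bin indices
    X_back = disc.undiscretize(X_disc)  # sampled back into each bin's range
    print(disc.names[0])                # readable bin descriptions for 'f0'

    # EntropyDiscretizer additionally needs labels for its tree-based splits.
    ent = EntropyDiscretizer(X, [2], names, labels=y, random_state=0)
    print(ent.discretize(X[0]))         # a single 1d instance also works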