-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodel_functions.py
171 lines (156 loc) · 6.47 KB
/
model_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import logit
import patsy
# Marker printed every time this module is imported or reloaded.  Written as a
# single-argument call so it emits the same text on Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print("MF reloaded 1")
def score_log_1(x):
    """Return ``log10(x.clip(0) + 1)``.

    Values below zero are raised to zero first, so the result is never
    negative.  ``x`` must provide a ``clip`` method (the code calls
    ``x.clip(0)``), e.g. a numpy array.
    """
    floored = x.clip(0)
    shifted = floored + 1
    return np.log10(shifted)
def score_log(x):
    """Return the base-10 logarithm of ``x`` as computed by ``np.log10``."""
    log_value = np.log10(x)
    return log_value
def score_k(x, k):
    """Return the result of ``x == k`` (an indicator that ``x`` equals ``k``)."""
    is_equal = (x == k)
    return is_equal
def score_less_k(x, k):
    """Return the result of ``x < k`` (strictly below the threshold ``k``)."""
    is_below = (x < k)
    return is_below
def score_more_k(x, k):
    """Return the result of ``x >= k``.

    Despite the name, the comparison is inclusive: a value equal to ``k``
    counts as "more".
    """
    at_or_above = (x >= k)
    return at_or_above
def score_ref_k(x, k):
    """Return ``x - k``, i.e. ``x`` re-expressed relative to the reference ``k``."""
    offset_from_reference = x - k
    return offset_from_reference
def score_inv(x):
    """Return the reciprocal ``1 / x`` using floating-point division."""
    reciprocal = 1.0 / x
    return reciprocal
def mv_func(*x, **kwargs):
    """MC - Multi category encoding.

    Encode two or more 1-d categorical columns into a single weighted
    indicator matrix that has one output column per non-reference level.

    Positional arguments:
        *x: two or more 1-d arrays, all with the same shape.

    Keyword arguments:
        levels: optional sequence of category values.  Defaults to the
            sorted unique values found across all columns.
        reference: level that is dropped from the output.  Defaults to the
            first level.
        weights: optional weights.  A 1-d array is used as ``w`` / ``1 - w``
            when exactly two columns are given, otherwise as one weight
            shared by every column.  A 2-d array must have one column per
            input column.  When omitted every match gets weight 1.

    Returns:
        pandas.DataFrame with one row per input row and one column named
        ``"T.<level>"`` per non-reference level.  When several input columns
        hold the same level in a row, the weight of the last such column is
        the one stored (values are assigned, not accumulated).

    Raises:
        Exception: fewer than two columns, mismatching or non-1-d column
            shapes, or a 2-d ``weights`` whose column count differs from the
            number of input columns.
    """
    levels = kwargs.get("levels", None)
    reference = kwargs.get("reference", None)
    weights = kwargs.get("weights", None)
    nx = len(x)
    if nx < 2:
        raise Exception("Need at least 2 columns to do multival. Otherwise just use C(column_name)")

    # If number of columns are 2 allow single weight
    # Else for each column there should be a weight
    if weights is not None:
        # Plain ndarray so pandas Series/DataFrame weights index positionally.
        weights = np.asarray(weights)
        if len(weights.shape) == 1:
            if nx == 2:
                print("using complimentary weights for 2 columns. w and 1-w")
                # Add complimentary weights as a second column.
                weights = np.insert(weights[:, np.newaxis], 1, 1. - weights, axis=1)
            else:
                # Single weight column shared by all input columns.
                weights = weights[:, np.newaxis]
        elif weights.shape[1] != nx:
            raise Exception("Either weights should be a 1d array or 2d array with number of columns equal to %s" % nx)
    else:
        # One column of ones: every match contributes weight 1.
        weights = np.ones((x[0].shape[0], 1))

    if len(x[0].shape) != 1:
        raise Exception("Mismatching Shapes. All arrays should be 1d and should have the same shape")
    for k in x:
        if k.shape != x[0].shape:
            raise Exception("Mismatching Shapes. All arrays should be 1d and should have the same shape")

    if levels is None:
        # np.unique returns the values sorted; that ordering defines the levels.
        levels = np.unique(np.hstack(x))
    else:
        levels = np.array(levels)
    if reference is None:
        reference = levels[0]
    levels = levels[levels != reference]  # Remove reference from levels

    out = np.zeros((x[0].shape[0], len(levels)))
    # Clamp to the last available weight column: the weight matrix may have a
    # single column (default ones or a shared 1-d weight) even though there
    # are several input columns.
    last_weight_col = weights.shape[1] - 1
    columns = [np.array(col) for col in x]
    for i, v in enumerate(levels):
        for j, col in enumerate(columns):
            rows = np.where(col == v)[0]
            out[rows, i] = weights[rows, min(last_weight_col, j)]

    colnames = ["T.%s" % k for k in levels]
    return pd.DataFrame(out, columns=colnames)
# Use patsy contrast matrix
# contrast = Treatment(reference='N').code_without_intercept(levels) # Levels should be a list
# contrast.matrix[x, :] Here x should be array of index of categories in levels
class MultiVal(object):
    """Multi-column categorical encoder with memorized levels.

    Provides the ``memorize_chunk`` / ``memorize_finish`` / ``transform``
    methods and is wrapped with ``patsy.stateful_transform`` at the bottom
    of this module.  The levels and the reference level are learned once
    and then reused by every ``transform`` call, so the output columns stay
    the same across calls.
    """

    def __init__(self):
        print("Using class based MultiVal")
        self.levels = []           # levels seen so far; ndarray after memorize_finish
        self.reference = []        # overwritten by memorize_chunk
        self.colnames = []         # "T.<level>" names of the output columns
        self.level_len = 0         # number of non-reference levels
        self.nx = 0                # number of input columns
        self.levels_given = False  # True when the caller supplied ``levels``
        self.verbose = False

    def memorize_chunk(self, *x, **kwargs):
        """Collect the levels present in one chunk of the input columns.

        Positional arguments are two or more 1-d columns of identical shape.
        Keyword arguments read here: ``levels``, ``reference``, ``verbose``.

        Raises:
            Exception: fewer than two columns, or mismatching / non-1-d
                column shapes.
        """
        self.verbose = kwargs.get("verbose", False)
        levels = kwargs.get("levels", None)
        reference = kwargs.get("reference", None)
        self.nx = len(x)
        if self.nx < 2:
            raise Exception("Need at least 2 columns to do multival. Otherwise just use C(column_name)")
        if len(x[0].shape) != 1:
            raise Exception("Mismatching Shapes. All arrays should be 1d and should have the same shape")
        for k in x:
            if k.shape != x[0].shape:
                raise Exception("Mismatching Shapes. All arrays should be 1d and should have the same shape")
        if levels is None:
            # Accumulate across chunks; duplicates are removed in memorize_finish.
            self.levels.extend(np.unique(np.hstack(x)).tolist())
        else:
            self.levels_given = True
            self.levels = levels
        self.reference = reference
        if self.verbose:
            print("In memorize chunk")
            print("Levels given: %s" % (self.levels_given,))
            print("LEVELS: %s" % (levels,))
            print("REFERENCE: %s" % (reference,))
            print("self.LEVELS: %s" % (self.levels,))
            print("self.REFERENCE: %s" % (self.reference,))

    def memorize_finish(self):
        """Freeze the level list, choose the reference and name the columns."""
        if not self.levels_given:
            # Sorted unique values: a bare set() has arbitrary order, which
            # would make the default reference level and the output column
            # order unpredictable.
            self.levels = np.unique(self.levels)
        else:
            self.levels = np.array(self.levels)
        if self.reference is None:
            self.reference = self.levels[0]
        self.levels = self.levels[self.levels != self.reference]  # Remove reference from levels
        self.level_len = len(self.levels)
        self.colnames = ["T.%s" % k for k in self.levels]
        if self.verbose:
            print("In memorize finish")
            print("Levels given: %s" % (self.levels_given,))
            print("self.LEVELS[%s]: %s" % (self.level_len, self.levels))
            print("self.REFERENCE: %s" % (self.reference,))

    def transform(self, *x, **kwargs):
        """Encode the columns against the memorized levels.

        Positional arguments are the input columns; the first one must
        expose ``.index``, which becomes the index of the result.  The
        ``weights`` keyword may be omitted (weight 1 everywhere), a 1-d
        array (used as ``w`` / ``1 - w`` for two columns, otherwise shared
        by all columns) or a 2-d array with either one column or one column
        per input column.

        Returns:
            pandas.DataFrame with one column per non-reference level.
            Weights are accumulated, so a row in which several input
            columns hold the same level receives the sum of their weights.

        Raises:
            Exception: a 2-d ``weights`` whose column count is neither 1
                nor the number of input columns.
        """
        n_rows = x[0].shape[0]
        out = np.zeros((n_rows, self.level_len))
        weights = kwargs.get("weights", None)
        if weights is None:
            weights = np.ones((n_rows, 1))  # Create 1 column all ones weight matrix
        else:
            # Plain ndarray so pandas weights are indexed positionally.
            weights = np.asarray(weights)
        if len(weights.shape) == 1:
            if self.nx == 2:
                print("using complimentary weights for 2 columns. w and 1-w")
                # Add complimentary weights as a second column.
                weights = np.insert(weights[:, np.newaxis], 1, 1. - weights, axis=1)
            else:
                weights = weights[:, np.newaxis]  # Create weights into 1 column matrix
        elif self.nx > 1 and weights.shape[1] != self.nx and weights.shape[1] != 1:
            raise Exception("Either weights should be a 1d array or 2d array with number of columns equal to %s" % self.nx)
        if self.verbose:
            print("In transform")
            print("x: %s, self.nx: %s" % (len(x), self.nx))
            print("out.shape: %s, weights.shape: %s" % (out.shape, weights.shape))
            print("Levels given: %s" % (self.levels_given,))
            print("self.LEVELS[%s]: %s" % (self.level_len, self.levels))
            print("self.REFERENCE: %s" % (self.reference,))
        last_weight_col = weights.shape[1] - 1
        columns = [np.array(col) for col in x]
        for i, v in enumerate(self.levels):
            for j, col in enumerate(columns):
                idx = np.where(col == v)[0]
                # Fall back to the last weight column when there are fewer
                # weight columns than input columns.
                w_col = min(last_weight_col, j)
                if self.verbose:
                    print("Level: %s, %s" % (i, v))
                    print("Column: %s, W_col: %s" % (j, w_col))
                    print("Found values: %s" % (idx.shape,))
                    print("Setting values: %s" % (out[idx, i].shape,))
                    print("Setting weights: %s" % (weights[idx, w_col].shape,))
                    print("Unique weights: %s" % (np.unique(weights[idx, w_col]),))
                # Add the weights so as to allow for cases UNKNOWN, UNKNOWN
                out[idx, i] += weights[idx, w_col]
        return pd.DataFrame(out, columns=self.colnames, index=x[0].index)
# Wrap MultiVal with patsy.stateful_transform and expose the result as ``MC``
# ("Multi category encoding").  Presumably this is the name used inside patsy
# formulas, e.g. MC(col_a, col_b, weights=w) - confirm against callers.
MC = patsy.stateful_transform(MultiVal)