SIR.py
import numpy as np
from numpy import linalg
# Author: Russell Kunes
# Implements sliced inverse regression (SIR), the supervised dimension
# reduction method described in Ker-Chau Li's 1991 paper
# "Sliced Inverse Regression for Dimension Reduction".
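#
# SIR posits the model Y = f(beta_1' X, ..., beta_K' X, eps): the response
# depends on X only through K linear projections. The fit below follows the
# paper's slicing estimate:
#   1. partition the range of Y into H slices,
#   2. compute the mean of X within each slice,
#   3. form M, the weighted covariance matrix of the slice means,
#   4. take the top K eigenvectors of cov(X)^-1 M as the directions beta.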
class SIR:
    def __init__(self, K=2, H=10, bins=None):
        # K: number of directions to keep; H: number of slices
        self.K = K
        self.H = H
        # default (bins=None) is H equally spaced bins over the range of Y
        self.bins = bins

    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        # n is the number of observations, p the number of features
        n, p = X.shape
        x_bar = np.mean(X, axis=0)
        # compute the slice counts and edges
        if self.bins is None:
            n_h, bins = np.histogram(Y, bins=self.H)
        else:
            n_h, bins = np.histogram(Y, bins=self.bins)
        # assign each observation to a slice (slices are numbered 1..H)
        assignments = np.digitize(Y, bins)
        # np.digitize places observations equal to the right edge of the
        # last bin into slice H + 1; move all of them back into slice H
        assignments[assignments == len(bins)] = len(bins) - 1
        # loop through the slices; for each non-empty slice compute the
        # within-slice mean of X and accumulate its weighted outer product
        M = np.zeros((p, p))
        for i in range(len(n_h)):
            h = n_h[i]
            if h != 0:
                x_h_bar = np.mean(X[assignments == i + 1], axis=0)
            else:
                x_h_bar = np.zeros(p)
            x_centered = x_h_bar - x_bar
            M += float(h) * np.outer(x_centered, x_centered)
        # M estimates the covariance matrix of the slice means
        M /= n
        self.M = M
        # eigendecomposition of V = cov(X)^-1 M
        cov = np.cov(X.T)
        V = np.dot(linalg.inv(cov), M)
        eigenvalues, eigenvectors = linalg.eig(V)
        # the eigenvalues are real in theory (symmetric-definite pencil);
        # discard spurious imaginary parts from the non-symmetric product
        eigenvalues = eigenvalues.real
        eigenvectors = eigenvectors.real
        # sort eigenvalues (and matching eigenvectors) in decreasing order
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        # the first K eigenvectors span the estimated e.d.r. space
        self.beta = eigenvectors[:, :self.K]
        self.eigenvalues = eigenvalues
        return self

    def transform(self, X_to_predict):
        # project observations onto the K estimated directions
        return np.dot(X_to_predict, self.beta)
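
# A minimal usage sketch (not part of the original file): synthetic data in
# which Y depends on X only through one direction, so K = 1 should recover
# it. The data-generating model and the seed are illustrative assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, p = 500, 5
    X = rng.standard_normal((n, p))
    b = np.array([1.0, 2.0, 0.0, 0.0, 0.0])
    # monotone link plus noise: Y = (b'X)^3 + eps
    Y = (X @ b) ** 3 + rng.standard_normal(n)
    sir = SIR(K=1, H=10)
    sir.fit(X, Y)
    X_reduced = sir.transform(X)  # shape (n, K)
    b_hat = sir.beta[:, 0]
    # the estimated direction is identified only up to scale and sign
    print(b_hat / np.linalg.norm(b_hat))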