-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmst_clustering.py
188 lines (151 loc) · 6.23 KB
/
mst_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
The below code is slightly adapted from the astroML package.
"""
"""
Minimum Spanning Tree Clustering
"""
import numpy as np
from scipy import sparse
from sklearn.neighbors import kneighbors_graph
try:
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
except ImportError:
raise ValueError("scipy v0.11 or greater required " "for minimum spanning tree")
class HierarchicalClustering(object):
"""Hierarchical Clustering via Approximate Euclidean Minimum Spanning Tree
Parameters
----------
n_neighbors : int
number of neighbors of each point used for approximate Euclidean
minimum spanning tree (MST) algorithm. See Notes below.
edge_cutoff : float
specify a fraction of edges to keep when selecting clusters.
edge_cutoff should be between 0 and 1.
min_cluster_size : int, optional
specify a minimum number of points per cluster. If not specified,
all clusters will be kept.
Attributes
----------
X_train_ : ndarray
the training data
full_tree_ : sparse graph
the full approximate Euclidean MST spanning the data
cluster_graph_ : sparse graph
the final (truncated) graph showing clusters
n_components_ : int
the number of clusters found.
labels_ : int
the cluster labels for each training point. Labels range from -1
to n_components_ - 1: points labeled -1 are in the background (i.e.
their clusters were smaller than min_cluster_size)
Notes
-----
This routine uses an approximate Euclidean minimum spanning tree (MST)
to perform hierarchical clustering. A true Euclidean minimum spanning
tree naively costs O[N^3]. Graph traversal algorithms only help so much,
because all N^2 edges must be used as candidates. In this approximate
algorithm, we use k < N edges from each point, so that the cost is only
O[Nk log(Nk)]. For k = N, the approximation is exact; in practice for
well-behaved data sets, the result is exact for k << N."""
def __init__(self, n_neighbors=20, edge_cutoff=0.9, min_cluster_size=1):
self.n_neighbors = n_neighbors
self.edge_cutoff = edge_cutoff
self.min_cluster_size = min_cluster_size
def fit(self, X):
"""Fit the clustering model
Parameters
----------
X : array_like
the data to be clustered: shape = [n_samples, n_features]
"""
X = np.asarray(X, dtype=float)
self.X_train_ = X
# generate a sparse graph using the k nearest neighbors of each point
G = kneighbors_graph(X, n_neighbors=self.n_neighbors, mode="distance")
# Compute the minimum spanning tree of this graph
self.full_tree_ = minimum_spanning_tree(G, overwrite=True)
# Find the cluster labels
self.n_components_, self.labels_, self.cluster_graph_ = self.compute_clusters()
return self
def compute_clusters(
self, edge_cutoff=None, min_cluster_size=None, distcutoff=None
):
"""Compute the clusters given a trained tree
After fit() is called, this method may be called to obtain a
clustering result with a new edge_cutoff and min_cluster_size.
Parameters
----------
edge_cutoff : float, optional
specify a fraction of edges to keep when selecting clusters.
edge_cutoff should be between 0 and 1. If not specified,
self.edge_cutoff will be used.
min_cluster_size : int, optional
specify a minimum number of points per cluster. If not specified,
self.min_cluster_size will be used.
Returns
-------
n_components : int
the number of clusters found
labels : ndarray
the labels of each point. Labels range from -1 to
n_components_ - 1: points labeled -1 are in the background
(i.e. their clusters were smaller than min_cluster_size)
T_trunc : sparse matrix
the truncated minimum spanning tree
"""
if edge_cutoff is None:
edge_cutoff = self.edge_cutoff
if min_cluster_size is None:
min_cluster_size = self.min_cluster_size
if not hasattr(self, "full_tree_"):
raise ValueError("must call fit() before calling " "compute_clusters()")
T_trunc = self.full_tree_.copy()
# cut-off edges at the percentile given by edge_cutoff
if distcutoff is None:
cutoff = np.percentile(T_trunc.data, 100 * edge_cutoff)
else:
cutoff = distcutoff
T_trunc.data[T_trunc.data > cutoff] = 0
T_trunc.eliminate_zeros()
# find connected components
n_components, labels = connected_components(T_trunc, directed=False)
counts = np.bincount(labels)
# for all components with less than min_cluster_size points, set
# to background, and re-label the clusters
i_bg = np.where(counts < min_cluster_size)[0]
for i in i_bg:
labels[labels == i] = -1
if len(i_bg) > 0:
_, labels = np.unique(labels, return_inverse=True)
labels -= 1
n_components = labels.max() + 1
# eliminate links in T_trunc which are not clusters
I = sparse.eye(len(labels), len(labels))
I.data[0, labels < 0] = 0
T_trunc = I * T_trunc * I
return n_components, labels, T_trunc
def get_graph_segments(X, G):
"""Get graph segments for plotting a 2D graph
Parameters
----------
X : array_like
the data, of shape [n_samples, 2]
G : array_like or sparse graph
the [n_samples, n_samples] matrix encoding the graph of connectinons
on X
Returns
-------
x_coords, y_coords : ndarrays
the x and y coordinates for plotting the graph. They are of size
[2, n_links], and can be visualized using
``plt.plot(x_coords, y_coords, '-k')``
"""
X = np.asarray(X)
if (X.ndim != 2) or (X.shape[1] != 2):
raise ValueError("shape of X should be (n_samples, 2)")
G = sparse.coo_matrix(G)
A = X[G.row].T
B = X[G.col].T
x_coords = np.vstack([A[0], B[0]])
y_coords = np.vstack([A[1], B[1]])
return x_coords, y_coords