-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkmedoids.py
82 lines (71 loc) · 2.6 KB
/
kmedoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
Implementation of kmedoids algorithm from
Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015.
Code based on https://github.com/letiantian/kmedoids; labelarray function added.
"""
import numpy as np
import random
def kMedoids(D, k, tmax=100):
# determine dimensions of distance matrix D
m, n = D.shape
if k > n:
raise Exception('too many medoids')
# find a set of valid initial cluster medoid indices since we
# can't seed different clusters with two points at the same location
valid_medoid_inds = set(range(n))
invalid_medoid_inds = set([])
rs,cs = np.where(D==0)
# the rows, cols must be shuffled because we will keep the first duplicate below
index_shuf = range(len(rs))
np.random.shuffle(index_shuf)
rs = rs[index_shuf]
cs = cs[index_shuf]
for r,c in zip(rs,cs):
# if there are two points with a distance of 0...
# keep the first one for cluster init
if r < c and r not in invalid_medoid_inds:
invalid_medoid_inds.add(c)
valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)
if k > len(valid_medoid_inds):
raise Exception('too many medoids (after removing {} duplicate points)'.format(
len(invalid_medoid_inds)))
# randomly initialize an array of k medoid indices
M = np.array(valid_medoid_inds)
np.random.shuffle(M)
M = np.sort(M[:k])
# create a copy of the array of medoid indices
Mnew = np.copy(M)
# initialize a dictionary to represent clusters
C = {}
for t in xrange(tmax):
# determine clusters, i. e. arrays of data indices
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# update cluster medoids
for kappa in range(k):
J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
j = np.argmin(J)
Mnew[kappa] = C[kappa][j]
np.sort(Mnew)
# check for convergence
if np.array_equal(M, Mnew):
break
M = np.copy(Mnew)
else:
# final update of cluster memberships
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
labels = labelarray(C)
# return results
return M, C, labels
def labelarray(clusterinfo):
mydict = {}
for label in clusterinfo:
for point_idx in clusterinfo[label]:
mydict[point_idx] = label
kmed_labels = []
for key, value in sorted(mydict.items()):
kmed_labels.append(value)
return kmed_labels