kmeans_tree.py
import numpy as np
import utils
import numpy_indexed as npi
import cv2 as cv
from multiprocessing import Pool
from functools import partial
from collections import defaultdict
# helpers (kept at module level so they can be pickled by multiprocessing.Pool)
def _build_tree_level_enough_cache(ids, des, k, criteria, attempts):
    """Cluster the given descriptors into k groups and return the ids and
    descriptors split per cluster, plus the cluster centres (one tree node)."""
    try:
        # cv.kmeans returns (compactness, labels, centres)
        _, indices, clusters = cv.kmeans(des, k, None, criteria, attempts, cv.KMEANS_PP_CENTERS)
        grouping = npi.group_by(indices)
        return grouping.split(ids), grouping.split(des), clusters
    except Exception as e:
        print("Error while building level:", e)
        return None, None, None
def _check_level_ok(res, k, l, level):
    """Check that every cluster produced at this level still holds at least
    k**(l-1-level) descriptors, i.e. enough to populate the subtree below it."""
    if res is None:
        return False
    for ids_groups, _, _ in res:
        if ids_groups is None:  # a worker failed with an exception
            return False
        for group in ids_groups:
            if len(group) < k**(l-1-level):
                return False
    return True
def _cluster_nbs(level, index, k, l):
    """Return the flat indices (into the centroid array) of all tree nodes in
    the subtree rooted at the node (level, index). Could be cleaner, but works."""
    indices = list()
    base = 0
    times = 1
    for i in range(l):
        if i >= level:
            indices.extend(list(np.arange(times) + base + k**(i-level) * index))
            times = k**(i-level+1)
        base += k**i
    return indices
def _leaf_nbs(level, index, k, l):
    """Return the leaf numbers (0 .. k**l - 1) covered by the subtree rooted
    at the node (level, index)."""
    return np.arange(k**(l-level)) + index * k**(l-level)
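# Worked example of the flat index layout, assuming k=2 and l=3: levels are
# stored contiguously, so level 0 occupies slot 0, level 1 slots 1-2 and
# level 2 slots 3-6. For the subtree rooted at the node (level=1, index=1):
#   _cluster_nbs(1, 1, 2, 3) -> [2, 5, 6]     (the node itself plus its 2 children)
#   _leaf_nbs(1, 1, 2, 3)    -> [4, 5, 6, 7]  (its k**(l-level) = 4 leaves)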
def _process_node_initial_scoring(centroids, des, index):
    """For one node, map each descriptor to the flat index of its closest child."""
    return utils.get_closest_indexes(centroids, des) + index
def _get_closest_leafs(tree, current_des, K, L):
"""
For every descriptor search its closest (approximate) leave by traversing the tree.
First level: For every des calculate which centroid is closest. Group all des that are closest to the same centroid.
Next level:
For every centroid take all des that traversed the tree to that centroid.
For all those des, calculate which centroid it's closest to 'below' the current centroid.
Group all des that are closest to the same centroid.
Repeat L times, so that the closest leaf node for every des is found.
The concept is pretty simple, but how to keep track of the indices is tricky to understand.
"""
    current_indices = utils.get_closest_indexes(tree.get_centroids(0), current_des)
    current_indices, current_des = npi.group_by(current_indices, current_des)
    l = 1
    current_indices += 1  # centroid c of the root maps to the child node at flat index 1 + c
    while l < L:
        new_centroids = tree.get_centroids([current_indices])
        # flat index of a child node: parent_index * K + 1 + child_offset
        current_indices *= K
        current_indices += 1
        des_next, indices_next = list(), list()
        for centroids_parent, des_parent, indices_parent in zip(new_centroids, current_des, current_indices):  # TODO: try multiprocessing
            indices_child = _process_node_initial_scoring(centroids_parent, des_parent, indices_parent)
            indices_child_grouped, des_child_grouped = npi.group_by(indices_child, des_parent)
            indices_next.extend(indices_child_grouped)
            des_next.extend(des_child_grouped)
        current_indices = np.array(indices_next, np.uint32)
        current_des = des_next
        l += 1
    # subtract the offset of the last level to get leaf numbers in 0 .. K**L - 1
    return current_indices - np.sum(np.power(K, np.arange(L))), [len(i) for i in current_des]
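# The final offset arithmetic illustrated, assuming K=10 and L=6: the tree has
# sum(10**i for i in range(6)) = 111_111 internal nodes, so the traversal ends
# on flat indices 111_111 and up; subtracting that offset maps the result onto
# leaf numbers 0 .. 999_999.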
class KMeansTree:
    def __init__(self, file="tree.p", nb_images=-1, k=10, l=6, criteria=(cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 5, 0.5), attempts=5, vector_dimensions=128):
        self.file = file
        self.tree = utils.get_pickled(file)
        if self.tree is None:  # the file does not exist yet, start a fresh tree
            if nb_images == -1:
                raise ValueError("nb_images not specified.")
            # the tree is a tuple of (centroids per node, leaves, parameters)
            self.tree = (np.zeros([np.sum(np.power(k, np.arange(l))), k, vector_dimensions], dtype=np.float32),
                         np.zeros(k ** l, dtype="float32, object, object"),  # (weight, image ids, counts) per leaf
                         (k, l, criteria, attempts, nb_images))
            self.criteria = criteria
            self.attempts = attempts
            self.k = k
            self.l = l
            self.N = nb_images
            self.store()
        else:
            self.k, self.l, self.criteria, self.attempts, self.N = self.tree[2]
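    # Shape check with the default parameters, assuming k=10 and l=6: the
    # centroid array holds sum(10**i for i in range(6)) = 111_111 nodes of
    # 10 centroids each, and the leaf array holds 10**6 = 1_000_000 leaves.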
    def get_k(self):
        return self.k
    def get_l(self):
        return self.l
    def get_centroids(self, indices):
        if isinstance(indices, list):
            indices = tuple(indices)
        return self.tree[0][indices]
    def get_leaves(self, leaf_indices):
        if isinstance(leaf_indices, list):
            leaf_indices = tuple(leaf_indices)
        return self.tree[1][leaf_indices]
    def store(self):
        utils.pickle_data(self.tree, self.file)
    def build_node_from_given_data(self, des, level=0, index=0):
        # locate the node in the flat centroid array
        tree_index = index + np.sum(np.power(self.k, np.arange(level)))
        node = self.tree[0][tree_index]
        # check that the node has not been built already
        if node[0, 0] != 0:
            print("Node at level {} and index {} already built.".format(level, index))
            return None
        print("Building node at level {} and index {}.".format(level, index))
        # calculate the clusters (cv.kmeans returns (compactness, labels, centres))
        clusters = cv.kmeans(des, self.k, None, self.criteria, self.attempts, cv.KMEANS_PP_CENTERS)[2]
        # update the clusters
        self.tree[0][tree_index] = clusters
        # store the result!
        self.store()
        return clusters
    def build_branch(self, ids_in, des_in, level=0, index=0, attempts_level=10):
        # check that the given node has not been built already
        if self.tree[0][index + np.sum(np.power(self.k, np.arange(level))), 0, 0] != 0:
            print("Node at level {} and index {} already built.".format(level, index))
            return None
        initial_level = level
        print("Building in branch: level {}, index {}, nb of des to process {:_}.".format(level, index, len(ids_in)))
        # build the first level (the node at (level, index) itself)
        ids, des, clusters = _build_tree_level_enough_cache(ids_in, des_in, self.k, self.criteria, self.attempts)
        all_clusters = [clusters]
        level += 1
        # build all other levels
        while level < self.l:  # run till the depth of the tree
            print("Building in branch: level {}.".format(level))
            res = None
            tries = attempts_level
            while not _check_level_ok(res, self.k, self.l, level):  # try a few times to successfully build a tree level
                if tries == 0:  # couldn't build the tree level
                    raise RuntimeError("Unable to build branch at level {}. "
                                       "Probable cause is too few descriptors for the amount of leaves.".format(level))
                if tries != attempts_level:
                    print("Trying level again.")
                with Pool() as p:
                    res = p.starmap(partial(_build_tree_level_enough_cache,
                                            k=self.k, criteria=self.criteria, attempts=self.attempts), zip(ids, des))
                tries -= 1
            # the level was built successfully, so set up the data for the next level
            ids, des = list(), list()
            for i, d, clusters in res:
                if i is None:
                    raise ValueError("An id list is None, which means an error occurred.")
                ids.extend(i)
                des.extend(d)
                all_clusters.append(clusters)
            del res
            level += 1
        # update the clusters (when building from the root, _cluster_nbs
        # enumerates every node of the tree in build order)
        print("Updating clusters.")
        for j, cluster in zip(_cluster_nbs(initial_level, index, self.k, self.l), all_clusters):
            self.tree[0][j] = cluster
        # update the leaves: a weight placeholder, the unique image ids and how
        # often each image occurs at this leaf
        print("Updating leaves.")
        for leaf_nb, leaf_ids in zip(_leaf_nbs(initial_level, index, self.k, self.l), ids):
            unique, counts = np.unique(leaf_ids, return_counts=True)
            self.tree[1][leaf_nb] = np.array(0, dtype=np.float32), unique, counts.astype(np.uint32)
        # store the result!
        print("Storing results.")
        self.store()
    def finalise(self):
        # check that finalisation has not been done already: after finalising,
        # the integer counts have been replaced by float32 scores
        if self.tree[1][-1][2].dtype == np.float32:
            print("Tree already finalised.")
            return None
        all_items = defaultdict(lambda: np.array(0, dtype=np.float32))
        # entropy weighting: weight every leaf by log(N / n_i), where n_i is the
        # number of images with at least one descriptor in leaf i
        for leaf_nb in np.arange(self.k ** self.l):
            leaf = self.tree[1][leaf_nb]
            w = np.log(self.N / len(leaf[2]))
            leaf[2] = np.multiply(w, leaf[2], dtype=np.float32)
            leaf[0] = w
            for item, score in zip(leaf[1], leaf[2]):
                all_items[item] += score
        # normalisation: divide every image's leaf scores by that image's total score
        print("Normalising scores.")
        for leaf_nb in np.arange(self.k ** self.l):
            leaf = self.tree[1][leaf_nb]
            sizes = np.array([all_items[item] for item in leaf[1]], dtype=np.float32)
            leaf[2] = np.divide(leaf[2], sizes, dtype=np.float32)
        # store the result!
        self.store()
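# The entropy weighting illustrated with assumed numbers: with N = 1_000
# database images, a leaf reached by 50 of them gets w = ln(1000 / 50) ~= 3.0,
# while a leaf reached by all 1_000 images gets w = ln(1) = 0 and contributes
# nothing to the scoring, since it does not discriminate between images.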
    def initial_scoring(self, des):
        """
        Perform the initial scoring: traverse the tree with all query descriptors
        and score every database image that shares a leaf with the query.
        Returns the database image indexes and one score per image (higher is better).
        """
        leaf_indices, nb_of_des = _get_closest_leafs(self, des, self.k, self.l)
        leaves = self.get_leaves(leaf_indices)
        weights = [i[0] for i in leaves]
        values = np.multiply(nb_of_des, weights, dtype=np.float32)  # entropy weighting of the query vector
        values /= np.sum(values)  # L1 normalisation
        # calculate the score for every db index on every leaf node:
        # (d + q) - |d - q| equals 2 * min(d, q) for the leaf's bin
        indexes, scores = list(), list()
        for leaf, value in zip(leaves, values):
            indexes.extend(leaf[1])
            scores.extend(np.subtract(leaf[2] + value, np.abs(leaf[2] - value), dtype=np.float32))
        # sum all scores per database image
        indexes, scores = npi.group_by(indexes, scores)
        return indexes, [np.sum(i) for i in scores]
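# Minimal usage sketch with stand-in data (random vectors instead of real 128-d
# SIFT descriptors, and hypothetical sizes); the real pipeline would pass
# descriptors extracted from the database images together with their image ids.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ids = np.repeat(np.arange(100, dtype=np.uint32), 500)  # 100 images, 500 descriptors each
    des = rng.random((ids.size, 128), dtype=np.float32)    # stand-in for SIFT descriptors
    tree = KMeansTree(file="tree.p", nb_images=100, k=4, l=3)
    tree.build_branch(ids, des)  # build the whole tree starting from the root
    tree.finalise()              # apply entropy weighting and normalisation
    query = rng.random((300, 128), dtype=np.float32)       # descriptors of one query image
    indexes, scores = tree.initial_scoring(query)
    print(indexes[np.argsort(scores)[::-1][:5]])           # top-5 candidate images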