-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnDTreeImplementation.py
319 lines (264 loc) · 12.1 KB
/
nDTreeImplementation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
'''
Name: Flaviu Vadan
Date: Mon, Jun 17th, 2016
Email: flaviuvadan@gmail.com
DISCUS Lab
n-Dimensional Tree implementation
'''
import itertools as itr
import numpy as np
import pandas as pd
#=============================================================================#
# nTree implementation
#=============================================================================#
class nDTree:
def __init__(self, hypercube, depth=0):
'''
Initializer - defines the Tree
Parameters:
----------
hypercube : vector of the form [(a0, b0), (a1, b1), ..., (a_n, b_n)]
where a_i, b_i = min and max values coordinates defining
the hypercube
depth : default value of 0 for the root but incremented as we
advance further down the tree
'''
self.data = []
self.children = []
self.depth = depth
self.hypercube = hypercube
def _insert(self, to_insert):
'''
Insert data into Tree. The node hypercube coordinates will be assigned
based on the current hypercube
Parameters:
----------
to_insert : data tuple of the form [a0, a1, a2, ... , a_n]
where a_i represents the coordinates of one point
'''
for each in to_insert:
current_node = self
#if we are not at a leaf, insert the data, then insert into the children
while(current_node._isLeaf() is False):
current_node.data.append(each)
current_node = current_node._findChildThatFitsBounds(each)
#if the current node contains more than one data point
#split and further insert into children
if current_node._hasData():
#we have more than one data point/node so we split
self._splitAndInsert(current_node, each)
else:
current_node.data.append(each)
def _findChildThatFitsBounds(self, data):
'''
If the data point fits in any of the child nodes return the child
Parameters:
----------
data : data tuple of the form [a0, a1, a2, ... , a_n]
where a_i represents the coordinates of one point
'''
for child in self.children:
if(self._isEnclosed(data, child)):
return child
def _splitAndInsert(self, current_node, data_point):
'''
Splits the current node and inserts the data into the proper children
according to the coordinates
Parameters:
----------
current_node : the node that will be split
data_point : data to insert
'''
current_node._split()
#find the right child for the first data point and append the data point
#to the child
for parent_data in current_node.data:
child = current_node._findChildThatFitsBounds(parent_data)
child.data.append(parent_data)
if len(child.data) > 1:
self._splitAndInsert(child, data_point)
child = current_node._findChildThatFitsBounds(data_point)
if not data_point in child.data:
#now also append the data to the parent
child.data.append(data_point)
if len(child.data) > 1:
self._splitAndInsert(child, data_point)
if not data_point in current_node.data:
#also append the data point to the current node since we want every parent
#to contain the data points that their children contain
current_node.data.append(data_point)
def _isEnclosed(self, data, child):
'''
Check if a datapoint is within the boundaries of a hypercube
Parameters:
----------
data : data tuple of the form [a0, a1, a2, ... , a_n]
where a_i represents the coordinates of one point
child : child reference
Notes:
----------
The boundaries checking can be done with nested for loops but it
produced uncosistent results so we stick with checking the boundaries
for 7D (our case). If a solution is found for dynamically checking
the boundaries then the whole program will dynamically check the num.
of dimensions and the boundaries (FV 14/07/2016)
'''
#special case when we have cubes that have a side coordinate of 0
#so we allow data points to be placed exactly on the axis
if(child.hypercube[0][0]==0 or child.hypercube[1][0]==0 or
child.hypercube[2][0]==0 or child.hypercube[3][0]==0 or
child.hypercube[4][0]==0 or child.hypercube[5][0]==0 or
child.hypercube[6][0]==0 or child.hypercube[0][1]==0 or
child.hypercube[1][1]==0 or child.hypercube[2][1]==0 or
child.hypercube[3][1]==0 or child.hypercube[4][1]==0 or
child.hypercube[5][1]==0 or child.hypercube[6][1]==0):
if(data[0] >= child.hypercube[0][0] and data[0] <= child.hypercube[0][1] and
data[1] >= child.hypercube[1][0] and data[1] <= child.hypercube[1][1] and
data[2] >= child.hypercube[2][0] and data[2] <= child.hypercube[2][1] and
data[3] >= child.hypercube[3][0] and data[3] <= child.hypercube[3][1] and
data[4] >= child.hypercube[4][0] and data[4] <= child.hypercube[4][1] and
data[5] >= child.hypercube[5][0] and data[5] <= child.hypercube[5][1] and
data[6] >= child.hypercube[6][0] and data[6] <= child.hypercube[6][1]):
return True
#second case when data points are within the hypercube i.e. between boundaries
#we still allow datapoints to be placed on axes but only on one
elif(data[0] > child.hypercube[0][0] and data[0] <= child.hypercube[0][1] and
data[1] > child.hypercube[1][0] and data[1] <= child.hypercube[1][1] and
data[2] > child.hypercube[2][0] and data[2] <= child.hypercube[2][1] and
data[3] > child.hypercube[3][0] and data[3] <= child.hypercube[3][1] and
data[4] > child.hypercube[4][0] and data[4] <= child.hypercube[4][1] and
data[5] > child.hypercube[5][0] and data[5] <= child.hypercube[5][1] and
data[6] > child.hypercube[6][0] and data[6] <= child.hypercube[6][1]):
return True
def _isLeaf(self):
'''
Checks if a node is a leaf. Returns true if leaf, false otherwise
Parameters:
----------
None
'''
if not self.children:
return True
else:
return False
def _hasData(self):
'''
Checks if a tree contains data. If tree contains data, return false,
otherwise true
Parameters:
----------
None
'''
if not self.data and len(self.data) != 1:
return False
else:
return True
def _split(self):
'''
Splits the current node into 2^n nodes according to the number of
dimensions we are working with and the number of data points given
Parameters:
----------
None
'''
new_coordinates = self._getNewCoordinates()
self.children = [nDTree(hypercube=coords, depth=self.depth+1) for coords in new_coordinates]
def _getNewCoordinates(self):
'''
Returns the coordinates of the new hypercube
Parameters:
----------
None
'''
new_coords = []
for coord in self.hypercube:
low = coord[0]
top = coord[1]
mid = (low+top)/2
new_coords.append([(low,mid), (mid,top)])
#returning the cartesian product of the input iterables i.e. new_coords
#itr.product is the equivalent of nested for loops
#the loops cycle with the rightmost element advancing on every iteration
#this pattern creates a lexicographic ordering such that if the input's
#iterables are sorted, the resulting tuples are sorted as well
#see itertools.product
return (list(itr.product(*new_coords)))
def _printTree(self, level=0):
'''
Prints the tree in a hierarchic fashion. Only used for testing purposes.
Do not print trees with a great number of data points.
Parameters:
----------
level=0 : default parameter - we start with the root
'''
print('\t' * level + repr(self.data))
for child in self.children:
child._printTree(level+1)
def _countNodes(self, depth, nodes_data=0, leaves=0, total=0):
'''
Returns the number of nodes containing data at a specific level
Parameters:
----------
depth : the depth at which we are counting
count : standard value for the counter will be incremeneted and returned
'''
if depth == 0:
total = 1
if self.data:
nodes_data = 1
if self.children:
leaves = 0
else:
leaves = 1
return nodes_data, leaves, total
else:
for child in self.children:
if child.depth == depth:
total += 1
if child.data:
nodes_data += 1
if len(child.data) == 1:
leaves += 1
nodes_data, leaves, total = child._countNodes(depth, nodes_data, leaves, total)
else:
nodes_data, leaves, total = child._countNodes(depth, nodes_data, leaves, total)
return nodes_data, leaves, total
def _getEpsilon(self, treeLevel):
'''
Returns epsilon (side length of all the boxes at level n)
Parameters:
----------
treeLevel : the depth for which we are computing epsilon
'''
return 1/(len(self.hypercube)*(2**treeLevel))
def _buildDimensionalityTupleSet(self, treeLevel, table):
'''
Returns a set of tuples of the form [E, log(N(E))/log(E)]
where E is epsilon and N(E) is the number of boxes
of side length E (see Wiki documentation)
Parameters:
----------
treeLevel : maximum depth of tree to calculate epsilon
'''
numBoxesLevel = []
print("Data size: " + str(len(self.data)))
#we need epsilon to know N(E), where N(E) is the number of hypercubes of side length epsilon
#and E is defined by _getEpsilon
#we loop through and calculate epsilon for each level of the tree
#once we have epsilon, we get the number of hypercubes of side length epsilon
#and append it to the tuple list
for each_level in range(0, treeLevel):
epsilon = self._getEpsilon(each_level)
numBoxesWithData, numLeaves, numBoxes = self._countNodes(each_level)
if numBoxesWithData == 0:
break
numBoxesLevel.append((each_level, epsilon, numBoxes, numBoxesWithData, numLeaves, numBoxesWithData/numBoxes))
numBoxesLevel = pd.DataFrame(numBoxesLevel, columns=['level', 'epsilon', 'total_nodes', 'new_nodes_data', 'leaves_data', 'new_nodes_data_perc'])
numBoxesLevel = numBoxesLevel[numBoxesLevel['new_nodes_data'] > 0]
numBoxesLevel['nodes_data'] = numBoxesLevel['new_nodes_data'] + numBoxesLevel['leaves_data'].cumsum().shift().fillna(0)
numBoxesLevel['nodes_data_perc'] = numBoxesLevel['nodes_data']/numBoxesLevel['total_nodes'].cumsum()
numBoxesLevel.to_csv("results/shed" + table + "-numBoxesLevel.csv", index=False)
print("Number of leaves (should match the dataset size): " + str(numBoxesLevel['leaves_data'].sum()))
numBoxesLevel = numBoxesLevel[numBoxesLevel['epsilon'] > 0]
return pd.DataFrame({'dim_box': (np.log(numBoxesLevel['nodes_data'])/np.log(1/numBoxesLevel['epsilon'])).values,
'epsilon': numBoxesLevel['epsilon'].values})