-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpreprocessor.py
533 lines (420 loc) · 18 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
"""
Created by Jonas Pfeiffer on 26/04/17.
"""
import cPickle
import os.path
import pickle
from operator import itemgetter
import numpy as np
import scipy.io
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
class Preprocessor:
    """
    Preprocesses raw ECG signals: flips inverted recordings and removes
    obvious outlier segments at the start, middle and end of the signal.

    Decisions are made by four AdaBoost(RandomForest) classifiers
    (flipping, left outliers, right outliers, middle outliers) that are
    loaded from "pickle_files" or, if a model pickle is missing, retrained
    from the hand-labelled files referenced by
    "pickle_files/all_labels.pickle" and persisted for future runs.
    """
    def __init__(self):
        """
        If a model does not yet exist (i.e. there is no model pickle file in
        "pickle_files"), the model is trained again and stored as a pickle
        file in "pickle_files".
        No input, only initialization of the preprocessor models.
        """
        # Current ECG signal being processed; set in process().
        self.data = None
        # Hand labels for flipping, left, right and middle outliers.
        # Lazily loaded -- only needed when a model must be (re)trained.
        self.all_labels = None
        # Models used to classify each ECG.
        self.flipping_model = self.get_flipping_model()
        self.left_outlier_model = self.get_side_outlier_model(left=0.1, side="left")
        self.right_outlier_model = self.get_side_outlier_model(left=0.9, side="right")
        self.middle_outlier_model = self.get_middle_outlier_model()
    def process(self, data):
        """
        Processes the ECG: flips it and deletes obvious outliers at the
        start, middle and end.

        :param data: ECG signal (1-d array-like)
        :return: tuple (preprocessed ECG, list of outlier start indices)
        """
        # ECG data
        self.data = data
        # Features and prediction of left outliers. left_part is the index up
        # to which samples are eliminated if the prediction == 1.
        # NOTE(review): features are built with left=0.2 here although the
        # left model is trained with left=0.1 in __init__ -- confirm this
        # train/predict mismatch is intended.
        left_feat, left_part = self.get_features_outliers(data, left=0.2)
        try:
            left_pred = self.left_outlier_model.predict(left_feat)
        except Exception:
            # If the model cannot score this signal, fall back to flagging
            # the part as an outlier.
            left_pred = [1]
        # Features and prediction of right outliers. right_part is the index
        # from which samples are eliminated if the prediction == 1.
        right_feat, right_part = self.get_features_outliers(data, left=0.9)
        try:
            right_pred = self.right_outlier_model.predict(right_feat)
        except Exception:
            # TODO GOES HERE AT FILE A07331
            right_pred = [1]
        # Features and prediction of middle outliers. k_means_points contains
        # the [start, end] ranges that may need to be eliminated.
        middle_feat, k_means_data, k_means_points = self.get_features_outliers_middle(data)
        try:
            middle_pred = self.middle_outlier_model.predict(middle_feat)
        except Exception:
            middle_pred = [1]
        # Collect the [start, end] ranges that need to be deleted.
        to_delete = []
        if left_pred[0] == 1:
            to_delete.append([0, left_part])
        if right_pred[0] == 1:
            to_delete.append([right_part, len(self.data)])
        if middle_pred[0] == 1:
            # k-means clustering decides which middle parts are outliers.
            # (The original code called k_means_splitting twice and discarded
            # the first result; one call is sufficient -- and KMeans here is
            # not seeded, so each call may cluster differently anyway.)
            to_delete += self.k_means_splitting(k_means_data, k_means_points)
        outliers = []
        if to_delete:
            outliers = self.delete_parts(to_delete)
        # Predict whether the dataset needs to be flipped.
        # NOTE(review): features are computed on the ORIGINAL `data`, not on
        # the outlier-removed self.data -- confirm that is intended.
        flip_feat, _ = self.get_features_flipping(data)
        flip_pred = self.flipping_model.predict(flip_feat)
        # If flipping is predicted, negate the (outlier-removed) signal.
        if flip_pred[0] == 1:
            return -self.data, outliers
        return self.data, outliers
    def get_all_labels(self):
        """
        Lazily load the hand-labelled set for flipping, left, middle and
        right outliers from its pickle file (only on first call).

        :return: all_labels -> dict mapping filename to its label dict
        """
        if self.all_labels is None:
            with open('pickle_files/all_labels.pickle', 'rb') as handle:
                self.all_labels = pickle.load(handle)
        return self.all_labels
    def get_middle_outlier_model(self):
        """
        Load (or train and persist) the model that identifies whether the
        ECG contains outliers in the middle.

        :return: middle outlier model
        """
        model_path = "pickle_files/middle_outlier_model.pickle"
        # If the model was already generated once, retrieve it from its file.
        if os.path.isfile(model_path):
            with open(model_path, 'rb') as handle:
                return pickle.load(handle)
        # Else train the model from the hand-labelled data.
        print("optimizing middle")
        all_labels = self.get_all_labels()
        features = []
        labels = []
        # Loop through all the labelled files.
        for filename, values in all_labels.items():
            mat1 = scipy.io.loadmat('training_data/' + filename)
            # the raw ECG signal
            y = mat1['val'][0]
            # retrieve the features and append them to one training set
            feat, k_means_data, k_means_points = self.get_features_outliers_middle(y)
            features.append(feat)
            # retrieve the hand label for the training set
            labels.append(int(values["middle"]))
        classifier = AdaBoostClassifier(learning_rate=0.8,
                                        base_estimator=RandomForestClassifier(criterion='gini', n_estimators=5,
                                                                              max_features=9))
        classifier.fit(features, labels)
        # Persist the trained model and return it.
        with open(model_path, 'wb') as handle:
            pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return classifier
    def get_side_outlier_model(self, left, side):
        """
        Load (or train and persist) the model that identifies whether the
        ECG contains outliers on one side.

        :param left: split fraction passed to get_features_outliers
                     (0.1 for the left model, 0.9 for the right model)
        :param side: "left" or "right" -- selects pickle file, label key and
                     hyper-parameters
        :return: side outlier model
        """
        model_path = "pickle_files/" + side + "_outlier_model.pickle"
        if os.path.isfile(model_path):
            with open(model_path, 'rb') as handle:
                return pickle.load(handle)
        print("optimizing side: " + side)
        all_labels = self.get_all_labels()
        features = []
        labels = []
        for filename, values in all_labels.items():
            mat1 = scipy.io.loadmat('training_data/' + filename)
            y = mat1['val'][0]
            feat, left_part = self.get_features_outliers(y, left=left)
            features.append(feat)
            labels.append(int(values[side]))
        # Hyper-parameters differ per side (presumably from a grid search).
        if side == "left":
            classifier = AdaBoostClassifier(learning_rate=0.9,
                                            base_estimator=RandomForestClassifier(criterion='gini', n_estimators=5,
                                                                                  max_features=5))
        else:
            classifier = AdaBoostClassifier(learning_rate=1.0,
                                            base_estimator=RandomForestClassifier(criterion='gini', n_estimators=5,
                                                                                  max_features=5))
        classifier.fit(features, labels)
        with open(model_path, 'wb') as handle:
            pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return classifier
    def get_flipping_model(self):
        """
        Load (or train and persist) the model that identifies whether the
        ECG is recorded upside down and needs to be flipped.

        :return: flipping model
        """
        model_path = "pickle_files/flipping_model.pickle"
        if os.path.isfile(model_path):
            with open(model_path, 'rb') as handle:
                return pickle.load(handle)
        print("optimizing flipping")
        all_labels = self.get_all_labels()
        features = []
        labels = []
        for filename, values in all_labels.items():
            mat1 = scipy.io.loadmat('training_data/' + filename)
            y = mat1['val'][0]
            feat, base = self.get_features_flipping(y)
            features.append(feat)
            labels.append(int(values["flip"]))
        # NOTE: 0.79999999999999993 is a *different* double than 0.8 -- kept
        # verbatim so retrained models match the original hyper-parameters.
        classifier = AdaBoostClassifier(learning_rate=0.79999999999999993,
                                        base_estimator=RandomForestClassifier(criterion='gini', n_estimators=5,
                                                                              max_features=1))
        classifier.fit(features, labels)
        with open(model_path, 'wb') as handle:
            pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return classifier
    def delete_parts(self, to_delete):
        """
        Deletes the parts of the ECG (self.data) that were found to be
        outliers.

        :param to_delete: list of [start, end] index pairs
        :return: outliers -> list of the (adjusted) start indices that were
                 removed
        """
        # Sort to_delete -- technically it should never be unsorted.
        to_delete = sorted(to_delete, key=itemgetter(0))
        # Overlapping ranges need to be merged:
        # [[0,200],[200,400],[600,800]] -> [[0,400],[600,800]]
        new_to_delete = []
        i = 0
        lower = to_delete[i][0]
        upper = to_delete[i][1]
        while True:
            if i == len(to_delete) - 1:
                new_to_delete.append([lower, upper])
                break
            if upper >= to_delete[i + 1][0]:
                # max() so a range nested inside the current one cannot
                # shrink the merged interval (bug fix).
                upper = max(upper, to_delete[i + 1][1])
                i += 1
            else:
                new_to_delete.append([lower, upper])
                i += 1
                lower = to_delete[i][0]
                upper = to_delete[i][1]
        # After deleting a part from the ECG the remaining indices shift
        # left; adjust each subsequent range by the total length removed so
        # far.
        distance = 0
        for i in range(0, len(new_to_delete)):
            new_to_delete[i][0] -= distance
            new_to_delete[i][1] -= distance
            distance += new_to_delete[i][1] - new_to_delete[i][0]
        outliers = []
        for elem in new_to_delete:
            outliers.append(elem[0])
            self.data = self.del_snippet(self.data, elem[0], elem[1])
        return outliers
    def del_snippet(self, data, start, end):
        """
        Deletes a snippet from the ECG data.

        :param data: ECG
        :param start: starting index that needs to be deleted (inclusive)
        :param end: end index that needs to be deleted (exclusive)
        :return: updated ECG data (float numpy array via np.concatenate,
                 or [] if everything was removed)
        """
        length = len(data)
        new_data = []
        if start != 0:
            new_data = np.concatenate((new_data, data[0:start]))
        if end != length:
            new_data = np.concatenate((new_data, data[end:length]))
        return new_data
    def get_kmeans_stats(self, k_means_data, k_means_points, k):
        """
        Fit a k-means clustering with k clusters and gather per-cluster
        statistics.

        NOTE(review): KMeans is not seeded (no random_state), so cluster
        assignments can differ between calls.

        :param k_means_data: feature snippets of the data parts
        :param k_means_points: [start, end] range of each snippet
        :param k: number of clusters
        :return: dict cluster_id -> {"count": n_members,
                                     "points": list of member ranges}
        """
        kmeans = KMeans(n_clusters=k)
        prediction = kmeans.fit_predict(k_means_data)
        stats = {}
        for i in range(0, len(prediction)):
            if prediction[i] not in stats:
                stats[prediction[i]] = {}
                stats[prediction[i]]["count"] = 0
                stats[prediction[i]]["points"] = []
            stats[prediction[i]]["count"] += 1
            stats[prediction[i]]["points"].append(k_means_points[i])
        return stats
    def k_means_splitting(self, k_means_data, k_means_points):
        """
        K-means clustering of the data snippets from the middle of the ECG.
        The smaller cluster is assumed to be the outlier set.

        :param k_means_data: feature snippets of the data parts
        :param k_means_points: [start, end] range of each snippet
        :return: list of [start, end] ranges to be deleted
        """
        # do the k-means clustering with k=2
        stats = self.get_kmeans_stats(k_means_data, k_means_points, 2)
        counts = 9999
        delete_points = None
        # Evaluate which cluster needs to be added to the to_delete list.
        for key, value in stats.items():
            # The cluster with fewer parts is assumed to be the outlier set,
            # hoping there are always more non-outlier areas than outliers.
            if value["count"] < counts:
                counts = value["count"]
                delete_points = value["points"]
            # On a tie, decide by standard deviation: the noisier cluster
            # (higher std of its raw samples) is taken to be the outlier set.
            elif value["count"] == counts:
                old = []
                new = []
                for array in delete_points:
                    for i in range(array[0], array[1]):
                        old.append(self.data[i])
                for array in value["points"]:
                    for i in range(array[0], array[1]):
                        new.append(self.data[i])
                std_old = np.std(old)
                std_new = np.std(new)
                if std_new > std_old:
                    counts = value["count"]
                    delete_points = value["points"]
        return delete_points
    def get_features_flipping(self, data):
        """
        Generate features for the flipping model.

        :param data: ECG
        :return: tuple (feature list of length 10,
                        int(count_down < count_up) as a baseline label)
        """
        maxi = np.max(data)
        mini = np.min(data)
        std = np.std(data)
        # renamed from `sum` to avoid shadowing the builtin
        total = np.sum(data)
        up = []
        down = []
        for point in data:
            if point >= 0:
                up.append(point)
            else:
                down.append(point)
        count_up = len(up)
        count_down = len(down)
        if up != []:
            std_up = np.std(up)
        else:
            std_up = 0.0
        if down != []:
            std_down = np.std(down)
        else:
            std_down = 0.0
        # NOTE(review): sum_up/sum_down are computed with np.std, not np.sum
        # (and are not guarded against empty lists). Looks like a bug, but the
        # pickled models were trained on exactly these features, so it must
        # stay this way unless the models are retrained.
        sum_up = np.std(up)
        sum_down = np.std(down)
        return [maxi, mini, std, total, count_up, count_down, std_up, std_down, sum_up, sum_down], int(
            count_down < count_up)
    def get_features_outliers(self, data, left):
        """
        Generate features for the left and right outlier models by splitting
        the signal at `left` and computing the same statistics for both sides.

        :param data: ECG
        :param left: split fraction; 0.1 for the left model, 0.9 for the
                     right model
        :return: tuple (feature list of length 20, split index)
        """
        length_data = len(data)
        percent_left = left
        left_part = int(round(length_data * percent_left))
        data_left = data[0:left_part]
        data_right = data[left_part:length_data]
        maxi_left = np.max(data_left)
        mini_left = np.min(data_left)
        std_left = np.std(data_left)
        sum_left = np.sum(data_left)
        up_left = []
        down_left = []
        for point_left in data_left:
            if point_left >= 0:
                up_left.append(point_left)
            else:
                down_left.append(point_left)
        count_up_left = len(up_left)
        count_down_left = len(down_left)
        if up_left != []:
            std_up_left = np.std(up_left)
        else:
            std_up_left = 0.0
        if down_left != []:
            std_down_left = np.std(down_left)
        else:
            std_down_left = 0.0
        # NOTE(review): std, not sum -- kept for pickled-model compatibility
        # (see get_features_flipping).
        sum_up_left = np.std(up_left)
        sum_down_left = np.std(down_left)
        maxi_right = np.max(data_right)
        mini_right = np.min(data_right)
        std_right = np.std(data_right)
        sum_right = np.sum(data_right)
        up_right = []
        down_right = []
        for point_right in data_right:
            if point_right >= 0:
                up_right.append(point_right)
            else:
                down_right.append(point_right)
        count_up_right = len(up_right)
        count_down_right = len(down_right)
        if up_right != []:
            std_up_right = np.std(up_right)
        else:
            std_up_right = 0.0
        if down_right != []:
            std_down_right = np.std(down_right)
        else:
            std_down_right = 0.0
        sum_up_right = np.std(up_right)
        sum_down_right = np.std(down_right)
        features = [maxi_left, mini_left, std_left, sum_left, count_up_left, count_down_left, std_up_left,
                    std_down_left, sum_up_left, sum_down_left, maxi_right, mini_right, std_right, sum_right,
                    count_up_right, count_down_right, std_up_right, std_down_right, sum_up_right, sum_down_right]
        return features, left_part
    def get_features_outliers_middle(self, data):
        """
        Generate features for the middle outlier model: slide a window of
        10% of the signal length over the middle and compute statistics per
        window.

        :param data: ECG
        :return: tuple (flat feature list over all windows,
                        list of per-window feature lists for k-means,
                        list of [start, end] window ranges)
        """
        length_data = len(data)
        percent = 0.1
        part = int(round(length_data * percent))
        # Skip the first window -- it is handled by the left outlier model.
        start = part
        features = []
        k_means_data = []
        k_means_points = []
        while True:
            points = []
            # Stop before the trailing ~10% handled by the right model.
            if start + (1.1 * part) > length_data:
                break
            data_split = data[start:start + part]
            k_means_points.append([start, start + part])
            points.append(np.max(data_split))
            points.append(np.min(data_split))
            points.append(np.std(data_split))
            points.append(np.sum(data_split))
            up = []
            down = []
            for point in data_split:
                if point >= 0:
                    up.append(point)
                else:
                    down.append(point)
            points.append(len(up))
            points.append(len(down))
            if up != []:
                points.append(np.std(up))
            else:
                points.append(0.0)
            if down != []:
                points.append(np.std(down))
            else:
                points.append(0.0)
            # NOTE(review): the next four appends duplicate the two std
            # features above (probably np.sum was intended) -- kept verbatim
            # because the pickled middle model was trained on this layout.
            if up != []:
                points.append(np.std(up))
            else:
                points.append(0.0)
            if down != []:
                points.append(np.std(down))
            else:
                points.append(0.0)
            start += part
            features += points
            k_means_data.append(points)
        return features, k_means_data, k_means_points