-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprocess.cpp
118 lines (96 loc) · 3.43 KB
/
process.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <iostream>
#include <vector>
#include <string>
#include <cmath>
#include <limits>
#include <cassert>
#include <unordered_map>
#include <algorithm>
#include "classMember.h"
void normalizeFeatures(std::vector<ClassMember>& dataset) {
if (dataset.empty()) {
std::cerr << "Dataset is empty!" << std::endl;
return;
}
size_t numFeatures = dataset[0].features.size();
std::vector<double> means(numFeatures, 0.0);
std::vector<double> sigmas(numFeatures, 0.0);
// Calculate mean for each feature
for (const auto& obj : dataset) {
if (obj.features.size() != numFeatures) {
fprintf(stderr, "Inconsistent feature size: %zu != %zu\n", obj.features.size(), numFeatures);
return;
}
for (size_t i = 0; i < numFeatures; ++i) {
means[i] += obj.features[i];
}
}
for (double& mean : means) {
mean /= dataset.size();
}
// Calculate standard deviation for each feature
for (const auto& obj : dataset) {
for (size_t i = 0; i < numFeatures; ++i) {
sigmas[i] += (obj.features[i] - means[i]) * (obj.features[i] - means[i]);
}
}
for (double& sigma : sigmas) {
sigma = std::sqrt(sigma / dataset.size());
if (sigma == 0) {
std::cerr << "Standard deviation is zero for feature index " << (&sigma - &sigmas[0]) << std::endl;
return;
}
}
// Normalize the dataset
for (auto& obj : dataset) {
for (size_t i = 0; i < numFeatures; ++i) {
obj.features[i] = (obj.features[i] - means[i]) / sigmas[i];
}
}
}
double euclideanDistance(const std::vector<double>& a, const std::vector<double>& b) {
double sum = 0.0;
for (size_t i = 0; i < a.size(); ++i) {
sum += (a[i] - b[i]) * (a[i] - b[i]);
}
return std::sqrt(sum);
}
std::vector<double> computeNearestNeighborDistances(const std::vector<ClassMember>& dataset) {
std::unordered_map<std::string, std::vector<ClassMember> > classMap;
std::vector<double> distances;
// Group dataset by species
for (const auto& obj : dataset) {
classMap[obj.name].push_back(obj);
}
// Compute nearest neighbor distances for each class
for (const auto& pair : classMap) {
const auto& classData = pair.second;
for (const auto& obj : classData) {
double minDistance = std::numeric_limits<double>::max();
for (const auto& neighbor : classData) {
if (&obj != &neighbor) {
double distance = euclideanDistance(obj.features, neighbor.features);
if (distance < minDistance) {
minDistance = distance;
}
}
}
// if the result of distance is bigger than 1, it will be dropped.
if (minDistance <= 1) {
distances.push_back(minDistance);
}
}
}
return distances;
}
std::vector<double> process(std::vector<ClassMember> dataset){
// normalize features
normalizeFeatures(dataset);
// computer k nearest distance, k = 1
std::vector<double> distances = computeNearestNeighborDistances(dataset);
// sort distances in ascending order
std::sort(distances.begin(), distances.end());
// eliminate duplicated results
distances.erase(unique(distances.begin(), distances.end()),distances.end());
return distances;
}