# Anomaly_detection_code.py
##############################################################################################################
# Anomaly Detection
##############################################################################################################
# This script uses the Iris dataset: two Setosa entries are relabelled as Versicolor, and we then
# check whether two unsupervised learning models (K-means clustering and the density-based clustering
# method DBSCAN) detect the mis-labelled entries. Please note that we would never assume that
# cases flagged by these models are anomalies without consulting an expert in the field.
# Import packages
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.cluster import DBSCAN
# Load the iris dataset through seaborn
iris = sns.load_dataset("iris")
# Routine sanity checks - always run these before carrying out any analysis on the data.
iris.head()  # the first 4 flowers are setosa
iris.info()
iris.describe()
iris.columns
iris.isnull().sum()  # there are no null values in the data
# The unique flower types are setosa, versicolor and virginica.
iris['species'].unique()
# Relabel the first and fourth rows (positions 0 and 3) as versicolor in a single assignment;
# column position 4 holds the species label. Later sections check whether the unsupervised
# models flag these two rows as anomalies.
iris.iloc[[0, 3], 4] = 'versicolor'
iris.head()  # the first and fourth species now read versicolor
# Keep only the rows labelled versicolor and renumber them from 0
versi_df = iris.loc[iris['species'] == 'versicolor'].reset_index(drop=True)
versi_df.head()
# Rescale the features with MinMaxScaler.
# Distance-based models should always be given scaled data, otherwise the feature with the
# largest numeric range dominates the distances.
scaler = MinMaxScaler()
# The first four columns are the measurements; the remaining column is the species label.
versi_np = versi_df[versi_df.columns[:4]].values
versi_sc = scaler.fit_transform(versi_np)
# Plot the versicolor rows to see whether the mis-labelled cases look like anomalies.
# Each panel is described by: subplot position, x column, y column, title, and the points
# drawn with an 'X' marker (hard-coded measurements, presumably those of the two relabelled
# rows - verify against the data if the relabelled rows change).
sns.set()
raw_panels = (
    (1, 'sepal_length', 'sepal_width', 'Sepal length v width', ((5.1, 3.5), (4.6, 3.1))),
    (2, 'petal_length', 'petal_width', 'Petal length v width', ((1.4, 0.2), (1.5, 0.2))),
)
for panel_pos, x_col, y_col, panel_title, marked_points in raw_panels:
    plt.subplot(1, 2, panel_pos)
    sns.scatterplot(data=versi_df, x=x_col, y=y_col)
    for marked_x, marked_y in marked_points:
        plt.scatter(x=marked_x, y=marked_y, marker='X')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(panel_title)
plt.show()
##############################################################################################################
# K means
##############################################################################################################
# Use the elbow method to find a suitable number of clusters for K-means.
# WCSS (within-cluster sum of squares) is the sum of the squared distances from each point in a
# cluster to the centre of that cluster; the fitted model exposes it as `inertia_`.
# `init` selects the initial cluster centres; 'k-means++' speeds up convergence.
# 3 looks like a reasonable number of clusters.
wcss = []
for i in range(1, 11):
    # n_init is pinned because its default differs between scikit-learn releases; fixing it
    # keeps the curve reproducible regardless of the installed version.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(versi_sc)  # train the model on the scaled versicolor features
    wcss.append(kmeans.inertia_)  # WCSS for this number of clusters
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Train the K-Means model on the scaled data.
# n_init is pinned because its default differs between scikit-learn releases; fixing it keeps
# the clustering (and the conclusions drawn from it below) reproducible.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=2).fit(versi_sc)
####
# Test 1 - test to see if the anomalies are far from the cluster centroids
####
# Assign every row to a cluster and measure its Euclidean distance to that cluster's centroid
versi_sc_clusters = kmeans.predict(versi_sc)
versi_sc_clusters_centers = kmeans.cluster_centers_
dist = [np.linalg.norm(x - y) for x, y in zip(versi_sc, versi_sc_clusters_centers[versi_sc_clusters])]
print(versi_sc_clusters)
print(dist)
# Flag as anomalies (1.0) the rows whose distance is at or above the 95th percentile; all other
# rows get 0.0. The threshold is computed once and compared against an explicit array rather
# than relying on a Python list being coerced during the comparison.
dist_threshold = np.percentile(dist, 95)
is_outlier = np.array(dist) >= dist_threshold
km_y_pred = np.array(is_outlier, dtype=float)
# The anomalies flagged using distances from the centroid are not the mis-labelled cases. As you will see
# in test 2, this is because one of the three clusters contains only the mis-labelled cases.
####
# Test 2 - Testing to see if one of the clusters contains only the mis-labelled cases
####
# Attach the K-means cluster assignment to the versicolor dataframe as a 'Clusters' column
cluster_column = pd.DataFrame(versi_sc_clusters, columns=['Clusters'])
versi_clus = pd.concat([versi_df, cluster_column], axis=1)
# Colour each point by its cluster: one of the clusters contains only the mis-labelled cases.
# Each panel is described by: subplot position, x column, y column and title.
cluster_panels = (
    (1, 'sepal_length', 'sepal_width', 'Sepal length v width'),
    (2, 'petal_length', 'petal_width', 'Petal length v width'),
)
for panel_pos, x_col, y_col, panel_title in cluster_panels:
    plt.subplot(1, 2, panel_pos)
    sns.scatterplot(data=versi_clus, x=x_col, y=y_col, hue='Clusters', palette='deep')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend(loc='lower right')
    plt.title(panel_title)
plt.show()
##############################################################################################################
# DBSCAN
##############################################################################################################
# Density-based clustering (DBSCAN) as a second way to detect anomalies.
# Unlike K-means, DBSCAN does not need the number of clusters to be chosen beforehand, and it
# copes much better with weirdly shaped (i.e. non-convex) data. As with K-means, the idea is to
# treat the smallest clusters in the data as anomalies.
# Build the model, then fit it on the scaled features
db = DBSCAN(eps=0.8, min_samples=1, n_jobs=-1)
db.fit(versi_sc)
# Cluster label of every row; the label -1 (DBSCAN's noise label) is not counted as a cluster
pred_labels = db.labels_
distinct_labels = set(pred_labels)
n_clusters = len(distinct_labels) - int(-1 in distinct_labels)
# Report the number of clusters found - there are only two
print('Estimated number of clusters: %d' % n_clusters)
# Add the DBSCAN label next to the K-means label so the two models can be compared row by row
db_column = pd.DataFrame(pred_labels, columns=['db_cluster'])
versi_db = pd.concat([versi_clus, db_column], axis=1)
# Under both DBSCAN and K-means clustering the first two cases are flagged as anomalies
versi_db.head()