-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathkNearestNeighbors.py
163 lines (116 loc) · 5.09 KB
/
kNearestNeighbors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#Alan Balu
#import statements
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
from sklearn import decomposition
#imports for preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
#import for splitting test and training set
from sklearn.model_selection import train_test_split
#Import for summary of classification model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from CDC_USCS_clustering import separateByRegion #(CDC_Data, state_label, is_abbrev): dataframe
from CdcClustering import CDCpreprocessing2 #(CDC_Data, categories, columns_to_drop): new_CDC_Data, old_columns_CDC_Data
from CdcClustering import normalizeCDC #(CDC_Data, columns_to_norm): CDC_data_norm
#setting display options
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
#function (modified from Chau's code) to do the K Nearest Neighbors Classification
def predictive_KNN_model(myData):
# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 20% of the entire
# set of labeled data (X_validate, Y_validate)
pprint(myData)
scoring = 'accuracy'
valueArray = myData.values
X = valueArray[:, 0:-1]
Y = valueArray[:, -1:]
test_size = 0.20
seed = 7
X_train, X_validate, Y_train, Y_validate = train_test_split(X, Y, test_size=test_size, random_state=seed)
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
knn_predictions = knn.predict(X_validate)
knn_accuracy = knn.predict(X_train)
print("test set accuracy: ", accuracy_score(Y_train, knn_accuracy))
#print the relevant scores for the classifier
print("KNN report: ")
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))
doROCCurve(knn, X_validate, Y_validate)
#plots the ROC curve for the binary classification task
def doROCCurve(knn, X_validate, Y_validate):
# Compute ROC curve and ROC area for each class
y_scores = knn.predict_proba(X_validate)
fpr, tpr, threshold = roc_curve(Y_validate, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
'''
#bins the cancer rate (after z-sore normalization) and adds a new column for the binned label
def binRate(CDC_Data):
new_CDC_data = CDC_Data
def categorization(value):
if value > 2.0:
return 'very_high'
elif value <= 2.0 and value > 1.0:
return 'high'
elif value <= 1.0 and value > -1.0:
return 'medium'
elif value <= -1.0 and value > -2.0:
return 'low'
else:
return 'very low'
new_CDC_data['AGE_ADJUSTED_CANCER_RATE_level'] = new_CDC_data['AGE_ADJUSTED_CANCER_RATE'].apply(lambda x: categorization(x))
return new_CDC_data
'''
#bin the cancer rate into a binary class label for the classification task
def binRate(CDC_Data):
new_CDC_data = CDC_Data
def categorization(value):
if value > 0.0:
return 'above_avg'
else:
return 'below_avg'
new_CDC_data['AGE_ADJUSTED_CANCER_RATE_level'] = new_CDC_data['AGE_ADJUSTED_CANCER_RATE'].apply(lambda x: categorization(x))
return new_CDC_data
#driver to do the KNN classification with a binary class label
def main():
print("main")
merged_data = pd.read_csv('merged_data.csv' , sep=',', encoding='latin1')
normalized_data = normalizeCDC(merged_data, ['AGE_ADJUSTED_CANCER_RATE']) #normalize age adjusted rate column z-score
region_data = separateByRegion(normalized_data, 'STATE_ABBR', True)
binnedRate_data = binRate(region_data) #add column for rate bin level
print(binnedRate_data.columns)
binnedRate_data = binnedRate_data.dropna(subset=['AGE_ADJUSTED_CANCER_RATE'])
#pprint(region_CDC_data)
categ_columns = ['STATE_ABBR', 'AGE_ADJUSTED_CANCER_RATE_level' ,'region']
drop_columns = [] #remove columns that are unnecessary for clustering question
processed_CDC_data, old_columns_CDC_Data = CDCpreprocessing2(binnedRate_data, categ_columns, drop_columns) #(CDC_Data, categories, columns_to_drop): new_CDC_data, (series) list_of_dropped, list_of_categ
processed_CDC_data.fillna(0, inplace = True)
predictive_KNN_model(processed_CDC_data)
if __name__ == '__main__':
main()