'''
SL Assignment - classification.py
CS 1410 Artificial Intelligence, Brown University
Written by whackett.

Usage:
To run classification using your perceptron implementation:
    python classification.py
To run classification using our KNN implementation:
    python classification.py --knn (or python classification.py -k)
'''
from sklearn import preprocessing
from knn import KNNClassifier
from perceptron import Perceptron
from cross_validate import cross_validate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse

def classification(method, train_size=0.9):
    """
    Classifies data using the model type specified in method. Default is the
    perceptron.

    Returns the model accuracy on the test data.
    """
    # Load the data. Each row represents a datapoint, consisting of all the
    # feature values and a label, which indicates the class to which the point
    # belongs. Here, each row is a patient. The features are calculated
    # from a digital image of a fine needle aspirate (FNA) of a breast mass,
    # and the label represents the patient's diagnosis, i.e. malignant or
    # benign.
    all_data = pd.read_csv("breast_cancer_diagnostic.csv")

    # Uncomment to visualize the first 5 entries and get dataset information,
    # such as the number of entries and column names.
    # print(all_data.head())
    # all_data.info()

    # Remove the id and 'Unnamed: 32' columns. They are not necessary for
    # prediction.
    all_data = all_data.drop(['Unnamed: 32', 'id'], axis=1)

    # Convert the diagnosis values M and B to numeric values, such that
    # M (malignant) = 1 and B (benign) = 0.
    def convert_diagnosis(diagnosis):
        if diagnosis == "B":
            return 0
        else:
            return 1

    all_data["diagnosis"] = all_data["diagnosis"].apply(convert_diagnosis)
    # Store the features of the data
    X = np.array(all_data.iloc[:, 1:])
    # Store the labels of the data
    y = np.array(all_data["diagnosis"])

    # train_size controls how much of the data is used for training; the rest
    # is held out as test data for cross validation.
    X_train, X_test, y_train, y_test = cross_validate(X, y, train_size)

    print("-" * 30)
    if method == "KNN":
        # Set the number of neighboring points to compare each datapoint to.
        # (You will want to adjust this to optimize the accuracy of your
        # KNN Classifier. Implement optimal_k to figure out the optimal k.)
        k = 1

        # Rescale the feature data so that Euclidean distance is a meaningful
        # metric. preprocessing.normalize scales each sample (row) to unit L2
        # norm; since all features here are non-negative, the resulting
        # values lie in [0, 1].
        X_train = preprocessing.normalize(X_train)
        X_test = preprocessing.normalize(X_test)
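        # Note: if the goal were instead to scale each *feature* to [0, 1]
        # independently, one alternative (not part of this stencil) is
        # sklearn's MinMaxScaler, fit on the training split only so that no
        # test-set statistics leak into training:
        #   from sklearn.preprocessing import MinMaxScaler
        #   scaler = MinMaxScaler().fit(X_train)
        #   X_train = scaler.transform(X_train)
        #   X_test = scaler.transform(X_test)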
        # For KNN, we want the feature function to return the value of the
        # given feature.
        def feature_func(x):
            return x

        # Initialize the KNN Classifier
        classifier = KNNClassifier([feature_func], k)

        # TODO: Implement optimal_k to explore which value of k is optimal,
        # where k is the number of neighbors used in the KNN classifier.
        # optimal_k(feature_func, X_train, X_test, y_train, y_test)
    else:
        # TODO: Implement classification using perceptrons. Explore what
        # happens when you adjust learning_rate. A learning rate of 1 is
        # most likely not going to give good results. (See the sketch after
        # this function for one way to explore learning rates.)
        learning_rate = 1
        is_classifier = True

        def feature_func(x):
            return x

        classifier = Perceptron([feature_func], learning_rate, is_classifier)

    # Fit the model on the training set
    print("Training {} Classifier".format(method))
    classifier.train(X_train, y_train)

    # Evaluate the model's accuracy (between 0 and 1) on the test set
    print("Testing {} Classifier".format(method))
    accuracy = classifier.evaluate(X_test, y_test)

    print("{} Model Accuracy: {:.2f}%".format(method, accuracy * 100))
    print("-" * 30)
    return accuracy
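
# One possible way to tackle the learning-rate TODO above: sweep a few
# candidate rates and compare test accuracy. This is only a sketch, not part
# of the original stencil; it assumes Perceptron exposes the same
# constructor/train/evaluate API used in classification() above, and the
# candidate rates are arbitrary starting points.
def sweep_learning_rates(X_train, X_test, y_train, y_test,
                         rates=(0.0001, 0.001, 0.01, 0.1, 1)):
    accuracies = []
    for lr in rates:
        # Train a fresh perceptron classifier for each candidate rate.
        model = Perceptron([lambda x: x], lr, True)
        model.train(X_train, y_train)
        accuracies.append(model.evaluate(X_test, y_test))
    best = rates[int(np.argmax(accuracies))]
    print("Best learning rate out of {} is {}".format(list(rates), best))
    return best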

def optimal_k(feature_func, X_train, X_test, y_train, y_test):
    """
    1) Finds the optimal value of k, where k is the number of neighbors being
       looked at during KNN.
    2) Plots the accuracy values returned by performing cross validation on
       the KNN model, with k values in the range [1, 50).
    """
    pass
    # optimal_k = ?
    # max_accuracy = ?

    # Feel free to delete the following commented stencil code.
    # It is only here to help you implement and visualize optimal_k.

    # Printing the optimal_k and max_accuracy:
    # print("The optimal number of neighbors is {}".format(optimal_k))
    # print("Model Accuracy with k={} is {:.2f}%".format(optimal_k, max_accuracy))

    # Visualizing number of neighbors vs accuracy, if neighbors is a list of
    # your values of k and accuracies is a list of the same size,
    # corresponding to the cross validation accuracy returned for a given
    # value of k:
    # plt.figure(figsize=(10, 6))
    # plt.plot(neighbors, np.multiply(accuracies, 100))
    # plt.xlabel('Number of Neighbors')
    # plt.ylabel('Accuracy (% Correct)')
    # plt.show()
    # quit()
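
# A minimal sketch of what optimal_k could look like, left separate so the
# stencil above stays intact. It assumes KNNClassifier exposes the same
# constructor/train/evaluate API used in classification() above; everything
# else follows the stencil comments (k in [1, 50), plot accuracy vs. number
# of neighbors).
def optimal_k_sketch(feature_func, X_train, X_test, y_train, y_test):
    neighbors = list(range(1, 50))
    accuracies = []
    for k in neighbors:
        # Train and evaluate a fresh KNN classifier for each value of k.
        model = KNNClassifier([feature_func], k)
        model.train(X_train, y_train)
        accuracies.append(model.evaluate(X_test, y_test))

    # Report the k with the highest test accuracy.
    best_index = int(np.argmax(accuracies))
    best_k, max_accuracy = neighbors[best_index], accuracies[best_index]
    print("The optimal number of neighbors is {}".format(best_k))
    print("Model Accuracy with k={} is {:.2f}%".format(best_k, max_accuracy * 100))

    # Visualize number of neighbors vs accuracy.
    plt.figure(figsize=(10, 6))
    plt.plot(neighbors, np.multiply(accuracies, 100))
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy (% Correct)')
    plt.show()
    return best_k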

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Supervised Learning - Classification")
    parser.add_argument("-k", "--knn",
                        help="Indicates to use KNN model. Otherwise, uses perceptron.",
                        action="store_true")
    args = vars(parser.parse_args())

    if args["knn"]:
        method = "KNN"
    else:
        method = "Perceptron"

    classification(method)