#!/usr/bin/env python
"""
Run k-NN classification on the Reuters text dataset using LSA.
This script leverages modules in scikit-learn for performing tf-idf and SVD.
Classification is performed using k-NN with k=5 (majority wins).
The script measures the accuracy of plain tf-idf as a baseline, then LSA to
show the improvement.
@author: Chris McCormick
"""
import pickle
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
###############################################################################
# Load the raw text dataset.
###############################################################################
print("Loading dataset...")
# The raw text dataset is stored as tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
raw_text_dataset = pickle.load(open("data/raw_text_dataset.pickle", "rb"))
X_train_raw = raw_text_dataset[0]
y_train_labels = raw_text_dataset[1]
X_test_raw = raw_text_dataset[2]
y_test_labels = raw_text_dataset[3]
# The Reuters dataset consists of ~100 categories. However, we are going to
# simplify this to a binary classification problem. The 'positive class' will
# be the articles related to "acquisitions" (or "acq" in the dataset). All
# other articles will be negative.
y_train = ["acq" in y for y in y_train_labels]
y_test = ["acq" in y for y in y_test_labels]
print(" %d training examples (%d positive)" % (len(y_train), sum(y_train)))
print(" %d test examples (%d positive)" % (len(y_test), sum(y_test)))
###############################################################################
# Use LSA to vectorize the articles.
###############################################################################
# Tfidf vectorizer:
# - Strips out "stop words"
# - Filters out terms that occur in more than half of the docs (max_df=0.5)
# - Filters out terms that occur in only one document (min_df=2).
# - Selects the 10,000 most frequently occurring words in the corpus.
# - Normalizes the vector (L2 norm of 1.0) to normalize the effect of
# document length on the tf-idf values.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)
# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
X_train_tfidf = vectorizer.fit_transform(X_train_raw)
print(" Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])
print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()
# Project the tfidf vectors onto the first N principal components.
# Though this leaves us with significantly fewer features than the original
# tfidf vectors, they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
print(" done in %.3fsec" % (time.time() - t0))
explained_variance = svd.explained_variance_ratio_.sum()
print(" Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))
# Now apply the transformations to the test data as well.
X_test_tfidf = vectorizer.transform(X_test_raw)
X_test_lsa = lsa.transform(X_test_tfidf)
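# Optional sanity check (an added sketch, not part of the original pipeline):
# list the terms that weigh most heavily on the first LSA component. This
# assumes a scikit-learn version that provides get_feature_names_out(); older
# releases expose vectorizer.get_feature_names() instead.
terms = vectorizer.get_feature_names_out()
top_idx = svd.components_[0].argsort()[::-1][:10]
print(" Top terms in LSA component 0: %s" % ", ".join(terms[i] for i in top_idx))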
###############################################################################
# Run classification of the test articles
###############################################################################
print("\nClassifying tfidf vectors...")
# Time this step.
t0 = time.time()
# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_tfidf = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_tfidf.fit(X_train_tfidf, y_train)
# Classify the test vectors.
p = knn_tfidf.predict(X_test_tfidf)
# Measure accuracy
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1
print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))
# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)
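# Note: an equivalent (assumed, not used by the original script) way to get the
# same accuracy as a fraction is scikit-learn's built-in helper:
#   from sklearn.metrics import accuracy_score
#   print(accuracy_score(y_test, p))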
print("\nClassifying LSA vectors...")
# Time this step.
t0 = time.time()
# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_lsa = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_lsa.fit(X_train_lsa, y_train)
# Classify the test vectors.
p = knn_lsa.predict(X_test_lsa)
# Measure accuracy
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1
print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))
# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)
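# For reference (an added summary, not printed by the original script): the two
# feature spaces differ greatly in size, which helps when interpreting the
# timing and accuracy numbers printed above.
print("\n tf-idf dimensionality: %d" % X_train_tfidf.shape[1])
print(" LSA dimensionality: %d" % X_train_lsa.shape[1])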