-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathESA.py
290 lines (241 loc) · 10.7 KB
/
ESA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
from util import *
# Add your import statements here
from collections import Counter
import time

import numpy as np
from tqdm import tqdm
class ESA():
    """Explicit Semantic Analysis (ESA) based IR system.

    Documents and queries are mapped from plain tf-IDF term space into a
    "concept" space spanned by a background corpus of articles: each term's
    tf-IDF weight is distributed over the tf-IDF column of the articles that
    contain the term.  Ranking is cosine similarity in that article space.
    """

    # Tokens treated as punctuation and excluded from the term index.
    _PUNCTUATION = ('.', ',', '?', '!')

    def __init__(self):
        self.index = None            # inverted index: term -> list of doc IDs
        self.IDF = None              # IDF vector aligned with list(self.index)
        self.docVectors = None       # (num_docs, num_terms) doc tf-IDF matrix
        self.article_terms = None    # term list of the article corpus
        self.articleVectors = None   # (num_articles, num_article_terms) tf-IDF matrix
        self.doc_art_matrix = None   # per-document vectors in article space
        print('Initializing ESA based IR system ...')

    @staticmethod
    def _progress(iterable, **kwargs):
        """Wrap `iterable` in a tqdm progress bar when tqdm is available.

        tqdm is a cosmetic third-party dependency; fall back to plain
        iteration so the pipeline still runs without it.
        """
        try:
            return tqdm(iterable, **kwargs)
        except NameError:
            return iterable

    def _inverted_index(self, corpus, ids, desc):
        """Build a term -> list-of-unit-IDs postings dict, skipping punctuation.

        BUGFIX: the original article-index loop appended the leftover `docID`
        loop variable instead of the article's own ID; here the correct ID is
        always taken from the (unit, ID) pair being processed.
        """
        inv = {}
        pairs = self._progress(zip(corpus, ids), total=len(ids),
                               unit=' ' + desc, desc=desc + ' Processed : ')
        for unit, uid in pairs:
            for sentence in unit:
                for word in sentence:
                    if word in self._PUNCTUATION:
                        continue  # punctuation carries no content
                    postings = inv.setdefault(word, [])
                    if uid not in postings:
                        postings.append(uid)
        return inv

    def _tf_idf(self, corpus, inv, desc):
        """Return (tf-IDF matrix, IDF vector) for `corpus` over the terms of `inv`.

        Rows follow corpus order; columns follow list(inv) insertion order.
        """
        terms = list(inv)
        N = len(corpus)
        print('Calculating tf value for %s :' % desc.lower())
        rows = []
        for unit in self._progress(corpus, total=N,
                                   unit=' ' + desc, desc=desc + ' Processed : '):
            # One Counter pass per unit instead of scanning every sentence
            # once per term (was O(terms * tokens), now O(tokens + terms)).
            counts = Counter(word for sentence in unit for word in sentence)
            rows.append([counts[term] for term in terms])
        print('Calculating IDF value for terms :')
        idf = np.asarray([np.log(N / len(inv[term])) for term in terms])
        return np.multiply(np.asarray(rows), idf), idf

    def _to_article_space(self, terms, weight_rows):
        """Project tf-IDF weight rows (over `terms`) into article space.

        Each row becomes sum_i row[i] * articleVectors[:, col(term_i)];
        terms absent from the article corpus contribute nothing.
        """
        num_articles = self.articleVectors.shape[0]
        # Dict lookup is O(1); the original used list.index, O(n) per term.
        # `or []` keeps the original degrade-to-zero behaviour when the
        # article term list is unavailable.
        col_of = {t: j for j, t in enumerate(self.article_terms or [])}
        projected = []
        for row in weight_rows:
            vec = np.zeros(num_articles)
            for weight, term in zip(row, terms):
                j = col_of.get(term)
                if j is not None and weight:
                    vec += weight * self.articleVectors[:, j]
            projected.append(vec)
        return projected

    def buildIndex(self, docs, docIDs, articles=None, articleIDs=None):
        """Build tf-IDF indices and the Article-Document matrix.

        The original file defined buildIndex twice; the 2-arg version
        silently shadowed the 4-arg one.  This merged, backward-compatible
        signature supports both call forms.

        Parameters
        ----------
        docs : list
            A list of documents; each document is a list of sentences and
            each sentence a list of tokens.
        docIDs : list
            Integer IDs of the documents.  NOTE(review): downstream code
            assumes these are 1..len(docs) in corpus order — confirm.
        articles : list, optional
            Article corpus in the same nested-list format.  When supplied,
            article vectors are computed and cached to 'articleVectors.npy'
            and 'article_terms.npy'; when omitted, the cache from a previous
            run is loaded instead.
        articleIDs : list, optional
            Integer IDs of the articles (required with `articles`).

        Returns
        -------
        None
        """
        print('Building doc index :')
        self.index = self._inverted_index(docs, docIDs, 'Documents')
        terms = list(self.index)
        self.docVectors, self.IDF = self._tf_idf(docs, self.index, 'Documents')
        print('[ Document vectors created. ]')

        if articles is not None:
            print('Building article index :')
            article_index = self._inverted_index(articles, articleIDs, 'Articles')
            self.article_terms = list(article_index)
            self.articleVectors, _ = self._tf_idf(articles, article_index, 'Articles')
            print('[ Article vectors created. ]')
            # Cache the vectors AND the term list: the original saved only
            # the vectors, so the load path left article_terms unset and
            # every projection was silently all-zeros.
            np.save('articleVectors.npy', self.articleVectors)
            np.save('article_terms.npy', np.asarray(self.article_terms))
        else:
            t = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
            print(t, ' Loading tf-IDF values for articles ...')
            self.articleVectors = np.load('articleVectors.npy')
            try:
                self.article_terms = list(np.load('article_terms.npy'))
            except FileNotFoundError:
                # Old cache without a term list: projections degrade to zero
                # vectors, matching the original behaviour, but loudly.
                print('WARNING: article_terms.npy not found; projections will be zero.')
            t = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
            print(t, ' [ Article vectors loaded. ]')

        print('Creating Article-Document Matrix :')
        # Row i of docVectors corresponds to docID i+1 (see NOTE above), so
        # projecting rows in order reproduces the original docID-1 indexing.
        self.doc_art_matrix = self._to_article_space(terms, self.docVectors)
        print('[ Article-Document Matrix created. ]')

    def rank(self, queries):
        """Rank the documents according to relevance for each query.

        Parameters
        ----------
        queries : list
            A list of queries; each query is a list of sentences and each
            sentence a list of tokens.

        Returns
        -------
        list
            For each query, a list of document IDs in decreasing order of
            cosine similarity in article space.
        """
        terms = list(self.index)
        print('Creating query vectors :')
        queryVectors = []
        for query in self._progress(queries, total=len(queries),
                                    unit=' Queries', desc='Queries Processed : '):
            counts = Counter(word for sentence in query for word in sentence)
            tf = np.asarray([counts[term] for term in terms])
            queryVectors.append(np.multiply(tf, self.IDF))

        print('Creating Query-Article Matrix :')
        query_art_matrix = np.asarray(self._to_article_space(terms, queryVectors))

        print('Finding relevent documents for queries :')
        doc_IDs_ordered_all = []
        for qvec in self._progress(query_art_matrix, total=len(query_art_matrix),
                                   unit=' Queries', desc=' Queries Processed : '):
            # BUGFIX: scores are reset for every query; the original shared
            # one dict across queries, leaking stale entries between them.
            sim_docs = {}
            qnorm = np.linalg.norm(qvec)  # hoisted: invariant per query
            for docID, dvec in enumerate(self.doc_art_matrix, start=1):
                dot = np.dot(qvec, dvec)
                if dot == 0:
                    sim_docs[docID] = 0.0
                else:
                    sim_docs[docID] = dot / (np.linalg.norm(dvec) * qnorm)
            doc_IDs_ordered_all.append(sorted(sim_docs, key=sim_docs.get, reverse=True))
        return doc_IDs_ordered_all