-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLSA.py
152 lines (128 loc) · 5.36 KB
/
LSA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from numpy.lib.function_base import append
from numpy.ma.core import dot
from util import *
# Add your import statements here
from tqdm import tqdm
import time
import numpy as np
class LSA():
    """Latent Semantic Analysis (LSA) based information-retrieval system.

    Builds a TF-IDF term-document matrix over a tokenized corpus, reduces
    it with a truncated SVD, and ranks documents for queries by cosine
    similarity against the rank-k reconstruction of that matrix.
    """

    # Punctuation tokens that are never indexed as terms.
    _PUNCTUATION = ('.', ',', '?', '!')

    def __init__(self, k=505):
        """Initialize an empty IR system.

        Parameters
        ----------
        k : int, optional
            Number of singular values kept by the truncated SVD
            (default 505, the original hard-coded value). Values larger
            than the number of available singular values are clipped.
        """
        self.k = k
        self.index = None                         # term -> list of docIDs containing it
        self.IDF = None                           # per-term inverse document frequency
        self.term_document_matrix = None          # TF-IDF matrix, shape (n_docs, n_terms)
        self.term_document_matrix_reduced = None  # rank-k reconstruction of the above
        self.term_space = None                    # U_k @ S_k
        self.doc_space = None                     # S_k @ V_k
        print('Initializing LSA based IR system ...')

    def buildIndex(self, docs, docIDs):
        """
        Builds the document index in terms of the document
        IDs and stores it in the 'index' class variable

        Parameters
        ----------
        arg1 : list
            A list of lists of lists where each sub-list is
            a document and each sub-sub-list is a sentence of the document
        arg2 : list
            A list of integers denoting IDs of the documents

        Returns
        -------
        None
        """
        index = {}
        print('Building doc index :')
        # Inverted index: each term maps to the list of docIDs containing it.
        for doc, docID in zip(docs, docIDs):
            for sentence in doc:
                for word in sentence:
                    if word in self._PUNCTUATION:
                        continue
                    postings = index.setdefault(word, [])
                    if docID not in postings:
                        postings.append(docID)
        self.index = index

        terms = list(index)
        D = len(docs)

        print('Calculating tf values for documents :')
        # One row per document, one column per term: raw term counts.
        tfs = np.asarray([
            [sum(sentence.count(term) for sentence in doc) for term in terms]
            for doc in docs
        ])

        print('Calculating IDF values for terms :')
        # idf(t) = log(D / df(t)); df is never zero because every indexed
        # term occurs in at least one document.
        self.IDF = np.asarray([np.log(D / len(index[term])) for term in terms])

        # TF-IDF weighting; the IDF row vector broadcasts across documents.
        self.term_document_matrix = np.multiply(tfs, self.IDF)
        print('[ Document vectors created. ]')

        t = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
        print(t, ' Performing SVD ...')
        # full_matrices=False keeps U/V at economy size: faster, and it makes
        # the truncation below shape-consistent even when k exceeds
        # min(n_docs, n_terms). The previous full_matrices=True produced
        # mismatched slices (S_k was (r, r) while V[:k] stayed (n_terms,
        # n_terms)) and crashed in that case; for valid k the retained
        # factors are identical.
        U, S, V = np.linalg.svd(self.term_document_matrix, full_matrices=False)
        k = min(self.k, len(S))  # clip k to the available singular values
        U_k, S_k, V_k = U[:, :k], np.diag(S[:k]), V[:k]
        self.term_document_matrix_reduced = np.dot(U_k, np.dot(S_k, V_k))
        self.term_space = np.dot(U_k, S_k)
        self.doc_space = np.dot(S_k, V_k)
        t = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
        print(t, ' [ SVD Performed. ]')

    def rank(self, queries):
        """
        Rank the documents according to relevance for each query

        Parameters
        ----------
        arg1 : list
            A list of lists of lists where each sub-list is a query and
            each sub-sub-list is a sentence of the query

        Returns
        -------
        list
            A list of lists of integers where the ith sub-list is a list of IDs
            of documents in their predicted order of relevance to the ith query
        """
        terms = list(self.index)

        print('Creating query vectors :')
        # Queries are weighted with the same TF-IDF scheme as the documents.
        queryVectors = [
            np.multiply(
                np.asarray([sum(sentence.count(term) for sentence in query)
                            for term in terms]),
                self.IDF)
            for query in queries
        ]

        # Rows of the rank-k reconstruction serve as document vectors.
        docVectors = self.term_document_matrix_reduced
        doc_IDs_ordered_all = []
        print('Finding Similar documents for queries : ')
        for queryVector in queryVectors:
            normQ = np.linalg.norm(queryVector)  # loop-invariant, hoisted
            sim_docs = {}
            for docID, docVector in enumerate(docVectors, start=1):
                score = np.dot(queryVector, docVector)
                normD = np.linalg.norm(docVector)
                if score == 0 or normD == 0 or normQ == 0:
                    # A zero vector on either side has no direction: define
                    # the similarity as 0 rather than dividing by zero (the
                    # old bare `except: pass` silently dropped such documents
                    # from the ranking entirely).
                    sim_docs[docID] = 0.0
                else:
                    sim_docs[docID] = score / (normD * normQ)
            doc_IDs_ordered_all.append(
                sorted(sim_docs, key=sim_docs.get, reverse=True))
        return doc_IDs_ordered_all