-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinformationRetrieval.py
121 lines (98 loc) · 3.94 KB
/
informationRetrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from util import *  # project helpers; star import kept first so the explicit imports below take precedence
# Add your import statements here
import math
from collections import Counter

import numpy as np
from tqdm import tqdm
class InformationRetrieval():
    """Vector-space-model IR system.

    Documents and queries are embedded as tf-idf vectors over the corpus
    vocabulary; documents are ranked for each query by cosine similarity.
    """

    # Punctuation tokens that are never indexed.
    _PUNCT = frozenset({'.', ',', '?', '!'})

    def __init__(self):
        self.index = None       # term -> list of docIDs whose docs contain the term
        self.IDF = None         # np.ndarray of idf values, one per vocabulary term
        self.docVectors = None  # (num_docs, num_terms) tf-idf matrix
        self.docIDs = None      # document IDs, aligned with the rows of docVectors
        print('Initializing Vector Space based IR system ...')

    def buildIndex(self, docs, docIDs):
        """
        Builds the document index in terms of the document
        IDs and stores it in the 'index' class variable.

        Parameters
        ----------
        docs : list
            A list of lists of lists where each sub-list is
            a document and each sub-sub-list is a sentence of the document
        docIDs : list
            A list of integers denoting IDs of the documents

        Returns
        -------
        None
        """
        index = {}
        print('Building doc index :')
        # Inverted index: term -> postings list of docIDs, in first-seen order.
        for doc, docID in zip(docs, docIDs):
            for sentence in doc:
                for word in sentence:
                    if word in self._PUNCT:
                        continue
                    postings = index.setdefault(word, [])
                    if docID not in postings:
                        postings.append(docID)
        self.index = index
        # Remember the real docIDs; rank() must not assume they are 1..N
        # (the original implementation did, which mislabels any corpus whose
        # IDs are not consecutive from 1).
        self.docIDs = list(docIDs)

        terms = list(index)
        term_col = {term: col for col, term in enumerate(terms)}
        D = len(docs)

        print('Calculating tf values for documents :')
        # One Counter pass per document instead of scanning the whole
        # vocabulary for every document (was O(D * |V| * doc_len)).
        tfs = np.zeros((D, len(terms)))
        for row, doc in enumerate(docs):
            counts = Counter(word for sentence in doc for word in sentence)
            for term, count in counts.items():
                col = term_col.get(term)
                if col is not None:  # punctuation tokens have no column
                    tfs[row, col] = count

        print('Calculating IDF values for terms :')
        # idf(t) = log(D / df(t)); df >= 1 for every indexed term, so the
        # division is always defined.
        self.IDF = np.asarray([math.log(D / len(index[term])) for term in terms])
        self.docVectors = tfs * self.IDF
        print('[ Document vectors created. ]')

    def rank(self, queries):
        """
        Rank the documents according to relevance for each query.

        Parameters
        ----------
        queries : list
            A list of lists of lists where each sub-list is a query and
            each sub-sub-list is a sentence of the query

        Returns
        -------
        list
            A list of lists of integers where the ith sub-list is a list of IDs
            of documents in their predicted order of relevance to the ith query

        Raises
        ------
        ValueError
            If buildIndex() has not been called yet.
        """
        if self.index is None:
            raise ValueError('buildIndex() must be called before rank()')

        doc_IDs_ordered_all = []
        terms = list(self.index)
        # Document norms are query-independent; compute them once up front.
        docNorms = np.linalg.norm(self.docVectors, axis=1)

        print('Creating query vectors :')
        for query in queries:
            counts = Counter(word for sentence in query for word in sentence)
            tf = np.asarray([counts.get(term, 0) for term in terms], dtype=float)
            queryVector = tf * self.IDF
            normQ = np.linalg.norm(queryVector)

            # Cosine similarity against every document via one matrix product.
            # No bare except here: the original silently dropped documents
            # from the ranking on any error; zero vectors are now handled
            # explicitly instead.
            dots = self.docVectors @ queryVector
            sim_docs = {}
            for docID, dot, normD in zip(self.docIDs, dots, docNorms):
                if dot == 0 or normD == 0 or normQ == 0:
                    sim_docs[docID] = 0.0
                else:
                    sim_docs[docID] = dot / (normD * normQ)
            # sorted() is stable, so documents with equal scores keep corpus
            # order — same tie-breaking as the original dict-insertion order.
            doc_IDs_ordered_all.append(
                sorted(sim_docs, key=sim_docs.get, reverse=True))
        return doc_IDs_ordered_all