-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
66 lines (52 loc) · 2.23 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from StoryGeneration import StoryGenerator
from TFIDF_optim import TFIDF_optim
from MinHash import MinHash
from LSH import LSH
from FalsePositiveRemoval import FalsePositiveRemoval
from DeclareStories import DeclareStories
from community_detection import community_detection
import os
import pickle
# --- Pipeline configuration -------------------------------------------------
# File used to keep the pickled signature matrix between runs.
PICKLE_FILE = './signature_matrix_cache.dat'
# 'LSH' runs the candidate-pair pipeline; any other value runs community
# detection on the signature matrix.
RUN_MODE = 'COMMUNITY'

# --- Load the corpus --------------------------------------------------------
# getAllStories() yields two parallel results: the stories and their titles.
_story_source = StoryGenerator("./Dataset")
stories, titles = _story_source.getAllStories()

# --- TF-IDF stage -----------------------------------------------------------
# Later stages read both `tfidf.stories` and `important_words`.
tfidf = TFIDF_optim(stories)
tfidf.tfidf()
important_words = tfidf.get_important_words()
print('tfidf completed')
# --- Signature matrix (computed once, then reused from a pickle cache) ------
# NOTE: the cache is looked up by file name only. Delete PICKLE_FILE whenever
# the dataset or the TF-IDF stage changes, otherwise the old matrix is reused.
if os.path.exists(PICKLE_FILE):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # PICKLE_FILE must only ever point at a cache written by this script.
    with open(PICKLE_FILE, 'rb') as rfile:
        signature_matrix = pickle.load(rfile)
    print('Signature matrix loaded from cache')
else:
    minHasher = MinHash(tfidf.stories, important_words)
    signature_matrix = minHasher.get_signature_matrix()
    # Write to a temporary file and rename it into place, so an interrupted
    # run cannot leave a truncated cache that the next run would try to load.
    tmp_pickle_file = PICKLE_FILE + '.tmp'
    with open(tmp_pickle_file, 'wb') as wfile:
        pickle.dump(signature_matrix, wfile, pickle.HIGHEST_PROTOCOL)
    os.replace(tmp_pickle_file, PICKLE_FILE)
    # Reported only after the cache file is actually in place.
    print('signature matrix generated and cached')
# print('imp words\n', important_words)
# print('sig mat\n', signature_matrix)
# --- Mode dispatch ----------------------------------------------------------
# 'LSH': candidate pairs -> false-positive removal -> connected components.
# Any other RUN_MODE value: community detection on the signature matrix.
if RUN_MODE == 'LSH':
    lsh = LSH(signature_matrix, stories, important_words)
    candidates = lsh.hash_get_candidates()
    candidatesNum = len(candidates)
    print('candidate pairs generated')
    print('number of candidates:', candidatesNum)

    FPRemover = FalsePositiveRemoval(candidates, tfidf.stories, titles)
    true_pairs, true_weights = FPRemover.RemoveFalsePositives()
    print('false positives identified')
    if candidatesNum:
        print((1 - (len(true_pairs) / candidatesNum)) * 100,
              'percent of candidate pairs were false positives')
    else:
        # With zero candidates the ratio is undefined; the unguarded division
        # used to abort the run with ZeroDivisionError.
        print('no candidate pairs were generated; '
              'false positive percentage is undefined')

    StorySplitter = DeclareStories(true_pairs, len(stories))
    connectedComponents = StorySplitter.findConnectedComponents()
    print('Connected Components have been seperated')
    print('Found', len(connectedComponents), ' connected components of graph')

    # Preview loop: visits at most the first five stories of each component,
    # largest component first. Its output is currently switched off; to
    # re-enable it, print a "Connected Stories:-" header per component and
    # titles[story]['title'] per story.
    for value in sorted(connectedComponents.values(), key=len, reverse=True):
        for i, story in enumerate(value):
            if i >= 5:
                break
else:
    comm = community_detection(signature_matrix)
    comm.detect_community()