# accelegator_NLP.py
"""Produce LDA analysis and visualization."""
from gensim import corpora, models
from profanity import profanity
from stop_words import get_stop_words
from six import iteritems, viewitems
from colorama import Fore, Style
import logging
import pyLDAvis
import pyLDAvis.gensim
import gensim
import warnings
""" Uses gensim to analyze the text of the responses to accelegator"""


def gensim_analysis(list_responses):
    """Run the LDA analysis for one set of answers."""
    warnings.filterwarnings('ignore')
    tokens, nan_count = create_tokens(list_responses)
    # Every response was the empty 'nan' placeholder, so there is
    # nothing to analyze.
    if len(list_responses) == nan_count:
        return
    dictionary = dictionary_create(tokens)
    # Bag-of-words corpus: one list of (token_id, count) pairs per response.
    corpus = [dictionary.doc2bow(token) for token in tokens]
    corp_eval(dictionary, corpus)
    logging.info("Analyzed the responses and reported the repeated words")


def create_tokens(list_responses):
    """Turn each response into a list of cleaned word tokens.

    Lowercases every word, drops English stop words and profanity, skips
    the 'nan' placeholder pandas produces for empty cells, and counts how
    many 'nan' tokens were seen.
    """
    stoplist = get_stop_words('en')
    tokens = []
    nan_count = 0
    for response in list_responses:
        temp = []
        for word in response:
            if not isinstance(word, int):
                word = word.lower()
            if word == 'nan':
                nan_count += 1
            elif (not profanity.contains_profanity(word)
                    and word not in stoplist):
                temp.append(word)
        tokens.append(temp)
    logging.info("Created tokens from the responses")
    return tokens, nan_count


def dictionary_create(tokens):
    """Create the gensim dictionary from the tokens of the answers."""
    dictionary = corpora.Dictionary(tokens)
    logging.info("Created a dictionary from the tokens")
    return dictionary


def corp_eval(dictionary, corpus):
    """Fit an LDA model on the corpus and show it with pyLDAvis."""
    lda = gensim.models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=3,
        passes=1,
        alpha='symmetric',
        eta=None)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(Fore.YELLOW + "These are the current topics:" + Style.RESET_ALL)
    print(lda.print_topics())
    print(
        Fore.CYAN +
        "Showing the LDA visually; hit control+c to access the next set "
        "of responses:" + Style.RESET_ALL)
    pyLDAvis.show(vis)
    logging.info("Evaluated the dictionary to see if words are repeated")
    return dictionary.dfs
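
# corp_eval also returns dictionary.dfs, gensim's document-frequency map
# from token id to the number of responses containing that token, e.g.
# {0: 2, 1: 1} when token 0 appears in two responses and token 1 in one.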


def read_responses_question(data, FIRST_QUESTION, arg2):
    """Analyze the responses to one question, or to every question."""
    (rows, columns) = data.shape
    # Check whether the argument passed was an int representing the
    # question number.
    if isinstance(arg2, int):
        column = arg2
        texts = []
        for row in range(0, rows):
            texts.append(str(data.iat[row, column]))
        responses = [document.split() for document in texts]
        gensim_analysis(responses)
    # If the argument is the default, print every question's responses
    # with analysis.
    else:
        for column in range(FIRST_QUESTION, columns):
            print(column)
            texts = []
            for row in range(0, rows):
                texts.append(str(data.iat[row, column]))
            responses = [document.split() for document in texts]
            gensim_analysis(responses)


def read_responses_person(data, FIRST_QUESTION, arg2):
    """Analyze one person's responses (matched by email), or everyone's."""
    (rows, columns) = data.shape
    column = 1  # The email address is stored in the second column.
    exists = False
    for row in range(0, rows):
        if arg2 == str(data.iat[row, column]):
            exists = True
            break
    if exists:
        texts = []
        for column in range(FIRST_QUESTION, columns):
            texts.append(str(data.iat[row, column]))
        responses = [document.split() for document in texts]
        gensim_analysis(responses)
    # Runs for every single person if the argument is not an email that
    # appears in the list.
    else:
        for row in range(0, rows):
            texts = []
            for column in range(FIRST_QUESTION, columns):
                texts.append(str(data.iat[row, column]))
            responses = [document.split() for document in texts]
            gensim_analysis(responses)


def read_responses_all(data, FIRST_QUESTION):
    """Analyze every response to every question as one document set."""
    texts = []
    (rows, columns) = data.shape
    for row in range(0, rows):
        for column in range(FIRST_QUESTION, columns):
            texts.append(str(data.iat[row, column]))
    responses = [document.split() for document in texts]
    gensim_analysis(responses)
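

# A minimal usage sketch, assuming the survey lives in a CSV file and that
# question columns start at index 10 (both hypothetical; the real driver
# script is not part of this file):
if __name__ == "__main__":
    import pandas as pd

    FIRST_QUESTION = 10  # assumed index of the first question column
    data = pd.read_csv("responses.csv")  # hypothetical path
    read_responses_all(data, FIRST_QUESTION)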