-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpreprocess.py
213 lines (197 loc) · 7.93 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from pymongo import MongoClient
import re
import pandas as pd
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.corpus import wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
class ProcessRestaurantItem(object):
    """Build a feature-based sentiment summary of restaurant reviews.

    The pipeline loads restaurants from the ``restaurantreviews`` MongoDB
    collection into a pandas DataFrame, mines frequently mentioned nouns
    ("features") from each restaurant's reviews, labels review sentences as
    positive (1) or negative (-1) with the help of opinion words, and writes
    one CSV row per (feature, sentence) match.
    """

    db_name = 'restaurantinfo'
    fields = ['rest_name', 'rest_city', 'rest_reviews']
    df = None

    # Column layout of the summary produced by generate_summary().
    _summary_columns = ['restaurant name', 'city', 'feature', 'review', 'sentiment']

    # (pattern, replacement) pairs applied in order by decontracted(); the
    # two specific forms must run before the general "n't" rule.
    _contractions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    # Words that flip the orientation of a nearby opinion word.
    _negation_words = ('no', 'not', 'yet', 'but', 'nevertheless', 'while',
                       'however', 'instead', 'despite', 'although', 'though')

    # Chunk grammar for candidate features: a plural noun followed by a
    # singular noun, or a lone singular noun.
    _feature_grammar = r'''
        NP: {<NNS><NN>}
            {<NN>}
    '''

    def __init__(self, mongo_uri='mongodb://localhost:27017/restaurantinfo'):
        """Open the MongoDB client and select the ``db_name`` database.

        :param mongo_uri: connection string passed to ``MongoClient``.
        """
        self.client = MongoClient(mongo_uri)
        self.db = self.client[self.db_name]

    def load_mongodb_to_pandas(self):
        """Load every ``restaurantreviews`` document into ``self.df``.

        Each document contributes one row holding the keys in ``fields``.
        """
        restaurants = [[doc[field] for field in self.fields]
                       for doc in self.db.restaurantreviews.find()]
        self.df = pd.DataFrame(restaurants, columns=self.fields)

    def decontracted(self, review):
        """Return ``review`` with English contractions expanded."""
        for pattern, replacement in self._contractions:
            review = re.sub(pattern, replacement, review)
        return review

    def add_rest_features(self, freq_features):
        """Append the hand-picked restaurant features to ``freq_features``.

        The list is extended in place and also returned.
        """
        manual_rest_features = ['value', 'location', 'food', 'service',
                                'price', 'atmosphere', 'vibe', 'cuisine',
                                'ambience', 'decor', 'quality']
        freq_features.extend(manual_rest_features)
        return freq_features

    def frequent_itemsets(self, rest_features, min_support=0.1):
        """Return the set of frequent features plus the manual features.

        :param rest_features: list of per-sentence feature lists.
        :param min_support: minimum support handed to ``apriori``.
        :returns: set of feature strings.
        """
        # Nothing to mine: fall back to the manual features only instead of
        # handing an empty transaction table to apriori.
        if not rest_features:
            return set(self.add_rest_features([]))
        te = TransactionEncoder()
        te_ary = te.fit(rest_features).transform(rest_features)
        freq_df = pd.DataFrame(te_ary, columns=te.columns_)
        itemsets = apriori(freq_df, min_support=min_support,
                           use_colnames=True)['itemsets']
        # flatten every frequent itemset into a single list of features
        freq_features = [item for itemset in itemsets for item in itemset]
        return set(self.add_rest_features(freq_features))

    def extract_opinion_words(self, freq_features, review_df):
        """Collect lower-cased adjectives from sentences mentioning a feature.

        :param freq_features: iterable of feature strings.
        :param review_df: DataFrame whose ``review_sent`` column holds
            sentences as lists of ``(token, pos_tag)`` pairs.
        :returns: set of adjectives (tag ``JJ``) found in those sentences.
        """
        opinion_words = set()
        for review_sent in review_df['review_sent']:
            tokens = set(tagged[0] for tagged in review_sent)
            if any(feature in tokens for feature in freq_features):
                opinion_words.update(tagged[0].lower() for tagged in review_sent
                                     if tagged[1] == 'JJ')
        return opinion_words

    def opinion_orientation(self, opinion_words):
        """Split opinion words into positive and negative sets.

        Starts from a small seed lexicon and adds every word that the VADER
        analyzer scores as fully positive or fully negative.

        :returns: ``(positive_words, negative_words)`` as sets.
        """
        pos_opinion = {'authentic', 'cheap', 'inexpensive', 'quick', 'warm', 'hot'}
        neg_opinion = {'slow', 'expensive', 'disappointed', 'bland',
                       'overdone', 'overcooked'}
        sid = SentimentIntensityAnalyzer()
        for word in opinion_words:
            scores = sid.polarity_scores(word)  # score each word only once
            if scores['pos'] == 1.0:
                pos_opinion.add(word)
            elif scores['neg'] == 1.0:
                neg_opinion.add(word)
        return pos_opinion, neg_opinion

    def sentence_orientation(self, pos_opinions, neg_opinions, review_df):
        """Label each sentence as positive (1) or negative (-1).

        A sentence's orientation is the sum of the orientations of its
        opinion words; sentences that sum to zero are dropped.

        :returns: DataFrame with columns ``reviews`` (sentence text) and
            ``sentiment`` (1 or -1). Empty, with the same columns, when no
            sentence has an orientation.
        """
        processed_reviews = []
        for rev_sent in review_df['review_sent']:
            tokens = [tagged[0] for tagged in rev_sent]
            orientation = sum(
                self.word_orientation(token.lower(), pos_opinions,
                                      neg_opinions, tokens)
                for token in tokens)
            if orientation > 0:
                processed_reviews.append([' '.join(tokens), 1])
            elif orientation < 0:
                processed_reviews.append([' '.join(tokens), -1])
        # Build the frame straight from the rows: going through np.array
        # stringified the sentiment and failed on an empty result.
        return pd.DataFrame(processed_reviews, columns=['reviews', 'sentiment'])

    def word_orientation(self, word, pos_opinions, neg_opinions, review_no_tag):
        """Return the orientation of ``word`` within its sentence.

        :returns: 1 or -1 for an opinion word (sign flipped when a negation
            word is nearby), 0 when ``word`` is not an opinion word.
        """
        if word in pos_opinions:
            polarity = 1
        elif word in neg_opinions:
            polarity = -1
        else:
            return 0
        if self.diff_negation(word, review_no_tag):
            return -polarity
        return polarity

    def diff_negation(self, word, review_no_tag):
        """Tell whether a negation word lies within 5 tokens of ``word``.

        Positions are those of the first occurrence of each word in the
        lower-cased sentence. Returns False when ``word`` does not occur.
        """
        tokens = [w.lower() for w in review_no_tag]
        word = word.lower()
        if word not in tokens:
            return False
        op_index = tokens.index(word)
        for nw in self._negation_words:
            if nw in tokens and abs(op_index - tokens.index(nw)) <= 5:
                return True
        return False

    def generate_summary(self, rest_name, city, freq_features, processed_reviews_df):
        """Pair every feature with the labelled sentences that contain it.

        A feature matches when it occurs as a substring of the sentence text.

        :returns: DataFrame with columns ``restaurant name`` (lower-cased),
            ``city``, ``feature``, ``review`` and ``sentiment``.
        """
        name = rest_name.lower()
        reviews = list(processed_reviews_df['reviews'])
        sentiments = list(processed_reviews_df['sentiment'])
        feature_summary_reviews = [
            [name, city, feature, review, sentiment]
            for feature in freq_features
            for review, sentiment in zip(reviews, sentiments)
            if feature in review
        ]
        return pd.DataFrame(feature_summary_reviews, columns=self._summary_columns)

    def _sentence_features(self, chunker, pos_tag_sentence):
        """Return the lower-cased, lemmatized noun features of one sentence."""
        features = []
        for subtree in chunker.parse(pos_tag_sentence).subtrees():
            if subtree.label() != 'NP':
                continue
            # join with a space so multi-word phrases still match review text
            feat = ' '.join(tagged[0] for tagged in subtree.leaves()).lower()
            # drop stop words (the test must look at the feature itself)
            if feat not in stop:
                features.append(lmtzr.lemmatize(feat))
        return features

    def process_reviews(self, output_path='restaurantapp/app/feature_review_summary.csv'):
        """Summarise the reviews of every restaurant in ``self.df`` to a CSV.

        ``load_mongodb_to_pandas`` must have populated ``self.df`` first.

        :param output_path: destination of the summary CSV.
        """
        # the chunk parser does not depend on the sentence, so build it once
        chunker = nltk.RegexpParser(self._feature_grammar)
        df_collection = []
        # one restaurant at a time -> summarize reviews
        for rest_name, city, review_collection in zip(self.df['rest_name'],
                                                      self.df['rest_city'],
                                                      self.df['rest_reviews']):
            # restaurant specific pos-tagged review sentences
            tagged_reviews_sent = []
            # per-sentence feature lists found in the reviews
            rest_features = []
            for rev in review_collection:
                for sentence in rev.split('. '):
                    sentence = self.decontracted(sentence)
                    token_sentence = nltk.word_tokenize(sentence)
                    if not token_sentence:
                        # an empty sentence carries no features or opinions
                        continue
                    pos_tag_sentence = nltk.pos_tag(token_sentence)
                    tagged_reviews_sent.append(pos_tag_sentence)
                    features = self._sentence_features(chunker, pos_tag_sentence)
                    if features:
                        rest_features.append(features)
            # One tagged sentence per row; built as a column so sentences of
            # different lengths need no numpy conversion.
            review_df = pd.DataFrame({'review_sent': tagged_reviews_sent})
            freq_features = self.frequent_itemsets(rest_features)
            opinion_words = self.extract_opinion_words(freq_features, review_df)
            pos_opinion, neg_opinion = self.opinion_orientation(opinion_words)
            processed_reviews_df = self.sentence_orientation(pos_opinion, neg_opinion,
                                                             review_df)
            df_collection.append(self.generate_summary(rest_name, city, freq_features,
                                                       processed_reviews_df))
        if df_collection:
            feature_collection_summary = pd.concat(df_collection, ignore_index=True)
        else:
            # pd.concat rejects an empty list; still emit a header-only CSV
            feature_collection_summary = pd.DataFrame(columns=self._summary_columns)
        feature_collection_summary.to_csv(output_path, index=False)
if __name__ == '__main__':
    # Script entry point: pull the reviews out of MongoDB into pandas, then
    # build and write the feature/sentiment summary.
    pipeline = ProcessRestaurantItem()
    pipeline.load_mongodb_to_pandas()
    pipeline.process_reviews()