-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNaiveBayes.py
142 lines (115 loc) · 5.43 KB
/
NaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pickle
import csv
from nltk.corpus import stopwords
class NaiveBayes():
    """Naive Bayes sentiment classifier for tweets.

    Training state is held in class-level attributes (no instances are
    created; every method is invoked as ``NaiveBayes.method(...)``):

    - ``feature_list``: per-word dicts of sentiment counts, keyed by word,
      plus two bookkeeping keys ``'total_words'`` and
      ``'total_sentimental_occurences'`` added after training.
    - ``total_sentimental_occurences``: occurrence totals per sentiment
      label — '0' (negative), '2' (neutral), '4' (positive).

    ``predict`` applies Bayes' rule:
        P(class|features) = P(features|class) * P(class) / P(features)
    """

    # Shared (class-level) training state.
    total_words = 0
    total_sentimental_occurences = {'0': 0, '2': 0, '4': 0}
    feature_list = {}

    # Translation table deleting every punctuation character the original
    # stripped with chained str.replace() calls: ' \ ? . ! " , ( ) [ ]
    _PUNCT_TABLE = str.maketrans('', '', "'\\?.!\",()[]")

    def process_tweet(tweet):
        """Clean a raw tweet and return its unique feature words.

        Lowercases the tweet, splits on whitespace, strips punctuation,
        and drops stopwords, mentions (@), hashtags (#), URLs, ampersand
        entities, smileys starting with ':', and pure digits.

        :param tweet: raw tweet text (str)
        :return: list of cleaned, unique feature words (order is
                 unspecified because a set is used for de-duplication)
        """
        tweet = tweet.lower()
        # BUGFIX: the original passed the bound method `tweet.split`
        # (no parentheses) into set(), which raises TypeError.
        tokens = set(tweet.split())
        feature_vector = []
        for word in tokens:
            # Strip surrounding whitespace and punctuation in one pass.
            word = word.strip().translate(NaiveBayes._PUNCT_TABLE)
            # BUGFIX: tokens made of punctuation only (e.g. "!!") became
            # empty strings and were appended as features; skip them.
            if not word:
                continue
            # BUGFIX: NLTK stopword fileids are lowercase; 'English'
            # raises an error at runtime.
            if word in stopwords.words('english'):
                continue
            # Ignore smileys, mentions, entities, URLs, hashtags, digits.
            if word.startswith((':', '@', '&', 'www', '#', 'http')) or word.isdigit():
                continue
            feature_vector.append(word)
        return feature_vector

    def training():
        """Build ``feature_list`` from newFile.csv and pickle it.

        Reads up to ~20000 rows (header skipped), accumulates per-word
        sentiment counts, then dumps ``feature_list`` — including the
        ``'total_words'`` and ``'total_sentimental_occurences'``
        bookkeeping entries — to classifier.pickle.
        """
        # BUGFIX: the original leaked the CSV file handle.
        with open('newFile.csv', 'r', encoding='utf8') as csv_file:
            inp_tweets = csv.reader(csv_file, delimiter=',', quotechar='\"')
            for i, row in enumerate(inp_tweets):
                if i == 0:
                    # Skip the CSV header row.
                    continue
                if i == 19999:
                    # Same row cap as the original 1-based `i == 20000` check.
                    break
                # NOTE(review): the original comment claimed column 0 holds
                # the sentiment and column 5 the tweet, but the code reads
                # columns 3 and 4; kept as-is — verify against the CSV layout.
                sentiment = row[3]
                tweet = row[4]
                for feature in NaiveBayes.process_tweet(tweet):
                    if feature not in NaiveBayes.feature_list:
                        # First sighting of this word: start its counters.
                        NaiveBayes.feature_list[feature] = {'0': 0, '2': 0, '4': 0, 'count': 0}
                    NaiveBayes.feature_list[feature][sentiment] += 1
                    NaiveBayes.feature_list[feature]['count'] += 1
                    NaiveBayes.total_sentimental_occurences[sentiment] += 1
                    NaiveBayes.total_words += 1
        NaiveBayes.feature_list['total_words'] = NaiveBayes.total_words
        NaiveBayes.feature_list['total_sentimental_occurences'] = NaiveBayes.total_sentimental_occurences
        # BUGFIX: the original never closed the pickle file handle.
        with open("classifier.pickle", "wb") as out_file:
            pickle.dump(NaiveBayes.feature_list, out_file)

    def find_max(mydict):
        """Return the key of ``mydict`` holding the largest value."""
        return max(mydict, key=mydict.get)

    def get_prob_features_under_class(feature_vector, sentiment):
        """Return P(features|class) as a product of per-word likelihoods.

        Returns 0 when no word of ``feature_vector`` is known, or when the
        class has no recorded occurrences (avoids division by zero).
        """
        result = 1
        seen_known_word = False
        class_total = NaiveBayes.feature_list['total_sentimental_occurences'][sentiment]
        for word in feature_vector:
            if word in NaiveBayes.feature_list:
                seen_known_word = True
                if class_total == 0:
                    return 0
                result *= NaiveBayes.feature_list[word][sentiment] / class_total
        if not seen_known_word:
            return 0
        return result

    def get_prob_class(sentiment):
        """Return the prior P(class) = class occurrences / total words."""
        return (NaiveBayes.feature_list['total_sentimental_occurences'][sentiment]
                / NaiveBayes.feature_list['total_words'])

    def get_prob_features(feature_vector):
        """Return P(features) as a product of per-word occurrence rates.

        Returns 0 when no word of ``feature_vector`` is known.
        """
        result = 1.0
        seen_known_word = False
        total = NaiveBayes.feature_list['total_words']
        for feature in feature_vector:
            if feature in NaiveBayes.feature_list:
                seen_known_word = True
                result *= NaiveBayes.feature_list[feature]['count'] / total
        if not seen_known_word:
            return 0
        return result

    def predict(tweet):
        """Predict the sentiment label ('0', '2', or '4') of ``tweet``.

        Falls back to '2' (neutral) when none of the tweet's words are
        known to the trained model.
        """
        feature_vector = NaiveBayes.process_tweet(tweet)
        prob_of_features = NaiveBayes.get_prob_features(feature_vector)
        if prob_of_features == 0:
            return '2'
        # P(class|features) = P(features|class) * P(class) / P(features)
        prob_acc_to_sentiments = {}
        for sentiment in NaiveBayes.total_sentimental_occurences:
            prob_acc_to_sentiments[sentiment] = (
                NaiveBayes.get_prob_features_under_class(feature_vector, sentiment)
                * NaiveBayes.get_prob_class(sentiment)
                / prob_of_features
            )
        # Highest-probability class wins.
        return NaiveBayes.find_max(prob_acc_to_sentiments)