-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequency.py
60 lines (43 loc) · 1.37 KB
/
frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import sys
import json
def lines(fp):
    """Return the number of lines left to read in *fp*, as a string.

    *fp* is any object with a ``readlines()`` method; it is read to the end.
    """
    remaining = fp.readlines()
    line_count = len(remaining)
    return str(line_count)
def tweetParse(tweet_file):
    """Collect the text of every tweet in a file of line-delimited JSON.

    Args:
        tweet_file: an open, iterable file-like object with one JSON
            document per line.

    Returns:
        A list with one string per record that has a "text" key, in file
        order, with newline characters removed (not replaced by a space).
        Records without a "text" key are ignored.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    tweets = []
    for raw_line in tweet_file:
        # Blank lines would make json.loads raise; skip them instead of
        # aborting the whole run.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        if "text" not in record:
            continue
        text = record["text"]
        # On Python 2 json yields `unicode`; encode it to a UTF-8 byte
        # string as before. On Python 3 the value is already `str`, and
        # encoding it would produce a "b'...'" repr once stringified.
        if not isinstance(text, str):
            text = text.encode("utf-8")
        tweets.append(text.replace('\n', ''))
    return tweets
def Calculate_frequency(tweet):
    """Print the relative frequency of each term across all tweets.

    Args:
        tweet: an iterable of tweet texts (strings).

    Each text is split on whitespace. A term's frequency is its number of
    occurrences divided by the total number of term occurrences in all
    texts. One line per distinct term is written to standard output in
    the form "<term> <frequency>", with the frequency shown to three
    decimal places. Nothing is printed when there are no terms.
    """
    counts = {}
    total = 0
    for text in tweet:
        # split() with no argument drops the empty strings that
        # split(' ') produces for repeated spaces, so blanks are never
        # counted as terms.
        for word in text.split():
            # Direct dict lookup: no re-encoding of the word (which broke
            # on non-ASCII terms) and no linear scan of the keys.
            counts[word] = counts.get(word, 0) + 1
            total += 1
    for term in counts:
        # float() keeps this a true division on Python 2 as well.
        print("%s %.3f" % (term, float(counts[term]) / total))
def main():
    """Print term frequencies for the tweet file named on the command line.

    Reads the path from ``sys.argv[1]``, parses the tweets with
    ``tweetParse`` and reports frequencies with ``Calculate_frequency``.
    Exits with a usage message when no path is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python frequency.py <tweet_file>")
    # The context manager closes the file even if parsing raises.
    with open(sys.argv[1]) as tweet_file:
        Calculate_frequency(tweetParse(tweet_file))


if __name__ == '__main__':
    main()