-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrendly.py
128 lines (90 loc) · 3 KB
/
trendly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import sys
import getopt
import segment
import BeautifulSoup
import sqlite3 as lite
from speedy.speedydb import SpeedyDb
# INSERT template for one Textly row; runmonth fills the five positional
# slots with str.format before executing it.
trendlySql_insert = "INSERT INTO Textly(SiteId, MonthId, Trendyness, LinkCount, Words) VALUES({0}, {1}, {2}, {3}, {4});"

# Phrases whose presence on a page counts towards its "trendy" score.
# Matching is case-insensitive (see FindTrendyString).
trends = [
    'top task',
    'straight to',
    'residents',
    'pay it',
    'report it',
    'find my nearest',
    'popular tasks',
    'highlights',
    'faq',
    'frequently asked',
    'Popular topics',
    'Quick links',
    'Do it online',
    'press releases',
    'fostering',
]

# One hit counter per entry in `trends`, index-aligned with it.
# Every counter must start at zero: seeding with range(len(trends)) would
# start counter i at i and inflate every reported total and percentage.
trendcounts = [0] * len(trends)
from collections import Counter
# Module-wide word tally. runmonth prints its 100 most common entries;
# NOTE(review): no code visible in this file ever adds to it -- confirm
# whether CountTheWords was meant to feed it.
c = Counter()

# Shared segmenter instance; CountTheWords uses its get_chunks() method.
segger = segment.Segmenter()
def percentage(part, whole):
    """Return ``part / whole`` as a float fraction (0.25 means 25%).

    Both arguments are converted with ``float()`` first, so integer inputs
    do not truncate. When ``whole`` is zero the result is 0.0 rather than a
    ZeroDivisionError, so a summary over an empty set of sites can still
    be printed.
    """
    denominator = float(whole)
    if denominator == 0:
        return 0.0
    return float(part) / denominator
def FindTrendyString(content, search):
    """Tell whether ``search`` occurs anywhere in ``content``, ignoring case.

    Returns 1 when the phrase is present and 0 when it is not.
    """
    # A substring test (rather than comparing find() against 0) so that a
    # phrase sitting at the very start of the content is counted as well.
    if search.lower() in content.lower():
        return 1
    return 0
def GetTheTrendy(content):
    """Score ``content`` by how many of the ``trends`` phrases it contains.

    Each phrase found adds one to the returned score and also bumps the
    matching slot of the module-level ``trendcounts`` list, so totals build
    up across calls.
    """
    score = 0
    for idx, phrase in enumerate(trends):
        if FindTrendyString(content, phrase) != 1:
            continue
        trendcounts[idx] += 1
        score += 1
    return score
def CountTheWords(content):
    """Return the five most frequent words of ``content`` as one quoted string.

    The content is split into chunks by ``segger.get_chunks`` and then into
    whitespace-separated words; only words of 4 to 29 characters are
    tallied, lower-cased. The result is wrapped in double quotes and lists
    the words comma-separated, e.g. ``"council,services,online"``.
    """
    tally = Counter()
    for chunk in segger.get_chunks(content):
        for token in chunk.split():
            if 3 < len(token) < 30:
                tally.update(token.lower().split())

    # Each word is UTF-8 encoded, trimmed, and stripped of the characters
    # , ! @ # $ " so that it cannot clash with the comma separator or the
    # surrounding double quotes.
    # NOTE(review): the .encode() call presumes the segmenter yields
    # unicode text -- confirm.
    pieces = [
        entry.encode('utf-8').strip().translate(None, ',!@#$"') + ","
        for entry, _ in tally.most_common(5)
    ]
    # Trailing separators are dropped before the closing quote is added.
    return '"' + "".join(pieces).rstrip(',') + '"'
def linkCounter(content):
    """Return how many ``<a>`` tags carrying an ``href`` appear in ``content``."""
    parsed = BeautifulSoup.BeautifulSoup(content)
    anchors = parsed.findAll('a', href=True)
    return len(anchors)
def runmonth(monthid):
# stuff...
here = os.path.dirname(__file__)
folder = os.path.join(here, "../results/{0}/html".format(monthid))
sitecount = 0;
con = lite.connect('speedyplus.db')
cur = con.cursor()
db = SpeedyDb()
sites = db.getSites()
for site in sites:
siteName = site[1]
siteFile = "{0}\\{1}.html".format(folder, siteName)
print "{0:<3} {1:<25}".format(site[0], site[1]),
if os.path.exists(siteFile):
sitecount = sitecount + 1
print "{0:25}".format(os.path.split(siteFile)[1]),
fo = open(siteFile, 'r')
content = fo.read()
trendyness = GetTheTrendy(content)
linkcount = linkCounter(content)
words = CountTheWords(content)
fo.close()
sql = trendlySql_insert.format(site[0], monthid, trendyness, linkcount, words)
# print sql
cur.execute(sql)
con.commit()
print '{0:<2} {1:<4} {2}'.format(trendyness, linkcount, words),
print '.'
print ''
for i in range(len(trends)):
print '{0:<30}: {1}\t{2:.0%}'.format(trends[i], trendcounts[i], percentage(trendcounts[i],sitecount))
for word, count in c.most_common(100):
print word, count
def main(argv):
monthid = 0
try:
opts, args = getopt.getopt(argv, "m:", ['month'])
except getopt.GetoptError:
print 'grabby.py -m <monthId>'
sys.exit(2)
for opt, arg in opts:
if opt in ('-m', '--month'):
monthid = arg
print 'MonthId [', monthid , ']'
if monthid != 0:
runmonth(monthid)
if __name__ == '__main__':
    # Script entry point: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)