-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathword_freq_improved.py
67 lines (63 loc) · 3.23 KB
/
word_freq_improved.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'\w+')
# with open('microsoft.txt','r+',encoding="utf-8") as file:
# text = file.read()
# text = '''
# Huawei Technologies founder and CEO Ren Zhengfei said on Thursday the Chinese company is willing to license its Ren told reporters he was not afraid of creating a rival by making Huawei's technology available to competitors, and the offer could also include chip design know-how.Huawei, the world's largest telecoms gear maker, has been on a US trade blacklist since May over concerns that its equipment could be used by Beijing to spy. Huawei has repeatedly denied such allegations.The sanctions cut off Huawei's access to essential US technologies. The latest version of its Mate 30 flagship phone, unveiled last week in Europe, will not come with Google Mobile Services.Ren's remarks come after he said this month that he is open to selling the firm's 5G technology - including patents, code, blueprints and production know-how - to Western firms for a one-off fee.The offer to license out 5G technology marks the latest attempt by Huawei, also the world's No.2 smartphone vendor, to minimise the impact of the US trade ban. It expects a drop of some $10bn in revenue from its phone business this year.
# '''
def word_freq_improved_summarize(text: str, max_chars: int = 240) -> str:
    """Build an extractive summary of *text* from its highest-scoring sentences.

    Sentences are obtained by splitting *text* on ``'.'``. Every token that is
    not an English stop word gets a weight equal to its count divided by the
    count of the most frequent such token. A sentence's score is the sum of
    the weights of its tokens, multiplied by a position factor that is 1.0
    for a sentence at the middle index and rises to 1.1 for the first
    sentence. Sentences are then taken in descending score order while the
    summary stays shorter than ``max_chars`` characters.

    Args:
        text: The passage to summarise.
        max_chars: Exclusive upper bound on the summary length, in characters.

    Returns:
        The selected sentences concatenated in score order, exactly as they
        appear between periods in *text* (the periods removed by the split
        are not restored). An empty string is returned when *text* contains
        no tokens other than stop words.
    """
    sentences = text.split('.')

    # Tokenise each lower-cased sentence once. The same tokens feed both the
    # frequency table and the sentence scores, so a word such as "know-how"
    # (two tokens) is scored with exactly the keys it was counted under.
    tokens_by_sentence = [token.tokenize(s.lower()) for s in sentences]

    stop_words = set(stopwords.words('english'))
    counts = Counter(
        word
        for tokens in tokens_by_sentence
        for word in tokens
        if word not in stop_words
    )
    if not counts:
        # Only stop words / punctuation: there is nothing to rank, and
        # max() below would raise on an empty sequence.
        return ""

    # Normalise counts so the most frequent word has weight 1.0.
    max_freq = max(counts.values())
    weights = {word: count / max_freq for word, count in counts.items()}

    # Keyed by sentence text: a repeated sentence keeps the score of its
    # last occurrence.
    scores = {}
    middle = len(sentences) / 2
    for index, (sentence, tokens) in enumerate(
        zip(sentences, tokens_by_sentence)
    ):
        # Stop words are absent from `weights`, so they contribute 0.
        raw_score = sum(weights.get(word, 0) for word in tokens)
        # Sentences far from the middle of the text get up to a 10% boost.
        scores[sentence] = raw_score * (1 + 0.1 * abs(middle - index) / middle)

    # sorted() is stable, so equally scored sentences stay in text order.
    ranked = sorted(scores, key=scores.get, reverse=True)

    chosen = []
    length = 0
    for sentence in ranked:
        if length + len(sentence) < max_chars:
            chosen.append(sentence)
            length += len(sentence)
        elif length:
            # The summary already has content and the next-best sentence
            # does not fit: stop here.
            break
        # Otherwise nothing has been picked yet and this sentence alone is
        # too long, so skip it and try the next-best one.
    return "".join(chosen)
if __name__ == "__main__":
    # The passage is only read, so open it read-only; 'r+' would also
    # require write permission on the file.
    with open('./passages/harmonyos.txt', 'r', encoding="utf-8") as file:
        text = file.read()
    # Show the summary; without this the script produces no output.
    print(word_freq_improved_summarize(text))