-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
53 lines (38 loc) · 1.14 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import platform
import re
import math
def get_line_separator():
    """Return the line terminator to use on the current platform.

    Returns:
        str: CRLF when ``platform.system()`` reports ``'Windows'``;
        LF for every other reported system name.
    """
    # Every non-Windows system name maps to LF, so a single conditional
    # covers all cases.
    return '\r\n' if platform.system() == 'Windows' else '\n'
def count_file(path):
    """Count the files found under *path*, descending into subdirectories.

    Args:
        path: Root directory handed to ``os.walk``.

    Returns:
        int: Total number of file entries yielded by the walk. Directories
        themselves are not counted. The result is 0 when the walk yields
        nothing.
    """
    return sum(len(files) for _root, _dirs, files in os.walk(path))
def get_file_list(path):
    """Collect the paths of all directories and files found under *path*.

    Args:
        path: Root directory handed to ``os.walk``.

    Returns:
        list: Paths built with ``os.path.join(root, name)``. For each
        directory visited by the walk, its subdirectories are listed first,
        followed by its files. The root *path* itself is not included.
    """
    entries = []
    for root, subdirs, files in os.walk(path):
        # Concatenate into a new list so the ``subdirs`` list that os.walk
        # uses to steer the traversal is left untouched.
        entries.extend(os.path.join(root, name) for name in subdirs + files)
    return entries
# Compiled once at import time. Written as a raw string so the regex escapes
# (\s, \., \!, \/, \-) reach the ``re`` module verbatim; in a normal string
# literal they are invalid string escapes and trigger a warning on newer
# Python versions. The set of matched characters is unchanged.
_PUNCTUATION_RE = re.compile(
    r"[\s+\.\!\/_,$%^*()?\[\]\"\' ]+|[<>〉《》;:\-【 】●“”+—!,\r\n。:?、~@#¥%…&*()]+"
)


def filter_text(source_text):
    """Strip whitespace and the listed punctuation/symbols from a string.

    Args:
        source_text: The text to clean.

    Returns:
        str: *source_text* with every run of characters matched by
        ``_PUNCTUATION_RE`` removed. All other characters are kept in their
        original order.
    """
    return _PUNCTUATION_RE.sub('', source_text)
def isEnglish(word):
    """Return True when every character of *word* is an ASCII letter.

    The previous implementation compared code points with strict
    inequalities (``97 < ord(c) < 122`` / ``65 < ord(c) < 90``), which
    wrongly rejected the boundary letters 'a', 'z', 'A' and 'Z'. The ranges
    are now inclusive.

    Args:
        word: The string to check.

    Returns:
        bool: True if all characters are in ``a-z`` or ``A-Z``. An empty
        string yields True, because ``all()`` over no characters is True.
    """
    return all('a' <= c <= 'z' or 'A' <= c <= 'Z' for c in word)
def notChinese(word):
    """Report whether *word* passes any of three checks.

    The checks are tried in order and the first success wins:

    1. ``word.isdigit()`` is true;
    2. the whole word matches ``^[0-9A-Za-z]+$``;
    3. ``isEnglish(word)`` is true.

    Args:
        word: The string to classify.

    Returns:
        bool: True if any check succeeds, otherwise False.
    """
    if word.isdigit():
        return True
    if re.match(r"^[0-9A-Za-z]+$", word) is not None:
        return True
    return isEnglish(word)
# Ad-hoc check executed at import time: the sample word mixes ASCII letters
# with non-ASCII characters, so none of notChinese's checks should succeed.
print(notChinese("pladaily中国"))
# Worked example using hard-coded numbers: 160 total files and the constant 6.
# NOTE(review): the names suggest a TF-IDF weight where 6 is both the term
# count and the number of files containing the term — confirm intent.
total_files =160
# 6 divided by the total file count (true division under Python 3).
tf = 6 / total_files
# Natural logarithm of the total file count divided by 6.
idf = math.log(total_files /6)
# Product of the two values above; printed on import.
w = tf * idf
print(w)