preprocess.py
import dataclasses
import pickle
import re
from typing import List, Iterator

import pandas as pd
from hazm import word_tokenize, stopwords_list, Lemmatizer
from parsivar import FindStems, Normalizer

# Characters considered noise. This list is not referenced in this module;
# punctuation removal is handled by the regex in PreProcess.remove_punctuations.
bad_chars = ['/', '//', '\\', '\\/', '@', '$', '%', '^', '&', '*', '(', ')', '[', ']', '!', '#',
             '{', '}', '~', '+', ',', '=']


class Statistic:
    """Corpus-level statistics, pickled to disk so they can be reused between runs."""

    @dataclasses.dataclass
    class Item:
        tokens: int
        vocab_size: int

    items = {}

    @classmethod
    def save(cls, to_path):
        with open(to_path, 'wb') as handle:
            pickle.dump(cls.items, handle, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, from_path):
        with open(from_path, 'rb') as handle:
            cls.items = pickle.load(handle)


class NewsData:
    """Iterates over news documents stored in an Excel sheet with 'content', 'title' and 'url' columns."""

    def __init__(self, path):
        self.path = path

    def iter_items(self) -> Iterator[dict]:
        df = pd.read_excel(self.path)
        for doc_id, (_, row) in enumerate(df.iterrows()):
            yield {
                'content': row['content'],
                'title': row['title'],
                'url': row['url'],
                'doc_id': doc_id,
            }


@dataclasses.dataclass
class Token:
    posting: int
    doc_id: int
    word: str

    def __repr__(self):
        return f'{self.doc_id}:{self.posting}:{self.word}'

    def __hash__(self):
        # Hash by word only, so tokens of the same term hash alike regardless of position.
        return hash(self.word)


class PreProcess:
    # Running total of tokens kept across all processed documents.
    all_tokens_count = 0

    def __init__(self):
        self.stemmer = FindStems()
        self.lemmatizer = Lemmatizer()
        # A set makes the stopword lookup O(1) per token.
        self.stopwords = set(stopwords_list())

    def start(self, text, doc_id) -> List[Token]:
        """Normalize, tokenize, remove stopwords and stem one document, returning positional tokens."""
        text = self._normalize_text(text)
        token_list = word_tokenize(text)
        token_list = [Token(doc_id=doc_id, posting=i, word=word) for i, word in enumerate(token_list)]
        token_list = self.remove_stopwords(token_list)
        token_list = self.stem(token_list)
        PreProcess.all_tokens_count += len(token_list)
        return token_list

    @staticmethod
    def remove_punctuations(text):
        # Strip Latin and Persian punctuation. The hyphen is escaped so it is matched
        # literally instead of creating an accidental '!'-'_' character range.
        return re.sub(r'[:;?!\-_.,/()،؛~%\\»«٪”“…<>؟$《》═=&|′‘#@+*^"\'{} ]', ' ', text)

    @staticmethod
    def remove_links(text):
        # Drop http(s), t.me and www links before tokenization.
        return re.sub(r'(https?|t\.me|www\.)(\S+)\s?', '', text, flags=re.MULTILINE | re.IGNORECASE)

    def _normalize_text(self, txt: str):
        my_normalizer = Normalizer(statistical_space_correction=True)
        txt = my_normalizer.normalize(txt)
        txt = self.remove_links(txt)
        txt = self.remove_punctuations(txt)
        return txt

    def stem(self, token_list: List[Token]) -> List[Token]:
        for token in token_list:
            token.word = self.stemmer.convert_to_stem(token.word)
            # token.word = self.lemmatizer.lemmatize(token.word)
        return token_list

    def remove_stopwords(self, token_list: List[Token]) -> List[Token]:
        return [token for token in token_list
                if token.word not in self.stopwords and len(token.word) > 1]


if __name__ == '__main__':
    text = 'سلام'  # Persian for "hello"; a minimal smoke test of the pipeline.
    x = PreProcess().start(text, doc_id=2)
    print(x)
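    # A minimal end-to-end sketch showing how NewsData, PreProcess and Statistic are
    # meant to fit together, assuming an Excel file with the 'content', 'title' and
    # 'url' columns that NewsData.iter_items reads. 'news.xlsx' and 'statistics.pkl'
    # are hypothetical paths used only for illustration, so the block is skipped when
    # the spreadsheet is absent.
    import os
    if os.path.exists('news.xlsx'):
        preprocess = PreProcess()
        vocabulary = set()
        for item in NewsData('news.xlsx').iter_items():
            tokens = preprocess.start(item['content'], doc_id=item['doc_id'])
            vocabulary.update(token.word for token in tokens)
        Statistic.items['corpus'] = Statistic.Item(tokens=PreProcess.all_tokens_count,
                                                   vocab_size=len(vocabulary))
        Statistic.save('statistics.pkl')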