#======================================================
#===================clean.py===========================
#======================================================
import re

import nltk
from nltk.corpus import stopwords

_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    # Lowercase before lemmatizing: the WordNet lookup is case-sensitive,
    # so capitalised words would otherwise pass through unlemmatized.
    return _wnl.lemmatize(w.lower())

def get_tokenized_lemmas(s):
    # Tokenize a string and lemmatize every token.
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
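
# A quick sanity check of the helpers (hypothetical example; assumes the NLTK
# 'punkt' and 'wordnet' data packages are installed):
#   get_tokenized_lemmas("Dogs are running?")  ->  ['dog', 'are', 'running', '?']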

# This clean function is used for term-frequency calculation, so question
# marks and short words (like "not", "no") are kept.
def clean(s):
    # Clean the string: lowercase and keep only alphanumeric "words".
    cleaned_text = " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
    # Get rid of the numbers.
    cleaned_text = re.sub(r'\d+', '', cleaned_text, flags=re.UNICODE)
    # Pick out the question marks (they were stripped by \w+ above).
    question_marks = " ".join(re.findall(r'\?', s, flags=re.UNICODE))
    # Combine the question marks and the cleaned string.
    cleaned_text_with_question_marks = " ".join([cleaned_text, question_marks])
    # Tokenize and lemmatize.
    return get_tokenized_lemmas(cleaned_text_with_question_marks)
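
# Example (hypothetical input; digits are dropped, the question mark survives):
#   clean("What a nice day? 42")  ->  ['what', 'a', 'nice', 'day', '?']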

# This clean function is used for word2vec: stop words and short words are removed.
def clean_delete_stopwords(s):
    cleaned_text = " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
    # Get rid of the numbers.
    cleaned_text = re.sub(r'\d+', '', cleaned_text, flags=re.UNICODE)
    # Delete short words (length 3 or less).
    cleaned_text = ' '.join(word for word in cleaned_text.split() if len(word) > 3)
    # Tokenize and lemmatize.
    lemmas = get_tokenized_lemmas(cleaned_text)
    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    return [w for w in lemmas if w not in stop_words]
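
# Example (hypothetical input; assumes the standard NLTK English stop-word
# list, which contains "over" and "the"):
#   clean_delete_stopwords("The quick brown foxes jumped over 2 lazy dogs")
#       ->  ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']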

def clean_for_mutual_information(s):
    cleaned_text = " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
    # Get rid of the numbers.
    cleaned_text = re.sub(r'\d+', '', cleaned_text, flags=re.UNICODE)
    # Delete single-character words.
    cleaned_text = ' '.join(word for word in cleaned_text.split() if len(word) > 1)
    # Tokenize and lemmatize (note: duplicate words are NOT removed here;
    # deduplicate downstream if the mutual-information code needs it).
    return get_tokenized_lemmas(cleaned_text)
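
# A minimal usage sketch, not part of the pipeline: it runs all three cleaners
# on a made-up sentence. The download calls fetch the data that word_tokenize,
# the lemmatizer and the stop-word list need (newer NLTK releases may also
# require the 'punkt_tab' package).
if __name__ == "__main__":
    for pkg in ('punkt', 'wordnet', 'stopwords'):
        nltk.download(pkg, quiet=True)  # no-op if the package is already present
    sample = "Are the 3 reports real? Nobody knows."
    print(clean(sample))
    print(clean_delete_stopwords(sample))
    print(clean_for_mutual_information(sample))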