forked from Koredotcom/KnowledgeGraphGenerator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStringProcessor.py
155 lines (146 loc) · 5.29 KB
/
StringProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import re
class StringProcessor(object):
    """Language-specific string clean-up helpers.

    Instances expose two attributes built in ``__init__``:

    * ``contractions_dict`` -- lower-case English contraction -> expansion.
    * ``contractions_re``   -- compiled, case-insensitive pattern that
      matches any key of ``contractions_dict`` as a whole token.
    """

    def __init__(self):
        """Build the contraction table and the regex that matches its keys."""
        self.contractions_dict = {
            "can't've": "cannot have",
            "couldn't've": "could not have",
            "hadn't've": "had not have",
            "he'd've": "he would have",
            "he'll've": "he will have",
            "how'd'y": "how do you",
            "i'd've": "i would have",
            "i'll've": "i will have",
            "it'd've": "it would have",
            "it'll've": "it will have",
            "mightn't've": "might not have",
            "mustn't've": "must not have",
            "needn't've": "need not have",
            "oughtn't've": "ought not have",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd've": "she would have",
            "she'll've": "she will have",
            "shouldn't've": "should not have",
            "that'd've": "that would have",
            "there'd've": "there would have",
            "they'd've": "they would have",
            "they'll've": "they will have",
            "we'd've": "we would have",
            "we'll've": "we will have",
            "what'll've": "what will have",
            "who'll've": "who will have",
            "won't've": "will not have",
            "wouldn't've": "would not have",
            "y'all'd": "you all would",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd've": "you would have",
            "you'll've": "you will have",
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'll": "he will",
            "he's": "he is",
            "how'd": "how did",
            "how'll": "how will",
            "how's": "how is",
            "i'd": "i would",
            "i'll": "i will",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it would",
            "it'll": "it will",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "must've": "must have",
            "mustn't": "must not",
            "needn't": "need not",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "shan't": "shall not",
            "she'd": "she would",
            "she'll": "she will",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "so've": "so have",
            "so's": "so is",
            "that'd": "that had",
            "that's": "that is",
            "there'd": "there would",
            "there's": "there is",
            "they'd": "they would",
            "they'll": "they will",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we would",
            "we'll": "we will",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "would've": "would have",
            "wouldn't": "would not",
            "y'all": "you all",
            "you'd": "you would",
            "you'll": "you will",
            "you're": "you are",
            "you've": "you have",
        }
        # Regex alternation takes the first alternative that matches, so the
        # longest keys must come first ("can't've" before "can't").  Sorting
        # makes that explicit instead of relying on dict insertion order.
        alternatives = sorted(self.contractions_dict, key=len, reverse=True)
        # The look-arounds keep a key from matching inside a longer word
        # (e.g. "let's" inside "tablet's").  `\b` is not usable here because
        # one key ("'cause") starts with a non-word character.
        self.contractions_re = re.compile(
            r"(?<!\w)(%s)(?!\w)" % "|".join(
                re.escape(key) for key in alternatives),
            re.IGNORECASE)

    def _expand_contractions(self, input_string):
        """Expand standard English contractions found in ``input_string``.

        Matching is case-insensitive; the replacement text is always the
        lower-case expansion stored in ``contractions_dict``.

        Returns the expanded string.  If ``input_string`` is not something
        the regex can process (e.g. ``None``), it is returned unchanged.
        """
        def replace(match):
            """Return the expansion for one matched contraction."""
            matched = match.group(0)
            # Case-insensitive matching can accept characters whose
            # lower-case form is not the ASCII letter used in the key; in
            # that case keep the text as-is rather than failing the call.
            return self.contractions_dict.get(matched.lower(), matched)

        try:
            return self.contractions_re.sub(replace, input_string)
        except TypeError:
            # Best-effort behaviour: hand non-string input back untouched.
            return input_string

    def normalize(self, input_string, language_code):
        """Lower-case ``input_string`` and, for English, clean it further.

        For ``language_code == 'en'`` contractions are expanded and every
        run of non-word characters is collapsed into a single space.  For
        any other language code only the lower-casing is applied.
        """
        return_string = input_string.lower()
        if language_code == 'en':
            expanded_string = self._expand_contractions(return_string)
            # A second pass resolves stacked forms whose remainder only
            # becomes a known contraction after the first expansion
            # (e.g. "y'all'd've" -> "you all would've" -> "... would have").
            if "'" in expanded_string:
                expanded_string = self._expand_contractions(expanded_string)
            # NOTE(review): kept inside the English branch because
            # `expanded_string` is only bound here -- confirm this matches
            # the intended behaviour for other languages.
            return_string = re.sub(
                r'\W+',
                ' ',
                expanded_string)  # Remove non-alphanumeric characters
        return return_string