-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtext_functions.py
68 lines (52 loc) · 2.05 KB
/
text_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from json import load
import re
def remove_duplicates(text):
    """Normalise the strings in ``text`` and drop repeats, keeping first-seen order.

    Entries that are empty or whitespace-only are skipped. Every remaining
    entry is lower-cased, passed through ``clean_text`` with its default
    replacements and stripped; each distinct resulting string appears once
    in the returned list.
    """
    unique = {}
    for entry in text:
        if not entry.strip():
            continue
        normalised = clean_text(entry.lower()).strip()
        # Dict keys preserve insertion order, so the first occurrence wins.
        unique.setdefault(normalised, None)
    return list(unique)
def clean_text(text, replacement_string=None):
    """Apply a series of substring substitutions to ``text`` and return the result.

    ``replacement_string`` is a mapping of substring -> substitute. When it
    is ``None`` the built-in table is used, which deletes spaces and newlines
    and turns parentheses into spaces. Substitutions run one after another
    in the mapping's iteration order, so later entries operate on the output
    of earlier ones.
    """
    if replacement_string is None:
        table = {" ": "", "\n": "", "(": " ", ")": " "}
    else:
        # An explicitly supplied mapping (even an empty one) is used as-is.
        table = replacement_string

    cleaned = text
    for old in table:
        cleaned = cleaned.replace(old, table[old])
    return cleaned
def is_number(text):
    """Report whether ``text`` is a 10- to 14-digit number once separators are removed.

    Spaces, hyphens, plus signs, newlines, tabs and both kinds of slash are
    deleted via ``clean_text``; any whitespace still left is then squeezed
    out. Returns ``True`` only if what remains consists solely of digits and
    is between 10 and 14 characters long, otherwise ``False``.
    """
    separators = dict.fromkeys((" ", "-", "+", "\n", "\t", "/", "\\"), "")
    digits = "".join(clean_text(text, separators).split())
    if not digits.isdigit():
        return False
    return 10 <= len(digits) <= 14
def get_info(text, existing_dict=None, filename="regex_lookup.json"):
    """Extract regex-matched fields from ``text`` into a dict of lists.

    The JSON file at ``filename`` is loaded as a mapping of key -> regular
    expression. Each pattern is searched case-insensitively in ``text``;
    when it matches, the cleaned, de-duplicated matches are appended to the
    list stored under that key. Entries under the ``"Phone-Numbers"`` key are
    additionally filtered with ``is_number``.

    Args:
        text: The string to search.
        existing_dict: Optional dict of earlier results. A non-empty dict is
            updated in place and returned, so results accumulate across
            calls; ``None`` or an empty dict starts a fresh result.
        filename: Path of the JSON lookup file.

    Returns:
        A dict mapping every key whose pattern matched (plus any keys already
        present in a non-empty ``existing_dict``) to a list of normalised
        strings.

    Raises:
        OSError: If ``filename`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        re.error: If a pattern in the file is not a valid regular expression.
    """
    # Close the lookup file deterministically instead of leaking the handle.
    with open(filename) as lookup_file:
        patterns = load(lookup_file)

    # A non-empty existing_dict is deliberately reused rather than copied, so
    # callers that rely on in-place accumulation keep working.
    result_dict = existing_dict if existing_dict else {}

    for key, pattern in patterns.items():
        matches = [
            clean_text(found.group().strip())
            for found in re.finditer(pattern, text, re.IGNORECASE)
        ]
        if not matches:
            continue

        entries = result_dict.get(key, []) + remove_duplicates(matches)
        if key == "Phone-Numbers":
            # Keep only the entries that look like phone numbers.
            entries = [entry for entry in entries if is_number(entry)]
        entries = remove_duplicates(entries)
        if existing_dict:
            # Caller-supplied entries have been normalised only once so far:
            # the first pass turns parentheses into spaces, and a second pass
            # is needed to remove those spaces again.
            entries = remove_duplicates(entries)
        result_dict[key] = entries

    return result_dict
if __name__ == "__main__":
    # Manual smoke run: extract fields from a sample message and pretty-print
    # the resulting dict (reads the default lookup file used by get_info).
    from pprint import pprint

    sample_message = """hello oxygen plasma (A+, A-,o-, AB+) and cylinder concentrator required
age 22 0132-2205379/78 +91-9763484463 asap verified needed hs@test.in ju2@hulu.com"""
    extracted = get_info(sample_message)
    pprint(extracted)