-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdesegmentors.py
39 lines (31 loc) · 1.14 KB
/
desegmentors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def desegmentword(orig_word: str) -> str:
"""
Word segmentor that takes a Farasa Segmented Word and removes the '+' signs
Example:
>>> desegment("ال+يومي+ة")
اليومية
"""
word = orig_word.replace("ل+ال+","لل")
if "ال+ال" not in orig_word:
word = word.replace("ل+ال","لل")
word = word.replace("+","")
word = word.replace("للل","لل")
return word
def desegment_line(line: str ,sep: str = ' ') -> str:
"""
Simple wrapper over `desegmentword` that splits a string by the sep character
"""
return " ".join([desegmentword(word) for word in line.split(sep)])
def desegment_arabert(line: str) -> str:
"""
Use this function if sentence tokenization was done using
`from arabert.preprocess_arabert import preprocess` with Farasa enabled
AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
and after before the '+' for suffixes
Example:
>>> desegment_arabert('ال+ دراس +ات')
الدراسات
"""
line = line.replace("+ ","+")
line = line.replace(" +","+")
return desegment_line(line)