-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetokenizer.py
33 lines (29 loc) · 1.1 KB
/
detokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
def detokenize_quotmarks(sentence):
    """Attach straight double quotes to the text they enclose.

    Quote characters are treated as strictly alternating: the first ``"``
    opens a quotation, the next one closes it, and so on.  A single space
    directly after an opening quote and a single space directly before a
    closing quote are removed; every other character is kept unchanged.

    Args:
        sentence: The tokenized sentence to process.

    Returns:
        The sentence with the spaces just inside each quote pair removed.
    """
    chars = []
    inside_quotes = False    # True between an opening and a closing quote
    skip_next_space = False  # True right after an opening quote
    for char in sentence:
        if char == '"':
            if inside_quotes:
                # Closing quote: drop the one space sitting in front of it.
                if chars and chars[-1] == ' ':
                    chars.pop()
                # An empty quotation ("") must not leave the flag armed,
                # otherwise the space after the closing quote would be lost.
                skip_next_space = False
            else:
                # Opening quote: the space that follows it will be dropped.
                skip_next_space = True
            inside_quotes = not inside_quotes
            chars.append(char)
            continue
        if not (skip_next_space and char == ' '):
            chars.append(char)
        # Only the character directly after an opening quote may be skipped.
        skip_next_space = False
    return ''.join(chars)
# Detokenize every line of source_sentences.txt and write the results to
# source_sentences_detokenized.txt, separated by newlines (no trailing one).
detokenized_sentences = []
with open('source_sentences.txt', 'r', encoding='utf-8') as f_tokenized:
    with open('source_sentences_detokenized.txt', 'w', encoding='utf-8') as f_detokenized:
        sentences = f_tokenized.read().splitlines()
        for sent in sentences:
            # Remove the whitespace character in front of . , ? ! : ; and ).
            tightened = re.sub(r'\s([.,?!:;)])', r'\1', sent)
            # Remove the whitespace character right after an opening (.
            tightened = re.sub(r'([(])\s', r'\1', tightened)
            # Finally pull double quotes up against the quoted text.
            sent = detokenize_quotmarks(tightened)
            detokenized_sentences.append(sent)
        f_detokenized.write('\n'.join(detokenized_sentences))