-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathcommon.py
44 lines (36 loc) · 1.47 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
from collections import OrderedDict
from itertools import chain, tee
from functools import reduce
class FormatError(Exception):
pass
# TODO: Unicode support
TOKENIZATION_REGEXS = OrderedDict([
# NERsuite-like tokenization: alnum sequences preserved as single
# tokens, rest are single-character tokens.
('default', re.compile(r'([^\W_]+|.)')),
# Finer-grained tokenization: also split alphabetical from numeric.
('fine', re.compile(r'([0-9]+|[^\W0-9_]+|.)')),
# Whitespace tokenization
('space', re.compile(r'(\S+)')),
])
# adapted from http://docs.python.org/library/itertools.html#recipes
def pairwise(iterable, include_last=False):
"s -> (s0,s1), (s1,s2), (s2, s3), ..."
a, b = tee(iterable)
next(b, None)
if not include_last:
return list(zip(a, b))
else:
return list(zip(a, chain(b, (None, ))))
# from http://programmaticallyspeaking.com/split-on-separator-but-keep-the-separator-in-python.html
def split_keep_separator(s, sep='\n'):
return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep
else acc + [elem], re.split("(%s)" % re.escape(sep), s), [])
def sentence_to_tokens(text, tokenization_re=None):
"""Return list of tokens in given sentence using NERsuite tokenization."""
if tokenization_re is None:
tokenization_re = TOKENIZATION_REGEXS.get('default')
tok = [t for t in tokenization_re.split(text) if t]
assert ''.join(tok) == text
return tok