-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscretevectorconll.py
executable file
·43 lines (35 loc) · 1.6 KB
/
discretevectorconll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/local/bin/pypy
import sys
from conll import open2
def read_vectors(lines):
    """Build a form -> vector-components lookup from tab-separated lines.

    Each non-blank line is stripped and split on tabs: the first field is the
    lookup key (later matched against a word's form) and the remaining fields
    are kept, in order, as a list of strings. Blank lines are skipped. If a
    key occurs more than once, the last occurrence wins.

    :param lines: iterable of text lines (e.g. an open file object).
    :returns: dict mapping key -> list of component strings.
    """
    lookup = {}
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise register an empty-string key.
            continue
        fields = stripped.split('\t')
        lookup[fields[0]] = fields[1:]
    return lookup


def merge_feats(existing, values, replace):
    """Return the FEATS value after adding vector components to it.

    Components are rendered as ``F<index>=<value>`` and joined with ``|``.

    :param existing: current FEATS value; may be empty or None.
    :param values: sequence of component strings for the word.
    :param replace: when true, discard ``existing`` instead of extending it.
    :returns: the new FEATS value; ``existing`` unchanged if ``values`` is
        empty, so that no dangling ``|`` is produced.
    """
    if not values:
        return existing
    rendered = "|".join("F%d=%s" % (i, v) for i, v in enumerate(values))
    # Extend only when there is something to extend AND replacement was not
    # requested; in every other case the vector features are the whole field.
    # NOTE(review): if the conll module keeps a "_" placeholder for an empty
    # FEATS column, that placeholder counts as existing content here - confirm.
    if existing and not replace:
        return existing + "|" + rendered
    return rendered


def main(argv=None):
    """Annotate a CoNLL file with per-form vectors and print it to stdout.

    Reads the vector file into a lookup keyed by word form, then walks every
    word of every sentence yielded by ``open2``. Words whose form is in the
    lookup get their FEATS extended/replaced with ``F<i>=<v>`` entries, or
    (for ``--target LEMMA``) their lemma set to the single vector value.
    Every word is written as ``str(word)`` on its own line, with one empty
    line after each sentence.

    :param argv: argument list to parse; None means ``sys.argv[1:]``.
    :raises ValueError: if ``--target LEMMA`` is used and a matched form does
        not have exactly one value in the vector file.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='CoNLL file transformer')
    parser.add_argument('file', help='CoNLL file to read')
    parser.add_argument('vectorfile', help='Vector file to be used')
    parser.add_argument('--target', type=str, default='FEATS',
                        choices=['FEATS', 'LEMMA'],
                        help="CoNLL file field to be replaced/extended")
    parser.add_argument('--replace', action='store_true', default=False,
                        help="Replace/Extend the relevant field.")
    args = parser.parse_args(argv)

    with open(args.vectorfile) as vf:
        vlookup = read_vectors(vf)

    with open2(args.file) as cf:
        for sentence in cf:
            for word in sentence:
                values = vlookup.get(word._form)
                if values is not None:
                    if args.target == 'FEATS':
                        word._feats = merge_feats(
                            word._feats, values, args.replace)
                    else:
                        # A lemma is a single string, so the vector file must
                        # supply exactly one value for this form.
                        if len(values) != 1:
                            raise ValueError(
                                "expected exactly one value for %r, got %d"
                                % (word._form, len(values)))
                        word._lemma = values[0]
                # Words without a vector entry are passed through unchanged.
                sys.stdout.write(str(word) + "\n")
            # Empty line terminates the sentence.
            sys.stdout.write("\n")


if __name__ == "__main__":
    main()