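"""Convert the ATIS pickle datasets into Rasa NLU training/test JSON files.

Overview (inferred from the code below): each pickle holds a dataset dict
('query', 'slot_labels', 'intent_labels') plus name<->id dictionaries
('token_ids', 'slot_ids', 'intent_ids'). IOB slot tags are decoded into
character offsets so entities can be emitted in Rasa's JSON format.
"""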
import copy
import json
import os
import pickle

from tokenizer_tools.tagset.NER.IOB import IobSequenceEncoderDecoder

# Decodes IOB tag sequences (B-*/I-*/O) into entity offsets.
decoder = IobSequenceEncoderDecoder()

DATA_DIR = "data/raw_data/ms-cntk-atis"

print(os.listdir(DATA_DIR))
def load_ds(fname='atis.train.pkl'):
    """Load one ATIS pickle file; return the dataset dict and the id dictionaries."""
    with open(fname, 'rb') as stream:
        ds, dicts = pickle.load(stream)
    print('Done loading: ', fname)
    print('      samples: {:4d}'.format(len(ds['query'])))
    print('   vocab_size: {:4d}'.format(len(dicts['token_ids'])))
    print('   slot count: {:4d}'.format(len(dicts['slot_ids'])))
    print(' intent count: {:4d}'.format(len(dicts['intent_ids'])))
    return ds, dicts
train_ds, dicts = load_ds(os.path.join(DATA_DIR, 'atis.train.pkl'))
# Note: this second call overwrites dicts; the two splits are assumed to
# share the same id dictionaries.
test_ds, dicts = load_ds(os.path.join(DATA_DIR, 'atis.test.pkl'))

# Forward dictionaries map names to ids; invert them for id -> name lookups.
t2i, s2i, in2i = map(dicts.get, ['token_ids', 'slot_ids', 'intent_ids'])
i2t, i2s, i2in = map(lambda d: {v: k for k, v in d.items()}, [t2i, s2i, in2i])

query, slots, intent = map(train_ds.get,
                           ['query', 'slot_labels', 'intent_labels'])
# Preview the first few training examples with their intents and slot tags.
for i in range(5):
    print('{:4d}:{:>15}: {}'.format(i, i2in[intent[i][0]],
                                    ' '.join(map(i2t.get, query[i]))))
    for j in range(len(query[i])):
        print('{:>33} {:>40}'.format(i2t[query[i][j]],
                                     i2s[slots[i][j]]))
    print('*' * 74)
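
# Illustrative example (assumed I/O shape, based on how decode_to_offset is
# used below): a tag sequence such as
#     ['O', 'B-fromloc.city_name', 'O']
# would decode to offsets like [(1, 2, 'fromloc.city_name')], i.e.
# (token_start, token_end_exclusive, entity_type) over the raw token list.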
def to_rasa_nlu_format(query, slots, intent, output_file):
    """Write one dataset split as a Rasa NLU JSON file.

    For each example, BOS/EOS tokens are stripped from the text, and the IOB
    slot tags are decoded into character-offset entities over the joined text.
    """
    tpl = {
        "rasa_nlu_data": {
            "common_examples": [],
            "regex_features": [],
            "lookup_tables": [],
            "entity_synonyms": []
        }
    }
    example_tpl = {
        "text": "",
        "intent": "",
        "entities": []
    }
    entity_tpl = {
        "start": 0,
        "end": 0,
        "value": "",
        "entity": ""
    }

    data = copy.deepcopy(tpl)
    for i in range(len(query)):
        example = copy.deepcopy(example_tpl)

        raw_query_text_list = query[i]
        query_text_list = raw_query_text_list[1:-1]  # drop the BOS and EOS tokens
        example['text'] = ' '.join(map(i2t.get, query_text_list))
        example['intent'] = i2in[intent[i][0]]

        # Decode the IOB tag sequence into entity offsets in token index space.
        tag_seq = [i2s[j] for j in slots[i]]
        offset_list = decoder.decode_to_offset(tag_seq)

        entities_list = []
        for offset in offset_list:
            entity = copy.deepcopy(entity_tpl)
            # Character offset of the entity in the joined text: the length of
            # the space-joined prefix tokens plus one separating space; the
            # prefix is empty when the entity starts at the first real token.
            prefix = ' '.join(map(i2t.get, raw_query_text_list[1:offset[0]]))
            value = ' '.join(map(i2t.get, raw_query_text_list[offset[0]:offset[1]]))
            entity['start'] = len(prefix) + 1 if prefix else 0
            entity['end'] = entity['start'] + len(value)
            entity['value'] = value
            entity['entity'] = offset[2]
            entities_list.append(entity)
        example['entities'] = entities_list

        data['rasa_nlu_data']['common_examples'].append(example)

    with open(output_file, 'wt') as fd:
        json.dump(data, fd, ensure_ascii=False, indent=4)
# Convert both splits.
to_rasa_nlu_format(query, slots, intent, 'train.json')

query, slots, intent = map(test_ds.get,
                           ['query', 'slot_labels', 'intent_labels'])
to_rasa_nlu_format(query, slots, intent, 'test.json')
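
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# reload the file we just wrote and print the first converted example, to
# confirm that text, intent, and entity offsets round-trip through JSON.
with open('test.json', 'rt') as fd:
    converted = json.load(fd)
first_example = converted['rasa_nlu_data']['common_examples'][0]
print(first_example['text'])
print(first_example['intent'])
print(first_example['entities'])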