-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
75 lines (52 loc) · 2.15 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import os
def iterate_minibatches_(inputs, batchsize, shuffle=False):
    """Yield successive fixed-size minibatches over several aligned arrays.

    Args:
        inputs: Sequence of indexable arrays (e.g. NumPy arrays). The number
            of samples is taken from ``len(inputs[0])``; the other arrays are
            assumed to have the same length. With ``shuffle=True`` every
            array must support integer-array indexing.
        batchsize: Number of samples per minibatch.
        shuffle: If true, samples are visited in a random order drawn from
            NumPy's global random state.

    Yields:
        A tuple holding one excerpt per array in ``inputs``, in the same
        order. Only full batches are produced: a trailing remainder of fewer
        than ``batchsize`` samples is dropped.
    """
    num_samples = len(inputs[0])
    if shuffle:
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
    for start_idx in range(0, num_samples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # Build the batch eagerly. A lazy generator expression would only
        # look `excerpt` up when consumed, so batches that are collected
        # first and read later (e.g. via list(...)) would all reflect the
        # final excerpt instead of their own.
        yield tuple(array[excerpt] for array in inputs)
def loadWord2VecMap(word2vec_path):
    """Load a pickled object (the word2vec map) from ``word2vec_path``.

    Args:
        word2vec_path: Path of the pickle file to read.

    Returns:
        The object that was pickled into the file.
    """
    # cPickle only exists on Python 2; use the stdlib pickle elsewhere.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # that come from a trusted source.
    # Pickle streams are binary data, so the file must be opened in 'rb';
    # text mode can corrupt the stream or fail outright.
    with open(word2vec_path, 'rb') as fid:
        return pickle.load(fid)
def read_sequence_dataset(dataset_dir, dataset_name, maxlen=60, num_classes=2):
    """Read one labelled dataset split into left-padded index arrays.

    Files read (text, decoded as UTF-8):
        ``<dataset_dir>/vocab-cased.txt``: one vocabulary token per line.
        ``<dataset_dir>/<dataset_name>/a.toks``: one sentence per line,
        tokens separated by whitespace.
        ``<dataset_dir>/<dataset_name>/label.txt``: one integer class label
        per line, aligned with ``a.toks``.

    Args:
        dataset_dir: Directory holding the vocabulary file and the splits.
        dataset_name: Name of the split sub-directory.
        maxlen: Number of token positions per sentence. Shorter sentences
            are padded on the left; longer ones keep their last ``maxlen``
            tokens.
        num_classes: Width of the one-hot label matrix.

    Returns:
        Tuple ``(X, X_mask, Y_labels)``:
        ``X`` is an int16 array of shape (sentences, maxlen) with vocabulary
        indices; padding and out-of-vocabulary tokens use the ``<UNK>``
        index. ``X_mask`` is an int16 array of the same shape with 1 at
        token positions and 0 at padding. ``Y_labels`` is a float array of
        shape (labelled sentences, num_classes) with a 1 in the label column.

    Raises:
        ValueError: If a label line is not an integer.
        IndexError: If a label is not a valid column for ``num_classes``.
    """
    import io  # io.open accepts an encoding on both Python 2 and Python 3

    split_dir = os.path.join(dataset_dir, dataset_name)
    toks_path = os.path.join(split_dir, "a.toks")
    label_path = os.path.join(split_dir, "label.txt")
    vocab_path = os.path.join(dataset_dir, 'vocab-cased.txt')

    # Collect the distinct vocabulary tokens. The iteration order of this
    # dict decides which index each token receives below.
    known_tokens = {}
    with io.open(vocab_path, encoding="utf-8") as f:
        for line in f:
            known_tokens[line.rstrip('\n')] = True

    vocab = {"<UNK>": 0}
    for idx, word in enumerate(known_tokens, 1):
        vocab[word] = idx
    unk = vocab["<UNK>"]

    with io.open(toks_path, encoding="utf-8") as f:
        sentences = [line.rstrip('\n') for line in f]
    with io.open(label_path, encoding="utf-8") as f:
        raw_labels = [line.rstrip('\n') for line in f]

    data_size = len(sentences)
    # NOTE: int16 is kept for compatibility with existing callers; indices
    # above 32767 do not fit in it.
    X = np.zeros((data_size, maxlen), dtype=np.int16)
    X_mask = np.zeros((data_size, maxlen), dtype=np.int16)

    labels = []
    # zip stops at the shorter file, so surplus lines in either are ignored.
    for i, (sentence, raw_label) in enumerate(zip(sentences, raw_labels)):
        # Labels are read as text; convert so they can index a column.
        labels.append(int(raw_label))
        # Tokens missing from the vocabulary fall back to <UNK>.
        ids = [vocab.get(tok, unk) for tok in sentence.split()]
        kept = min(len(ids), maxlen)
        start = maxlen - kept
        X[i, :start] = unk
        if kept:
            # Right-align the sentence, keeping only its last `kept` tokens.
            X[i, start:] = ids[len(ids) - kept:]
            X_mask[i, start:] = 1

    Y_labels = np.zeros((len(labels), num_classes))
    if labels:
        Y_labels[np.arange(len(labels)), labels] = 1.
    return X, X_mask, Y_labels