-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtraining.py
125 lines (88 loc) · 3.95 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from tqdm import tqdm
from keras.metrics import categorical_accuracy
from dynamic_clip_attention import DynamicClipAttention
from utils import mean_reciprocal_rank, parallelize_dataframe, tokenize, specific_save_epoch, prepare_submission, ListDataGenerator
# Workaround for duplicate OpenMP runtime crash when training on macOS.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# ---- Paths and encoding lengths ----
PROC_DATA_PATH = "processed"
MODEL_NAME = "DYNAMIC_CLIP_ATTENTION_ELMO_WE"
MAX_LEN_ENCODING_QUERY = 15      # tokens kept per query
MAX_LEN_ENCODING_PASSAGE = 70    # tokens kept per passage
MODEL_REPO = "model_weights"
MODEL_PATH = os.path.join(MODEL_REPO, MODEL_NAME)

# ---- Training / validation data (tab-separated) ----
train = pd.read_csv(os.path.join(PROC_DATA_PATH, "undersample_train.tsv"), sep="\t")
val = pd.read_csv(os.path.join(PROC_DATA_PATH, "val.tsv"), sep="\t")

# ---- Model hyperparameters ----
model_param = {
    "hidden_dim": 100,
    "enc_timesteps": MAX_LEN_ENCODING_QUERY,
    "dec_timesteps": MAX_LEN_ENCODING_PASSAGE,
    "random_size": 4,
    "lr": 0.001,
}
# ELMo embedding layer backed by a TF-Hub module (TF1 graph-mode API).
# trainable=True allows the ELMo weights to be fine-tuned during training.
elmo_model = hub.Module('https://tfhub.dev/google/elmo/2', trainable=True)

def ElmoEmbedding(x):
    """Embed a batch of raw strings with ELMo.

    x is cast to tf.string and squeezed (drops a singleton dimension from
    the Keras input) before being fed to the hub module's "default"
    (raw-text) signature.

    NOTE(review): this selects the "word_emb" output, which per the hub
    module is the context-independent character-CNN word embedding, not
    the full contextual "elmo" output — confirm this is intentional.
    """
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)),
                      as_dict=True,
                      signature="default")["word_emb"]
def preprocess(df):
    """Tokenize the passage and query columns of *df* in place.

    Each text column is replaced by its tokenized form, and a matching
    *_mask column is added, padded/truncated to the configured lengths.
    Returns the mutated dataframe.
    """
    tokenized_passages = df.passage_text.apply(
        lambda text: tokenize(text, MAX_LEN_ENCODING_PASSAGE))
    df["passage_text"], df["passage_mask"] = zip(*tokenized_passages)

    tokenized_queries = df["query"].apply(
        lambda text: tokenize(text, MAX_LEN_ENCODING_QUERY))
    df["query"], df["query_mask"] = zip(*tokenized_queries)

    return df
# Tokenize train/val dataframes in parallel (ELMo consumes raw token lists,
# not integer ids, so no vocabulary lookup happens here).
train = parallelize_dataframe(train, preprocess)
val = parallelize_dataframe(val, preprocess)

# Wrap the training dataframe in a batching generator.
# NOTE(review): the max-length values duplicate MAX_LEN_ENCODING_PASSAGE (70)
# and MAX_LEN_ENCODING_QUERY (15) as literals — consider passing the constants.
train = ListDataGenerator(train, batch_size=64, max_len_encoding_passage=70,
                          max_len_encoding_query=15)

# Build the twin models: one graph for training (pairwise/listwise loss) and
# one for scoring at prediction time, both sharing weights.
training_model, prediction_model = DynamicClipAttention(model_param, ElmoEmbedding)

# TF1 graph mode: the hub module's variables must be explicitly initialized
# in the session before any fit/predict call.
sess = tf.Session()
with sess.as_default():
    sess.run(tf.global_variables_initializer())
# Best validation MRR seen so far; weights are checkpointed only on improvement,
# so NUM_EPOCHS can safely exceed the optimum (early epochs' best is kept).
best_val_mrr = 0.0
NUM_EPOCHS = 10  # upper bound; the best-MRR checkpoint is what gets saved

for epoch in range(0, NUM_EPOCHS):
    # Run exactly one epoch per iteration (initial_epoch=epoch, epochs=epoch+1)
    # so validation MRR can be computed between epochs.
    training_model.fit_generator(train,
                                 epochs=(epoch+1),
                                 verbose=1,
                                 class_weight=None,
                                 initial_epoch=epoch)
    print("Evaluating")
    try:
        # Score every validation (query, passage) pair; masks mark real tokens
        # vs padding up to the fixed encoding lengths.
        val_preds = prediction_model.predict([val["query"].values,
                                              val["passage_text"].values,
                                              np.vstack(val.query_mask.values),
                                              np.vstack(val.passage_mask.values)],
                                             batch_size=128,
                                             verbose=1)
        val_mrr = mean_reciprocal_rank(val_preds, val.label.values)
        print("Validation mrr:{}".format(val_mrr))
        # Checkpoint only when validation MRR improves.
        if val_mrr > best_val_mrr:
            best_val_mrr = val_mrr
            specific_save_epoch(training_model, MODEL_PATH)
    except Exception as e:
        # NOTE(review): broad catch-and-print silently skips a failed
        # evaluation (and its checkpoint) — consider logging a traceback
        # or narrowing the exception type.
        print(str(e))
# ---- Test-set inference and submission generation ----
# Uses prediction_model with whatever weights are currently in the session
# (NOTE(review): the best checkpoint saved above is not reloaded here —
# confirm that is intentional before submitting).
test = pd.read_csv(os.path.join(PROC_DATA_PATH, "test.tsv"), sep="\t")
test = parallelize_dataframe(test, preprocess)

print("Predicting on test dataset")
test_preds = prediction_model.predict([test["query"].values,
                                       test["passage_text"].values,
                                       np.vstack(test.query_mask.values),
                                       np.vstack(test.passage_mask.values)],
                                      batch_size=128,
                                      verbose=1)

# Assemble (query_id, passage_id, score) rows expected by prepare_submission.
test_dataset = pd.DataFrame(test.query_id, columns=["query_id"])
test_dataset["passage_id"] = test.passage_id
test_dataset["score"] = test_preds

# Fixed typo in user-facing message ("Genrating" -> "Generating").
print("Generating submission", MODEL_NAME + ".zip")
prepare_submission(test_dataset, MODEL_NAME)