-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathmulti-heads-attention-mnist.py
120 lines (94 loc) · 4.18 KB
/
multi-heads-attention-mnist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from keras.layers import Dense, Dropout, Conv2D, Input, Lambda, Flatten, TimeDistributed
from keras.layers import Add, Reshape, MaxPooling2D, Concatenate, Embedding, RepeatVector
from keras.models import Model
from keras import backend as K
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.engine.topology import Layer
import tensorflow as tf
from keras.callbacks import TensorBoard
def MultiHeadsAttModel(l=8*8, d=512, dv=64, dout=512, nv=8):
    """Build a Keras ``Model`` implementing a multi-head attention block.

    Each of the three inputs (query, key, value) has shape ``(l, d)``. They
    are projected by separate ReLU ``Dense`` layers to ``nv * dv`` features
    and reshaped to ``(l, nv, dv)``. Dot-product scores scaled by
    ``1 / sqrt(dv)`` are soft-maxed and applied to the values. The result is
    reshaped back to ``(l, d)``, added to the raw query input (residual
    connection) and passed through a final ReLU ``Dense`` layer.

    Args:
        l: Number of positions in each input sequence.
        d: Feature size of every input position.
        dv: Feature size of a single head.
        dout: Feature size of every output position.
        nv: Number of heads.

    Returns:
        A ``Model`` with inputs ``[q, k, v]`` (in that order) and a single
        output of shape ``(l, dout)`` per sample.

    Raises:
        ValueError: If ``nv * dv != d``. The attention output is reshaped to
            ``(l, d)`` and summed with the query input, so the concatenated
            heads must have exactly ``d`` features.
    """
    # Fail early with a readable message instead of an opaque Keras reshape
    # error further down.
    if nv * dv != d:
        raise ValueError(
            "MultiHeadsAttModel requires nv * dv == d so the attention "
            "output can be reshaped to (l, d) and added to the query; "
            "got nv={}, dv={}, d={}".format(nv, dv, d)
        )

    # Layers are created in the original order (v, q, k) so that Keras'
    # auto-generated layer names stay the same.
    v1 = Input(shape=(l, d))
    q1 = Input(shape=(l, d))
    k1 = Input(shape=(l, d))

    # Per-input linear projection to nv heads of size dv.
    v2 = Dense(dv * nv, activation="relu")(v1)
    q2 = Dense(dv * nv, activation="relu")(q1)
    k2 = Dense(dv * nv, activation="relu")(k1)

    # Split the projected features into (heads, per-head features).
    v = Reshape([l, nv, dv])(v2)
    q = Reshape([l, nv, dv])(q2)
    k = Reshape([l, nv, dv])(k2)

    # Scaled dot-product scores; declared shape per sample: l, nv, nv.
    # NOTE(review): with this declared shape the scores relate heads within a
    # position rather than positions to each other - confirm this is intended.
    att = Lambda(
        lambda x: K.batch_dot(x[0], x[1], axes=[-1, -1]) / np.sqrt(dv),
        output_shape=(l, nv, nv),
    )([q, k])
    att = Lambda(lambda x: K.softmax(x), output_shape=(l, nv, nv))(att)

    # NOTE(review): axes=[4, 3] are outside the index range of 4-D tensors;
    # this relies on how the installed backend's batch_dot treats them.
    # Verify against the Keras version in use before upgrading.
    out = Lambda(
        lambda x: K.batch_dot(x[0], x[1], axes=[4, 3]),
        output_shape=(l, nv, dv),
    )([att, v])

    # Merge the heads back into one feature axis and add the residual.
    out = Reshape([l, d])(out)
    out = Add()([out, q1])
    out = Dense(dout, activation="relu")(out)

    return Model(inputs=[q1, k1, v1], outputs=out)
class NormL(Layer):
    """Normalise the last axis and apply a learned scale and shift.

    For every slice along the last axis the layer subtracts the mean,
    divides by the standard deviation (plus a small epsilon), then multiplies
    by the trainable weight ``a`` and adds the trainable weight ``b``. Both
    weights have shape ``(1, input_shape[-1])``. The output shape equals the
    input shape.
    """

    def __init__(self, **kwargs):
        super(NormL, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable scale (``a``) and shift (``b``) weights."""
        # Each weight gets its own name so the two stay distinguishable in
        # summaries and saved checkpoints (both used to be called 'kernel').
        self.a = self.add_weight(name='gamma',
                                 shape=(1, input_shape[-1]),
                                 initializer='ones',
                                 trainable=True)
        self.b = self.add_weight(name='beta',
                                 shape=(1, input_shape[-1]),
                                 initializer='zeros',
                                 trainable=True)
        super(NormL, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        """Return ``(x - mean) / (std + eps) * a + b`` over the last axis."""
        eps = 0.000001  # keeps the division finite when the std is zero
        mu = K.mean(x, keepdims=True, axis=-1)
        sigma = K.std(x, keepdims=True, axis=-1)
        ln_out = (x - mu) / (sigma + eps)
        return ln_out * self.a + self.b

    def compute_output_shape(self, input_shape):
        """The normalisation is element-wise, so the shape is unchanged."""
        return input_shape
if __name__ == '__main__':
    # Train a small CNN + attention classifier on MNIST and log the run to
    # TensorBoard.
    nb_classes = 10
    use_attention = True  # set to False to skip the attention block

    # The data, shuffled and split between train and test sets.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    print("X_train original shape", X_train.shape)
    print("y_train original shape", y_train.shape)

    # Add a trailing channel axis, convert to float32 and divide by 255.
    # Using -1 lets NumPy infer the sample count instead of hard-coding it.
    X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
    X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255
    print("Training matrix shape", X_train.shape)
    print("Testing matrix shape", X_test.shape)

    # One-hot encode the integer labels.
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    # Convolutional feature extractor: 28x28x1 -> 6x6x192.
    inp = Input(shape=(28, 28, 1))
    x = Conv2D(32, (2, 2), activation='relu', padding='same')(inp)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(64, (2, 2), activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), padding='same')(x)
    x = Conv2D(64 * 3, (2, 2), activation='relu')(x)

    if use_attention:
        # Treat the 6x6 feature map as a sequence of 36 positions and use it
        # as query, key and value at once (self-attention).
        x = Reshape([6 * 6, 64 * 3])(x)
        att = MultiHeadsAttModel(l=6 * 6, d=64 * 3, dv=8 * 3, dout=32, nv=8)
        x = att([x, x, x])
        x = Reshape([6, 6, 32])(x)
        x = NormL()(x)

    # Classification head.
    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    x = Dense(nb_classes, activation='softmax')(x)

    model = Model(inputs=inp, outputs=x)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print().
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    tbCallBack = TensorBoard(log_dir='./Graph/mhatt1',
                             histogram_freq=0,
                             write_graph=True,
                             write_images=True)

    model.fit(X_train, Y_train,
              batch_size=128,
              epochs=100,
              verbose=1,
              validation_data=(X_test, Y_test),
              callbacks=[tbCallBack])