-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfaken_split_bert.py
118 lines (76 loc) · 3.1 KB
/
faken_split_bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
from official.nlp import optimization
import numpy as np
from sklearn.model_selection import train_test_split
import collections
def _dataset_to_arrays(dataset):
    # Flatten a batched (texts, labels) dataset into two 1-D NumPy arrays.
    text_batches = []
    label_batches = []
    for texts, labels in dataset:
        text_batches.append(np.asarray(texts))
        label_batches.append(np.asarray(labels))
    if not text_batches:
        # Mirror the empty-array result the incremental concatenation produced.
        return np.array([], dtype=object), np.array([])
    # A single concatenate at the end avoids re-copying the growing array on
    # every batch.
    return np.concatenate(text_batches), np.concatenate(label_batches)


def _steps_per_epoch(num_samples, batch_size, validation_split):
    # Number of optimizer steps `fit` performs per epoch after the validation split.
    num_fit_samples = int(num_samples * (1.0 - validation_split))
    return (num_fit_samples + batch_size - 1) // batch_size  # ceil division


def main():
    """Fine-tune the BERT classifier on the PHEME split and print test metrics.

    Reads labelled text from ``data/pheme_split/train`` and
    ``data/pheme_split/test`` (one sub-directory per class, as expected by
    ``keras.utils.text_dataset_from_directory``), optionally appends generated
    samples, trains the model returned by ``build_classfier_model`` with an
    AdamW warmup/decay schedule, and prints the loss and accuracy measured on
    the test set.
    """
    print("main")
    tf.get_logger().setLevel('ERROR')

    batch_size = 32
    seed = 55
    validation_split = 0.1

    raw_train_ds = keras.utils.text_dataset_from_directory(
        'data/pheme_split/train',
        seed=seed
    )
    # The test dataset is batched here; `evaluate` must not be given a
    # batch size again when its input is already a dataset.
    raw_test_ds = keras.utils.text_dataset_from_directory(
        'data/pheme_split/test',
        batch_size=batch_size
    )

    x_train, y_train = _dataset_to_arrays(raw_train_ds)

    use_generated = False
    if use_generated:
        raw_generated = keras.utils.text_dataset_from_directory(
            'data/pheme_split_simple_generated2',
            seed=seed
        )
        generated_xs, generated_ys = _dataset_to_arrays(raw_generated)
        # NOTE(review): `fit(validation_split=...)` takes the validation
        # samples from the END of the arrays, so appending generated data
        # here makes the validation set consist of generated samples —
        # confirm that this is intended.
        x_train = np.concatenate([x_train, generated_xs])
        y_train = np.concatenate([y_train, generated_ys])
        num_duplicates = sum(
            1
            for count in collections.Counter(generated_xs.tolist()).values()
            if count > 1
        )
        print("Num of duplicates ", num_duplicates)

    classfier = build_classfier_model()

    loss = keras.losses.BinaryCrossentropy(from_logits=True)
    # The model emits raw logits, so the positive/negative decision boundary
    # is 0.0 rather than the metric's default probability threshold of 0.5.
    metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

    epochs = 10
    # The schedule is expressed in optimizer steps (batches), not samples;
    # using the sample count would stretch warmup/decay far past the end of
    # training.
    steps_per_epoch = _steps_per_epoch(len(x_train), batch_size, validation_split)
    num_trains_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1 * num_trains_steps)

    init_lr = 3e-5
    optimizer = optimization.create_optimizer(
        init_lr=init_lr,
        num_train_steps=num_trains_steps,
        num_warmup_steps=num_warmup_steps,
        optimizer_type='adamw'
    )

    classfier.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    classfier.fit(
        x_train,
        y_train,
        epochs=epochs,
        validation_split=validation_split,
        batch_size=batch_size
    )

    test_loss, test_accuracy = classfier.evaluate(raw_test_ds)
    print(f'Loss: {test_loss}')
    print(f'Accuracy: {test_accuracy}')
def build_classfier_model():
    """Build the text classification model on top of a TF-Hub BERT encoder.

    The model takes a scalar string input named ``text``, runs it through the
    TF-Hub preprocessing layer and the trainable small-BERT encoder, applies
    dropout to the encoder's ``pooled_output`` and finishes with a single
    Dense unit without activation.

    Returns:
        A ``keras.Model`` mapping the string input to the Dense layer output.
    """
    preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    preprocessor = hub.KerasLayer(preprocess_url, name="preprocess")

    encoder_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
    # trainable=True so the encoder weights are updated during fine-tuning.
    encoder = hub.KerasLayer(encoder_url, trainable=True, name='BERT_encoder')

    raw_text = keras.layers.Input(shape=(), dtype=tf.string, name='text')
    pooled = encoder(preprocessor(raw_text))['pooled_output']
    dropped = keras.layers.Dropout(0.1)(pooled)
    # activation=None: the head outputs an unnormalised score.
    score = keras.layers.Dense(1, activation=None, name='classfier')(dropped)

    return keras.Model(raw_text, score)
# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()