# NLP.py

from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
# tf.enable_eager_execution()  # only necessary in TensorFlow 1.x
import numpy as np
import os
import time


class NLP:
    def __init__(self):
        print("> NLP initialized")
"""
Spliting to training and labeling sets (X,Y)
@:returns X,Y
"""
def splitXY(self,chunk):
X = chunk[:-1]
Y = chunk[1:]
return X,Y
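
    # Worked example (illustrative, not in the original file): for a chunk
    # encoding the characters "hello", splitXY returns X = "hell" and
    # Y = "ello" -- each target character is the character that follows its
    # input character, which is exactly what the RNN learns to predict.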
"""
Creating the vocabulary and the coresponding dataset.
Encoding input text characters to RNN readable numbers
@:returns dataset, vocabulary
"""
def preprocess(self,text):
# Number of Characters in the text input
print("Length of text: {} characters".format(len(text)))
# Number of Unique Characters in the text input
vocabulary = sorted(set(text))
print('Vocabulary size: {}'.format(len(vocabulary)))
# Mapping characters to numbers
self.char_to_index = {u: i for i, u in enumerate(vocabulary)}
self.index_to_char = np.array(vocabulary)
text_as_int = np.array([self.char_to_index[c] for c in text])
# The RNN input sequence of characters length
seq_length = 100 # 100 characters per sentence
# Creating Training Sets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
dataset = sequences.map(self.splitXY)
return dataset,vocabulary
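
    # Worked example (hypothetical input): for text = "abca" the vocabulary
    # is ['a', 'b', 'c'], char_to_index is {'a': 0, 'b': 1, 'c': 2}, and
    # text_as_int becomes [0, 1, 2, 0].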
"""
Shuffling the dataset and preparing the model settings
Play with these settings to improve the performance of the RNN
@:returns dataset,vocabulary_size,embedding_dimension,rnn_nodes,batch_size
"""
def prepareSettings(self,dataset,vocabulary):
# Preparing the Settings
batch_size = 64 # Training batch per iteration
buffer_size = 10000 # Elements in memory
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
vocabulary_size = len(vocabulary)
embedding_dimension = 256
rnn_nodes = 1024 # Number of neurons in Recursive Neural Network
return dataset,vocabulary_size,embedding_dimension,rnn_nodes,batch_size
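
    # Resulting element shapes (derived from the settings above): after
    # shuffling and batching, each dataset element is a pair (X, Y) of
    # integer tensors with shape [batch_size, seq_length] = [64, 100].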
"""
Bulding the Deep RNN model
NOTE: For different datasets and scenarios I would recommend
changing the batch_size and considering adding an additional
Dropout layer to improve the generalization capabilities and
performance of the Deep Model
@:returns model
"""
def buildModel(self,vocabulary_size,embedding_dimension,batch_size,rnn_nodes):
# Building the Deep Model
model = tf.keras.Sequential([
tf.keras.layers.Embedding(vocabulary_size, embedding_dimension,
batch_input_shape=[batch_size,None]),
tf.keras.layers.GRU(rnn_nodes,return_sequences=True,
stateful=True,
recurrent_initializer='glorot_uniform'),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(vocabulary_size)
# Consider adding an additional Dropout layer to improve performance
])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy')
return model
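
    # Shape sanity check (an illustrative sketch; `dataset` here stands for
    # the output of prepareSettings above): for input IDs of shape
    # [batch_size, seq_length], the model returns one logit per vocabulary
    # character at every position, i.e. [batch_size, seq_length, vocabulary_size].
    #
    #   for X, Y in dataset.take(1):
    #       print(model(X).shape)  # e.g. (64, 100, vocabulary_size)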
"""
Defining training callbacks (Checkpoints here) and training the model
After each epoch the weights of the RNN are stored in a file (only if
the loss has decreased)
"""
def trainModel(self,dataset,model,checkpoint_dir,n_epochs):
checkpoint_loc = os.path.join(checkpoint_dir,'checkpoint_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
verbose=1,
monitor='loss',
filepath=checkpoint_loc,
save_best_only=True,
save_weights_only=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
monitor="loss",
factor=0.1,
patience=7,
verbose=0,
mode="auto",
min_delta=0.0001,
cooldown=0,
min_lr=0
)
model.fit(dataset, epochs=n_epochs, callbacks=[checkpoint_callback,reduce_lr])
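
    # Usage hint (not part of the original code): because the checkpoints are
    # written with save_weights_only=True, the most recent weights file can be
    # recovered afterwards with the standard TensorFlow helper:
    #   latest = tf.train.latest_checkpoint(checkpoint_dir)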
"""
Generating new text (one character at a time) based on an initial seed.
We make use of the latest checkpoint of the trained Deep RNN model
"""
def generateText(self,model,checkpoint,seed,text_len):
# Builds the model based on the last saved checkpoint
model.load_weights(checkpoint)
model.build(tf.TensorShape([1,None]))
# Generating the new Text based on seed
input_eval = [self.char_to_index[s] for s in seed] # Vectorizing seed
input_eval = tf.expand_dims(input_eval, 0)
text_generated = []
"""Useful note from Tensorflow Tutorial"""
# Low temperatures results in more predictable text.
# Higher temperatures results in more surprising text.
# Experiment to find the best setting.
temperature = 1.0
# Here batch size == 1
model.reset_states()
for i in range(text_len):
predictions = model(input_eval)
predictions = tf.squeeze(predictions, 0)
predictions = predictions / temperature
predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
input_eval = tf.expand_dims([predicted_id], 0)
text_generated.append(self.index_to_char[predicted_id])
return (seed + ''.join(text_generated))
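

# A minimal end-to-end usage sketch (assumptions, not part of the original
# file: "input.txt" is a plain-text training corpus, "./checkpoints" is a
# writable directory, and the seed string only uses characters that occur
# in the corpus).
if __name__ == "__main__":
    nlp = NLP()
    text = open("input.txt", "r", encoding="utf-8").read()
    dataset, vocabulary = nlp.preprocess(text)
    dataset, vocabulary_size, embedding_dimension, rnn_nodes, batch_size = \
        nlp.prepareSettings(dataset, vocabulary)
    model = nlp.buildModel(vocabulary_size, embedding_dimension, batch_size, rnn_nodes)
    nlp.trainModel(dataset, model, "./checkpoints", n_epochs=10)
    # Generation runs with batch size 1, so the model is rebuilt before the
    # newest checkpoint weights are loaded inside generateText
    generation_model = nlp.buildModel(vocabulary_size, embedding_dimension, 1, rnn_nodes)
    latest = tf.train.latest_checkpoint("./checkpoints")
    print(nlp.generateText(generation_model, latest, "The ", 500))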