seq2seq_sleep_sleep-EDF.py

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as spio
from  sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, f1_score
import random
import time
import os
from datetime import datetime
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
from dataloader import SeqDataLoader
import argparse

def batch_data(x, y, batch_size):
    shuffle = np.random.permutation(len(x))
    start = 0
#     from IPython.core.debugger import Tracer; Tracer()()
    x = x[shuffle]
    y = y[shuffle]
    while start + batch_size <= len(x):
        yield x[start:start+batch_size], y[start:start+batch_size]
        start += batch_size
def flatten(name, input_var):
    dim = 1
    for d in input_var.get_shape()[1:].as_list():
        dim *= d
    output_var = tf.reshape(input_var,
                            shape=[-1, dim],
                            name=name)

    return output_var
def build_firstPart_model(input_var,keep_prob_=0.5):
        # List to store the output of each CNNs
        output_conns = []

        ######### CNNs with small filter size at the first layer #########

        # Convolution
        network = tf.layers.conv1d(inputs=input_var, filters=64, kernel_size=50, strides=6,
                                 padding='same', activation=tf.nn.relu)

        network = tf.layers.max_pooling1d(inputs=network, pool_size=8, strides=8, padding='same')

        # Dropout
        network = tf.nn.dropout(network, keep_prob_)


        # Convolution
        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=8, strides=1,
                                 padding='same', activation=tf.nn.relu)

        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=8, strides=1,
                                 padding='same', activation=tf.nn.relu)
        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=8, strides=1,
                                 padding='same', activation=tf.nn.relu)


        # Max pooling
        network = tf.layers.max_pooling1d(inputs=network, pool_size=4, strides=4, padding='same')


        # Flatten
        network = flatten(name="flat1", input_var=network)


        output_conns.append(network)

        ######### CNNs with large filter size at the first layer #########


        # Convolution
        network = tf.layers.conv1d(inputs=input_var, filters=64, kernel_size=400, strides=50,
                                   padding='same', activation=tf.nn.relu)

        network = tf.layers.max_pooling1d(inputs=network, pool_size=4, strides=4, padding='same')

        # Dropout
        network = tf.nn.dropout(network, keep_prob_)

        # Convolution
        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=6, strides=1,
                                   padding='same', activation=tf.nn.relu)

        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=6, strides=1,
                                   padding='same', activation=tf.nn.relu)
        network = tf.layers.conv1d(inputs=network, filters=128, kernel_size=6, strides=1,
                                   padding='same', activation=tf.nn.relu)

        # Max pooling
        network = tf.layers.max_pooling1d(inputs=network, pool_size=2, strides=2, padding='same')

        # Flatten
        network = flatten(name="flat2", input_var=network)


        output_conns.append(network)

        # Concat
        network = tf.concat(output_conns,1, name="concat1")

        # Dropout
        network = tf.nn.dropout(network, keep_prob_)

        return network
def plot_attention(attention_map, input_tags = None, output_tags = None):
    attn_len = len(attention_map)

    # Plot the attention_map
    plt.clf()
    f = plt.figure(figsize=(15, 10))
    ax = f.add_subplot(1, 1, 1)

    # Add image
    i = ax.imshow(attention_map, interpolation='nearest', cmap='gray')

    # Add colorbar
    cbaxes = f.add_axes([0.2, 0, 0.6, 0.03])
    cbar = f.colorbar(i, cax=cbaxes, orientation='horizontal')
    cbar.ax.set_xlabel('Alpha value (Probability output of the "softmax")', labelpad=2)

    # Add labels
    ax.set_yticks(range(attn_len))
    if output_tags != None:
      ax.set_yticklabels(output_tags[:attn_len])

    ax.set_xticks(range(attn_len))
    if input_tags != None:
      ax.set_xticklabels(input_tags[:attn_len], rotation=45)

    ax.set_xlabel('Input Sequence')
    ax.set_ylabel('Output Sequence')

    # add grid and legend
    ax.grid()
    HERE = os.path.realpath(os.path.join(os.path.realpath(__file__), '..'))
    dir_save = os.path.join(HERE, 'attention_maps')
    if (os.path.exists(dir_save) == False):
        os.mkdir(dir_save)
    f.savefig(os.path.join(dir_save, 'a_map_1.pdf'), bbox_inches='tight')
    # f.show()
    plt.show()
def build_network(hparams,char2numY,inputs,dec_inputs,keep_prob_=0.5,):

    if hparams.akara2017 is True:
        _inputs = tf.reshape(inputs, [-1, hparams.input_depth,1])
        network = build_firstPart_model(_inputs, keep_prob_)
        shape = network.get_shape().as_list()
        data_input_embed = tf.reshape(network, (-1, hparams.max_time_step, shape[1]))
    else:
        _inputs = tf.reshape(inputs, [-1, hparams.n_channels, hparams.input_depth / hparams.n_channels])

        conv1 = tf.layers.conv1d(inputs=_inputs, filters=32, kernel_size=2, strides=1,
                                 padding='same', activation=tf.nn.relu)
        max_pool_1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=2, strides=2, padding='same')

        conv2 = tf.layers.conv1d(inputs=max_pool_1, filters=64, kernel_size=2, strides=1,
                                 padding='same', activation=tf.nn.relu)
        max_pool_2 = tf.layers.max_pooling1d(inputs=conv2, pool_size=2, strides=2, padding='same')

        conv3 = tf.layers.conv1d(inputs=max_pool_2, filters=128, kernel_size=2, strides=1,
                                 padding='same', activation=tf.nn.relu)
        max_pool_3 = tf.layers.max_pooling1d(inputs=conv3, pool_size=2, strides=2, padding='same')

        shape = max_pool_3.get_shape().as_list()
        data_input_embed = tf.reshape(max_pool_3, (-1, hparams.max_time_step, shape[1] * shape[2]))

    # timesteps = max_time
    # lstm_in = tf.unstack(data_input_embed, timesteps, 1)
    # lstm_size = 128
    # # Get lstm cell output
    # # Add LSTM layers
    # lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # data_input_embed, states = tf.contrib.rnn.static_rnn(lstm_cell, lstm_in, dtype=tf.float32)
    # data_input_embed = tf.stack(data_input_embed, 1)
    # shape = data_input_embed.get_shape().as_list()
    # embed_size = 10 #128 lstm_size # shape[1]*shape[2]

    # Embedding layers
    with tf.variable_scope("embeddin") as embedding_scope:
        decoder_embedding = tf.Variable(tf.random_uniform((len(char2numY), hparams.embed_size), -1.0, 1.0), name='dec_embedding') # +1 to consider <EOD>
        decoder_emb_inputs = tf.nn.embedding_lookup(decoder_embedding, dec_inputs)


    with tf.variable_scope("encoding") as encoding_scope:
        if not hparams.bidirectional:

            # Regular approach with LSTM units
            # encoder_cell = tf.contrib.rnn.LSTMCell(hparams.num_units)
            # encoder_cell = tf.nn.rnn_cell.MultiRNNCell([encoder_cell] * hparams.lstm_layers)
            def lstm_cell():
                lstm = tf.contrib.rnn.LSTMCell(hparams.num_units)
                return lstm
            encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(hparams.lstm_layers)])
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, inputs=data_input_embed, dtype=tf.float32)

        else:

            # Using a bidirectional LSTM architecture instead
            # enc_fw_cell = tf.contrib.rnn.LSTMCell(hparams.num_units)
            # enc_bw_cell = tf.contrib.rnn.LSTMCell(hparams.num_units)

            def lstm_cell():
                lstm = tf.contrib.rnn.LSTMCell(hparams.num_units)
                return lstm

            stacked_cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(hparams.lstm_layers)],state_is_tuple=True)
            stacked_cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(hparams.lstm_layers)],state_is_tuple=True)


            ((enc_fw_out, enc_bw_out), (enc_fw_final, enc_bw_final)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=stacked_cell_fw,
                cell_bw=stacked_cell_bw,
                inputs=data_input_embed,
                dtype=tf.float32)
            encoder_final_state = []
            for layer in range(hparams.lstm_layers):
                enc_fin_c = tf.concat((enc_fw_final[layer].c, enc_bw_final[layer].c), 1)
                enc_fin_h = tf.concat((enc_fw_final[layer].h, enc_bw_final[layer].h), 1)
                encoder_final_state.append(tf.contrib.rnn.LSTMStateTuple(c=enc_fin_c, h=enc_fin_h))

            encoder_state = tuple(encoder_final_state)
            encoder_outputs = tf.concat((enc_fw_out, enc_bw_out), 2)


    with tf.variable_scope("decoding") as decoding_scope:

        output_layer = Dense(
            len(char2numY), use_bias=False)
        decoder_lengths = np.ones((hparams.batch_size), dtype=np.int32) * (hparams.max_time_step+1)
        training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inputs, decoder_lengths)

        if not hparams.bidirectional:
            # decoder_cell = tf.contrib.rnn.LSTMCell(hparams.num_units)
            def lstm_cell():
                lstm = tf.contrib.rnn.LSTMCell(hparams.num_units)
                return lstm
            decoder_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(hparams.lstm_layers)])

        else:
            # decoder_cell = tf.contrib.rnn.LSTMCell(2 * hparams.num_units)
            def lstm_cell():
                lstm = tf.contrib.rnn.LSTMCell(2 * hparams.num_units)
                return lstm
            decoder_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(hparams.lstm_layers)])

        if hparams.use_attention:
            # Create an attention mechanism
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                hparams.num_units * 2 if hparams.bidirectional else hparams.num_units , encoder_outputs,
                memory_sequence_length=None)

            decoder_cells = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cells, attention_mechanism,
                attention_layer_size=hparams.attention_size,alignment_history=True)

            encoder_state = decoder_cells.zero_state(hparams.batch_size, tf.float32).clone(cell_state=encoder_state)


        # Basic Decoder and decode
        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cells, training_helper, encoder_state,
            output_layer=output_layer)

        dec_outputs, _final_state, _final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True)

        # dec_outputs, _ = tf.nn.dynamic_rnn(decoder_cell, inputs=decoder_emb_inputs, initial_state=encoder_state)

    logits = dec_outputs.rnn_output

    # Inference
    start_tokens =  tf.fill([hparams.batch_size], char2numY['<SOD>'])
    end_token = char2numY['<EOD>']
    if not hparams.use_beamsearch_decode:

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            decoder_embedding,
            start_tokens,end_token)

        # Inference Decoder
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cells, inference_helper, encoder_state,
            output_layer=output_layer)
    else:

        encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, multiplier=hparams.beam_width)
        decoder_initial_state = decoder_cells.zero_state(hparams.batch_size * hparams.beam_width, tf.float32).clone(cell_state=encoder_state)

        inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=decoder_cells,
                                                                  embedding=decoder_embedding,
                                                                  start_tokens=start_tokens,
                                                                  end_token=end_token,
                                                                  initial_state=decoder_initial_state,
                                                                  beam_width=hparams.beam_width,
                                                                  output_layer=output_layer)

    # Dynamic decoding
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder,impute_finished = False, maximum_iterations=hparams.output_max_length)
    pred_outputs = outputs.sample_id
    if  hparams.use_beamsearch_decode:
          # [batch_size, max_time_step, beam_width]
          pred_outputs = pred_outputs[0]


    return logits,pred_outputs,_final_state
def tf_confusion_metrics(y_true_,y_pred_,num_classes=5):

    tf_cm = tf.cast(tf.confusion_matrix(y_true_, y_pred_,num_classes=None),"float")
    FP =  tf.reduce_sum(tf_cm,axis=0) - tf.diag_part(tf_cm)
    FN =  tf.reduce_sum(tf_cm,axis=1) - tf.diag_part(tf_cm)
    TP = tf.diag_part(tf_cm)
    TN = tf.reduce_sum(tf_cm) - (FP + FN + TP)

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)

    return FPR, FNR
def evaluate_metrics(cm,classes):

    print ("Confusion matrix:")
    print (cm)

    cm = cm.astype(np.float32)
    FP = cm.sum(axis=0) - np.diag(cm)
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)
    # https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)

    # Overall accuracy
    ACC = (TP + TN) / (TP + FP + FN + TN)
    # ACC_micro = (sum(TP) + sum(TN)) / (sum(TP) + sum(FP) + sum(FN) + sum(TN))
    ACC_macro = np.mean(ACC) # to get a sense of effectiveness of our method on the small classes we computed this average (macro-average)

    F1 = (2 * PPV * TPR) / (PPV + TPR)
    F1_macro = np.mean(F1)

    print ("Sample: {}".format(int(np.sum(cm))))
    n_classes = len(classes)
    for index_ in range(n_classes):
        print ("{}: {}".format(classes[index_], int(TP[index_] + FN[index_])))


    return ACC_macro,ACC, F1_macro, F1, TPR, TNR, PPV
random.seed(654) # to make have the same training set and test set each time the code is run, we use a fixed random seed

def build_whole_model(hparams,char2numY,inputs, targets,dec_inputs, keep_prob_):
    # logits = build_network(inputs,dec_inputs=dec_inputs)
    logits, pred_outputs,dec_states = build_network(hparams,char2numY,inputs, dec_inputs, keep_prob_)
    decoder_prediction = tf.argmax(logits, 2)

    # optimization operation
    with tf.name_scope("optimization"):
        # Loss function
        vars = tf.trainable_variables()
        beta = 0.001
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars
                           if 'bias' not in v.name]) * beta

        # class_ratio = [0.1,0.4, 0.1, 0.1, 0.1, 0.1,0.1]
        # class_weight = tf.constant(class_ratio)
        # weighted_logits = tf.multiply(logits, class_weight)

        loss_is = []
        for i in range(logits.get_shape().as_list()[-1]):
            class_fill_targets = tf.fill(tf.shape(targets), i)
            weights_i = tf.cast(tf.equal(targets, class_fill_targets), "float")
            loss_is.append(tf.contrib.seq2seq.sequence_loss(logits, targets, weights_i,average_across_batch=False))

        loss = tf.reduce_sum(loss_is,axis=0)

        # loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([hparams.batch_size, hparams.max_time_step+1])) #+1 is because of the <EOD> token
        # Optimizer
        loss = tf.reduce_mean(loss)+lossL2
        optimizer = tf.train.RMSPropOptimizer(1e-3).minimize(loss)

    return logits, pred_outputs, loss, optimizer,dec_states

def run_program(hparams,FLAGS):
    # load dataset
    num_folds = FLAGS.num_folds
    data_dir = FLAGS.data_dir
    if '13' in data_dir:
        data_version = 2013
    else:
        n_oversampling = 30000
        data_version = 2018

    output_dir = FLAGS.output_dir
    classes = FLAGS.classes
    n_classes = len(classes)

    path, channel_ename = os.path.split(data_dir)
    traindata_dir = os.path.join(os.path.abspath(os.path.join(data_dir, os.pardir)),'traindata/')
    print(str(datetime.now()))

    def evaluate_model(hparams, X_test, y_test, classes):
        acc_track = []
        n_classes = len(classes)
        y_true = []
        y_pred = []
        alignments_alphas_all = []  # (batch_num,B,max_time_step,max_time_step)
        for batch_i, (source_batch, target_batch) in enumerate(batch_data(X_test, y_test, hparams.batch_size)):
            # if source_batch.shape[1] != hparams.max_time_step:
            #     print ("Num of steps is: ", source_batch.shape[1])
            # try:
            pred_outputs_ = sess.run(pred_outputs,
                                     feed_dict={inputs: source_batch, keep_prob_: 1.0})

            alignments_alphas = sess.run(dec_states.alignment_history.stack(),
                                         feed_dict={inputs: source_batch, dec_inputs: target_batch[:, :-1],
                                                    keep_prob_: 1.0})

            # acc_track.append(np.mean(dec_input == target_batch))
            pred_outputs_ = pred_outputs_[:, :hparams.max_time_step]  # remove the last prediction <EOD>
            target_batch_ = target_batch[:, 1:-1]  # remove the last <EOD> and the first <SOD>
            acc_track.append(pred_outputs_ == target_batch_)

            alignments_alphas = alignments_alphas.transpose((1, 0, 2))
            alignments_alphas = alignments_alphas[:, :hparams.max_time_step]
            alignments_alphas_all.append(alignments_alphas)

            _y_true = target_batch_.flatten()
            _y_pred = pred_outputs_.flatten()

            y_true.extend(_y_true)
            y_pred.extend(_y_pred)

        cm = confusion_matrix(y_true, y_pred, labels=range(n_classes))
        ck_score = cohen_kappa_score(y_true, y_pred)
        acc_avg, acc, f1_macro, f1, sensitivity, specificity, PPV = evaluate_metrics(cm, classes)
        # print ("batch_i: {}").format(batch_i)
        print(
        'Average Accuracy -> {:>6.4f}, Macro F1 -> {:>6.4f} and Cohen\'s Kappa -> {:>6.4f} on test set'.format(acc_avg,
                                                                                                               f1_macro,
                                                                                                               ck_score))
        for index_ in range(n_classes):
            print(
            "\t{} rhythm -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1 : {:1.4f} Accuracy: {:1.4f}".format(
                classes[index_],
                sensitivity[
                    index_],
                specificity[
                    index_], PPV[index_], f1[index_],
                acc[index_]))
        print(
        "\tAverage -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1-score: {:1.4f}, Accuracy: {:1.4f}".format(
            np.mean(sensitivity), np.mean(specificity), np.mean(PPV), np.mean(f1), np.mean(acc)))

        return acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all

    def count_prameters():
        print ('# of Params: ', np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]))

    # folds = [4,5,6,7]
    # # folds = [8,9,10,11]
    # # folds = [12,13,14,15]
    # # folds = [16,17,18,19] 
    # folds = [8]
    # for fold_idx in folds:
    for fold_idx in range(num_folds):
        start_time_fold_i = time.time()
        data_loader = SeqDataLoader(data_dir, num_folds, fold_idx, classes=classes)
        X_train, y_train, X_test, y_test = data_loader.load_data(seq_len=hparams.max_time_step)

        # preprocessing
        char2numY = dict(zip(classes, range(len(classes))))        
        pre_f1_macro = 0

        # <SOD> is a token to show start of decoding  and <EOD> is a token to indicate end of decoding
        char2numY['<SOD>'] = len(char2numY)
        char2numY['<EOD>'] = len(char2numY)
        num2charY = dict(zip(char2numY.values(), char2numY.keys()))


        # over-sampling: SMOTE:
        X_train = np.reshape(X_train,[X_train.shape[0]*X_train.shape[1],-1])
        y_train= y_train.flatten()

        if data_version == 2018:
            # extract just undersamples For 2018
            under_sample_len = 35000#30000
            Ws = np.where(y_train == char2numY['W'])[0]
            len_W = len(np.where(y_train == char2numY['W'])[0])
            permute = np.random.permutation(len_W)
            len_r = len_W - under_sample_len if (len_W - under_sample_len) > 0 else 0
            permute = permute[:len_r]
            y_train = np.delete(y_train,Ws[permute],axis =0)
            X_train = np.delete(X_train,Ws[permute],axis =0)

            under_sample_len = 35000 #40000
            N2s = np.where(y_train == char2numY['N2'])[0]
            len_N2 = len(np.where(y_train == char2numY['N2'])[0])
            permute = np.random.permutation(len_N2)
            len_r = len_N2 - under_sample_len if (len_N2 - under_sample_len) > 0 else 0
            permute = permute[:len_r]
            y_train = np.delete(y_train, N2s[permute],axis =0)
            X_train = np.delete(X_train, N2s[permute],axis =0)

        nums = []
        for cl in classes:
            nums.append(len(np.where(y_train == char2numY[cl])[0]))

        if (os.path.exists(traindata_dir) == False):
            os.mkdir(traindata_dir)
        fname = os.path.join(traindata_dir,'trainData_'+channel_ename+'_SMOTE_all_10s_f'+str(fold_idx)+'.npz')

        if (os.path.isfile(fname)):
            X_train, y_train,_ = data_loader.load_npz_file(fname)

        else:
            if data_version == 2013:
                n_osamples = nums[2] - 7000
                ratio = {0: n_osamples if nums[0] < n_osamples else nums[0], 1: n_osamples if nums[1] < n_osamples else nums[1],
                         2: nums[2], 3: n_osamples if nums[3] < n_osamples else nums[3], 4: n_osamples if nums[4] < n_osamples else nums[4]}


            if data_version==2018:
                ratio = {0: n_oversampling if nums[0] < n_oversampling else nums[0], 1: n_oversampling if nums[1] < n_oversampling else nums[1], 2: nums[2],
                     3: n_oversampling if nums[3] < n_oversampling else nums[3], 4: n_oversampling if nums[4] < n_oversampling else nums[4]}

            # ratio = {0: 40000 if nums[0] < 40000 else nums[0], 1: 27000 if nums[1] < 27000 else nums[1], 2: nums[2],
            #          3: 30000 if nums[3] < 30000 else nums[3], 4: 27000 if nums[4] < 27000 else nums[4]}
            sm = SMOTE(random_state=12,ratio=ratio)
            # sm = SMOTE(random_state=12, ratio=ratio)
            # sm = RandomUnderSampler(random_state=12,ratio=ratio)
            X_train, y_train = sm.fit_sample(X_train, y_train)
            data_loader.save_to_npz_file(X_train, y_train,data_loader.sampling_rate,fname)

        X_train = X_train[:(X_train.shape[0] // hparams.max_time_step) * hparams.max_time_step, :]
        y_train = y_train[:(X_train.shape[0] // hparams.max_time_step) * hparams.max_time_step]

        X_train = np.reshape(X_train,[-1,X_test.shape[1],X_test.shape[2]])
        y_train = np.reshape(y_train,[-1,y_test.shape[1],])

        # shuffle training data_2013
        permute = np.random.permutation(len(y_train))
        X_train = np.asarray(X_train)
        X_train = X_train[permute]
        y_train = y_train[permute]


        # add '<SOD>' to the beginning of each label sequence, and '<EOD>' to the end of each label sequence (both for training and test sets)
        y_train= [[char2numY['<SOD>']] + [y_ for y_ in date] + [char2numY['<EOD>']] for date in y_train]
        y_train = np.array(y_train)


        y_test= [[char2numY['<SOD>']] + [y_ for y_ in date] + [char2numY['<EOD>']] for date in y_test]
        y_test = np.array(y_test)

        print ('The training set after oversampling: ', classes)
        for cl in classes:
            print (cl, len(np.where(y_train==char2numY[cl])[0]))

        # training and testing the model
        if (os.path.exists(FLAGS.checkpoint_dir) == False):
            os.mkdir(FLAGS.checkpoint_dir)

        if (os.path.exists(output_dir) == False):
            os.makedirs(output_dir)
        loss_track = []
        with tf.Graph().as_default(), tf.Session() as sess:

            # Placeholders
            inputs = tf.placeholder(tf.float32, [None, hparams.max_time_step, hparams.input_depth], name='inputs')
            targets = tf.placeholder(tf.int32, (None, None), 'targets')
            dec_inputs = tf.placeholder(tf.int32, (None, None), 'decoder_inputs')
            keep_prob_ = tf.placeholder(tf.float32, name='keep')

            # model
            logits, pred_outputs, loss, optimizer,dec_states = build_whole_model(hparams,char2numY,inputs,targets, dec_inputs, keep_prob_)
            count_prameters()
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            saver = tf.train.Saver()
            print(str(datetime.now()))
            # ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx)
            ckpt_exist = False
            for file in os.listdir(FLAGS.checkpoint_dir):
                if file.startswith(ckpt_name):
                    ckpt_exist=True
            ckpt_name = os.path.join(FLAGS.checkpoint_dir, ckpt_name)

            # if ckpt and ckpt.model_checkpoint_path:
            # if os.path.isfile(ckpt_name):
            if ckpt_exist:
                # # Restore
                # ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                # saver.restore(session, os.path.join(checkpoint_dir, ckpt_name))
                # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir))

                saver.restore(sess, ckpt_name)

                # or 'load meta graph' and restore weights
                # saver = tf.train.import_meta_graph(ckpt_name+".meta")
                # saver.restore(session,tf.train.latest_checkpoint(checkpoint_dir))
                evaluate_model(hparams,X_test, y_test, classes)
            else:

                for epoch_i in range(hparams.epochs):
                    start_time = time.time()
                    # train_acc = []
                    y_true = []
                    y_pred  =[]
                    for batch_i, (source_batch, target_batch) in enumerate(batch_data(X_train, y_train, hparams.batch_size)):

                        # _, batch_loss, batch_logits, alignments_alphas = sess.run([optimizer, loss, logits,dec_states.alignment_history.stack()],
                        #     feed_dict = {inputs: source_batch,
                        #                  dec_inputs: target_batch[:, :-1],
                        #                  targets: target_batch[:, 1:],keep_prob_: 0.5} #,
                        #                                        )

                        _, batch_loss, batch_logits = sess.run([optimizer, loss, logits],
                            feed_dict = {inputs: source_batch,
                                         dec_inputs: target_batch[:, :-1],
                                         targets: target_batch[:, 1:],keep_prob_: 0.5} #,
                                                               )
                        loss_track.append(batch_loss)
                        # alignments_alphas = alignments_alphas.transpose((1, 0, 2))
                        # alignments_alphas = alignments_alphas[:, :hparams.max_time_step]
                        # train_acc.append(batch_logits.argmax(axis=-1) == target_batch[:,1:])
                        y_pred_ = batch_logits[:, :hparams.max_time_step].argmax(axis=-1)
                        y_true_ = target_batch[:, 1:-1]

                        # input_tags - word representation of input sequence, use None to skip
                        # output_tags - word representation of output sequence, use None to skip
                        # i - index of input element in batch
                        # input_tags = [[num2charY[i] for i in seq] for seq in y_true_]
                        # output_tags = [[num2charY[i] for i in seq] for seq in y_pred_]
                        # plot_attention(alignments_alphas[1, :, :], input_tags[1], output_tags[1])

                        y_true.extend(y_true_)
                        y_pred.extend(y_pred_)
                    # accuracy = np.mean(train_acc)
                    y_true = np.asarray(y_true)
                    y_pred = np.asarray(y_pred)
                    y_true = y_true.flatten()
                    y_pred = y_pred.flatten()
                    n_examples = len(y_true)
                    cm = confusion_matrix(y_true, y_pred,labels=range(len(char2numY)-2))
                    accuracy = np.mean(y_true == y_pred)
                    mf1 = f1_score(y_true, y_pred, average="macro")
                    ck_score = cohen_kappa_score(y_true, y_pred)

                    print('Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} F1-score: {:>6.4f} Cohen\'s Kappa: {:>6.4f} Epoch duration: {:>6.3f}s'.format(epoch_i, np.mean(batch_loss),
                                                                                      accuracy,mf1,ck_score, time.time() - start_time))
                    if (epoch_i+1)%hparams.test_step==0:
                        acc_avg, f1_macro,ck_score, y_true, y_pred,alignments_alphas_all = evaluate_model(hparams,X_test, y_test,classes)

                        if np.nan_to_num(f1_macro) > pre_f1_macro: # save the better model based on the f1 score
                            print('Loss {:.4f} after {} epochs (batch_size={})'.format(loss_track[-1], epoch_i + 1,
                                                                                       hparams.batch_size))
                            pre_f1_macro = f1_macro
                            ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx)
                            save_path = os.path.join(FLAGS.checkpoint_dir, ckpt_name)
                            saver.save(sess, save_path)
                            print("The best model (till now) saved in path: %s" % save_path)

                            # Save
                            save_dict = {
                                "y_true": y_true,
                                "y_pred": y_pred,
                                "ck_score": ck_score,
                                "alignments_alphas_all":alignments_alphas_all[:200],# we save just the first 200 batch results because it is so huge
                                }
                            filename = "output_"+channel_ename+"_fold{:02d}.npz".format(fold_idx)
                            save_path = os.path.join(output_dir, filename)
                            np.savez(save_path, **save_dict)
                            print("The best results (till now) saved in path: %s" % save_path)


                # plt.plot(loss_track)
                # plt.show()
                # print 'Classes: ', classes

            print(str(datetime.now()))
            print ('Fold{} took: {:>6.3f}s'.format(fold_idx, time.time()-start_time_fold_i))

def main(args=None):

    FLAGS = tf.app.flags.FLAGS

    # outputs_eeg_fpz_cz
    tf.app.flags.DEFINE_string('data_dir', 'data_2013/eeg_fpz_cz',
                               """Directory where to load training data_2013.""")
    tf.app.flags.DEFINE_string('output_dir', 'outputs_2013/outputs_eeg_fpz_cz',
                               """Directory where to save trained models """
                               """and outputs.""")
    tf.app.flags.DEFINE_integer('num_folds', 20,
                                """Number of cross-validation folds.""")
    tf.app.flags.DEFINE_list('classes', ['W', 'N1', 'N2', 'N3', 'REM'],  """classes""")
    tf.app.flags.DEFINE_string('checkpoint_dir', 'checkpoints-seq2seq-sleep-EDF', """Directory to save checkpoints""")
    # tf.app.flags.DEFINE_string('ckpt_name', 'seq2seq_sleep.ckpt',"""Check point name""")

    # hyperparameters
    hparams = tf.contrib.training.HParams(
        epochs=120,  # 300
        batch_size=20,  # 10
        num_units=128,
        embed_size=10,
        input_depth=3000,
        n_channels=100,
        bidirectional=False,
        use_attention=True,
        lstm_layers=2,
        attention_size=64,
        beam_width=4,
        use_beamsearch_decode=False,
        max_time_step=10,  # 5 3 second best 10# 40 # 100
        output_max_length=10 + 2,  # max_time_step +1
        akara2017=True,
        test_step=5,  # each 10 epochs
    )
    # classes = ['W', 'N1', 'N2', 'N3', 'REM']
    run_program(hparams,FLAGS)
if __name__ == "__main__":
     tf.app.run()