From 1f4c82cd1f568583b84f4f839e6dd2e4ad598325 Mon Sep 17 00:00:00 2001
From: Arsenal Wang
Date: Sun, 23 Jun 2019 16:23:31 +0800
Subject: [PATCH] Initial commit

---
 create_data_to_train.py | 151 +++++++++++++++
 model.py                | 393 ++++++++++++++++++++++++++++++++++++++++
 read_tfrecords.py       | 119 ++++++++++++
 test.py                 |  69 +++++++
 test_create.py          |  37 ++++
 train.py                | 239 ++++++++++++++++++++++++
 utils.py                | 155 ++++++++++++++++
 7 files changed, 1163 insertions(+)
 create mode 100755 create_data_to_train.py
 create mode 100755 model.py
 create mode 100755 read_tfrecords.py
 create mode 100755 test.py
 create mode 100755 test_create.py
 create mode 100755 train.py
 create mode 100755 utils.py

diff --git a/create_data_to_train.py b/create_data_to_train.py
new file mode 100755
index 0000000..1009669
--- /dev/null
+++ b/create_data_to_train.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+import os
+import tensorflow as tf
+import argparse
+from utils import GENRES, load_track, get_default_shape, load_track_with_aug
+import numpy as np
+from tqdm import tqdm
+import logging
+
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+each_genres_num = 100
+genres_num = len(GENRES)
+
+
+def create_tfrecords_default(data_path, train_path, test_path, test_size=.2, aug=10):
+    writer_train = tf.python_io.TFRecordWriter(train_path)  # writes the training set as a tfrecord file
+    writer_test = tf.python_io.TFRecordWriter(test_path)  # writes the test set as a tfrecord file
+
+    test_size_per_genres = int(each_genres_num * test_size)
+
+    default_shape = get_default_shape(data_path)  # (647, 128)
+    total_data = genres_num * aug * each_genres_num
+    with tqdm(desc='creating===>>', total=total_data) as pbar:
+        for index, name in enumerate(GENRES):
+            audio_list = [os.path.join(data_path, name + '/' + audio)
+                          for audio in os.listdir(os.path.join(data_path, name))
+                          if audio.endswith('au')]
+
+            np.random.shuffle(audio_list)
+            train_data = audio_list[:-test_size_per_genres]
+            test_data = audio_list[-test_size_per_genres:]
+
+            # train data
+            for _, audio in enumerate(train_data):
+                au, _ = load_track(audio, default_shape)
+                au_flatten = au.flatten()
+
+                example = tf.train.Example(features=tf.train.Features(feature={
+                    "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                    'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten))
+                }))
+                writer_train.write(example.SerializeToString())  # serialize to a string
+
+                pbar.update(1)  # one tick per written example
+
+                # data augmentation
+                if aug > 1:
+                    for i in range(aug - 1):
+                        au_aug, _ = load_track_with_aug(audio, default_shape)
+                        au_flatten_aug = au_aug.flatten()
+
+                        example_aug = tf.train.Example(features=tf.train.Features(feature={
+                            "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                            'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten_aug))
+                        }))
+                        writer_train.write(example_aug.SerializeToString())  # serialize to a string
+
+                        pbar.update(1)
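+
+            # Each training clip above is written once as-is plus (aug - 1)
+            # augmented copies; load_track_with_aug adds white noise, a random
+            # pitch shift and a random volume change (see utils.py).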
+
+            # test data
+            for _, audio in enumerate(test_data):
+                au, _ = load_track(audio, default_shape)
+                au_flatten = au.flatten()
+
+                example = tf.train.Example(features=tf.train.Features(feature={
+                    "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                    'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten))
+                }))
+                writer_test.write(example.SerializeToString())  # serialize to a string
+
+                pbar.update(1)
+
+                # data augmentation
+                if aug > 1:
+                    for i in range(aug - 1):
+                        au_aug, _ = load_track_with_aug(audio, default_shape)
+                        au_flatten_aug = au_aug.flatten()
+
+                        example_aug = tf.train.Example(features=tf.train.Features(feature={
+                            "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                            'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten_aug))
+                        }))
+                        writer_test.write(example_aug.SerializeToString())  # serialize to a string
+
+                        pbar.update(1)
+
+    writer_train.close()
+    writer_test.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default='GTZAN/genres',
+        help='data_sets path.'
+    )
+
+    parser.add_argument(
+        '--train_path',
+        type=str,
+        default='tfrecords/train.tfrecords',
+        help='train tfrecords save path.'
+    )
+
+    parser.add_argument(
+        '--test_path',
+        type=str,
+        default='tfrecords/test.tfrecords',
+        help='test tfrecords save path.'
+    )
+
+    parser.add_argument(
+        '--test_size',
+        type=float,
+        default=.2,
+        help='Proportion of test data; a float in [0, 1].'
+    )
+
+    parser.add_argument(
+        '--aug',
+        type=int,
+        default=10,
+        help='Grow the data set to (arg) times its original size; 1 means no augmentation.'
+    )
+    args = parser.parse_args()
+    test_size = args.test_size
+    data_path = args.data_path
+    train_path = args.train_path
+    test_path = args.test_path
+    aug = max(args.aug, 1)
+
+    logger.info('\nThe following parameters will be applied for data creation:\n')
+    logger.info("data_sets path: {}".format(data_path))
+    logger.info("train tfrecords save path: {}".format(train_path))
+    logger.info("test tfrecords save path: {}".format(test_path))
+    logger.info("Proportion of test data: {}".format(test_size))
+    logger.info("the data set will grow to {} times its original size.".format(aug))
+
+    create_tfrecords_default(data_path, train_path, test_path, test_size, aug)
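+
+# Example invocation (a sketch; the paths follow the argparse defaults above):
+#   python create_data_to_train.py --data_path GTZAN/genres \
+#       --train_path tfrecords/train.tfrecords \
+#       --test_path tfrecords/test.tfrecords --test_size 0.2 --aug 10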
diff --git a/model.py b/model.py
new file mode 100755
index 0000000..c39e226
--- /dev/null
+++ b/model.py
@@ -0,0 +1,393 @@
+
+"""Contains the definition of the Inception Resnet V2 architecture.
+
+As described in http://arxiv.org/abs/1602.07261.
+
+  Inception-v4, Inception-ResNet and the Impact of Residual Connections
+  on Learning
+  Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import tensorflow as tf
+
+import tensorflow.contrib.slim as slim
+
+
+def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 35x35 resnet block."""
+  with tf.variable_scope(scope, 'Block35', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 32, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 32, 3, scope='Conv2d_0b_3x3')
+    with tf.variable_scope('Branch_2'):
+      tower_conv2_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv2_1 = slim.conv2d(tower_conv2_0, 48, 3, scope='Conv2d_0b_3x3')
+      tower_conv2_2 = slim.conv2d(tower_conv2_1, 64, 3, scope='Conv2d_0c_3x3')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_1, tower_conv2_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block17(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 17x17 resnet block."""
+  with tf.variable_scope(scope, 'Block17', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 128, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 160, [1, 7],
+                                  scope='Conv2d_0b_1x7')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 192, [7, 1],
+                                  scope='Conv2d_0c_7x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block8(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 8x8 resnet block."""
+  with tf.variable_scope(scope, 'Block8', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 192, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 224, [1, 3],
+                                  scope='Conv2d_0b_1x3')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 256, [3, 1],
+                                  scope='Conv2d_0c_3x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def inception_resnet_v2_base(inputs,
+                             final_endpoint='Conv2d_7b_1x1',
+                             output_stride=16,
+                             align_feature_maps=False,
+                             scope=None,
+                             activation_fn=tf.nn.relu):
+  """Inception model from http://arxiv.org/abs/1602.07261.
+
+  Constructs an Inception Resnet v2 network from inputs to the given final
+  endpoint. This method can construct the network up to the final inception
+  block Conv2d_7b_1x1.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
+      'Mixed_5b', 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
+    output_stride: A scalar that specifies the requested ratio of input to
+      output spatial resolution. Only supports 8 and 16.
+    align_feature_maps: When true, changes all the VALID paddings in the
+      network to SAME padding so that the feature maps are aligned.
+    scope: Optional variable_scope.
+    activation_fn: Activation function for block scopes.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a set of activations for external use, for example summaries
+      or losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+      or if the output_stride is not 8 or 16, or if the output_stride is 8 and
+      we request an end point after 'PreAuxLogits'.
+  """
+  if output_stride != 8 and output_stride != 16:
+    raise ValueError('output_stride must be 8 or 16.')
+
+  padding = 'SAME' if align_feature_maps else 'VALID'
+
+  end_points = {}
+
+  def add_and_check_final(name, net):
+    end_points[name] = net
+    return name == final_endpoint
+
+  with tf.variable_scope(scope, 'InceptionResnetV2', [inputs]):
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='SAME'):
+      # 149 x 149 x 32
+      net = slim.conv2d(inputs, 32, 3, stride=2, padding=padding,
+                        scope='Conv2d_1a_3x3')
+      if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points
+
+      # 147 x 147 x 32
+      net = slim.conv2d(net, 32, 3, padding=padding,
+                        scope='Conv2d_2a_3x3')
+      if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points
+      # 147 x 147 x 64
+      net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
+      if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points
+      # 73 x 73 x 64
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_3a_3x3')
+      if add_and_check_final('MaxPool_3a_3x3', net): return net, end_points
+      # 73 x 73 x 80
+      net = slim.conv2d(net, 80, 1, padding=padding,
+                        scope='Conv2d_3b_1x1')
+      if add_and_check_final('Conv2d_3b_1x1', net): return net, end_points
+      # 71 x 71 x 192
+      net = slim.conv2d(net, 192, 3, padding=padding,
+                        scope='Conv2d_4a_3x3')
+      if add_and_check_final('Conv2d_4a_3x3', net): return net, end_points
+      # 35 x 35 x 192
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_5a_3x3')
+      if add_and_check_final('MaxPool_5a_3x3', net): return net, end_points
+
+      # 35 x 35 x 320
+      with tf.variable_scope('Mixed_5b'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5,
+                                      scope='Conv2d_0b_5x5')
+        with tf.variable_scope('Branch_2'):
+          tower_conv2_0 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3,
+                                      scope='Conv2d_0c_3x3')
+        with tf.variable_scope('Branch_3'):
+          tower_pool = slim.avg_pool2d(net, 3, stride=1, padding='SAME',
+                                       scope='AvgPool_0a_3x3')
+          tower_pool_1 = slim.conv2d(tower_pool, 64, 1,
+                                     scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            [tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1], 3)
+
+      if add_and_check_final('Mixed_5b', net): return net, end_points
+      # TODO(alemi): Register intermediate endpoints
+      net = slim.repeat(net, 10, block35, scale=0.17,
+                        activation_fn=activation_fn)
+
+      # 17 x 17 x 1088 if output_stride == 8,
+      # 33 x 33 x 1088 if output_stride == 16
+      use_atrous = output_stride == 8
+
+      with tf.variable_scope('Mixed_6a'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 384, 3, stride=1 if use_atrous else 2,
+                                   padding=padding,
+                                   scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3,
+                                      stride=1 if use_atrous else 2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_2'):
+          tower_pool = slim.max_pool2d(net, 3, stride=1 if use_atrous else 2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_6a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      with slim.arg_scope([slim.conv2d], rate=2 if use_atrous else 1):
+        net = slim.repeat(net, 20, block17, scale=0.10,
+                          activation_fn=activation_fn)
+      if add_and_check_final('PreAuxLogits', net): return net, end_points
+
+      if output_stride == 8:
+        # TODO(gpapan): Properly support output_stride for the rest of the net.
+        raise ValueError('output_stride==8 is only supported up to the '
+                         'PreAuxLogits end_point for now.')
+
+      # 8 x 8 x 2080
+      with tf.variable_scope('Mixed_7a'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
+                                     padding=padding,
+                                     scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_2'):
+          tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_3'):
+          tower_pool = slim.max_pool2d(net, 3, stride=2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat(
+            [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_7a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      net = slim.repeat(net, 9, block8, scale=0.20, activation_fn=activation_fn)
+      net = block8(net, activation_fn=None)
+
+      # 8 x 8 x 1536
+      net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
+      if add_and_check_final('Conv2d_7b_1x1', net): return net, end_points
+
+    raise ValueError('final_endpoint (%s) not recognized' % final_endpoint)
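+
+
+# Minimal usage sketches (comments only; `inputs` is any 4-D float tensor):
+#   1) just the trunk, up to an endpoint from the list in the docstring above:
+#      net, end_points = inception_resnet_v2_base(inputs,
+#                                                 final_endpoint='PreAuxLogits')
+#   2) the full classifier wrapped in its arg_scope (defined at the end of
+#      this file; note that train.py in this repo calls it directly instead):
+#      with slim.arg_scope(inception_resnet_v2_arg_scope()):
+#          logits, end_points = inception_resnet_v2(inputs, num_classes=10)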
+ """ + end_points = {} + + with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + + net, end_points = inception_resnet_v2_base(inputs, scope=scope, + activation_fn=activation_fn) + + if create_aux_logits and num_classes: + with tf.variable_scope('AuxLogits'): + aux = end_points['PreAuxLogits'] + aux = slim.avg_pool2d(aux, 5, stride=3, padding='VALID', + scope='Conv2d_1a_3x3') + aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1') + aux = slim.conv2d(aux, 768, aux.get_shape()[1:3], + padding='VALID', scope='Conv2d_2a_5x5') + aux = slim.flatten(aux) + aux = slim.fully_connected(aux, num_classes, activation_fn=None, + scope='Logits') + end_points['AuxLogits'] = aux + + with tf.variable_scope('Logits'): + # TODO(sguada,arnoegw): Consider adding a parameter global_pool which + # can be set to False to disable pooling here (as in resnet_*()). + kernel_size = net.get_shape()[1:3] + if kernel_size.is_fully_defined(): + net = slim.avg_pool2d(net, kernel_size, padding='VALID', + scope='AvgPool_1a_8x8') + else: + net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool') + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.flatten(net) + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='Dropout') + end_points['PreLogitsFlatten'] = net + logits = slim.fully_connected(net, num_classes, activation_fn=None, + scope='Logits') + end_points['Logits'] = logits + end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions') + + return logits, end_points +inception_resnet_v2.default_image_size = 299 + + +def inception_resnet_v2_arg_scope( + weight_decay=0.00004, + batch_norm_decay=0.9997, + batch_norm_epsilon=0.001, + activation_fn=tf.nn.relu, + batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS, + batch_norm_scale=False): + """Returns the scope with the default parameters for inception_resnet_v2. + + Args: + weight_decay: the weight decay for weights variables. + batch_norm_decay: decay for the moving average of batch_norm momentums. + batch_norm_epsilon: small float added to variance to avoid dividing by zero. + activation_fn: Activation function for conv2d. + batch_norm_updates_collections: Collection for the update ops for + batch norm. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + + Returns: + a arg_scope with the parameters needed for inception_resnet_v2. + """ + # Set weight_decay for weights in conv2d and fully_connected layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_regularizer=slim.l2_regularizer(weight_decay)): + + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'updates_collections': batch_norm_updates_collections, + 'fused': None, # Use fused batch norm if possible. + 'scale': batch_norm_scale, + } + # Set activation_fn and parameters for batch_norm. 
+    with slim.arg_scope([slim.conv2d], activation_fn=activation_fn,
+                        normalizer_fn=slim.batch_norm,
+                        normalizer_params=batch_norm_params) as scope:
+      return scope
\ No newline at end of file
diff --git a/read_tfrecords.py b/read_tfrecords.py
new file mode 100755
index 0000000..2875977
--- /dev/null
+++ b/read_tfrecords.py
@@ -0,0 +1,119 @@
+import tensorflow as tf
+import numpy as np
+import cv2 as cv
+from utils import get_record_dataset
+slim = tf.contrib.slim
+
+
+# for serialized_example in tf.python_io.tf_record_iterator("train.tfrecords"):
+#     example = tf.train.Example()
+#     example.ParseFromString(serialized_example)
+#
+#     image = example.features.feature['img_raw'].float_list.value
+#     label = example.features.feature['label'].int64_list.value
+#     # some preprocessing could be done here
+#     print(image, label)
+
+# def read_and_decode(filename):
+#     # build a queue from the file names
+#     filename_queue = tf.train.string_input_producer([filename])
+#
+#     reader = tf.TFRecordReader()
+#     _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+#     features = tf.parse_single_example(serialized_example,
+#                                        features={
+#                                            'label': tf.FixedLenFeature([], tf.int64),
+#                                            'img_raw': tf.VarLenFeature(tf.float32),
+#                                        })
+#
+#     # img = tf.decode_raw(features['img_raw'], tf.float32)
+#     img = features['img_raw']
+#     img = tf.sparse_tensor_to_dense(img)
+#     img = tf.reshape(img, [224, 224, 3])
+#     # img = tf.cast(tf.reshape(img, [224, 224, 3]) * 255, tf.uint8)
+#     img = tf.cast(img * 255, tf.uint8)
+#     label = tf.cast(features['label'], tf.int32)
+#
+#     return img, label
+#
+#
+# img, label = read_and_decode("train.tfrecords")
+#
+# # shuffle_batch randomly shuffles the input
+# img_batch, label_batch = tf.train.shuffle_batch([img, label],
+#                                                 batch_size=1, capacity=10,
+#                                                 min_after_dequeue=2)
+# init = tf.initialize_all_variables()
+#
+# with tf.Session() as sess:
+#     sess.run(init)
+#     threads = tf.train.start_queue_runners(sess=sess)
+#     for i in range(2):
+#         val, l = sess.run([img_batch, label_batch])
+#         val = np.reshape(val, (224, 224, 3))
+#         cv.imshow('1', val)
+#         cv.waitKey()
+#         print(val.shape, l)
+
+
+# def read_and_decode(filename):
+#     # build a queue from the file names
+#     filename_queue = tf.train.string_input_producer([filename])
+#
+#     reader = tf.TFRecordReader()
+#     _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+#     features = tf.parse_single_example(serialized_example,
+#                                        features={
+#                                            'label': tf.FixedLenFeature([], tf.int64),
+#                                            'img_raw': tf.VarLenFeature(tf.float32),
+#                                        })
+#
+#     # img = tf.decode_raw(features['img_raw'], tf.float32)
+#     img = features['img_raw']
+#     img = tf.sparse_tensor_to_dense(img)
+#     img = tf.reshape(img, [647, 128])
+#     # img = tf.cast(tf.reshape(img, [224, 224, 3]) * 255, tf.uint8)
+#     # img = tf.cast(img * 255, tf.uint8)
+#     label = tf.cast(features['label'], tf.int32)
+#
+#     return img, label
+#
+#
+# img, label = read_and_decode("train.tfrecords")
+#
+# # shuffle_batch randomly shuffles the input
+# img_batch, label_batch = tf.train.shuffle_batch([img, label],
+#                                                 batch_size=1, capacity=10,
+#                                                 min_after_dequeue=2)
+# init = tf.initialize_all_variables()
+#
+# with tf.Session() as sess:
+#     sess.run(init)
+#     threads = tf.train.start_queue_runners(sess=sess)
+#     for i in range(2):
+#         val, l = sess.run([img_batch, label_batch])
+#         print(val, l)
+
+
+dataset = get_record_dataset('tfrecords/data.tfrecords', num_samples=1000,
+                             num_classes=10)
+data_provider = slim.dataset_data_provider.DatasetDataProvider(dataset)
+label, image = data_provider.get(['genres', 'au_flattern'])
+
+inputs, labels = tf.train.batch([image, label],
+                                batch_size=1,
+                                # capacity=5*FLAGS.batch_size,
+                                allow_smaller_final_batch=True)
+
+# print the static and dynamic shapes of the current tensors, to compare
+# with the other reading approach
+
+init = tf.global_variables_initializer()
+
+with tf.Session() as sess:
+    sess.run(init)
+    threads = tf.train.start_queue_runners(sess=sess)
+    for i in range(2):
+        val, l = sess.run([inputs, labels])
+        print(val, l)
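+
+# A minimal sketch of the static-vs-dynamic shape comparison mentioned above:
+# print(inputs.get_shape())            # static shape, known at graph-build time
+# print(sess.run(tf.shape(inputs)))    # dynamic shape, evaluated at run time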
diff --git a/test.py b/test.py
new file mode 100755
index 0000000..8075ebe
--- /dev/null
+++ b/test.py
@@ -0,0 +1,69 @@
+from model import inception_resnet_v2
+import tensorflow as tf
+from utils import read_and_decode
+import numpy as np
+import logging
+from tqdm import tqdm
+
+
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+model_path = './models/'
+test_data = './tfrecords/test.tfrecords'
+
+tf.reset_default_graph()
+
+au_test, label_test = read_and_decode(test_data)
+
+au_test_batch, label_test_batch = tf.train.shuffle_batch([au_test, label_test],
+                                                         batch_size=1,
+                                                         num_threads=16,
+                                                         capacity=800 + 3,
+                                                         min_after_dequeue=800,
+                                                         )
+
+input_ = tf.placeholder(tf.float32, [None, 647, 128, 1])
+
+logits_, _ = inception_resnet_v2(input_, is_training=False, dropout_keep_prob=1, create_aux_logits=False)
+
+with tf.Session() as sess:
+
+    saver = tf.train.Saver()
+
+    saver.restore(sess, "models/inception_resnet_v2_iteration_9999.ckpt")
+
+    coord = tf.train.Coordinator()
+
+    threads = tf.train.start_queue_runners(coord=coord)
+
+    N = 1000
+    top1 = 0
+    top3 = 0
+    for i in tqdm(range(N)):
+        data, labels = sess.run([au_test_batch, label_test_batch])
+
+        logits = sess.run(logits_, feed_dict={input_: data}).ravel()
+
+        max_index = np.argsort(-logits)  # class indices sorted by descending score
+
+        predict = np.argmax(logits)
+        if predict == int(labels):
+            top1 += 1
+        if int(labels) in max_index[:3]:
+            top3 += 1
+
+    logger.info("top1: {:.2f}%".format(top1 / N * 100))
+    logger.info("top3: {:.2f}%".format(top3 / N * 100))
+
+    coord.request_stop()
+    coord.join(threads)
diff --git a/test_create.py b/test_create.py
new file mode 100755
index 0000000..3efa3d8
--- /dev/null
+++ b/test_create.py
@@ -0,0 +1,37 @@
+import os
+import tensorflow as tf
+import cv2 as cv
+from utils import load_track
+
+# writer = tf.python_io.TFRecordWriter("train.tfrecords")
+# path = '/Users/wangdong/Documents/DPED-master/dped/dped/blackberry/test_data/full_size_test_images/'
+# for index in range(29):
+#     img_name = path + '{}.jpg'.format(index)
+#
+#     img = cv.imread(img_name)
+#     img = cv.resize(img, (224, 224))
+#     img = img.flatten()
+#
+#     img = img / 256
+#     print(img.shape)
+#     # img_raw = img.tobytes()  # convert the image to raw bytes
+#     example = tf.train.Example(features=tf.train.Features(feature={
+#         "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+#         'img_raw': tf.train.Feature(float_list=tf.train.FloatList(value=img))
+#     }))
+#     writer.write(example.SerializeToString())  # serialize to a string
+# writer.close()
+
+writer = tf.python_io.TFRecordWriter("train.tfrecords")
+tmp_features, _ = load_track(os.path.join('GTZAN/genres', 'blues/blues.00000.au'))
+print(tmp_features.shape)
+data = tmp_features.flatten()
+
+print(data)
+# img_raw = img.tobytes()  # convert the image to raw bytes
+example = tf.train.Example(features=tf.train.Features(feature={
+    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[12])),
+    'img_raw': tf.train.Feature(float_list=tf.train.FloatList(value=data))
+}))
+writer.write(example.SerializeToString())  # serialize to a string
+writer.close()
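+
+# To sanity-check the record just written, it can be read back (a sketch):
+# for serialized in tf.python_io.tf_record_iterator("train.tfrecords"):
+#     example = tf.train.Example.FromString(serialized)
+#     print(example.features.feature['label'].int64_list.value)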
diff --git a/train.py b/train.py
new file mode 100755
index 0000000..10d0840
--- /dev/null
+++ b/train.py
@@ -0,0 +1,239 @@
+import tensorflow as tf
+from utils import read_and_decode
+from model import inception_resnet_v2
+import argparse
+from tqdm import tqdm
+import os
+import logging
+
+slim = tf.contrib.slim
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def train(train_data_,
+          decay_rate_,
+          global_steps_,
+          decay_steps_,
+          batch_size_,
+          learning_rate_,
+          eval_step_,
+          model_path_,
+          summary_path_,
+          load_model_):
+
+    if not os.path.exists(model_path_):
+        os.mkdir(model_path_)
+
+    if not os.path.exists(summary_path_):
+        os.mkdir(summary_path_)
+
+    if not os.path.exists('log'):  # the training log below is written to log/train_log.log
+        os.mkdir('log')
+
+    graph = tf.Graph()
+
+    with graph.as_default():
+
+        au_train, label_train = read_and_decode(train_data_)
+
+        min_fraction_of_examples_in_queue = 0.4
+        test_size = .2
+        aug = 10
+        total_examples = 1000 * aug
+        min_queue_examples_train = int(total_examples * (1 - test_size) * min_fraction_of_examples_in_queue)
+
+        au_train_batch, label_train_batch = tf.train.shuffle_batch([au_train, label_train],
+                                                                   batch_size=batch_size_,
+                                                                   num_threads=16,
+                                                                   capacity=min_queue_examples_train + 3 * batch_size_,
+                                                                   min_after_dequeue=min_queue_examples_train,
+                                                                   )
+
+        label_train_batch_ = tf.one_hot(tf.squeeze(label_train_batch), 10, 1, 0)
+
+        logits, end_points = inception_resnet_v2(au_train_batch)
+
+        if 'AuxLogits' in end_points:
+            slim.losses.softmax_cross_entropy(
+                end_points['AuxLogits'], label_train_batch_, weights=0.4, scope='aux_loss')
+
+        slim.losses.softmax_cross_entropy(
+            logits, label_train_batch_, weights=1.0, scope='base_loss')
+
+        total_loss = slim.losses.get_total_loss()
+
+        tf.summary.scalar('loss', total_loss)
+
+        global_ = tf.Variable(tf.constant(0), trainable=False)
+
+        lr = tf.train.exponential_decay(learning_rate_, global_, decay_steps_, decay_rate_, staircase=True)
+
+        tf.summary.scalar('lr', lr)
+
+        optimizer = tf.train.AdamOptimizer(lr).minimize(total_loss)
+
+        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(label_train_batch_, 1))
+
+        accuracy_ = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+        tf.summary.scalar('accuracy', accuracy_)
+
+        saver = tf.train.Saver(max_to_keep=5)
+
+        merged = tf.summary.merge_all()
+
+        writer = tf.summary.FileWriter(summary_path_, graph=graph)
+
+        init = tf.global_variables_initializer()
+
+        with tf.Session() as sess, open('log/train_log.log', 'w') as log:
+
+            if load_model_:
+                checkpoint = tf.train.get_checkpoint_state(model_path_)
+
+                meta_graph_path = checkpoint.model_checkpoint_path + ".meta"
+
+                restore = tf.train.import_meta_graph(meta_graph_path)
+
+                restore.restore(sess, tf.train.latest_checkpoint(model_path_))
+
+                step = int(meta_graph_path.split("_")[-1].split(".")[0])
+            else:
+                sess.run(init)
+                step = 0
+
+            coord = tf.train.Coordinator()
+
+            threads = tf.train.start_queue_runners(coord=coord)
+            try:
+                for i in tqdm(range(step, global_steps_)):
+
+                    acc, loss, train_summary, _ = sess.run([accuracy_, total_loss, merged, optimizer],
+                                                           feed_dict={global_: i})
+                    print("steps:{} train loss :{:.2f}, accuracy: {:.2f}".format(i, loss, acc), file=log)
+                    log.flush()
+
+                    if (i + 1) % eval_step_ == 0:
+                        saver.save(sess, '{}/inception_resnet_v2_iteration_{}.ckpt'.format(model_path_, i))
+                        writer.add_summary(train_summary, i)
+
+            except KeyboardInterrupt:
+                logger.exception('Interrupted')
+                coord.request_stop()
+            except Exception as e:
+                logger.exception(e)
+                coord.request_stop(e)
+            finally:
+                logger.info("Model saved in file: %s" % model_path_)
+                # When done, ask the threads to stop.
+                coord.request_stop()
+                coord.join(threads)
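+
+# Note on the schedule above: tf.train.exponential_decay with staircase=True
+# yields lr = learning_rate * decay_rate ** floor(global_step / decay_steps);
+# with the defaults (1e-4, 0.9, 100), the rate after 1000 steps is
+# 1e-4 * 0.9 ** 10, roughly 3.5e-5.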
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--train_data',
+        type=str,
+        default='./tfrecords/train.tfrecords',
+        help='train_data path.'
+    )
+
+    parser.add_argument(
+        '--decay_rate',
+        type=float,
+        default=0.9,
+        help='learning rate decay rate.'
+    )
+
+    parser.add_argument(
+        '--global_steps',
+        type=int,
+        default=10000,
+        help='global steps.'
+    )
+
+    parser.add_argument(
+        '--decay_steps',
+        type=int,
+        default=100,
+        help='learning rate decay steps.'
+    )
+
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=1e-4,
+        help='learning rate.'
+    )
+    parser.add_argument(
+        '--eval_step',
+        type=int,
+        default=100,
+        help='evaluation steps.'
+    )
+
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help='batch size.'
+    )
+
+    parser.add_argument(
+        '--model_path',
+        type=str,
+        default='models/',
+        help='tensorflow model path.'
+    )
+
+    parser.add_argument(
+        '--summary_path',
+        type=str,
+        default='summary/',
+        help='tensorflow summary path.'
+    )
+
+    parser.add_argument(
+        '--load_model',
+        action='store_true',  # type=bool would parse any non-empty string, even "False", as True
+        help='whether you wish to continue training from the latest checkpoint.'
+    )
+
+    args = parser.parse_args()
+
+    train_data = args.train_data
+    decay_rate = args.decay_rate
+    global_steps = args.global_steps  # total number of training iterations
+    decay_steps = args.decay_steps  # steps between learning-rate decays
+    learning_rate = args.learning_rate
+    eval_step = args.eval_step
+    summary_path = args.summary_path
+    model_path = args.model_path
+    load_model = args.load_model
+    batch_size = args.batch_size
+
+    logger.info('\nThe following parameters will be applied for training:\n')
+    logger.info('train_data path: {}.'.format(train_data))
+    logger.info("learning rate decay rate: {}".format(decay_rate))
+    logger.info("global steps: {}".format(global_steps))
+    logger.info("learning rate decay steps: {}.".format(decay_steps))
+    logger.info('batch size: {}.'.format(batch_size))
+    logger.info('learning rate: {}.'.format(learning_rate))
+    logger.info('evaluation steps: {}.'.format(eval_step))
+    logger.info('tensorflow model path: {}.'.format(model_path))
+    logger.info('tensorflow summary path: {}.'.format(summary_path))
+    logger.info('whether you wish to continue training: {}.'.format(load_model))
+
+    train(train_data,
+          decay_rate,
+          global_steps,
+          decay_steps,
+          batch_size,
+          learning_rate,
+          eval_step,
+          model_path,
+          summary_path,
+          load_model)
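+
+# Example invocation (a sketch using the defaults above):
+#   python train.py --batch_size 50 --global_steps 10000 --learning_rate 1e-4
+# Training curves (loss, lr, accuracy) can then be inspected with:
+#   tensorboard --logdir summary/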
diff --git a/utils.py b/utils.py
new file mode 100755
index 0000000..dbb0304
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,155 @@
+import numpy as np
+import librosa as lbr
+import tensorflow as tf
+import os
+
+slim = tf.contrib.slim
+
+GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
+          'pop', 'reggae', 'rock']
+
+WINDOW_SIZE = 2048
+WINDOW_STRIDE = WINDOW_SIZE // 2
+N_MELS = 128
+MEL_KWARGS = {
+    'n_fft': WINDOW_SIZE,
+    'hop_length': WINDOW_STRIDE,
+    'n_mels': N_MELS
+}
+
+
+def get_default_shape(dataset_path):
+    tmp_features, _ = load_track(os.path.join(dataset_path,
+                                              'blues/blues.00000.au'))
+    return tmp_features.shape
+
+
+def load_track(filename, enforce_shape=None):
+    new_input, sample_rate = lbr.load(filename, mono=True)
+    features = lbr.feature.melspectrogram(new_input, **MEL_KWARGS).T
+
+    if enforce_shape is not None:
+        if features.shape[0] < enforce_shape[0]:
+            delta_shape = (enforce_shape[0] - features.shape[0],
+                           enforce_shape[1])
+            features = np.append(features, np.zeros(delta_shape), axis=0)
+        elif features.shape[0] > enforce_shape[0]:
+            features = features[:enforce_shape[0], :]
+
+    features[features == 0] = 1e-6  # avoid log(0) below
+    return (np.log(features), float(new_input.shape[0]) / sample_rate)
+
+
+def read_and_decode(filename):  # read train.tfrecords
+    filename_queue = tf.train.string_input_producer([filename])  # create a queue from the file name
+
+    reader = tf.TFRecordReader()
+    _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+    features = tf.parse_single_example(serialized_example,
+                                       features={
+                                           'genres': tf.FixedLenFeature((1,), tf.int64),
+                                           'au_flattern': tf.VarLenFeature(tf.float32),
+                                       })  # parse the example into a label and the flattened audio features
+
+    labels = tf.cast(features['genres'], tf.int32)  # the label tensor
+    au_flattern = features['au_flattern']
+    au_flattern = tf.sparse_tensor_to_dense(au_flattern)
+    au = tf.reshape(au_flattern, get_default_shape('GTZAN/genres'))  # assumes the GTZAN data is present locally
+    au = tf.expand_dims(au, axis=2)
+    return au, labels
+
+
+def audio_augmentation(data, sr):
+    # Adding white noise
+    wn = np.random.randn(len(data))
+    data_wn = data + 0.005 * wn
+
+    # Shifting the pitch
+    steps = np.random.randint(-10, 10)
+    data_sf = lbr.effects.pitch_shift(data_wn, sr, n_steps=steps)
+
+    # Changing the volume
+    volume = np.random.uniform(.5, 2)
+    data_sf *= volume
+
+    return data_sf
+
+
+def load_track_with_aug(filename, enforce_shape=None):
+    new_input, sample_rate = lbr.load(filename, mono=True)
+    new_input_with_aug = audio_augmentation(new_input, sample_rate)
+    features = lbr.feature.melspectrogram(new_input_with_aug, **MEL_KWARGS).T
+
+    if enforce_shape is not None:
+        if features.shape[0] < enforce_shape[0]:
+            delta_shape = (enforce_shape[0] - features.shape[0],
+                           enforce_shape[1])
+            features = np.append(features, np.zeros(delta_shape), axis=0)
+        elif features.shape[0] > enforce_shape[0]:
+            features = features[:enforce_shape[0], :]
+
+    features[features == 0] = 1e-6  # avoid log(0) below
+    return (np.log(features), float(new_input.shape[0]) / sample_rate)
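+
+
+# Why the default shape is (647, 128): GTZAN clips are roughly 30 s at
+# 22050 Hz (about 661794 samples), and with hop_length = 1024 librosa yields
+# 1 + floor(661794 / 1024) = 647 frames of N_MELS = 128 mel bands each.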
+
+
+def freeze_graph(input_checkpoint, output_graph):
+    '''
+    :param input_checkpoint: path of the checkpoint to freeze.
+    :param output_graph: path where the frozen PB model is saved.
+    :return:
+    '''
+    # checkpoint = tf.train.get_checkpoint_state(model_folder)  # check that usable ckpt files exist in the directory
+    # input_checkpoint = checkpoint.model_checkpoint_path  # get the ckpt file path
+    from tensorflow.python.framework.graph_util import convert_variables_to_constants
+
+    # Name of the output node; it must be a node that exists in the original model.
+    output_node_names = "evaluate/ArgMax"
+    saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
+    graph = tf.get_default_graph()  # get the default graph
+    input_graph_def = graph.as_graph_def()  # return a serialized GraphDef representing the current graph
+
+    with tf.Session() as sess:
+        saver.restore(sess, input_checkpoint)  # restore the graph and its weights
+        output_graph_def = convert_variables_to_constants(  # persist the model by freezing variables into constants
+            sess=sess,
+            input_graph_def=input_graph_def,  # equivalent to sess.graph_def
+            output_node_names=output_node_names.split(","))  # separate multiple output nodes with commas
+
+        with tf.gfile.GFile(output_graph, "wb") as f:  # save the model
+            f.write(output_graph_def.SerializeToString())  # serialize the output
+        print("%d ops in the final graph." % len(output_graph_def.node))  # report how many op nodes the final graph has
+
+    # for op in graph.get_operations():
+    #     print(op.name, op.values())
+
+# freeze_graph('model/inception_resnet_v2_iteration_9599.ckpt', 'model/test.pb')
+
+
+def print_node():
+    from tensorflow.python import pywrap_tensorflow
+    import os
+    checkpoint_path = os.path.join('model/inception_resnet_v2_iteration_9599.ckpt')
+    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
+    var_to_shape_map = reader.get_variable_to_shape_map()
+    # b = [b for b in var_to_shape_map if b.startswith('generator/b')]
+    # b.sort()
+    # w = [w for w in var_to_shape_map if w.startswith('generator/W')]
+    # w.sort()
+    # v = [v for v in var_to_shape_map if v.startswith('generator/V')]
+    # v.sort()
+    #
+    # print(w)
+    # print(b)
+    # print(v)
+    for key in var_to_shape_map:
+        print('tensor_name: ', key)
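+
+
+# Loading the frozen graph back for inference (a sketch; the PB path mirrors
+# the commented freeze_graph example above and is an assumption):
+# with tf.gfile.GFile('model/test.pb', 'rb') as f:
+#     graph_def = tf.GraphDef()
+#     graph_def.ParseFromString(f.read())
+# tf.import_graph_def(graph_def, name='')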