From 1f4c82cd1f568583b84f4f839e6dd2e4ad598325 Mon Sep 17 00:00:00 2001
From: Arsenal Wang
Date: Sun, 23 Jun 2019 16:23:31 +0800
Subject: [PATCH] Initial commit

---
 create_data_to_train.py | 151 +++++++++++++++
 model.py                | 393 ++++++++++++++++++++++++++++++++++++++++
 read_tfrecords.py       | 119 ++++++++++++
 test.py                 |  69 +++++++
 test_create.py          |  37 ++++
 train.py                | 239 ++++++++++++++++++++++++
 utils.py                | 155 ++++++++++++++++
 7 files changed, 1163 insertions(+)
 create mode 100755 create_data_to_train.py
 create mode 100755 model.py
 create mode 100755 read_tfrecords.py
 create mode 100755 test.py
 create mode 100755 test_create.py
 create mode 100755 train.py
 create mode 100755 utils.py

diff --git a/create_data_to_train.py b/create_data_to_train.py
new file mode 100755
index 0000000..1009669
--- /dev/null
+++ b/create_data_to_train.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+import os
+import tensorflow as tf
+import argparse
+from utils import GENRES, load_track, get_default_shape, load_track_with_aug
+import numpy as np
+from tqdm import tqdm
+import logging
+
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+each_genres_num = 100
+genres_num = len(GENRES)
+
+
+def create_tfrecords_default(data_path, train_path, test_path, test_size=.2, aug=10):
+    writer_train = tf.python_io.TFRecordWriter(train_path)  # writes the training set as a tfrecord file
+    writer_test = tf.python_io.TFRecordWriter(test_path)  # writes the test set as a tfrecord file
+
+    test_size_per_genres = int(each_genres_num * test_size)
+
+    default_shape = get_default_shape(data_path)  # (647, 128)
+    total_data = genres_num * aug * each_genres_num
+    with tqdm(desc='creating===>>', total=total_data) as pbar:
+        for index, name in enumerate(GENRES):
+            audio_list = [os.path.join(data_path, name + '/' + audio)
+                          for audio in os.listdir(os.path.join(data_path, name))
+                          if audio.endswith('au')]
+
+            np.random.shuffle(audio_list)
+            train_data = audio_list[:-test_size_per_genres]
+            test_data = audio_list[-test_size_per_genres:]
+
+            # train data
+            for _, audio in enumerate(train_data):
+                au, _ = load_track(audio, default_shape)
+                au_flatten = au.flatten()
+
+                example = tf.train.Example(features=tf.train.Features(feature={
+                    "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                    'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten))
+                }))
+                writer_train.write(example.SerializeToString())  # serialize to a string
+
+                pbar.update(1)  # one tick per written example
+
+                # data augmentation
+                if aug > 1:
+                    for i in range(aug - 1):
+                        au_aug, _ = load_track_with_aug(audio, default_shape)
+                        au_flatten_aug = au_aug.flatten()
+
+                        example_aug = tf.train.Example(features=tf.train.Features(feature={
+                            "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                            'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten_aug))
+                        }))
+                        writer_train.write(example_aug.SerializeToString())  # serialize to a string
+
+                        pbar.update(1)
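+
+            # Each training clip above is written once as-is plus (aug - 1)
+            # augmented copies; load_track_with_aug adds white noise, a random
+            # pitch shift and a random volume change (see utils.py).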
+
+            # test data
+            for _, audio in enumerate(test_data):
+                au, _ = load_track(audio, default_shape)
+                au_flatten = au.flatten()
+
+                example = tf.train.Example(features=tf.train.Features(feature={
+                    "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                    'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten))
+                }))
+                writer_test.write(example.SerializeToString())  # serialize to a string
+
+                pbar.update(1)
+
+                # data augmentation
+                if aug > 1:
+                    for i in range(aug - 1):
+                        au_aug, _ = load_track_with_aug(audio, default_shape)
+                        au_flatten_aug = au_aug.flatten()
+
+                        example_aug = tf.train.Example(features=tf.train.Features(feature={
+                            "genres": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+                            'au_flattern': tf.train.Feature(float_list=tf.train.FloatList(value=au_flatten_aug))
+                        }))
+                        writer_test.write(example_aug.SerializeToString())  # serialize to a string
+
+                        pbar.update(1)
+
+    writer_train.close()
+    writer_test.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default='GTZAN/genres',
+        help='data_sets path.'
+    )
+
+    parser.add_argument(
+        '--train_path',
+        type=str,
+        default='tfrecords/train.tfrecords',
+        help='train tfrecords save path.'
+    )
+
+    parser.add_argument(
+        '--test_path',
+        type=str,
+        default='tfrecords/test.tfrecords',
+        help='test tfrecords save path.'
+    )
+
+    parser.add_argument(
+        '--test_size',
+        type=float,
+        default=.2,
+        help='Proportion of test data; a float in [0, 1].'
+    )
+
+    parser.add_argument(
+        '--aug',
+        type=int,
+        default=10,
+        help='Grow the data set to (arg) times its original size; 1 means no augmentation.'
+    )
+    args = parser.parse_args()
+    test_size = args.test_size
+    data_path = args.data_path
+    train_path = args.train_path
+    test_path = args.test_path
+    aug = max(args.aug, 1)
+
+    logger.info('\nThe following parameters will be applied for data creation:\n')
+    logger.info("data_sets path: {}".format(data_path))
+    logger.info("train tfrecords save path: {}".format(train_path))
+    logger.info("test tfrecords save path: {}".format(test_path))
+    logger.info("Proportion of test data: {}".format(test_size))
+    logger.info("the data set will grow to {} times its original size.".format(aug))
+
+    create_tfrecords_default(data_path, train_path, test_path, test_size, aug)
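+
+# Example invocation (a sketch; the paths follow the argparse defaults above):
+#   python create_data_to_train.py --data_path GTZAN/genres \
+#       --train_path tfrecords/train.tfrecords \
+#       --test_path tfrecords/test.tfrecords --test_size 0.2 --aug 10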
diff --git a/model.py b/model.py
new file mode 100755
index 0000000..c39e226
--- /dev/null
+++ b/model.py
@@ -0,0 +1,393 @@
+
+"""Contains the definition of the Inception Resnet V2 architecture.
+
+As described in http://arxiv.org/abs/1602.07261.
+
+  Inception-v4, Inception-ResNet and the Impact of Residual Connections
+  on Learning
+  Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import tensorflow as tf
+
+import tensorflow.contrib.slim as slim
+
+
+def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 35x35 resnet block."""
+  with tf.variable_scope(scope, 'Block35', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 32, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 32, 3, scope='Conv2d_0b_3x3')
+    with tf.variable_scope('Branch_2'):
+      tower_conv2_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv2_1 = slim.conv2d(tower_conv2_0, 48, 3, scope='Conv2d_0b_3x3')
+      tower_conv2_2 = slim.conv2d(tower_conv2_1, 64, 3, scope='Conv2d_0c_3x3')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_1, tower_conv2_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block17(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 17x17 resnet block."""
+  with tf.variable_scope(scope, 'Block17', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 128, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 160, [1, 7],
+                                  scope='Conv2d_0b_1x7')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 192, [7, 1],
+                                  scope='Conv2d_0c_7x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block8(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 8x8 resnet block."""
+  with tf.variable_scope(scope, 'Block8', [net], reuse=reuse):
+    with tf.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 192, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 224, [1, 3],
+                                  scope='Conv2d_0b_1x3')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 256, [3, 1],
+                                  scope='Conv2d_0c_3x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def inception_resnet_v2_base(inputs,
+                             final_endpoint='Conv2d_7b_1x1',
+                             output_stride=16,
+                             align_feature_maps=False,
+                             scope=None,
+                             activation_fn=tf.nn.relu):
+  """Inception model from http://arxiv.org/abs/1602.07261.
+
+  Constructs an Inception Resnet v2 network from inputs to the given final
+  endpoint. This method can construct the network up to the final inception
+  block Conv2d_7b_1x1.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
+      'Mixed_5b', 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
+    output_stride: A scalar that specifies the requested ratio of input to
+      output spatial resolution. Only supports 8 and 16.
+    align_feature_maps: When true, changes all the VALID paddings in the
+      network to SAME padding so that the feature maps are aligned.
+    scope: Optional variable_scope.
+    activation_fn: Activation function for block scopes.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a set of activations for external use, for example summaries
+      or losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+      or if the output_stride is not 8 or 16, or if the output_stride is 8 and
+      we request an end point after 'PreAuxLogits'.
+  """
+  if output_stride != 8 and output_stride != 16:
+    raise ValueError('output_stride must be 8 or 16.')
+
+  padding = 'SAME' if align_feature_maps else 'VALID'
+
+  end_points = {}
+
+  def add_and_check_final(name, net):
+    end_points[name] = net
+    return name == final_endpoint
+
+  with tf.variable_scope(scope, 'InceptionResnetV2', [inputs]):
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='SAME'):
+      # 149 x 149 x 32
+      net = slim.conv2d(inputs, 32, 3, stride=2, padding=padding,
+                        scope='Conv2d_1a_3x3')
+      if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points
+
+      # 147 x 147 x 32
+      net = slim.conv2d(net, 32, 3, padding=padding,
+                        scope='Conv2d_2a_3x3')
+      if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points
+      # 147 x 147 x 64
+      net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
+      if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points
+      # 73 x 73 x 64
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_3a_3x3')
+      if add_and_check_final('MaxPool_3a_3x3', net): return net, end_points
+      # 73 x 73 x 80
+      net = slim.conv2d(net, 80, 1, padding=padding,
+                        scope='Conv2d_3b_1x1')
+      if add_and_check_final('Conv2d_3b_1x1', net): return net, end_points
+      # 71 x 71 x 192
+      net = slim.conv2d(net, 192, 3, padding=padding,
+                        scope='Conv2d_4a_3x3')
+      if add_and_check_final('Conv2d_4a_3x3', net): return net, end_points
+      # 35 x 35 x 192
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_5a_3x3')
+      if add_and_check_final('MaxPool_5a_3x3', net): return net, end_points
+
+      # 35 x 35 x 320
+      with tf.variable_scope('Mixed_5b'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5,
+                                      scope='Conv2d_0b_5x5')
+        with tf.variable_scope('Branch_2'):
+          tower_conv2_0 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3,
+                                      scope='Conv2d_0c_3x3')
+        with tf.variable_scope('Branch_3'):
+          tower_pool = slim.avg_pool2d(net, 3, stride=1, padding='SAME',
+                                       scope='AvgPool_0a_3x3')
+          tower_pool_1 = slim.conv2d(tower_pool, 64, 1,
+                                     scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            [tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1], 3)
+
+      if add_and_check_final('Mixed_5b', net): return net, end_points
+      # TODO(alemi): Register intermediate endpoints
+      net = slim.repeat(net, 10, block35, scale=0.17,
+                        activation_fn=activation_fn)
+
+      # 17 x 17 x 1088 if output_stride == 8,
+      # 33 x 33 x 1088 if output_stride == 16
+      use_atrous = output_stride == 8
+
+      with tf.variable_scope('Mixed_6a'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 384, 3, stride=1 if use_atrous else 2,
+                                   padding=padding,
+                                   scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3,
+                                      stride=1 if use_atrous else 2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_2'):
+          tower_pool = slim.max_pool2d(net, 3, stride=1 if use_atrous else 2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_6a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      with slim.arg_scope([slim.conv2d], rate=2 if use_atrous else 1):
+        net = slim.repeat(net, 20, block17, scale=0.10,
+                          activation_fn=activation_fn)
+      if add_and_check_final('PreAuxLogits', net): return net, end_points
+
+      if output_stride == 8:
+        # TODO(gpapan): Properly support output_stride for the rest of the net.
+        raise ValueError('output_stride==8 is only supported up to the '
+                         'PreAuxLogits end_point for now.')
+
+      # 8 x 8 x 2080
+      with tf.variable_scope('Mixed_7a'):
+        with tf.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
+                                     padding=padding,
+                                     scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_1'):
+          tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_2'):
+          tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.variable_scope('Branch_3'):
+          tower_pool = slim.max_pool2d(net, 3, stride=2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat(
+            [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_7a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      net = slim.repeat(net, 9, block8, scale=0.20, activation_fn=activation_fn)
+      net = block8(net, activation_fn=None)
+
+      # 8 x 8 x 1536
+      net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
+      if add_and_check_final('Conv2d_7b_1x1', net): return net, end_points
+
+    raise ValueError('final_endpoint (%s) not recognized' % final_endpoint)
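+
+
+# Minimal usage sketches (comments only; `inputs` is any 4-D float tensor):
+#   1) just the trunk, up to an endpoint from the list in the docstring above:
+#      net, end_points = inception_resnet_v2_base(inputs,
+#                                                 final_endpoint='PreAuxLogits')
+#   2) the full classifier wrapped in its arg_scope (defined at the end of
+#      this file; note that train.py in this repo calls it directly instead):
+#      with slim.arg_scope(inception_resnet_v2_arg_scope()):
+#          logits, end_points = inception_resnet_v2(inputs, num_classes=10)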
+ """ + end_points = {} + + with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + + net, end_points = inception_resnet_v2_base(inputs, scope=scope, + activation_fn=activation_fn) + + if create_aux_logits and num_classes: + with tf.variable_scope('AuxLogits'): + aux = end_points['PreAuxLogits'] + aux = slim.avg_pool2d(aux, 5, stride=3, padding='VALID', + scope='Conv2d_1a_3x3') + aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1') + aux = slim.conv2d(aux, 768, aux.get_shape()[1:3], + padding='VALID', scope='Conv2d_2a_5x5') + aux = slim.flatten(aux) + aux = slim.fully_connected(aux, num_classes, activation_fn=None, + scope='Logits') + end_points['AuxLogits'] = aux + + with tf.variable_scope('Logits'): + # TODO(sguada,arnoegw): Consider adding a parameter global_pool which + # can be set to False to disable pooling here (as in resnet_*()). + kernel_size = net.get_shape()[1:3] + if kernel_size.is_fully_defined(): + net = slim.avg_pool2d(net, kernel_size, padding='VALID', + scope='AvgPool_1a_8x8') + else: + net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool') + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.flatten(net) + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='Dropout') + end_points['PreLogitsFlatten'] = net + logits = slim.fully_connected(net, num_classes, activation_fn=None, + scope='Logits') + end_points['Logits'] = logits + end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions') + + return logits, end_points +inception_resnet_v2.default_image_size = 299 + + +def inception_resnet_v2_arg_scope( + weight_decay=0.00004, + batch_norm_decay=0.9997, + batch_norm_epsilon=0.001, + activation_fn=tf.nn.relu, + batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS, + batch_norm_scale=False): + """Returns the scope with the default parameters for inception_resnet_v2. + + Args: + weight_decay: the weight decay for weights variables. + batch_norm_decay: decay for the moving average of batch_norm momentums. + batch_norm_epsilon: small float added to variance to avoid dividing by zero. + activation_fn: Activation function for conv2d. + batch_norm_updates_collections: Collection for the update ops for + batch norm. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + + Returns: + a arg_scope with the parameters needed for inception_resnet_v2. + """ + # Set weight_decay for weights in conv2d and fully_connected layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_regularizer=slim.l2_regularizer(weight_decay)): + + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'updates_collections': batch_norm_updates_collections, + 'fused': None, # Use fused batch norm if possible. + 'scale': batch_norm_scale, + } + # Set activation_fn and parameters for batch_norm. 
+    with slim.arg_scope([slim.conv2d], activation_fn=activation_fn,
+                        normalizer_fn=slim.batch_norm,
+                        normalizer_params=batch_norm_params) as scope:
+      return scope
\ No newline at end of file
diff --git a/read_tfrecords.py b/read_tfrecords.py
new file mode 100755
index 0000000..2875977
--- /dev/null
+++ b/read_tfrecords.py
@@ -0,0 +1,119 @@
+import tensorflow as tf
+import numpy as np
+import cv2 as cv
+from utils import get_record_dataset
+slim = tf.contrib.slim
+
+
+# for serialized_example in tf.python_io.tf_record_iterator("train.tfrecords"):
+#     example = tf.train.Example()
+#     example.ParseFromString(serialized_example)
+#
+#     image = example.features.feature['img_raw'].float_list.value
+#     label = example.features.feature['label'].int64_list.value
+#     # some preprocessing could be done here
+#     print(image, label)
+
+# def read_and_decode(filename):
+#     # build a queue from the file names
+#     filename_queue = tf.train.string_input_producer([filename])
+#
+#     reader = tf.TFRecordReader()
+#     _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+#     features = tf.parse_single_example(serialized_example,
+#                                        features={
+#                                            'label': tf.FixedLenFeature([], tf.int64),
+#                                            'img_raw': tf.VarLenFeature(tf.float32),
+#                                        })
+#
+#     # img = tf.decode_raw(features['img_raw'], tf.float32)
+#     img = features['img_raw']
+#     img = tf.sparse_tensor_to_dense(img)
+#     img = tf.reshape(img, [224, 224, 3])
+#     # img = tf.cast(tf.reshape(img, [224, 224, 3]) * 255, tf.uint8)
+#     img = tf.cast(img * 255, tf.uint8)
+#     label = tf.cast(features['label'], tf.int32)
+#
+#     return img, label
+#
+#
+# img, label = read_and_decode("train.tfrecords")
+#
+# # shuffle_batch randomly shuffles the input
+# img_batch, label_batch = tf.train.shuffle_batch([img, label],
+#                                                 batch_size=1, capacity=10,
+#                                                 min_after_dequeue=2)
+# init = tf.initialize_all_variables()
+#
+# with tf.Session() as sess:
+#     sess.run(init)
+#     threads = tf.train.start_queue_runners(sess=sess)
+#     for i in range(2):
+#         val, l = sess.run([img_batch, label_batch])
+#         val = np.reshape(val, (224, 224, 3))
+#         cv.imshow('1', val)
+#         cv.waitKey()
+#         print(val.shape, l)
+
+
+# def read_and_decode(filename):
+#     # build a queue from the file names
+#     filename_queue = tf.train.string_input_producer([filename])
+#
+#     reader = tf.TFRecordReader()
+#     _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+#     features = tf.parse_single_example(serialized_example,
+#                                        features={
+#                                            'label': tf.FixedLenFeature([], tf.int64),
+#                                            'img_raw': tf.VarLenFeature(tf.float32),
+#                                        })
+#
+#     # img = tf.decode_raw(features['img_raw'], tf.float32)
+#     img = features['img_raw']
+#     img = tf.sparse_tensor_to_dense(img)
+#     img = tf.reshape(img, [647, 128])
+#     # img = tf.cast(tf.reshape(img, [224, 224, 3]) * 255, tf.uint8)
+#     # img = tf.cast(img * 255, tf.uint8)
+#     label = tf.cast(features['label'], tf.int32)
+#
+#     return img, label
+#
+#
+# img, label = read_and_decode("train.tfrecords")
+#
+# # shuffle_batch randomly shuffles the input
+# img_batch, label_batch = tf.train.shuffle_batch([img, label],
+#                                                 batch_size=1, capacity=10,
+#                                                 min_after_dequeue=2)
+# init = tf.initialize_all_variables()
+#
+# with tf.Session() as sess:
+#     sess.run(init)
+#     threads = tf.train.start_queue_runners(sess=sess)
+#     for i in range(2):
+#         val, l = sess.run([img_batch, label_batch])
+#         print(val, l)
+
+
+dataset = get_record_dataset('tfrecords/data.tfrecords', num_samples=1000,
+                             num_classes=10)
+data_provider = slim.dataset_data_provider.DatasetDataProvider(dataset)
+label, image = data_provider.get(['genres', 'au_flattern'])
+
+inputs, labels = tf.train.batch([image, label],
+                                batch_size=1,
+                                # capacity=5*FLAGS.batch_size,
+                                allow_smaller_final_batch=True)
+
+# print the static and dynamic shapes of the current tensors, to compare
+# with the other reading approach
+
+init = tf.global_variables_initializer()
+
+with tf.Session() as sess:
+    sess.run(init)
+    threads = tf.train.start_queue_runners(sess=sess)
+    for i in range(2):
+        val, l = sess.run([inputs, labels])
+        print(val, l)
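+
+# A minimal sketch of the static-vs-dynamic shape comparison mentioned above:
+# print(inputs.get_shape())            # static shape, known at graph-build time
+# print(sess.run(tf.shape(inputs)))    # dynamic shape, evaluated at run time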
diff --git a/test.py b/test.py
new file mode 100755
index 0000000..8075ebe
--- /dev/null
+++ b/test.py
@@ -0,0 +1,69 @@
+from model import inception_resnet_v2
+import tensorflow as tf
+from utils import read_and_decode
+import numpy as np
+import logging
+from tqdm import tqdm
+
+
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+model_path = './models/'
+test_data = './tfrecords/test.tfrecords'
+
+tf.reset_default_graph()
+
+au_test, label_test = read_and_decode(test_data)
+
+au_test_batch, label_test_batch = tf.train.shuffle_batch([au_test, label_test],
+                                                         batch_size=1,
+                                                         num_threads=16,
+                                                         capacity=800 + 3,
+                                                         min_after_dequeue=800,
+                                                         )
+
+input_ = tf.placeholder(tf.float32, [None, 647, 128, 1])
+
+logits_, _ = inception_resnet_v2(input_, is_training=False, dropout_keep_prob=1, create_aux_logits=False)
+
+with tf.Session() as sess:
+
+    saver = tf.train.Saver()
+
+    saver.restore(sess, "models/inception_resnet_v2_iteration_9999.ckpt")
+
+    coord = tf.train.Coordinator()
+
+    threads = tf.train.start_queue_runners(coord=coord)
+
+    N = 1000
+    top1 = 0
+    top3 = 0
+    for i in tqdm(range(N)):
+        data, labels = sess.run([au_test_batch, label_test_batch])
+
+        logits = sess.run(logits_, feed_dict={input_: data}).ravel()
+
+        max_index = np.argsort(-logits)  # class indices sorted by descending score
+
+        predict = np.argmax(logits)
+        if predict == int(labels):
+            top1 += 1
+        if int(labels) in max_index[:3]:
+            top3 += 1
+
+    logger.info("top1: {:.2f}%".format(top1 / N * 100))
+    logger.info("top3: {:.2f}%".format(top3 / N * 100))
+
+    coord.request_stop()
+    coord.join(threads)
diff --git a/test_create.py b/test_create.py
new file mode 100755
index 0000000..3efa3d8
--- /dev/null
+++ b/test_create.py
@@ -0,0 +1,37 @@
+import os
+import tensorflow as tf
+import cv2 as cv
+from utils import load_track
+
+# writer = tf.python_io.TFRecordWriter("train.tfrecords")
+# path = '/Users/wangdong/Documents/DPED-master/dped/dped/blackberry/test_data/full_size_test_images/'
+# for index in range(29):
+#     img_name = path + '{}.jpg'.format(index)
+#
+#     img = cv.imread(img_name)
+#     img = cv.resize(img, (224, 224))
+#     img = img.flatten()
+#
+#     img = img / 256
+#     print(img.shape)
+#     # img_raw = img.tobytes()  # convert the image to raw bytes
+#     example = tf.train.Example(features=tf.train.Features(feature={
+#         "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
+#         'img_raw': tf.train.Feature(float_list=tf.train.FloatList(value=img))
+#     }))
+#     writer.write(example.SerializeToString())  # serialize to a string
+# writer.close()
+
+writer = tf.python_io.TFRecordWriter("train.tfrecords")
+tmp_features, _ = load_track(os.path.join('GTZAN/genres', 'blues/blues.00000.au'))
+print(tmp_features.shape)
+data = tmp_features.flatten()
+
+print(data)
+# img_raw = img.tobytes()  # convert the image to raw bytes
+example = tf.train.Example(features=tf.train.Features(feature={
+    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[12])),
+    'img_raw': tf.train.Feature(float_list=tf.train.FloatList(value=data))
+}))
+writer.write(example.SerializeToString())  # serialize to a string
+writer.close()
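+
+# To sanity-check the record just written, it can be read back (a sketch):
+# for serialized in tf.python_io.tf_record_iterator("train.tfrecords"):
+#     example = tf.train.Example.FromString(serialized)
+#     print(example.features.feature['label'].int64_list.value)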
diff --git a/train.py b/train.py
new file mode 100755
index 0000000..10d0840
--- /dev/null
+++ b/train.py
@@ -0,0 +1,239 @@
+import tensorflow as tf
+from utils import read_and_decode
+from model import inception_resnet_v2
+import argparse
+from tqdm import tqdm
+import os
+import logging
+
+slim = tf.contrib.slim
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s - %(funcName)s: %(lineno)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def train(train_data_,
+          decay_rate_,
+          global_steps_,
+          decay_steps_,
+          batch_size_,
+          learning_rate_,
+          eval_step_,
+          model_path_,
+          summary_path_,
+          load_model_):
+
+    if not os.path.exists(model_path_):
+        os.mkdir(model_path_)
+
+    if not os.path.exists(summary_path_):
+        os.mkdir(summary_path_)
+
+    if not os.path.exists('log'):  # the training log below is written to log/train_log.log
+        os.mkdir('log')
+
+    graph = tf.Graph()
+
+    with graph.as_default():
+
+        au_train, label_train = read_and_decode(train_data_)
+
+        min_fraction_of_examples_in_queue = 0.4
+        test_size = .2
+        aug = 10
+        total_examples = 1000 * aug
+        min_queue_examples_train = int(total_examples * (1 - test_size) * min_fraction_of_examples_in_queue)
+
+        au_train_batch, label_train_batch = tf.train.shuffle_batch([au_train, label_train],
+                                                                   batch_size=batch_size_,
+                                                                   num_threads=16,
+                                                                   capacity=min_queue_examples_train + 3 * batch_size_,
+                                                                   min_after_dequeue=min_queue_examples_train,
+                                                                   )
+
+        label_train_batch_ = tf.one_hot(tf.squeeze(label_train_batch), 10, 1, 0)
+
+        logits, end_points = inception_resnet_v2(au_train_batch)
+
+        if 'AuxLogits' in end_points:
+            slim.losses.softmax_cross_entropy(
+                end_points['AuxLogits'], label_train_batch_, weights=0.4, scope='aux_loss')
+
+        slim.losses.softmax_cross_entropy(
+            logits, label_train_batch_, weights=1.0, scope='base_loss')
+
+        total_loss = slim.losses.get_total_loss()
+
+        tf.summary.scalar('loss', total_loss)
+
+        global_ = tf.Variable(tf.constant(0), trainable=False)
+
+        lr = tf.train.exponential_decay(learning_rate_, global_, decay_steps_, decay_rate_, staircase=True)
+
+        tf.summary.scalar('lr', lr)
+
+        optimizer = tf.train.AdamOptimizer(lr).minimize(total_loss)
+
+        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(label_train_batch_, 1))
+
+        accuracy_ = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+        tf.summary.scalar('accuracy', accuracy_)
+
+        saver = tf.train.Saver(max_to_keep=5)
+
+        merged = tf.summary.merge_all()
+
+        writer = tf.summary.FileWriter(summary_path_, graph=graph)
+
+        init = tf.global_variables_initializer()
+
+        with tf.Session() as sess, open('log/train_log.log', 'w') as log:
+
+            if load_model_:
+                checkpoint = tf.train.get_checkpoint_state(model_path_)
+
+                meta_graph_path = checkpoint.model_checkpoint_path + ".meta"
+
+                restore = tf.train.import_meta_graph(meta_graph_path)
+
+                restore.restore(sess, tf.train.latest_checkpoint(model_path_))
+
+                step = int(meta_graph_path.split("_")[-1].split(".")[0])
+            else:
+                sess.run(init)
+                step = 0
+
+            coord = tf.train.Coordinator()
+
+            threads = tf.train.start_queue_runners(coord=coord)
+            try:
+                for i in tqdm(range(step, global_steps_)):
+
+                    acc, loss, train_summary, _ = sess.run([accuracy_, total_loss, merged, optimizer],
+                                                           feed_dict={global_: i})
+                    print("steps:{} train loss :{:.2f}, accuracy: {:.2f}".format(i, loss, acc), file=log)
+                    log.flush()
+
+                    if (i + 1) % eval_step_ == 0:
+                        saver.save(sess, '{}/inception_resnet_v2_iteration_{}.ckpt'.format(model_path_, i))
+                        writer.add_summary(train_summary, i)
+
+            except KeyboardInterrupt:
+                logger.exception('Interrupted')
+                coord.request_stop()
+            except Exception as e:
+                logger.exception(e)
+                coord.request_stop(e)
+            finally:
+                logger.info("Model saved in file: %s" % model_path_)
+                # When done, ask the threads to stop.
+                coord.request_stop()
+                coord.join(threads)
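+
+# Note on the schedule above: tf.train.exponential_decay with staircase=True
+# yields lr = learning_rate * decay_rate ** floor(global_step / decay_steps);
+# with the defaults (1e-4, 0.9, 100), the rate after 1000 steps is
+# 1e-4 * 0.9 ** 10, roughly 3.5e-5.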
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--train_data',
+        type=str,
+        default='./tfrecords/train.tfrecords',
+        help='train_data path.'
+    )
+
+    parser.add_argument(
+        '--decay_rate',
+        type=float,
+        default=0.9,
+        help='learning rate decay rate.'
+    )
+
+    parser.add_argument(
+        '--global_steps',
+        type=int,
+        default=10000,
+        help='global steps.'
+    )
+
+    parser.add_argument(
+        '--decay_steps',
+        type=int,
+        default=100,
+        help='learning rate decay steps.'
+    )
+
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=1e-4,
+        help='learning rate.'
+    )
+    parser.add_argument(
+        '--eval_step',
+        type=int,
+        default=100,
+        help='evaluation steps.'
+    )
+
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help='batch size.'
+    )
+
+    parser.add_argument(
+        '--model_path',
+        type=str,
+        default='models/',
+        help='tensorflow model path.'
+    )
+
+    parser.add_argument(
+        '--summary_path',
+        type=str,
+        default='summary/',
+        help='tensorflow summary path.'
+    )
+
+    parser.add_argument(
+        '--load_model',
+        action='store_true',  # type=bool would parse any non-empty string, even "False", as True
+        help='whether you wish to continue training from the latest checkpoint.'
+    )
+
+    args = parser.parse_args()
+
+    train_data = args.train_data
+    decay_rate = args.decay_rate
+    global_steps = args.global_steps  # total number of training iterations
+    decay_steps = args.decay_steps  # steps between learning-rate decays
+    learning_rate = args.learning_rate
+    eval_step = args.eval_step
+    summary_path = args.summary_path
+    model_path = args.model_path
+    load_model = args.load_model
+    batch_size = args.batch_size
+
+    logger.info('\nThe following parameters will be applied for training:\n')
+    logger.info('train_data path: {}.'.format(train_data))
+    logger.info("learning rate decay rate: {}".format(decay_rate))
+    logger.info("global steps: {}".format(global_steps))
+    logger.info("learning rate decay steps: {}.".format(decay_steps))
+    logger.info('batch size: {}.'.format(batch_size))
+    logger.info('learning rate: {}.'.format(learning_rate))
+    logger.info('evaluation steps: {}.'.format(eval_step))
+    logger.info('tensorflow model path: {}.'.format(model_path))
+    logger.info('tensorflow summary path: {}.'.format(summary_path))
+    logger.info('whether you wish to continue training: {}.'.format(load_model))
+
+    train(train_data,
+          decay_rate,
+          global_steps,
+          decay_steps,
+          batch_size,
+          learning_rate,
+          eval_step,
+          model_path,
+          summary_path,
+          load_model)
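+
+# Example invocation (a sketch using the defaults above):
+#   python train.py --batch_size 50 --global_steps 10000 --learning_rate 1e-4
+# Training curves (loss, lr, accuracy) can then be inspected with:
+#   tensorboard --logdir summary/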
diff --git a/utils.py b/utils.py
new file mode 100755
index 0000000..dbb0304
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,155 @@
+import numpy as np
+import librosa as lbr
+import tensorflow as tf
+import os
+
+slim = tf.contrib.slim
+
+GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
+          'pop', 'reggae', 'rock']
+
+WINDOW_SIZE = 2048
+WINDOW_STRIDE = WINDOW_SIZE // 2
+N_MELS = 128
+MEL_KWARGS = {
+    'n_fft': WINDOW_SIZE,
+    'hop_length': WINDOW_STRIDE,
+    'n_mels': N_MELS
+}
+
+
+def get_default_shape(dataset_path):
+    tmp_features, _ = load_track(os.path.join(dataset_path,
+                                              'blues/blues.00000.au'))
+    return tmp_features.shape
+
+
+def load_track(filename, enforce_shape=None):
+    new_input, sample_rate = lbr.load(filename, mono=True)
+    features = lbr.feature.melspectrogram(new_input, **MEL_KWARGS).T
+
+    if enforce_shape is not None:
+        if features.shape[0] < enforce_shape[0]:
+            delta_shape = (enforce_shape[0] - features.shape[0],
+                           enforce_shape[1])
+            features = np.append(features, np.zeros(delta_shape), axis=0)
+        elif features.shape[0] > enforce_shape[0]:
+            features = features[:enforce_shape[0], :]
+
+    features[features == 0] = 1e-6  # avoid log(0) below
+    return (np.log(features), float(new_input.shape[0]) / sample_rate)
+
+
+def read_and_decode(filename):  # read train.tfrecords
+    filename_queue = tf.train.string_input_producer([filename])  # create a queue from the file name
+
+    reader = tf.TFRecordReader()
+    _, serialized_example = reader.read(filename_queue)  # returns the file name and a serialized example
+    features = tf.parse_single_example(serialized_example,
+                                       features={
+                                           'genres': tf.FixedLenFeature((1,), tf.int64),
+                                           'au_flattern': tf.VarLenFeature(tf.float32),
+                                       })  # parse the example into a label and the flattened audio features
+
+    labels = tf.cast(features['genres'], tf.int32)  # the label tensor
+    au_flattern = features['au_flattern']
+    au_flattern = tf.sparse_tensor_to_dense(au_flattern)
+    au = tf.reshape(au_flattern, get_default_shape('GTZAN/genres'))  # assumes the GTZAN data is present locally
+    au = tf.expand_dims(au, axis=2)
+    return au, labels
+
+
+def audio_augmentation(data, sr):
+    # Adding white noise
+    wn = np.random.randn(len(data))
+    data_wn = data + 0.005 * wn
+
+    # Shifting the pitch
+    steps = np.random.randint(-10, 10)
+    data_sf = lbr.effects.pitch_shift(data_wn, sr, n_steps=steps)
+
+    # Changing the volume
+    volume = np.random.uniform(.5, 2)
+    data_sf *= volume
+
+    return data_sf
+
+
+def load_track_with_aug(filename, enforce_shape=None):
+    new_input, sample_rate = lbr.load(filename, mono=True)
+    new_input_with_aug = audio_augmentation(new_input, sample_rate)
+    features = lbr.feature.melspectrogram(new_input_with_aug, **MEL_KWARGS).T
+
+    if enforce_shape is not None:
+        if features.shape[0] < enforce_shape[0]:
+            delta_shape = (enforce_shape[0] - features.shape[0],
+                           enforce_shape[1])
+            features = np.append(features, np.zeros(delta_shape), axis=0)
+        elif features.shape[0] > enforce_shape[0]:
+            features = features[:enforce_shape[0], :]
+
+    features[features == 0] = 1e-6  # avoid log(0) below
+    return (np.log(features), float(new_input.shape[0]) / sample_rate)
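+
+
+# Why the default shape is (647, 128): GTZAN clips are roughly 30 s at
+# 22050 Hz (about 661794 samples), and with hop_length = 1024 librosa yields
+# 1 + floor(661794 / 1024) = 647 frames of N_MELS = 128 mel bands each.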
+
+
+def freeze_graph(input_checkpoint, output_graph):
+    '''
+    :param input_checkpoint: path of the checkpoint to freeze.
+    :param output_graph: path where the frozen PB model is saved.
+    :return:
+    '''
+    # checkpoint = tf.train.get_checkpoint_state(model_folder)  # check that usable ckpt files exist in the directory
+    # input_checkpoint = checkpoint.model_checkpoint_path  # get the ckpt file path
+    from tensorflow.python.framework.graph_util import convert_variables_to_constants
+
+    # Name of the output node; it must be a node that exists in the original model.
+    output_node_names = "evaluate/ArgMax"
+    saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
+    graph = tf.get_default_graph()  # get the default graph
+    input_graph_def = graph.as_graph_def()  # return a serialized GraphDef representing the current graph
+
+    with tf.Session() as sess:
+        saver.restore(sess, input_checkpoint)  # restore the graph and its weights
+        output_graph_def = convert_variables_to_constants(  # persist the model by freezing variables into constants
+            sess=sess,
+            input_graph_def=input_graph_def,  # equivalent to sess.graph_def
+            output_node_names=output_node_names.split(","))  # separate multiple output nodes with commas
+
+        with tf.gfile.GFile(output_graph, "wb") as f:  # save the model
+            f.write(output_graph_def.SerializeToString())  # serialize the output
+        print("%d ops in the final graph." % len(output_graph_def.node))  # report how many op nodes the final graph has
+
+    # for op in graph.get_operations():
+    #     print(op.name, op.values())
+
+# freeze_graph('model/inception_resnet_v2_iteration_9599.ckpt', 'model/test.pb')
+
+
+def print_node():
+    from tensorflow.python import pywrap_tensorflow
+    import os
+    checkpoint_path = os.path.join('model/inception_resnet_v2_iteration_9599.ckpt')
+    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
+    var_to_shape_map = reader.get_variable_to_shape_map()
+    # b = [b for b in var_to_shape_map if b.startswith('generator/b')]
+    # b.sort()
+    # w = [w for w in var_to_shape_map if w.startswith('generator/W')]
+    # w.sort()
+    # v = [v for v in var_to_shape_map if v.startswith('generator/V')]
+    # v.sort()
+    #
+    # print(w)
+    # print(b)
+    # print(v)
+    for key in var_to_shape_map:
+        print('tensor_name: ', key)
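+
+
+# Loading the frozen graph back for inference (a sketch; the PB path mirrors
+# the commented freeze_graph example above and is an assumption):
+# with tf.gfile.GFile('model/test.pb', 'rb') as f:
+#     graph_def = tf.GraphDef()
+#     graph_def.ParseFromString(f.read())
+# tf.import_graph_def(graph_def, name='')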