diff --git a/tutorials/README.md b/tutorials/README.md deleted file mode 100644 index 7dd5f46c8ef..00000000000 --- a/tutorials/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Tutorial Models - -This folder contains models referenced to from the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/). diff --git a/tutorials/__init__.py b/tutorials/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tutorials/embedding/README.md b/tutorials/embedding/README.md deleted file mode 100644 index cb84f532f5c..00000000000 --- a/tutorials/embedding/README.md +++ /dev/null @@ -1,52 +0,0 @@ -This directory contains models for unsupervised training of word embeddings -using the model described in: - -(Mikolov, et. al.) [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/abs/1301.3781), -ICLR 2013. - -Detailed instructions on how to get started and use them are available in the -tutorials. Brief instructions are below. - -* [Word2Vec Tutorial](http://tensorflow.org/tutorials/word2vec) - -Assuming you have cloned the git repository, navigate into this directory. 
To download the example text and evaluation data: - -```shell -curl http://mattmahoney.net/dc/text8.zip > text8.zip -unzip text8.zip -curl https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip > source-archive.zip -unzip -p source-archive.zip word2vec/trunk/questions-words.txt > questions-words.txt -rm text8.zip source-archive.zip -``` - -You will need to compile the ops as follows (See -[Adding a New Op to TensorFlow](https://www.tensorflow.org/how_tos/adding_an_op/#building_the_op_library) -for more details).: - -```shell -TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) -TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) -g++ -std=c++11 -shared word2vec_ops.cc word2vec_kernels.cc -o word2vec_ops.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2 -D_GLIBCXX_USE_CXX11_ABI=0 -``` - -On Mac, add `-undefined dynamic_lookup` to the g++ command. The flag `-D_GLIBCXX_USE_CXX11_ABI=0` is included to support newer versions of gcc. However, if you compiled TensorFlow from source using gcc 5 or later, you may need to exclude the flag. Specifically, if you get an error similar to the following: `word2vec_ops.so: undefined symbol: _ZN10tensorflow7strings6StrCatERKNS0_8AlphaNumES3_S3_S3_` then you likely need to exclude the flag. - -Once you've successfully compiled the ops, run the model as follows: - -```shell -python word2vec_optimized.py \ - --train_data=text8 \ - --eval_data=questions-words.txt \ - --save_path=/tmp/ -``` - -Here is a short overview of what is in this directory. - -File | What's in it? ---- | --- -`word2vec.py` | A version of word2vec implemented using TensorFlow ops and minibatching. -`word2vec_test.py` | Integration test for word2vec. -`word2vec_optimized.py` | A version of word2vec implemented using C ops that does no minibatching. -`word2vec_optimized_test.py` | Integration test for word2vec_optimized. 
-`word2vec_kernels.cc` | Kernels for the custom input and training ops. -`word2vec_ops.cc` | The declarations of the custom ops. diff --git a/tutorials/embedding/__init__.py b/tutorials/embedding/__init__.py deleted file mode 100644 index ea3259cd34e..00000000000 --- a/tutorials/embedding/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Import generated word2vec optimized ops into embedding package.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function diff --git a/tutorials/embedding/word2vec.py b/tutorials/embedding/word2vec.py deleted file mode 100644 index 72158647389..00000000000 --- a/tutorials/embedding/word2vec.py +++ /dev/null @@ -1,534 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Multi-threaded word2vec mini-batched skip-gram model. - -Trains the model described in: -(Mikolov, et. al.) Efficient Estimation of Word Representations in Vector Space -ICLR 2013. -http://arxiv.org/abs/1301.3781 -This model does traditional minibatching. - -The key ops used are: -* placeholder for feeding in tensors for each example. -* embedding_lookup for fetching rows from the embedding matrix. -* sigmoid_cross_entropy_with_logits to calculate the loss. -* GradientDescentOptimizer for optimizing the loss. -* skipgram custom op that does input processing. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import threading -import time - -from six.moves import xrange # pylint: disable=redefined-builtin - -import numpy as np -import tensorflow as tf - -word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so')) - -flags = tf.app.flags - -flags.DEFINE_string("save_path", None, "Directory to write the model and " - "training summaries.") -flags.DEFINE_string("train_data", None, "Training text file. " - "E.g., unzipped file http://mattmahoney.net/dc/text8.zip.") -flags.DEFINE_string( - "eval_data", None, "File consisting of analogies of four tokens." - "embedding 2 - embedding 1 + embedding 3 should be close " - "to embedding 4." - "See README.md for how to get 'questions-words.txt'.") -flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.") -flags.DEFINE_integer( - "epochs_to_train", 15, - "Number of epochs to train. 
Each epoch processes the training data once " - "completely.") -flags.DEFINE_float("learning_rate", 0.2, "Initial learning rate.") -flags.DEFINE_integer("num_neg_samples", 100, - "Negative samples per training example.") -flags.DEFINE_integer("batch_size", 16, - "Number of training examples processed per step " - "(size of a minibatch).") -flags.DEFINE_integer("concurrent_steps", 12, - "The number of concurrent training steps.") -flags.DEFINE_integer("window_size", 5, - "The number of words to predict to the left and right " - "of the target word.") -flags.DEFINE_integer("min_count", 5, - "The minimum number of word occurrences for it to be " - "included in the vocabulary.") -flags.DEFINE_float("subsample", 1e-3, - "Subsample threshold for word occurrence. Words that appear " - "with higher frequency will be randomly down-sampled. Set " - "to 0 to disable.") -flags.DEFINE_boolean( - "interactive", False, - "If true, enters an IPython interactive session to play with the trained " - "model. E.g., try model.analogy(b'france', b'paris', b'russia') and " - "model.nearby([b'proton', b'elephant', b'maxwell'])") -flags.DEFINE_integer("statistics_interval", 5, - "Print statistics every n seconds.") -flags.DEFINE_integer("summary_interval", 5, - "Save training summary to file every n seconds (rounded " - "up to statistics interval).") -flags.DEFINE_integer("checkpoint_interval", 600, - "Checkpoint the model (i.e. save the parameters) every n " - "seconds (rounded up to statistics interval).") - -FLAGS = flags.FLAGS - - -class Options(object): - """Options used by our word2vec model.""" - - def __init__(self): - # Model options. - - # Embedding dimension. - self.emb_dim = FLAGS.embedding_size - - # Training options. - # The training text file. - self.train_data = FLAGS.train_data - - # Number of negative samples per example. - self.num_samples = FLAGS.num_neg_samples - - # The initial learning rate. - self.learning_rate = FLAGS.learning_rate - - # Number of epochs to train. 
After these many epochs, the learning - # rate decays linearly to zero and the training stops. - self.epochs_to_train = FLAGS.epochs_to_train - - # Concurrent training steps. - self.concurrent_steps = FLAGS.concurrent_steps - - # Number of examples for one training step. - self.batch_size = FLAGS.batch_size - - # The number of words to predict to the left and right of the target word. - self.window_size = FLAGS.window_size - - # The minimum number of word occurrences for it to be included in the - # vocabulary. - self.min_count = FLAGS.min_count - - # Subsampling threshold for word occurrence. - self.subsample = FLAGS.subsample - - # How often to print statistics. - self.statistics_interval = FLAGS.statistics_interval - - # How often to write to the summary file (rounds up to the nearest - # statistics_interval). - self.summary_interval = FLAGS.summary_interval - - # How often to write checkpoints (rounds up to the nearest statistics - # interval). - self.checkpoint_interval = FLAGS.checkpoint_interval - - # Where to write out summaries. - self.save_path = FLAGS.save_path - if not os.path.exists(self.save_path): - os.makedirs(self.save_path) - - # Eval options. - # The text file for eval. - self.eval_data = FLAGS.eval_data - - -class Word2Vec(object): - """Word2Vec model (Skipgram).""" - - def __init__(self, options, session): - self._options = options - self._session = session - self._word2id = {} - self._id2word = [] - self.build_graph() - self.build_eval_graph() - self.save_vocab() - - def read_analogies(self): - """Reads through the analogy question file. - - Returns: - questions: a [n, 4] numpy array containing the analogy question's - word ids. - questions_skipped: questions skipped due to unknown words. - """ - questions = [] - questions_skipped = 0 - with open(self._options.eval_data, "rb") as analogy_f: - for line in analogy_f: - if line.startswith(b":"): # Skip comments. 
- continue - words = line.strip().lower().split(b" ") - ids = [self._word2id.get(w.strip()) for w in words] - if None in ids or len(ids) != 4: - questions_skipped += 1 - else: - questions.append(np.array(ids)) - print("Eval analogy file: ", self._options.eval_data) - print("Questions: ", len(questions)) - print("Skipped: ", questions_skipped) - self._analogy_questions = np.array(questions, dtype=np.int32) - - def forward(self, examples, labels): - """Build the graph for the forward pass.""" - opts = self._options - - # Declare all variables we need. - # Embedding: [vocab_size, emb_dim] - init_width = 0.5 / opts.emb_dim - emb = tf.Variable( - tf.random_uniform( - [opts.vocab_size, opts.emb_dim], -init_width, init_width), - name="emb") - self._emb = emb - - # Softmax weight: [vocab_size, emb_dim]. Transposed. - sm_w_t = tf.Variable( - tf.zeros([opts.vocab_size, opts.emb_dim]), - name="sm_w_t") - - # Softmax bias: [vocab_size]. - sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b") - - # Global step: scalar, i.e., shape []. - self.global_step = tf.Variable(0, name="global_step") - - # Nodes to compute the nce loss w/ candidate sampling. - labels_matrix = tf.reshape( - tf.cast(labels, - dtype=tf.int64), - [opts.batch_size, 1]) - - # Negative sampling. 
- sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler( - true_classes=labels_matrix, - num_true=1, - num_sampled=opts.num_samples, - unique=True, - range_max=opts.vocab_size, - distortion=0.75, - unigrams=opts.vocab_counts.tolist())) - - # Embeddings for examples: [batch_size, emb_dim] - example_emb = tf.nn.embedding_lookup(emb, examples) - - # Weights for labels: [batch_size, emb_dim] - true_w = tf.nn.embedding_lookup(sm_w_t, labels) - # Biases for labels: [batch_size, 1] - true_b = tf.nn.embedding_lookup(sm_b, labels) - - # Weights for sampled ids: [num_sampled, emb_dim] - sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids) - # Biases for sampled ids: [num_sampled, 1] - sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids) - - # True logits: [batch_size, 1] - true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b - - # Sampled logits: [batch_size, num_sampled] - # We replicate sampled noise labels for all examples in the batch - # using the matmul. - sampled_b_vec = tf.reshape(sampled_b, [opts.num_samples]) - sampled_logits = tf.matmul(example_emb, - sampled_w, - transpose_b=True) + sampled_b_vec - return true_logits, sampled_logits - - def nce_loss(self, true_logits, sampled_logits): - """Build the graph for the NCE loss.""" - - # cross-entropy(logits, labels) - opts = self._options - true_xent = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.ones_like(true_logits), logits=true_logits) - sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.zeros_like(sampled_logits), logits=sampled_logits) - - # NCE-loss is the sum of the true and noise (sampled words) - # contributions, averaged over the batch. - nce_loss_tensor = (tf.reduce_sum(true_xent) + - tf.reduce_sum(sampled_xent)) / opts.batch_size - return nce_loss_tensor - - def optimize(self, loss): - """Build the graph to optimize the loss function.""" - - # Optimizer nodes. - # Linear learning rate decay. 
- opts = self._options - words_to_train = float(opts.words_per_epoch * opts.epochs_to_train) - lr = opts.learning_rate * tf.maximum( - 0.0001, 1.0 - tf.cast(self._words, tf.float32) / words_to_train) - self._lr = lr - optimizer = tf.train.GradientDescentOptimizer(lr) - train = optimizer.minimize(loss, - global_step=self.global_step, - gate_gradients=optimizer.GATE_NONE) - self._train = train - - def build_eval_graph(self): - """Build the eval graph.""" - # Eval graph - - # Each analogy task is to predict the 4th word (d) given three - # words: a, b, c. E.g., a=italy, b=rome, c=france, we should - # predict d=paris. - - # The eval feeds three vectors of word ids for a, b, c, each of - # which is of size N, where N is the number of analogies we want to - # evaluate in one batch. - analogy_a = tf.placeholder(dtype=tf.int32) # [N] - analogy_b = tf.placeholder(dtype=tf.int32) # [N] - analogy_c = tf.placeholder(dtype=tf.int32) # [N] - - # Normalized word embeddings of shape [vocab_size, emb_dim]. - nemb = tf.nn.l2_normalize(self._emb, 1) - - # Each row of a_emb, b_emb, c_emb is a word's embedding vector. - # They all have the shape [N, emb_dim] - a_emb = tf.gather(nemb, analogy_a) # a's embs - b_emb = tf.gather(nemb, analogy_b) # b's embs - c_emb = tf.gather(nemb, analogy_c) # c's embs - - # We expect that d's embedding vectors on the unit hyper-sphere is - # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim]. - target = c_emb + (b_emb - a_emb) - - # Compute cosine distance between each pair of target and vocab. - # dist has shape [N, vocab_size]. - dist = tf.matmul(target, nemb, transpose_b=True) - - # For each question (row in dist), find the top 4 words. - _, pred_idx = tf.nn.top_k(dist, 4) - - # Nodes for computing neighbors for a given word according to - # their cosine distance. 
- nearby_word = tf.placeholder(dtype=tf.int32) # word id - nearby_emb = tf.gather(nemb, nearby_word) - nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True) - nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, - min(1000, self._options.vocab_size)) - - # Nodes in the construct graph which are used by training and - # evaluation to run/feed/fetch. - self._analogy_a = analogy_a - self._analogy_b = analogy_b - self._analogy_c = analogy_c - self._analogy_pred_idx = pred_idx - self._nearby_word = nearby_word - self._nearby_val = nearby_val - self._nearby_idx = nearby_idx - - def build_graph(self): - """Build the graph for the full model.""" - opts = self._options - # The training data. A text file. - (words, counts, words_per_epoch, self._epoch, self._words, examples, - labels) = word2vec.skipgram_word2vec(filename=opts.train_data, - batch_size=opts.batch_size, - window_size=opts.window_size, - min_count=opts.min_count, - subsample=opts.subsample) - (opts.vocab_words, opts.vocab_counts, - opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch]) - opts.vocab_size = len(opts.vocab_words) - print("Data file: ", opts.train_data) - print("Vocab size: ", opts.vocab_size - 1, " + UNK") - print("Words per epoch: ", opts.words_per_epoch) - self._examples = examples - self._labels = labels - self._id2word = opts.vocab_words - for i, w in enumerate(self._id2word): - self._word2id[w] = i - true_logits, sampled_logits = self.forward(examples, labels) - loss = self.nce_loss(true_logits, sampled_logits) - tf.summary.scalar("NCE loss", loss) - self._loss = loss - self.optimize(loss) - - # Properly initialize all variables. 
- tf.global_variables_initializer().run() - - self.saver = tf.train.Saver() - - def save_vocab(self): - """Save the vocabulary to a file so the model can be reloaded.""" - opts = self._options - with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f: - for i in xrange(opts.vocab_size): - vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8") - f.write("%s %d\n" % (vocab_word, - opts.vocab_counts[i])) - - def _train_thread_body(self): - initial_epoch, = self._session.run([self._epoch]) - while True: - _, epoch = self._session.run([self._train, self._epoch]) - if epoch != initial_epoch: - break - - def train(self): - """Train the model.""" - opts = self._options - - initial_epoch, initial_words = self._session.run([self._epoch, self._words]) - - summary_op = tf.summary.merge_all() - summary_writer = tf.summary.FileWriter(opts.save_path, self._session.graph) - workers = [] - for _ in xrange(opts.concurrent_steps): - t = threading.Thread(target=self._train_thread_body) - t.start() - workers.append(t) - - last_words, last_time, last_summary_time = initial_words, time.time(), 0 - last_checkpoint_time = 0 - while True: - time.sleep(opts.statistics_interval) # Reports our progress once a while. 
- (epoch, step, loss, words, lr) = self._session.run( - [self._epoch, self.global_step, self._loss, self._words, self._lr]) - now = time.time() - last_words, last_time, rate = words, now, (words - last_words) / ( - now - last_time) - print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f words/sec = %8.0f\r" % - (epoch, step, lr, loss, rate), end="") - sys.stdout.flush() - if now - last_summary_time > opts.summary_interval: - summary_str = self._session.run(summary_op) - summary_writer.add_summary(summary_str, step) - last_summary_time = now - if now - last_checkpoint_time > opts.checkpoint_interval: - self.saver.save(self._session, - os.path.join(opts.save_path, "model.ckpt"), - global_step=step.astype(int)) - last_checkpoint_time = now - if epoch != initial_epoch: - break - - for t in workers: - t.join() - - return epoch - - def _predict(self, analogy): - """Predict the top 4 answers for analogy questions.""" - idx, = self._session.run([self._analogy_pred_idx], { - self._analogy_a: analogy[:, 0], - self._analogy_b: analogy[:, 1], - self._analogy_c: analogy[:, 2] - }) - return idx - - def eval(self): - """Evaluate analogy questions and reports accuracy.""" - - # How many questions we get right at precision@1. - correct = 0 - - try: - total = self._analogy_questions.shape[0] - except AttributeError as e: - raise AttributeError("Need to read analogy questions.") - - start = 0 - while start < total: - limit = start + 2500 - sub = self._analogy_questions[start:limit, :] - idx = self._predict(sub) - start = limit - for question in xrange(sub.shape[0]): - for j in xrange(4): - if idx[question, j] == sub[question, 3]: - # Bingo! We predicted correctly. E.g., [italy, rome, france, paris]. - correct += 1 - break - elif idx[question, j] in sub[question, :3]: - # We need to skip words already in the question. 
- continue - else: - # The correct label is not the precision@1 - break - print() - print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total, - correct * 100.0 / total)) - - def analogy(self, w0, w1, w2): - """Predict word w3 as in w0:w1 vs w2:w3.""" - wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]]) - idx = self._predict(wid) - for c in [self._id2word[i] for i in idx[0, :]]: - if c not in [w0, w1, w2]: - print(c) - return - print("unknown") - - def nearby(self, words, num=20): - """Prints out nearby words given a list of words.""" - ids = np.array([self._word2id.get(x, 0) for x in words]) - vals, idx = self._session.run( - [self._nearby_val, self._nearby_idx], {self._nearby_word: ids}) - for i in xrange(len(words)): - print("\n%s\n=====================================" % (words[i])) - for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]): - print("%-20s %6.4f" % (self._id2word[neighbor], distance)) - - -def _start_shell(local_ns=None): - # An interactive shell is useful for debugging/development. - import IPython - user_ns = {} - if local_ns: - user_ns.update(local_ns) - user_ns.update(globals()) - IPython.start_ipython(argv=[], user_ns=user_ns) - - -def main(_): - """Train a word2vec model.""" - if not FLAGS.train_data or not FLAGS.eval_data or not FLAGS.save_path: - print("--train_data --eval_data and --save_path must be specified.") - sys.exit(1) - opts = Options() - with tf.Graph().as_default(), tf.Session() as session: - with tf.device("/cpu:0"): - model = Word2Vec(opts, session) - model.read_analogies() # Read analogy questions - for _ in xrange(opts.epochs_to_train): - model.train() # Process one epoch - model.eval() # Eval analogies. - # Perform a final save. 
- model.saver.save(session, - os.path.join(opts.save_path, "model.ckpt"), - global_step=model.global_step) - if FLAGS.interactive: - # E.g., - # [0]: model.analogy(b'france', b'paris', b'russia') - # [1]: model.nearby([b'proton', b'elephant', b'maxwell']) - _start_shell(locals()) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tutorials/embedding/word2vec_kernels.cc b/tutorials/embedding/word2vec_kernels.cc deleted file mode 100644 index e105f8950a0..00000000000 --- a/tutorials/embedding/word2vec_kernels.cc +++ /dev/null @@ -1,355 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/random/distribution_sampler.h" -#include "tensorflow/core/lib/random/philox_random.h" -#include "tensorflow/core/lib/random/simple_philox.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/util/guarded_philox_random.h" - -namespace tensorflow { - -// Number of examples to precalculate. -const int kPrecalc = 3000; -// Number of words to read into a sentence before processing. 
-const int kSentenceSize = 1000; - -namespace { - -bool ScanWord(StringPiece* input, string* word) { - str_util::RemoveLeadingWhitespace(input); - StringPiece tmp; - if (str_util::ConsumeNonWhitespace(input, &tmp)) { - word->assign(tmp.data(), tmp.size()); - return true; - } else { - return false; - } -} - -} // end namespace - -class SkipgramWord2vecOp : public OpKernel { - public: - explicit SkipgramWord2vecOp(OpKernelConstruction* ctx) - : OpKernel(ctx), rng_(&philox_) { - string filename; - OP_REQUIRES_OK(ctx, ctx->GetAttr("filename", &filename)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("batch_size", &batch_size_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size", &window_size_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("min_count", &min_count_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("subsample", &subsample_)); - OP_REQUIRES_OK(ctx, Init(ctx->env(), filename)); - - mutex_lock l(mu_); - example_pos_ = corpus_size_; - label_pos_ = corpus_size_; - label_limit_ = corpus_size_; - sentence_index_ = kSentenceSize; - for (int i = 0; i < kPrecalc; ++i) { - NextExample(&precalc_examples_[i].input, &precalc_examples_[i].label); - } - } - - void Compute(OpKernelContext* ctx) override { - Tensor words_per_epoch(DT_INT64, TensorShape({})); - Tensor current_epoch(DT_INT32, TensorShape({})); - Tensor total_words_processed(DT_INT64, TensorShape({})); - Tensor examples(DT_INT32, TensorShape({batch_size_})); - auto Texamples = examples.flat(); - Tensor labels(DT_INT32, TensorShape({batch_size_})); - auto Tlabels = labels.flat(); - { - mutex_lock l(mu_); - for (int i = 0; i < batch_size_; ++i) { - Texamples(i) = precalc_examples_[precalc_index_].input; - Tlabels(i) = precalc_examples_[precalc_index_].label; - precalc_index_++; - if (precalc_index_ >= kPrecalc) { - precalc_index_ = 0; - for (int j = 0; j < kPrecalc; ++j) { - NextExample(&precalc_examples_[j].input, - &precalc_examples_[j].label); - } - } - } - words_per_epoch.scalar()() = corpus_size_; - current_epoch.scalar()() = 
current_epoch_; - total_words_processed.scalar()() = total_words_processed_; - } - ctx->set_output(0, word_); - ctx->set_output(1, freq_); - ctx->set_output(2, words_per_epoch); - ctx->set_output(3, current_epoch); - ctx->set_output(4, total_words_processed); - ctx->set_output(5, examples); - ctx->set_output(6, labels); - } - - private: - struct Example { - int32 input; - int32 label; - }; - - int32 batch_size_ = 0; - int32 window_size_ = 5; - float subsample_ = 1e-3; - int min_count_ = 5; - int32 vocab_size_ = 0; - Tensor word_; - Tensor freq_; - int64 corpus_size_ = 0; - std::vector corpus_; - std::vector precalc_examples_; - int precalc_index_ = 0; - std::vector sentence_; - int sentence_index_ = 0; - - mutex mu_; - random::PhiloxRandom philox_ GUARDED_BY(mu_); - random::SimplePhilox rng_ GUARDED_BY(mu_); - int32 current_epoch_ GUARDED_BY(mu_) = -1; - int64 total_words_processed_ GUARDED_BY(mu_) = 0; - int64 example_pos_ GUARDED_BY(mu_); - int32 label_pos_ GUARDED_BY(mu_); - int32 label_limit_ GUARDED_BY(mu_); - - // {example_pos_, label_pos_} is the cursor for the next example. - // example_pos_ wraps around at the end of corpus_. For each - // example, we randomly generate [label_pos_, label_limit) for - // labels. - void NextExample(int32* example, int32* label) EXCLUSIVE_LOCKS_REQUIRED(mu_) { - while (true) { - if (label_pos_ >= label_limit_) { - ++total_words_processed_; - ++sentence_index_; - if (sentence_index_ >= kSentenceSize) { - sentence_index_ = 0; - for (int i = 0; i < kSentenceSize; ++i, ++example_pos_) { - if (example_pos_ >= corpus_size_) { - ++current_epoch_; - example_pos_ = 0; - } - if (subsample_ > 0) { - int32 word_freq = freq_.flat()(corpus_[example_pos_]); - // See Eq. 
5 in http://arxiv.org/abs/1310.4546 - float keep_prob = - (std::sqrt(word_freq / (subsample_ * corpus_size_)) + 1) * - (subsample_ * corpus_size_) / word_freq; - if (rng_.RandFloat() > keep_prob) { - i--; - continue; - } - } - sentence_[i] = corpus_[example_pos_]; - } - } - const int32 skip = 1 + rng_.Uniform(window_size_); - label_pos_ = std::max(0, sentence_index_ - skip); - label_limit_ = - std::min(kSentenceSize, sentence_index_ + skip + 1); - } - if (sentence_index_ != label_pos_) { - break; - } - ++label_pos_; - } - *example = sentence_[sentence_index_]; - *label = sentence_[label_pos_++]; - } - - Status Init(Env* env, const string& filename) { - string data; - TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &data)); - StringPiece input = data; - string w; - corpus_size_ = 0; - std::unordered_map word_freq; - while (ScanWord(&input, &w)) { - ++(word_freq[w]); - ++corpus_size_; - } - if (corpus_size_ < window_size_ * 10) { - return errors::InvalidArgument("The text file ", filename, - " contains too little data: ", - corpus_size_, " words"); - } - typedef std::pair WordFreq; - std::vector ordered; - for (const auto& p : word_freq) { - if (p.second >= min_count_) ordered.push_back(p); - } - LOG(INFO) << "Data file: " << filename << " contains " << data.size() - << " bytes, " << corpus_size_ << " words, " << word_freq.size() - << " unique words, " << ordered.size() - << " unique frequent words."; - word_freq.clear(); - std::sort(ordered.begin(), ordered.end(), - [](const WordFreq& x, const WordFreq& y) { - return x.second > y.second; - }); - vocab_size_ = static_cast(1 + ordered.size()); - Tensor word(DT_STRING, TensorShape({vocab_size_})); - Tensor freq(DT_INT32, TensorShape({vocab_size_})); - word.flat()(0) = "UNK"; - static const int32 kUnkId = 0; - std::unordered_map word_id; - int64 total_counted = 0; - for (std::size_t i = 0; i < ordered.size(); ++i) { - const auto& w = ordered[i].first; - auto id = i + 1; - word.flat()(id) = w; - auto word_count = 
ordered[i].second; - freq.flat()(id) = word_count; - total_counted += word_count; - word_id[w] = id; - } - freq.flat()(kUnkId) = corpus_size_ - total_counted; - word_ = word; - freq_ = freq; - corpus_.reserve(corpus_size_); - input = data; - while (ScanWord(&input, &w)) { - corpus_.push_back(gtl::FindWithDefault(word_id, w, kUnkId)); - } - precalc_examples_.resize(kPrecalc); - sentence_.resize(kSentenceSize); - return Status::OK(); - } -}; - -REGISTER_KERNEL_BUILDER(Name("SkipgramWord2vec").Device(DEVICE_CPU), SkipgramWord2vecOp); - -class NegTrainWord2vecOp : public OpKernel { - public: - explicit NegTrainWord2vecOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - base_.Init(0, 0); - - OP_REQUIRES_OK(ctx, ctx->GetAttr("num_negative_samples", &num_samples_)); - - std::vector vocab_count; - OP_REQUIRES_OK(ctx, ctx->GetAttr("vocab_count", &vocab_count)); - - std::vector vocab_weights; - vocab_weights.reserve(vocab_count.size()); - for (const auto& f : vocab_count) { - float r = std::pow(static_cast(f), 0.75f); - vocab_weights.push_back(r); - } - sampler_ = new random::DistributionSampler(vocab_weights); - } - - ~NegTrainWord2vecOp() { delete sampler_; } - - void Compute(OpKernelContext* ctx) override { - Tensor w_in = ctx->mutable_input(0, false); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(w_in.shape()), - errors::InvalidArgument("Must be a matrix")); - Tensor w_out = ctx->mutable_input(1, false); - OP_REQUIRES(ctx, w_in.shape() == w_out.shape(), - errors::InvalidArgument("w_in.shape == w_out.shape")); - const Tensor& examples = ctx->input(2); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(examples.shape()), - errors::InvalidArgument("Must be a vector")); - const Tensor& labels = ctx->input(3); - OP_REQUIRES(ctx, examples.shape() == labels.shape(), - errors::InvalidArgument("examples.shape == labels.shape")); - const Tensor& learning_rate = ctx->input(4); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(learning_rate.shape()), - errors::InvalidArgument("Must be a 
scalar")); - - auto Tw_in = w_in.matrix(); - auto Tw_out = w_out.matrix(); - auto Texamples = examples.flat(); - auto Tlabels = labels.flat(); - auto lr = learning_rate.scalar()(); - const int64 vocab_size = w_in.dim_size(0); - const int64 dims = w_in.dim_size(1); - const int64 batch_size = examples.dim_size(0); - OP_REQUIRES(ctx, vocab_size == sampler_->num(), - errors::InvalidArgument("vocab_size mismatches: ", vocab_size, - " vs. ", sampler_->num())); - - // Gradient accumulator for v_in. - Tensor buf(DT_FLOAT, TensorShape({dims})); - auto Tbuf = buf.flat(); - - // Scalar buffer to hold sigmoid(+/- dot). - Tensor g_buf(DT_FLOAT, TensorShape({})); - auto g = g_buf.scalar(); - - // The following loop needs 2 random 32-bit values per negative - // sample. We reserve 8 values per sample just in case the - // underlying implementation changes. - auto rnd = base_.ReserveSamples32(batch_size * num_samples_ * 8); - random::SimplePhilox srnd(&rnd); - - for (int64 i = 0; i < batch_size; ++i) { - const int32 example = Texamples(i); - DCHECK(0 <= example && example < vocab_size) << example; - const int32 label = Tlabels(i); - DCHECK(0 <= label && label < vocab_size) << label; - auto v_in = Tw_in.chip<0>(example); - - // Positive: example predicts label. - // forward: x = v_in' * v_out - // l = log(sigmoid(x)) - // backward: dl/dx = g = sigmoid(-x) - // dl/d(v_in) = g * v_out' - // dl/d(v_out) = v_in' * g - { - auto v_out = Tw_out.chip<0>(label); - auto dot = (v_in * v_out).sum(); - g = (dot.exp() + 1.f).inverse(); - Tbuf = v_out * (g() * lr); - v_out += v_in * (g() * lr); - } - - // Negative samples: - // forward: x = v_in' * v_sample - // l = log(sigmoid(-x)) - // backward: dl/dx = g = -sigmoid(x) - // dl/d(v_in) = g * v_out' - // dl/d(v_out) = v_in' * g - for (int j = 0; j < num_samples_; ++j) { - const int sample = sampler_->Sample(&srnd); - if (sample == label) continue; // Skip. 
- auto v_sample = Tw_out.chip<0>(sample); - auto dot = (v_in * v_sample).sum(); - g = -((-dot).exp() + 1.f).inverse(); - Tbuf += v_sample * (g() * lr); - v_sample += v_in * (g() * lr); - } - - // Applies the gradient on v_in. - v_in += Tbuf; - } - } - - private: - int32 num_samples_ = 0; - random::DistributionSampler* sampler_ = nullptr; - GuardedPhiloxRandom base_; -}; - -REGISTER_KERNEL_BUILDER(Name("NegTrainWord2vec").Device(DEVICE_CPU), NegTrainWord2vecOp); - -} // end namespace tensorflow diff --git a/tutorials/embedding/word2vec_ops.cc b/tutorials/embedding/word2vec_ops.cc deleted file mode 100644 index cdffa4a7725..00000000000 --- a/tutorials/embedding/word2vec_ops.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" - -namespace tensorflow { - -REGISTER_OP("SkipgramWord2vec") - .Output("vocab_word: string") - .Output("vocab_freq: int32") - .Output("words_per_epoch: int64") - .Output("current_epoch: int32") - .Output("total_words_processed: int64") - .Output("examples: int32") - .Output("labels: int32") - .SetIsStateful() - .Attr("filename: string") - .Attr("batch_size: int") - .Attr("window_size: int = 5") - .Attr("min_count: int = 5") - .Attr("subsample: float = 1e-3") - .Doc(R"doc( -Parses a text file and creates a batch of examples. 
- -vocab_word: A vector of words in the corpus. -vocab_freq: Frequencies of words. Sorted in the non-ascending order. -words_per_epoch: Number of words per epoch in the data file. -current_epoch: The current epoch number. -total_words_processed: The total number of words processed so far. -examples: A vector of word ids. -labels: A vector of word ids. -filename: The corpus's text file name. -batch_size: The size of produced batch. -window_size: The number of words to predict to the left and right of the target. -min_count: The minimum number of word occurrences for it to be included in the - vocabulary. -subsample: Threshold for word occurrence. Words that appear with higher - frequency will be randomly down-sampled. Set to 0 to disable. -)doc"); - -REGISTER_OP("NegTrainWord2vec") - .Input("w_in: Ref(float)") - .Input("w_out: Ref(float)") - .Input("examples: int32") - .Input("labels: int32") - .Input("lr: float") - .SetIsStateful() - .Attr("vocab_count: list(int)") - .Attr("num_negative_samples: int") - .Doc(R"doc( -Training via negative sampling. - -w_in: input word embedding. -w_out: output word embedding. -examples: A vector of word ids. -labels: A vector of word ids. -vocab_count: Count of words in the vocabulary. -num_negative_samples: Number of negative samples per example. -)doc"); - -} // end namespace tensorflow diff --git a/tutorials/embedding/word2vec_optimized.py b/tutorials/embedding/word2vec_optimized.py deleted file mode 100644 index 420991a8a5d..00000000000 --- a/tutorials/embedding/word2vec_optimized.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Multi-threaded word2vec unbatched skip-gram model. - -Trains the model described in: -(Mikolov, et. al.) Efficient Estimation of Word Representations in Vector Space -ICLR 2013. -http://arxiv.org/abs/1301.3781 -This model does true SGD (i.e. no minibatching). To do this efficiently, custom -ops are used to sequentially process data within a 'batch'. - -The key ops used are: -* skipgram custom op that does input processing. -* neg_train custom op that efficiently calculates and applies the gradient using - true SGD. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import threading -import time - -from six.moves import xrange # pylint: disable=redefined-builtin - -import numpy as np -import tensorflow as tf - -word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so')) - -flags = tf.app.flags - -flags.DEFINE_string("save_path", None, "Directory to write the model.") -flags.DEFINE_string( - "train_data", None, - "Training data. E.g., unzipped file http://mattmahoney.net/dc/text8.zip.") -flags.DEFINE_string( - "eval_data", None, "Analogy questions. " - "See README.md for how to get 'questions-words.txt'.") -flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.") -flags.DEFINE_integer( - "epochs_to_train", 15, - "Number of epochs to train. 
Each epoch processes the training data once " - "completely.") -flags.DEFINE_float("learning_rate", 0.025, "Initial learning rate.") -flags.DEFINE_integer("num_neg_samples", 25, - "Negative samples per training example.") -flags.DEFINE_integer("batch_size", 500, - "Numbers of training examples each step processes " - "(no minibatching).") -flags.DEFINE_integer("concurrent_steps", 12, - "The number of concurrent training steps.") -flags.DEFINE_integer("window_size", 5, - "The number of words to predict to the left and right " - "of the target word.") -flags.DEFINE_integer("min_count", 5, - "The minimum number of word occurrences for it to be " - "included in the vocabulary.") -flags.DEFINE_float("subsample", 1e-3, - "Subsample threshold for word occurrence. Words that appear " - "with higher frequency will be randomly down-sampled. Set " - "to 0 to disable.") -flags.DEFINE_boolean( - "interactive", False, - "If true, enters an IPython interactive session to play with the trained " - "model. E.g., try model.analogy(b'france', b'paris', b'russia') and " - "model.nearby([b'proton', b'elephant', b'maxwell'])") - -FLAGS = flags.FLAGS - - -class Options(object): - """Options used by our word2vec model.""" - - def __init__(self): - # Model options. - - # Embedding dimension. - self.emb_dim = FLAGS.embedding_size - - # Training options. - - # The training text file. - self.train_data = FLAGS.train_data - - # Number of negative samples per example. - self.num_samples = FLAGS.num_neg_samples - - # The initial learning rate. - self.learning_rate = FLAGS.learning_rate - - # Number of epochs to train. After these many epochs, the learning - # rate decays linearly to zero and the training stops. - self.epochs_to_train = FLAGS.epochs_to_train - - # Concurrent training steps. - self.concurrent_steps = FLAGS.concurrent_steps - - # Number of examples for one training step. 
- self.batch_size = FLAGS.batch_size - - # The number of words to predict to the left and right of the target word. - self.window_size = FLAGS.window_size - - # The minimum number of word occurrences for it to be included in the - # vocabulary. - self.min_count = FLAGS.min_count - - # Subsampling threshold for word occurrence. - self.subsample = FLAGS.subsample - - # Where to write out summaries. - self.save_path = FLAGS.save_path - if not os.path.exists(self.save_path): - os.makedirs(self.save_path) - - # Eval options. - - # The text file for eval. - self.eval_data = FLAGS.eval_data - - -class Word2Vec(object): - """Word2Vec model (Skipgram).""" - - def __init__(self, options, session): - self._options = options - self._session = session - self._word2id = {} - self._id2word = [] - self.build_graph() - self.build_eval_graph() - self.save_vocab() - - def read_analogies(self): - """Reads through the analogy question file. - - Returns: - questions: a [n, 4] numpy array containing the analogy question's - word ids. - questions_skipped: questions skipped due to unknown words. - """ - questions = [] - questions_skipped = 0 - with open(self._options.eval_data, "rb") as analogy_f: - for line in analogy_f: - if line.startswith(b":"): # Skip comments. - continue - words = line.strip().lower().split(b" ") - ids = [self._word2id.get(w.strip()) for w in words] - if None in ids or len(ids) != 4: - questions_skipped += 1 - else: - questions.append(np.array(ids)) - print("Eval analogy file: ", self._options.eval_data) - print("Questions: ", len(questions)) - print("Skipped: ", questions_skipped) - self._analogy_questions = np.array(questions, dtype=np.int32) - - def build_graph(self): - """Build the model graph.""" - opts = self._options - - # The training data. A text file. 
- (words, counts, words_per_epoch, current_epoch, total_words_processed, - examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data, - batch_size=opts.batch_size, - window_size=opts.window_size, - min_count=opts.min_count, - subsample=opts.subsample) - (opts.vocab_words, opts.vocab_counts, - opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch]) - opts.vocab_size = len(opts.vocab_words) - print("Data file: ", opts.train_data) - print("Vocab size: ", opts.vocab_size - 1, " + UNK") - print("Words per epoch: ", opts.words_per_epoch) - - self._id2word = opts.vocab_words - for i, w in enumerate(self._id2word): - self._word2id[w] = i - - # Declare all variables we need. - # Input words embedding: [vocab_size, emb_dim] - w_in = tf.Variable( - tf.random_uniform( - [opts.vocab_size, - opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim), - name="w_in") - - # Global step: scalar, i.e., shape []. - w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out") - - # Global step: [] - global_step = tf.Variable(0, name="global_step") - - # Linear learning rate decay. - words_to_train = float(opts.words_per_epoch * opts.epochs_to_train) - lr = opts.learning_rate * tf.maximum( - 0.0001, - 1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train) - - # Training nodes. 
- inc = global_step.assign_add(1) - with tf.control_dependencies([inc]): - train = word2vec.neg_train_word2vec(w_in, - w_out, - examples, - labels, - lr, - vocab_count=opts.vocab_counts.tolist(), - num_negative_samples=opts.num_samples) - - self._w_in = w_in - self._examples = examples - self._labels = labels - self._lr = lr - self._train = train - self.global_step = global_step - self._epoch = current_epoch - self._words = total_words_processed - - def save_vocab(self): - """Save the vocabulary to a file so the model can be reloaded.""" - opts = self._options - with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f: - for i in xrange(opts.vocab_size): - vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8") - f.write("%s %d\n" % (vocab_word, - opts.vocab_counts[i])) - - def build_eval_graph(self): - """Build the evaluation graph.""" - # Eval graph - opts = self._options - - # Each analogy task is to predict the 4th word (d) given three - # words: a, b, c. E.g., a=italy, b=rome, c=france, we should - # predict d=paris. - - # The eval feeds three vectors of word ids for a, b, c, each of - # which is of size N, where N is the number of analogies we want to - # evaluate in one batch. - analogy_a = tf.placeholder(dtype=tf.int32) # [N] - analogy_b = tf.placeholder(dtype=tf.int32) # [N] - analogy_c = tf.placeholder(dtype=tf.int32) # [N] - - # Normalized word embeddings of shape [vocab_size, emb_dim]. - nemb = tf.nn.l2_normalize(self._w_in, 1) - - # Each row of a_emb, b_emb, c_emb is a word's embedding vector. - # They all have the shape [N, emb_dim] - a_emb = tf.gather(nemb, analogy_a) # a's embs - b_emb = tf.gather(nemb, analogy_b) # b's embs - c_emb = tf.gather(nemb, analogy_c) # c's embs - - # We expect that d's embedding vectors on the unit hyper-sphere is - # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim]. - target = c_emb + (b_emb - a_emb) - - # Compute cosine distance between each pair of target and vocab. 
- # dist has shape [N, vocab_size]. - dist = tf.matmul(target, nemb, transpose_b=True) - - # For each question (row in dist), find the top 4 words. - _, pred_idx = tf.nn.top_k(dist, 4) - - # Nodes for computing neighbors for a given word according to - # their cosine distance. - nearby_word = tf.placeholder(dtype=tf.int32) # word id - nearby_emb = tf.gather(nemb, nearby_word) - nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True) - nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, - min(1000, opts.vocab_size)) - - # Nodes in the construct graph which are used by training and - # evaluation to run/feed/fetch. - self._analogy_a = analogy_a - self._analogy_b = analogy_b - self._analogy_c = analogy_c - self._analogy_pred_idx = pred_idx - self._nearby_word = nearby_word - self._nearby_val = nearby_val - self._nearby_idx = nearby_idx - - # Properly initialize all variables. - tf.global_variables_initializer().run() - - self.saver = tf.train.Saver() - - def _train_thread_body(self): - initial_epoch, = self._session.run([self._epoch]) - while True: - _, epoch = self._session.run([self._train, self._epoch]) - if epoch != initial_epoch: - break - - def train(self): - """Train the model.""" - opts = self._options - - initial_epoch, initial_words = self._session.run([self._epoch, self._words]) - - workers = [] - for _ in xrange(opts.concurrent_steps): - t = threading.Thread(target=self._train_thread_body) - t.start() - workers.append(t) - - last_words, last_time = initial_words, time.time() - while True: - time.sleep(5) # Reports our progress once a while. 
- (epoch, step, words, lr) = self._session.run( - [self._epoch, self.global_step, self._words, self._lr]) - now = time.time() - last_words, last_time, rate = words, now, (words - last_words) / ( - now - last_time) - print("Epoch %4d Step %8d: lr = %5.3f words/sec = %8.0f\r" % (epoch, step, - lr, rate), - end="") - sys.stdout.flush() - if epoch != initial_epoch: - break - - for t in workers: - t.join() - - def _predict(self, analogy): - """Predict the top 4 answers for analogy questions.""" - idx, = self._session.run([self._analogy_pred_idx], { - self._analogy_a: analogy[:, 0], - self._analogy_b: analogy[:, 1], - self._analogy_c: analogy[:, 2] - }) - return idx - - def eval(self): - """Evaluate analogy questions and reports accuracy.""" - - # How many questions we get right at precision@1. - correct = 0 - - try: - total = self._analogy_questions.shape[0] - except AttributeError as e: - raise AttributeError("Need to read analogy questions.") - - start = 0 - while start < total: - limit = start + 2500 - sub = self._analogy_questions[start:limit, :] - idx = self._predict(sub) - start = limit - for question in xrange(sub.shape[0]): - for j in xrange(4): - if idx[question, j] == sub[question, 3]: - # Bingo! We predicted correctly. E.g., [italy, rome, france, paris]. - correct += 1 - break - elif idx[question, j] in sub[question, :3]: - # We need to skip words already in the question. 
- continue - else: - # The correct label is not the precision@1 - break - print() - print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total, - correct * 100.0 / total)) - - def analogy(self, w0, w1, w2): - """Predict word w3 as in w0:w1 vs w2:w3.""" - wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]]) - idx = self._predict(wid) - for c in [self._id2word[i] for i in idx[0, :]]: - if c not in [w0, w1, w2]: - print(c) - break - print("unknown") - - def nearby(self, words, num=20): - """Prints out nearby words given a list of words.""" - ids = np.array([self._word2id.get(x, 0) for x in words]) - vals, idx = self._session.run( - [self._nearby_val, self._nearby_idx], {self._nearby_word: ids}) - for i in xrange(len(words)): - print("\n%s\n=====================================" % (words[i])) - for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]): - print("%-20s %6.4f" % (self._id2word[neighbor], distance)) - - -def _start_shell(local_ns=None): - # An interactive shell is useful for debugging/development. - import IPython - user_ns = {} - if local_ns: - user_ns.update(local_ns) - user_ns.update(globals()) - IPython.start_ipython(argv=[], user_ns=user_ns) - - -def main(_): - """Train a word2vec model.""" - if not FLAGS.train_data or not FLAGS.eval_data or not FLAGS.save_path: - print("--train_data --eval_data and --save_path must be specified.") - sys.exit(1) - opts = Options() - with tf.Graph().as_default(), tf.Session() as session: - with tf.device("/cpu:0"): - model = Word2Vec(opts, session) - model.read_analogies() # Read analogy questions - for _ in xrange(opts.epochs_to_train): - model.train() # Process one epoch - model.eval() # Eval analogies. - # Perform a final save. 
- model.saver.save(session, os.path.join(opts.save_path, "model.ckpt"), - global_step=model.global_step) - if FLAGS.interactive: - # E.g., - # [0]: model.analogy(b'france', b'paris', b'russia') - # [1]: model.nearby([b'proton', b'elephant', b'maxwell']) - _start_shell(locals()) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tutorials/embedding/word2vec_optimized_test.py b/tutorials/embedding/word2vec_optimized_test.py deleted file mode 100644 index d00a14f991d..00000000000 --- a/tutorials/embedding/word2vec_optimized_test.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Tests for word2vec_optimized module.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import tensorflow as tf - -import word2vec_optimized - -flags = tf.app.flags - -FLAGS = flags.FLAGS - - -class Word2VecTest(tf.test.TestCase): - - def setUp(self): - FLAGS.train_data = os.path.join(self.get_temp_dir() + "test-text.txt") - FLAGS.eval_data = os.path.join(self.get_temp_dir() + "eval-text.txt") - FLAGS.save_path = self.get_temp_dir() - with open(FLAGS.train_data, "w") as f: - f.write( - """alice was beginning to get very tired of sitting by her sister on - the bank, and of having nothing to do: once or twice she had peeped - into the book her sister was reading, but it had no pictures or - conversations in it, 'and what is the use of a book,' thought alice - 'without pictures or conversations?' So she was considering in her own - mind (as well as she could, for the hot day made her feel very sleepy - and stupid), whether the pleasure of making a daisy-chain would be - worth the trouble of getting up and picking the daisies, when suddenly - a White rabbit with pink eyes ran close by her.\n""") - with open(FLAGS.eval_data, "w") as f: - f.write("alice she rabbit once\n") - - def testWord2VecOptimized(self): - FLAGS.batch_size = 5 - FLAGS.num_neg_samples = 10 - FLAGS.epochs_to_train = 1 - FLAGS.min_count = 0 - word2vec_optimized.main([]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/tutorials/embedding/word2vec_test.py b/tutorials/embedding/word2vec_test.py deleted file mode 100644 index b5068d85b47..00000000000 --- a/tutorials/embedding/word2vec_test.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Tests for word2vec module.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import tensorflow as tf - -import word2vec - -flags = tf.app.flags - -FLAGS = flags.FLAGS - - -class Word2VecTest(tf.test.TestCase): - - def setUp(self): - FLAGS.train_data = os.path.join(self.get_temp_dir(), "test-text.txt") - FLAGS.eval_data = os.path.join(self.get_temp_dir(), "eval-text.txt") - FLAGS.save_path = self.get_temp_dir() - with open(FLAGS.train_data, "w") as f: - f.write( - """alice was beginning to get very tired of sitting by her sister on - the bank, and of having nothing to do: once or twice she had peeped - into the book her sister was reading, but it had no pictures or - conversations in it, 'and what is the use of a book,' thought alice - 'without pictures or conversations?' 
So she was considering in her own - mind (as well as she could, for the hot day made her feel very sleepy - and stupid), whether the pleasure of making a daisy-chain would be - worth the trouble of getting up and picking the daisies, when suddenly - a White rabbit with pink eyes ran close by her.\n""") - with open(FLAGS.eval_data, "w") as f: - f.write("alice she rabbit once\n") - - def testWord2Vec(self): - FLAGS.batch_size = 5 - FLAGS.num_neg_samples = 10 - FLAGS.epochs_to_train = 1 - FLAGS.min_count = 0 - word2vec.main([]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/tutorials/image/__init__.py b/tutorials/image/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tutorials/image/alexnet/BUILD b/tutorials/image/alexnet/BUILD deleted file mode 100644 index bbe29da6f5c..00000000000 --- a/tutorials/image/alexnet/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -# Description: -# Benchmark for AlexNet. - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_binary( - name = "alexnet_benchmark", - srcs = [ - "alexnet_benchmark.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tutorials/image/alexnet/__init__.py b/tutorials/image/alexnet/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tutorials/image/alexnet/alexnet_benchmark.py b/tutorials/image/alexnet/alexnet_benchmark.py deleted file mode 100644 index 39fcb109f0a..00000000000 --- a/tutorials/image/alexnet/alexnet_benchmark.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Timing benchmark for AlexNet inference. - -To run, use: - bazel run -c opt --config=cuda \ - models/tutorials/image/alexnet:alexnet_benchmark - -Across 100 steps on batch size = 128. - -Forward pass: -Run on Tesla K40c: 145 +/- 1.5 ms / batch -Run on Titan X: 70 +/- 0.1 ms / batch - -Forward-backward pass: -Run on Tesla K40c: 480 +/- 48 ms / batch -Run on Titan X: 244 +/- 30 ms / batch -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -from datetime import datetime -import math -import sys -import time - -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -FLAGS = None - - -def print_activations(t): - print(t.op.name, ' ', t.get_shape().as_list()) - - -def inference(images): - """Build the AlexNet model. - - Args: - images: Images Tensor - - Returns: - pool5: the last Tensor in the convolutional component of AlexNet. - parameters: a list of Tensors corresponding to the weights and biases of the - AlexNet model. 
- """ - parameters = [] - # conv1 - with tf.name_scope('conv1') as scope: - kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(bias, name=scope) - print_activations(conv1) - parameters += [kernel, biases] - - # lrn1 - with tf.name_scope('lrn1') as scope: - lrn1 = tf.nn.local_response_normalization(conv1, - alpha=1e-4, - beta=0.75, - depth_radius=2, - bias=2.0) - - # pool1 - pool1 = tf.nn.max_pool(lrn1, - ksize=[1, 3, 3, 1], - strides=[1, 2, 2, 1], - padding='VALID', - name='pool1') - print_activations(pool1) - - # conv2 - with tf.name_scope('conv2') as scope: - kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv2 = tf.nn.relu(bias, name=scope) - parameters += [kernel, biases] - print_activations(conv2) - - # lrn2 - with tf.name_scope('lrn2') as scope: - lrn2 = tf.nn.local_response_normalization(conv2, - alpha=1e-4, - beta=0.75, - depth_radius=2, - bias=2.0) - - # pool2 - pool2 = tf.nn.max_pool(lrn2, - ksize=[1, 3, 3, 1], - strides=[1, 2, 2, 1], - padding='VALID', - name='pool2') - print_activations(pool2) - - # conv3 - with tf.name_scope('conv3') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384], - dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(pool2, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv3 = tf.nn.relu(bias, name=scope) - 
parameters += [kernel, biases] - print_activations(conv3) - - # conv4 - with tf.name_scope('conv4') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256], - dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv4 = tf.nn.relu(bias, name=scope) - parameters += [kernel, biases] - print_activations(conv4) - - # conv5 - with tf.name_scope('conv5') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256], - dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv5 = tf.nn.relu(bias, name=scope) - parameters += [kernel, biases] - print_activations(conv5) - - # pool5 - pool5 = tf.nn.max_pool(conv5, - ksize=[1, 3, 3, 1], - strides=[1, 2, 2, 1], - padding='VALID', - name='pool5') - print_activations(pool5) - - return pool5, parameters - - -def time_tensorflow_run(session, target, info_string): - """Run the computation to obtain the target tensor and print timing stats. - - Args: - session: the TensorFlow session to run the computation under. - target: the target Tensor that is passed to the session's run() function. - info_string: a string summarizing this run, to be printed with the stats. 
- - Returns: - None - """ - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target) - duration = time.time() - start_time - if i >= num_steps_burn_in: - if not i % 10: - print ('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) - - - -def run_benchmark(): - """Run the benchmark on AlexNet.""" - with tf.Graph().as_default(): - # Generate some dummy images. - image_size = 224 - # Note that our padding definition is slightly different the cuda-convnet. - # In order to force the model to start with the same activations sizes, - # we add 3 to the image_size and employ VALID padding above. - images = tf.Variable(tf.random_normal([FLAGS.batch_size, - image_size, - image_size, 3], - dtype=tf.float32, - stddev=1e-1)) - - # Build a Graph that computes the logits predictions from the - # inference model. - pool5, parameters = inference(images) - - # Build an initialization operation. - init = tf.global_variables_initializer() - - # Start running operations on the Graph. - config = tf.ConfigProto() - config.gpu_options.allocator_type = 'BFC' - sess = tf.Session(config=config) - sess.run(init) - - # Run the forward benchmark. - time_tensorflow_run(sess, pool5, "Forward") - - # Add a simple objective so we can calculate the backward pass. - objective = tf.nn.l2_loss(pool5) - # Compute the gradient with respect to all the parameters. - grad = tf.gradients(objective, parameters) - # Run the backward benchmark. 
- time_tensorflow_run(sess, grad, "Forward-backward") - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_size', - type=int, - default=128, - help='Batch size.' - ) - parser.add_argument( - '--num_batches', - type=int, - default=100, - help='Number of batches to run.' - ) - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tutorials/image/cifar10/BUILD b/tutorials/image/cifar10/BUILD deleted file mode 100644 index 9cf574f605e..00000000000 --- a/tutorials/image/cifar10/BUILD +++ /dev/null @@ -1,87 +0,0 @@ -# Description: -# Example TensorFlow models for CIFAR-10 - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_library( - name = "cifar10_input", - srcs = ["cifar10_input.py"], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -py_test( - name = "cifar10_input_test", - size = "small", - srcs = ["cifar10_input_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":cifar10_input", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -py_library( - name = "cifar10", - srcs = ["cifar10.py"], - srcs_version = "PY2AND3", - deps = [ - ":cifar10_input", - "//tensorflow:tensorflow_py", - ], -) - -py_binary( - name = "cifar10_eval", - srcs = [ - "cifar10_eval.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - -py_binary( - name = "cifar10_train", - srcs = [ - "cifar10_train.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - -py_binary( - name = "cifar10_multi_gpu_train", - srcs = [ - "cifar10_multi_gpu_train.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - 
-filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tutorials/image/cifar10/README.md b/tutorials/image/cifar10/README.md deleted file mode 100644 index 69b6d08e431..00000000000 --- a/tutorials/image/cifar10/README.md +++ /dev/null @@ -1,13 +0,0 @@ -**NOTE: For users interested in multi-GPU, we recommend looking at the newer [cifar10_estimator](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) example instead.** - ---- - -CIFAR-10 is a common benchmark in machine learning for image recognition. - -http://www.cs.toronto.edu/~kriz/cifar.html - -Code in this directory demonstrates how to use TensorFlow to train and evaluate a convolutional neural network (CNN) on both CPU and GPU. We also demonstrate how to train a CNN over multiple GPUs. - -Detailed instructions on how to get started available at: - -https://www.tensorflow.org/tutorials/images/deep_cnn diff --git a/tutorials/image/cifar10/__init__.py b/tutorials/image/cifar10/__init__.py deleted file mode 100644 index 6b2729e7e0b..00000000000 --- a/tutorials/image/cifar10/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Makes helper libraries available in the cifar10 package.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import cifar10 -import cifar10_input diff --git a/tutorials/image/cifar10/cifar10.py b/tutorials/image/cifar10/cifar10.py deleted file mode 100644 index c725a890b82..00000000000 --- a/tutorials/image/cifar10/cifar10.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Builds the CIFAR-10 network. - -Summary of available functions: - - # Compute input images and labels for training. If you would like to run - # evaluations, use inputs() instead. - inputs, labels = distorted_inputs() - - # Compute inference on the model inputs to make a prediction. - predictions = inference(inputs) - - # Compute the total loss of the prediction with respect to the labels. - loss = loss(predictions, labels) - - # Create a graph to run one step of training with respect to the loss. 
- train_op = train(loss, global_step) -""" -# pylint: disable=missing-docstring -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re - -import tensorflow as tf - -import cifar10_input - -FLAGS = tf.app.flags.FLAGS - -# Basic model parameters. -tf.app.flags.DEFINE_integer('batch_size', 128, - """Number of images to process in a batch.""") -tf.app.flags.DEFINE_boolean('use_fp16', True, - """Train the model using fp16.""") - -# Global constants describing the CIFAR-10 data set. -IMAGE_SIZE = cifar10_input.IMAGE_SIZE -NUM_CLASSES = cifar10_input.NUM_CLASSES -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN -NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL - - -# Constants describing the training process. -MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. -NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays. -LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor. -INITIAL_LEARNING_RATE = 0.1 # Initial learning rate. - -# If a model is trained with multiple GPUs, prefix all Op names with tower_name -# to differentiate the operations. Note that this prefix is removed from the -# names of the summaries when visualizing a model. -TOWER_NAME = 'tower' - - -def _activation_summary(x): - """Helper to create summaries for activations. - - Creates a summary that provides a histogram of activations. - Creates a summary that measures the sparsity of activations. - - Args: - x: Tensor - Returns: - nothing - """ - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. 
- tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name) - tf.summary.histogram(tensor_name + '/activations', x) - tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x)) - - -def _variable_on_cpu(name, shape, initializer): - """Helper to create a Variable stored on CPU memory. - - Args: - name: name of the variable - shape: list of ints - initializer: initializer for Variable - - Returns: - Variable Tensor - """ - with tf.device('/cpu:0'): - dtype = tf.float16 if FLAGS.use_fp16 else tf.float32 - var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype) - return var - - -def _variable_with_weight_decay(name, shape, stddev, wd): - """Helper to create an initialized Variable with weight decay. - - Note that the Variable is initialized with a truncated normal distribution. - A weight decay is added only if one is specified. - - Args: - name: name of the variable - shape: list of ints - stddev: standard deviation of a truncated Gaussian - wd: add L2Loss weight decay multiplied by this float. If None, weight - decay is not added for this Variable. - - Returns: - Variable Tensor - """ - dtype = tf.float16 if FLAGS.use_fp16 else tf.float32 - var = _variable_on_cpu( - name, - shape, - tf.truncated_normal_initializer(stddev=stddev, dtype=dtype)) - if wd is not None: - weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - return var - - -def distorted_inputs(): - """Construct distorted input for CIFAR training using the Reader ops. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - """ - images, labels = cifar10_input.distorted_inputs(batch_size=FLAGS.batch_size) - if FLAGS.use_fp16: - images = tf.cast(images, tf.float16) - labels = tf.cast(labels, tf.float16) - return images, labels - - -def inputs(eval_data): - """Construct input for CIFAR evaluation using the Reader ops. 
- Args: - eval_data: bool, indicating if one should use the train or eval data set. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - """ - images, labels = cifar10_input.inputs(eval_data=eval_data, batch_size=FLAGS.batch_size) - if FLAGS.use_fp16: - images = tf.cast(images, tf.float16) - labels = tf.cast(labels, tf.float16) - return images, labels - - -def inference(images): - """Build the CIFAR-10 model. - - Args: - images: Images returned from distorted_inputs() or inputs(). - - Returns: - Logits. - """ - # We instantiate all variables using tf.get_variable() instead of - # tf.Variable() in order to share variables across multiple GPU training runs. - # If we only ran this model on a single GPU, we could simplify this function - # by replacing all instances of tf.get_variable() with tf.Variable(). - # - # conv1 - with tf.variable_scope('conv1') as scope: - kernel = _variable_with_weight_decay('weights', - shape=[5, 5, 3, 64], - stddev=5e-2, - wd=None) - conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') - biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0)) - pre_activation = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(pre_activation, name=scope.name) - _activation_summary(conv1) - - # pool1 - pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], - padding='SAME', name='pool1') - # norm1 - norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, - name='norm1') - - # conv2 - with tf.variable_scope('conv2') as scope: - kernel = _variable_with_weight_decay('weights', - shape=[5, 5, 64, 64], - stddev=5e-2, - wd=None) - conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME') - biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1)) - pre_activation = tf.nn.bias_add(conv, biases) - conv2 = tf.nn.relu(pre_activation, name=scope.name) - _activation_summary(conv2) - - # norm2 - norm2 = 
tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, - name='norm2') - # pool2 - pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], - strides=[1, 2, 2, 1], padding='SAME', name='pool2') - - # local3 - with tf.variable_scope('local3') as scope: - # Move everything into depth so we can perform a single matrix multiply. - reshape = tf.keras.layers.Flatten()(pool2) - dim = reshape.get_shape()[1].value - weights = _variable_with_weight_decay('weights', shape=[dim, 384], - stddev=0.04, wd=0.004) - biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1)) - local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name) - _activation_summary(local3) - - # local4 - with tf.variable_scope('local4') as scope: - weights = _variable_with_weight_decay('weights', shape=[384, 192], - stddev=0.04, wd=0.004) - biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) - local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name) - _activation_summary(local4) - - # linear layer(WX + b), - # We don't apply softmax here because - # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits - # and performs the softmax internally for efficiency. - with tf.variable_scope('softmax_linear') as scope: - weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES], - stddev=1/192.0, wd=None) - biases = _variable_on_cpu('biases', [NUM_CLASSES], - tf.constant_initializer(0.0)) - softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name) - _activation_summary(softmax_linear) - - return softmax_linear - - -def loss(logits, labels): - """Add L2Loss to all the trainable variables. - - Add summary for "Loss" and "Loss/avg". - Args: - logits: Logits from inference(). - labels: Labels from distorted_inputs or inputs(). 1-D tensor - of shape [batch_size] - - Returns: - Loss tensor of type float. - """ - # Calculate the average cross entropy loss across the batch. 
- labels = tf.cast(labels, tf.int64) - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=labels, logits=logits, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - - # The total loss is defined as the cross entropy loss plus all of the weight - # decay terms (L2 loss). - return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def _add_loss_summaries(total_loss): - """Add summaries for losses in CIFAR-10 model. - - Generates moving average for all losses and associated summaries for - visualizing the performance of the network. - - Args: - total_loss: Total loss from loss(). - Returns: - loss_averages_op: op for generating moving averages of losses. - """ - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - losses = tf.get_collection('losses') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.summary.scalar(l.op.name + ' (raw)', l) - tf.summary.scalar(l.op.name, loss_averages.average(l)) - - return loss_averages_op - - -def train(total_loss, global_step): - """Train CIFAR-10 model. - - Create an optimizer and apply to all trainable variables. Add moving - average for all trainable variables. - - Args: - total_loss: Total loss from loss(). - global_step: Integer Variable counting the number of training steps - processed. - Returns: - train_op: op for training. - """ - # Variables that affect learning rate. 
- num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - tf.summary.scalar('learning_rate', lr) - - # Generate moving averages of all losses and associated summaries. - loss_averages_op = _add_loss_summaries(total_loss) - - # Compute gradients. - with tf.control_dependencies([loss_averages_op]): - opt = tf.train.GradientDescentOptimizer(lr) - grads = opt.compute_gradients(total_loss) - - # Apply gradients. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Add histograms for trainable variables. - for var in tf.trainable_variables(): - tf.summary.histogram(var.op.name, var) - - # Add histograms for gradients. - for grad, var in grads: - if grad is not None: - tf.summary.histogram(var.op.name + '/gradients', grad) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage( - MOVING_AVERAGE_DECAY, global_step) - with tf.control_dependencies([apply_gradient_op]): - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - return variables_averages_op diff --git a/tutorials/image/cifar10/cifar10_eval.py b/tutorials/image/cifar10/cifar10_eval.py deleted file mode 100644 index fab39d1caf8..00000000000 --- a/tutorials/image/cifar10/cifar10_eval.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Evaluation for CIFAR-10. - -Accuracy: -cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs -of data) as judged by cifar10_eval.py. - -Speed: -On a single Tesla K40, cifar10_train.py processes a single batch of 128 images -in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86% -accuracy after 100K steps in 8 hours of training time. - -Usage: -Please see the tutorial and website for how to download the CIFAR-10 -data set, compile the program and train the model. - -http://tensorflow.org/tutorials/deep_cnn/ -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from datetime import datetime -import math -import time - -import numpy as np -import tensorflow as tf - -import cifar10 - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval', - """Directory where to write event logs.""") -tf.app.flags.DEFINE_string('eval_data', 'test', - """Either 'test' or 'train_eval'.""") -tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train', - """Directory where to read model checkpoints.""") -tf.app.flags.DEFINE_integer('eval_interval_secs', 5, - """How often to run the eval.""") -tf.app.flags.DEFINE_integer('num_examples', 1000, - """Number of examples to run.""") -tf.app.flags.DEFINE_boolean('run_once', False, - """Whether to run eval only once.""") - - -def eval_once(saver, summary_writer, top_k_op, summary_op): - """Run Eval once. 
- - Args: - saver: Saver. - summary_writer: Summary writer. - top_k_op: Top K op. - summary_op: Summary op. - """ - with tf.Session() as sess: - ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) - if ckpt and ckpt.model_checkpoint_path: - # Restores from checkpoint - saver.restore(sess, ckpt.model_checkpoint_path) - # Assuming model_checkpoint_path looks something like: - # /my-favorite-path/cifar10_train/model.ckpt-0, - # extract global_step from it. - global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] - else: - print('No checkpoint file found') - return - - # Start the queue runners. - coord = tf.train.Coordinator() - try: - threads = [] - for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): - threads.extend(qr.create_threads(sess, coord=coord, daemon=True, - start=True)) - - num_iter = int(math.ceil(float(FLAGS.num_examples) / FLAGS.batch_size)) - true_count = 0 # Counts the number of correct predictions. - total_sample_count = num_iter * FLAGS.batch_size - step = 0 - while step < num_iter and not coord.should_stop(): - predictions = sess.run([top_k_op]) - true_count += np.sum(predictions) - step += 1 - - # Compute precision @ 1. - precision = true_count / total_sample_count - print('%s: precision @ 1 = %.3f' % (datetime.now(), precision)) - - summary = tf.Summary() - summary.ParseFromString(sess.run(summary_op)) - summary.value.add(tag='Precision @ 1', simple_value=precision) - summary_writer.add_summary(summary, global_step) - except Exception as e: # pylint: disable=broad-except - coord.request_stop(e) - - coord.request_stop() - coord.join(threads, stop_grace_period_secs=10) - - -def evaluate(): - """Eval CIFAR-10 for a number of steps.""" - with tf.Graph().as_default() as g: - # Get images and labels for CIFAR-10. - images, labels = cifar10.inputs(eval_data=FLAGS.eval_data) - - # Build a Graph that computes the logits predictions from the - # inference model. 
- logits = cifar10.inference(images) - - logits = tf.cast(logits, "float32") - labels = tf.cast(labels, "int32") - - # Calculate predictions. - top_k_op = tf.nn.in_top_k(logits, labels, 1) - - # Restore the moving average version of the learned variables for eval. - variable_averages = tf.train.ExponentialMovingAverage( - cifar10.MOVING_AVERAGE_DECAY) - variables_to_restore = variable_averages.variables_to_restore() - saver = tf.train.Saver(variables_to_restore) - - # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.summary.merge_all() - - summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) - - while True: - eval_once(saver, summary_writer, top_k_op, summary_op) - if FLAGS.run_once: - break - time.sleep(FLAGS.eval_interval_secs) - - -def main(argv=None): # pylint: disable=unused-argument - if tf.gfile.Exists(FLAGS.eval_dir): - tf.gfile.DeleteRecursively(FLAGS.eval_dir) - tf.gfile.MakeDirs(FLAGS.eval_dir) - evaluate() - - -if __name__ == '__main__': - tf.app.run() diff --git a/tutorials/image/cifar10/cifar10_input.py b/tutorials/image/cifar10/cifar10_input.py deleted file mode 100644 index 82d460e2133..00000000000 --- a/tutorials/image/cifar10/cifar10_input.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Routine for decoding the CIFAR-10 binary file format.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -import tensorflow_datasets as tfds - -# Process images of this size. Note that this differs from the original CIFAR -# image size of 32 x 32. If one alters this number, then the entire model -# architecture will change and any model would need to be retrained. -IMAGE_SIZE = 24 - -# Global constants describing the CIFAR-10 data set. -NUM_CLASSES = 10 -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 -NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 - - -def _get_images_labels(batch_size, split, distords=False): - """Returns Dataset for given split.""" - dataset = tfds.load(name='cifar10', split=split) - scope = 'data_augmentation' if distords else 'input' - with tf.name_scope(scope): - dataset = dataset.map(DataPreprocessor(distords), num_parallel_calls=10) - # Dataset is small enough to be fully loaded on memory: - dataset = dataset.prefetch(-1) - dataset = dataset.repeat().batch(batch_size) - iterator = dataset.make_one_shot_iterator() - images_labels = iterator.get_next() - images, labels = images_labels['input'], images_labels['target'] - tf.summary.image('images', images) - return images, labels - - -class DataPreprocessor(object): - """Applies transformations to dataset record.""" - - def __init__(self, distords): - self._distords = distords - - def __call__(self, record): - """Process img for training or eval.""" - img = record['image'] - img = tf.cast(img, tf.float32) - if self._distords: # training - # Randomly crop a [height, width] section of the image. - img = tf.random_crop(img, [IMAGE_SIZE, IMAGE_SIZE, 3]) - # Randomly flip the image horizontally. 
- img = tf.image.random_flip_left_right(img) - # Because these operations are not commutative, consider randomizing - # the order their operation. - # NOTE: since per_image_standardization zeros the mean and makes - # the stddev unit, this likely has no effect see tensorflow#1458. - img = tf.image.random_brightness(img, max_delta=63) - img = tf.image.random_contrast(img, lower=0.2, upper=1.8) - else: # Image processing for evaluation. - # Crop the central [height, width] of the image. - img = tf.image.resize_image_with_crop_or_pad(img, IMAGE_SIZE, IMAGE_SIZE) - # Subtract off the mean and divide by the variance of the pixels. - img = tf.image.per_image_standardization(img) - return dict(input=img, target=record['label']) - - -def distorted_inputs(batch_size): - """Construct distorted input for CIFAR training using the Reader ops. - - Args: - batch_size: Number of images per batch. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - """ - return _get_images_labels(batch_size, tfds.Split.TRAIN, distords=True) - - -def inputs(eval_data, batch_size): - """Construct input for CIFAR evaluation using the Reader ops. - - Args: - eval_data: bool, indicating if one should use the train or eval data set. - batch_size: Number of images per batch. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - """ - split = tfds.Split.TEST if eval_data == 'test' else tfds.Split.TRAIN - return _get_images_labels(batch_size, split) diff --git a/tutorials/image/cifar10/cifar10_input_test.py b/tutorials/image/cifar10/cifar10_input_test.py deleted file mode 100644 index dbae1cab411..00000000000 --- a/tutorials/image/cifar10/cifar10_input_test.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Tests for cifar10 input.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import tensorflow as tf - -import cifar10_input - - -class CIFAR10InputTest(tf.test.TestCase): - - def _record(self, label, red, green, blue): - image_size = 32 * 32 - record = bytes(bytearray([label] + [red] * image_size + - [green] * image_size + [blue] * image_size)) - expected = [[[red, green, blue]] * 32] * 32 - return record, expected - - def testSimple(self): - labels = [9, 3, 0] - records = [self._record(labels[0], 0, 128, 255), - self._record(labels[1], 255, 0, 1), - self._record(labels[2], 254, 255, 0)] - contents = b"".join([record for record, _ in records]) - expected = [expected for _, expected in records] - filename = os.path.join(self.get_temp_dir(), "cifar") - open(filename, "wb").write(contents) - - with self.test_session() as sess: - q = tf.FIFOQueue(99, [tf.string], shapes=()) - q.enqueue([filename]).run() - q.close().run() - result = cifar10_input.read_cifar10(q) - - for i in range(3): - key, label, uint8image = sess.run([ - result.key, result.label, result.uint8image]) - self.assertEqual("%s:%d" % (filename, i), tf.compat.as_text(key)) - self.assertEqual(labels[i], label) - self.assertAllEqual(expected[i], uint8image) - - with 
self.assertRaises(tf.errors.OutOfRangeError): - sess.run([result.key, result.uint8image]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py deleted file mode 100644 index 8cb8a096f84..00000000000 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""A binary to train CIFAR-10 using multiple GPUs with synchronous updates. - -Accuracy: -cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256 -epochs of data) as judged by cifar10_eval.py. - -Speed: With batch_size 128. - -System | Step Time (sec/batch) | Accuracy --------------------------------------------------------------------- -1 Tesla K20m | 0.35-0.60 | ~86% at 60K steps (5 hours) -1 Tesla K40m | 0.25-0.35 | ~86% at 100K steps (4 hours) -2 Tesla K20m | 0.13-0.20 | ~84% at 30K steps (2.5 hours) -3 Tesla K20m | 0.13-0.18 | ~84% at 30K steps -4 Tesla K20m | ~0.10 | ~84% at 30K steps - -Usage: -Please see the tutorial and website for how to download the CIFAR-10 -data set, compile the program and train the model. 
-
-http://tensorflow.org/tutorials/deep_cnn/
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-import re
-import time
-from datetime import datetime
-
-import numpy as np
-import tensorflow as tf
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-import cifar10
-
-FLAGS = tf.app.flags.FLAGS
-
-tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
-                           """Directory where to write event logs """
-                           """and checkpoint.""")
-tf.app.flags.DEFINE_integer('max_steps', 1000000,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('num_gpus', 1,
-                            """How many GPUs to use.""")
-tf.app.flags.DEFINE_boolean('log_device_placement', False,
-                            """Whether to log device placement.""")
-
-
-def tower_loss(scope, images, labels):
-  """Calculate the total loss on a single tower running the CIFAR model.
-
-  Args:
-    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
-    images: Images. 4D tensor of shape [batch_size, height, width, 3].
-    labels: Labels. 1D tensor of shape [batch_size].
-
-  Returns:
-     Tensor of shape [] containing the total loss for a batch of data
-  """
-
-  # Build inference Graph.
-  logits = cifar10.inference(images)
-
-  # Build the portion of the Graph calculating the losses. The return value is
-  # discarded on purpose: the total loss is re-assembled below from the
-  # 'losses' collection, restricted to this tower's scope.
-  _ = cifar10.loss(logits, labels)
-
-  # Assemble all of the losses for the current tower only.
-  losses = tf.get_collection('losses', scope)
-
-  # Calculate the total loss for the current tower.
-  total_loss = tf.add_n(losses, name='total_loss')
-
-  # Attach a scalar summary to every individual loss and to the total loss.
-  for l in losses + [total_loss]:
-    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
-    # session. This helps the clarity of presentation on tensorboard.
-    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
-    tf.summary.scalar(loss_name, l)
-
-  return total_loss
-
-
-def average_gradients(tower_grads):
-  """Calculate the average gradient for each shared variable across all towers.
-
-  Note that this function provides a synchronization point across all towers.
-
-  Args:
-    tower_grads: List of lists of (gradient, variable) tuples. The outer list
-      is over towers. The inner list holds the (gradient, variable) pairs
-      computed on that tower; every tower must list its variables in the same
-      order, because the pairs are matched up positionally.
-  Returns:
-     List of pairs of (gradient, variable) where the gradient has been averaged
-     across all towers. The gradient is None for a variable that received no
-     gradient on any tower.
-  """
-  average_grads = []
-  for grad_and_vars in zip(*tower_grads):
-    # Note that each grad_and_vars looks like the following:
-    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
-
-    # Keep in mind that the Variables are redundant because they are shared
-    # across towers. So .. we will just return the first tower's pointer to
-    # the Variable.
-    v = grad_and_vars[0][1]
-
-    # Add a leading 'tower' dimension to each gradient so they can be averaged
-    # over it below. A gradient may be None (the optimizer reports None for a
-    # variable the loss does not depend on); those are skipped instead of
-    # being passed to tf.expand_dims, which would fail.
-    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
-    if not grads:
-      # No tower produced a gradient for this variable. Pass the None through;
-      # the caller already checks for it before building histograms.
-      average_grads.append((None, v))
-      continue
-
-    # Average over the 'tower' dimension.
-    grad = tf.concat(axis=0, values=grads)
-    grad = tf.reduce_mean(grad, 0)
-
-    average_grads.append((grad, v))
-  return average_grads
-
-
-def train():
-  """Train CIFAR-10 for a number of steps.
-
-  Builds one model tower per GPU (FLAGS.num_gpus) with shared variables,
-  averages the tower gradients, and runs FLAGS.max_steps training steps,
-  writing summaries and checkpoints to FLAGS.train_dir.
-
-  NOTE(review): FLAGS.batch_size is read here but not defined in this file;
-  presumably it is registered by the cifar10 module - confirm.
-  """
-  with tf.Graph().as_default(), tf.device('/cpu:0'):
-    # Create a variable to count the number of train() calls. This equals the
-    # number of batches processed * FLAGS.num_gpus.
-    global_step = tf.get_variable(
-        'global_step', [],
-        initializer=tf.constant_initializer(0), trainable=False)
-
-    # Calculate the learning rate schedule.
-    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
-                             FLAGS.batch_size / FLAGS.num_gpus)
-    decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
-
-    # Decay the learning rate exponentially based on the number of steps.
-    lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
-                                    global_step,
-                                    decay_steps,
-                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
-                                    staircase=True)
-
-    # Create an optimizer that performs gradient descent.
-    opt = tf.train.GradientDescentOptimizer(lr)
-
-    # Get images and labels for CIFAR-10.
-    images, labels = cifar10.distorted_inputs()
-    images = tf.reshape(images, [cifar10.FLAGS.batch_size, 24, 24, 3])
-    labels = tf.reshape(labels, [cifar10.FLAGS.batch_size])
-    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
-        [images, labels], capacity=2 * FLAGS.num_gpus)
-    # Calculate the gradients for each model tower.
-    tower_grads = []
-    with tf.variable_scope(tf.get_variable_scope()):
-      for i in xrange(FLAGS.num_gpus):
-        with tf.device('/gpu:%d' % i):
-          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
-            # Dequeues one batch for the GPU
-            image_batch, label_batch = batch_queue.dequeue()
-            # Calculate the loss for one tower of the CIFAR model. This function
-            # constructs the entire CIFAR model but shares the variables across
-            # all towers.
-            loss = tower_loss(scope, image_batch, label_batch)
-
-            # Reuse variables for the next tower.
-            tf.get_variable_scope().reuse_variables()
-
-            # Retain the summaries from the final tower.
-            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
-
-            # Calculate the gradients for the batch of data on this CIFAR tower.
-            grads = opt.compute_gradients(loss)
-
-            # Keep track of the gradients across all towers.
-            tower_grads.append(grads)
-
-    # We must calculate the mean of each gradient. Note that this is the
-    # synchronization point across all towers.
-    grads = average_gradients(tower_grads)
-
-    # Add a summary to track the learning rate.
-    summaries.append(tf.summary.scalar('learning_rate', lr))
-
-    # Add histograms for gradients.
-    for grad, var in grads:
-      if grad is not None:
-        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))
-
-    # Apply the gradients to adjust the shared variables.
-    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
-
-    # Add histograms for trainable variables.
-    for var in tf.trainable_variables():
-      summaries.append(tf.summary.histogram(var.op.name, var))
-
-    # Track the moving averages of all trainable variables.
-    variable_averages = tf.train.ExponentialMovingAverage(
-        cifar10.MOVING_AVERAGE_DECAY, global_step)
-    variables_averages_op = variable_averages.apply(tf.trainable_variables())
-
-    # Group all updates to into a single train op.
-    train_op = tf.group(apply_gradient_op, variables_averages_op)
-
-    # Create a saver.
-    saver = tf.train.Saver(tf.global_variables())
-
-    # Build the summary operation from the last tower summaries.
-    summary_op = tf.summary.merge(summaries)
-
-    # Build an initialization operation to run below.
-    init = tf.global_variables_initializer()
-
-    # Start running operations on the Graph. allow_soft_placement must be set to
-    # True to build towers on GPU, as some of the ops do not have GPU
-    # implementations.
-    sess = tf.Session(config=tf.ConfigProto(
-        allow_soft_placement=True,
-        log_device_placement=FLAGS.log_device_placement))
-    sess.run(init)
-
-    # Start the queue runners.
-    tf.train.start_queue_runners(sess=sess)
-
-    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
-
-    try:
-      for step in xrange(FLAGS.max_steps):
-        start_time = time.time()
-        # Note: `loss` is the loss tensor of the last tower built above.
-        _, loss_value = sess.run([train_op, loss])
-        duration = time.time() - start_time
-
-        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
-
-        if step % 10 == 0:
-          num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
-          examples_per_sec = num_examples_per_step / duration
-          sec_per_batch = duration / FLAGS.num_gpus
-
-          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
-                        'sec/batch)')
-          print(format_str % (datetime.now(), step, loss_value,
-                              examples_per_sec, sec_per_batch))
-
-        if step % 100 == 0:
-          summary_str = sess.run(summary_op)
-          summary_writer.add_summary(summary_str, step)
-
-        # Save the model checkpoint periodically.
-        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
-          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
-          saver.save(sess, checkpoint_path, global_step=step)
-    finally:
-      # Flush and close the event file so summaries that are still buffered
-      # are not lost when training ends or fails (e.g. on the NaN assertion).
-      summary_writer.close()
-
-
-def main(argv=None):  # pylint: disable=unused-argument
-  """Recreate FLAGS.train_dir from scratch, then run training."""
-  if tf.gfile.Exists(FLAGS.train_dir):
-    tf.gfile.DeleteRecursively(FLAGS.train_dir)
-  tf.gfile.MakeDirs(FLAGS.train_dir)
-  train()
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tutorials/image/cifar10/cifar10_train.py b/tutorials/image/cifar10/cifar10_train.py
deleted file mode 100644
index 4b4d967bc0e..00000000000
--- a/tutorials/image/cifar10/cifar10_train.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A binary to train CIFAR-10 using a single GPU.
-
-Accuracy:
-cifar10_train.py achieves ~86% accuracy after 100K steps (256 epochs of
-data) as judged by cifar10_eval.py.
-
-Speed: With batch_size 128.
-
-System        | Step Time (sec/batch)  |     Accuracy
-------------------------------------------------------------------
-1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
-1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
-
-Usage:
-Please see the tutorial and website for how to download the CIFAR-10
-data set, compile the program and train the model.
-
-http://tensorflow.org/tutorials/deep_cnn/
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from datetime import datetime
-import time
-
-import tensorflow as tf
-
-import cifar10
-
-FLAGS = tf.app.flags.FLAGS
-
-tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
-                           """Directory where to write event logs """
-                           """and checkpoint.""")
-tf.app.flags.DEFINE_integer('max_steps', 100000,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_boolean('log_device_placement', False,
-                            """Whether to log device placement.""")
-tf.app.flags.DEFINE_integer('log_frequency', 10,
-                            """How often to log results to the console.""")
-
-
-def train():
-  """Train CIFAR-10 for a number of steps.
-
-  Builds the input pipeline, model, loss and train op from the cifar10 module
-  and runs them in a MonitoredTrainingSession until FLAGS.max_steps is reached.
-  Checkpoints go to FLAGS.train_dir; the loss is logged to the console every
-  FLAGS.log_frequency steps.
-  """
-  with tf.Graph().as_default():
-    global_step = tf.train.get_or_create_global_step()
-
-    # Get images and labels for CIFAR-10.
-    # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
-    # GPU and resulting in a slow down.
-    with tf.device('/cpu:0'):
-      images, labels = cifar10.distorted_inputs()
-
-    # Build a Graph that computes the logits predictions from the
-    # inference model.
-    logits = cifar10.inference(images)
-
-    # Calculate loss.
-    loss = cifar10.loss(logits, labels)
-
-    # Build a Graph that trains the model with one batch of examples and
-    # updates the model parameters.
-    train_op = cifar10.train(loss, global_step)
-
-    class _LoggerHook(tf.train.SessionRunHook):
-      """Logs loss and runtime."""
-
-      def begin(self):
-        self._step = -1
-        # Step of the previous log line; -1 means nothing has been logged yet.
-        self._last_logged_step = -1
-        self._start_time = time.time()
-
-      def before_run(self, run_context):
-        self._step += 1
-        return tf.train.SessionRunArgs(loss)  # Asks for loss value.
-
-      def after_run(self, run_context, run_values):
-        if self._step % FLAGS.log_frequency == 0:
-          current_time = time.time()
-          duration = current_time - self._start_time
-          self._start_time = current_time
-
-          # Number of batches actually run since the previous log line. This
-          # is FLAGS.log_frequency except for the very first log (step 0),
-          # where only a single batch has been processed.
-          steps_elapsed = self._step - self._last_logged_step
-          self._last_logged_step = self._step
-
-          loss_value = run_values.results
-          examples_per_sec = steps_elapsed * FLAGS.batch_size / duration
-          sec_per_batch = float(duration / steps_elapsed)
-
-          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
-                        'sec/batch)')
-          print(format_str % (datetime.now(), self._step, loss_value,
-                              examples_per_sec, sec_per_batch))
-
-    with tf.train.MonitoredTrainingSession(
-        checkpoint_dir=FLAGS.train_dir,
-        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
-               tf.train.NanTensorHook(loss),
-               _LoggerHook()],
-        config=tf.ConfigProto(
-            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
-      while not mon_sess.should_stop():
-        mon_sess.run(train_op)
-
-
-def main(argv=None):  # pylint: disable=unused-argument
-  """Recreate FLAGS.train_dir from scratch, then run training."""
-  if tf.gfile.Exists(FLAGS.train_dir):
-    tf.gfile.DeleteRecursively(FLAGS.train_dir)
-  tf.gfile.MakeDirs(FLAGS.train_dir)
-  train()
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tutorials/image/cifar10_estimator/README.md
b/tutorials/image/cifar10_estimator/README.md deleted file mode 100644 index 5627e9b9f59..00000000000 --- a/tutorials/image/cifar10_estimator/README.md +++ /dev/null @@ -1,523 +0,0 @@ -CIFAR-10 is a common benchmark in machine learning for image recognition. - -http://www.cs.toronto.edu/~kriz/cifar.html - -Code in this directory focuses on how to use TensorFlow Estimators to train and -evaluate a CIFAR-10 ResNet model on: - -* A single host with one CPU; -* A single host with multiple GPUs; -* Multiple hosts with CPU or multiple GPUs; - -Before trying to run the model we highly encourage you to read the entire README. - -## Prerequisite - -1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.9.0 or -later. - -2. Download the CIFAR-10 dataset and generate TFRecord files using the provided -script. The script and associated command below will download the CIFAR-10 -dataset and then generate a TFRecord for the training, validation, and -evaluation datasets. - -```shell -python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data -``` - -After running the command above, you should see the following files in the ---data-dir (```ls -R cifar-10-data```): - -* train.tfrecords -* validation.tfrecords -* eval.tfrecords - - -## Training on a single machine with GPUs or CPU - -Run the training on CPU only. After training, it runs the evaluation. - -``` -python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ - --job-dir=/tmp/cifar10 \ - --num-gpus=0 \ - --train-steps=1000 -``` - -Run the model on 2 GPUs using CPU as parameter server. After training, it runs -the evaluation. -``` -python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ - --job-dir=/tmp/cifar10 \ - --num-gpus=2 \ - --train-steps=1000 -``` - -Run the model on 2 GPUs using GPU as parameter server. -It will run an experiment, which in a local setting basically means it will -stop training a couple of times to perform evaluation.
- -``` -python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ - --job-dir=/tmp/cifar10 \ - --variable-strategy GPU \ - --num-gpus=2 -``` - -There are more command line flags to play with; run -`python cifar10_main.py --help` for details. - -## Run distributed training - -### (Optional) Running on Google Cloud Machine Learning Engine - -This example can be run on Google Cloud Machine Learning Engine (ML Engine), -which will configure the environment and take care of running workers, -parameter servers, and masters in a fault tolerant way. - -To install the command line tool, and set up a project and billing, see the -quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line). - -You'll also need a Google Cloud Storage bucket for the data. If you followed the -instructions above, you can just run: - -``` -MY_BUCKET=gs://<your-bucket-name> -gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/ -``` - -Then run the following command from the `tutorials/image` directory of this -repository (the parent directory of this README): - -``` -gcloud ml-engine jobs submit training cifarmultigpu \ - --runtime-version 1.2 \ - --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \ - --config cifar10_estimator/cmle_config.yaml \ - --package-path cifar10_estimator/ \ - --module-name cifar10_estimator.cifar10_main \ - -- \ - --data-dir=$MY_BUCKET/cifar-10-data \ - --num-gpus=4 \ - --train-steps=1000 -``` - - -### Set TF_CONFIG - -Considering that you already have multiple hosts configured, all you need is a -`TF_CONFIG` environment variable on each host. You can set up the hosts manually -or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for -instructions about how to set up a Cluster. - -The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and -their task: `master`, `ps` or `worker`. - -Here's an example of `TF_CONFIG`.
- -```python -cluster = {'master': ['master-ip:8000'], - 'ps': ['ps-ip:8000'], - 'worker': ['worker-ip:8000']} - -TF_CONFIG = json.dumps( - {'cluster': cluster, - 'task': {'type': 'master', 'index': 0}, - 'model_dir': 'gs://<bucket_path>/<dir_path>', - 'environment': 'cloud' - }) -``` - -*Cluster* - -A cluster spec is basically a dictionary that describes all of the tasks -in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed). - -In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker. - -* `ps`: saves the parameters among all workers. All workers can - read/write/update the parameters for model via ps. As some models are - extremely large the parameters are shared among the ps (each ps stores a - subset). - -* `worker`: does the training. - -* `master`: basically a special worker, it does training, but also restores and - saves checkpoints and does evaluation. - -*Task* - -The Task defines the role of the current node. In this example the node -is the master at index 0 in the cluster spec; the task will be different for -each node. An example of the `TF_CONFIG` for a worker would be: - -```python -cluster = {'master': ['master-ip:8000'], - 'ps': ['ps-ip:8000'], - 'worker': ['worker-ip:8000']} - -TF_CONFIG = json.dumps( - {'cluster': cluster, - 'task': {'type': 'worker', 'index': 0}, - 'model_dir': 'gs://<bucket_path>/<dir_path>', - 'environment': 'cloud' - }) -``` - -*Model_dir* - -This is the path where the master will save the checkpoints, graph and -TensorBoard files. For a multi host environment you may want to use a -Distributed File System, Google Storage and DFS are supported. - -*Environment* - -By default the environment is *local*; for a distributed setting we need to -change it to *cloud*. - -### Running script - -Once you have a `TF_CONFIG` configured properly on each host you're ready to run -on distributed settings.
- -#### Master -Run this on master: -Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for -40000 steps. It will run evaluation a couple of times during training. The -num_workers argument is used only to update the learning rate correctly. Make -sure the model_dir is the same as defined on the TF_CONFIG. - -```shell -python cifar10_main.py --data-dir=gs://path/cifar-10-data \ - --job-dir=gs://path/model_dir/ \ - --num-gpus=4 \ - --train-steps=40000 \ - --sync \ - --num-workers=2 -``` - -*Output:* - -```shell -INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ -INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 -gpu_options { -} -allow_soft_placement: true -, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { - per_process_gpu_memory_fraction: 1.0 -} -, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} -... -2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: -name: Tesla K80 -major: 3 minor: 7 memoryClockRate (GHz) 0.8235 -pciBusID 0000:00:04.0 -Total memory: 11.17GiB -Free memory: 11.09GiB -2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: -name: Tesla K80 -major: 3 minor: 7 memoryClockRate (GHz) 0.8235 -pciBusID 0000:00:05.0 -Total memory: 11.17GiB -Free memory: 11.10GiB -...
-2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 
8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) -INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) -INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1 -INFO:tensorflow:Create CheckpointSaverHook. -INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0 -2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config: -intra_op_parallelism_threads: 1 -gpu_options { - per_process_gpu_memory_fraction: 1 -} -allow_soft_placement: true - -INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt. 
-INFO:tensorflow:loss = 1.20682, step = 1 -INFO:tensorflow:loss = 1.20682, learning_rate = 0.1 -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) -INFO:tensorflow:image after unit 
resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) -INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) -INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 -INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14 -2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0) -2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0) -2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0) -2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0) -2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0) -2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0) -2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0) -2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0) -INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023 -INFO:tensorflow:Evaluation [1/100] 
-INFO:tensorflow:Evaluation [2/100] -INFO:tensorflow:Evaluation [3/100] -INFO:tensorflow:Evaluation [4/100] -INFO:tensorflow:Evaluation [5/100] -INFO:tensorflow:Evaluation [6/100] -INFO:tensorflow:Evaluation [7/100] -INFO:tensorflow:Evaluation [8/100] -INFO:tensorflow:Evaluation [9/100] -INFO:tensorflow:Evaluation [10/100] -INFO:tensorflow:Evaluation [11/100] -INFO:tensorflow:Evaluation [12/100] -INFO:tensorflow:Evaluation [13/100] -... -INFO:tensorflow:Evaluation [100/100] -INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31 -INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425 -``` - -#### Worker - -Run this on worker: -Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for -40000 steps. It will run evaluation a couple of times during training. Make sure -the model_dir is the same as defined on the TF_CONFIG. - -```shell -python cifar10_main.py --data-dir=gs://path/cifar-10-data \ - --job-dir=gs://path/model_dir/ \ - --num-gpus=4 \ - --train-steps=40000 \ - --sync -``` - -*Output:* - -```shell -INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ -INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, -'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker', -'_is_chief': False, '_cluster_spec': -, -'_model_dir': 'gs:///model_dir/', -'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, -'_session_config': intra_op_parallelism_threads: 1 -gpu_options { -} -allow_soft_placement: true -, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, -'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { - per_process_gpu_memory_fraction: 1.0 - } -... 
-2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: -name: Tesla K80 -major: 3 minor: 7 memoryClockRate (GHz) 0.8235 -pciBusID 0000:00:04.0 -Total memory: 11.17GiB -Free memory: 11.09GiB -2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: -name: Tesla K80 -major: 3 minor: 7 memoryClockRate (GHz) 0.8235 -pciBusID 0000:00:05.0 -Total memory: 11.17GiB -Free memory: 11.10GiB -... -2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) -INFO:tensorflow:image 
after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) -INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) -INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) -INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 -INFO:tensorflow:Create CheckpointSaverHook. 
-2017-07-31 22:38:04.629150: I -tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting -for response from worker: /job:master/replica:0/task:0 -2017-07-31 22:38:09.263492: I -tensorflow/core/distributed_runtime/master_session.cc:999] Start master -session cc58f93b1e259b0c with config: -intra_op_parallelism_threads: 1 -gpu_options { -per_process_gpu_memory_fraction: 1 -} -allow_soft_placement: true -INFO:tensorflow:loss = 5.82382, step = 0 -INFO:tensorflow:loss = 5.82382, learning_rate = 0.8 -INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10 -INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20 -INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30 -INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40 -INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50 -INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60 -INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70 -INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec) -INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec) -INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80 -INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90 -INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100 -INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110 -INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120 -INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130 -INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140 -INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150 -INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160 -INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170 -INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec) -... 
-``` - -#### PS - -Run this on ps: -The ps will not do training so most of the arguments won't affect the execution - -```shell -python cifar10_main.py --job-dir=gs://path/model_dir/ -``` - -*Output:* - -```shell -INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ -INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 -gpu_options { -} -allow_soft_placement: true -, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { - per_process_gpu_memory_fraction: 1.0 -} -, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} -2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000} -2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000} -2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000} -2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 -``` - -## Visualizing results with TensorBoard - -When using Estimators you can also visualize your data in TensorBoard, with no -changes in your code. You can use TensorBoard to visualize your TensorFlow -graph, plot quantitative metrics about the execution of your graph, and show -additional data like images that pass through it. 
- -You'll see something similar to this if you "point" TensorBoard to the -`job dir` parameter you used to train or evaluate your model. - -Check TensorBoard during training or after it. Just point TensorBoard to the -model_dir you chose on the previous step. - -```shell -tensorboard --log-dir="" -``` - -## Warnings - -When runninng `cifar10_main.py` with `--sync` argument you may see an error -similar to: - -```python -File "cifar10_main.py", line 538, in - tf.app.run() -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run - _sys.exit(main(_sys.argv[:1] + flags_passthrough)) -File "cifar10_main.py", line 518, in main - hooks), run_config=config) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run - return _execute_schedule(experiment, schedule) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule - return task() -File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate - hooks=self._eval_hooks) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate - hooks=hooks) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate - name=name) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model - features, labels, model_fn_lib.ModeKeys.EVAL) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn - features=features, labels=labels, **kwargs) -File "cifar10_main.py", line 331, in _resnet_model_fn - gradvars, global_step=tf.train.get_global_step()) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in 
apply_gradients - variables.global_variables()) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped - return _add_should_use_warning(fn(*args, **kwargs)) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning - wrapped = TFShouldUseWarningWrapper(x) -File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__ - stack = [s.strip() for s in traceback.format_stack()] -``` - -This should not affect your training, and should be fixed on the next releases. diff --git a/tutorials/image/cifar10_estimator/__init__.py b/tutorials/image/cifar10_estimator/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tutorials/image/cifar10_estimator/cifar10.py b/tutorials/image/cifar10_estimator/cifar10.py deleted file mode 100644 index 5e1a70895ad..00000000000 --- a/tutorials/image/cifar10_estimator/cifar10.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""CIFAR-10 data set. - -See http://www.cs.toronto.edu/~kriz/cifar.html. -""" -import os - -import tensorflow as tf - -HEIGHT = 32 -WIDTH = 32 -DEPTH = 3 - - -class Cifar10DataSet(object): - """Cifar10 data set. 
- - Described by http://www.cs.toronto.edu/~kriz/cifar.html. - """ - - def __init__(self, data_dir, subset='train', use_distortion=True): - self.data_dir = data_dir - self.subset = subset - self.use_distortion = use_distortion - - def get_filenames(self): - if self.subset in ['train', 'validation', 'eval']: - return [os.path.join(self.data_dir, self.subset + '.tfrecords')] - else: - raise ValueError('Invalid data subset "%s"' % self.subset) - - def parser(self, serialized_example): - """Parses a single tf.Example into image and label tensors.""" - # Dimensions of the images in the CIFAR-10 dataset. - # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the - # input format. - features = tf.parse_single_example( - serialized_example, - features={ - 'image': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.int64), - }) - image = tf.decode_raw(features['image'], tf.uint8) - image.set_shape([DEPTH * HEIGHT * WIDTH]) - - # Reshape from [depth * height * width] to [depth, height, width]. - image = tf.cast( - tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), - tf.float32) - label = tf.cast(features['label'], tf.int32) - - # Custom preprocessing. - image = self.preprocess(image) - - return image, label - - def make_batch(self, batch_size): - """Read the images and labels from 'filenames'.""" - filenames = self.get_filenames() - # Repeat infinitely. - dataset = tf.data.TFRecordDataset(filenames).repeat() - - # Parse records. - dataset = dataset.map( - self.parser, num_parallel_calls=batch_size) - - # Potentially shuffle records. - if self.subset == 'train': - min_queue_examples = int( - Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) - # Ensure that the capacity is sufficiently large to provide good random - # shuffling. - dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) - - # Batch it up. 
- dataset = dataset.batch(batch_size) - iterator = dataset.make_one_shot_iterator() - image_batch, label_batch = iterator.get_next() - - return image_batch, label_batch - - def preprocess(self, image): - """Preprocess a single image in [height, width, depth] layout.""" - if self.subset == 'train' and self.use_distortion: - # Pad 4 pixels on each dimension of feature map, done in mini-batch - image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) - image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) - image = tf.image.random_flip_left_right(image) - return image - - @staticmethod - def num_examples_per_epoch(subset='train'): - if subset == 'train': - return 45000 - elif subset == 'validation': - return 5000 - elif subset == 'eval': - return 10000 - else: - raise ValueError('Invalid data subset "%s"' % subset) diff --git a/tutorials/image/cifar10_estimator/cifar10_main.py b/tutorials/image/cifar10_estimator/cifar10_main.py deleted file mode 100644 index 51da6b94fa2..00000000000 --- a/tutorials/image/cifar10_estimator/cifar10_main.py +++ /dev/null @@ -1,521 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""ResNet model for classifying images from CIFAR-10 dataset. - -Support single-host training with one or multiple devices. 
- -ResNet as proposed in: -Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun -Deep Residual Learning for Image Recognition. arXiv:1512.03385 - -CIFAR-10 as in: -http://www.cs.toronto.edu/~kriz/cifar.html - - -""" -from __future__ import division -from __future__ import print_function - -import argparse -import functools -import itertools -import os - -import cifar10 -import cifar10_model -import cifar10_utils -import numpy as np -import six -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -tf.logging.set_verbosity(tf.logging.INFO) - - -def get_model_fn(num_gpus, variable_strategy, num_workers): - """Returns a function that will build the resnet model.""" - - def _resnet_model_fn(features, labels, mode, params): - """Resnet model body. - - Support single host, one or more GPU training. Parameter distribution can - be either one of the following scheme. - 1. CPU is the parameter server and manages gradient updates. - 2. Parameters are distributed evenly across all GPUs, and the first GPU - manages gradient updates. - - Args: - features: a list of tensors, one for each tower - labels: a list of tensors, one for each tower - mode: ModeKeys.TRAIN or EVAL - params: Hyperparameters suitable for tuning - Returns: - A EstimatorSpec object. - """ - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - weight_decay = params.weight_decay - momentum = params.momentum - - tower_features = features - tower_labels = labels - tower_losses = [] - tower_gradvars = [] - tower_preds = [] - - # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) - # on CPU. The exception is Intel MKL on CPU which is optimal with - # channels_last. 
- data_format = params.data_format - if not data_format: - if num_gpus == 0: - data_format = 'channels_last' - else: - data_format = 'channels_first' - - if num_gpus == 0: - num_devices = 1 - device_type = 'cpu' - else: - num_devices = num_gpus - device_type = 'gpu' - - for i in range(num_devices): - worker_device = '/{}:{}'.format(device_type, i) - if variable_strategy == 'CPU': - device_setter = cifar10_utils.local_device_setter( - worker_device=worker_device) - elif variable_strategy == 'GPU': - device_setter = cifar10_utils.local_device_setter( - ps_device_type='gpu', - worker_device=worker_device, - ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( - num_gpus, tf.contrib.training.byte_size_load_fn)) - with tf.variable_scope('resnet', reuse=bool(i != 0)): - with tf.name_scope('tower_%d' % i) as name_scope: - with tf.device(device_setter): - loss, gradvars, preds = _tower_fn( - is_training, weight_decay, tower_features[i], tower_labels[i], - data_format, params.num_layers, params.batch_norm_decay, - params.batch_norm_epsilon) - tower_losses.append(loss) - tower_gradvars.append(gradvars) - tower_preds.append(preds) - if i == 0: - # Only trigger batch_norm moving mean and variance update from - # the 1st tower. Ideally, we should grab the updates from all - # towers but these stats accumulate extremely fast so we can - # ignore the other stats from the other towers without - # significant detriment. - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, - name_scope) - - # Now compute global loss and gradients. - gradvars = [] - with tf.name_scope('gradient_averaging'): - all_grads = {} - for grad, var in itertools.chain(*tower_gradvars): - if grad is not None: - all_grads.setdefault(var, []).append(grad) - for var, grads in six.iteritems(all_grads): - # Average gradients on the same device as the variables - # to which they apply. 
- with tf.device(var.device): - if len(grads) == 1: - avg_grad = grads[0] - else: - avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads)) - gradvars.append((avg_grad, var)) - - # Device that runs the ops to apply global gradient updates. - consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0' - with tf.device(consolidation_device): - # Suggested learning rate scheduling from - # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 - num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch( - 'train') // (params.train_batch_size * num_workers) - boundaries = [ - num_batches_per_epoch * x - for x in np.array([82, 123, 300], dtype=np.int64) - ] - staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] - - learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(), - boundaries, staged_lr) - - loss = tf.reduce_mean(tower_losses, name='loss') - - examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( - params.train_batch_size, every_n_steps=10) - - tensors_to_log = {'learning_rate': learning_rate, 'loss': loss} - - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=100) - - train_hooks = [logging_hook, examples_sec_hook] - - optimizer = tf.train.MomentumOptimizer( - learning_rate=learning_rate, momentum=momentum) - - if params.sync: - optimizer = tf.train.SyncReplicasOptimizer( - optimizer, replicas_to_aggregate=num_workers) - sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) - train_hooks.append(sync_replicas_hook) - - # Create single grouped train op - train_op = [ - optimizer.apply_gradients( - gradvars, global_step=tf.train.get_global_step()) - ] - train_op.extend(update_ops) - train_op = tf.group(*train_op) - - predictions = { - 'classes': - tf.concat([p['classes'] for p in tower_preds], axis=0), - 'probabilities': - tf.concat([p['probabilities'] for p in tower_preds], axis=0) - } - stacked_labels = 
tf.concat(labels, axis=0) - metrics = { - 'accuracy': - tf.metrics.accuracy(stacked_labels, predictions['classes']) - } - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - training_hooks=train_hooks, - eval_metric_ops=metrics) - - return _resnet_model_fn - - -def _tower_fn(is_training, weight_decay, feature, label, data_format, - num_layers, batch_norm_decay, batch_norm_epsilon): - """Build computation tower (Resnet). - - Args: - is_training: true if is training graph. - weight_decay: weight regularization strength, a float. - feature: a Tensor. - label: a Tensor. - data_format: channels_last (NHWC) or channels_first (NCHW). - num_layers: number of layers, an int. - batch_norm_decay: decay for batch normalization, a float. - batch_norm_epsilon: epsilon for batch normalization, a float. - - Returns: - A tuple with the loss for the tower, the gradients and parameters, and - predictions. - - """ - model = cifar10_model.ResNetCifar10( - num_layers, - batch_norm_decay=batch_norm_decay, - batch_norm_epsilon=batch_norm_epsilon, - is_training=is_training, - data_format=data_format) - logits = model.forward_pass(feature, input_data_format='channels_last') - tower_pred = { - 'classes': tf.argmax(input=logits, axis=1), - 'probabilities': tf.nn.softmax(logits) - } - - tower_loss = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=label) - tower_loss = tf.reduce_mean(tower_loss) - - model_params = tf.trainable_variables() - tower_loss += weight_decay * tf.add_n( - [tf.nn.l2_loss(v) for v in model_params]) - - tower_grad = tf.gradients(tower_loss, model_params) - - return tower_loss, zip(tower_grad, model_params), tower_pred - - -def input_fn(data_dir, - subset, - num_shards, - batch_size, - use_distortion_for_training=True): - """Create input graph for model. - - Args: - data_dir: Directory where TFRecords representing the dataset are located. - subset: one of 'train', 'validate' and 'eval'. 
- num_shards: num of towers participating in data-parallel training. - batch_size: total batch size for training to be divided by the number of - shards. - use_distortion_for_training: True to use distortions. - Returns: - two lists of tensors for features and labels, each of num_shards length. - """ - with tf.device('/cpu:0'): - use_distortion = subset == 'train' and use_distortion_for_training - dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) - image_batch, label_batch = dataset.make_batch(batch_size) - if num_shards <= 1: - # No GPU available or only 1 GPU. - return [image_batch], [label_batch] - - # Note that passing num=batch_size is safe here, even though - # dataset.batch(batch_size) can, in some cases, return fewer than batch_size - # examples. This is because it does so only when repeating for a limited - # number of epochs, but our dataset repeats forever. - image_batch = tf.unstack(image_batch, num=batch_size, axis=0) - label_batch = tf.unstack(label_batch, num=batch_size, axis=0) - feature_shards = [[] for i in range(num_shards)] - label_shards = [[] for i in range(num_shards)] - for i in xrange(batch_size): - idx = i % num_shards - feature_shards[idx].append(image_batch[i]) - label_shards[idx].append(label_batch[i]) - feature_shards = [tf.parallel_stack(x) for x in feature_shards] - label_shards = [tf.parallel_stack(x) for x in label_shards] - return feature_shards, label_shards - - -def get_experiment_fn(data_dir, - num_gpus, - variable_strategy, - use_distortion_for_training=True): - """Returns an Experiment function. - - Experiments perform training on several workers in parallel, - in other words experiments know how to invoke train and eval in a sensible - fashion for distributed training. Arguments passed directly to this - function are not tunable, all other arguments should be passed within - tf.HParams, passed to the enclosed function. - - Args: - data_dir: str. Location of the data for input_fns. - num_gpus: int. 
Number of GPUs on each worker. - variable_strategy: String. CPU to use CPU as the parameter server - and GPU to use the GPUs as the parameter server. - use_distortion_for_training: bool. See cifar10.Cifar10DataSet. - Returns: - A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> - tf.contrib.learn.Experiment. - - Suitable for use by tf.contrib.learn.learn_runner, which will run various - methods on Experiment (train, evaluate) based on information - about the current runner in `run_config`. - """ - - def _experiment_fn(run_config, hparams): - """Returns an Experiment.""" - # Create estimator. - train_input_fn = functools.partial( - input_fn, - data_dir, - subset='train', - num_shards=num_gpus, - batch_size=hparams.train_batch_size, - use_distortion_for_training=use_distortion_for_training) - - eval_input_fn = functools.partial( - input_fn, - data_dir, - subset='eval', - batch_size=hparams.eval_batch_size, - num_shards=num_gpus) - - num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval') - if num_eval_examples % hparams.eval_batch_size != 0: - raise ValueError( - 'validation set size must be multiple of eval_batch_size') - - train_steps = hparams.train_steps - eval_steps = num_eval_examples // hparams.eval_batch_size - - classifier = tf.estimator.Estimator( - model_fn=get_model_fn(num_gpus, variable_strategy, - run_config.num_worker_replicas or 1), - config=run_config, - params=hparams) - - # Create experiment. - return tf.contrib.learn.Experiment( - classifier, - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - train_steps=train_steps, - eval_steps=eval_steps) - - return _experiment_fn - - -def main(job_dir, data_dir, num_gpus, variable_strategy, - use_distortion_for_training, log_device_placement, num_intra_threads, - **hparams): - # The env variable is on deprecation path, default is set to off. - os.environ['TF_SYNC_ON_FINISH'] = '0' - os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' - - # Session configuration. 
- sess_config = tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=log_device_placement, - intra_op_parallelism_threads=num_intra_threads, - gpu_options=tf.GPUOptions(force_gpu_compatible=True)) - - config = cifar10_utils.RunConfig( - session_config=sess_config, model_dir=job_dir) - tf.contrib.learn.learn_runner.run( - get_experiment_fn(data_dir, num_gpus, variable_strategy, - use_distortion_for_training), - run_config=config, - hparams=tf.contrib.training.HParams( - is_chief=config.is_chief, - **hparams)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-dir', - type=str, - required=True, - help='The directory where the CIFAR-10 input data is stored.') - parser.add_argument( - '--job-dir', - type=str, - required=True, - help='The directory where the model will be stored.') - parser.add_argument( - '--variable-strategy', - choices=['CPU', 'GPU'], - type=str, - default='CPU', - help='Where to locate variable operations') - parser.add_argument( - '--num-gpus', - type=int, - default=1, - help='The number of gpus used. Uses only CPU if set to 0.') - parser.add_argument( - '--num-layers', - type=int, - default=44, - help='The number of layers of the model.') - parser.add_argument( - '--train-steps', - type=int, - default=80000, - help='The number of steps to use for training.') - parser.add_argument( - '--train-batch-size', - type=int, - default=128, - help='Batch size for training.') - parser.add_argument( - '--eval-batch-size', - type=int, - default=100, - help='Batch size for validation.') - parser.add_argument( - '--momentum', - type=float, - default=0.9, - help='Momentum for MomentumOptimizer.') - parser.add_argument( - '--weight-decay', - type=float, - default=2e-4, - help='Weight decay for convolutions.') - parser.add_argument( - '--learning-rate', - type=float, - default=0.1, - help="""\ - This is the inital learning rate value. The learning rate will decrease - during training. 
For more details check the model_fn implementation in - this file.\ - """) - parser.add_argument( - '--use-distortion-for-training', - type=bool, - default=True, - help='If doing image distortion for training.') - parser.add_argument( - '--sync', - action='store_true', - default=False, - help="""\ - If present when running in a distributed environment will run on sync mode.\ - """) - parser.add_argument( - '--num-intra-threads', - type=int, - default=0, - help="""\ - Number of threads to use for intra-op parallelism. When training on CPU - set to 0 to have the system pick the appropriate number or alternatively - set it to the number of physical CPU cores.\ - """) - parser.add_argument( - '--num-inter-threads', - type=int, - default=0, - help="""\ - Number of threads to use for inter-op parallelism. If set to 0, the - system will pick an appropriate number.\ - """) - parser.add_argument( - '--data-format', - type=str, - default=None, - help="""\ - If not set, the data format best for the training device is used. - Allowed values: channels_first (NCHW) channels_last (NHWC).\ - """) - parser.add_argument( - '--log-device-placement', - action='store_true', - default=False, - help='Whether to log device placement.') - parser.add_argument( - '--batch-norm-decay', - type=float, - default=0.997, - help='Decay for batch norm.') - parser.add_argument( - '--batch-norm-epsilon', - type=float, - default=1e-5, - help='Epsilon for batch norm.') - args = parser.parse_args() - - if args.num_gpus > 0: - assert tf.test.is_gpu_available(), "Requested GPUs but none found." - if args.num_gpus < 0: - raise ValueError( - 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') - if args.num_gpus == 0 and args.variable_strategy == 'GPU': - raise ValueError('num-gpus=0, CPU must be used as parameter server. 
Set' - '--variable-strategy=CPU.') - if (args.num_layers - 2) % 6 != 0: - raise ValueError('Invalid --num-layers parameter.') - if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: - raise ValueError('--train-batch-size must be multiple of --num-gpus.') - if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: - raise ValueError('--eval-batch-size must be multiple of --num-gpus.') - - main(**vars(args)) diff --git a/tutorials/image/cifar10_estimator/cifar10_model.py b/tutorials/image/cifar10_estimator/cifar10_model.py deleted file mode 100644 index d67c233dbba..00000000000 --- a/tutorials/image/cifar10_estimator/cifar10_model.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Model class for Cifar10 Dataset.""" -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -import model_base - - -class ResNetCifar10(model_base.ResNet): - """Cifar10 model with ResNetV1 and basic residual block.""" - - def __init__(self, - num_layers, - is_training, - batch_norm_decay, - batch_norm_epsilon, - data_format='channels_first'): - super(ResNetCifar10, self).__init__( - is_training, - data_format, - batch_norm_decay, - batch_norm_epsilon - ) - self.n = (num_layers - 2) // 6 - # Add one in case label starts with 1. 
No impact if label starts with 0. - self.num_classes = 10 + 1 - self.filters = [16, 16, 32, 64] - self.strides = [1, 2, 2] - - def forward_pass(self, x, input_data_format='channels_last'): - """Build the core model within the graph.""" - if self._data_format != input_data_format: - if input_data_format == 'channels_last': - # Computation requires channels_first. - x = tf.transpose(x, [0, 3, 1, 2]) - else: - # Computation requires channels_last. - x = tf.transpose(x, [0, 2, 3, 1]) - - # Image standardization. - x = x / 128 - 1 - - x = self._conv(x, 3, 16, 1) - x = self._batch_norm(x) - x = self._relu(x) - - # Use basic (non-bottleneck) block and ResNet V1 (post-activation). - res_func = self._residual_v1 - - # 3 stages of block stacking. - for i in range(3): - with tf.name_scope('stage'): - for j in range(self.n): - if j == 0: - # First block in a stage, filters and strides may change. - x = res_func(x, 3, self.filters[i], self.filters[i + 1], - self.strides[i]) - else: - # Following blocks in a stage, constant filters and unit stride. 
- x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) - - x = self._global_avg_pool(x) - x = self._fully_connected(x, self.num_classes) - - return x diff --git a/tutorials/image/cifar10_estimator/cifar10_utils.py b/tutorials/image/cifar10_estimator/cifar10_utils.py deleted file mode 100644 index 9082cbfece4..00000000000 --- a/tutorials/image/cifar10_estimator/cifar10_utils.py +++ /dev/null @@ -1,139 +0,0 @@ -import collections -import six - -import tensorflow as tf - -from tensorflow.python.platform import tf_logging as logging -from tensorflow.core.framework import node_def_pb2 -from tensorflow.python.framework import device as pydev -from tensorflow.python.training import basic_session_run_hooks -from tensorflow.python.training import session_run_hook -from tensorflow.python.training import training_util -from tensorflow.python.training import device_setter -from tensorflow.contrib.learn.python.learn import run_config - - -# TODO(b/64848083) Remove once uid bug is fixed -class RunConfig(tf.contrib.learn.RunConfig): - def uid(self, whitelist=None): - """Generates a 'Unique Identifier' based on all internal fields. - Caller should use the uid string to check `RunConfig` instance integrity - in one session use, but should not rely on the implementation details, which - is subject to change. - Args: - whitelist: A list of the string names of the properties uid should not - include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which - includes most properties user allowes to change. - Returns: - A uid string. - """ - if whitelist is None: - whitelist = run_config._DEFAULT_UID_WHITE_LIST - - state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')} - # Pop out the keys in whitelist. - for k in whitelist: - state.pop('_' + k, None) - - ordered_state = collections.OrderedDict( - sorted(state.items(), key=lambda t: t[0])) - # For class instance without __repr__, some special cares are required. 
- # Otherwise, the object address will be used. - if '_cluster_spec' in ordered_state: - ordered_state['_cluster_spec'] = collections.OrderedDict( - sorted(ordered_state['_cluster_spec'].as_dict().items(), - key=lambda t: t[0]) - ) - return ', '.join( - '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) - - -class ExamplesPerSecondHook(session_run_hook.SessionRunHook): - """Hook to print out examples per second. - - Total time is tracked and then divided by the total number of steps - to get the average step time and then batch_size is used to determine - the running average of examples per second. The examples per second for the - most recent interval is also logged. - """ - - def __init__( - self, - batch_size, - every_n_steps=100, - every_n_secs=None,): - """Initializer for ExamplesPerSecondHook. - - Args: - batch_size: Total batch size used to calculate examples/second from - global time. - every_n_steps: Log stats every n steps. - every_n_secs: Log stats every n seconds. - """ - if (every_n_steps is None) == (every_n_secs is None): - raise ValueError('exactly one of every_n_steps' - ' and every_n_secs should be provided.') - self._timer = basic_session_run_hooks.SecondOrStepTimer( - every_steps=every_n_steps, every_secs=every_n_secs) - - self._step_train_time = 0 - self._total_steps = 0 - self._batch_size = batch_size - - def begin(self): - self._global_step_tensor = training_util.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError( - 'Global step should be created to use StepCounterHook.') - - def before_run(self, run_context): # pylint: disable=unused-argument - return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - _ = run_context - - global_step = run_values.results - if self._timer.should_trigger_for_step(global_step): - elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( - global_step) - if elapsed_time is not None: - steps_per_sec = 
elapsed_steps / elapsed_time - self._step_train_time += elapsed_time - self._total_steps += elapsed_steps - - average_examples_per_sec = self._batch_size * ( - self._total_steps / self._step_train_time) - current_examples_per_sec = steps_per_sec * self._batch_size - # Average examples/sec followed by current examples/sec - logging.info('%s: %g (%g), step = %g', 'Average examples/sec', - average_examples_per_sec, current_examples_per_sec, - self._total_steps) - -def local_device_setter(num_devices=1, - ps_device_type='cpu', - worker_device='/cpu:0', - ps_ops=None, - ps_strategy=None): - if ps_ops == None: - ps_ops = ['Variable', 'VariableV2', 'VarHandleOp'] - - if ps_strategy is None: - ps_strategy = device_setter._RoundRobinStrategy(num_devices) - if not six.callable(ps_strategy): - raise TypeError("ps_strategy must be callable") - - def _local_device_chooser(op): - current_device = pydev.DeviceSpec.from_string(op.device or "") - - node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def - if node_def.op in ps_ops: - ps_device_spec = pydev.DeviceSpec.from_string( - '/{}:{}'.format(ps_device_type, ps_strategy(op))) - - ps_device_spec.merge_from(current_device) - return ps_device_spec.to_string() - else: - worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") - worker_device_spec.merge_from(current_device) - return worker_device_spec.to_string() - return _local_device_chooser diff --git a/tutorials/image/cifar10_estimator/cmle_config.yaml b/tutorials/image/cifar10_estimator/cmle_config.yaml deleted file mode 100644 index 76f920534ef..00000000000 --- a/tutorials/image/cifar10_estimator/cmle_config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -trainingInput: - scaleTier: CUSTOM - masterType: complex_model_m_gpu - workerType: complex_model_m_gpu - parameterServerType: complex_model_m - workerCount: 1 diff --git a/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py b/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py deleted 
file mode 100644 index d1a599c31bf..00000000000 --- a/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords. - -Generates tf.train.Example protos and writes them to TFRecord files from the -python version of the CIFAR-10 dataset downloaded from -https://www.cs.toronto.edu/~kriz/cifar.html. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import os -import sys - -import tarfile -from six.moves import cPickle as pickle -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -CIFAR_FILENAME = 'cifar-10-python.tar.gz' -CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME -CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' - - -def download_and_extract(data_dir): - # download CIFAR-10 if not already downloaded. 
- tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, - CIFAR_DOWNLOAD_URL) - tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), - 'r:gz').extractall(data_dir) - - -def _int64_feature(value): - return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) - - -def _bytes_feature(value): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - -def _get_file_names(): - """Returns the file names expected to exist in the input_dir.""" - file_names = {} - file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] - file_names['validation'] = ['data_batch_5'] - file_names['eval'] = ['test_batch'] - return file_names - - -def read_pickle_from_file(filename): - with tf.gfile.Open(filename, 'rb') as f: - if sys.version_info >= (3, 0): - data_dict = pickle.load(f, encoding='bytes') - else: - data_dict = pickle.load(f) - return data_dict - - -def convert_to_tfrecord(input_files, output_file): - """Converts a file to TFRecords.""" - print('Generating %s' % output_file) - with tf.python_io.TFRecordWriter(output_file) as record_writer: - for input_file in input_files: - data_dict = read_pickle_from_file(input_file) - data = data_dict[b'data'] - labels = data_dict[b'labels'] - num_entries_in_batch = len(labels) - for i in range(num_entries_in_batch): - example = tf.train.Example(features=tf.train.Features( - feature={ - 'image': _bytes_feature(data[i].tobytes()), - 'label': _int64_feature(labels[i]) - })) - record_writer.write(example.SerializeToString()) - - -def main(data_dir): - print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) - download_and_extract(data_dir) - file_names = _get_file_names() - input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) - for mode, files in file_names.items(): - input_files = [os.path.join(input_dir, f) for f in files] - output_file = os.path.join(data_dir, mode + '.tfrecords') - try: - os.remove(output_file) - except OSError: - pass - # Convert to tf.train.Example and write 
class ResNet(object):
  """Base class with the building blocks shared by ResNet models.

  Sub classes implement forward_pass() using the residual units and layer
  helpers defined here.
  """

  def __init__(self, is_training, data_format, batch_norm_decay,
               batch_norm_epsilon):
    """ResNet constructor.

    Args:
      is_training: if build training or inference model.
      data_format: the data_format used during computation.
                   one of 'channels_first' or 'channels_last'.
      batch_norm_decay: value passed as `decay` to every batch norm layer.
      batch_norm_epsilon: value passed as `epsilon` to every batch norm layer.
    """
    self._batch_norm_decay = batch_norm_decay
    self._batch_norm_epsilon = batch_norm_epsilon
    self._is_training = is_training
    assert data_format in ('channels_first', 'channels_last'), (
        'unsupported data_format: %r' % (data_format,))
    self._data_format = data_format

  def forward_pass(self, x):
    """Builds the model on input `x`; must be overridden by sub classes."""
    raise NotImplementedError(
        'forward_pass() is implemented in ResNet sub classes')

  def _residual_v1(self,
                   x,
                   kernel_size,
                   in_filter,
                   out_filter,
                   stride,
                   activate_before_residual=False):
    """Residual unit with 2 sub layers, using Plan A for shortcut connection.

    Args:
      x: input tensor.
      kernel_size: kernel size of both convolutions.
      in_filter: number of channels of `x`.
      out_filter: number of channels produced by the unit.
      stride: stride of the first convolution and of the shortcut pooling.
      activate_before_residual: unused; accepted so that all residual units
        share a similar signature.

    Returns:
      The output tensor of the unit.
    """

    del activate_before_residual
    with tf.name_scope('residual_v1') as name_scope:
      orig_x = x

      x = self._conv(x, kernel_size, out_filter, stride)
      x = self._batch_norm(x)
      x = self._relu(x)

      x = self._conv(x, kernel_size, out_filter, 1)
      x = self._batch_norm(x)

      if in_filter != out_filter:
        # Shortcut: pool spatially, then zero-pad the channel axis so the
        # shortcut has as many channels as the residual branch.
        orig_x = self._avg_pool(orig_x, stride, stride)
        pad = (out_filter - in_filter) // 2
        if self._data_format == 'channels_first':
          orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
        else:
          orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])

      x = self._relu(tf.add(x, orig_x))

      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
      return x

  def _residual_v2(self,
                   x,
                   in_filter,
                   out_filter,
                   stride,
                   activate_before_residual=False):
    """Residual unit with 2 sub layers with preactivation, plan A shortcut.

    Args:
      x: input tensor.
      in_filter: number of channels of `x`.
      out_filter: number of channels produced by the unit.
      stride: stride of the first convolution and of the shortcut pooling.
      activate_before_residual: if True the shortcut branches off after the
        first batch norm + relu, otherwise before it.

    Returns:
      The output tensor of the unit.
    """

    with tf.name_scope('residual_v2') as name_scope:
      if activate_before_residual:
        x = self._batch_norm(x)
        x = self._relu(x)
        orig_x = x
      else:
        orig_x = x
        x = self._batch_norm(x)
        x = self._relu(x)

      x = self._conv(x, 3, out_filter, stride)

      x = self._batch_norm(x)
      x = self._relu(x)
      # Unit stride as a scalar: _conv() compares `strides > 1`, so a list
      # such as [1, 1, 1, 1] is not a valid argument here.
      x = self._conv(x, 3, out_filter, 1)

      if in_filter != out_filter:
        pad = (out_filter - in_filter) // 2
        orig_x = self._avg_pool(orig_x, stride, stride)
        if self._data_format == 'channels_first':
          orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
        else:
          orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])

      x = tf.add(x, orig_x)

      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
      return x

  def _bottleneck_residual_v2(self,
                              x,
                              in_filter,
                              out_filter,
                              stride,
                              activate_before_residual=False):
    """Bottleneck residual unit with 3 sub layers, plan B shortcut.

    Args:
      x: input tensor.
      in_filter: number of channels of `x`.
      out_filter: number of channels produced by the unit; the two inner
        convolutions use out_filter // 4 channels.
      stride: stride of the first convolution and of the shortcut convolution.
      activate_before_residual: if True the shortcut branches off after the
        first batch norm + relu, otherwise before it.

    Returns:
      The output tensor of the unit.
    """

    with tf.name_scope('bottle_residual_v2') as name_scope:
      if activate_before_residual:
        x = self._batch_norm(x)
        x = self._relu(x)
        orig_x = x
      else:
        orig_x = x
        x = self._batch_norm(x)
        x = self._relu(x)

      x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True)

      x = self._batch_norm(x)
      x = self._relu(x)
      x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True)

      x = self._batch_norm(x)
      x = self._relu(x)
      x = self._conv(x, 1, out_filter, 1, is_atrous=True)

      if in_filter != out_filter:
        # Plan B: project the shortcut with a 1x1 convolution.
        orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True)
      x = tf.add(x, orig_x)

      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
      return x

  def _conv(self, x, kernel_size, filters, strides, is_atrous=False):
    """Convolution without bias.

    Args:
      x: input tensor.
      kernel_size: integer kernel size.
      filters: number of output channels.
      strides: integer stride.
      is_atrous: if True, never pad explicitly and always use 'SAME' padding.

    Returns:
      The convolved tensor.
    """

    padding = 'SAME'
    if not is_atrous and strides > 1:
      # Pad the spatial axes explicitly and convolve with 'VALID' padding.
      pad = kernel_size - 1
      pad_beg = pad // 2
      pad_end = pad - pad_beg
      if self._data_format == 'channels_first':
        x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
      else:
        x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
      padding = 'VALID'
    return tf.layers.conv2d(
        inputs=x,
        kernel_size=kernel_size,
        filters=filters,
        strides=strides,
        padding=padding,
        use_bias=False,
        data_format=self._data_format)

  def _batch_norm(self, x):
    """Fused batch normalization using the decay/epsilon given at init."""
    if self._data_format == 'channels_first':
      data_format = 'NCHW'
    else:
      data_format = 'NHWC'
    return tf.contrib.layers.batch_norm(
        x,
        decay=self._batch_norm_decay,
        center=True,
        scale=True,
        epsilon=self._batch_norm_epsilon,
        is_training=self._is_training,
        fused=True,
        data_format=data_format)

  def _relu(self, x):
    """Rectified linear activation."""
    return tf.nn.relu(x)

  def _fully_connected(self, x, out_dim):
    """Dense layer producing `out_dim` outputs."""
    with tf.name_scope('fully_connected') as name_scope:
      x = tf.layers.dense(x, out_dim)

    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
    return x

  def _avg_pool(self, x, pool_size, stride):
    """2-D average pooling with 'SAME' padding."""
    with tf.name_scope('avg_pool') as name_scope:
      x = tf.layers.average_pooling2d(
          x, pool_size, stride, 'SAME', data_format=self._data_format)

    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
    return x

  def _global_avg_pool(self, x):
    """Averages a rank-4 tensor over its two spatial axes."""
    with tf.name_scope('global_avg_pool') as name_scope:
      assert x.get_shape().ndims == 4
      if self._data_format == 'channels_first':
        x = tf.reduce_mean(x, [2, 3])
      else:
        x = tf.reduce_mean(x, [1, 2])
    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
    return x
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Simple image classification with Inception. - -Run image classification with Inception trained on ImageNet 2012 Challenge data -set. - -This program creates a graph from a saved GraphDef protocol buffer, -and runs inference on an input JPEG image. It outputs human readable -strings of the top 5 predictions along with their probabilities. - -Change the --image_file argument to any jpg image to compute a -classification of that image. - -Please see the tutorial and website for a detailed description of how -to use this script to perform image recognition. 
class NodeLookup(object):
  """Converts integer node ID's to human readable labels."""

  def __init__(self,
               label_lookup_path=None,
               uid_lookup_path=None):
    """Loads the lookup table.

    Args:
      label_lookup_path: path of the label map proto text file. Defaults to
        the file of that name inside FLAGS.model_dir.
      uid_lookup_path: path of the synset-to-human-label text file. Defaults
        to the file of that name inside FLAGS.model_dir.
    """
    if not label_lookup_path:
      label_lookup_path = os.path.join(
          FLAGS.model_dir, 'imagenet_2012_challenge_label_map_proto.pbtxt')
    if not uid_lookup_path:
      uid_lookup_path = os.path.join(
          FLAGS.model_dir, 'imagenet_synset_to_human_label_map.txt')
    self.node_lookup = self.load(label_lookup_path, uid_lookup_path)

  def load(self, label_lookup_path, uid_lookup_path):
    """Loads a human readable English name for each softmax node.

    Args:
      label_lookup_path: string UID to integer node ID.
      uid_lookup_path: string UID to human-readable string.

    Returns:
      dict from integer node ID to human-readable string.
    """
    if not tf.gfile.Exists(uid_lookup_path):
      tf.logging.fatal('File does not exist %s', uid_lookup_path)
    if not tf.gfile.Exists(label_lookup_path):
      tf.logging.fatal('File does not exist %s', label_lookup_path)

    # Loads mapping from string UID to human-readable string. The context
    # manager closes the file handle once the lines are read.
    with tf.gfile.GFile(uid_lookup_path) as uid_file:
      proto_as_ascii_lines = uid_file.readlines()
    uid_to_human = {}
    p = re.compile(r'[n\d]*[ \S,]*')
    for line in proto_as_ascii_lines:
      parsed_items = p.findall(line)
      uid = parsed_items[0]
      human_string = parsed_items[2]
      uid_to_human[uid] = human_string

    # Loads mapping from string UID to integer node ID.
    node_id_to_uid = {}
    with tf.gfile.GFile(label_lookup_path) as label_file:
      proto_as_ascii = label_file.readlines()
    for line in proto_as_ascii:
      # Match on the left-stripped line so the result does not depend on how
      # deeply the field is indented in the proto text.
      field = line.lstrip()
      if field.startswith('target_class:'):
        target_class = int(line.split(': ')[1])
      if field.startswith('target_class_string:'):
        target_class_string = line.split(': ')[1]
        # Drop the opening quote and the trailing quote + newline.
        node_id_to_uid[target_class] = target_class_string[1:-2]

    # Loads the final mapping of integer node ID to human-readable string
    node_id_to_name = {}
    for key, val in node_id_to_uid.items():
      if val not in uid_to_human:
        tf.logging.fatal('Failed to locate: %s', val)
      name = uid_to_human[val]
      node_id_to_name[key] = name

    return node_id_to_name

  def id_to_string(self, node_id):
    """Returns the label for `node_id`, or '' when the ID is unknown."""
    return self.node_lookup.get(node_id, '')
def maybe_download_and_extract():
  """Download and extract model tar file.

  The archive named by DATA_URL is stored in FLAGS.model_dir (created if it
  does not exist). The download is skipped when the archive file is already
  present; extraction into FLAGS.model_dir always runs.
  """
  dest_directory = FLAGS.model_dir
  if not os.path.exists(dest_directory):
    os.makedirs(dest_directory)
  filename = DATA_URL.split('/')[-1]
  filepath = os.path.join(dest_directory, filename)
  if not os.path.exists(filepath):
    def _progress(count, block_size, total_size):
      """Reports download progress on stdout."""
      if total_size <= 0:
        # Total size unknown: no meaningful percentage to report.
        return
      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
          filename, float(count * block_size) / float(total_size) * 100.0))
      sys.stdout.flush()
    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
    print()
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
  # The context manager closes the archive handle even if extraction fails.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only use it on archives from a trusted source.
  with tarfile.open(filepath, 'r:gz') as archive:
    archive.extractall(dest_directory)
- parser.add_argument( - '--model_dir', - type=str, - default='/tmp/imagenet', - help="""\ - Path to classify_image_graph_def.pb, - imagenet_synset_to_human_label_map.txt, and - imagenet_2012_challenge_label_map_proto.pbtxt.\ - """ - ) - parser.add_argument( - '--image_file', - type=str, - default='', - help='Absolute path to image file.' - ) - parser.add_argument( - '--num_top_predictions', - type=int, - default=5, - help='Display this many predictions.' - ) - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tutorials/image/mnist/BUILD b/tutorials/image/mnist/BUILD deleted file mode 100644 index a9b6d78e5e6..00000000000 --- a/tutorials/image/mnist/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# Description: -# Example TensorFlow models for MNIST that achieves high accuracy - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_binary( - name = "convolutional", - srcs = [ - "convolutional.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = ["//tensorflow:tensorflow_py"], -) - -py_test( - name = "convolutional_test", - size = "medium", - srcs = [ - "convolutional.py", - ], - args = [ - "--self_test", - ], - main = "convolutional.py", - srcs_version = "PY2AND3", - deps = ["//tensorflow:tensorflow_py"], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tutorials/image/mnist/__init__.py b/tutorials/image/mnist/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tutorials/image/mnist/convolutional.py b/tutorials/image/mnist/convolutional.py deleted file mode 100644 index b38d4bd351b..00000000000 --- a/tutorials/image/mnist/convolutional.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example. - -This should achieve a test error of 0.7%. Please keep this model as simple and -linear as possible, it is meant as a tutorial for simple convolutional models. -Run with --self_test on the command line to execute a short self-test. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import gzip -import os -import sys -import time - -import numpy -from six.moves import urllib -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -# CVDF mirror of http://yann.lecun.com/exdb/mnist/ -SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/' -WORK_DIRECTORY = 'data' -IMAGE_SIZE = 28 -NUM_CHANNELS = 1 -PIXEL_DEPTH = 255 -NUM_LABELS = 10 -VALIDATION_SIZE = 5000 # Size of the validation set. -SEED = 66478 # Set to None for random seed. -BATCH_SIZE = 64 -NUM_EPOCHS = 10 -EVAL_BATCH_SIZE = 64 -EVAL_FREQUENCY = 100 # Number of steps between evaluations. 
def data_type():
  """Return the dtype used for activations, weights and placeholders."""
  return tf.float16 if FLAGS.use_fp16 else tf.float32


def maybe_download(filename):
  """Fetch `filename` from SOURCE_URL into WORK_DIRECTORY unless present.

  Returns the local path of the file.
  """
  if not tf.gfile.Exists(WORK_DIRECTORY):
    tf.gfile.MakeDirs(WORK_DIRECTORY)
  local_path = os.path.join(WORK_DIRECTORY, filename)
  if tf.gfile.Exists(local_path):
    return local_path
  local_path, _ = urllib.request.urlretrieve(SOURCE_URL + filename, local_path)
  with tf.gfile.GFile(local_path) as downloaded:
    num_bytes = downloaded.size()
  print('Successfully downloaded', filename, num_bytes, 'bytes.')
  return local_path


def extract_data(filename, num_images):
  """Extract the images into a 4D tensor [image index, y, x, channels].

  Values are rescaled from [0, 255] down to [-0.5, 0.5].
  """
  print('Extracting', filename)
  num_bytes = IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS
  with gzip.open(filename) as stream:
    # Skip the 16 leading bytes that precede the pixel data.
    stream.read(16)
    raw = stream.read(num_bytes)
    pixels = numpy.frombuffer(raw, dtype=numpy.uint8).astype(numpy.float32)
    pixels = (pixels - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
    return pixels.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)


def extract_labels(filename, num_images):
  """Extract the labels into a vector of int64 label IDs."""
  print('Extracting', filename)
  with gzip.open(filename) as stream:
    # Skip the 8 leading bytes that precede the label data.
    stream.read(8)
    raw = stream.read(num_images)
    return numpy.frombuffer(raw, dtype=numpy.uint8).astype(numpy.int64)


def fake_data(num_images):
  """Generate a fake dataset that matches the dimensions of MNIST.

  Image i is filled with the constant (i % 2) - 0.5 and labelled i % 2.
  """
  images = numpy.ndarray(
      shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS),
      dtype=numpy.float32)
  targets = numpy.zeros(shape=(num_images,), dtype=numpy.int64)
  for index in xrange(num_images):
    parity = index % 2
    images[index, :, :, 0] = parity - 0.5
    targets[index] = parity
  return images, targets


def error_rate(predictions, labels):
  """Return the error rate based on dense predictions and sparse labels."""
  predicted_labels = numpy.argmax(predictions, 1)
  num_correct = numpy.sum(predicted_labels == labels)
  return 100.0 - (100.0 * num_correct / predictions.shape[0])
They are passed an - # initial value which will be assigned when we call: - # {tf.global_variables_initializer().run()} - conv1_weights = tf.Variable( - tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. - stddev=0.1, - seed=SEED, dtype=data_type())) - conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) - conv2_weights = tf.Variable(tf.truncated_normal( - [5, 5, 32, 64], stddev=0.1, - seed=SEED, dtype=data_type())) - conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) - fc1_weights = tf.Variable( # fully connected, depth 512. - tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) - fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc2_biases = tf.Variable(tf.constant( - 0.1, shape=[NUM_LABELS], dtype=data_type())) - - # We will replicate the model structure for the training subgraph, as well - # as the evaluation subgraphs, while sharing the trainable parameters. - def model(data, train=False): - """The Model definition.""" - # 2D convolution, with 'SAME' padding (i.e. the output feature map has - # the same size as the input). Note that {strides} is a 4D array whose - # shape matches the data layout: [image index, y, x, depth]. - conv = tf.nn.conv2d(data, - conv1_weights, - strides=[1, 1, 1, 1], - padding='SAME') - # Bias and rectified linear non-linearity. - relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) - # Max pooling. The kernel size spec {ksize} also follows the layout of - # the data. Here we have a pooling window of 2, and a stride of 2. 
- pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - conv = tf.nn.conv2d(pool, - conv2_weights, - strides=[1, 1, 1, 1], - padding='SAME') - relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - # Reshape the feature map cuboid into a 2D matrix to feed it to the - # fully connected layers. - pool_shape = pool.get_shape().as_list() - reshape = tf.reshape( - pool, - [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) - # Fully connected layer. Note that the '+' operation automatically - # broadcasts the biases. - hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) - # Add a 50% dropout during training only. Dropout also scales - # activations such that no rescaling is needed at evaluation time. - if train: - hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) - return tf.matmul(hidden, fc2_weights) + fc2_biases - - # Training computation: logits + cross-entropy loss. - logits = model(train_data_node, True) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=train_labels_node, logits=logits)) - - # L2 regularization for the fully connected parameters. - regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + - tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) - # Add the regularization term to the loss. - loss += 5e-4 * regularizers - - # Optimizer: set up a variable that's incremented once per batch and - # controls the learning rate decay. - batch = tf.Variable(0, dtype=data_type()) - # Decay once per epoch, using an exponential schedule starting at 0.01. - learning_rate = tf.train.exponential_decay( - 0.01, # Base learning rate. - batch * BATCH_SIZE, # Current index into the dataset. - train_size, # Decay step. - 0.95, # Decay rate. - staircase=True) - # Use simple momentum for the optimization. 
- optimizer = tf.train.MomentumOptimizer(learning_rate, - 0.9).minimize(loss, - global_step=batch) - - # Predictions for the current training minibatch. - train_prediction = tf.nn.softmax(logits) - - # Predictions for the test and validation, which we'll compute less often. - eval_prediction = tf.nn.softmax(model(eval_data)) - - # Small utility function to evaluate a dataset by feeding batches of data to - # {eval_data} and pulling the results from {eval_predictions}. - # Saves memory and enables this to run on smaller GPUs. - def eval_in_batches(data, sess): - """Get all predictions for a dataset by running it in small batches.""" - size = data.shape[0] - if size < EVAL_BATCH_SIZE: - raise ValueError("batch size for evals larger than dataset: %d" % size) - predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) - for begin in xrange(0, size, EVAL_BATCH_SIZE): - end = begin + EVAL_BATCH_SIZE - if end <= size: - predictions[begin:end, :] = sess.run( - eval_prediction, - feed_dict={eval_data: data[begin:end, ...]}) - else: - batch_predictions = sess.run( - eval_prediction, - feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) - predictions[begin:, :] = batch_predictions[begin - size:, :] - return predictions - - # Create a local session to run the training. - start_time = time.time() - with tf.Session() as sess: - # Run all the initializers to prepare the trainable parameters. - tf.global_variables_initializer().run() - print('Initialized!') - # Loop through training steps. - for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): - # Compute the offset of the current minibatch in the data. - # Note that we could use better randomization across epochs. - offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) - batch_data = train_data[offset:(offset + BATCH_SIZE), ...] - batch_labels = train_labels[offset:(offset + BATCH_SIZE)] - # This dictionary maps the batch data (as a numpy array) to the - # node in the graph it should be fed to. 
- feed_dict = {train_data_node: batch_data, - train_labels_node: batch_labels} - # Run the optimizer to update weights. - sess.run(optimizer, feed_dict=feed_dict) - # print some extra information once reach the evaluation frequency - if step % EVAL_FREQUENCY == 0: - # fetch some extra nodes' data - l, lr, predictions = sess.run([loss, learning_rate, train_prediction], - feed_dict=feed_dict) - elapsed_time = time.time() - start_time - start_time = time.time() - print('Step %d (epoch %.2f), %.1f ms' % - (step, float(step) * BATCH_SIZE / train_size, - 1000 * elapsed_time / EVAL_FREQUENCY)) - print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) - print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) - print('Validation error: %.1f%%' % error_rate( - eval_in_batches(validation_data, sess), validation_labels)) - sys.stdout.flush() - # Finally print the result! - test_error = error_rate(eval_in_batches(test_data, sess), test_labels) - print('Test error: %.1f%%' % test_error) - if FLAGS.self_test: - print('test_error', test_error) - assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( - test_error,) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--use_fp16', - default=False, - help='Use half floats instead of full floats if True.', - action='store_true') - parser.add_argument( - '--self_test', - default=False, - action='store_true', - help='True if running a self test.') - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tutorials/rnn/BUILD b/tutorials/rnn/BUILD deleted file mode 100644 index 118884fd28d..00000000000 --- a/tutorials/rnn/BUILD +++ /dev/null @@ -1,80 +0,0 @@ -# Description: -# Example RNN models, including language models and sequence-to-sequence models. 
- -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_library( - name = "linear", - srcs = [ - "linear.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "rnn_cell", - srcs = [ - "rnn_cell.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":linear", - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "package", - srcs = [ - "__init__.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":rnn", - ":rnn_cell", - ":seq2seq", - ], -) - -py_library( - name = "rnn", - srcs = [ - "rnn.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":rnn_cell", - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "seq2seq", - srcs = [ - "seq2seq.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":rnn", - "//tensorflow:tensorflow_py", - ], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tutorials/rnn/README.md b/tutorials/rnn/README.md deleted file mode 100644 index 5166d14c3c6..00000000000 --- a/tutorials/rnn/README.md +++ /dev/null @@ -1,16 +0,0 @@ -This directory contains functions for creating recurrent neural networks -and sequence-to-sequence models. Detailed instructions on how to get started -and use them are available in the -[tutorials on tensorflow.org](http://tensorflow.org/tutorials/). - -Here is a short overview of what is in this directory: - - -File | What's in it? ------------- | ------------- -`ptb/` | PTB language model, see the [RNN Tutorial](http://tensorflow.org/tutorials/recurrent/) -`quickdraw/` | Quick, Draw! 
model, see the [RNN Tutorial for Drawing Classification](https://www.tensorflow.org/versions/master/tutorials/recurrent_quickdraw) - -If you're looking for the -[`seq2seq` tutorial code](http://tensorflow.org/tutorials/seq2seq/), it lives -in [its own repo](https://github.com/tensorflow/nmt). \ No newline at end of file diff --git a/tutorials/rnn/__init__.py b/tutorials/rnn/__init__.py deleted file mode 100644 index 844cc0b854e..00000000000 --- a/tutorials/rnn/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Libraries to build Recurrent Neural Networks.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function diff --git a/tutorials/rnn/ptb/BUILD b/tutorials/rnn/ptb/BUILD deleted file mode 100644 index a79fa202784..00000000000 --- a/tutorials/rnn/ptb/BUILD +++ /dev/null @@ -1,69 +0,0 @@ -# Description: -# Python support for TensorFlow. 
- -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_library( - name = "package", - srcs = [ - "__init__.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":reader", - ], -) - -py_library( - name = "reader", - srcs = ["reader.py"], - srcs_version = "PY2AND3", - deps = ["//tensorflow:tensorflow_py"], -) - -py_test( - name = "reader_test", - size = "small", - srcs = ["reader_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":reader", - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "util", - srcs = ["util.py"], - srcs_version = "PY2AND3", - deps = ["//tensorflow:tensorflow_py"], -) - -py_binary( - name = "ptb_word_lm", - srcs = [ - "ptb_word_lm.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":reader", - ":util", - "//tensorflow:tensorflow_py", - ], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tutorials/rnn/ptb/__init__.py b/tutorials/rnn/ptb/__init__.py deleted file mode 100644 index 47ba9a74fb1..00000000000 --- a/tutorials/rnn/ptb/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Makes helper libraries available in the ptb package.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import reader -import util diff --git a/tutorials/rnn/ptb/ptb_word_lm.py b/tutorials/rnn/ptb/ptb_word_lm.py deleted file mode 100644 index 502863de3f2..00000000000 --- a/tutorials/rnn/ptb/ptb_word_lm.py +++ /dev/null @@ -1,529 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Example / benchmark for building a PTB LSTM model. - -Trains the model described in: -(Zaremba, et. al.) Recurrent Neural Network Regularization -http://arxiv.org/abs/1409.2329 - -There are 3 supported model configurations: -=========================================== -| config | epochs | train | valid | test -=========================================== -| small | 13 | 37.99 | 121.39 | 115.91 -| medium | 39 | 48.45 | 86.16 | 82.07 -| large | 55 | 37.87 | 82.62 | 78.29 -The exact results may vary depending on the random initialization. 
- -The hyperparameters used in the model: -- init_scale - the initial scale of the weights -- learning_rate - the initial value of the learning rate -- max_grad_norm - the maximum permissible norm of the gradient -- num_layers - the number of LSTM layers -- num_steps - the number of unrolled steps of LSTM -- hidden_size - the number of LSTM units -- max_epoch - the number of epochs trained with the initial learning rate -- max_max_epoch - the total number of epochs for training -- keep_prob - the probability of keeping weights in the dropout layer -- lr_decay - the decay of the learning rate for each epoch after "max_epoch" -- batch_size - the batch size -- rnn_mode - the low level implementation of lstm cell: one of CUDNN, - BASIC, or BLOCK, representing cudnn_lstm, basic_lstm, and - lstm_block_cell classes. - -The data required for this example is in the data/ dir of the -PTB dataset from Tomas Mikolov's webpage: - -$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz -$ tar xvf simple-examples.tgz - -To run: - -$ python ptb_word_lm.py --data_path=simple-examples/data/ - -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import time - -import numpy as np -import tensorflow as tf - -import reader -import util - -from tensorflow.python.client import device_lib - -from distutils.version import StrictVersion - -flags = tf.flags -logging = tf.logging - -flags.DEFINE_string( - "model", "small", - "A type of model. 
Possible options are: small, medium, large.") -flags.DEFINE_string("data_path", None, - "Where the training/test data is stored.") -flags.DEFINE_string("save_path", None, - "Model output directory.") -flags.DEFINE_bool("use_fp16", False, - "Train using 16-bit floats instead of 32bit floats") -flags.DEFINE_integer("num_gpus", 1, - "If larger than 1, Grappler AutoParallel optimizer " - "will create multiple training replicas with each GPU " - "running one replica.") -flags.DEFINE_string("rnn_mode", None, - "The low level implementation of lstm cell: one of CUDNN, " - "BASIC, and BLOCK, representing cudnn_lstm, basic_lstm, " - "and lstm_block_cell classes.") -FLAGS = flags.FLAGS -BASIC = "basic" -CUDNN = "cudnn" -BLOCK = "block" - - -def data_type(): - return tf.float16 if FLAGS.use_fp16 else tf.float32 - - -class PTBInput(object): - """The input data.""" - - def __init__(self, config, data, name=None): - self.batch_size = batch_size = config.batch_size - self.num_steps = num_steps = config.num_steps - self.epoch_size = ((len(data) // batch_size) - 1) // num_steps - self.input_data, self.targets = reader.ptb_producer( - data, batch_size, num_steps, name=name) - - -class PTBModel(object): - """The PTB model.""" - - def __init__(self, is_training, config, input_): - self._is_training = is_training - self._input = input_ - self._rnn_params = None - self._cell = None - self.batch_size = input_.batch_size - self.num_steps = input_.num_steps - size = config.hidden_size - vocab_size = config.vocab_size - - with tf.device("/cpu:0"): - embedding = tf.get_variable( - "embedding", [vocab_size, size], dtype=data_type()) - inputs = tf.nn.embedding_lookup(embedding, input_.input_data) - - if is_training and config.keep_prob < 1: - inputs = tf.nn.dropout(inputs, config.keep_prob) - - output, state = self._build_rnn_graph(inputs, config, is_training) - - softmax_w = tf.get_variable( - "softmax_w", [size, vocab_size], dtype=data_type()) - softmax_b = tf.get_variable("softmax_b", 
[vocab_size], dtype=data_type()) - logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b) - # Reshape logits to be a 3-D tensor for sequence loss - logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size]) - - # Use the contrib sequence loss and average over the batches - loss = tf.contrib.seq2seq.sequence_loss( - logits, - input_.targets, - tf.ones([self.batch_size, self.num_steps], dtype=data_type()), - average_across_timesteps=False, - average_across_batch=True) - - # Update the cost - self._cost = tf.reduce_sum(loss) - self._final_state = state - - if not is_training: - return - - self._lr = tf.Variable(0.0, trainable=False) - tvars = tf.trainable_variables() - grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), - config.max_grad_norm) - optimizer = tf.train.GradientDescentOptimizer(self._lr) - self._train_op = optimizer.apply_gradients( - zip(grads, tvars), - global_step=tf.train.get_or_create_global_step()) - - self._new_lr = tf.placeholder( - tf.float32, shape=[], name="new_learning_rate") - self._lr_update = tf.assign(self._lr, self._new_lr) - - def _build_rnn_graph(self, inputs, config, is_training): - if config.rnn_mode == CUDNN: - return self._build_rnn_graph_cudnn(inputs, config, is_training) - else: - return self._build_rnn_graph_lstm(inputs, config, is_training) - - def _build_rnn_graph_cudnn(self, inputs, config, is_training): - """Build the inference graph using CUDNN cell.""" - inputs = tf.transpose(inputs, [1, 0, 2]) - self._cell = tf.contrib.cudnn_rnn.CudnnLSTM( - num_layers=config.num_layers, - num_units=config.hidden_size, - input_size=config.hidden_size, - dropout=1 - config.keep_prob if is_training else 0) - params_size_t = self._cell.params_size() - self._rnn_params = tf.get_variable( - "lstm_params", - initializer=tf.random_uniform( - [params_size_t], -config.init_scale, config.init_scale), - validate_shape=False) - c = tf.zeros([config.num_layers, self.batch_size, config.hidden_size], - tf.float32) - h = 
tf.zeros([config.num_layers, self.batch_size, config.hidden_size], - tf.float32) - self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),) - outputs, h, c = self._cell(inputs, h, c, self._rnn_params, is_training) - outputs = tf.transpose(outputs, [1, 0, 2]) - outputs = tf.reshape(outputs, [-1, config.hidden_size]) - return outputs, (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),) - - def _get_lstm_cell(self, config, is_training): - if config.rnn_mode == BASIC: - return tf.contrib.rnn.BasicLSTMCell( - config.hidden_size, forget_bias=0.0, state_is_tuple=True, - reuse=not is_training) - if config.rnn_mode == BLOCK: - return tf.contrib.rnn.LSTMBlockCell( - config.hidden_size, forget_bias=0.0) - raise ValueError("rnn_mode %s not supported" % config.rnn_mode) - - def _build_rnn_graph_lstm(self, inputs, config, is_training): - """Build the inference graph using canonical LSTM cells.""" - # Slightly better results can be obtained with forget gate biases - # initialized to 1 but the hyperparameters of the model would need to be - # different than reported in the paper. - def make_cell(): - cell = self._get_lstm_cell(config, is_training) - if is_training and config.keep_prob < 1: - cell = tf.contrib.rnn.DropoutWrapper( - cell, output_keep_prob=config.keep_prob) - return cell - - cell = tf.contrib.rnn.MultiRNNCell( - [make_cell() for _ in range(config.num_layers)], state_is_tuple=True) - - self._initial_state = cell.zero_state(config.batch_size, data_type()) - state = self._initial_state - # Simplified version of tf.nn.static_rnn(). - # This builds an unrolled LSTM for tutorial purposes only. - # In general, use tf.nn.static_rnn() or tf.nn.static_state_saving_rnn(). 
- # - # The alternative version of the code below is: - # - # inputs = tf.unstack(inputs, num=self.num_steps, axis=1) - # outputs, state = tf.nn.static_rnn(cell, inputs, - # initial_state=self._initial_state) - outputs = [] - with tf.variable_scope("RNN"): - for time_step in range(self.num_steps): - if time_step > 0: tf.get_variable_scope().reuse_variables() - (cell_output, state) = cell(inputs[:, time_step, :], state) - outputs.append(cell_output) - output = tf.reshape(tf.concat(outputs, 1), [-1, config.hidden_size]) - return output, state - - def assign_lr(self, session, lr_value): - session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) - - def export_ops(self, name): - """Exports ops to collections.""" - self._name = name - ops = {util.with_prefix(self._name, "cost"): self._cost} - if self._is_training: - ops.update(lr=self._lr, new_lr=self._new_lr, lr_update=self._lr_update) - if self._rnn_params: - ops.update(rnn_params=self._rnn_params) - for name, op in ops.items(): - tf.add_to_collection(name, op) - self._initial_state_name = util.with_prefix(self._name, "initial") - self._final_state_name = util.with_prefix(self._name, "final") - util.export_state_tuples(self._initial_state, self._initial_state_name) - util.export_state_tuples(self._final_state, self._final_state_name) - - def import_ops(self): - """Imports ops from collections.""" - if self._is_training: - self._train_op = tf.get_collection_ref("train_op")[0] - self._lr = tf.get_collection_ref("lr")[0] - self._new_lr = tf.get_collection_ref("new_lr")[0] - self._lr_update = tf.get_collection_ref("lr_update")[0] - rnn_params = tf.get_collection_ref("rnn_params") - if self._cell and rnn_params: - params_saveable = tf.contrib.cudnn_rnn.RNNParamsSaveable( - self._cell, - self._cell.params_to_canonical, - self._cell.canonical_to_params, - rnn_params, - base_variable_scope="Model/RNN") - tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, params_saveable) - self._cost = 
tf.get_collection_ref(util.with_prefix(self._name, "cost"))[0] - num_replicas = FLAGS.num_gpus if self._name == "Train" else 1 - self._initial_state = util.import_state_tuples( - self._initial_state, self._initial_state_name, num_replicas) - self._final_state = util.import_state_tuples( - self._final_state, self._final_state_name, num_replicas) - - @property - def input(self): - return self._input - - @property - def initial_state(self): - return self._initial_state - - @property - def cost(self): - return self._cost - - @property - def final_state(self): - return self._final_state - - @property - def lr(self): - return self._lr - - @property - def train_op(self): - return self._train_op - - @property - def initial_state_name(self): - return self._initial_state_name - - @property - def final_state_name(self): - return self._final_state_name - - -class SmallConfig(object): - """Small config.""" - init_scale = 0.1 - learning_rate = 1.0 - max_grad_norm = 5 - num_layers = 2 - num_steps = 20 - hidden_size = 200 - max_epoch = 4 - max_max_epoch = 13 - keep_prob = 1.0 - lr_decay = 0.5 - batch_size = 20 - vocab_size = 10000 - rnn_mode = BLOCK - - -class MediumConfig(object): - """Medium config.""" - init_scale = 0.05 - learning_rate = 1.0 - max_grad_norm = 5 - num_layers = 2 - num_steps = 35 - hidden_size = 650 - max_epoch = 6 - max_max_epoch = 39 - keep_prob = 0.5 - lr_decay = 0.8 - batch_size = 20 - vocab_size = 10000 - rnn_mode = BLOCK - - -class LargeConfig(object): - """Large config.""" - init_scale = 0.04 - learning_rate = 1.0 - max_grad_norm = 10 - num_layers = 2 - num_steps = 35 - hidden_size = 1500 - max_epoch = 14 - max_max_epoch = 55 - keep_prob = 0.35 - lr_decay = 1 / 1.15 - batch_size = 20 - vocab_size = 10000 - rnn_mode = BLOCK - - -class TestConfig(object): - """Tiny config, for testing.""" - init_scale = 0.1 - learning_rate = 1.0 - max_grad_norm = 1 - num_layers = 1 - num_steps = 2 - hidden_size = 2 - max_epoch = 1 - max_max_epoch = 1 - keep_prob = 1.0 - 
lr_decay = 0.5 - batch_size = 20 - vocab_size = 10000 - rnn_mode = BLOCK - - -def run_epoch(session, model, eval_op=None, verbose=False): - """Runs the model on the given data.""" - start_time = time.time() - costs = 0.0 - iters = 0 - state = session.run(model.initial_state) - - fetches = { - "cost": model.cost, - "final_state": model.final_state, - } - if eval_op is not None: - fetches["eval_op"] = eval_op - - for step in range(model.input.epoch_size): - feed_dict = {} - for i, (c, h) in enumerate(model.initial_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - - vals = session.run(fetches, feed_dict) - cost = vals["cost"] - state = vals["final_state"] - - costs += cost - iters += model.input.num_steps - - if verbose and step % (model.input.epoch_size // 10) == 10: - print("%.3f perplexity: %.3f speed: %.0f wps" % - (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), - iters * model.input.batch_size * max(1, FLAGS.num_gpus) / - (time.time() - start_time))) - - return np.exp(costs / iters) - - -def get_config(): - """Get model config.""" - config = None - if FLAGS.model == "small": - config = SmallConfig() - elif FLAGS.model == "medium": - config = MediumConfig() - elif FLAGS.model == "large": - config = LargeConfig() - elif FLAGS.model == "test": - config = TestConfig() - else: - raise ValueError("Invalid model: %s", FLAGS.model) - if FLAGS.rnn_mode: - config.rnn_mode = FLAGS.rnn_mode - if FLAGS.num_gpus != 1 or StrictVersion(tf.__version__) < StrictVersion("1.3.0") : - config.rnn_mode = BASIC - return config - - -def main(_): - if not FLAGS.data_path: - raise ValueError("Must set --data_path to PTB data directory") - gpus = [ - x.name for x in device_lib.list_local_devices() if x.device_type == "GPU" - ] - if FLAGS.num_gpus > len(gpus): - raise ValueError( - "Your machine has only %d gpus " - "which is less than the requested --num_gpus=%d." 
- % (len(gpus), FLAGS.num_gpus)) - - raw_data = reader.ptb_raw_data(FLAGS.data_path) - train_data, valid_data, test_data, _ = raw_data - - config = get_config() - eval_config = get_config() - eval_config.batch_size = 1 - eval_config.num_steps = 1 - - with tf.Graph().as_default(): - initializer = tf.random_uniform_initializer(-config.init_scale, - config.init_scale) - - with tf.name_scope("Train"): - train_input = PTBInput(config=config, data=train_data, name="TrainInput") - with tf.variable_scope("Model", reuse=None, initializer=initializer): - m = PTBModel(is_training=True, config=config, input_=train_input) - tf.summary.scalar("Training Loss", m.cost) - tf.summary.scalar("Learning Rate", m.lr) - - with tf.name_scope("Valid"): - valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") - with tf.variable_scope("Model", reuse=True, initializer=initializer): - mvalid = PTBModel(is_training=False, config=config, input_=valid_input) - tf.summary.scalar("Validation Loss", mvalid.cost) - - with tf.name_scope("Test"): - test_input = PTBInput( - config=eval_config, data=test_data, name="TestInput") - with tf.variable_scope("Model", reuse=True, initializer=initializer): - mtest = PTBModel(is_training=False, config=eval_config, - input_=test_input) - - models = {"Train": m, "Valid": mvalid, "Test": mtest} - for name, model in models.items(): - model.export_ops(name) - metagraph = tf.train.export_meta_graph() - if StrictVersion(tf.__version__) < StrictVersion("1.1.0") and FLAGS.num_gpus > 1: - raise ValueError("num_gpus > 1 is not supported for TensorFlow versions " - "below 1.1.0") - soft_placement = False - if FLAGS.num_gpus > 1: - soft_placement = True - util.auto_parallel(metagraph, m) - - with tf.Graph().as_default(): - tf.train.import_meta_graph(metagraph) - for model in models.values(): - model.import_ops() - sv = tf.train.Supervisor(logdir=FLAGS.save_path) - config_proto = tf.ConfigProto(allow_soft_placement=soft_placement) - with 
sv.managed_session(config=config_proto) as session: - for i in range(config.max_max_epoch): - lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) - m.assign_lr(session, config.learning_rate * lr_decay) - - print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) - train_perplexity = run_epoch(session, m, eval_op=m.train_op, - verbose=True) - print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) - valid_perplexity = run_epoch(session, mvalid) - print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) - - test_perplexity = run_epoch(session, mtest) - print("Test Perplexity: %.3f" % test_perplexity) - - if FLAGS.save_path: - print("Saving model to %s." % FLAGS.save_path) - sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tutorials/rnn/ptb/reader.py b/tutorials/rnn/ptb/reader.py deleted file mode 100644 index da1dee32b2b..00000000000 --- a/tutorials/rnn/ptb/reader.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - - -"""Utilities for parsing PTB text files.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import os -import sys - -import tensorflow as tf - -Py3 = sys.version_info[0] == 3 - -def _read_words(filename): - with tf.gfile.GFile(filename, "r") as f: - if Py3: - return f.read().replace("\n", "").split() - else: - return f.read().decode("utf-8").replace("\n", "").split() - - -def _build_vocab(filename): - data = _read_words(filename) - - counter = collections.Counter(data) - count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) - - words, _ = list(zip(*count_pairs)) - word_to_id = dict(zip(words, range(len(words)))) - - return word_to_id - - -def _file_to_word_ids(filename, word_to_id): - data = _read_words(filename) - return [word_to_id[word] for word in data if word in word_to_id] - - -def ptb_raw_data(data_path=None): - """Load PTB raw data from data directory "data_path". - - Reads PTB text files, converts strings to integer ids, - and performs mini-batching of the inputs. - - The PTB dataset comes from Tomas Mikolov's webpage: - - http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz - - Args: - data_path: string path to the directory where simple-examples.tgz has - been extracted. - - Returns: - tuple (train_data, valid_data, test_data, vocabulary) - where each of the data objects can be passed to PTBIterator. 
- """ - - train_path = os.path.join(data_path, "ptb.train.txt") - valid_path = os.path.join(data_path, "ptb.valid.txt") - test_path = os.path.join(data_path, "ptb.test.txt") - - word_to_id = _build_vocab(train_path) - train_data = _file_to_word_ids(train_path, word_to_id) - valid_data = _file_to_word_ids(valid_path, word_to_id) - test_data = _file_to_word_ids(test_path, word_to_id) - vocabulary = len(word_to_id) - return train_data, valid_data, test_data, vocabulary - - -def ptb_producer(raw_data, batch_size, num_steps, name=None): - """Iterate on the raw PTB data. - - This chunks up raw_data into batches of examples and returns Tensors that - are drawn from these batches. - - Args: - raw_data: one of the raw data outputs from ptb_raw_data. - batch_size: int, the batch size. - num_steps: int, the number of unrolls. - name: the name of this operation (optional). - - Returns: - A pair of Tensors, each shaped [batch_size, num_steps]. The second element - of the tuple is the same data time-shifted to the right by one. - - Raises: - tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 
- """ - with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]): - raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) - - data_len = tf.size(raw_data) - batch_len = data_len // batch_size - data = tf.reshape(raw_data[0 : batch_size * batch_len], - [batch_size, batch_len]) - - epoch_size = (batch_len - 1) // num_steps - assertion = tf.assert_positive( - epoch_size, - message="epoch_size == 0, decrease batch_size or num_steps") - with tf.control_dependencies([assertion]): - epoch_size = tf.identity(epoch_size, name="epoch_size") - - i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() - x = tf.strided_slice(data, [0, i * num_steps], - [batch_size, (i + 1) * num_steps]) - x.set_shape([batch_size, num_steps]) - y = tf.strided_slice(data, [0, i * num_steps + 1], - [batch_size, (i + 1) * num_steps + 1]) - y.set_shape([batch_size, num_steps]) - return x, y diff --git a/tutorials/rnn/ptb/reader_test.py b/tutorials/rnn/ptb/reader_test.py deleted file mode 100644 index ab0191aef6e..00000000000 --- a/tutorials/rnn/ptb/reader_test.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Tests for models.tutorials.rnn.ptb.reader.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os.path - -import tensorflow as tf - -import reader - - -class PtbReaderTest(tf.test.TestCase): - - def setUp(self): - self._string_data = "\n".join( - [" hello there i am", - " rain as day", - " want some cheesy puffs ?"]) - - def testPtbRawData(self): - tmpdir = tf.test.get_temp_dir() - for suffix in "train", "valid", "test": - filename = os.path.join(tmpdir, "ptb.%s.txt" % suffix) - with tf.gfile.GFile(filename, "w") as fh: - fh.write(self._string_data) - # Smoke test - output = reader.ptb_raw_data(tmpdir) - self.assertEqual(len(output), 4) - - def testPtbProducer(self): - raw_data = [4, 3, 2, 1, 0, 5, 6, 1, 1, 1, 1, 0, 3, 4, 1] - batch_size = 3 - num_steps = 2 - x, y = reader.ptb_producer(raw_data, batch_size, num_steps) - with self.test_session() as session: - coord = tf.train.Coordinator() - tf.train.start_queue_runners(session, coord=coord) - try: - xval, yval = session.run([x, y]) - self.assertAllEqual(xval, [[4, 3], [5, 6], [1, 0]]) - self.assertAllEqual(yval, [[3, 2], [6, 1], [0, 3]]) - xval, yval = session.run([x, y]) - self.assertAllEqual(xval, [[2, 1], [1, 1], [3, 4]]) - self.assertAllEqual(yval, [[1, 0], [1, 1], [4, 1]]) - finally: - coord.request_stop() - coord.join() - - -if __name__ == "__main__": - tf.test.main() diff --git a/tutorials/rnn/ptb/util.py b/tutorials/rnn/ptb/util.py deleted file mode 100644 index f23581e69b2..00000000000 --- a/tutorials/rnn/ptb/util.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utilities for Grappler autoparallel optimizer.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorflow.core.framework import variable_pb2 -from tensorflow.core.protobuf import rewriter_config_pb2 - -FLAGS = tf.flags.FLAGS - - -def export_state_tuples(state_tuples, name): - for state_tuple in state_tuples: - tf.add_to_collection(name, state_tuple.c) - tf.add_to_collection(name, state_tuple.h) - - -def import_state_tuples(state_tuples, name, num_replicas): - restored = [] - for i in range(len(state_tuples) * num_replicas): - c = tf.get_collection_ref(name)[2 * i + 0] - h = tf.get_collection_ref(name)[2 * i + 1] - restored.append(tf.contrib.rnn.LSTMStateTuple(c, h)) - return tuple(restored) - - -def with_prefix(prefix, name): - """Adds prefix to name.""" - return "/".join((prefix, name)) - - -def with_autoparallel_prefix(replica_id, name): - return with_prefix("AutoParallel-Replica-%d" % replica_id, name) - - -class UpdateCollection(object): - """Update collection info in MetaGraphDef for AutoParallel optimizer.""" - - def __init__(self, metagraph, model): - self._metagraph = metagraph - self.replicate_states(model.initial_state_name) - self.replicate_states(model.final_state_name) - self.update_snapshot_name("variables") - self.update_snapshot_name("trainable_variables") - - def update_snapshot_name(self, var_coll_name): - var_list = 
self._metagraph.collection_def[var_coll_name] - for i, value in enumerate(var_list.bytes_list.value): - var_def = variable_pb2.VariableDef() - var_def.ParseFromString(value) - # Somehow node Model/global_step/read doesn't have any fanout and seems to - # be only used for snapshot; this is different from all other variables. - if var_def.snapshot_name != "Model/global_step/read:0": - var_def.snapshot_name = with_autoparallel_prefix( - 0, var_def.snapshot_name) - value = var_def.SerializeToString() - var_list.bytes_list.value[i] = value - - def replicate_states(self, state_coll_name): - state_list = self._metagraph.collection_def[state_coll_name] - num_states = len(state_list.node_list.value) - for replica_id in range(1, FLAGS.num_gpus): - for i in range(num_states): - state_list.node_list.value.append(state_list.node_list.value[i]) - for replica_id in range(FLAGS.num_gpus): - for i in range(num_states): - index = replica_id * num_states + i - state_list.node_list.value[index] = with_autoparallel_prefix( - replica_id, state_list.node_list.value[index]) - - -def auto_parallel(metagraph, model): - from tensorflow.python.grappler import tf_optimizer - rewriter_config = rewriter_config_pb2.RewriterConfig() - rewriter_config.optimizers.append("autoparallel") - rewriter_config.auto_parallel.enable = True - rewriter_config.auto_parallel.num_replicas = FLAGS.num_gpus - optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph) - metagraph.graph_def.CopyFrom(optimized_graph) - UpdateCollection(metagraph, model) diff --git a/tutorials/rnn/quickdraw/BUILD b/tutorials/rnn/quickdraw/BUILD deleted file mode 100644 index 33c3faeced4..00000000000 --- a/tutorials/rnn/quickdraw/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# Description: -# Example classification model on Quick, Draw! dataset. 
- -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_binary( - name = "train_model", - srcs = [ - "train_model.py", - ], - srcs_version = "PY2AND3", - deps = [ - "//third_party/py/tensorflow", - ], -) - -py_binary( - name = "create_dataset", - srcs = [ - "create_dataset.py", - ], - deps = [ - "//third_party/py/numpy", - "//third_party/py/tensorflow", - ], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//third_party/tensorflow:__subpackages__"], -) diff --git a/tutorials/rnn/quickdraw/create_dataset.py b/tutorials/rnn/quickdraw/create_dataset.py deleted file mode 100644 index af2f019ebac..00000000000 --- a/tutorials/rnn/quickdraw/create_dataset.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -r"""Creates training and eval data from Quickdraw NDJSON files. - -This tool reads the NDJSON files from https://quickdraw.withgoogle.com/data -and converts them into tensorflow.Example stored in TFRecord files. - -The tensorflow example will contain 3 features: - shape - contains the shape of the sequence [length, dim] where dim=3. - class_index - the class index of the class for the example. - ink - a length * dim vector of the ink. 
- -It creates disjoint training and evaluation sets. - -python create_dataset.py \ - --ndjson_path ${HOME}/ndjson \ - --output_path ${HOME}/tfrecord -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import json -import os -import random -import sys -import numpy as np -import tensorflow as tf - - -def parse_line(ndjson_line): - """Parse an ndjson line and return ink (as np array) and classname.""" - sample = json.loads(ndjson_line) - class_name = sample["word"] - if not class_name: - print ("Empty classname") - return None, None - inkarray = sample["drawing"] - stroke_lengths = [len(stroke[0]) for stroke in inkarray] - total_points = sum(stroke_lengths) - np_ink = np.zeros((total_points, 3), dtype=np.float32) - current_t = 0 - if not inkarray: - print("Empty inkarray") - return None, None - for stroke in inkarray: - if len(stroke[0]) != len(stroke[1]): - print("Inconsistent number of x and y coordinates.") - return None, None - for i in [0, 1]: - np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i] - current_t += len(stroke[0]) - np_ink[current_t - 1, 2] = 1 # stroke_end - # Preprocessing. - # 1. Size normalization. - lower = np.min(np_ink[:, 0:2], axis=0) - upper = np.max(np_ink[:, 0:2], axis=0) - scale = upper - lower - scale[scale == 0] = 1 - np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale - # 2. Compute deltas. - np_ink[1:, 0:2] -= np_ink[0:-1, 0:2] - np_ink = np_ink[1:, :] - return np_ink, class_name - - -def convert_data(trainingdata_dir, - observations_per_class, - output_file, - classnames, - output_shards=10, - offset=0): - """Convert training data from ndjson files into tf.Example in tf.Record. - - Args: - trainingdata_dir: path to the directory containin the training data. - The training data is stored in that directory as ndjson files. - observations_per_class: the number of items to load per class. - output_file: path where to write the output. 
- classnames: array with classnames - is auto created if not passed in. - output_shards: the number of shards to write the output in. - offset: the number of items to skip at the beginning of each file. - - Returns: - classnames: the class names as strings. classnames[classes[i]] is the - textual representation of the class of the i-th data point. - """ - - def _pick_output_shard(): - return random.randint(0, output_shards - 1) - - file_handles = [] - # Open all input files. - for filename in sorted(tf.gfile.ListDirectory(trainingdata_dir)): - if not filename.endswith(".ndjson"): - print("Skipping", filename) - continue - file_handles.append( - tf.gfile.GFile(os.path.join(trainingdata_dir, filename), "r")) - if offset: # Fast forward all files to skip the offset. - count = 0 - for _ in file_handles[-1]: - count += 1 - if count == offset: - break - - writers = [] - for i in range(FLAGS.output_shards): - writers.append( - tf.python_io.TFRecordWriter("%s-%05i-of-%05i" % (output_file, i, - output_shards))) - - reading_order = list(range(len(file_handles))) * observations_per_class - random.shuffle(reading_order) - - for c in reading_order: - line = file_handles[c].readline() - ink = None - while ink is None: - ink, class_name = parse_line(line) - if ink is None: - print ("Couldn't parse ink from '" + line + "'.") - if class_name not in classnames: - classnames.append(class_name) - features = {} - features["class_index"] = tf.train.Feature(int64_list=tf.train.Int64List( - value=[classnames.index(class_name)])) - features["ink"] = tf.train.Feature(float_list=tf.train.FloatList( - value=ink.flatten())) - features["shape"] = tf.train.Feature(int64_list=tf.train.Int64List( - value=ink.shape)) - f = tf.train.Features(feature=features) - example = tf.train.Example(features=f) - writers[_pick_output_shard()].write(example.SerializeToString()) - - # Close all files - for w in writers: - w.close() - for f in file_handles: - f.close() - # Write the class list. 
- with tf.gfile.GFile(output_file + ".classes", "w") as f: - for class_name in classnames: - f.write(class_name + "\n") - return classnames - - -def main(argv): - del argv - classnames = convert_data( - FLAGS.ndjson_path, - FLAGS.train_observations_per_class, - os.path.join(FLAGS.output_path, "training.tfrecord"), - classnames=[], - output_shards=FLAGS.output_shards, - offset=0) - convert_data( - FLAGS.ndjson_path, - FLAGS.eval_observations_per_class, - os.path.join(FLAGS.output_path, "eval.tfrecord"), - classnames=classnames, - output_shards=FLAGS.output_shards, - offset=FLAGS.train_observations_per_class) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--ndjson_path", - type=str, - default="", - help="Directory where the ndjson files are stored.") - parser.add_argument( - "--output_path", - type=str, - default="", - help="Directory where to store the output TFRecord files.") - parser.add_argument( - "--train_observations_per_class", - type=int, - default=10000, - help="How many items per class to load for training.") - parser.add_argument( - "--eval_observations_per_class", - type=int, - default=1000, - help="How many items per class to load for evaluation.") - parser.add_argument( - "--output_shards", - type=int, - default=10, - help="Number of shards for the output.") - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tutorials/rnn/quickdraw/train_model.py b/tutorials/rnn/quickdraw/train_model.py deleted file mode 100644 index f98d8202355..00000000000 --- a/tutorials/rnn/quickdraw/train_model.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -r"""Binary for training a RNN-based classifier for the Quick, Draw! data. - -python train_model.py \ - --training_data train_data \ - --eval_data eval_data \ - --model_dir /tmp/quickdraw_model/ \ - --cell_type cudnn_lstm - -When running on GPUs using --cell_type cudnn_lstm is much faster. - -The expected performance is ~75% in 1.5M steps with the default configuration. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import argparse -import ast -import functools -import sys - -import tensorflow as tf - - -def get_num_classes(): - classes = [] - with tf.gfile.GFile(FLAGS.classes_file, "r") as f: - classes = [x for x in f] - num_classes = len(classes) - return num_classes - - -def get_input_fn(mode, tfrecord_pattern, batch_size): - """Creates an input_fn that stores all the data in memory. - - Args: - mode: one of tf.contrib.learn.ModeKeys.{TRAIN, INFER, EVAL} - tfrecord_pattern: path to a TF record file created using create_dataset.py. - batch_size: the batch size to output. - - Returns: - A valid input_fn for the model estimator. 
- """ - - def _parse_tfexample_fn(example_proto, mode): - """Parse a single record which is expected to be a tensorflow.Example.""" - feature_to_type = { - "ink": tf.VarLenFeature(dtype=tf.float32), - "shape": tf.FixedLenFeature([2], dtype=tf.int64) - } - if mode != tf.estimator.ModeKeys.PREDICT: - # The labels won't be available at inference time, so don't add them - # to the list of feature_columns to be read. - feature_to_type["class_index"] = tf.FixedLenFeature([1], dtype=tf.int64) - - parsed_features = tf.parse_single_example(example_proto, feature_to_type) - labels = None - if mode != tf.estimator.ModeKeys.PREDICT: - labels = parsed_features["class_index"] - parsed_features["ink"] = tf.sparse_tensor_to_dense(parsed_features["ink"]) - return parsed_features, labels - - def _input_fn(): - """Estimator `input_fn`. - - Returns: - A tuple of: - - Dictionary of string feature name to `Tensor`. - - `Tensor` of target labels. - """ - dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern) - if mode == tf.estimator.ModeKeys.TRAIN: - dataset = dataset.shuffle(buffer_size=10) - dataset = dataset.repeat() - # Preprocesses 10 files concurrently and interleaves records from each file. - dataset = dataset.interleave( - tf.data.TFRecordDataset, - cycle_length=10, - block_length=1) - dataset = dataset.map( - functools.partial(_parse_tfexample_fn, mode=mode), - num_parallel_calls=10) - dataset = dataset.prefetch(10000) - if mode == tf.estimator.ModeKeys.TRAIN: - dataset = dataset.shuffle(buffer_size=1000000) - # Our inputs are variable length, so pad them. - dataset = dataset.padded_batch( - batch_size, padded_shapes=dataset.output_shapes) - features, labels = dataset.make_one_shot_iterator().get_next() - return features, labels - - return _input_fn - - -def model_fn(features, labels, mode, params): - """Model function for RNN classifier. 
- - This function sets up a neural network which applies convolutional layers (as - configured with params.num_conv and params.conv_len) to the input. - The output of the convolutional layers is given to LSTM layers (as configured - with params.num_layers and params.num_nodes). - The final state of the all LSTM layers are concatenated and fed to a fully - connected layer to obtain the final classification scores. - - Args: - features: dictionary with keys: inks, lengths. - labels: one hot encoded classes - mode: one of tf.estimator.ModeKeys.{TRAIN, INFER, EVAL} - params: a parameter dictionary with the following keys: num_layers, - num_nodes, batch_size, num_conv, conv_len, num_classes, learning_rate. - - Returns: - ModelFnOps for Estimator API. - """ - - def _get_input_tensors(features, labels): - """Converts the input dict into inks, lengths, and labels tensors.""" - # features[ink] is a sparse tensor that is [8, batch_maxlen, 3] - # inks will be a dense tensor of [8, maxlen, 3] - # shapes is [batchsize, 2] - shapes = features["shape"] - # lengths will be [batch_size] - lengths = tf.squeeze( - tf.slice(shapes, begin=[0, 0], size=[params.batch_size, 1])) - inks = tf.reshape(features["ink"], [params.batch_size, -1, 3]) - if labels is not None: - labels = tf.squeeze(labels) - return inks, lengths, labels - - def _add_conv_layers(inks, lengths): - """Adds convolution layers.""" - convolved = inks - for i in range(len(params.num_conv)): - convolved_input = convolved - if params.batch_norm: - convolved_input = tf.layers.batch_normalization( - convolved_input, - training=(mode == tf.estimator.ModeKeys.TRAIN)) - # Add dropout layer if enabled and not first convolution layer. 
- if i > 0 and params.dropout: - convolved_input = tf.layers.dropout( - convolved_input, - rate=params.dropout, - training=(mode == tf.estimator.ModeKeys.TRAIN)) - convolved = tf.layers.conv1d( - convolved_input, - filters=params.num_conv[i], - kernel_size=params.conv_len[i], - activation=None, - strides=1, - padding="same", - name="conv1d_%d" % i) - return convolved, lengths - - def _add_regular_rnn_layers(convolved, lengths): - """Adds RNN layers.""" - if params.cell_type == "lstm": - cell = tf.nn.rnn_cell.BasicLSTMCell - elif params.cell_type == "block_lstm": - cell = tf.contrib.rnn.LSTMBlockCell - cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)] - cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)] - if params.dropout > 0.0: - cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw] - cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw] - outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn( - cells_fw=cells_fw, - cells_bw=cells_bw, - inputs=convolved, - sequence_length=lengths, - dtype=tf.float32, - scope="rnn_classification") - return outputs - - def _add_cudnn_rnn_layers(convolved): - """Adds CUDNN LSTM layers.""" - # Convolutions output [B, L, Ch], while CudnnLSTM is time-major. - convolved = tf.transpose(convolved, [1, 0, 2]) - lstm = tf.contrib.cudnn_rnn.CudnnLSTM( - num_layers=params.num_layers, - num_units=params.num_nodes, - dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0, - direction="bidirectional") - outputs, _ = lstm(convolved) - # Convert back from time-major outputs to batch-major outputs. 
- outputs = tf.transpose(outputs, [1, 0, 2]) - return outputs - - def _add_rnn_layers(convolved, lengths): - """Adds recurrent neural network layers depending on the cell type.""" - if params.cell_type != "cudnn_lstm": - outputs = _add_regular_rnn_layers(convolved, lengths) - else: - outputs = _add_cudnn_rnn_layers(convolved) - # outputs is [batch_size, L, N] where L is the maximal sequence length and N - # the number of nodes in the last layer. - mask = tf.tile( - tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2), - [1, 1, tf.shape(outputs)[2]]) - zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs)) - outputs = tf.reduce_sum(zero_outside, axis=1) - return outputs - - def _add_fc_layers(final_state): - """Adds a fully connected layer.""" - return tf.layers.dense(final_state, params.num_classes) - - # Build the model. - inks, lengths, labels = _get_input_tensors(features, labels) - convolved, lengths = _add_conv_layers(inks, lengths) - final_state = _add_rnn_layers(convolved, lengths) - logits = _add_fc_layers(final_state) - # Add the loss. - cross_entropy = tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=labels, logits=logits)) - # Add the optimizer. - train_op = tf.contrib.layers.optimize_loss( - loss=cross_entropy, - global_step=tf.train.get_global_step(), - learning_rate=params.learning_rate, - optimizer="Adam", - # some gradient clipping stabilizes training in the beginning. - clip_gradients=params.gradient_clipping_norm, - summaries=["learning_rate", "loss", "gradients", "gradient_norm"]) - # Compute current predictions. 
- predictions = tf.argmax(logits, axis=1) - return tf.estimator.EstimatorSpec( - mode=mode, - predictions={"logits": logits, "predictions": predictions}, - loss=cross_entropy, - train_op=train_op, - eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)}) - - -def create_estimator_and_specs(run_config): - """Creates an Experiment configuration based on the estimator and input fn.""" - model_params = tf.contrib.training.HParams( - num_layers=FLAGS.num_layers, - num_nodes=FLAGS.num_nodes, - batch_size=FLAGS.batch_size, - num_conv=ast.literal_eval(FLAGS.num_conv), - conv_len=ast.literal_eval(FLAGS.conv_len), - num_classes=get_num_classes(), - learning_rate=FLAGS.learning_rate, - gradient_clipping_norm=FLAGS.gradient_clipping_norm, - cell_type=FLAGS.cell_type, - batch_norm=FLAGS.batch_norm, - dropout=FLAGS.dropout) - - estimator = tf.estimator.Estimator( - model_fn=model_fn, - config=run_config, - params=model_params) - - train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn( - mode=tf.estimator.ModeKeys.TRAIN, - tfrecord_pattern=FLAGS.training_data, - batch_size=FLAGS.batch_size), max_steps=FLAGS.steps) - - eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn( - mode=tf.estimator.ModeKeys.EVAL, - tfrecord_pattern=FLAGS.eval_data, - batch_size=FLAGS.batch_size)) - - return estimator, train_spec, eval_spec - - -def main(unused_args): - estimator, train_spec, eval_spec = create_estimator_and_specs( - run_config=tf.estimator.RunConfig( - model_dir=FLAGS.model_dir, - save_checkpoints_secs=300, - save_summary_steps=100)) - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--training_data", - type=str, - default="", - help="Path to training data (tf.Example in TFRecord format)") - parser.add_argument( - "--eval_data", - type=str, - default="", - help="Path to evaluation data 
(tf.Example in TFRecord format)") - parser.add_argument( - "--classes_file", - type=str, - default="", - help="Path to a file with the classes - one class per line") - parser.add_argument( - "--num_layers", - type=int, - default=3, - help="Number of recurrent neural network layers.") - parser.add_argument( - "--num_nodes", - type=int, - default=128, - help="Number of node per recurrent network layer.") - parser.add_argument( - "--num_conv", - type=str, - default="[48, 64, 96]", - help="Number of conv layers along with number of filters per layer.") - parser.add_argument( - "--conv_len", - type=str, - default="[5, 5, 3]", - help="Length of the convolution filters.") - parser.add_argument( - "--cell_type", - type=str, - default="lstm", - help="Cell type used for rnn layers: cudnn_lstm, lstm or block_lstm.") - parser.add_argument( - "--batch_norm", - type="bool", - default="False", - help="Whether to enable batch normalization or not.") - parser.add_argument( - "--learning_rate", - type=float, - default=0.0001, - help="Learning rate used for training.") - parser.add_argument( - "--gradient_clipping_norm", - type=float, - default=9.0, - help="Gradient clipping norm used during training.") - parser.add_argument( - "--dropout", - type=float, - default=0.3, - help="Dropout used for convolutions and bidi lstm layers.") - parser.add_argument( - "--steps", - type=int, - default=100000, - help="Number of training steps.") - parser.add_argument( - "--batch_size", - type=int, - default=8, - help="Batch size to use for training/evaluation.") - parser.add_argument( - "--model_dir", - type=str, - default="", - help="Path for storing the model checkpoints.") - parser.add_argument( - "--self_test", - type="bool", - default="False", - help="Whether to enable batch normalization or not.") - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)