# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Creates and runs `Estimator` for object detection model on TPUs.

This uses the TPUEstimator API to define and run a model in TRAIN/EVAL modes.
"""
# pylint: enable=line-too-long

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import flags
import tensorflow as tf

from object_detection import model_hparams
from object_detection import model_lib

tf.flags.DEFINE_bool('use_tpu', True, 'Use TPUs rather than plain CPUs')

# Cloud TPU Cluster Resolvers
flags.DEFINE_string(
    'gcp_project',
    default=None,
    help='Project name for the Cloud TPU-enabled project. If not specified, we '
    'will attempt to automatically detect the GCE project from metadata.')
flags.DEFINE_string(
    'tpu_zone',
    default=None,
    help='GCE zone where the Cloud TPU is located in. If not specified, we '
    'will attempt to automatically detect the GCE zone from metadata.')
flags.DEFINE_string(
    'tpu_name',
    default=None,
    help='Name of the Cloud TPU for Cluster Resolvers.')

flags.DEFINE_integer('num_shards', 8, 'Number of shards (TPU cores).')
flags.DEFINE_integer('iterations_per_loop', 100,
                     'Number of iterations per TPU training loop.')
# Only the 'train' and 'eval' modes are handled by main() below.
# NOTE(review): an earlier comment here described a 'train_and_eval' mode
# (evaluation after training, with the estimator saving a checkpoint every
# 10 minutes by default); no such mode is implemented in this file.
flags.DEFINE_string('mode', 'train', 'Mode to run: train, eval')
flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
                     'this is not provided, batch size is read from training '
                     'config.')

flags.DEFINE_string(
    'hparams_overrides', None, 'Comma-separated list of '
    'hyperparameters to override defaults.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_boolean('eval_training_data', False,
                     'If training data should be evaluated for this job.')
flags.DEFINE_integer('sample_1_of_n_eval_examples', 1, 'Will sample one of '
                     'every n eval input examples, where n is provided.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
                     'one of every n train input examples for evaluation, '
                     'where n is provided. This is only used if '
                     '`eval_training_data` is True.')
flags.DEFINE_string(
    'model_dir', None, 'Path to output model directory '
    'where event and checkpoint files will be written.')
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
                    'file.')

FLAGS = tf.flags.FLAGS

# Values of --mode that main() knows how to execute.
_SUPPORTED_MODES = ('train', 'eval')


def main(unused_argv):
  """Trains or continuously evaluates a detection model on a Cloud TPU.

  Resolves the TPU master from the `tpu_name`, `tpu_zone` and `gcp_project`
  flags, builds a TPU `RunConfig`, asks `model_lib` for the estimator and its
  input functions, and then dispatches on `--mode`:

    * 'train': runs `estimator.train` up to the configured number of steps.
    * 'eval': runs `model_lib.continuous_eval` on either the training data
      (when `--eval_training_data` is set) or the first eval input.

  Args:
    unused_argv: Positional command-line arguments left over after flag
      parsing; ignored.

  Raises:
    ValueError: If `--mode` is neither 'train' nor 'eval'.
  """
  # Fail fast on a bad mode, before any TPU connection or estimator is set up;
  # previously an unknown mode built everything and then silently did nothing.
  if FLAGS.mode not in _SUPPORTED_MODES:
    raise ValueError('Unsupported --mode %r; expected one of: %s.' %
                     (FLAGS.mode, ', '.join(_SUPPORTED_MODES)))

  tpu_cluster_resolver = (
      tf.contrib.cluster_resolver.TPUClusterResolver(
          tpu=[FLAGS.tpu_name],
          zone=FLAGS.tpu_zone,
          project=FLAGS.gcp_project))
  tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = tf.contrib.tpu.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))

  # Only forward a batch size when the flag was given, so that otherwise the
  # value from the training config is used (see the flag's help text).
  kwargs = {}
  if FLAGS.train_batch_size:
    kwargs['batch_size'] = FLAGS.train_batch_size

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples),
      use_tpu_estimator=True,
      use_tpu=FLAGS.use_tpu,
      num_shards=FLAGS.num_shards,
      save_final_config=FLAGS.mode == 'train',
      **kwargs)
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
  elif FLAGS.mode == 'eval':
    # Continuously evaluating.
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # Currently only a single eval input is allowed.
      input_fn = eval_input_fns[0]
    model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, train_steps,
                              name)


if __name__ == '__main__':
  # Required-flag validators are only checked when flags are parsed (or later
  # reassigned), so they must be registered before tf.app.run() parses argv.
  # Registering them inside main() would be too late to have any effect.
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.app.run()