# Copyright 2017 Google, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Scripts for meta-optimization.""" from __future__ import print_function import os import tensorflow as tf import metaopt from learned_optimizer.optimizer import coordinatewise_rnn from learned_optimizer.optimizer import global_learning_rate from learned_optimizer.optimizer import hierarchical_rnn from learned_optimizer.optimizer import learning_rate_schedule from learned_optimizer.optimizer import trainable_adam from learned_optimizer.problems import problem_sets as ps from learned_optimizer.problems import problem_spec tf.app.flags.DEFINE_string("train_dir", "/tmp/lol/", """Directory to store parameters and results.""") tf.app.flags.DEFINE_integer("task", 0, """Task id of the replica running the training.""") tf.app.flags.DEFINE_integer("worker_tasks", 1, """Number of tasks in the worker job.""") tf.app.flags.DEFINE_integer("num_problems", 1000, """Number of sub-problems to run.""") tf.app.flags.DEFINE_integer("num_meta_iterations", 5, """Number of meta-iterations to optimize.""") tf.app.flags.DEFINE_integer("num_unroll_scale", 40, """The scale parameter of the exponential distribution from which the number of partial unrolls is drawn""") tf.app.flags.DEFINE_integer("min_num_unrolls", 1, """The minimum number of unrolls per problem.""") tf.app.flags.DEFINE_integer("num_partial_unroll_itr_scale", 200, """The scale parameter of the exponential distribution from which the number of iterations per unroll is drawn.""") tf.app.flags.DEFINE_integer("min_num_itr_partial_unroll", 50, """The minimum number of iterations for one unroll.""") tf.app.flags.DEFINE_string("optimizer", "HierarchicalRNN", """Which meta-optimizer to train.""") # CoordinatewiseRNN-specific flags tf.app.flags.DEFINE_integer("cell_size", 20, """Size of the RNN hidden state in each layer.""") tf.app.flags.DEFINE_integer("num_cells", 2, """Number of RNN layers.""") tf.app.flags.DEFINE_string("cell_cls", "GRUCell", """Type of RNN cell to use.""") # Metaoptimization parameters tf.app.flags.DEFINE_float("meta_learning_rate", 1e-6, """The learning rate for the meta-optimizer.""") tf.app.flags.DEFINE_float("gradient_clip_level", 1e4, """The level to clip gradients to.""") # Training set selection tf.app.flags.DEFINE_boolean("include_quadratic_problems", False, """Include non-noisy quadratic problems.""") tf.app.flags.DEFINE_boolean("include_noisy_quadratic_problems", True, """Include noisy quadratic problems.""") tf.app.flags.DEFINE_boolean("include_large_quadratic_problems", True, """Include very large quadratic problems.""") tf.app.flags.DEFINE_boolean("include_bowl_problems", True, """Include 2D bowl problems.""") tf.app.flags.DEFINE_boolean("include_softmax_2_class_problems", True, """Include 2-class logistic regression problems.""") tf.app.flags.DEFINE_boolean("include_noisy_softmax_2_class_problems", True, """Include noisy 2-class logistic regression problems.""") tf.app.flags.DEFINE_boolean("include_optimization_test_problems", True, """Include non-noisy versions of classic optimization test problems, e.g. Rosenbrock.""") tf.app.flags.DEFINE_boolean("include_noisy_optimization_test_problems", True, """Include gradient-noise versions of classic optimization test problems, e.g. Rosenbrock""") tf.app.flags.DEFINE_boolean("include_fully_connected_random_2_class_problems", True, """Include MLP problems for 2 classes.""") tf.app.flags.DEFINE_boolean("include_matmul_problems", True, """Include matrix multiplication problems.""") tf.app.flags.DEFINE_boolean("include_log_objective_problems", True, """Include problems where the objective is the log objective of another problem, e.g. Bowl.""") tf.app.flags.DEFINE_boolean("include_rescale_problems", True, """Include problems where the parameters are scaled version of the original parameters.""") tf.app.flags.DEFINE_boolean("include_norm_problems", True, """Include problems where the objective is the N-norm of another problem, e.g. Quadratic.""") tf.app.flags.DEFINE_boolean("include_sum_problems", True, """Include problems where the objective is the sum of the objectives of the subproblems that make up the problem parameters. Per-problem tensors are still independent of each other.""") tf.app.flags.DEFINE_boolean("include_sparse_gradient_problems", True, """Include problems where the gradient is set to 0 with some high probability.""") tf.app.flags.DEFINE_boolean("include_sparse_softmax_problems", False, """Include sparse softmax problems.""") tf.app.flags.DEFINE_boolean("include_one_hot_sparse_softmax_problems", False, """Include one-hot sparse softmax problems.""") tf.app.flags.DEFINE_boolean("include_noisy_bowl_problems", True, """Include noisy bowl problems.""") tf.app.flags.DEFINE_boolean("include_noisy_norm_problems", True, """Include noisy norm problems.""") tf.app.flags.DEFINE_boolean("include_noisy_sum_problems", True, """Include noisy sum problems.""") tf.app.flags.DEFINE_boolean("include_sum_of_quadratics_problems", False, """Include sum of quadratics problems.""") tf.app.flags.DEFINE_boolean("include_projection_quadratic_problems", False, """Include projection quadratic problems.""") tf.app.flags.DEFINE_boolean("include_outward_snake_problems", False, """Include outward snake problems.""") tf.app.flags.DEFINE_boolean("include_dependency_chain_problems", False, """Include dependency chain problems.""") tf.app.flags.DEFINE_boolean("include_min_max_well_problems", False, """Include min-max well problems.""") # Optimizer parameters: initialization and scale values tf.app.flags.DEFINE_float("min_lr", 1e-6, """The minimum initial learning rate.""") tf.app.flags.DEFINE_float("max_lr", 1e-2, """The maximum initial learning rate.""") # Optimizer parameters: small features. tf.app.flags.DEFINE_boolean("zero_init_lr_weights", True, """Whether to initialize the learning rate weights to 0 rather than the scaled random initialization used for other RNN variables.""") tf.app.flags.DEFINE_boolean("use_relative_lr", True, """Whether to use the relative learning rate as an input during training. Can only be used if learnable_decay is also True.""") tf.app.flags.DEFINE_boolean("use_extreme_indicator", False, """Whether to use the extreme indicator for learning rates as an input during training. Can only be used if learnable_decay is also True.""") tf.app.flags.DEFINE_boolean("use_log_means_squared", True, """Whether to track the log of the mean squared grads instead of the means squared grads.""") tf.app.flags.DEFINE_boolean("use_problem_lr_mean", True, """Whether to use the mean over all learning rates in the problem when calculating the relative learning rate.""") # Optimizer parameters: major features tf.app.flags.DEFINE_boolean("learnable_decay", True, """Whether to learn weights that dynamically modulate the input scale via RMS decay.""") tf.app.flags.DEFINE_boolean("dynamic_output_scale", True, """Whether to learn weights that dynamically modulate the output scale.""") tf.app.flags.DEFINE_boolean("use_log_objective", True, """Whether to use the log of the scaled objective rather than just the scaled obj for training.""") tf.app.flags.DEFINE_boolean("use_attention", False, """Whether to learn where to attend.""") tf.app.flags.DEFINE_boolean("use_second_derivatives", True, """Whether to use second derivatives.""") tf.app.flags.DEFINE_integer("num_gradient_scales", 4, """How many different timescales to keep for gradient history. If > 1, also learns a scale factor for gradient history.""") tf.app.flags.DEFINE_float("max_log_lr", 33, """The maximum log learning rate allowed.""") tf.app.flags.DEFINE_float("objective_training_max_multiplier", -1, """How much the objective can grow before training on this problem / param pair is terminated. Sets a max on the objective value when multiplied by the initial objective. If <= 0, not used.""") tf.app.flags.DEFINE_boolean("use_gradient_shortcut", True, """Whether to add a learned affine projection of the gradient to the update delta in addition to the gradient function computed by the RNN.""") tf.app.flags.DEFINE_boolean("use_lr_shortcut", False, """Whether to add the difference between the current learning rate and the desired learning rate to the RNN input.""") tf.app.flags.DEFINE_boolean("use_grad_products", True, """Whether to use gradient products in the input to the RNN. Only applicable when num_gradient_scales > 1.""") tf.app.flags.DEFINE_boolean("use_multiple_scale_decays", False, """Whether to use many-timescale scale decays.""") tf.app.flags.DEFINE_boolean("use_numerator_epsilon", False, """Whether to use epsilon in the numerator of the log objective.""") tf.app.flags.DEFINE_boolean("learnable_inp_decay", True, """Whether to learn input decay weight and bias.""") tf.app.flags.DEFINE_boolean("learnable_rnn_init", True, """Whether to learn RNN state initialization.""") FLAGS = tf.app.flags.FLAGS # The Size of the RNN hidden state in each layer: # [PerParam, PerTensor, Global]. The length of this list must be 1, 2, or 3. # If less than 3, the Global and/or PerTensor RNNs will not be created. HRNN_CELL_SIZES = [10, 20, 20] def register_optimizers(): opts = {} opts["CoordinatewiseRNN"] = coordinatewise_rnn.CoordinatewiseRNN opts["GlobalLearningRate"] = global_learning_rate.GlobalLearningRate opts["HierarchicalRNN"] = hierarchical_rnn.HierarchicalRNN opts["LearningRateSchedule"] = learning_rate_schedule.LearningRateSchedule opts["TrainableAdam"] = trainable_adam.TrainableAdam return opts def main(unused_argv): """Runs the main script.""" opts = register_optimizers() # Choose a set of problems to optimize. By default this includes quadratics, # 2-dimensional bowls, 2-class softmax problems, and non-noisy optimization # test problems (e.g. Rosenbrock, Beale) problems_and_data = [] if FLAGS.include_sparse_softmax_problems: problems_and_data.extend(ps.sparse_softmax_2_class_sparse_problems()) if FLAGS.include_one_hot_sparse_softmax_problems: problems_and_data.extend( ps.one_hot_sparse_softmax_2_class_sparse_problems()) if FLAGS.include_quadratic_problems: problems_and_data.extend(ps.quadratic_problems()) if FLAGS.include_noisy_quadratic_problems: problems_and_data.extend(ps.quadratic_problems_noisy()) if FLAGS.include_large_quadratic_problems: problems_and_data.extend(ps.quadratic_problems_large()) if FLAGS.include_bowl_problems: problems_and_data.extend(ps.bowl_problems()) if FLAGS.include_noisy_bowl_problems: problems_and_data.extend(ps.bowl_problems_noisy()) if FLAGS.include_softmax_2_class_problems: problems_and_data.extend(ps.softmax_2_class_problems()) if FLAGS.include_noisy_softmax_2_class_problems: problems_and_data.extend(ps.softmax_2_class_problems_noisy()) if FLAGS.include_optimization_test_problems: problems_and_data.extend(ps.optimization_test_problems()) if FLAGS.include_noisy_optimization_test_problems: problems_and_data.extend(ps.optimization_test_problems_noisy()) if FLAGS.include_fully_connected_random_2_class_problems: problems_and_data.extend(ps.fully_connected_random_2_class_problems()) if FLAGS.include_matmul_problems: problems_and_data.extend(ps.matmul_problems()) if FLAGS.include_log_objective_problems: problems_and_data.extend(ps.log_objective_problems()) if FLAGS.include_rescale_problems: problems_and_data.extend(ps.rescale_problems()) if FLAGS.include_norm_problems: problems_and_data.extend(ps.norm_problems()) if FLAGS.include_noisy_norm_problems: problems_and_data.extend(ps.norm_problems_noisy()) if FLAGS.include_sum_problems: problems_and_data.extend(ps.sum_problems()) if FLAGS.include_noisy_sum_problems: problems_and_data.extend(ps.sum_problems_noisy()) if FLAGS.include_sparse_gradient_problems: problems_and_data.extend(ps.sparse_gradient_problems()) if FLAGS.include_fully_connected_random_2_class_problems: problems_and_data.extend(ps.sparse_gradient_problems_mlp()) if FLAGS.include_min_max_well_problems: problems_and_data.extend(ps.min_max_well_problems()) if FLAGS.include_sum_of_quadratics_problems: problems_and_data.extend(ps.sum_of_quadratics_problems()) if FLAGS.include_projection_quadratic_problems: problems_and_data.extend(ps.projection_quadratic_problems()) if FLAGS.include_outward_snake_problems: problems_and_data.extend(ps.outward_snake_problems()) if FLAGS.include_dependency_chain_problems: problems_and_data.extend(ps.dependency_chain_problems()) # log directory logdir = os.path.join(FLAGS.train_dir, "{}_{}_{}_{}".format(FLAGS.optimizer, FLAGS.cell_cls, FLAGS.cell_size, FLAGS.num_cells)) # get the optimizer class and arguments optimizer_cls = opts[FLAGS.optimizer] assert len(HRNN_CELL_SIZES) in [1, 2, 3] optimizer_args = (HRNN_CELL_SIZES,) optimizer_kwargs = { "init_lr_range": (FLAGS.min_lr, FLAGS.max_lr), "learnable_decay": FLAGS.learnable_decay, "dynamic_output_scale": FLAGS.dynamic_output_scale, "cell_cls": getattr(tf.contrib.rnn, FLAGS.cell_cls), "use_attention": FLAGS.use_attention, "use_log_objective": FLAGS.use_log_objective, "num_gradient_scales": FLAGS.num_gradient_scales, "zero_init_lr_weights": FLAGS.zero_init_lr_weights, "use_log_means_squared": FLAGS.use_log_means_squared, "use_relative_lr": FLAGS.use_relative_lr, "use_extreme_indicator": FLAGS.use_extreme_indicator, "max_log_lr": FLAGS.max_log_lr, "obj_train_max_multiplier": FLAGS.objective_training_max_multiplier, "use_problem_lr_mean": FLAGS.use_problem_lr_mean, "use_gradient_shortcut": FLAGS.use_gradient_shortcut, "use_second_derivatives": FLAGS.use_second_derivatives, "use_lr_shortcut": FLAGS.use_lr_shortcut, "use_grad_products": FLAGS.use_grad_products, "use_multiple_scale_decays": FLAGS.use_multiple_scale_decays, "use_numerator_epsilon": FLAGS.use_numerator_epsilon, "learnable_inp_decay": FLAGS.learnable_inp_decay, "learnable_rnn_init": FLAGS.learnable_rnn_init, } optimizer_spec = problem_spec.Spec( optimizer_cls, optimizer_args, optimizer_kwargs) # make log directory tf.gfile.MakeDirs(logdir) is_chief = FLAGS.task == 0 # if this is a distributed run, make the chief run through problems in order select_random_problems = FLAGS.worker_tasks == 1 or not is_chief def num_unrolls(): return metaopt.sample_numiter(FLAGS.num_unroll_scale, FLAGS.min_num_unrolls) def num_partial_unroll_itrs(): return metaopt.sample_numiter(FLAGS.num_partial_unroll_itr_scale, FLAGS.min_num_itr_partial_unroll) # run it metaopt.train_optimizer( logdir, optimizer_spec, problems_and_data, FLAGS.num_problems, FLAGS.num_meta_iterations, num_unrolls, num_partial_unroll_itrs, learning_rate=FLAGS.meta_learning_rate, gradient_clip=FLAGS.gradient_clip_level, is_chief=is_chief, select_random_problems=select_random_problems, obj_train_max_multiplier=FLAGS.objective_training_max_multiplier, callbacks=[]) return 0 if __name__ == "__main__": tf.app.run()