# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Model is responsible for setting up the TensorFlow graph.

Creates policy and value networks. Also sets up all optimization ops,
including gradient ops, trust region ops, and value optimizers.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class Model(object):

  def __init__(self, env_spec, global_step,
               target_network_lag=0.95,
               sample_from='online',
               get_policy=None,
               get_baseline=None,
               get_objective=None,
               get_trust_region_p_opt=None,
               get_value_opt=None):
    self.env_spec = env_spec
    self.global_step = global_step
    self.inc_global_step = self.global_step.assign_add(1)
    self.target_network_lag = target_network_lag
    self.sample_from = sample_from

    self.policy = get_policy()
    self.baseline = get_baseline()
    self.objective = get_objective()
    self.baseline.eps_lambda = self.objective.eps_lambda  # TODO: do this better
    self.trust_region_policy_opt = get_trust_region_p_opt()
    self.value_opt = get_value_opt()

  def setup_placeholders(self):
    """Create the TensorFlow placeholders."""
    # summary placeholders
    self.avg_episode_reward = tf.placeholder(
        tf.float32, [], 'avg_episode_reward')
    self.greedy_episode_reward = tf.placeholder(
        tf.float32, [], 'greedy_episode_reward')

    # sampling placeholders
    self.internal_state = tf.placeholder(
        tf.float32, [None, self.policy.rnn_state_dim], 'internal_state')

    self.single_observation = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.int32, [None], 'obs%d' % i))
      elif self.env_spec.is_box(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
      else:
        assert False

    self.single_action = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.single_action.append(
            tf.placeholder(tf.int32, [None], 'act%d' % i))
      elif self.env_spec.is_box(action_type):
        self.single_action.append(
            tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
      else:
        assert False

    # training placeholders
    self.observations = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.observations.append(
            tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
      else:
        self.observations.append(
            tf.placeholder(tf.float32, [None, None, obs_dim],
                           'all_obs%d' % i))

    self.actions = []
    self.other_logits = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.actions.append(
            tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
      elif self.env_spec.is_box(action_type):
        self.actions.append(
            tf.placeholder(tf.float32, [None, None, action_dim],
                           'all_act%d' % i))
      self.other_logits.append(
          tf.placeholder(tf.float32, [None, None, None],
                         'other_logits%d' % i))

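    # NOTE: the training placeholders are laid out time-major: the leading
    # dimension is time and the second is batch (train_step checks that
    # len(rewards) == time_len - 1).  `pads` appears to mark padded timesteps
    # with 1.0, so (1 - pads) is used as the validity mask when averaging
    # losses and KL terms in setup().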
    self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
    self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
    self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
    self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                         'prev_log_probs')

  def setup(self, train=True):
    """Set up the TensorFlow graph."""
    self.setup_placeholders()

    tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)

    with tf.variable_scope('model', reuse=None):
      # policy network
      with tf.variable_scope('policy_net'):
        (self.policy_internal_states, self.logits, self.log_probs,
         self.entropies, self.self_kls) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)
        self.out_log_probs = sum(self.log_probs)
        self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
        self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
                       tf.reduce_sum(1 - self.pads))

      # value network
      with tf.variable_scope('value_net'):
        (self.values, self.regression_input,
         self.regression_weight) = self.baseline.get_values(
             self.observations, self.actions,
             self.policy_internal_states, self.logits)

      # target policy network
      with tf.variable_scope('target_policy_net'):
        (self.target_policy_internal_states, self.target_logits,
         self.target_log_probs, _, _) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)

      # target value network
      with tf.variable_scope('target_value_net'):
        (self.target_values, _, _) = self.baseline.get_values(
            self.observations, self.actions,
            self.target_policy_internal_states, self.target_logits)

    # construct copy op online --> target
    all_vars = tf.trainable_variables()
    online_vars = [p for p in all_vars
                   if '/policy_net' in p.name or '/value_net' in p.name]
    target_vars = [p for p in all_vars
                   if 'target_policy_net' in p.name
                   or 'target_value_net' in p.name]
    online_vars.sort(key=lambda p: p.name)
    target_vars.sort(key=lambda p: p.name)

    aa = self.target_network_lag
    self.copy_op = tf.group(*[
        target_p.assign(aa * target_p + (1 - aa) * online_p)
        for online_p, target_p in zip(online_vars, target_vars)])
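    # The op above nudges each target variable toward its online counterpart,
    # target <- lag * target + (1 - lag) * online, i.e. an exponential moving
    # average of the online weights.  The pairing appears to rely on the
    # online and target scopes creating variables in matching name order
    # (hence the sorts above).
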
    if train:
      # evaluate objective
      (self.loss, self.raw_loss, self.regression_target,
       self.gradient_ops, self.summary) = self.objective.get(
           self.rewards, self.pads,
           self.values[:-1, :],
           self.values[-1, :] * (1 - self.terminated),
           self.log_probs, self.prev_log_probs, self.target_log_probs,
           self.entropies, self.logits,
           self.target_values[:-1, :],
           self.target_values[-1, :] * (1 - self.terminated))
      self.regression_target = tf.reshape(self.regression_target, [-1])

      self.policy_vars = [
          v for v in tf.trainable_variables() if '/policy_net' in v.name]
      self.value_vars = [
          v for v in tf.trainable_variables() if '/value_net' in v.name]

      # trust region optimizer
      if self.trust_region_policy_opt is not None:
        with tf.variable_scope('trust_region_policy', reuse=None):
          avg_self_kl = (
              tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
              tf.reduce_sum(1 - self.pads))
          self.trust_region_policy_opt.setup(
              self.policy_vars, self.raw_loss, avg_self_kl, self.avg_kl)

      # value optimizer
      if self.value_opt is not None:
        with tf.variable_scope('trust_region_value', reuse=None):
          self.value_opt.setup(
              self.value_vars,
              tf.reshape(self.values[:-1, :], [-1]),
              self.regression_target,
              tf.reshape(self.pads, [-1]),
              self.regression_input, self.regression_weight)

    # we re-use variables for the sampling operations
    with tf.variable_scope('model', reuse=True):
      scope = ('target_policy_net' if self.sample_from == 'target'
               else 'policy_net')
      with tf.variable_scope(scope):
        self.next_internal_state, self.sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action)
        self.greedy_next_internal_state, self.greedy_sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action,
                                    greedy=True)

  def sample_step(self, sess, single_observation, internal_state,
                  single_action, greedy=False):
    """Sample a batch of steps from the policy."""
    if greedy:
      outputs = [self.greedy_next_internal_state,
                 self.greedy_sampled_actions]
    else:
      outputs = [self.next_internal_state, self.sampled_actions]

    feed_dict = {self.internal_state: internal_state}
    for action_place, action in zip(self.single_action, single_action):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.single_observation, single_observation):
      feed_dict[obs_place] = obs

    return sess.run(outputs, feed_dict=feed_dict)

  def train_step(self, sess, observations, internal_state, actions,
                 rewards, terminated, pads,
                 avg_episode_reward=0, greedy_episode_reward=0):
    """Train the networks using standard gradient descent."""
    outputs = [self.raw_loss, self.gradient_ops, self.summary]
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}

    time_len = None
    for action_place, action in zip(self.actions, actions):
      if time_len is None:
        time_len = len(action)
      assert time_len == len(action)
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      assert time_len == len(obs)
      feed_dict[obs_place] = obs
    assert len(rewards) == time_len - 1

    return sess.run(outputs, feed_dict=feed_dict)

  def trust_region_step(self, sess, observations, internal_state, actions,
                        rewards, terminated, pads,
                        avg_episode_reward=0, greedy_episode_reward=0):
    """Train the policy using a trust region step."""
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}
    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

    # record the log-probs and logits of the current policy; they are fed
    # back in as prev_log_probs / other_logits for the trust region update
    (prev_log_probs, prev_logits) = sess.run(
        [self.out_log_probs, self.logits], feed_dict=feed_dict)

    feed_dict[self.prev_log_probs] = prev_log_probs
    for other_logit, prev_logit in zip(self.other_logits, prev_logits):
      feed_dict[other_logit] = prev_logit

    # fit policy
    self.trust_region_policy_opt.optimize(sess, feed_dict)

    ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
    # insert None where train_step would return gradient_ops so that callers
    # can unpack both methods identically
    ret = [ret[0], None, ret[1]]
    return ret

  def fit_values(self, sess, observations, internal_state, actions,
                 rewards, terminated, pads):
    """Train the value network using the value-specific optimizer."""
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads}
    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

    # fit values
    if self.value_opt is None:
      raise ValueError('Specific value optimizer does not exist')
    self.value_opt.optimize(sess, feed_dict)
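
if __name__ == '__main__':
  # Minimal, self-contained sketch (not part of the training pipeline): it
  # only illustrates the moving-average update that `Model.copy_op` applies
  # to the target networks, using two scalar variables in place of the real
  # policy/value weights.  The names below are hypothetical and exist solely
  # for this demonstration.
  lag = 0.95  # mirrors the default `target_network_lag`
  online_weight = tf.Variable(1.0, name='demo_online_weight')
  target_weight = tf.Variable(0.0, name='demo_target_weight')
  demo_copy_op = target_weight.assign(
      lag * target_weight + (1 - lag) * online_weight)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
      # each run closes 5% of the remaining gap to the online value
      print('target weight:', sess.run(demo_copy_op))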