# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Interface for the policy that the agents use for navigation."""

import abc

import tensorflow as tf
from absl import logging

import embedders
from envs import task_env

slim = tf.contrib.slim


def _print_debug_ios(history, goal, output):
  """Prints sizes of history, goal and outputs."""
  if history is not None:
    shape = history.get_shape().as_list()
    # logging.info('history embedding shape ')
    # logging.info(shape)
    if len(shape) != 3:
      raise ValueError('history Tensor must have rank=3')
  if goal is not None:
    logging.info('goal embedding shape ')
    logging.info(goal.get_shape().as_list())
  if output is not None:
    logging.info('targets shape ')
    logging.info(output.get_shape().as_list())


class Policy(object):
  """Represents the policy of the agent for navigation tasks.

  Instantiates a policy that takes embedders for each modality and builds a
  model to infer the actions.
  """
  __metaclass__ = abc.ABCMeta

  def __init__(self, embedders_dict, action_size):
    """Instantiates the policy.

    Args:
      embedders_dict: Dictionary of embedders for different modalities. Keys
        should be identical to the keys of the observation modalities.
      action_size: Number of possible actions.
    """
    self._embedders = embedders_dict
    self._action_size = action_size

  @abc.abstractmethod
  def build(self, observations, prev_state):
    """Builds the model that represents the policy of the agent.

    Args:
      observations: Dictionary of observations from different modalities. Keys
        are the names of the modalities.
      prev_state: The tensor of the previous state of the model. Should be set
        to None if the policy is stateless.

    Returns:
      Tuple of (action, state) where action is the action logits and state is
      the state of the model after taking a new observation.
    """
    raise NotImplementedError(
        'Needs implementation as part of Policy interface')
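
# A minimal illustration of the Policy interface (a sketch only; the
# UniformPolicy class below is hypothetical and is not used anywhere in this
# module):
#
#   class UniformPolicy(Policy):
#     """Stateless policy that assigns equal logits to every action."""
#
#     def build(self, observations, prev_state):
#       batch_size = tf.shape(list(observations.values())[0])[0]
#       logits = tf.zeros([batch_size, self._action_size])
#       return logits, None  # Stateless, so the returned state is None.
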

class LSTMPolicy(Policy):
  """Represents the implementation of the LSTM based policy.

  The architecture of the model is as follows. It embeds all the observations
  using the embedders and concatenates the embeddings of all the modalities.
  The concatenated embeddings are fed through two fully connected layers. The
  LSTM takes the features from the fully connected layers together with the
  previous action and the success bit of the previous action. The value for
  each action is predicted afterwards. Although the class name has the word
  LSTM in it, it also supports a mode that builds the network without LSTM,
  just for comparison purposes.
  """

  def __init__(self,
               modality_names,
               embedders_dict,
               action_size,
               params,
               max_episode_length,
               feedforward_mode=False):
    """Instantiates the LSTM policy.

    Args:
      modality_names: List of modality names. Makes sure the ordering in the
        concatenation remains the same as the modality_names list. Each
        modality needs to be in the embedders_dict.
      embedders_dict: Dictionary of embedders for different modalities. Keys
        should be identical to the keys of the observation modalities. Values
        should be instances of the Embedder class. All the observations except
        PREV_ACTION require an embedder.
      action_size: Number of possible actions.
      params: An HParams object that contains the hyperparameters for the
        policy network.
      max_episode_length: Integer specifying the maximum length of each
        episode.
      feedforward_mode: If True, it does not add the LSTM to the model. It
        should only be set to True for comparison between LSTM and feedforward
        models.
    """
    super(LSTMPolicy, self).__init__(embedders_dict, action_size)

    self._modality_names = modality_names

    self._lstm_state_size = params.lstm_state_size
    self._fc_channels = params.fc_channels
    self._weight_decay = params.weight_decay
    self._target_embedding_size = params.target_embedding_size
    self._max_episode_length = max_episode_length
    self._feedforward_mode = feedforward_mode

  def _build_lstm(self, encoded_inputs, prev_state, episode_length,
                  prev_action=None):
    """Builds an LSTM on top of the encoded inputs.

    If prev_action is not None, it is concatenated to the input of the LSTM.

    Args:
      encoded_inputs: The embedding of the observations and goal.
      prev_state: Previous state of the LSTM.
      episode_length: The tensor that contains the length of the sequence for
        each element of the batch.
      prev_action: Tensor of the previously chosen action plus an additional
        bit that indicates whether the previous action was successful or not.

    Returns:
      A tuple of (lstm output, lstm state).
    """
    # Adding prev action and success in addition to the embeddings of the
    # modalities.
    if prev_action is not None:
      encoded_inputs = tf.concat([encoded_inputs, prev_action], axis=-1)

    with tf.variable_scope('LSTM'):
      lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_state_size)
      if prev_state is None:
        # If prev state is set to None, a state of all zeros will be
        # passed as a previous value for the cell. Should be used for the
        # first step of each episode.
        tf_prev_state = lstm_cell.zero_state(
            encoded_inputs.get_shape().as_list()[0], dtype=tf.float32)
      else:
        tf_prev_state = tf.nn.rnn_cell.LSTMStateTuple(prev_state[0],
                                                      prev_state[1])

      lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
          cell=lstm_cell,
          inputs=encoded_inputs,
          sequence_length=episode_length,
          initial_state=tf_prev_state,
          dtype=tf.float32,
      )
    lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])
    return lstm_outputs, lstm_state

  def build(self, observations, prev_state):
    """Builds the model that represents the policy of the agent.

    Args:
      observations: Dictionary of observations from different modalities. Keys
        are the names of the modalities. Observations should have the
        following key-values.
        observations[task_env.ModalityTypes.GOAL]: One-hot tensor that
          indicates the semantic category of the goal. The shape should be
          (batch_size x max_sequence_length x number of goal categories).
        observations[task_env.ModalityTypes.PREV_ACTION]: Has action_size + 1
          elements where the first action_size numbers are the one-hot vector
          of the previous action and the last element indicates whether the
          previous action was successful or not. If
          task_env.ModalityTypes.PREV_ACTION is not in the observations, it
          will not be used in the policy.
      prev_state: Previous state of the model. It should be a tuple of (c, h)
        where c and h are the previous cell value and hidden state of the
        LSTM. Each element of the tuple has shape of
        (batch_size x lstm_cell_size). If it is set to None, then it
        initializes the state of the LSTM with all zeros.

    Returns:
      Tuple of (action, state) where action is the action logits and state is
      the state of the model after taking a new observation.

    Raises:
      ValueError: If any of the modality names is not in observations or
        embedders_dict.
      ValueError: If the goal is not in the observations.
    """
    for modality_name in self._modality_names:
      if modality_name not in observations:
        raise ValueError('modality name does not exist in observations: {} not '
                         'in {}'.format(modality_name, observations.keys()))
      if modality_name not in self._embedders:
        if modality_name == task_env.ModalityTypes.PREV_ACTION:
          continue
        raise ValueError('modality name does not have corresponding embedder'
                         ' {} not in {}'.format(modality_name,
                                                self._embedders.keys()))

    if task_env.ModalityTypes.GOAL not in observations:
      raise ValueError('goal should be provided in the observations')

    goal = observations[task_env.ModalityTypes.GOAL]
    prev_action = None
    if task_env.ModalityTypes.PREV_ACTION in observations:
      prev_action = observations[task_env.ModalityTypes.PREV_ACTION]

    with tf.variable_scope('policy'):
      with slim.arg_scope(
          [slim.fully_connected],
          activation_fn=tf.nn.relu,
          weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
          weights_regularizer=slim.l2_regularizer(self._weight_decay)):
        all_inputs = []

        # Concatenating the embedding of each modality by applying the
        # embedders to the corresponding observations.
        def embed(name):
          with tf.variable_scope('embed_{}'.format(name)):
            # logging.info('Policy uses embedding %s', name)
            return self._embedders[name].build(observations[name])

        all_inputs = [
            embed(x)
            for x in self._modality_names
            if x != task_env.ModalityTypes.PREV_ACTION
        ]

        # Computing goal embedding.
        shape = goal.get_shape().as_list()
        with tf.variable_scope('embed_goal'):
          encoded_goal = tf.reshape(goal, [shape[0] * shape[1], -1])
          encoded_goal = slim.fully_connected(encoded_goal,
                                              self._target_embedding_size)
          encoded_goal = tf.reshape(encoded_goal, [shape[0], shape[1], -1])
        all_inputs.append(encoded_goal)

        # Concatenating all the modalities and goal.
        all_inputs = tf.concat(all_inputs, axis=-1, name='concat_embeddings')

        shape = all_inputs.get_shape().as_list()
        all_inputs = tf.reshape(all_inputs, [shape[0] * shape[1], shape[2]])

        # Applying fully connected layers.
        encoded_inputs = slim.fully_connected(all_inputs, self._fc_channels)
        encoded_inputs = slim.fully_connected(encoded_inputs,
                                              self._fc_channels)

        if not self._feedforward_mode:
          encoded_inputs = tf.reshape(encoded_inputs,
                                      [shape[0], shape[1], self._fc_channels])
          lstm_outputs, lstm_state = self._build_lstm(
              encoded_inputs=encoded_inputs,
              prev_state=prev_state,
              episode_length=tf.ones((shape[0],), dtype=tf.float32) *
              self._max_episode_length,
              prev_action=prev_action,
          )
        else:
          # If feedforward_mode=True, bypass the LSTM computation entirely.
          lstm_outputs = encoded_inputs

        lstm_outputs = slim.fully_connected(lstm_outputs, self._fc_channels)
        action_values = slim.fully_connected(
            lstm_outputs, self._action_size, activation_fn=None)
        action_values = tf.reshape(action_values, [shape[0], shape[1], -1])
        if not self._feedforward_mode:
          return action_values, lstm_state
        else:
          return action_values, None
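
# Example usage of LSTMPolicy (a sketch; the hyperparameter values, the
# `image_embedder`/`goal_embedder` instances and the `observations` dictionary
# below are hypothetical placeholders that must match the embedders and tasks
# actually used):
#
#   params = tf.contrib.training.HParams(
#       lstm_state_size=512,
#       fc_channels=512,
#       weight_decay=1e-4,
#       target_embedding_size=32)
#   policy = LSTMPolicy(
#       modality_names=[task_env.ModalityTypes.IMAGE,
#                       task_env.ModalityTypes.GOAL,
#                       task_env.ModalityTypes.PREV_ACTION],
#       embedders_dict={
#           task_env.ModalityTypes.IMAGE: image_embedder,
#           task_env.ModalityTypes.GOAL: goal_embedder,
#       },
#       action_size=8,
#       params=params,
#       max_episode_length=20)
#   # prev_state=None initializes the LSTM state with zeros.
#   action_logits, lstm_state = policy.build(observations, prev_state=None)
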

class TaskPolicy(Policy):
  """A convenience abstract class providing functionality to deal with Tasks."""

  def __init__(self,
               task_config,
               model_hparams=None,
               embedder_hparams=None,
               train_hparams=None):
    """Constructs a policy which knows how to work with tasks (see tasks.py).

    It allows reading the task history, goal and outputs consistently with the
    task config.

    Args:
      task_config: An object of type tasks.TaskIOConfig (see tasks.py).
      model_hparams: A tf.HParams object containing parameters pertaining to
        the model (these are implementation specific).
      embedder_hparams: A tf.HParams object containing parameters pertaining
        to the history and goal embedders (these are implementation specific).
      train_hparams: A tf.HParams object containing parameters pertaining to
        training (these are implementation specific).
    """
    super(TaskPolicy, self).__init__(None, None)
    self._model_hparams = model_hparams
    self._embedder_hparams = embedder_hparams
    self._train_hparams = train_hparams
    self._task_config = task_config
    self._extra_train_ops = []

  @property
  def extra_train_ops(self):
    """Training ops in addition to the loss, e.g. batch norm updates.

    Returns:
      A list of tf ops.
    """
    return self._extra_train_ops

  def _embed_task_ios(self, streams):
    """Embeds a list of heterogeneous streams.

    These streams correspond to task history, goal and output. The number of
    streams is equal to the total number of history modalities, plus one for
    the goal if present, plus one for the output. If the number of history
    modalities is k, then the first k streams are the history.

    The used embedders depend on the input (or goal) types. If an input is an
    image, then a ResNet embedder is used, otherwise MLPEmbedder
    (see embedders.py).

    Args:
      streams: A list of Tensors.

    Returns:
      Three float Tensors history, goal, output. If there is no history, or no
      goal, then the corresponding returned values are None. The shape of the
      embedded history is batch_size x sequence_length x sum of all embedding
      dimensions for all history modalities. The shape of the goal is the goal
      embedding dimension.
    """
    # EMBED history.
    index = 0
    inps = []
    scopes = []
    for c in self._task_config.inputs:
      if c == task_env.ModalityTypes.IMAGE:
        scope_name = 'image_embedder/image'
        reuse = scope_name in scopes
        scopes.append(scope_name)
        with tf.variable_scope(scope_name, reuse=reuse):
          resnet_embedder = embedders.ResNet(self._embedder_hparams.image)
          image_embeddings = resnet_embedder.build(streams[index])
          # Collect batch norm update ops.
          if self._embedder_hparams.image.is_train:
            self._extra_train_ops += resnet_embedder.extra_train_ops
          inps.append(image_embeddings)
          index += 1
      else:
        scope_name = 'input_embedder/vector'
        reuse = scope_name in scopes
        scopes.append(scope_name)
        with tf.variable_scope(scope_name, reuse=reuse):
          input_vector_embedder = embedders.MLPEmbedder(
              layers=self._embedder_hparams.vector)
          vector_embedder = input_vector_embedder.build(streams[index])
          inps.append(vector_embedder)
          index += 1
    history = tf.concat(inps, axis=2) if inps else None

    # EMBED goal.
    goal = None
    if self._task_config.query is not None:
      scope_name = 'image_embedder/query'
      reuse = scope_name in scopes
      scopes.append(scope_name)
      with tf.variable_scope(scope_name, reuse=reuse):
        resnet_goal_embedder = embedders.ResNet(self._embedder_hparams.goal)
        goal = resnet_goal_embedder.build(streams[index])
        if self._embedder_hparams.goal.is_train:
          self._extra_train_ops += resnet_goal_embedder.extra_train_ops
        index += 1

    # Embed true targets if needed (tbd).
    true_target = streams[index]

    return history, goal, true_target

  @abc.abstractmethod
  def build(self, feeds, prev_state):
    pass
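
# The batch norm updates collected in `extra_train_ops` have to be run
# alongside the optimizer step when a TaskPolicy subclass is trained. A
# possible pattern (a sketch; `policy`, `feeds`, `targets`, `compute_loss` and
# `optimizer` are placeholders defined by the training code, not by this
# module):
#
#   predictions, _ = policy.build(feeds, prev_state=None)
#   loss = compute_loss(predictions, targets)
#   minimize_op = optimizer.minimize(loss)
#   train_op = tf.group(minimize_op, *policy.extra_train_ops)
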

class ReactivePolicy(TaskPolicy):
  """A policy which ignores history.

  It processes only the current observation (last element in history) and the
  goal to output a prediction.
  """

  def __init__(self, *args, **kwargs):
    super(ReactivePolicy, self).__init__(*args, **kwargs)

  # The current implementation ignores the prev_state as it is purely
  # reactive. It returns None for the current state.
  def build(self, feeds, prev_state):
    history, goal, _ = self._embed_task_ios(feeds)
    _print_debug_ios(history, goal, None)

    with tf.variable_scope('output_decoder'):
      # Concatenate the embeddings of the current observation and the goal.
      reactive_input = tf.concat([tf.squeeze(history[:, -1, :]), goal], axis=1)
      oconfig = self._task_config.output.shape
      assert len(oconfig) == 1
      decoder = embedders.MLPEmbedder(
          layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
      predictions = decoder.build(reactive_input)

    return predictions, None


class RNNPolicy(TaskPolicy):
  """A policy which takes into account the full history via RNN.

  The implementation might and will change.

  The history, together with the goal, is processed using a stacked LSTM. The
  output of the last LSTM step is used to produce a prediction. Currently,
  only a single step output is supported.
  """

  def __init__(self, lstm_hparams, *args, **kwargs):
    super(RNNPolicy, self).__init__(*args, **kwargs)
    self._lstm_hparams = lstm_hparams

  # The prev_state is ignored as, for now, the full history is specified as
  # the first element of the feeds. It might turn out to be beneficial to keep
  # the state as part of the policy object.
  def build(self, feeds, state):
    history, goal, _ = self._embed_task_ios(feeds)
    _print_debug_ios(history, goal, None)

    params = self._lstm_hparams
    cell = lambda: tf.contrib.rnn.BasicLSTMCell(params.cell_size)
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [cell() for _ in range(params.num_layers)])
    # history is of shape batch_size x seq_len x embedding_dimension.
    batch_size, seq_len, _ = tuple(history.get_shape().as_list())

    if state is None:
      state = stacked_lstm.zero_state(batch_size, tf.float32)
    for t in range(seq_len):
      if params.concat_goal_everywhere:
        lstm_input = tf.concat([tf.squeeze(history[:, t, :]), goal], axis=1)
      else:
        lstm_input = tf.squeeze(history[:, t, :])
      output, state = stacked_lstm(lstm_input, state)

    with tf.variable_scope('output_decoder'):
      oconfig = self._task_config.output.shape
      assert len(oconfig) == 1
      features = tf.concat([output, goal], axis=1)
      assert len(output.get_shape().as_list()) == 2
      assert len(goal.get_shape().as_list()) == 2
      decoder = embedders.MLPEmbedder(
          layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
      # Prediction is made from the last-step LSTM output and the goal.
      predictions = decoder.build(features)

    return predictions, state
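
# Example usage of the task policies (a sketch; the hparams objects and the
# `feeds` list below are hypothetical and must follow the ordering described
# in TaskPolicy._embed_task_ios: history streams first, then the goal if
# present, then the true target):
#
#   policy = RNNPolicy(
#       lstm_hparams,
#       task_config,
#       model_hparams=model_hparams,
#       embedder_hparams=embedder_hparams,
#       train_hparams=train_hparams)
#   predictions, state = policy.build(feeds, state=None)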