# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Script for evaluating a UVF agent. | |
To run locally: See run_eval.py | |
To run on borg: See train_eval.borg | |
""" | |
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow as tf
slim = tf.contrib.slim

import gin.tf
# pylint: disable=unused-import
import agent
import train
from utils import utils as uvf_utils
from utils import eval_utils
from environments import create_maze_env
# pylint: enable=unused-import

flags = tf.app.flags

flags.DEFINE_string('eval_dir', None,
                    'Directory for writing logs/summaries during eval.')
flags.DEFINE_string('checkpoint_dir', None,
                    'Directory containing checkpoints to eval.')
FLAGS = flags.FLAGS


def get_evaluate_checkpoint_fn(master, output_dir, eval_step_fns,
                               model_rollout_fn, gamma, max_steps_per_episode,
                               num_episodes_eval, num_episodes_videos,
                               tuner_hook, generate_videos,
                               generate_summaries, video_settings):
  """Returns a function that evaluates a given checkpoint.

  Args:
    master: BNS name of the TensorFlow master.
    output_dir: The output directory to which the metric summaries are written.
    eval_step_fns: A dictionary of functions, indexed by summary tag name,
      each returning a list of
      [state, action, reward, discount, transition_type] tensors.
    model_rollout_fn: Model rollout fn.
    gamma: Discount factor for the reward.
    max_steps_per_episode: Maximum steps to run each episode for.
    num_episodes_eval: Number of episodes to evaluate and average reward over.
    num_episodes_videos: Number of episodes to record for video.
    tuner_hook: A callable(average reward, global step) that updates a Vizier
      tuner trial.
    generate_videos: Whether to generate videos of the agent in action.
    generate_summaries: Whether to generate summaries.
    video_settings: Settings for generating videos of the agent.

  Returns:
    A function that evaluates a checkpoint.
  """
  sess = tf.Session(master, graph=tf.get_default_graph())
  sess.run(tf.global_variables_initializer())
  sess.run(tf.local_variables_initializer())
  summary_writer = tf.summary.FileWriter(output_dir)
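
  # The session and summary writer are created once here and captured by the
  # closure below, so evaluating many checkpoints reuses a single graph and
  # session instead of rebuilding them per checkpoint.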
  def evaluate_checkpoint(checkpoint_path):
    """Performs a one-time evaluation of the given checkpoint.

    Args:
      checkpoint_path: Checkpoint to evaluate.

    Returns:
      True if the evaluation process should stop.
    """
    restore_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        checkpoint_path,
        uvf_utils.get_all_vars(),
        ignore_missing_vars=True,
        reshape_variables=False)
    assert restore_fn is not None, 'cannot restore %s' % checkpoint_path
    restore_fn(sess)
    global_step = sess.run(slim.get_global_step())
    should_stop = False
    max_reward = -1e10
    max_meta_reward = -1e10
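
    # eval_step_fns maps a summary tag (e.g. 'eval_hrl') to a
    # (step_fn, env_base) pair; evaluate each pair in turn.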
    for eval_tag, (eval_step, env_base) in sorted(eval_step_fns.items()):
      if hasattr(env_base, 'set_sess'):
        env_base.set_sess(sess)  # set session

      if generate_summaries:
        tf.logging.info(
            '[%s] Computing average reward over %d episodes at global step %d.',
            eval_tag, num_episodes_eval, global_step)
        (average_reward, last_reward,
         average_meta_reward, last_meta_reward, average_success,
         states, actions) = eval_utils.compute_average_reward(
             sess, env_base, eval_step, gamma, max_steps_per_episode,
             num_episodes_eval)
        tf.logging.info('[%s] Average reward = %f', eval_tag, average_reward)
        tf.logging.info('[%s] Last reward = %f', eval_tag, last_reward)
        tf.logging.info('[%s] Average meta reward = %f', eval_tag,
                        average_meta_reward)
        tf.logging.info('[%s] Last meta reward = %f', eval_tag,
                        last_meta_reward)
        tf.logging.info('[%s] Average success = %f', eval_tag, average_success)

        if model_rollout_fn is not None:
          preds, model_losses = eval_utils.compute_model_loss(
              sess, model_rollout_fn, states, actions)
          for i, (pred, state, model_loss) in enumerate(
              zip(preds, states, model_losses)):
            tf.logging.info('[%s] Model rollout step %d: loss=%f', eval_tag, i,
                            model_loss)
            tf.logging.info('[%s] Model rollout step %d: pred=%s', eval_tag, i,
                            str(pred.tolist()))
            tf.logging.info('[%s] Model rollout step %d: state=%s', eval_tag, i,
                            str(state.tolist()))

        # Report the eval stats to the tuner.
        if average_reward > max_reward:
          max_reward = average_reward
        if average_meta_reward > max_meta_reward:
          max_meta_reward = average_meta_reward

        for (tag, value) in [('Reward/average_%s_reward', average_reward),
                             ('Reward/last_%s_reward', last_reward),
                             ('Reward/average_%s_meta_reward',
                              average_meta_reward),
                             ('Reward/last_%s_meta_reward', last_meta_reward),
                             ('Reward/average_%s_success', average_success)]:
          summary_str = tf.Summary(value=[
              tf.Summary.Value(
                  tag=tag % eval_tag,
                  simple_value=value)
          ])
          summary_writer.add_summary(summary_str, global_step)
          summary_writer.flush()
      if generate_videos or should_stop:
        # Do a manual reset before generating the video to see the initial
        # pose of the robot, towards which the reset controller is moving.
        if hasattr(env_base, '_gym_env'):
          tf.logging.info('Resetting before recording video')
          if hasattr(env_base._gym_env, 'reset_model'):
            env_base._gym_env.reset_model()  # pylint: disable=protected-access
          else:
            env_base._gym_env.wrapped_env.reset_model()
        video_filename = os.path.join(output_dir, 'videos',
                                      '%s_step_%d.mp4' % (eval_tag,
                                                          global_step))
        eval_utils.capture_video(sess, eval_step, env_base,
                                 max_steps_per_episode * num_episodes_videos,
                                 video_filename, video_settings,
                                 reset_every=max_steps_per_episode)

    should_stop = should_stop or (generate_summaries and tuner_hook and
                                  tuner_hook(max_reward, global_step))
    return bool(should_stop)

  return evaluate_checkpoint
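

# In this setup the critic output is treated as a predicted state delta, so a
# one-step forward model can be read off as
# next_state = state + critic(state, action); get_model_rollout below wires
# that up as a feed_dict-based callable.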
def get_model_rollout(uvf_agent, tf_env):
  """Model rollout function."""
  state_spec = tf_env.observation_spec()[0]
  action_spec = tf_env.action_spec()[0]
  state_ph = tf.placeholder(dtype=state_spec.dtype, shape=state_spec.shape)
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)

  merged_state = uvf_agent.merged_state(state_ph)
  diff_value = uvf_agent.critic_net(tf.expand_dims(merged_state, 0),
                                    tf.expand_dims(action_ph, 0))[0]
  diff_value = tf.cast(diff_value, dtype=state_ph.dtype)
  state_ph.shape.assert_is_compatible_with(diff_value.shape)
  next_state = state_ph + diff_value

  def model_rollout_fn(sess, state, action):
    return sess.run(next_state, feed_dict={state_ph: state, action_ph: action})

  return model_rollout_fn


def get_eval_step(uvf_agent,
                  state_preprocess,
                  tf_env,
                  action_fn,
                  meta_action_fn,
                  environment_steps,
                  num_episodes,
                  mode='eval'):
  """Get one-step policy/env stepping ops.

  Args:
    uvf_agent: A UVF agent.
    state_preprocess: A state preprocessor that maps raw observations to the
      representation used by the agent.
    tf_env: A TFEnvironment.
    action_fn: A function to produce actions given current state.
    meta_action_fn: A function to produce meta actions given current state.
    environment_steps: A variable to count the number of steps in the tf_env.
    num_episodes: A variable to count the number of episodes.
    mode: A string representing the mode, one of [train, explore, eval].

  Returns:
    A step_fn that, given a session, executes one environment step and
    returns the resulting [next_state, action, transition_type, reward,
    meta_reward, discount, context, state_repr] values.
  """
  tf_env.start_collect()
  state = tf_env.current_obs()
  action = action_fn(state, context=None)
  state_repr = state_preprocess(state)

  action_spec = tf_env.action_spec()
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
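  # The action is fed back through a placeholder rather than wired directly
  # into tf_env.step(), so the returned step_fn can first evaluate (or call)
  # the policy and then feed the concrete action into the step op.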
  with tf.control_dependencies([state]):
    transition_type, reward, discount = tf_env.step(action_ph)

  def increment_step():
    return environment_steps.assign_add(1)

  def increment_episode():
    return num_episodes.assign_add(1)

  def no_op_int():
    return tf.constant(0, dtype=tf.int64)

  step_cond = uvf_agent.step_cond_fn(state, action,
                                     transition_type,
                                     environment_steps, num_episodes)
  reset_episode_cond = uvf_agent.reset_episode_cond_fn(
      state, action,
      transition_type, environment_steps, num_episodes)
  reset_env_cond = uvf_agent.reset_env_cond_fn(state, action,
                                               transition_type,
                                               environment_steps, num_episodes)

  increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
  with tf.control_dependencies([increment_step_op]):
    increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
                                   no_op_int)

  with tf.control_dependencies([reward, discount]):
    next_state = tf_env.current_obs()
    next_state_repr = state_preprocess(next_state)

  with tf.control_dependencies([increment_episode_op]):
    post_reward, post_meta_reward = uvf_agent.cond_begin_episode_op(
        tf.logical_not(reset_episode_cond),
        [state, action_ph, reward, next_state,
         state_repr, next_state_repr],
        mode=mode, meta_action_fn=meta_action_fn)

  # Important: do manual reset after getting the final reward from the
  # unreset environment.
  with tf.control_dependencies([post_reward, post_meta_reward]):
    cond_reset_op = tf.cond(reset_env_cond,
                            tf_env.reset,
                            tf_env.current_time_step)

  # Add a dummy control dependency to force the reset_op to run.
  with tf.control_dependencies(cond_reset_op):
    post_reward, post_meta_reward = map(tf.identity,
                                        [post_reward, post_meta_reward])

  eval_step = [next_state, action_ph, transition_type, post_reward,
               post_meta_reward, discount, uvf_agent.context_vars, state_repr]
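
  # Two flavors of step_fn: if the policy produced a Python callable (it
  # needs the session itself), call it; otherwise treat `action` as a tensor,
  # clip it to the action spec, and evaluate it with sess.run.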
  if callable(action):
    def step_fn(sess):
      action_value = action(sess)
      return sess.run(eval_step, feed_dict={action_ph: action_value})
  else:
    action = uvf_utils.clip_to_spec(action, action_spec)

    def step_fn(sess):
      action_value = sess.run(action)
      return sess.run(eval_step, feed_dict={action_ph: action_value})

  return step_fn


def evaluate(checkpoint_dir,
             eval_dir,
             environment=None,
             num_bin_actions=3,
             agent_class=None,
             meta_agent_class=None,
             state_preprocess_class=None,
             gamma=1.0,
             num_episodes_eval=10,
             eval_interval_secs=60,
             max_number_of_evaluations=None,
             checkpoint_timeout=None,
             timeout_fn=None,
             tuner_hook=None,
             generate_videos=False,
             generate_summaries=True,
             num_episodes_videos=5,
             video_settings=None,
             eval_modes=('eval',),
             eval_model_rollout=False,
             policy_save_dir='policy',
             checkpoint_range=None,
             checkpoint_path=None,
             max_steps_per_episode=None,
             evaluate_nohrl=False):
  """Loads and repeatedly evaluates a checkpointed model at a set interval.

  Args:
    checkpoint_dir: The directory where the checkpoints reside.
    eval_dir: Directory to save the evaluation summary results.
    environment: A BaseEnvironment to evaluate.
    num_bin_actions: Number of bins for discretizing continuous actions.
    agent_class: An RL agent class.
    meta_agent_class: A Meta agent class.
    state_preprocess_class: A state preprocessor class.
    gamma: Discount factor for the reward.
    num_episodes_eval: Number of episodes to evaluate and average reward over.
    eval_interval_secs: The number of seconds between each evaluation run.
    max_number_of_evaluations: The max number of evaluations. If None the
      evaluation continues indefinitely.
    checkpoint_timeout: The maximum amount of time to wait between checkpoints.
      If left as `None`, then the process will wait indefinitely.
    timeout_fn: Optional function to call after a timeout.
    tuner_hook: A callable that takes the average reward and global step and
      updates a Vizier tuner trial.
    generate_videos: Whether to generate videos of the agent in action.
    generate_summaries: Whether to generate summaries.
    num_episodes_videos: Number of episodes to evaluate for generating videos.
    video_settings: Settings for generating videos of the agent.
    eval_modes: A tuple of eval modes.
    eval_model_rollout: Whether to evaluate model rollouts.
    policy_save_dir: Optional sub-directory where the policies are saved.
    checkpoint_range: Optional. If provided, evaluate all checkpoints in
      the range.
    checkpoint_path: Optional sub-directory specifying which checkpoint to
      evaluate. If None, will evaluate the most recent checkpoint.
    max_steps_per_episode: Maximum steps to run each episode for. Must be set.
    evaluate_nohrl: Whether to additionally evaluate the agent with its meta
      agent detached (the 'nohrl' variant).
  """
  tf_env = create_maze_env.TFPyEnvironment(environment)
  observation_spec = [tf_env.observation_spec()]
  action_spec = [tf_env.action_spec()]

  assert max_steps_per_episode, 'max_steps_per_episode must be set'
  assert agent_class.ACTION_TYPE == 'continuous', (
      'Only continuous-action agents are supported; got %s' %
      agent_class.ACTION_TYPE)
  if meta_agent_class is not None:
    assert agent_class.ACTION_TYPE == meta_agent_class.ACTION_TYPE
    with tf.variable_scope('meta_agent'):
      meta_agent = meta_agent_class(
          observation_spec,
          action_spec,
          tf_env,
      )
  else:
    meta_agent = None

  with tf.variable_scope('uvf_agent'):
    uvf_agent = agent_class(
        observation_spec,
        action_spec,
        tf_env,
    )
    uvf_agent.set_meta_agent(agent=meta_agent)

  with tf.variable_scope('state_preprocess'):
    state_preprocess = state_preprocess_class()

  # Run both actor and critic once to ensure the networks are initialized
  # and the gin configs will be saved.
  # pylint: disable=protected-access
  temp_states = tf.expand_dims(
      tf.zeros(
          dtype=uvf_agent._observation_spec.dtype,
          shape=uvf_agent._observation_spec.shape), 0)
  # pylint: enable=protected-access
  temp_actions = uvf_agent.actor_net(temp_states)
  uvf_agent.critic_net(temp_states, temp_actions)

  # Create eval_step_fns for each action function.
  eval_step_fns = dict()
  meta_agent = uvf_agent.meta_agent
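  # Evaluate with the meta agent attached ('hrl') and, if evaluate_nohrl is
  # set, once more with it detached ('nohrl'): [True] + [False] * bool yields
  # [True] or [True, False] accordingly.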
  for meta in [True] + [False] * evaluate_nohrl:
    meta_tag = 'hrl' if meta else 'nohrl'
    uvf_agent.set_meta_agent(meta_agent if meta else None)
    for mode in eval_modes:
      # Wrap the environment for this mode.
      wrapped_environment = uvf_agent.get_env_base_wrapper(
          environment, mode=mode)
      action_fn = uvf_agent.action
      meta_action_fn = meta_agent.action
      eval_step_fns['%s_%s' % (mode, meta_tag)] = (get_eval_step(
          uvf_agent=uvf_agent,
          state_preprocess=state_preprocess,
          tf_env=tf_env,
          action_fn=action_fn,
          meta_action_fn=meta_action_fn,
          environment_steps=tf.Variable(
              0, dtype=tf.int64, name='environment_steps'),
          num_episodes=tf.Variable(0, dtype=tf.int64, name='num_episodes'),
          mode=mode), wrapped_environment)

  model_rollout_fn = None
  if eval_model_rollout:
    model_rollout_fn = get_model_rollout(uvf_agent, tf_env)

  tf.train.get_or_create_global_step()

  if policy_save_dir:
    checkpoint_dir = os.path.join(checkpoint_dir, policy_save_dir)

  tf.logging.info('Evaluating policies at %s', checkpoint_dir)
  tf.logging.info('Running episodes for max %d steps', max_steps_per_episode)

  evaluate_checkpoint_fn = get_evaluate_checkpoint_fn(
      '', eval_dir, eval_step_fns, model_rollout_fn, gamma,
      max_steps_per_episode, num_episodes_eval, num_episodes_videos, tuner_hook,
      generate_videos, generate_summaries, video_settings)

  if checkpoint_path is not None:
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_path)
    evaluate_checkpoint_fn(checkpoint_path)
  elif checkpoint_range is not None:
    model_files = tf.gfile.Glob(
        os.path.join(checkpoint_dir, 'model.ckpt-*.index'))
    tf.logging.info('Found %s policies at %s', len(model_files), checkpoint_dir)
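    # Map each checkpoint's global step number (parsed from
    # 'model.ckpt-<step>.index') to its checkpoint path prefix.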
    model_files = {
        int(f.split('model.ckpt-', 1)[1].split('.', 1)[0]):
            os.path.splitext(f)[0]
        for f in model_files
    }
    model_files = {
        k: v
        for k, v in model_files.items()
        if k >= checkpoint_range[0] and k <= checkpoint_range[1]
    }
    tf.logging.info('Evaluating %d policies at %s',
                    len(model_files), checkpoint_dir)
    for _, checkpoint_path in sorted(model_files.items()):
      evaluate_checkpoint_fn(checkpoint_path)
  else:
    eval_utils.evaluate_checkpoint_repeatedly(
        checkpoint_dir,
        evaluate_checkpoint_fn,
        eval_interval_secs=eval_interval_secs,
        max_number_of_evaluations=max_number_of_evaluations,
        checkpoint_timeout=checkpoint_timeout,
        timeout_fn=timeout_fn)
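
# A minimal usage sketch (hypothetical): the module docstring points at
# run_eval.py, which is expected to parse gin configs and call evaluate()
# roughly along the lines below. The config file name, environment name, and
# class bindings here are illustrative assumptions, not the actual
# run_eval.py wiring.
#
#   gin.parse_config_file('configs/eval_uvf.gin')  # hypothetical config
#   evaluate(
#       checkpoint_dir=FLAGS.checkpoint_dir,
#       eval_dir=FLAGS.eval_dir,
#       environment=create_maze_env.create_maze_env('AntMaze'),
#       agent_class=agent.UvfAgent,
#       meta_agent_class=agent.MetaAgent,
#       state_preprocess_class=agent.StatePreprocess,
#       max_steps_per_episode=500)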