from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

"""Utilities related to computing training batches from episode rollouts.

Implementations here are based on code from OpenAI:
https://github.com/openai/universe-starter-agent/blob/master/a3c.py.
"""

from collections import namedtuple

import numpy as np
import scipy.signal

from common import utils  # brain coder


class Rollout(object):
  """Holds a rollout for an episode.

  A rollout is a record of the states observed in some environment and actions
  taken by the agent to arrive at those states. Other information includes
  rewards received after each action, values estimated for each state, whether
  the rollout concluded the episode, and total reward received. Everything
  should be given in time order.

  At each time t, the agent sees state s_t, takes action a_t, and then receives
  reward r_t. The agent may optionally estimate a state value V(s_t) for each
  state.

  For an episode of length T:
  states = [s_0, ..., s_(T-1)]
  actions = [a_0, ..., a_(T-1)]
  rewards = [r_0, ..., r_(T-1)]
  values = [V(s_0), ..., V(s_(T-1))]

  Note that there is an extra state s_T observed after taking action a_(T-1),
  but this is not included in the rollout.

  Rollouts have a `terminated` attribute which is True when the rollout is
  "finalized", i.e. it holds a full episode. `terminated` will be False while
  timesteps are still being added to it.
  """

  def __init__(self):
    self.states = []
    self.actions = []
    self.rewards = []
    self.values = []
    self.total_reward = 0.0
    self.terminated = False

  def add(self, state, action, reward, value=0.0, terminated=False):
    """Add the next timestep to this rollout.

    Args:
      state: The state observed at the start of this timestep.
      action: The action taken after observing the given state.
      reward: The reward received for taking the given action.
      value: The value estimated for the given state.
      terminated: Whether this timestep ends the episode.

    Raises:
      ValueError: If self.terminated is already True, meaning that the episode
          has already ended.
    """
    if self.terminated:
      raise ValueError(
          'Trying to add timestep to an already terminal rollout.')
    self.states += [state]
    self.actions += [action]
    self.rewards += [reward]
    self.values += [value]
    self.terminated = terminated
    self.total_reward += reward

  def add_many(self, states, actions, rewards, values=None, terminated=False):
    """Add many timesteps to this rollout.

    Arguments are the same as `add`, but are lists of equal size.

    Args:
      states: The states observed.
      actions: The actions taken.
      rewards: The rewards received.
      values: The values estimated for the given states.
      terminated: Whether this sequence ends the episode.

    Raises:
      ValueError: If the lengths of all the input lists are not equal.
      ValueError: If self.terminated is already True, meaning that the episode
          has already ended.
    """
    if len(states) != len(actions):
      raise ValueError(
          'Number of states and actions must be the same. Got %d states and '
          '%d actions' % (len(states), len(actions)))
    if len(states) != len(rewards):
      raise ValueError(
          'Number of states and rewards must be the same. Got %d states and '
          '%d rewards' % (len(states), len(rewards)))
    if values is not None and len(states) != len(values):
      raise ValueError(
          'Number of states and values must be the same. Got %d states and '
          '%d values' % (len(states), len(values)))
    if self.terminated:
      raise ValueError(
          'Trying to add timesteps to an already terminal rollout.')
    self.states += states
    self.actions += actions
    self.rewards += rewards
    self.values += values if values is not None else [0.0] * len(states)
    self.terminated = terminated
    self.total_reward += sum(rewards)

  def extend(self, other):
    """Append another rollout to this rollout."""
    assert not self.terminated
    self.states.extend(other.states)
    self.actions.extend(other.actions)
    self.rewards.extend(other.rewards)
    self.values.extend(other.values)
    self.terminated = other.terminated
    self.total_reward += other.total_reward
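

# Usage sketch (added for illustration, not part of the original module): a
# rollout can be built one timestep at a time with `add`, or in bulk with
# `add_many`. The states, actions, and rewards below are arbitrary
# placeholder values.
def _example_rollout_usage():
  ro = Rollout()
  ro.add(state=0, action=1, reward=0.0, value=0.5)
  ro.add(state=1, action=0, reward=1.0, value=0.2, terminated=True)
  assert ro.terminated and ro.total_reward == 1.0

  ro2 = Rollout()
  ro2.add_many(states=[0, 1, 2], actions=[1, 0, 1], rewards=[0.0, 0.0, 1.0],
               terminated=True)
  assert ro2.terminated and len(ro2.states) == 3
  return ro, ro2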


def discount(x, gamma):
  """Returns discounted sums for each value in x, with discount factor gamma.

  This can be used to compute the return (discounted sum of rewards) at each
  timestep given a sequence of rewards. See the definitions for return and
  REINFORCE in section 3 of https://arxiv.org/pdf/1602.01783.pdf.

  Let g^k mean gamma ** k.
  For list [x_0, ..., x_N], the following list of discounted sums is computed:
  [x_0 + g^1 * x_1 + g^2 * x_2 + ... + g^N * x_N,
   x_1 + g^1 * x_2 + g^2 * x_3 + ... + g^(N-1) * x_N,
   x_2 + g^1 * x_3 + g^2 * x_4 + ... + g^(N-2) * x_N,
   ...,
   x_(N-1) + g^1 * x_N,
   x_N]

  Args:
    x: List of numbers [x_0, ..., x_N].
    gamma: Float between 0 and 1 (inclusive). This is the discount factor.

  Returns:
    List of discounted sums.
  """
  return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
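

# Worked example (added for illustration, not part of the original module):
# with gamma = 0.5, each entry is the discounted sum of the entries from that
# position onward: [1 + 0.5 + 0.25, 1 + 0.5, 1] == [1.75, 1.5, 1.0].
def _example_discount():
  result = discount([1.0, 1.0, 1.0], 0.5)
  assert np.allclose(result, [1.75, 1.5, 1.0])
  return result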


def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0):
  """Compute advantages and returns (discounted sum of rewards).

  For an episode of length T, rewards = [r_0, ..., r_(T-1)].
  Each reward r_t is observed after taking action a_t at state s_t. A final
  state s_T is observed but no reward is given at this state since no action
  a_T is taken (otherwise there would be a new state s_(T+1)).

  `rewards` and `values` are for a single episode. Return R_t is the discounted
  sum of future rewards starting at time t, where `gamma` is the discount
  factor.

  R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ...
        + gamma**(T-1-t) * r_(T-1)

  The advantage A(a_t, s_t) is approximated as A(a_t, s_t) = R_t - V(s_t),
  where V(s_t) is an approximation of the value at that state, given in the
  `values` list. Returns R_t are needed for all REINFORCE algorithms. The
  advantage is used for the advantage actor-critic variant of REINFORCE.
  See algorithm S3 in https://arxiv.org/pdf/1602.01783.pdf.

  Additionally, the parameter `lambda_` controls the bias-variance tradeoff.
  See "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438.
  lambda_ = 1 reduces to the regular advantage.
  0 <= lambda_ < 1 trades off variance for bias, with lambda_ = 0 being the
  most biased.

  Bootstrapping is also supported. If an episode does not end in a terminal
  state (either because the episode was ended early, or the environment does
  not have end states), the true return cannot be computed from the rewards
  alone. However, it can be estimated by computing the value (an approximation
  of return) of the last state s_T. Thus the `values` list will have an extra
  item: values = [V(s_0), ..., V(s_(T-1)), V(s_T)].

  Args:
    rewards: List of observed rewards [r_0, ..., r_(T-1)].
    values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional
        extra V(s_T) item.
    gamma: Discount factor. Number between 0 and 1. 1 means no discount.
        If not 1, gamma is typically near 1, like 0.99.
    lambda_: Bias-variance tradeoff factor. Between 0 and 1.

  Returns:
    empirical_values: Returns at each timestep.
    generalized_advantage: Advantages at each timestep.

  Raises:
    ValueError: If shapes of `rewards` and `values` are not rank 1.
    ValueError: If len(values) not in (len(rewards), len(rewards) + 1).
  """
  rewards = np.asarray(rewards, dtype=np.float32)
  values = np.asarray(values, dtype=np.float32)
  if rewards.ndim != 1:
    raise ValueError('Single episode only. rewards must be rank 1.')
  if values.ndim != 1:
    raise ValueError('Single episode only. values must be rank 1.')
  if len(values) == len(rewards):
    # No bootstrapping.
    values = np.append(values, 0)
    empirical_values = discount(rewards, gamma)
  elif len(values) == len(rewards) + 1:
    # With bootstrapping.
    # Last value is for the terminal state (final state after last action was
    # taken).
    empirical_values = discount(np.append(rewards, values[-1]), gamma)[:-1]
  else:
    raise ValueError('values should contain the same number of items or one '
                     'more item than rewards')
  delta = rewards + gamma * values[1:] - values[:-1]
  generalized_advantage = discount(delta, gamma * lambda_)

  # empirical_values is the discounted sum of rewards into the future.
  # generalized_advantage is the target for each policy update.
  return empirical_values, generalized_advantage
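

# Worked example (added for illustration, not part of the original module): a
# 3-step episode with a single terminal reward and hand-picked value
# estimates. With lambda_ = 1 the advantage reduces to R_t - V(s_t):
#   returns    = [0.9**2, 0.9, 1.0]        = [0.81, 0.9, 1.0]
#   advantages = returns - [0.1, 0.4, 0.7] = [0.71, 0.5, 0.3]
def _example_gae():
  returns, advantages = discounted_advantage_and_rewards(
      rewards=[0.0, 0.0, 1.0],
      values=[0.1, 0.4, 0.7],
      gamma=0.9,
      lambda_=1.0)
  assert np.allclose(returns, [0.81, 0.9, 1.0])
  assert np.allclose(advantages, [0.71, 0.5, 0.3])
  return returns, advantages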
"""Batch holds a minibatch of episodes. | |
Let bi = batch_index, i.e. the index of each episode in the minibatch. | |
Let t = time. | |
Attributes: | |
states: States for each timestep in each episode. Indexed by states[bi, t]. | |
actions: Actions for each timestep in each episode. Indexed by actions[bi, t]. | |
discounted_adv: Advantages (computed by discounted_advantage_and_rewards) | |
for each timestep in each episode. Indexed by discounted_adv[bi, t]. | |
discounted_r: Returns (discounted sum of rewards computed by | |
discounted_advantage_and_rewards) for each timestep in each episode. | |
Indexed by discounted_r[bi, t]. | |
total_rewards: Total reward for each episode, i.e. sum of rewards across all | |
timesteps (not discounted). Indexed by total_rewards[bi]. | |
episode_lengths: Number of timesteps in each episode. If an episode has | |
N actions, N rewards, and N states, then its length is N. Indexed by | |
episode_lengths[bi]. | |
batch_size: Number of episodes in this minibatch. An integer. | |
max_time: Maximum episode length in the batch. An integer. | |
""" # pylint: disable=pointless-string-statement | |
Batch = namedtuple( | |
'Batch', | |
['states', 'actions', 'discounted_adv', 'discounted_r', 'total_rewards', | |
'episode_lengths', 'batch_size', 'max_time']) | |
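

# Illustrative only (not part of the original module): Batch is a plain
# namedtuple, so its fields are accessed by name and indexed as described
# above, e.g. batch.states[bi, t]. In practice instances come from
# process_rollouts below rather than being constructed by hand.
def _example_batch_fields(batch):
  bi, t = 0, 0
  return (batch.states[bi, t], batch.total_rewards[bi],
          batch.episode_lengths[bi])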


def process_rollouts(rollouts, gamma, lambda_=1.0):
  """Convert a batch of rollouts into tensors ready to be fed into a model.

  Lists from each episode are stacked into 2D tensors and padded with 0s up to
  the maximum timestep in the batch.

  Args:
    rollouts: A list of Rollout instances.
    gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma
        argument in discounted_advantage_and_rewards.
    lambda_: See lambda_ argument in discounted_advantage_and_rewards.

  Returns:
    Batch instance. states, actions, discounted_adv, and discounted_r are
    numpy arrays with shape (batch_size, max_episode_length). episode_lengths
    is a list of ints. total_rewards is a list of floats (total reward in each
    episode). batch_size and max_time are ints.

  Raises:
    ValueError: If any of the rollouts are not terminal.
  """
  for ro in rollouts:
    if not ro.terminated:
      raise ValueError('Can only process terminal rollouts.')

  episode_lengths = [len(ro.states) for ro in rollouts]
  batch_size = len(rollouts)
  max_time = max(episode_lengths)

  states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time)
  actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time)

  discounted_rewards = [None] * batch_size
  discounted_adv = [None] * batch_size
  for i, ro in enumerate(rollouts):
    disc_r, disc_adv = discounted_advantage_and_rewards(
        ro.rewards, ro.values, gamma, lambda_)
    discounted_rewards[i] = disc_r
    discounted_adv[i] = disc_adv
  discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time)
  discounted_adv = utils.stack_pad(discounted_adv, 0, max_time)

  total_rewards = [sum(ro.rewards) for ro in rollouts]

  return Batch(states=states,
               actions=actions,
               discounted_adv=discounted_adv,
               discounted_r=discounted_rewards,
               total_rewards=total_rewards,
               episode_lengths=episode_lengths,
               batch_size=batch_size,
               max_time=max_time)
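

# Usage sketch (added for illustration, not part of the original module):
# building two terminal rollouts by hand and converting them into a padded
# Batch. Relies on utils.stack_pad from the brain coder common package, which
# this module already imports.
def _example_process_rollouts():
  ro1 = Rollout()
  ro1.add_many(states=[0, 1], actions=[1, 0], rewards=[0.0, 1.0],
               values=[0.5, 0.5], terminated=True)
  ro2 = Rollout()
  ro2.add_many(states=[0, 1, 2], actions=[1, 1, 0], rewards=[0.0, 0.0, 1.0],
               values=[0.3, 0.4, 0.5], terminated=True)
  batch = process_rollouts([ro1, ro2], gamma=0.99)
  assert batch.batch_size == 2 and batch.max_time == 3
  assert batch.episode_lengths == [2, 3]
  # states, actions, discounted_adv, and discounted_r are padded to shape
  # (batch_size, max_time) == (2, 3).
  return batch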