"""Controller coordinates sampling and training model. |
|
""" |

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves import xrange
import tensorflow as tf
import numpy as np
import pickle
import random

flags = tf.flags
gfile = tf.gfile

FLAGS = flags.FLAGS


def find_best_eps_lambda(rewards, lengths):
  """Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
  desired_div = FLAGS.max_divergence * np.mean(lengths)

  def calc_divergence(eps_lambda):
    max_reward = np.max(rewards)
    logz = (max_reward / eps_lambda +
            np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
    exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
                    rewards / eps_lambda)
    return exprr - logz

  left = 0.0
  right = 1000.0

  if len(rewards) <= 8:
    return (left + right) / 2

  num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
  for _ in xrange(num_iter):
    mid = (left + right) / 2
    cur_div = calc_divergence(mid)
    if cur_div > desired_div:
      left = mid
    else:
      right = mid

  return (left + right) / 2


class Controller(object):
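  """Coordinates environment sampling, the replay buffer, and model training."""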

  def __init__(self, env, env_spec, internal_dim,
               use_online_batch=True,
               batch_by_steps=False,
               unify_episodes=False,
               replay_batch_size=None,
               max_step=None,
               cutoff_agent=1,
               save_trajectories_file=None,
               use_trust_region=False,
               use_value_opt=False,
               update_eps_lambda=False,
               prioritize_by='rewards',
               get_model=None,
               get_replay_buffer=None,
               get_buffer_seeds=None):
    self.env = env
    self.env_spec = env_spec
    self.internal_dim = internal_dim
    self.use_online_batch = use_online_batch
    self.batch_by_steps = batch_by_steps
    self.unify_episodes = unify_episodes
    self.replay_batch_size = replay_batch_size
    self.max_step = max_step
    self.cutoff_agent = cutoff_agent
    self.save_trajectories_file = save_trajectories_file
    self.use_trust_region = use_trust_region
    self.use_value_opt = use_value_opt
    self.update_eps_lambda = update_eps_lambda
    self.prioritize_by = prioritize_by

    self.model = get_model()
    self.replay_buffer = get_replay_buffer()
    self.seed_replay_buffer(get_buffer_seeds())

    self.internal_state = np.array([self.initial_internal_state()] *
                                   len(self.env))
    self.last_obs = self.env_spec.initial_obs(len(self.env))
    self.last_act = self.env_spec.initial_act(len(self.env))
    self.last_pad = np.zeros(len(self.env))

    self.start_episode = np.array([True] * len(self.env))
    self.step_count = np.array([0] * len(self.env))
    self.episode_running_rewards = np.zeros(len(self.env))
    self.episode_running_lengths = np.zeros(len(self.env))
    self.episode_rewards = []
    self.greedy_episode_rewards = []
    self.episode_lengths = []
    self.total_rewards = []

    self.best_batch_rewards = None

  def setup(self, train=True):
    self.model.setup(train=train)

  def initial_internal_state(self):
    return np.zeros(self.model.policy.rnn_state_dim)

  def _sample_episodes(self, sess, greedy=False):
    """Sample episodes from environment using model."""
    obs_after_reset = self.env.reset_if(self.start_episode)

    for i, obs in enumerate(obs_after_reset):
      if obs is not None:
        self.step_count[i] = 0
        self.internal_state[i] = self.initial_internal_state()
        for j in xrange(len(self.env_spec.obs_dims)):
          self.last_obs[j][i] = obs[j]
        for j in xrange(len(self.env_spec.act_dims)):
          self.last_act[j][i] = -1
        self.last_pad[i] = 0

    if self.unify_episodes:
      assert len(obs_after_reset) == 1
      new_ep = obs_after_reset[0] is not None
    else:
      new_ep = True

    self.start_id = 0 if new_ep else len(self.all_obs[:])

    initial_state = self.internal_state
    all_obs = [] if new_ep else self.all_obs[:]
    all_act = ([self.last_act] if new_ep else self.all_act[:])
    all_pad = [] if new_ep else self.all_pad[:]
    rewards = [] if new_ep else self.rewards[:]

    step = 0
    while not self.env.all_done():
      self.step_count += 1 - np.array(self.env.dones)

      next_internal_state, sampled_actions = self.model.sample_step(
          sess, self.last_obs, self.internal_state, self.last_act,
          greedy=greedy)

      env_actions = self.env_spec.convert_actions_to_env(sampled_actions)
      next_obs, reward, next_dones, _ = self.env.step(env_actions)

      all_obs.append(self.last_obs)
      all_act.append(sampled_actions)
      all_pad.append(self.last_pad)
      rewards.append(reward)

      self.internal_state = next_internal_state
      self.last_obs = next_obs
      self.last_act = sampled_actions
      self.last_pad = np.array(next_dones).astype('float32')

      step += 1
      if self.max_step and step >= self.max_step:
        break

    self.all_obs = all_obs[:]
    self.all_act = all_act[:]
    self.all_pad = all_pad[:]
    self.rewards = rewards[:]

    all_obs.append(self.last_obs)

    return initial_state, all_obs, all_act, rewards, all_pad

  def sample_episodes(self, sess, greedy=False):
    """Sample steps from the environment until we have enough for a batch."""
    if self.unify_episodes:
      self.all_new_ep = self.start_episode[0]

    episodes = []
    total_steps = 0
    while total_steps < self.max_step * len(self.env):
      (initial_state,
       observations, actions, rewards,
       pads) = self._sample_episodes(sess, greedy=greedy)

      # Regroup from time-major lists of per-dimension values to per-dimension
      # sequences; wrap in list() so the result can be traversed more than
      # once under Python 3.
      observations = list(zip(*observations))
      actions = list(zip(*actions))

      terminated = np.array(self.env.dones)

      self.total_rewards = np.sum(np.array(rewards[self.start_id:]) *
                                  (1 - np.array(pads[self.start_id:])), axis=0)
      self.episode_running_rewards *= 1 - self.start_episode
      self.episode_running_lengths *= 1 - self.start_episode
      self.episode_running_rewards += self.total_rewards
      self.episode_running_lengths += np.sum(
          1 - np.array(pads[self.start_id:]), axis=0)

      episodes.extend(self.convert_from_batched_episodes(
          initial_state, observations, actions, rewards,
          terminated, pads))
      total_steps += np.sum(1 - np.array(pads))

      # Mark episodes that terminated or hit the agent cutoff so the
      # corresponding environments are reset on the next sampling call.
      self.start_episode = np.logical_or(terminated,
                                         self.step_count >= self.cutoff_agent)
      episode_rewards = self.episode_running_rewards[self.start_episode].tolist()
      self.episode_rewards.extend(episode_rewards)
      self.episode_lengths.extend(
          self.episode_running_lengths[self.start_episode].tolist())
      self.episode_rewards = self.episode_rewards[-100:]
      self.episode_lengths = self.episode_lengths[-100:]

      if (self.save_trajectories_file is not None and
          (self.best_batch_rewards is None or
           np.mean(self.total_rewards) > self.best_batch_rewards)):
        self.best_batch_rewards = np.mean(self.total_rewards)
        my_episodes = self.convert_from_batched_episodes(
            initial_state, observations, actions, rewards,
            terminated, pads)
        # Open in binary mode since pickle writes bytes.
        with gfile.GFile(self.save_trajectories_file, 'wb') as f:
          pickle.dump(my_episodes, f)

      if not self.batch_by_steps:
        return (initial_state,
                observations, actions, rewards,
                terminated, pads)

    return self.convert_to_batched_episodes(episodes)

  def _train(self, sess,
             observations, initial_state, actions,
             rewards, terminated, pads):
    """Train model using batch."""
    avg_episode_reward = np.mean(self.episode_rewards)
    greedy_episode_reward = (np.mean(self.greedy_episode_rewards)
                             if self.greedy_episode_rewards else
                             avg_episode_reward)
    loss, summary = None, None
    if self.use_trust_region:
      loss, _, summary = self.model.trust_region_step(
          sess,
          observations, initial_state, actions,
          rewards, terminated, pads,
          avg_episode_reward=avg_episode_reward,
          greedy_episode_reward=greedy_episode_reward)
    else:
      loss, _, summary = self.model.train_step(
          sess,
          observations, initial_state, actions,
          rewards, terminated, pads,
          avg_episode_reward=avg_episode_reward,
          greedy_episode_reward=greedy_episode_reward)

    if self.use_value_opt:
      self.model.fit_values(
          sess,
          observations, initial_state, actions,
          rewards, terminated, pads)

    return loss, summary

  def train(self, sess):
    """Sample some episodes and train on some episodes."""
    cur_step = sess.run(self.model.inc_global_step)
    self.cur_step = cur_step

    # On the very first step, run the copy op many times; if the copy is a
    # soft / averaged update, this brings the target weights fully in line
    # with the online weights before training starts.
    if self.cur_step == 0:
      for _ in xrange(100):
        sess.run(self.model.copy_op)

    sess.run(self.model.copy_op)

    (initial_state,
     observations, actions, rewards,
     terminated, pads) = self.sample_episodes(sess)

    self.add_to_replay_buffer(
        initial_state, observations, actions,
        rewards, terminated, pads)

    loss, summary = 0, None

    # On-policy update on the freshly sampled batch.
    if self.use_online_batch:
      loss, summary = self._train(
          sess,
          observations, initial_state, actions,
          rewards, terminated, pads)

    # Re-fit the temperature eps_lambda to the recent episodes so the induced
    # divergence stays close to FLAGS.max_divergence.
    if self.update_eps_lambda:
      episode_rewards = np.array(self.episode_rewards)
      episode_lengths = np.array(self.episode_lengths)
      eps_lambda = find_best_eps_lambda(
          episode_rewards[-20:], episode_lengths[-20:])
      sess.run(self.model.objective.assign_eps_lambda,
               feed_dict={self.model.objective.new_eps_lambda: eps_lambda})

    # Off-policy update on a batch sampled from the replay buffer.
    replay_batch, replay_probs = self.get_from_replay_buffer(
        self.replay_batch_size)
    if replay_batch:
      (initial_state,
       observations, actions, rewards,
       terminated, pads) = replay_batch

      loss, summary = self._train(
          sess,
          observations, initial_state, actions,
          rewards, terminated, pads)

    return loss, summary, self.total_rewards, self.episode_rewards

  def eval(self, sess):
    """Use greedy sampling."""
    (initial_state,
     observations, actions, rewards,
     terminated, pads) = self.sample_episodes(sess, greedy=True)

    total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
    return total_rewards, self.episode_rewards

  def convert_from_batched_episodes(
      self, initial_state, observations, actions, rewards,
      terminated, pads):
    """Convert time-major batch of episodes to batch-major list of episodes."""
    rewards = np.array(rewards)
    pads = np.array(pads)
    observations = [np.array(obs) for obs in observations]
    actions = [np.array(act) for act in actions]

    total_rewards = np.sum(rewards * (1 - pads), axis=0)
    total_length = np.sum(1 - pads, axis=0).astype('int32')

    episodes = []
    num_episodes = rewards.shape[1]
    for i in xrange(num_episodes):
      length = total_length[i]
      ep_initial = initial_state[i]
      ep_obs = [obs[:length + 1, i, ...] for obs in observations]
      ep_act = [act[:length + 1, i, ...] for act in actions]
      ep_rewards = rewards[:length, i]

      episodes.append(
          [ep_initial, ep_obs, ep_act, ep_rewards, terminated[i]])

    return episodes

  def convert_to_batched_episodes(self, episodes, max_length=None):
    """Convert batch-major list of episodes to time-major batch of episodes."""
    lengths = [len(ep[-2]) for ep in episodes]
    max_length = max_length or max(lengths)

    new_episodes = []
    for ep, length in zip(episodes, lengths):
      initial, observations, actions, rewards, terminated = ep
      observations = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
                      for obs in observations]
      actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
                 for act in actions]
      pads = np.array([0] * length + [1] * (max_length - length))
      rewards = np.resize(rewards, [max_length]) * (1 - pads)
      new_episodes.append([initial, observations, actions, rewards,
                           terminated, pads])

    (initial, observations, actions, rewards,
     terminated, pads) = zip(*new_episodes)
    observations = [np.swapaxes(obs, 0, 1)
                    for obs in zip(*observations)]
    actions = [np.swapaxes(act, 0, 1)
               for act in zip(*actions)]
    rewards = np.transpose(rewards)
    pads = np.transpose(pads)

    return (initial, observations, actions, rewards, terminated, pads)

  def add_to_replay_buffer(self, initial_state,
                           observations, actions, rewards,
                           terminated, pads):
    """Add batch of episodes to replay buffer."""
    if self.replay_buffer is None:
      return

    rewards = np.array(rewards)
    pads = np.array(pads)
    total_rewards = np.sum(rewards * (1 - pads), axis=0)

    episodes = self.convert_from_batched_episodes(
        initial_state, observations, actions, rewards,
        terminated, pads)

    priorities = (total_rewards if self.prioritize_by == 'rewards'
                  else self.cur_step)

    if not self.unify_episodes or self.all_new_ep:
      self.last_idxs = self.replay_buffer.add(
          episodes, priorities)
    else:
      # The first episode in the batch continues an episode already in the
      # buffer, so overwrite that entry instead of adding a duplicate, and
      # append any remaining episodes as new entries.
      self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
      if len(episodes) > 1:
        self.replay_buffer.add(episodes[1:], priorities)

  def get_from_replay_buffer(self, batch_size):
    """Sample a batch of episodes from the replay buffer."""
    if self.replay_buffer is None or len(self.replay_buffer) < batch_size:
      return None, None

    desired_count = batch_size * self.max_step

    while True:
      if batch_size > len(self.replay_buffer):
        batch_size = len(self.replay_buffer)
      episodes, probs = self.replay_buffer.get_batch(batch_size)
      count = sum(len(ep[-2]) for ep in episodes)
      if count >= desired_count or not self.batch_by_steps:
        break
      if batch_size == len(self.replay_buffer):
        return None, None
      # Grow the request by ~20%, keeping it an integer.
      batch_size = int(np.ceil(batch_size * 1.2))

    return (self.convert_to_batched_episodes(episodes), probs)

  def seed_replay_buffer(self, episodes):
    """Seed the replay buffer with some episodes."""
    if self.replay_buffer is None:
      return

    # Seed episodes come without an initial internal state, so prepend one.
    for i in xrange(len(episodes)):
      episodes[i] = [self.initial_internal_state()] + episodes[i]

    self.replay_buffer.seed_buffer(episodes)
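

# Rough usage sketch (hypothetical driver; the env/env_spec objects, the
# make_* factory callables, num_train_steps, and internal_dim=128 are only
# illustrative placeholders supplied by a training script):
#
#   controller = Controller(env=make_env(), env_spec=make_env_spec(),
#                           internal_dim=128,
#                           get_model=make_model,
#                           get_replay_buffer=make_replay_buffer,
#                           get_buffer_seeds=lambda: [])
#   controller.setup()
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for _ in xrange(num_train_steps):
#       loss, summary, batch_rewards, episode_rewards = controller.train(sess)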