| | import numpy as np |
| | from baselines.common.runners import AbstractEnvRunner |
| | from collections import deque |
| |
|
| |
|
class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner and store the rollout hyper-parameters

    run():
    - Step the environment ``nsteps`` times and make a mini batch
    """

    def __init__(self, *, env, model, nsteps, gamma, lam, num_embeddings):
        """Store the rollout settings on top of the base runner state.

        Args:
            env: Environment handed to ``AbstractEnvRunner``; ``run`` calls
                ``env.step(actions)`` on it.
            model: Policy object handed to ``AbstractEnvRunner``; ``run``
                calls ``model.step`` and ``model.value`` on it.
            nsteps: Number of environment steps collected per ``run`` call.
            gamma: Discount factor used in the advantage computation.
            lam: Lambda coefficient used in the advantage computation.
            num_embeddings: Stored as-is; not read anywhere in this class.
        """
        super().__init__(env=env, model=model, nsteps=nsteps)
        self.lam = lam
        self.gamma = gamma
        self.num_embeddings = num_embeddings

    def run(self):
        """Collect ``nsteps`` transitions and return them as a flat batch.

        Reads and updates ``self.obs``, ``self.states`` and ``self.dones``
        (presumably initialised by ``AbstractEnvRunner`` -- confirm there).

        Returns:
            A tuple ``(obs, returns, encoding_indices, dones, actions,
            values, neglogpacs, states, epinfos)``. The first seven entries
            are arrays passed through ``sf01``; ``states`` is the value of
            ``self.states`` captured before the first step; ``epinfos`` is
            the list of truthy ``info['episode']`` entries seen during the
            rollout.
        """
        mb_obs, mb_rewards, mb_actions, mb_values = [], [], [], []
        mb_encoding_indices, mb_dones, mb_neglogpacs = [], [], []
        # Keep the state from *before* the rollout; self.states is
        # overwritten on every model step below.
        mb_states = self.states
        epinfos = []

        for _ in range(self.nsteps):
            actions, values, encoding_indices, self.states, neglogpacs = self.model.step(
                self.obs, S=self.states, M=self.dones)
            # Copy: self.obs is overwritten in place by the env step below.
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            # Drop singleton axes from the indices returned by the model.
            encoding_indices = np.squeeze(encoding_indices)
            mb_encoding_indices.append(encoding_indices)
            mb_neglogpacs.append(neglogpacs)
            # Done flags recorded here are the ones that were in effect
            # when this step's observation was produced.
            mb_dones.append(self.dones)

            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)

        # Stack the per-step lists; the leading axis is the step index.
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_encoding_indices = np.asarray(mb_encoding_indices, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        # Builtin ``bool`` instead of the ``np.bool`` alias, which was
        # removed in NumPy 1.24; the resulting dtype is the same.
        mb_dones = np.asarray(mb_dones, dtype=bool)
        # Value estimate for the observation after the final step, used to
        # bootstrap the last advantage.
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # Backward recursion over the rollout (GAE-style, per ``lastgaelam``):
        #   delta_t = r_t + gamma * V_{t+1} * nonterminal_{t+1} - V_t
        #   adv_t   = delta_t + gamma * lam * nonterminal_{t+1} * adv_{t+1}
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                # Final step: bootstrap from the post-rollout state.
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (
            mb_obs, mb_returns, mb_encoding_indices, mb_dones, mb_actions,
            mb_values,
            mb_neglogpacs)),
            mb_states, epinfos)
| |
|
| |
|
| | |
def sf01(arr):
    """
    swap and then flatten axes 0 and 1

    An array of shape ``(a, b, *rest)`` becomes ``(b * a, *rest)``, with
    the original axis 1 varying slowest in the merged axis.

    A 1-D array is treated as if it had an implicit axis 1 of length 1, so
    it is returned flat with its elements in the same order.

    Raises:
        ValueError: if ``arr`` has no axes at all (0-d), since there is
            nothing to swap or flatten.
    """
    s = arr.shape
    if len(s) == 0:
        # Previously this case was swallowed by a bare ``except`` and the
        # function returned None; fail loudly instead.
        raise ValueError("sf01 expects an array with at least 1 dimension, got a 0-d input")
    if len(s) == 1:
        # Swapping with a length-1 axis and merging it back is a no-op.
        return arr.reshape(s[0])
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
| |
|