| from abc import ABC, abstractmethod | |
| import numpy as np | |
| import torch | |
| from torch_ac.format import default_preprocess_obss | |
| from torch_ac.utils import DictList, ParallelEnv | |
| from torch_ac.intrinsic_reward_models import * | |
| from collections import Counter | |
| class BaseAlgo(ABC): | |
| """The base class for RL algorithms.""" | |
| def __init__(self, | |
| envs, | |
| acmodel, | |
| device, | |
| num_frames_per_proc, | |
| discount, | |
| lr, | |
| gae_lambda, | |
| entropy_coef, | |
| value_loss_coef, | |
| max_grad_norm, | |
| recurrence, | |
| preprocess_obss, | |
| reshape_reward, | |
| exploration_bonus=False, | |
| exploration_bonus_params=None, | |
| exploration_bonus_tanh=None, | |
| expert_exploration_bonus=False, | |
| exploration_bonus_type="lang", | |
| episodic_exploration_bonus=True, | |
| utterance_moa_net=True, # used for social influence | |
| clipped_rewards=False, | |
| # defaults below are set to fit RND | |
| intrinsic_reward_loss_coef=0.1, # also used for social influence | |
| intrinsic_reward_coef=0.1, # also used for social influence | |
| intrinsic_reward_learning_rate=0.0001, | |
| intrinsic_reward_momentum=0, | |
| intrinsic_reward_epsilon=0.01, | |
| intrinsic_reward_alpha=0.99, | |
| intrinsic_reward_max_grad_norm=40, | |
| intrinsic_reward_forward_loss_coef=10, | |
| intrinsic_reward_inverse_loss_coef=0.1, | |
| reset_rnd_ride_at_phase=False, | |
| # social_influence | |
| balance_moa_training=False, | |
| moa_memory_dim=128, | |
| ): | |
| """ | |
| Initializes a `BaseAlgo` instance. | |
| Parameters: | |
| ---------- | |
| envs : list | |
| a list of environments that will be run in parallel | |
| acmodel : torch.nn.Module | |
| the model | |
| device : torch.device | |
| the device on which tensors are created and the model is run | |
| num_frames_per_proc : int | |
| the number of frames collected by every process for an update | |
| discount : float | |
| the discount for future rewards | |
| lr : float | |
| the learning rate for optimizers | |
| gae_lambda : float | |
| the lambda coefficient in the GAE formula | |
| ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438)) | |
| entropy_coef : float | |
| the weight of the entropy cost in the final objective | |
| value_loss_coef : float | |
| the weight of the value loss in the final objective | |
| max_grad_norm : float | |
| gradient will be clipped to be at most this value | |
| recurrence : int | |
| the number of steps the gradient is propagated back in time | |
| preprocess_obss : function | |
| a function that takes observations returned by the environment | |
| and converts them into the format that the model can handle | |
| reshape_reward : function | |
| a function that shapes the reward, takes an | |
| (observation, action, reward, done) tuple as an input | |
| """ | |
| # Store parameters | |
| self.env = ParallelEnv(envs) | |
| self.acmodel = acmodel | |
| self.device = device | |
| self.num_frames_per_proc = num_frames_per_proc | |
| self.discount = discount | |
| self.lr = lr | |
| self.gae_lambda = gae_lambda | |
| self.entropy_coef = entropy_coef | |
| self.value_loss_coef = value_loss_coef | |
| self.max_grad_norm = max_grad_norm | |
| self.recurrence = recurrence | |
| self.preprocess_obss = preprocess_obss or default_preprocess_obss | |
| self.reshape_reward = reshape_reward | |
| self.exploration_bonus = exploration_bonus | |
| self.expert_exploration_bonus = expert_exploration_bonus | |
| self.exploration_bonus_type = exploration_bonus_type | |
| self.episodic_exploration_bonus = episodic_exploration_bonus | |
| self.clipped_rewards = clipped_rewards | |
| self.update_epoch = 0 | |
| self.utterance_moa_net = utterance_moa_net # todo: as parameter | |
| self.reset_rnd_ride_at_phase = reset_rnd_ride_at_phase | |
| self.was_reset = False | |
| # Control parameters | |
| assert self.acmodel.recurrent or self.recurrence == 1 | |
| assert self.num_frames_per_proc % self.recurrence == 0 | |
| # Configure acmodel | |
| self.acmodel.to(self.device) | |
| self.acmodel.train() | |
| # Store helpers values | |
| self.num_procs = len(envs) | |
| self.num_frames = self.num_frames_per_proc * self.num_procs | |
| # Initialize experience values | |
| shape = (self.num_frames_per_proc, self.num_procs) | |
| self.obs = self.env.reset() | |
| self.obss = [None]*(shape[0]) | |
| self.info = [{}]*(shape[0]) | |
| self.infos = [None]*(shape[0]) | |
| if self.acmodel.recurrent: | |
| self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device) | |
| self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device) | |
| self.mask = torch.ones(shape[1], device=self.device) | |
| self.masks = torch.zeros(*shape, device=self.device) | |
| self.next_masks = torch.zeros(*shape, device=self.device) | |
| self.values = torch.zeros(*shape, device=self.device) | |
| self.next_values = torch.zeros(*shape, device=self.device) | |
| self.rewards = torch.zeros(*shape, device=self.device) | |
| self.extrinsic_rewards = torch.zeros(*shape, device=self.device) | |
| self.advantages = torch.zeros(*shape, device=self.device) | |
| # as_shape = self.env.envs[0].action_space.shape | |
| as_shape = self.acmodel.model_raw_action_space.shape | |
| self.actions = torch.zeros(*(shape+as_shape), device=self.device, dtype=torch.int) | |
| self.log_probs = torch.zeros(*(shape+as_shape), device=self.device) | |
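| # Shape convention for the rollout buffers above: tensors are laid out as | |
| # (T, P, ...) with T = num_frames_per_proc and P = num_procs; e.g. with a | |
| # hypothetical T = 128 and P = 16, self.rewards has shape (128, 16) and one | |
| # update consumes self.num_frames = T * P = 2048 environment frames. | |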
| # Initialize log values | |
| self.log_episode_return = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_extrinsic_return = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_exploration_bonus = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_success_rate = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_curriculum_mean_perf = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_mission_string_observed = torch.zeros(self.num_procs, device=self.device) | |
| self.log_episode_NPC_introduced_to = np.zeros(self.num_procs).astype(bool) | |
| self.log_episode_curriculum_param = torch.zeros(self.num_procs, device=self.device) | |
| self.intrinsic_reward_loss_coef = intrinsic_reward_loss_coef | |
| self.intrinsic_reward_coef = intrinsic_reward_coef | |
| self.intrinsic_reward_learning_rate = intrinsic_reward_learning_rate | |
| self.intrinsic_reward_momentum = intrinsic_reward_momentum | |
| self.intrinsic_reward_epsilon = intrinsic_reward_epsilon | |
| self.intrinsic_reward_alpha = intrinsic_reward_alpha | |
| self.intrinsic_reward_max_grad_norm = intrinsic_reward_max_grad_norm | |
| self.intrinsic_reward_forward_loss_coef = intrinsic_reward_forward_loss_coef | |
| self.intrinsic_reward_inverse_loss_coef = intrinsic_reward_inverse_loss_coef | |
| self.balance_moa_training = balance_moa_training | |
| self.moa_memory_dim = moa_memory_dim | |
| self.log_done_counter = 0 | |
| self.log_return = [0] * self.num_procs | |
| self.log_extrinsic_return = [0] * self.num_procs | |
| self.log_exploration_bonus = [0] * self.num_procs | |
| self.log_success_rate = [0] * self.num_procs | |
| self.log_curriculum_max_mean_perf = [0] * self.num_procs | |
| self.log_curriculum_param = [0] * self.num_procs | |
| self.log_reshaped_return = [0] * self.num_procs | |
| self.log_num_frames = [0] * self.num_procs | |
| self.log_mission_string_observed = [0] * self.num_procs | |
| self.log_NPC_introduced_to = [False] * self.num_procs | |
| self.images_counter = [Counter() for _ in range(self.num_procs)] | |
| if self.exploration_bonus: | |
| self.visitation_counter = {} | |
| self.exploration_bonus_params = {} | |
| self.exploration_bonus_tanh = {} | |
| for i, bonus_type in enumerate(self.exploration_bonus_type): | |
| if bonus_type == "rnd": | |
| assert not self.episodic_exploration_bonus | |
| self.init_rnd_networks_and_optimizer() | |
| elif bonus_type == "ride": | |
| self.init_ride_networks_and_optimizer() | |
| elif bonus_type == "soc_inf": | |
| # npc actions | |
| self.fn_name_to_npc_prim_act = self.env.envs[0].npc_prim_actions_dict | |
| self.num_npc_prim_actions = len(self.fn_name_to_npc_prim_act) | |
| self.npc_utterance_to_id = {a: i for i, a in enumerate(self.env.envs[0].all_npc_utterance_actions)} | |
| self.num_npc_utterance_actions = len(self.npc_utterance_to_id) | |
| if self.utterance_moa_net: | |
| self.num_npc_all_actions = self.num_npc_prim_actions * self.num_npc_utterance_actions | |
| else: | |
| self.num_npc_all_actions = self.num_npc_prim_actions | |
| # construct possible agent_action's list | |
| self.all_possible_agent_actions, self.act_to_ind_dict = self.construct_all_possible_agent_actions() | |
| self.agent_actions_tiled_all = None | |
| im_shape = self.env.observation_space['image'].shape | |
| embedding_size = self.acmodel.semi_memory_size | |
| input_size = embedding_size \ | |
| + self.num_npc_prim_actions \ | |
| + self.acmodel.model_raw_action_space.nvec[0] \ | |
| + self.acmodel.model_raw_action_space.nvec[2] \ | |
| + self.acmodel.model_raw_action_space.nvec[3] | |
| if self.utterance_moa_net: | |
| input_size += self.num_npc_utterance_actions # todo: feed as index or as text? | |
| self.moa_net = LSTMMoaNet( | |
| input_size=input_size, | |
| num_npc_prim_actions=self.num_npc_prim_actions, | |
| num_npc_utterance_actions=self.num_npc_utterance_actions if self.utterance_moa_net else None, | |
| acmodel=self.acmodel, | |
| memory_dim=self.moa_memory_dim | |
| ).to(device=self.device) | |
| # memory | |
| assert shape == (self.num_frames_per_proc, self.num_procs) | |
| self.moa_memory = torch.zeros(shape[1], self.moa_net.memory_size, device=self.device) | |
| self.moa_memories = torch.zeros(*shape, self.moa_net.memory_size, device=self.device) | |
| elif bonus_type in ["cell", "grid", "lang"]: | |
| if self.episodic_exploration_bonus: | |
| self.visitation_counter[bonus_type] = [Counter() for _ in range(self.num_procs)] | |
| else: | |
| self.visitation_counter[bonus_type] = Counter() | |
| if exploration_bonus_params: | |
| self.exploration_bonus_params[bonus_type] = exploration_bonus_params[2*i:2*i+2] | |
| else: | |
| self.exploration_bonus_params[bonus_type] = (100, 50.) | |
| if exploration_bonus_tanh is None: | |
| self.exploration_bonus_tanh[bonus_type] = None | |
| else: | |
| self.exploration_bonus_tanh[bonus_type] = exploration_bonus_tanh[i] | |
| else: | |
| raise ValueError(f"bonus type: {bonus_type} unknown.") | |
| def load_status_dict(self, status): | |
| self.acmodel.load_state_dict(status["model_state"]) | |
| if hasattr(self.env, "curriculum") and self.env.curriculum is not None: | |
| self.env.curriculum.load_status_dict(status) | |
| self.env.broadcast_curriculum_parameters(self.env.curriculum.get_parameters()) | |
| # self.optimizer.load_state_dict(status["optimizer_state"]) | |
| if self.exploration_bonus: | |
| for i, bonus_type in enumerate(self.exploration_bonus_type): | |
| if bonus_type == "rnd": | |
| self.random_target_network.load_state_dict(status["random_target_network"]) | |
| self.predictor_network.load_state_dict(status["predictor_network"]) | |
| self.intrinsic_reward_optimizer.load_state_dict(status["intrinsic_reward_optimizer"]) | |
| elif bonus_type == "ride": | |
| self.forward_dynamics_model.load_state_dict(status["forward_dynamics_model"]) | |
| self.inverse_dynamics_model.load_state_dict(status["inverse_dynamics_model"]) | |
| self.state_embedding_model.load_state_dict(status["state_embedding_model"]) | |
| self.state_embedding_optimizer.load_state_dict(status["state_embedding_optimizer"]) | |
| self.inverse_dynamics_optimizer.load_state_dict(status["inverse_dynamics_optimizer"]) | |
| self.forward_dynamics_optimizer.load_state_dict(status["forward_dynamics_optimizer"]) | |
| elif bonus_type == "soc_inf": | |
| self.moa_net.load_state_dict(status["moa_net"]) | |
| def get_status_dict(self): | |
| algo_status_dict = { | |
| "model_state": self.acmodel.state_dict(), | |
| } | |
| if hasattr(self.env, "curriculum") and self.env.curriculum is not None: | |
| algo_status_dict = { | |
| **algo_status_dict, | |
| **self.env.curriculum.get_status_dict() | |
| } | |
| if self.exploration_bonus: | |
| for i, bonus_type in enumerate(self.exploration_bonus_type): | |
| if bonus_type == "rnd": | |
| algo_status_dict["random_target_network"] = self.random_target_network.state_dict() | |
| algo_status_dict["predictor_network"] = self.predictor_network.state_dict() | |
| algo_status_dict["intrinsic_reward_optimizer"] = self.intrinsic_reward_optimizer.state_dict() | |
| elif bonus_type == "ride": | |
| algo_status_dict["forward_dynamics_model"] = self.forward_dynamics_model.state_dict() | |
| algo_status_dict["inverse_dynamics_model"] = self.inverse_dynamics_model.state_dict() | |
| algo_status_dict["state_embedding_model"] = self.state_embedding_model.state_dict() | |
| algo_status_dict["state_embedding_optimizer"] = self.state_embedding_optimizer.state_dict() | |
| algo_status_dict["inverse_dynamics_optimizer"] = self.inverse_dynamics_optimizer.state_dict() | |
| algo_status_dict["forward_dynamics_optimizer"] = self.forward_dynamics_optimizer.state_dict() | |
| elif bonus_type == "soc_inf": | |
| algo_status_dict["moa_net"] = self.moa_net.state_dict() | |
| return algo_status_dict | |
| def construct_all_possible_agent_actions(self): | |
| if self.acmodel is None: | |
| raise ValueError("This should be called after the model has been set") | |
| # add non-speaking actions | |
| # a non-speaking action looks like (?, 0, 0, 0) | |
| # the last two zeros would normally mean the first template and first word, but here they are to be | |
| # ignored because of the second 0 (which means not to speak) | |
| non_speaking_action_subspace = (self.acmodel.model_raw_action_space.nvec[0], 1, 1, 1) | |
| non_speaking_actions = np.array(list(np.ndindex(non_speaking_action_subspace))) | |
| # add speaking actions | |
| speaking_action_subspace = ( | |
| self.acmodel.model_raw_action_space.nvec[0], | |
| 1, # one action, | |
| self.acmodel.model_raw_action_space.nvec[2], | |
| self.acmodel.model_raw_action_space.nvec[3], | |
| ) | |
| speaking_actions = np.array(list(np.ndindex(speaking_action_subspace))) | |
| speaking_actions = self.acmodel.no_speak_to_speak_action(speaking_actions) | |
| # all actions | |
| all_possible_agent_actions = np.concatenate([non_speaking_actions, speaking_actions]) | |
| # create the action -> index dict | |
| act_to_ind_dict = {tuple(act): ind for ind, act in enumerate(all_possible_agent_actions)} | |
| # map other non-speaking actions to the (?, 0, 0, 0) form, e.g. (3, 0, 4, 12) -> (3, 0, 0, 0) | |
| other_non_speaking_action_subspace = ( | |
| self.acmodel.model_raw_action_space.nvec[0], | |
| 1, | |
| self.acmodel.model_raw_action_space.nvec[2], | |
| self.acmodel.model_raw_action_space.nvec[3] | |
| ) | |
| for action in np.ndindex(other_non_speaking_action_subspace): | |
| assert action[1] == 0 # non-speaking | |
| act_to_ind_dict[tuple(action)] = act_to_ind_dict[(action[0], 0, 0, 0)] | |
| return all_possible_agent_actions, act_to_ind_dict | |
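| # Illustrative sketch of the structures returned above, assuming a hypothetical | |
| # model_raw_action_space.nvec of [6, 2, 4, 16]: the enumeration yields 6 | |
| # non-speaking actions (p, 0, 0, 0) followed by 6 * 4 * 16 speaking actions | |
| # (p, 1, template, word) (no_speak_to_speak_action presumably sets the speak | |
| # flag), and act_to_ind_dict additionally maps every (p, 0, t, w) to the index | |
| # of its canonical form (p, 0, 0, 0). | |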
| def step_to_n_frames(self, step): | |
| return step * self.num_frames_per_proc * self.num_procs | |
| def calculate_exploration_bonus(self, obs=None, done=None, prev_obs=None, info=None, prev_info=None, agent_actions=None, dist=None, | |
| i_step=None, embeddings=None): | |
| def state_hashes(observation, exploration_bonus_type): | |
| if exploration_bonus_type == "lang": | |
| hashes = [observation['utterance']] | |
| assert len(hashes) == 1 | |
| elif exploration_bonus_type == "cell": | |
| # for all new cells | |
| im = observation["image"] | |
| hashes = np.unique(im.reshape(-1, im.shape[-1]), axis=0) | |
| hashes = np.apply_along_axis(lambda a: a.data.tobytes(), 1, hashes) | |
| elif exploration_bonus_type == "grid": | |
| # for seeing new grid configurations | |
| im = observation["image"] | |
| hashes = [im.data.tobytes()] | |
| assert len(hashes) == 1 | |
| else: | |
| raise ValueError(f"Unknown exploration bonus type {bonus_type}") | |
| return hashes | |
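| # Intuition for the hash types above: "lang" hashes the full utterance string, | |
| # "grid" hashes the entire observed image (one hash per step), and "cell" | |
| # hashes every distinct cell encoding in the image (several hashes per step), | |
| # so "cell" rewards encountering new cell contents while "grid" rewards new | |
| # view configurations. | |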
| total_bonus = [0]*len(obs) | |
| for bonus_type in self.exploration_bonus_type: | |
| if bonus_type == "rnd": | |
| # -- [unroll_length x batch_size x height x width x channels] == [1, n_proc, 7, 7, 4] | |
| batch = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device) | |
| with torch.no_grad(): | |
| random_embedding = self.random_target_network(batch).reshape(len(obs), 128) | |
| predicted_embedding = self.predictor_network(batch).reshape(len(obs), 128) | |
| intrinsic_rewards = torch.norm(predicted_embedding.detach() - random_embedding.detach(), dim=1, p=2) | |
| intrinsic_reward_coef = self.intrinsic_reward_coef | |
| intrinsic_rewards *= intrinsic_reward_coef | |
| # is this the best way? should we somehow extract the next_state? | |
| bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)] | |
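| # RND (Burda et al., 2018): the bonus above is the L2 distance between the | |
| # embeddings of a fixed, randomly initialized target network and a trained | |
| # predictor network on the same observation, scaled by intrinsic_reward_coef | |
| # and zeroed on episode termination; rarely seen observations yield larger | |
| # prediction errors and hence larger bonuses. | |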
| elif bonus_type == "ride": | |
| with torch.no_grad(): | |
| _obs = torch.tensor(np.array([[o['image'] for o in prev_obs]])).to(self.device) | |
| _next_obs = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device) | |
| # counts - number of times a state was seen during the SAME episode -> can be computed here | |
| count_rewards = torch.tensor([1/np.sqrt(self.images_counter[p_i][np.array(o.to("cpu")).tobytes()]) for p_i, o in enumerate(_next_obs[0])]).to(self.device) | |
| assert not any(torch.isinf(count_rewards)) | |
| state_emb = self.state_embedding_model(_obs.to(device=self.device)).reshape(len(obs), 128) | |
| next_state_emb = self.state_embedding_model(_next_obs.to(device=self.device)).reshape(len(obs), 128) | |
| control_rewards = torch.norm(next_state_emb - state_emb, dim=1, p=2) | |
| intrinsic_rewards = self.intrinsic_reward_coef*(count_rewards * control_rewards) | |
| # is this the best way? should we somehow extract the next_state? | |
| bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)] | |
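| # RIDE (Raileanu & Rocktäschel, 2020): the bonus above multiplies an "impact" | |
| # term (the L2 distance between consecutive state embeddings) by an episodic | |
| # count term 1 / sqrt(N(s')), where N is the per-process image counter reset | |
| # at the end of each episode, so the agent is rewarded for actions that change | |
| # the state in ways rarely produced within the current episode. | |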
| elif bonus_type == "soc_inf": | |
| if prev_info == [{}] * len(prev_info): | |
| # this is the first step; info is not given during reset | |
| # on the first step of the episode, no influence can be estimated because there is no previous action | |
| # todo: pad with zeros, and estimate anyway? | |
| bonus = [0.0 for _ in done] | |
| else: | |
| # social influence | |
| n_procs = len(obs) | |
| _prev_NPC_prim_actions = torch.tensor( | |
| [self.fn_name_to_npc_prim_act[o["NPC_prim_action"]] for o in prev_info] | |
| ).to(self.device) | |
| # todo: what is the best way to feed utt action? | |
| _prev_NPC_utt_actions = torch.tensor( | |
| [self.npc_utterance_to_id[o["NPC_utterance"]] for o in prev_info] | |
| ).to(self.device) | |
| # new | |
| # calculate counterfactuals | |
| npc_previous_prim_actions_all = _prev_NPC_prim_actions.repeat(len(self.all_possible_agent_actions)) # [A_ag*n_procs, ...] | |
| npc_previous_utt_actions_all = _prev_NPC_utt_actions.repeat(len(self.all_possible_agent_actions)) # [A_ag*n_procs, ...] | |
| # agent actions tiled | |
| if self.agent_actions_tiled_all is not None: | |
| agent_actions_tiled_all = self.agent_actions_tiled_all | |
| else: | |
| # only done the first time; we can't do it in __init__ because we need len(obs) | |
| agent_actions_tiled_all = [] | |
| for pot_agent_action in self.all_possible_agent_actions: | |
| pot_agent_action_tiled = torch.from_numpy(np.tile(pot_agent_action, (n_procs, 1))) # [n_procs,...] | |
| agent_actions_tiled_all.append(pot_agent_action_tiled.to(self.device)) | |
| agent_actions_tiled_all = torch.concat(agent_actions_tiled_all) # [A_ag*n_procs,....] | |
| self.agent_actions_tiled_all = agent_actions_tiled_all | |
| with torch.no_grad(): | |
| # todo: move this tiling above? | |
| masked_memory = self.moa_memory * self.mask.unsqueeze(1) | |
| masked_memory_tiled_all = masked_memory.repeat([len(self.all_possible_agent_actions), 1]) | |
| embedding_tiled_all = embeddings.repeat([len(self.all_possible_agent_actions), 1]) | |
| # use current memory for every action | |
| counterfactuals_logits, moa_memory = self.moa_net( | |
| embeddings=embedding_tiled_all, | |
| # observations=observations_all, | |
| npc_previous_prim_actions=npc_previous_prim_actions_all, | |
| npc_previous_utterance_actions=npc_previous_utt_actions_all if self.utterance_moa_net else None, | |
| agent_actions=agent_actions_tiled_all, | |
| memory=masked_memory_tiled_all | |
| ) # logits : [A_ag * n_procs, A_npc] | |
| counterfactuals_logits = counterfactuals_logits.reshape( | |
| [len(self.all_possible_agent_actions), n_procs, self.num_npc_all_actions]) | |
| counterfactuals_logits = counterfactuals_logits.swapaxes(0, 1) # [n_procs, A_ag, A_npc] | |
| assert counterfactuals_logits.shape == (len(obs), len(self.all_possible_agent_actions), self.num_npc_all_actions) | |
| # compute npc logits p(A_npc|A_ag, s) | |
| # note: e.g. (5, 0, 5, 2) is mapped to (5, 0, 0, 0), todo: is this ok everywhere? | |
| agent_action_indices = [self.act_to_ind_dict[tuple(act.cpu().numpy())] for act in agent_actions] | |
| # ~ p(a_npc| a_ag, ...) | |
| predicted_logits = torch.stack([ctr[ind] for ctr, ind in zip(counterfactuals_logits, agent_action_indices)]) | |
| assert i_step is not None | |
| self.moa_memories[i_step] = self.moa_memory | |
| # only save for the actions actually taken | |
| self.moa_memory = moa_memory[agent_action_indices] | |
| assert predicted_logits.shape == (len(obs), self.num_npc_all_actions) | |
| predicted_probs = torch.softmax(predicted_logits, dim=1) # use exp_softmax or something? | |
| # compute marginal npc logits p(A_npc|s) = sum( p(A_NPC|A_ag,s), for every A_ag ) | |
| # compute agent logits for all possible agent actions | |
| per_non_speaking_action_log_probs = dist[0].logits + dist[1].logits[:, :1] | |
| per_speaking_action_log_probs = [] | |
| for p in range(n_procs): | |
| log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist] | |
| # speaking actions | |
| speaking_log_probs = log_probs_for_proc_p | |
| speaking_log_probs[1] = speaking_log_probs[1][1:] # only the speak action | |
| # sum everybody with everybody | |
| out = np.add.outer(speaking_log_probs[0], speaking_log_probs[1]).reshape(-1) | |
| out = np.add.outer(out, speaking_log_probs[2]).reshape(-1) | |
| out = np.add.outer(out, speaking_log_probs[3]).reshape(-1) | |
| per_speaking_action_log_probs_proc_p = out | |
| per_speaking_action_log_probs.append(per_speaking_action_log_probs_proc_p) | |
| per_speaking_action_log_probs = np.stack(per_speaking_action_log_probs) | |
| agent_log_probs = torch.concat([ | |
| per_non_speaking_action_log_probs, | |
| torch.tensor(per_speaking_action_log_probs).to(device=self.device), | |
| ], dim=1) | |
| # assert | |
| for p in range(n_procs): | |
| log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist] | |
| assert torch.abs(agent_log_probs[p][self.act_to_ind_dict[(0, 1, 3, 1)]] - sum([p[a] for p, a in list(zip(log_probs_for_proc_p, (0, 1, 3, 1)))])) < 1e-5 | |
| assert torch.abs(agent_log_probs[p][self.act_to_ind_dict[(0, 1, 1, 10)]] - sum([p[a] for p, a in list(zip(log_probs_for_proc_p, (0, 1, 1, 10)))])) < 1e-5 | |
| agent_probs = agent_log_probs.exp() | |
| counterfactuals_probs = counterfactuals_logits.softmax(dim=-1) # [n_procs, A_ag, A_npc] | |
| counterfactuals_perm = counterfactuals_probs.permute(0, 2, 1) # [n_procs, A_npc, A_agent] | |
| # compute marginal distributions | |
| marginals = (counterfactuals_perm * agent_probs[:, None, :]).sum(-1) | |
| # this already sums to one, so the following normalization is not needed | |
| marginal_probs = marginals / marginals.sum(1, keepdim=True) # sum over npc_actions | |
| assert marginal_probs.shape == (n_procs, self.num_npc_all_actions) # [batch, A_npc] | |
| KL_loss = (predicted_probs * (predicted_probs.log() - marginal_probs.log())).sum(axis=-1) | |
| intrinsic_rewards = self.intrinsic_reward_coef * KL_loss | |
| # whether the NPC is visible in the image fed as input at this step | |
| # (i.e. the observation returned by the previous step() call) | |
| NPC_observed = torch.tensor([pi["NPC_observed"] for pi in prev_info]).to(self.device) | |
| intrinsic_rewards = intrinsic_rewards * NPC_observed | |
| bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)] | |
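| # Social influence (Jaques et al., 2019): the bonus above is the KL divergence | |
| # KL( p(a_npc | a_agent, s) || p(a_npc | s) ), where the conditional comes from | |
| # the MOA network evaluated at the agent action actually taken and the marginal | |
| # is the counterfactual MOA predictions weighted by the agent's own action | |
| # probabilities; the reward is kept only when the NPC is observed and the | |
| # episode is not done. | |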
| elif bonus_type in ["cell", "grid", "lang"]: | |
| C, M = self.exploration_bonus_params[bonus_type] | |
| C_ = C / self.num_frames_per_proc | |
| if self.expert_exploration_bonus: | |
| # expert | |
| raise DeprecationWarning("Deprecated exploration bonus type") | |
| elif self.episodic_exploration_bonus: | |
| hashes = [state_hashes(o, bonus_type) for o in obs] | |
| bonus = [ | |
| 0 if d else # no bonus if done | |
| np.sum([ | |
| C_ / ((self.visitation_counter[bonus_type][i_p][h] + 1) ** M) for h in hs | |
| ]) | |
| for i_p, (hs, d) in enumerate(zip(hashes, done)) | |
| ] | |
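| # Worked example of the count-based bonus above, using the default | |
| # (C, M) = (100, 50.0) and a hypothetical num_frames_per_proc of 40: | |
| # C_ = 100 / 40 = 2.5, so an unseen hash contributes C_ / (0 + 1) ** M = 2.5, | |
| # while a hash already seen once contributes C_ / 2 ** 50 ~= 2e-15, i.e. | |
| # effectively nothing. | |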
| # update the counters | |
| for i_p, (o, d, hs) in enumerate(zip(obs, done, hashes)): | |
| if not d: | |
| for h in hs: | |
| self.visitation_counter[bonus_type][i_p][h] += 1 | |
| else: | |
| raise DeprecationWarning("Use episodic exploration bonus.") | |
| # non-episodic exploration bonus | |
| bonus = [ | |
| 0 if d else # no bonus if done | |
| np.sum([ | |
| C_ / ((self.visitation_counter[bonus_type][h] + 1) ** M) for h in state_hashes(o, bonus_type) | |
| ]) for o, d in zip(obs, done) | |
| ] | |
| # update the counters | |
| for o, d in zip(obs, done): | |
| if not d: | |
| for h in state_hashes(o, bonus_type): | |
| self.visitation_counter[bonus_type][h] += 1 | |
| if self.exploration_bonus_tanh[bonus_type] is not None: | |
| bonus = [np.tanh(b)*self.exploration_bonus_tanh[bonus_type] for b in bonus] | |
| else: | |
| raise ValueError(f"Unknown exploration bonus type {bonus_type}") | |
| assert len(total_bonus) == len(bonus) | |
| total_bonus = [tb+b for tb, b in zip(total_bonus, bonus)] | |
| return total_bonus | |
| def collect_experiences(self): | |
| """Collects rollouts and computes advantages. | |
| Runs several environments concurrently. The next actions are computed | |
| in a batch mode for all environments at the same time. The rollouts | |
| and advantages from all environments are concatenated together. | |
| Returns | |
| ------- | |
| exps : DictList | |
| Contains actions, rewards, advantages etc as attributes. | |
| Each attribute, e.g. `exps.reward` has a shape | |
| (self.num_frames_per_proc * num_envs, ...). k-th block | |
| of consecutive `self.num_frames_per_proc` frames contains | |
| data obtained from the k-th environment. Be careful not to mix | |
| data from different environments! | |
| logs : dict | |
| Useful stats about the training process, including the average | |
| reward, policy loss, value loss, etc. | |
| """ | |
| for i_step in range(self.num_frames_per_proc): | |
| # Do one agent-environment interaction | |
| preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) | |
| with torch.no_grad(): | |
| if self.acmodel.recurrent: | |
| dist, value, memory, policy_embedding = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1), return_embeddings=True) | |
| else: | |
| dist, value, policy_embedding = self.acmodel(preprocessed_obs, return_embeddings=True) | |
| action = self.acmodel.sample_action(dist) | |
| obs, reward, done, info = self.env.step( | |
| self.acmodel.construct_final_action( | |
| action.cpu().numpy() | |
| ) | |
| ) | |
| if hasattr(self.env, "curriculum") and self.env.curriculum is not None: | |
| curriculum_params = self.env.curriculum.update_parameters({ | |
| "obs": obs, | |
| "reward": reward, | |
| "done": done, | |
| "info": info, | |
| }) | |
| # broadcast new parameters to all parallel environments | |
| self.env.broadcast_curriculum_parameters(curriculum_params) | |
| if self.reset_rnd_ride_at_phase and curriculum_params['phase'] == 2 and not self.was_reset: | |
| self.was_reset = True | |
| assert not self.episodic_exploration_bonus | |
| for i, bonus_type in enumerate(self.exploration_bonus_type): | |
| if bonus_type == "rnd": | |
| self.init_rnd_networks_and_optimizer() | |
| elif bonus_type == "ride": | |
| self.init_ride_networks_and_optimizer() | |
| for p_i, o in enumerate(obs): | |
| self.images_counter[p_i][o['image'].tobytes()] += 1 | |
| extrinsic_reward = reward | |
| exploration_bonus = (0,) * len(reward) | |
| if self.exploration_bonus: | |
| bonus = self.calculate_exploration_bonus( | |
| obs=obs, done=done, prev_obs=self.obs, info=info, prev_info=self.info, agent_actions=action, dist=dist, | |
| i_step=i_step, embeddings=policy_embedding, | |
| ) | |
| exploration_bonus = bonus | |
| reward = [r + b for r, b in zip(reward, bonus)] | |
| if self.clipped_rewards: | |
| # clipping should not be used with classic count-based bonuses, because the | |
| # combined reward (extrinsic + bonus) often exceeds 1.0 | |
| reward = list(map(float, torch.clamp(torch.tensor(reward), -1, 1))) | |
| # Update experiences values | |
| self.obss[i_step] = self.obs | |
| self.obs = obs | |
| self.infos[i_step] = info # info of this step is the current info | |
| self.info = info # save as previous info | |
| if self.acmodel.recurrent: | |
| self.memories[i_step] = self.memory | |
| self.memory = memory | |
| self.masks[i_step] = self.mask | |
| self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float) | |
| self.actions[i_step] = action | |
| self.values[i_step] = value | |
| if self.reshape_reward is not None: | |
| self.rewards[i_step] = torch.tensor([ | |
| self.reshape_reward(obs_, action_, reward_, done_) | |
| for obs_, action_, reward_, done_ in zip(obs, action, reward, done) | |
| ], device=self.device) | |
| else: | |
| self.rewards[i_step] = torch.tensor(reward, device=self.device) | |
| self.log_probs[i_step] = self.acmodel.calculate_log_probs(dist, action) | |
| # Update log values | |
| self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) | |
| self.log_episode_extrinsic_return += torch.tensor(extrinsic_reward, device=self.device, dtype=torch.float) | |
| self.log_episode_exploration_bonus += torch.tensor(exploration_bonus, device=self.device, dtype=torch.float) | |
| self.log_episode_success_rate = torch.tensor([i["success"] for i in info]).float().to(self.device) | |
| self.log_episode_curriculum_mean_perf = torch.tensor([i.get("curriculum_info_max_mean_perf", 0) for i in info]).float().to(self.device) | |
| self.log_episode_reshaped_return += self.rewards[i_step] | |
| self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) | |
| self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_param", 0.0) for i in info]).float().to(self.device) | |
| # self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_mean_perf", 0.0) for i in info]).float().to(self.device) | |
| assert self.log_episode_curriculum_param.var() == 0 | |
| log_episode_NPC_introduced_to_current = np.array([i.get('NPC_was_introduced_to', False) for i in info]) | |
| assert all((self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current) == log_episode_NPC_introduced_to_current) | |
| self.log_episode_NPC_introduced_to = self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current | |
| self.log_episode_mission_string_observed += torch.tensor([ | |
| float(m in o.get("utterance", '')) | |
| for m, o in zip(self.env.get_mission(), self.obs) | |
| ], device=self.device, dtype=torch.float) | |
| for p, done_ in enumerate(done): | |
| if done_: | |
| self.log_mission_string_observed.append( | |
| torch.clamp(self.log_episode_mission_string_observed[p], 0, 1).item() | |
| ) | |
| self.log_done_counter += 1 | |
| self.log_return.append(self.log_episode_return[p].item()) | |
| self.log_extrinsic_return.append(self.log_episode_extrinsic_return[p].item()) | |
| self.log_exploration_bonus.append(self.log_episode_exploration_bonus[p].item()) | |
| self.log_success_rate.append(self.log_episode_success_rate[p].item()) | |
| self.log_curriculum_max_mean_perf.append(self.log_episode_curriculum_mean_perf[p].item()) | |
| self.log_reshaped_return.append(self.log_episode_reshaped_return[p].item()) | |
| self.log_num_frames.append(self.log_episode_num_frames[p].item()) | |
| self.log_curriculum_param.append(self.log_episode_curriculum_param[p].item()) | |
| if self.episodic_exploration_bonus: | |
| for v in self.visitation_counter.values(): | |
| v[p] = Counter() | |
| self.images_counter[p] = Counter() | |
| self.log_NPC_introduced_to.append(self.log_episode_NPC_introduced_to[p]) | |
| # print("log history:", self.log_success_rate) | |
| # print("log history len:", len(self.log_success_rate)-16) | |
| self.log_episode_mission_string_observed *= self.mask | |
| self.log_episode_return *= self.mask | |
| self.log_episode_extrinsic_return *= self.mask | |
| self.log_episode_exploration_bonus *= self.mask | |
| self.log_episode_success_rate *= self.mask | |
| self.log_episode_curriculum_mean_perf *= self.mask | |
| self.log_episode_reshaped_return *= self.mask | |
| self.log_episode_num_frames *= self.mask | |
| self.log_episode_NPC_introduced_to *= self.mask.cpu().numpy().astype(bool) | |
| self.log_episode_curriculum_param *= self.mask | |
| # Add advantage and return to experiences | |
| preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) | |
| with torch.no_grad(): | |
| if self.acmodel.recurrent: | |
| _, next_value, _ = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) | |
| else: | |
| _, next_value = self.acmodel(preprocessed_obs) | |
| for f in reversed(range(self.num_frames_per_proc)): | |
| next_mask = self.masks[f+1] if f < self.num_frames_per_proc - 1 else self.mask | |
| next_value = self.values[f+1] if f < self.num_frames_per_proc - 1 else next_value | |
| next_advantage = self.advantages[f+1] if f < self.num_frames_per_proc - 1 else 0 | |
| self.next_masks[f] = next_mask | |
| self.next_values[f] = next_value | |
| delta = self.rewards[f] + self.discount * next_value * next_mask - self.values[f] | |
| self.advantages[f] = delta + self.discount * self.gae_lambda * next_advantage * next_mask | |
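| # GAE recursion used above (Schulman et al., 2015): | |
| # delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t) | |
| # A_t = delta_t + gamma * lambda * mask_{t+1} * A_{t+1} | |
| # computed backwards over the rollout, with the bootstrap value of the last | |
| # observation used for the final step. | |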
| # Define experiences: | |
| # the whole experience is the concatenation of the experience | |
| # of each process. | |
| # In comments below: | |
| # - T is self.num_frames_per_proc, | |
| # - P is self.num_procs, | |
| # - D is the dimensionality. | |
| exps = DictList() | |
| exps.obs = [self.obss[f][p] | |
| for p in range(self.num_procs) | |
| for f in range(self.num_frames_per_proc)] | |
| exps.infos = np.array([self.infos[f][p] | |
| for p in range(self.num_procs) | |
| for f in range(self.num_frames_per_proc)]) | |
| # obs: (p1 (f1,f2,f3) ; p2 (f1,f2,f3); p3 (f1,f2,f3) | |
| if self.acmodel.recurrent: | |
| # T x P x D -> P x T x D -> (P * T) x D | |
| exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:]) | |
| # T x P -> P x T -> (P * T) x 1 | |
| exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1) | |
| exps.next_mask = self.next_masks.transpose(0, 1).reshape(-1).unsqueeze(1) | |
| if self.exploration_bonus and "soc_inf" in self.exploration_bonus_type: | |
| exps.moa_memory = self.moa_memories.transpose(0, 1).reshape(-1, *self.moa_memories.shape[2:]) | |
| # for all tensors below, T x P -> P x T -> P * T | |
| exps.action = self.actions.transpose(0, 1).reshape((-1, self.actions.shape[-1])) | |
| exps.log_prob = self.log_probs.transpose(0, 1).reshape((-1, self.actions.shape[-1])) | |
| exps.value = self.values.transpose(0, 1).reshape(-1) | |
| exps.next_value = self.next_values.transpose(0, 1).reshape(-1) | |
| exps.reward = self.rewards.transpose(0, 1).reshape(-1) | |
| exps.advantage = self.advantages.transpose(0, 1).reshape(-1) | |
| exps.returnn = exps.value + exps.advantage | |
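| # exps.returnn = V(s_t) + A_t is the lambda-return; subclasses typically use it | |
| # as the regression target for the value function. | |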
| # Preprocess experiences | |
| exps.obs = self.preprocess_obss(exps.obs, device=self.device) | |
| # Log some values | |
| keep = max(self.log_done_counter, self.num_procs) | |
| flat_actions = self.actions.reshape(-1, self.actions.shape[-1]) | |
| action_modalities = { | |
| "action_modality_{}".format(m): flat_actions[:, m].cpu().numpy() for m in range(self.actions.shape[-1]) | |
| } | |
| if not self.exploration_bonus: | |
| assert self.log_return == self.log_extrinsic_return | |
| logs = { | |
| "return_per_episode": self.log_return[-keep:], | |
| "mission_string_observed": self.log_mission_string_observed[-keep:], | |
| "extrinsic_return_per_episode": self.log_extrinsic_return[-keep:], | |
| "exploration_bonus_per_episode": self.log_exploration_bonus[-keep:], | |
| "success_rate_per_episode": self.log_success_rate[-keep:], | |
| "curriculum_max_mean_perf_per_episode": self.log_curriculum_max_mean_perf[-keep:], | |
| "curriculum_param_per_episode": self.log_curriculum_param[-keep:], | |
| "reshaped_return_per_episode": self.log_reshaped_return[-keep:], | |
| "num_frames_per_episode": self.log_num_frames[-keep:], | |
| "num_frames": self.num_frames, | |
| "NPC_introduced_to": self.log_NPC_introduced_to[-keep:], | |
| **action_modalities | |
| } | |
| self.log_done_counter = 0 | |
| self.log_return = self.log_return[-self.num_procs:] | |
| self.log_extrinsic_return = self.log_extrinsic_return[-self.num_procs:] | |
| self.log_exploration_bonus = self.log_exploration_bonus[-self.num_procs:] | |
| self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] | |
| self.log_num_frames = self.log_num_frames[-self.num_procs:] | |
| return exps, logs | |
| def compute_advantages_and_returnn(self, exps): | |
| """ | |
| This function can be used by algorithms that reuse old data (i.e. not purely online RL) | |
| to recompute non-episodic intrinsic rewards on old experience. | |
| This method is not used in PPO training. | |
| Example usage from update_parameters | |
| advs, retnn = self.compute_advantages_and_returnn(exps) | |
| # if you want to do a sanity check | |
| assert torch.equal(exps.advantage, advs) | |
| assert torch.equal(exps.returnn, retnn) | |
| exps.advantages, exps.returnn = advs, retnn | |
| """ | |
| shape = (self.num_frames_per_proc, self.num_procs) | |
| advs = torch.zeros(*shape, device=self.device) | |
| rewards = exps.reward.reshape(self.num_procs, self.num_frames_per_proc).transpose(0, 1) | |
| values = exps.value.reshape(self.num_procs, self.num_frames_per_proc).transpose(0, 1) | |
| next_values = exps.next_value.reshape(self.num_procs, self.num_frames_per_proc).transpose(0, 1) | |
| next_masks = exps.next_mask.reshape(self.num_procs, self.num_frames_per_proc).transpose(0, 1) | |
| for f in reversed(range(self.num_frames_per_proc)): | |
| next_advantage = advs[f+1] if f < self.num_frames_per_proc - 1 else 0 | |
| delta = rewards[f] + self.discount * next_values[f] * next_masks[f] - values[f] | |
| advs[f] = delta + self.discount * self.gae_lambda * next_advantage * next_masks[f] | |
| advantage = advs.transpose(0, 1).reshape(-1) | |
| returnn = exps.value + advantage | |
| return advantage, returnn | |
| def update_parameters(self): | |
| pass | |
| def init_rnd_networks_and_optimizer(self): | |
| self.random_target_network = MinigridStateEmbeddingNet(self.env.observation_space['image'].shape).to( | |
| device=self.device) | |
| self.predictor_network = MinigridStateEmbeddingNet(self.env.observation_space['image'].shape).to(device=self.device) | |
| self.intrinsic_reward_optimizer = torch.optim.RMSprop( | |
| self.predictor_network.parameters(), | |
| lr=self.intrinsic_reward_learning_rate, | |
| momentum=self.intrinsic_reward_momentum, | |
| eps=self.intrinsic_reward_epsilon, | |
| alpha=self.intrinsic_reward_alpha, | |
| ) | |
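| # Note: only the predictor network's parameters are given to the optimizer; | |
| # the random target network stays fixed at its random initialization, which is | |
| # what makes the prediction error a usable novelty signal. | |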
| def init_ride_networks_and_optimizer(self): | |
| self.state_embedding_model = MinigridStateEmbeddingNet(self.env.observation_space['image'].shape).to( | |
| device=self.device) | |
| # linguistic actions | |
| # n_actions = self.acmodel.model_raw_action_space.nvec.prod | |
| # we only use primitive actions for ride | |
| n_actions = self.acmodel.model_raw_action_space.nvec[0] | |
| self.forward_dynamics_model = MinigridForwardDynamicsNet(n_actions).to(device=self.device) | |
| self.inverse_dynamics_model = MinigridInverseDynamicsNet(n_actions).to(device=self.device) | |
| self.state_embedding_optimizer = torch.optim.RMSprop( | |
| self.state_embedding_model.parameters(), | |
| lr=self.intrinsic_reward_learning_rate, | |
| momentum=self.intrinsic_reward_momentum, | |
| eps=self.intrinsic_reward_epsilon, | |
| alpha=self.intrinsic_reward_alpha) | |
| self.inverse_dynamics_optimizer = torch.optim.RMSprop( | |
| self.inverse_dynamics_model.parameters(), | |
| lr=self.intrinsic_reward_learning_rate, | |
| momentum=self.intrinsic_reward_momentum, | |
| eps=self.intrinsic_reward_epsilon, | |
| alpha=self.intrinsic_reward_alpha) | |
| self.forward_dynamics_optimizer = torch.optim.RMSprop( | |
| self.forward_dynamics_model.parameters(), | |
| lr=self.intrinsic_reward_learning_rate, | |
| momentum=self.intrinsic_reward_momentum, | |
| eps=self.intrinsic_reward_epsilon, | |
| alpha=self.intrinsic_reward_alpha) | |