diff --git a/README.md b/README.md index 91cc9932b699be5e749ff01f100ed11675c290e6..845939ab3ac8b63a9d51613f0d934d0304e1ee86 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ model-index: results: - metrics: - type: mean_reward - value: 321.19 +/- 0.96 + value: 312.94 +/- 1.25 name: mean_reward task: type: reinforcement-learning @@ -23,17 +23,17 @@ model-index: This is a trained model of a **PPO** agent playing **BipedalWalker-v3** using the [/sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) repo. -All models trained at this commit can be found at https://api.wandb.ai/links/sgoodfriend/09frjfcs. +All models trained at this commit can be found at https://api.wandb.ai/links/sgoodfriend/7lx79bf0. ## Training Results -This model was trained from 3 trainings of **PPO** agents using different initial seeds. These agents were trained by checking out [2067e21](https://github.com/sgoodfriend/rl-algo-impls/tree/2067e21d62fff5db60168687e7d9e89019a8bfc0). The best and last models were kept from each training. This submission has loaded the best models from each training, reevaluates them, and selects the best model from these latest evaluations (mean - std). +This model was trained from 3 trainings of **PPO** agents using different initial seeds. These agents were trained by checking out [0511de3](https://github.com/sgoodfriend/rl-algo-impls/tree/0511de345b17175b7cf1ea706c3e05981f11761c). The best and last models were kept from each training. This submission has loaded the best models from each training, reevaluates them, and selects the best model from these latest evaluations (mean - std). 
| algo | env | seed | reward_mean | reward_std | eval_episodes | best | wandb_url | |:-------|:-----------------|-------:|--------------:|-------------:|----------------:|:-------|:-----------------------------------------------------------------------------| -| ppo | BipedalWalker-v3 | 1 | 321.191 | 0.963568 | 16 | * | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/0yrrmija) | -| ppo | BipedalWalker-v3 | 2 | 301.816 | 74.3355 | 16 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/6dkreb9q) | -| ppo | BipedalWalker-v3 | 3 | 312.156 | 2.09454 | 16 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/ftp9jqov) | +| ppo | BipedalWalker-v3 | 1 | 312.35 | 1.56202 | 16 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/ewm5f78f) | +| ppo | BipedalWalker-v3 | 2 | 279.439 | 89.8767 | 16 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/4kv820zs) | +| ppo | BipedalWalker-v3 | 3 | 312.939 | 1.24905 | 16 | * | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/whcjj38j) | ### Prerequisites: Weights & Biases (WandB) @@ -53,10 +53,10 @@ login`. Note: While the model state dictionary and hyperaparameters are saved, the latest implementation could be sufficiently different to not be able to reproduce similar results. You might need to checkout the commit the agent was trained on: -[2067e21](https://github.com/sgoodfriend/rl-algo-impls/tree/2067e21d62fff5db60168687e7d9e89019a8bfc0). +[0511de3](https://github.com/sgoodfriend/rl-algo-impls/tree/0511de345b17175b7cf1ea706c3e05981f11761c). ``` # Downloads the model, sets hyperparameters, and runs agent for 3 episodes -python enjoy.py --wandb-run-path=sgoodfriend/rl-algo-impls-benchmarks/0yrrmija +python enjoy.py --wandb-run-path=sgoodfriend/rl-algo-impls-benchmarks/whcjj38j ``` Setup hasn't been completely worked out yet, so you might be best served by using Google @@ -68,11 +68,11 @@ notebook. 
## Training If you want the highest chance to reproduce these results, you'll want to checkout the -commit the agent was trained on: [2067e21](https://github.com/sgoodfriend/rl-algo-impls/tree/2067e21d62fff5db60168687e7d9e89019a8bfc0). While +commit the agent was trained on: [0511de3](https://github.com/sgoodfriend/rl-algo-impls/tree/0511de345b17175b7cf1ea706c3e05981f11761c). While training is deterministic, different hardware will give different results. ``` -python train.py --algo ppo --env BipedalWalker-v3 --seed 1 +python train.py --algo ppo --env BipedalWalker-v3 --seed 3 ``` Setup hasn't been completely worked out yet, so you might be best served by using Google @@ -83,7 +83,7 @@ notebook. ## Benchmarking (with Lambda Labs instance) -This and other models from https://api.wandb.ai/links/sgoodfriend/09frjfcs were generated by running a script on a Lambda +This and other models from https://api.wandb.ai/links/sgoodfriend/7lx79bf0 were generated by running a script on a Lambda Labs instance. In a Lambda Labs instance terminal: ``` git clone git@github.com:sgoodfriend/rl-algo-impls.git @@ -105,6 +105,7 @@ can be used. However, this requires a Google Colab Pro+ subscription and running This isn't exactly the format of hyperparams in hyperparams/ppo.yml, but instead the Wandb Run Config. 
However, it's very close and has some additional data: ``` +additional_keys_to_log: [] algo: ppo algo_hyperparams: batch_size: 64 @@ -126,13 +127,15 @@ env_id: null eval_params: {} n_timesteps: 10000000 policy_hyperparams: {} -seed: 1 +seed: 3 use_deterministic_algorithms: true wandb_entity: null wandb_group: null wandb_project_name: rl-algo-impls-benchmarks wandb_tags: -- benchmark_2067e21 -- host_155-248-199-228 +- benchmark_0511de3 +- host_152-67-249-42 +- branch_main +- v0.0.8 ``` diff --git a/pyproject.toml b/pyproject.toml index eb996603ea40cffd2969ed65f2c3d2d1ab4516ea..dcfbed2c67b57c57c58f907284937021a4d716d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "rl_algo_impls" -version = "0.0.4" +version = "0.0.8" description = "Implementations of reinforcement learning algorithms" authors = [ {name = "Scott Goodfriend", email = "goodfriend.scott@gmail.com"}, @@ -35,6 +35,7 @@ dependencies = [ "dash", "kaleido", "PyYAML", + "scikit-learn", ] [tool.setuptools] @@ -55,10 +56,30 @@ procgen = [ "glfw >= 1.12.0, < 1.13", "procgen; platform_machine=='x86_64'", ] +microrts-old = [ + "numpy < 1.24.0", # Support for gym-microrts < 0.6.0 + "gym-microrts == 0.2.0", # Match ppo-implementation-details +] +microrts = [ + "numpy < 1.24.0", # Support for gym-microrts < 0.6.0 + "gym-microrts == 0.3.2", +] +jupyter = [ + "jupyter", + "notebook" +] +all = [ + "rl-algo-impls[test]", + "rl-algo-impls[procgen]", + "rl-algo-impls[microrts]", +] [project.urls] "Homepage" = "https://github.com/sgoodfriend/rl-algo-impls" [build-system] requires = ["setuptools==65.5.0", "setuptools-scm"] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" \ No newline at end of file diff --git a/replay.meta.json b/replay.meta.json index 9ae1e98da34f88686a3413142a15ec844f725a9f..8d98b5ec1ec41fa3e3311c2c5126baadaeca248d 100644 --- a/replay.meta.json +++ b/replay.meta.json @@ -1 +1 @@ 
-{"content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\\nbuilt with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\\nlibavutil 56. 31.100 / 56. 31.100\\nlibavcodec 58. 54.100 / 58. 54.100\\nlibavformat 58. 29.100 / 58. 29.100\\nlibavdevice 58. 8.100 / 58. 8.100\\nlibavfilter 7. 57.100 / 7. 57.100\\nlibavresample 4. 0. 0 / 4. 0. 0\\nlibswscale 5. 5.100 / 5. 5.100\\nlibswresample 3. 5.100 / 3. 5.100\\nlibpostproc 55. 5.100 / 55. 
5.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-f", "rawvideo", "-s:v", "600x400", "-pix_fmt", "rgb24", "-framerate", "50", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "-r", "50", "/tmp/tmp3sid2qvd/ppo-BipedalWalker-v3/replay.mp4"]}, "episode": {"r": 322.5491027832031, "l": 1093, "t": 11.553547}} \ No newline at end of file +{"content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\\nbuilt with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\\nlibavutil 56. 31.100 / 56. 31.100\\nlibavcodec 58. 54.100 / 58. 54.100\\nlibavformat 58. 29.100 / 58. 29.100\\nlibavdevice 58. 8.100 / 58. 
8.100\\nlibavfilter 7. 57.100 / 7. 57.100\\nlibavresample 4. 0. 0 / 4. 0. 0\\nlibswscale 5. 5.100 / 5. 5.100\\nlibswresample 3. 5.100 / 3. 5.100\\nlibpostproc 55. 5.100 / 55. 5.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-f", "rawvideo", "-s:v", "600x400", "-pix_fmt", "rgb24", "-framerate", "50", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "-r", "50", "/tmp/tmpnqz5cfob/ppo-BipedalWalker-v3/replay.mp4"]}, "episode": {"r": 310.78118896484375, "l": 1384, "t": 14.46221}} \ No newline at end of file diff --git a/replay.mp4 b/replay.mp4 index 80f7cd462e721b8a649ff2fa2f03966b938dca45..ecc746c899d2247db3bec5d98b5dd48a0b432136 100644 Binary files a/replay.mp4 and b/replay.mp4 differ diff --git a/rl_algo_impls/a2c/a2c.py b/rl_algo_impls/a2c/a2c.py index a5075c31b3a2366ebf299cc74f22691c4c2e66f3..18e77c5845c9fd6149611ed9bc15cc451b1e3cd9 100644 --- a/rl_algo_impls/a2c/a2c.py +++ b/rl_algo_impls/a2c/a2c.py @@ -10,6 +10,7 @@ from typing import Optional, TypeVar from rl_algo_impls.shared.algorithm import Algorithm from rl_algo_impls.shared.callbacks.callback import Callback +from rl_algo_impls.shared.gae import compute_advantages from rl_algo_impls.shared.policy.on_policy import ActorCritic from rl_algo_impls.shared.schedule import schedule, update_learning_rate from rl_algo_impls.shared.stats import log_scalars @@ -84,12 +85,12 @@ class A2C(Algorithm): obs = np.zeros(epoch_dim + obs_space.shape, dtype=obs_space.dtype) actions = np.zeros(epoch_dim + act_space.shape, dtype=act_space.dtype) rewards = np.zeros(epoch_dim, dtype=np.float32) - episode_starts = np.zeros(epoch_dim, dtype=np.byte) + episode_starts = np.zeros(epoch_dim, dtype=np.bool8) values = np.zeros(epoch_dim, dtype=np.float32) logprobs = np.zeros(epoch_dim, dtype=np.float32) next_obs = self.env.reset() - next_episode_starts = np.ones(step_dim, dtype=np.byte) + next_episode_starts = np.full(step_dim, True, dtype=np.bool8) 
timesteps_elapsed = start_timesteps while timesteps_elapsed < start_timesteps + train_timesteps: @@ -126,23 +127,16 @@ class A2C(Algorithm): clamped_action ) - advantages = np.zeros(epoch_dim, dtype=np.float32) - last_gae_lam = 0 - for t in reversed(range(self.n_steps)): - if t == self.n_steps - 1: - next_nonterminal = 1.0 - next_episode_starts - next_value = self.policy.value(next_obs) - else: - next_nonterminal = 1.0 - episode_starts[t + 1] - next_value = values[t + 1] - delta = ( - rewards[t] + self.gamma * next_value * next_nonterminal - values[t] - ) - last_gae_lam = ( - delta - + self.gamma * self.gae_lambda * next_nonterminal * last_gae_lam - ) - advantages[t] = last_gae_lam + advantages = compute_advantages( + rewards, + values, + episode_starts, + next_episode_starts, + next_obs, + self.policy, + self.gamma, + self.gae_lambda, + ) returns = advantages + values b_obs = torch.tensor(obs.reshape((-1,) + obs_space.shape)).to(self.device) diff --git a/rl_algo_impls/a2c/optimize.py b/rl_algo_impls/a2c/optimize.py index baf9299ef141ea7c9ae49b0a0c707fed4ac14bc6..cd3cb807f8de22634dab26b1d525a484b32ae7d5 100644 --- a/rl_algo_impls/a2c/optimize.py +++ b/rl_algo_impls/a2c/optimize.py @@ -3,7 +3,7 @@ import optuna from copy import deepcopy from rl_algo_impls.runner.config import Config, Hyperparams, EnvHyperparams -from rl_algo_impls.runner.env import make_eval_env +from rl_algo_impls.shared.vec_env import make_eval_env from rl_algo_impls.shared.policy.optimize_on_policy import sample_on_policy_hyperparams from rl_algo_impls.tuning.optimize_env import sample_env_hyperparams diff --git a/rl_algo_impls/benchmark_publish.py b/rl_algo_impls/benchmark_publish.py index 2e1045f01c316068f9192db791ae6cc10b617df6..1d2010efab9a48121f4b79cc7192ac6a7e5524b0 100644 --- a/rl_algo_impls/benchmark_publish.py +++ b/rl_algo_impls/benchmark_publish.py @@ -54,8 +54,8 @@ def benchmark_publish() -> None: "--virtual-display", action="store_true", help="Use headless virtual display" ) # 
parser.set_defaults( - # wandb_tags=["benchmark_2067e21", "host_155-248-199-228"], - # wandb_report_url="https://api.wandb.ai/links/sgoodfriend/09frjfcs", + # wandb_tags=["benchmark_e47a44c", "host_129-146-2-230"], + # wandb_report_url="https://api.wandb.ai/links/sgoodfriend/v4wd7cp5", # envs=[], # exclude_envs=[], # ) diff --git a/rl_algo_impls/compare_runs.py b/rl_algo_impls/compare_runs.py index 18d1341f62eeb6a54ab79a31d76c21148cdf1458..d6f6e8b763c41fc6d39fff90ae9b27c6e14fb39d 100644 --- a/rl_algo_impls/compare_runs.py +++ b/rl_algo_impls/compare_runs.py @@ -194,5 +194,6 @@ def compare_runs() -> None: df.loc["mean"] = df.mean(numeric_only=True) print(df.to_markdown()) + if __name__ == "__main__": - compare_runs() \ No newline at end of file + compare_runs() diff --git a/rl_algo_impls/dqn/policy.py b/rl_algo_impls/dqn/policy.py index b7189c107b3882d785b94b30501ef36b8123ae38..bc8bcf0cb8a6ca51dd5fbd1b54af09eceb886a96 100644 --- a/rl_algo_impls/dqn/policy.py +++ b/rl_algo_impls/dqn/policy.py @@ -1,16 +1,16 @@ -import numpy as np import os -import torch - from typing import Optional, Sequence, TypeVar +import numpy as np +import torch + from rl_algo_impls.dqn.q_net import QNetwork from rl_algo_impls.shared.policy.policy import Policy from rl_algo_impls.wrappers.vectorable_wrapper import ( VecEnv, VecEnvObs, - single_observation_space, single_action_space, + single_observation_space, ) DQNPolicySelf = TypeVar("DQNPolicySelf", bound="DQNPolicy") @@ -21,7 +21,7 @@ class DQNPolicy(Policy): self, env: VecEnv, hidden_sizes: Sequence[int] = [], - cnn_feature_dim: int = 512, + cnn_flatten_dim: int = 512, cnn_style: str = "nature", cnn_layers_init_orthogonal: Optional[bool] = None, impala_channels: Sequence[int] = (16, 32, 32), @@ -32,16 +32,23 @@ class DQNPolicy(Policy): single_observation_space(env), single_action_space(env), hidden_sizes, - cnn_feature_dim=cnn_feature_dim, + cnn_flatten_dim=cnn_flatten_dim, cnn_style=cnn_style, 
cnn_layers_init_orthogonal=cnn_layers_init_orthogonal, impala_channels=impala_channels, ) def act( - self, obs: VecEnvObs, eps: float = 0, deterministic: bool = True + self, + obs: VecEnvObs, + eps: float = 0, + deterministic: bool = True, + action_masks: Optional[np.ndarray] = None, ) -> np.ndarray: assert eps == 0 if deterministic else eps >= 0 + assert ( + action_masks is None + ), f"action_masks not currently supported in {self.__class__.__name__}" if not deterministic and np.random.random() < eps: return np.array( [ diff --git a/rl_algo_impls/dqn/q_net.py b/rl_algo_impls/dqn/q_net.py index 1e9233547f60b7eabdf58633afcd067cdb2ca345..4b2f556483673c5a428c6820f156cc63fce6a3f6 100644 --- a/rl_algo_impls/dqn/q_net.py +++ b/rl_algo_impls/dqn/q_net.py @@ -1,11 +1,11 @@ +from typing import Optional, Sequence, Type + import gym import torch as th import torch.nn as nn - from gym.spaces import Discrete -from typing import Optional, Sequence, Type -from rl_algo_impls.shared.module.feature_extractor import FeatureExtractor +from rl_algo_impls.shared.encoder import Encoder from rl_algo_impls.shared.module.module import mlp @@ -16,17 +16,17 @@ class QNetwork(nn.Module): action_space: gym.Space, hidden_sizes: Sequence[int] = [], activation: Type[nn.Module] = nn.ReLU, # Used by stable-baselines3 - cnn_feature_dim: int = 512, + cnn_flatten_dim: int = 512, cnn_style: str = "nature", cnn_layers_init_orthogonal: Optional[bool] = None, impala_channels: Sequence[int] = (16, 32, 32), ) -> None: super().__init__() assert isinstance(action_space, Discrete) - self._feature_extractor = FeatureExtractor( + self._feature_extractor = Encoder( observation_space, activation, - cnn_feature_dim=cnn_feature_dim, + cnn_flatten_dim=cnn_flatten_dim, cnn_style=cnn_style, cnn_layers_init_orthogonal=cnn_layers_init_orthogonal, impala_channels=impala_channels, diff --git a/rl_algo_impls/huggingface_publish.py b/rl_algo_impls/huggingface_publish.py index 
afe1d57345bfc831044f65f331f1031310fc3dfe..c89a4eecde5d7043c43477312b9dc743a591f126 100644 --- a/rl_algo_impls/huggingface_publish.py +++ b/rl_algo_impls/huggingface_publish.py @@ -19,7 +19,7 @@ from pyvirtualdisplay.display import Display from rl_algo_impls.publish.markdown_format import EvalTableData, model_card_text from rl_algo_impls.runner.config import EnvHyperparams from rl_algo_impls.runner.evaluate import EvalArgs, evaluate_model -from rl_algo_impls.runner.env import make_eval_env +from rl_algo_impls.shared.vec_env import make_eval_env from rl_algo_impls.shared.callbacks.eval_callback import evaluate from rl_algo_impls.wrappers.vec_episode_recorder import VecEpisodeRecorder diff --git a/rl_algo_impls/hyperparams/a2c.yml b/rl_algo_impls/hyperparams/a2c.yml index 217892eb3e72cb326e327ea7061c262899bdce2c..a15f29300f71cfb7cf21419d444fc7df15c76092 100644 --- a/rl_algo_impls/hyperparams/a2c.yml +++ b/rl_algo_impls/hyperparams/a2c.yml @@ -97,31 +97,35 @@ Walker2DBulletEnv-v0: HopperBulletEnv-v0: <<: *pybullet-defaults +# Tuned CarRacing-v0: n_timesteps: !!float 4e6 env_hyperparams: - n_envs: 8 + n_envs: 16 frame_stack: 4 normalize: true normalize_kwargs: norm_obs: false norm_reward: true policy_hyperparams: - use_sde: true - log_std_init: -2 - init_layers_orthogonal: false - activation_fn: relu + use_sde: false + log_std_init: -1.3502584927786276 + init_layers_orthogonal: true + activation_fn: tanh share_features_extractor: false - cnn_feature_dim: 256 + cnn_flatten_dim: 256 hidden_sizes: [256] algo_hyperparams: - n_steps: 512 - learning_rate: !!float 1.62e-5 - gamma: 0.997 - gae_lambda: 0.975 - ent_coef: 0 - sde_sample_freq: 128 - vf_coef: 0.64 + n_steps: 16 + learning_rate: 0.000025630993245026736 + learning_rate_decay: linear + gamma: 0.99957617037542 + gae_lambda: 0.949455676599436 + ent_coef: !!float 1.707983205298309e-7 + vf_coef: 0.10428178193833336 + max_grad_norm: 0.5406643389792273 + normalize_advantage: true + use_rms_prop: false _atari: &atari-defaults 
n_timesteps: !!float 1e7 diff --git a/rl_algo_impls/hyperparams/dqn.yml b/rl_algo_impls/hyperparams/dqn.yml index 66003a67dd8c7865fd9ea269f6fc84b5d95fb428..4274deaf842e186873808dfb1f3da8da4d36440e 100644 --- a/rl_algo_impls/hyperparams/dqn.yml +++ b/rl_algo_impls/hyperparams/dqn.yml @@ -108,7 +108,7 @@ _impala-atari: &impala-atari-defaults <<: *atari-defaults policy_hyperparams: cnn_style: impala - cnn_feature_dim: 256 + cnn_flatten_dim: 256 init_layers_orthogonal: true cnn_layers_init_orthogonal: false diff --git a/rl_algo_impls/hyperparams/ppo.yml b/rl_algo_impls/hyperparams/ppo.yml index 0136fc46b019f9722aaad54bfe93f36ed88f4bc8..ec533e646a89540a958f8e57225f942ba6db875a 100644 --- a/rl_algo_impls/hyperparams/ppo.yml +++ b/rl_algo_impls/hyperparams/ppo.yml @@ -112,7 +112,7 @@ CarRacing-v0: &carracing-defaults init_layers_orthogonal: false activation_fn: relu share_features_extractor: false - cnn_feature_dim: 256 + cnn_flatten_dim: 256 hidden_sizes: [256] algo_hyperparams: n_steps: 512 @@ -152,7 +152,7 @@ _atari: &atari-defaults vec_env_class: async policy_hyperparams: &atari-policy-defaults activation_fn: relu - algo_hyperparams: + algo_hyperparams: &atari-algo-defaults n_steps: 128 batch_size: 256 n_epochs: 4 @@ -192,7 +192,7 @@ _impala-atari: &impala-atari-defaults policy_hyperparams: <<: *atari-policy-defaults cnn_style: impala - cnn_feature_dim: 256 + cnn_flatten_dim: 256 init_layers_orthogonal: true cnn_layers_init_orthogonal: false @@ -212,6 +212,126 @@ impala-QbertNoFrameskip-v4: <<: *impala-atari-defaults env_id: QbertNoFrameskip-v4 +_microrts: &microrts-defaults + <<: *atari-defaults + n_timesteps: !!float 2e6 + env_hyperparams: &microrts-env-defaults + n_envs: 8 + vec_env_class: sync + mask_actions: true + policy_hyperparams: &microrts-policy-defaults + <<: *atari-policy-defaults + cnn_style: microrts + cnn_flatten_dim: 128 + algo_hyperparams: &microrts-algo-defaults + <<: *atari-algo-defaults + clip_range_decay: none + clip_range_vf: 0.1 + ppo2_vf_coef_halving: true + 
eval_params: + deterministic: false # Good idea because MultiCategorical mode isn't great + +_no-mask-microrts: &no-mask-microrts-defaults + <<: *microrts-defaults + env_hyperparams: + <<: *microrts-env-defaults + mask_actions: false + +MicrortsMining-v1-NoMask: + <<: *no-mask-microrts-defaults + env_id: MicrortsMining-v1 + +MicrortsAttackShapedReward-v1-NoMask: + <<: *no-mask-microrts-defaults + env_id: MicrortsAttackShapedReward-v1 + +MicrortsRandomEnemyShapedReward3-v1-NoMask: + <<: *no-mask-microrts-defaults + env_id: MicrortsRandomEnemyShapedReward3-v1 + +_microrts_ai: &microrts-ai-defaults + <<: *microrts-defaults + n_timesteps: !!float 100e6 + additional_keys_to_log: ["microrts_stats"] + env_hyperparams: &microrts-ai-env-defaults + n_envs: 24 + env_type: microrts + make_kwargs: + num_selfplay_envs: 0 + max_steps: 2000 + render_theme: 2 + map_path: maps/16x16/basesWorkers16x16.xml + reward_weight: [10.0, 1.0, 1.0, 0.2, 1.0, 4.0] + policy_hyperparams: &microrts-ai-policy-defaults + <<: *microrts-policy-defaults + cnn_flatten_dim: 256 + actor_head_style: gridnet + algo_hyperparams: &microrts-ai-algo-defaults + <<: *microrts-algo-defaults + learning_rate: !!float 2.5e-4 + learning_rate_decay: linear + n_steps: 512 + batch_size: 3072 + n_epochs: 4 + ent_coef: 0.01 + vf_coef: 0.5 + max_grad_norm: 0.5 + clip_range: 0.1 + clip_range_vf: 0.1 + +MicrortsAttackPassiveEnemySparseReward-v3: + <<: *microrts-ai-defaults + n_timesteps: !!float 2e6 + env_id: MicrortsAttackPassiveEnemySparseReward-v3 # Workaround to keep model name simple + env_hyperparams: + <<: *microrts-ai-env-defaults + bots: + passiveAI: 24 + +MicrortsDefeatRandomEnemySparseReward-v3: &microrts-random-ai-defaults + <<: *microrts-ai-defaults + n_timesteps: !!float 2e6 + env_id: MicrortsDefeatRandomEnemySparseReward-v3 # Workaround to keep model name simple + env_hyperparams: + <<: *microrts-ai-env-defaults + bots: + randomBiasedAI: 24 + +enc-dec-MicrortsDefeatRandomEnemySparseReward-v3: + <<: *microrts-random-ai-defaults + 
policy_hyperparams: + <<: *microrts-ai-policy-defaults + cnn_style: gridnet_encoder + actor_head_style: gridnet_decoder + v_hidden_sizes: [128] + +MicrortsDefeatCoacAIShaped-v3: &microrts-coacai-defaults + <<: *microrts-ai-defaults + env_id: MicrortsDefeatCoacAIShaped-v3 # Workaround to keep model name simple + n_timesteps: !!float 300e6 + env_hyperparams: &microrts-coacai-env-defaults + <<: *microrts-ai-env-defaults + bots: + coacAI: 24 + +MicrortsDefeatCoacAIShaped-v3-diverseBots: &microrts-diverse-defaults + <<: *microrts-coacai-defaults + env_hyperparams: + <<: *microrts-coacai-env-defaults + bots: + coacAI: 18 + randomBiasedAI: 2 + lightRushAI: 2 + workerRushAI: 2 + +enc-dec-MicrortsDefeatCoacAIShaped-v3-diverseBots: + <<: *microrts-diverse-defaults + policy_hyperparams: + <<: *microrts-ai-policy-defaults + cnn_style: gridnet_encoder + actor_head_style: gridnet_decoder + v_hidden_sizes: [128] + HalfCheetahBulletEnv-v0: &pybullet-defaults n_timesteps: !!float 2e6 env_hyperparams: &pybullet-env-defaults @@ -282,7 +402,7 @@ _procgen: &procgen-defaults policy_hyperparams: &procgen-policy-defaults activation_fn: relu cnn_style: impala - cnn_feature_dim: 256 + cnn_flatten_dim: 256 init_layers_orthogonal: true cnn_layers_init_orthogonal: false algo_hyperparams: &procgen-algo-defaults @@ -368,7 +488,7 @@ procgen-starpilot-hard-2xIMPALA-fat: policy_hyperparams: <<: *procgen-policy-defaults impala_channels: [32, 64, 64] - cnn_feature_dim: 512 + cnn_flatten_dim: 512 algo_hyperparams: <<: *procgen-hard-algo-defaults learning_rate: !!float 2.5e-4 diff --git a/rl_algo_impls/hyperparams/vpg.yml b/rl_algo_impls/hyperparams/vpg.yml index e472a9226b830c127f044718672d6d0c9e8c83dc..0193dc1ae5e791edf2f72bb40cd4a8d143c29b79 100644 --- a/rl_algo_impls/hyperparams/vpg.yml +++ b/rl_algo_impls/hyperparams/vpg.yml @@ -110,7 +110,7 @@ CarRacing-v0: log_std_init: -2 init_layers_orthogonal: false activation_fn: relu - cnn_feature_dim: 256 + cnn_flatten_dim: 256 hidden_sizes: [256] algo_hyperparams: 
n_steps: 1000 @@ -175,9 +175,9 @@ FrozenLake-v1: save_best: true _atari: &atari-defaults - n_timesteps: !!float 25e6 + n_timesteps: !!float 10e6 env_hyperparams: - n_envs: 4 + n_envs: 2 frame_stack: 4 no_reward_timeout_steps: 1000 no_reward_fire_steps: 500 @@ -185,7 +185,7 @@ _atari: &atari-defaults policy_hyperparams: activation_fn: relu algo_hyperparams: - n_steps: 2048 + n_steps: 3072 pi_lr: !!float 5e-5 gamma: 0.99 gae_lambda: 0.95 diff --git a/rl_algo_impls/optimize.py b/rl_algo_impls/optimize.py index 1078a9bea61fd49f728d25b7ece323202ad31104..6ea2a57b9c8f36405161c726f8cb1f582313a48f 100644 --- a/rl_algo_impls/optimize.py +++ b/rl_algo_impls/optimize.py @@ -17,7 +17,7 @@ from typing import Callable, List, NamedTuple, Optional, Sequence, Union from rl_algo_impls.a2c.optimize import sample_params as a2c_sample_params from rl_algo_impls.runner.config import Config, EnvHyperparams, RunArgs -from rl_algo_impls.runner.env import make_env, make_eval_env +from rl_algo_impls.shared.vec_env import make_env, make_eval_env from rl_algo_impls.runner.running_utils import ( base_parser, load_hyperparams, @@ -194,7 +194,7 @@ def simple_optimize(trial: optuna.Trial, args: RunArgs, study_args: StudyArgs) - env = make_env( config, EnvHyperparams(**config.env_hyperparams), tb_writer=tb_writer ) - device = get_device(config.device, env) + device = get_device(config, env) policy = make_policy(args.algo, env, device, **config.policy_hyperparams) algo = ALGOS[args.algo](policy, env, device, tb_writer, **config.algo_hyperparams) @@ -274,7 +274,7 @@ def stepwise_optimize( project=study_args.wandb_project_name, entity=study_args.wandb_entity, config=asdict(hyperparams), - name=f"{study_args.study_name}-{str(trial.number)}", + name=f"{str(trial.number)}-S{base_config.seed()}", tags=study_args.wandb_tags, group=study_args.wandb_group, save_code=True, @@ -298,7 +298,7 @@ def stepwise_optimize( normalize_load_path=config.model_dir_path() if i > 0 else None, tb_writer=tb_writer, ) - device = 
get_device(config.device, env) + device = get_device(config, env) policy = make_policy(arg.algo, env, device, **config.policy_hyperparams) if i > 0: policy.load(config.model_dir_path()) @@ -433,6 +433,7 @@ def optimize() -> None: fig1 = plot_optimization_history(study) fig1.write_image("opt_history.png") + fig2 = plot_param_importances(study) fig2.write_image("param_importances.png") diff --git a/rl_algo_impls/ppo/ppo.py b/rl_algo_impls/ppo/ppo.py index f1a5850ebe02ed1d946cd399b49e64375c0a6089..cfa5975c52b725b3b5cc8046e6dac4a17ac844af 100644 --- a/rl_algo_impls/ppo/ppo.py +++ b/rl_algo_impls/ppo/ppo.py @@ -1,59 +1,26 @@ +import logging +from dataclasses import asdict, dataclass +from time import perf_counter +from typing import List, NamedTuple, Optional, TypeVar + import numpy as np import torch import torch.nn as nn - -from dataclasses import asdict, dataclass, field -from time import perf_counter from torch.optim import Adam from torch.utils.tensorboard.writer import SummaryWriter -from typing import List, Optional, NamedTuple, TypeVar from rl_algo_impls.shared.algorithm import Algorithm from rl_algo_impls.shared.callbacks.callback import Callback -from rl_algo_impls.shared.gae import compute_advantage, compute_rtg_and_advantage +from rl_algo_impls.shared.gae import compute_advantages from rl_algo_impls.shared.policy.on_policy import ActorCritic -from rl_algo_impls.shared.schedule import ( - constant_schedule, - linear_schedule, - update_learning_rate, +from rl_algo_impls.shared.schedule import schedule, update_learning_rate +from rl_algo_impls.shared.stats import log_scalars +from rl_algo_impls.wrappers.action_mask_wrapper import find_action_masker +from rl_algo_impls.wrappers.vectorable_wrapper import ( + VecEnv, + single_action_space, + single_observation_space, ) -from rl_algo_impls.shared.trajectory import Trajectory, TrajectoryAccumulator -from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv, VecEnvObs - - -@dataclass -class 
PPOTrajectory(Trajectory): - logp_a: List[float] = field(default_factory=list) - - def add( - self, - obs: np.ndarray, - act: np.ndarray, - next_obs: np.ndarray, - rew: float, - terminated: bool, - v: float, - logp_a: float, - ): - super().add(obs, act, next_obs, rew, terminated, v) - self.logp_a.append(logp_a) - - -class PPOTrajectoryAccumulator(TrajectoryAccumulator): - def __init__(self, num_envs: int) -> None: - super().__init__(num_envs, PPOTrajectory) - - def step( - self, - obs: VecEnvObs, - action: np.ndarray, - next_obs: VecEnvObs, - reward: np.ndarray, - done: np.ndarray, - val: np.ndarray, - logp_a: np.ndarray, - ) -> None: - super().step(obs, action, next_obs, reward, done, val, logp_a) class TrainStepStats(NamedTuple): @@ -132,39 +99,31 @@ class PPO(Algorithm): vf_coef: float = 0.5, ppo2_vf_coef_halving: bool = False, max_grad_norm: float = 0.5, - update_rtg_between_epochs: bool = False, sde_sample_freq: int = -1, + update_advantage_between_epochs: bool = True, + update_returns_between_epochs: bool = False, ) -> None: super().__init__(policy, env, device, tb_writer) self.policy = policy + self.action_masker = find_action_masker(env) self.gamma = gamma self.gae_lambda = gae_lambda self.optimizer = Adam(self.policy.parameters(), lr=learning_rate, eps=1e-7) - self.lr_schedule = ( - linear_schedule(learning_rate, 0) - if learning_rate_decay == "linear" - else constant_schedule(learning_rate) - ) + self.lr_schedule = schedule(learning_rate_decay, learning_rate) self.max_grad_norm = max_grad_norm - self.clip_range_schedule = ( - linear_schedule(clip_range, 0) - if clip_range_decay == "linear" - else constant_schedule(clip_range) - ) + self.clip_range_schedule = schedule(clip_range_decay, clip_range) self.clip_range_vf_schedule = None if clip_range_vf: - self.clip_range_vf_schedule = ( - linear_schedule(clip_range_vf, 0) - if clip_range_vf_decay == "linear" - else constant_schedule(clip_range_vf) - ) + self.clip_range_vf_schedule = 
schedule(clip_range_vf_decay, clip_range_vf) + + if normalize_advantage: + assert ( + env.num_envs * n_steps > 1 and batch_size > 1 + ), f"Each minibatch must be larger than 1 to support normalization" self.normalize_advantage = normalize_advantage - self.ent_coef_schedule = ( - linear_schedule(ent_coef, 0) - if ent_coef_decay == "linear" - else constant_schedule(ent_coef) - ) + + self.ent_coef_schedule = schedule(ent_coef_decay, ent_coef) self.vf_coef = vf_coef self.ppo2_vf_coef_halving = ppo2_vf_coef_halving @@ -173,181 +132,243 @@ class PPO(Algorithm): self.n_epochs = n_epochs self.sde_sample_freq = sde_sample_freq - self.update_rtg_between_epochs = update_rtg_between_epochs + self.update_advantage_between_epochs = update_advantage_between_epochs + self.update_returns_between_epochs = update_returns_between_epochs def learn( self: PPOSelf, - total_timesteps: int, + train_timesteps: int, callback: Optional[Callback] = None, + total_timesteps: Optional[int] = None, + start_timesteps: int = 0, ) -> PPOSelf: - obs = self.env.reset() - ts_elapsed = 0 - while ts_elapsed < total_timesteps: - start_time = perf_counter() - accumulator = self._collect_trajectories(obs) - rollout_steps = self.n_steps * self.env.num_envs - ts_elapsed += rollout_steps - progress = ts_elapsed / total_timesteps - train_stats = self.train(accumulator.all_trajectories, progress, ts_elapsed) - train_stats.write_to_tensorboard(self.tb_writer, ts_elapsed) - end_time = perf_counter() - self.tb_writer.add_scalar( - "train/steps_per_second", - rollout_steps / (end_time - start_time), - ts_elapsed, + if total_timesteps is None: + total_timesteps = train_timesteps + assert start_timesteps + train_timesteps <= total_timesteps + + epoch_dim = (self.n_steps, self.env.num_envs) + step_dim = (self.env.num_envs,) + obs_space = single_observation_space(self.env) + act_space = single_action_space(self.env) + act_shape = self.policy.action_shape + + next_obs = self.env.reset() + next_action_masks = ( + 
self.action_masker.action_masks() if self.action_masker else None + ) + next_episode_starts = np.full(step_dim, True, dtype=np.bool8) + + obs = np.zeros(epoch_dim + obs_space.shape, dtype=obs_space.dtype) # type: ignore + actions = np.zeros(epoch_dim + act_shape, dtype=act_space.dtype) # type: ignore + rewards = np.zeros(epoch_dim, dtype=np.float32) + episode_starts = np.zeros(epoch_dim, dtype=np.bool8) + values = np.zeros(epoch_dim, dtype=np.float32) + logprobs = np.zeros(epoch_dim, dtype=np.float32) + action_masks = ( + np.zeros( + (self.n_steps,) + next_action_masks.shape, dtype=next_action_masks.dtype ) - if callback: - callback.on_step(timesteps_elapsed=rollout_steps) - - return self - - def _collect_trajectories(self, obs: VecEnvObs) -> PPOTrajectoryAccumulator: - self.policy.eval() - accumulator = PPOTrajectoryAccumulator(self.env.num_envs) - self.policy.reset_noise() - for i in range(self.n_steps): - if self.sde_sample_freq > 0 and i > 0 and i % self.sde_sample_freq == 0: - self.policy.reset_noise() - action, value, logp_a, clamped_action = self.policy.step(obs) - next_obs, reward, done, _ = self.env.step(clamped_action) - accumulator.step(obs, action, next_obs, reward, done, value, logp_a) - obs = next_obs - return accumulator - - def train( - self, trajectories: List[PPOTrajectory], progress: float, timesteps_elapsed: int - ) -> TrainStats: - self.policy.train() - learning_rate = self.lr_schedule(progress) - update_learning_rate(self.optimizer, learning_rate) - self.tb_writer.add_scalar( - "charts/learning_rate", - self.optimizer.param_groups[0]["lr"], - timesteps_elapsed, + if next_action_masks is not None + else None ) - pi_clip = self.clip_range_schedule(progress) - self.tb_writer.add_scalar("charts/pi_clip", pi_clip, timesteps_elapsed) - if self.clip_range_vf_schedule: - v_clip = self.clip_range_vf_schedule(progress) - self.tb_writer.add_scalar("charts/v_clip", v_clip, timesteps_elapsed) - else: - v_clip = None - ent_coef = 
self.ent_coef_schedule(progress) - self.tb_writer.add_scalar("charts/ent_coef", ent_coef, timesteps_elapsed) - - obs = torch.as_tensor( - np.concatenate([np.array(t.obs) for t in trajectories]), device=self.device - ) - act = torch.as_tensor( - np.concatenate([np.array(t.act) for t in trajectories]), device=self.device - ) - rtg, adv = compute_rtg_and_advantage( - trajectories, self.policy, self.gamma, self.gae_lambda, self.device - ) - orig_v = torch.as_tensor( - np.concatenate([np.array(t.v) for t in trajectories]), device=self.device - ) - orig_logp_a = torch.as_tensor( - np.concatenate([np.array(t.logp_a) for t in trajectories]), - device=self.device, - ) + timesteps_elapsed = start_timesteps + while timesteps_elapsed < start_timesteps + train_timesteps: + start_time = perf_counter() - step_stats = [] - for _ in range(self.n_epochs): - step_stats.clear() - if self.update_rtg_between_epochs: - rtg, adv = compute_rtg_and_advantage( - trajectories, self.policy, self.gamma, self.gae_lambda, self.device - ) + progress = timesteps_elapsed / total_timesteps + ent_coef = self.ent_coef_schedule(progress) + learning_rate = self.lr_schedule(progress) + update_learning_rate(self.optimizer, learning_rate) + pi_clip = self.clip_range_schedule(progress) + chart_scalars = { + "learning_rate": self.optimizer.param_groups[0]["lr"], + "ent_coef": ent_coef, + "pi_clip": pi_clip, + } + if self.clip_range_vf_schedule: + v_clip = self.clip_range_vf_schedule(progress) + chart_scalars["v_clip"] = v_clip else: - adv = compute_advantage( - trajectories, self.policy, self.gamma, self.gae_lambda, self.device + v_clip = None + log_scalars(self.tb_writer, "charts", chart_scalars, timesteps_elapsed) + + self.policy.eval() + self.policy.reset_noise() + for s in range(self.n_steps): + timesteps_elapsed += self.env.num_envs + if self.sde_sample_freq > 0 and s > 0 and s % self.sde_sample_freq == 0: + self.policy.reset_noise() + + obs[s] = next_obs + episode_starts[s] = next_episode_starts + if 
action_masks is not None: + action_masks[s] = next_action_masks + + ( + actions[s], + values[s], + logprobs[s], + clamped_action, + ) = self.policy.step(next_obs, action_masks=next_action_masks) + next_obs, rewards[s], next_episode_starts, _ = self.env.step( + clamped_action ) - idxs = torch.randperm(len(obs)) - for i in range(0, len(obs), self.batch_size): - mb_idxs = idxs[i : i + self.batch_size] - mb_adv = adv[mb_idxs] - if self.normalize_advantage: - mb_adv = (mb_adv - mb_adv.mean(-1)) / (mb_adv.std(-1) + 1e-8) - self.policy.reset_noise(self.batch_size) - step_stats.append( - self._train_step( - pi_clip, - v_clip, - ent_coef, - obs[mb_idxs], - act[mb_idxs], - rtg[mb_idxs], - mb_adv, - orig_v[mb_idxs], - orig_logp_a[mb_idxs], - ) + next_action_masks = ( + self.action_masker.action_masks() if self.action_masker else None ) - y_pred, y_true = orig_v.cpu().numpy(), rtg.cpu().numpy() - var_y = np.var(y_true).item() - explained_var = ( - np.nan if var_y == 0 else 1 - np.var(y_true - y_pred).item() / var_y - ) + self.policy.train() + + b_obs = torch.tensor(obs.reshape((-1,) + obs_space.shape)).to(self.device) # type: ignore + b_actions = torch.tensor(actions.reshape((-1,) + act_shape)).to( # type: ignore + self.device + ) + b_logprobs = torch.tensor(logprobs.reshape(-1)).to(self.device) + b_action_masks = ( + torch.tensor(action_masks.reshape((-1,) + next_action_masks.shape[1:])).to( # type: ignore + self.device + ) + if action_masks is not None + else None + ) + + y_pred = values.reshape(-1) + b_values = torch.tensor(y_pred).to(self.device) + + step_stats = [] + # Define variables that will definitely be set through the first epoch + advantages: np.ndarray = None # type: ignore + b_advantages: torch.Tensor = None # type: ignore + y_true: np.ndarray = None # type: ignore + b_returns: torch.Tensor = None # type: ignore + for e in range(self.n_epochs): + if e == 0 or self.update_advantage_between_epochs: + advantages = compute_advantages( + rewards, + values, + 
episode_starts, + next_episode_starts, + next_obs, + self.policy, + self.gamma, + self.gae_lambda, + ) + b_advantages = torch.tensor(advantages.reshape(-1)).to(self.device) + if e == 0 or self.update_returns_between_epochs: + returns = advantages + values + y_true = returns.reshape(-1) + b_returns = torch.tensor(y_true).to(self.device) + + b_idxs = torch.randperm(len(b_obs)) + # Only record last epoch's stats + step_stats.clear() + for i in range(0, len(b_obs), self.batch_size): + self.policy.reset_noise(self.batch_size) + + mb_idxs = b_idxs[i : i + self.batch_size] + + mb_obs = b_obs[mb_idxs] + mb_actions = b_actions[mb_idxs] + mb_values = b_values[mb_idxs] + mb_logprobs = b_logprobs[mb_idxs] + mb_action_masks = ( + b_action_masks[mb_idxs] if b_action_masks is not None else None + ) - return TrainStats(step_stats, explained_var) + mb_adv = b_advantages[mb_idxs] + if self.normalize_advantage: + mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8) + mb_returns = b_returns[mb_idxs] - def _train_step( - self, - pi_clip: float, - v_clip: Optional[float], - ent_coef: float, - obs: torch.Tensor, - act: torch.Tensor, - rtg: torch.Tensor, - adv: torch.Tensor, - orig_v: torch.Tensor, - orig_logp_a: torch.Tensor, - ) -> TrainStepStats: - logp_a, entropy, v = self.policy(obs, act) - logratio = logp_a - orig_logp_a - ratio = torch.exp(logratio) - clip_ratio = torch.clamp(ratio, min=1 - pi_clip, max=1 + pi_clip) - pi_loss = torch.maximum(-ratio * adv, -clip_ratio * adv).mean() - - v_loss_unclipped = (v - rtg) ** 2 - if v_clip: - v_loss_clipped = ( - orig_v + torch.clamp(v - orig_v, -v_clip, v_clip) - rtg - ) ** 2 - v_loss = torch.max(v_loss_unclipped, v_loss_clipped).mean() - else: - v_loss = v_loss_unclipped.mean() - if self.ppo2_vf_coef_halving: - v_loss *= 0.5 - - entropy_loss = -entropy.mean() - - loss = pi_loss + ent_coef * entropy_loss + self.vf_coef * v_loss - - self.optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(self.policy.parameters(), 
self.max_grad_norm) - self.optimizer.step() - - with torch.no_grad(): - approx_kl = ((ratio - 1) - logratio).mean().cpu().numpy().item() - clipped_frac = ( - ((ratio - 1).abs() > pi_clip).float().mean().cpu().numpy().item() + new_logprobs, entropy, new_values = self.policy( + mb_obs, mb_actions, action_masks=mb_action_masks + ) + + logratio = new_logprobs - mb_logprobs + ratio = torch.exp(logratio) + clipped_ratio = torch.clamp(ratio, min=1 - pi_clip, max=1 + pi_clip) + pi_loss = torch.max(-ratio * mb_adv, -clipped_ratio * mb_adv).mean() + + v_loss_unclipped = (new_values - mb_returns) ** 2 + if v_clip: + v_loss_clipped = ( + mb_values + + torch.clamp(new_values - mb_values, -v_clip, v_clip) + - mb_returns + ) ** 2 + v_loss = torch.max(v_loss_unclipped, v_loss_clipped).mean() + else: + v_loss = v_loss_unclipped.mean() + + if self.ppo2_vf_coef_halving: + v_loss *= 0.5 + + entropy_loss = -entropy.mean() + + loss = pi_loss + ent_coef * entropy_loss + self.vf_coef * v_loss + + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_( + self.policy.parameters(), self.max_grad_norm + ) + self.optimizer.step() + + with torch.no_grad(): + approx_kl = ((ratio - 1) - logratio).mean().cpu().numpy().item() + clipped_frac = ( + ((ratio - 1).abs() > pi_clip) + .float() + .mean() + .cpu() + .numpy() + .item() + ) + val_clipped_frac = ( + ((new_values - mb_values).abs() > v_clip) + .float() + .mean() + .cpu() + .numpy() + .item() + if v_clip + else 0 + ) + + step_stats.append( + TrainStepStats( + loss.item(), + pi_loss.item(), + v_loss.item(), + entropy_loss.item(), + approx_kl, + clipped_frac, + val_clipped_frac, + ) + ) + + var_y = np.var(y_true).item() + explained_var = ( + np.nan if var_y == 0 else 1 - np.var(y_true - y_pred).item() / var_y ) - val_clipped_frac = ( - (((v - orig_v).abs() > v_clip).float().mean().cpu().numpy().item()) - if v_clip - else 0 + TrainStats(step_stats, explained_var).write_to_tensorboard( + self.tb_writer, timesteps_elapsed ) - return 
TrainStepStats( - loss.item(), - pi_loss.item(), - v_loss.item(), - entropy_loss.item(), - approx_kl, - clipped_frac, - val_clipped_frac, - ) + end_time = perf_counter() + rollout_steps = self.n_steps * self.env.num_envs + self.tb_writer.add_scalar( + "train/steps_per_second", + rollout_steps / (end_time - start_time), + timesteps_elapsed, + ) + + if callback: + if not callback.on_step(timesteps_elapsed=rollout_steps): + logging.info( + f"Callback terminated training at {timesteps_elapsed} timesteps" + ) + break + + return self diff --git a/rl_algo_impls/runner/config.py b/rl_algo_impls/runner/config.py index 565913e716c537208e7353a13b70cff23ed0eb09..d92758eaae6929e319038444bf10846ba001ad95 100644 --- a/rl_algo_impls/runner/config.py +++ b/rl_algo_impls/runner/config.py @@ -2,12 +2,10 @@ import dataclasses import inspect import itertools import os - -from datetime import datetime from dataclasses import dataclass +from datetime import datetime from typing import Any, Dict, List, Optional, Type, TypeVar, Union - RunArgsSelf = TypeVar("RunArgsSelf", bound="RunArgs") @@ -50,6 +48,9 @@ class EnvHyperparams: video_step_interval: Union[int, float] = 1_000_000 initial_steps_to_truncate: Optional[int] = None clip_atari_rewards: bool = True + normalize_type: Optional[str] = None + mask_actions: bool = False + bots: Optional[Dict[str, int]] = None HyperparamsSelf = TypeVar("HyperparamsSelf", bound="Hyperparams") @@ -64,6 +65,7 @@ class Hyperparams: algo_hyperparams: Dict[str, Any] = dataclasses.field(default_factory=dict) eval_params: Dict[str, Any] = dataclasses.field(default_factory=dict) env_id: Optional[str] = None + additional_keys_to_log: List[str] = dataclasses.field(default_factory=list) @classmethod def from_dict_with_extra_fields( @@ -119,6 +121,10 @@ class Config: def env_id(self) -> str: return self.hyperparams.env_id or self.args.env + @property + def additional_keys_to_log(self) -> List[str]: + return self.hyperparams.additional_keys_to_log + def 
model_name(self, include_seed: bool = True) -> str: # Use arg env name instead of environment name parts = [self.algo, self.args.env] diff --git a/rl_algo_impls/runner/evaluate.py b/rl_algo_impls/runner/evaluate.py index ad85bcac91ba9cfef99ff43b67d71b223c4e7051..41eb34bb183bc451be44a14ce770a32be196d51e 100644 --- a/rl_algo_impls/runner/evaluate.py +++ b/rl_algo_impls/runner/evaluate.py @@ -4,7 +4,7 @@ import shutil from dataclasses import dataclass from typing import NamedTuple, Optional -from rl_algo_impls.runner.env import make_eval_env +from rl_algo_impls.shared.vec_env import make_eval_env from rl_algo_impls.runner.config import Config, EnvHyperparams, Hyperparams, RunArgs from rl_algo_impls.runner.running_utils import ( load_hyperparams, @@ -75,7 +75,7 @@ def evaluate_model(args: EvalArgs, root_dir: str) -> Evaluation: render=args.render, normalize_load_path=model_path, ) - device = get_device(config.device, env) + device = get_device(config, env) policy = make_policy( args.algo, env, diff --git a/rl_algo_impls/runner/running_utils.py b/rl_algo_impls/runner/running_utils.py index ee1ef1304c175e34f69b205f2bf2b139eafc16f4..9a872448708c6435c223589b0ad94f2ba35f8c29 100644 --- a/rl_algo_impls/runner/running_utils.py +++ b/rl_algo_impls/runner/running_utils.py @@ -1,32 +1,32 @@ import argparse -import gym import json -import matplotlib.pyplot as plt -import numpy as np import os import random +from dataclasses import asdict +from pathlib import Path +from typing import Dict, Optional, Type, Union + +import gym +import matplotlib.pyplot as plt +import numpy as np import torch import torch.backends.cudnn import yaml - -from dataclasses import asdict from gym.spaces import Box, Discrete -from pathlib import Path from torch.utils.tensorboard.writer import SummaryWriter -from typing import Dict, Optional, Type, Union - -from rl_algo_impls.runner.config import Hyperparams -from rl_algo_impls.shared.algorithm import Algorithm -from 
rl_algo_impls.shared.callbacks.eval_callback import EvalCallback -from rl_algo_impls.shared.policy.on_policy import ActorCritic -from rl_algo_impls.shared.policy.policy import Policy from rl_algo_impls.a2c.a2c import A2C from rl_algo_impls.dqn.dqn import DQN from rl_algo_impls.dqn.policy import DQNPolicy from rl_algo_impls.ppo.ppo import PPO -from rl_algo_impls.vpg.vpg import VanillaPolicyGradient +from rl_algo_impls.runner.config import Config, Hyperparams +from rl_algo_impls.shared.algorithm import Algorithm +from rl_algo_impls.shared.callbacks.eval_callback import EvalCallback +from rl_algo_impls.shared.policy.on_policy import ActorCritic +from rl_algo_impls.shared.policy.policy import Policy +from rl_algo_impls.shared.vec_env.utils import import_for_env_id, is_microrts from rl_algo_impls.vpg.policy import VPGActorCritic +from rl_algo_impls.vpg.vpg import VanillaPolicyGradient from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv, single_observation_space ALGOS: Dict[str, Type[Algorithm]] = { @@ -81,16 +81,19 @@ def load_hyperparams(algo: str, env_id: str) -> Hyperparams: if env_id in hyperparams_dict: return Hyperparams(**hyperparams_dict[env_id]) - if "BulletEnv" in env_id: - import pybullet_envs + import_for_env_id(env_id) spec = gym.spec(env_id) - if "AtariEnv" in str(spec.entry_point) and "_atari" in hyperparams_dict: + entry_point_name = str(spec.entry_point) # type: ignore + if "AtariEnv" in entry_point_name and "_atari" in hyperparams_dict: return Hyperparams(**hyperparams_dict["_atari"]) + elif "gym_microrts" in entry_point_name and "_microrts" in hyperparams_dict: + return Hyperparams(**hyperparams_dict["_microrts"]) else: raise ValueError(f"{env_id} not specified in {algo} hyperparameters file") -def get_device(device: str, env: VecEnv) -> torch.device: +def get_device(config: Config, env: VecEnv) -> torch.device: + device = config.device # cuda by default if device == "auto": device = "cuda" @@ -108,6 +111,16 @@ def get_device(device: str, 
env: VecEnv) -> torch.device: device = "cpu" elif isinstance(obs_space, Box) and len(obs_space.shape) == 1: device = "cpu" + if is_microrts(config): + try: + from gym_microrts.envs.vec_env import MicroRTSGridModeVecEnv + + # Models that move more than one unit at a time should use mps + if not isinstance(env.unwrapped, MicroRTSGridModeVecEnv): + device = "cpu" + except ModuleNotFoundError: + # Likely on gym_microrts v0.0.2 to match ppo-implementation-details + device = "cpu" print(f"Device: {device}") return torch.device(device) @@ -187,6 +200,8 @@ def hparam_dict( flattened[key] = str(sv) else: flattened[key] = sv + elif isinstance(v, list): + flattened[k] = json.dumps(v) else: flattened[k] = v # type: ignore return flattened # type: ignore diff --git a/rl_algo_impls/runner/train.py b/rl_algo_impls/runner/train.py index 359117263e09ba9d4cf4859fe96901eaf85f49a5..eb7b94bc24c18855656f1caeb723565d7a84764e 100644 --- a/rl_algo_impls/runner/train.py +++ b/rl_algo_impls/runner/train.py @@ -5,26 +5,26 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" import dataclasses import shutil -import wandb -import yaml - from dataclasses import asdict, dataclass -from torch.utils.tensorboard.writer import SummaryWriter from typing import Any, Dict, Optional, Sequence -from rl_algo_impls.shared.callbacks.eval_callback import EvalCallback +import yaml +from torch.utils.tensorboard.writer import SummaryWriter + +import wandb from rl_algo_impls.runner.config import Config, EnvHyperparams, RunArgs -from rl_algo_impls.runner.env import make_env, make_eval_env from rl_algo_impls.runner.running_utils import ( ALGOS, - load_hyperparams, - set_seeds, get_device, + hparam_dict, + load_hyperparams, make_policy, plot_eval_callback, - hparam_dict, + set_seeds, ) +from rl_algo_impls.shared.callbacks.eval_callback import EvalCallback from rl_algo_impls.shared.stats import EpisodesStats +from rl_algo_impls.shared.vec_env import make_env, make_eval_env @dataclass @@ -65,7 +65,7 @@ def train(args: 
TrainArgs): env = make_env( config, EnvHyperparams(**config.env_hyperparams), tb_writer=tb_writer ) - device = get_device(config.device, env) + device = get_device(config, env) policy = make_policy(args.algo, env, device, **config.policy_hyperparams) algo = ALGOS[args.algo](policy, env, device, tb_writer, **config.algo_hyperparams) @@ -94,6 +94,7 @@ def train(args: TrainArgs): if record_best_videos else None, best_video_dir=config.best_videos_dir, + additional_keys_to_log=config.additional_keys_to_log, ) algo.learn(config.n_timesteps, callback=callback) diff --git a/rl_algo_impls/shared/actor/__init__.py b/rl_algo_impls/shared/actor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b59b46ff97bdca05c14145e79b8e361c47eeec --- /dev/null +++ b/rl_algo_impls/shared/actor/__init__.py @@ -0,0 +1,2 @@ +from rl_algo_impls.shared.actor.actor import Actor, PiForward +from rl_algo_impls.shared.actor.make_actor import actor_head diff --git a/rl_algo_impls/shared/actor/actor.py b/rl_algo_impls/shared/actor/actor.py new file mode 100644 index 0000000000000000000000000000000000000000..2da077a0175a080dcb85af67029bae57d1553393 --- /dev/null +++ b/rl_algo_impls/shared/actor/actor.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod +from typing import NamedTuple, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +from torch.distributions import Distribution + + +class PiForward(NamedTuple): + pi: Distribution + logp_a: Optional[torch.Tensor] + entropy: Optional[torch.Tensor] + + +class Actor(nn.Module, ABC): + @abstractmethod + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + ... + + def sample_weights(self, batch_size: int = 1) -> None: + pass + + @property + @abstractmethod + def action_shape(self) -> Tuple[int, ...]: + ... 
+ + def pi_forward( + self, distribution: Distribution, actions: Optional[torch.Tensor] = None + ) -> PiForward: + logp_a = None + entropy = None + if actions is not None: + logp_a = distribution.log_prob(actions) + entropy = distribution.entropy() + return PiForward(distribution, logp_a, entropy) diff --git a/rl_algo_impls/shared/actor/categorical.py b/rl_algo_impls/shared/actor/categorical.py new file mode 100644 index 0000000000000000000000000000000000000000..6392ead5472148894f550b9c1208bb3670551db1 --- /dev/null +++ b/rl_algo_impls/shared/actor/categorical.py @@ -0,0 +1,64 @@ +from typing import Optional, Tuple, Type + +import torch +import torch.nn as nn +from torch.distributions import Categorical + +from rl_algo_impls.shared.actor import Actor, PiForward +from rl_algo_impls.shared.module.module import mlp + + +class MaskedCategorical(Categorical): + def __init__( + self, + probs=None, + logits=None, + validate_args=None, + mask: Optional[torch.Tensor] = None, + ): + if mask is not None: + assert logits is not None, "mask requires logits and not probs" + logits = torch.where(mask, logits, -1e8) + self.mask = mask + super().__init__(probs, logits, validate_args) + + def entropy(self) -> torch.Tensor: + if self.mask is None: + return super().entropy() + # If mask set, then use approximation for entropy + p_log_p = self.logits * self.probs # type: ignore + masked = torch.where(self.mask, p_log_p, 0) + return -masked.sum(-1) + + +class CategoricalActorHead(Actor): + def __init__( + self, + act_dim: int, + in_dim: int, + hidden_sizes: Tuple[int, ...] 
= (32,), + activation: Type[nn.Module] = nn.Tanh, + init_layers_orthogonal: bool = True, + ) -> None: + super().__init__() + layer_sizes = (in_dim,) + hidden_sizes + (act_dim,) + self._fc = mlp( + layer_sizes, + activation, + init_layers_orthogonal=init_layers_orthogonal, + final_layer_gain=0.01, + ) + + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + logits = self._fc(obs) + pi = MaskedCategorical(logits=logits, mask=action_masks) + return self.pi_forward(pi, actions) + + @property + def action_shape(self) -> Tuple[int, ...]: + return () diff --git a/rl_algo_impls/shared/actor/gaussian.py b/rl_algo_impls/shared/actor/gaussian.py new file mode 100644 index 0000000000000000000000000000000000000000..3867477ed7009442eb8d465c51b3420d97c99342 --- /dev/null +++ b/rl_algo_impls/shared/actor/gaussian.py @@ -0,0 +1,61 @@ +from typing import Optional, Tuple, Type + +import torch +import torch.nn as nn +from torch.distributions import Distribution, Normal + +from rl_algo_impls.shared.actor.actor import Actor, PiForward +from rl_algo_impls.shared.module.module import mlp + + +class GaussianDistribution(Normal): + def log_prob(self, a: torch.Tensor) -> torch.Tensor: + return super().log_prob(a).sum(axis=-1) + + def sample(self) -> torch.Tensor: + return self.rsample() + + +class GaussianActorHead(Actor): + def __init__( + self, + act_dim: int, + in_dim: int, + hidden_sizes: Tuple[int, ...] 
= (32,), + activation: Type[nn.Module] = nn.Tanh, + init_layers_orthogonal: bool = True, + log_std_init: float = -0.5, + ) -> None: + super().__init__() + self.act_dim = act_dim + layer_sizes = (in_dim,) + hidden_sizes + (act_dim,) + self.mu_net = mlp( + layer_sizes, + activation, + init_layers_orthogonal=init_layers_orthogonal, + final_layer_gain=0.01, + ) + self.log_std = nn.Parameter( + torch.ones(act_dim, dtype=torch.float32) * log_std_init + ) + + def _distribution(self, obs: torch.Tensor) -> Distribution: + mu = self.mu_net(obs) + std = torch.exp(self.log_std) + return GaussianDistribution(mu, std) + + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + assert ( + not action_masks + ), f"{self.__class__.__name__} does not support action_masks" + pi = self._distribution(obs) + return self.pi_forward(pi, actions) + + @property + def action_shape(self) -> Tuple[int, ...]: + return (self.act_dim,) diff --git a/rl_algo_impls/shared/actor/gridnet.py b/rl_algo_impls/shared/actor/gridnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a6746428ccd9be1156de6f613b7ee365e9e01cfd --- /dev/null +++ b/rl_algo_impls/shared/actor/gridnet.py @@ -0,0 +1,108 @@ +from typing import Dict, Optional, Tuple, Type + +import numpy as np +import torch +import torch.nn as nn +from numpy.typing import NDArray +from torch.distributions import Distribution, constraints + +from rl_algo_impls.shared.actor import Actor, PiForward +from rl_algo_impls.shared.actor.categorical import MaskedCategorical +from rl_algo_impls.shared.encoder import EncoderOutDim +from rl_algo_impls.shared.module.module import mlp + + +class GridnetDistribution(Distribution): + def __init__( + self, + map_size: int, + action_vec: NDArray[np.int64], + logits: torch.Tensor, + masks: torch.Tensor, + validate_args: Optional[bool] = None, + ) -> None: + self.map_size = map_size + self.action_vec = 
action_vec + + masks = masks.view(-1, masks.shape[-1]) + split_masks = torch.split(masks[:, 1:], action_vec.tolist(), dim=1) + + grid_logits = logits.reshape(-1, action_vec.sum()) + split_logits = torch.split(grid_logits, action_vec.tolist(), dim=1) + self.categoricals = [ + MaskedCategorical(logits=lg, validate_args=validate_args, mask=m) + for lg, m in zip(split_logits, split_masks) + ] + + batch_shape = logits.size()[:-1] if logits.ndimension() > 1 else torch.Size() + super().__init__(batch_shape=batch_shape, validate_args=validate_args) + + def log_prob(self, action: torch.Tensor) -> torch.Tensor: + prob_stack = torch.stack( + [ + c.log_prob(a) + for a, c in zip(action.view(-1, action.shape[-1]).T, self.categoricals) + ], + dim=-1, + ) + logprob = prob_stack.view(-1, self.map_size, len(self.action_vec)) + return logprob.sum(dim=(1, 2)) + + def entropy(self) -> torch.Tensor: + ent = torch.stack([c.entropy() for c in self.categoricals], dim=-1) + ent = ent.view(-1, self.map_size, len(self.action_vec)) + return ent.sum(dim=(1, 2)) + + def sample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor: + s = torch.stack([c.sample(sample_shape) for c in self.categoricals], dim=-1) + return s.view(-1, self.map_size, len(self.action_vec)) + + @property + def mode(self) -> torch.Tensor: + m = torch.stack([c.mode for c in self.categoricals], dim=-1) + return m.view(-1, self.map_size, len(self.action_vec)) + + @property + def arg_constraints(self) -> Dict[str, constraints.Constraint]: + # Constraints handled by child distributions in dist + return {} + + +class GridnetActorHead(Actor): + def __init__( + self, + map_size: int, + action_vec: NDArray[np.int64], + in_dim: EncoderOutDim, + hidden_sizes: Tuple[int, ...] 
= (32,), + activation: Type[nn.Module] = nn.ReLU, + init_layers_orthogonal: bool = True, + ) -> None: + super().__init__() + self.map_size = map_size + self.action_vec = action_vec + assert isinstance(in_dim, int) + layer_sizes = (in_dim,) + hidden_sizes + (map_size * action_vec.sum(),) + self._fc = mlp( + layer_sizes, + activation, + init_layers_orthogonal=init_layers_orthogonal, + final_layer_gain=0.01, + ) + + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + assert ( + action_masks is not None + ), f"No mask case unhandled in {self.__class__.__name__}" + logits = self._fc(obs) + pi = GridnetDistribution(self.map_size, self.action_vec, logits, action_masks) + return self.pi_forward(pi, actions) + + @property + def action_shape(self) -> Tuple[int, ...]: + return (self.map_size, len(self.action_vec)) diff --git a/rl_algo_impls/shared/actor/gridnet_decoder.py b/rl_algo_impls/shared/actor/gridnet_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..21a83e92a84737ad10b4fd6d20fc3ea5d8f5edb7 --- /dev/null +++ b/rl_algo_impls/shared/actor/gridnet_decoder.py @@ -0,0 +1,80 @@ +from typing import Optional, Tuple, Type + +import numpy as np +import torch +import torch.nn as nn +from numpy.typing import NDArray + +from rl_algo_impls.shared.actor import Actor, PiForward +from rl_algo_impls.shared.actor.categorical import MaskedCategorical +from rl_algo_impls.shared.actor.gridnet import GridnetDistribution +from rl_algo_impls.shared.encoder import EncoderOutDim +from rl_algo_impls.shared.module.module import layer_init + + +class Transpose(nn.Module): + def __init__(self, permutation: Tuple[int, ...]) -> None: + super().__init__() + self.permutation = permutation + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.permute(self.permutation) + + +class GridnetDecoder(Actor): + def __init__( + self, + map_size: int, + action_vec: 
NDArray[np.int64], + in_dim: EncoderOutDim, + activation: Type[nn.Module] = nn.ReLU, + init_layers_orthogonal: bool = True, + ) -> None: + super().__init__() + self.map_size = map_size + self.action_vec = action_vec + assert isinstance(in_dim, tuple) + self.deconv = nn.Sequential( + layer_init( + nn.ConvTranspose2d( + in_dim[0], 128, 3, stride=2, padding=1, output_padding=1 + ), + init_layers_orthogonal=init_layers_orthogonal, + ), + activation(), + layer_init( + nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1), + init_layers_orthogonal=init_layers_orthogonal, + ), + activation(), + layer_init( + nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1), + init_layers_orthogonal=init_layers_orthogonal, + ), + activation(), + layer_init( + nn.ConvTranspose2d( + 32, action_vec.sum(), 3, stride=2, padding=1, output_padding=1 + ), + init_layers_orthogonal=init_layers_orthogonal, + std=0.01, + ), + Transpose((0, 2, 3, 1)), + ) + + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + assert ( + action_masks is not None + ), f"No mask case unhandled in {self.__class__.__name__}" + logits = self.deconv(obs) + pi = GridnetDistribution(self.map_size, self.action_vec, logits, action_masks) + return self.pi_forward(pi, actions) + + @property + def action_shape(self) -> Tuple[int, ...]: + return (self.map_size, len(self.action_vec)) diff --git a/rl_algo_impls/shared/actor/make_actor.py b/rl_algo_impls/shared/actor/make_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..831e1f800dd46cb2db056bfb686aa10820c82666 --- /dev/null +++ b/rl_algo_impls/shared/actor/make_actor.py @@ -0,0 +1,95 @@ +from typing import Tuple, Type + +import gym +import torch.nn as nn +from gym.spaces import Box, Discrete, MultiDiscrete + +from rl_algo_impls.shared.actor.actor import Actor +from rl_algo_impls.shared.actor.categorical import 
CategoricalActorHead +from rl_algo_impls.shared.actor.gaussian import GaussianActorHead +from rl_algo_impls.shared.actor.gridnet import GridnetActorHead +from rl_algo_impls.shared.actor.gridnet_decoder import GridnetDecoder +from rl_algo_impls.shared.actor.multi_discrete import MultiDiscreteActorHead +from rl_algo_impls.shared.actor.state_dependent_noise import ( + StateDependentNoiseActorHead, +) +from rl_algo_impls.shared.encoder import EncoderOutDim + + +def actor_head( + action_space: gym.Space, + in_dim: EncoderOutDim, + hidden_sizes: Tuple[int, ...], + init_layers_orthogonal: bool, + activation: Type[nn.Module], + log_std_init: float = -0.5, + use_sde: bool = False, + full_std: bool = True, + squash_output: bool = False, + actor_head_style: str = "single", +) -> Actor: + assert not use_sde or isinstance( + action_space, Box + ), "use_sde only valid if Box action_space" + assert not squash_output or use_sde, "squash_output only valid if use_sde" + if isinstance(action_space, Discrete): + assert isinstance(in_dim, int) + return CategoricalActorHead( + action_space.n, # type: ignore + in_dim=in_dim, + hidden_sizes=hidden_sizes, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + ) + elif isinstance(action_space, Box): + assert isinstance(in_dim, int) + if use_sde: + return StateDependentNoiseActorHead( + action_space.shape[0], # type: ignore + in_dim=in_dim, + hidden_sizes=hidden_sizes, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + log_std_init=log_std_init, + full_std=full_std, + squash_output=squash_output, + ) + else: + return GaussianActorHead( + action_space.shape[0], # type: ignore + in_dim=in_dim, + hidden_sizes=hidden_sizes, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + log_std_init=log_std_init, + ) + elif isinstance(action_space, MultiDiscrete): + if actor_head_style == "single": + return MultiDiscreteActorHead( + action_space.nvec, # type: ignore + in_dim=in_dim, + 
hidden_sizes=hidden_sizes, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + ) + elif actor_head_style == "gridnet": + return GridnetActorHead( + action_space.nvec[0], # type: ignore + action_space.nvec[1:], # type: ignore + in_dim=in_dim, + hidden_sizes=hidden_sizes, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + ) + elif actor_head_style == "gridnet_decoder": + return GridnetDecoder( + action_space.nvec[0], # type: ignore + action_space.nvec[1:], # type: ignore + in_dim=in_dim, + activation=activation, + init_layers_orthogonal=init_layers_orthogonal, + ) + else: + raise ValueError(f"Doesn't support actor_head_style {actor_head_style}") + else: + raise ValueError(f"Unsupported action space: {action_space}") diff --git a/rl_algo_impls/shared/actor/multi_discrete.py b/rl_algo_impls/shared/actor/multi_discrete.py new file mode 100644 index 0000000000000000000000000000000000000000..26a60d6c90f2e0ac244f57432ec426493ebeefdf --- /dev/null +++ b/rl_algo_impls/shared/actor/multi_discrete.py @@ -0,0 +1,101 @@ +from typing import Dict, Optional, Tuple, Type + +import numpy as np +import torch +import torch.nn as nn +from numpy.typing import NDArray +from torch.distributions import Distribution, constraints + +from rl_algo_impls.shared.actor.actor import Actor, PiForward +from rl_algo_impls.shared.actor.categorical import MaskedCategorical +from rl_algo_impls.shared.encoder import EncoderOutDim +from rl_algo_impls.shared.module.module import mlp + + +class MultiCategorical(Distribution): + def __init__( + self, + nvec: NDArray[np.int64], + probs=None, + logits=None, + validate_args=None, + masks: Optional[torch.Tensor] = None, + ): + # Either probs or logits should be set + assert (probs is None) != (logits is None) + masks_split = ( + torch.split(masks, nvec.tolist(), dim=1) + if masks is not None + else [None] * len(nvec) + ) + if probs: + self.dists = [ + MaskedCategorical(probs=p, validate_args=validate_args, mask=m) 
+ for p, m in zip(torch.split(probs, nvec.tolist(), dim=1), masks_split) + ] + param = probs + else: + assert logits is not None + self.dists = [ + MaskedCategorical(logits=lg, validate_args=validate_args, mask=m) + for lg, m in zip(torch.split(logits, nvec.tolist(), dim=1), masks_split) + ] + param = logits + batch_shape = param.size()[:-1] if param.ndimension() > 1 else torch.Size() + super().__init__(batch_shape=batch_shape, validate_args=validate_args) + + def log_prob(self, action: torch.Tensor) -> torch.Tensor: + prob_stack = torch.stack( + [c.log_prob(a) for a, c in zip(action.T, self.dists)], dim=-1 + ) + return prob_stack.sum(dim=-1) + + def entropy(self) -> torch.Tensor: + return torch.stack([c.entropy() for c in self.dists], dim=-1).sum(dim=-1) + + def sample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor: + return torch.stack([c.sample(sample_shape) for c in self.dists], dim=-1) + + @property + def mode(self) -> torch.Tensor: + return torch.stack([c.mode for c in self.dists], dim=-1) + + @property + def arg_constraints(self) -> Dict[str, constraints.Constraint]: + # Constraints handled by child distributions in dist + return {} + + +class MultiDiscreteActorHead(Actor): + def __init__( + self, + nvec: NDArray[np.int64], + in_dim: EncoderOutDim, + hidden_sizes: Tuple[int, ...] 
= (32,), + activation: Type[nn.Module] = nn.ReLU, + init_layers_orthogonal: bool = True, + ) -> None: + super().__init__() + self.nvec = nvec + assert isinstance(in_dim, int) + layer_sizes = (in_dim,) + hidden_sizes + (nvec.sum(),) + self._fc = mlp( + layer_sizes, + activation, + init_layers_orthogonal=init_layers_orthogonal, + final_layer_gain=0.01, + ) + + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + logits = self._fc(obs) + pi = MultiCategorical(self.nvec, logits=logits, masks=action_masks) + return self.pi_forward(pi, actions) + + @property + def action_shape(self) -> Tuple[int, ...]: + return (len(self.nvec),) diff --git a/rl_algo_impls/shared/policy/actor.py b/rl_algo_impls/shared/actor/state_dependent_noise.py similarity index 54% rename from rl_algo_impls/shared/policy/actor.py rename to rl_algo_impls/shared/actor/state_dependent_noise.py index f6ec5c8615bdb2bb24e5a6d37efaec204977cd2a..333c2549d511537e02edb655f74f912cf054b6b2 100644 --- a/rl_algo_impls/shared/policy/actor.py +++ b/rl_algo_impls/shared/actor/state_dependent_noise.py @@ -1,99 +1,13 @@ -import gym +from typing import Optional, Tuple, Type, TypeVar, Union + import torch import torch.nn as nn +from torch.distributions import Distribution, Normal -from abc import ABC, abstractmethod -from gym.spaces import Box, Discrete -from torch.distributions import Categorical, Distribution, Normal -from typing import NamedTuple, Optional, Sequence, Type, TypeVar, Union - +from rl_algo_impls.shared.actor.actor import Actor, PiForward from rl_algo_impls.shared.module.module import mlp -class PiForward(NamedTuple): - pi: Distribution - logp_a: Optional[torch.Tensor] - entropy: Optional[torch.Tensor] - - -class Actor(nn.Module, ABC): - @abstractmethod - def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward: - ... 
- - -class CategoricalActorHead(Actor): - def __init__( - self, - act_dim: int, - hidden_sizes: Sequence[int] = (32,), - activation: Type[nn.Module] = nn.Tanh, - init_layers_orthogonal: bool = True, - ) -> None: - super().__init__() - layer_sizes = tuple(hidden_sizes) + (act_dim,) - self._fc = mlp( - layer_sizes, - activation, - init_layers_orthogonal=init_layers_orthogonal, - final_layer_gain=0.01, - ) - - def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward: - logits = self._fc(obs) - pi = Categorical(logits=logits) - logp_a = None - entropy = None - if a is not None: - logp_a = pi.log_prob(a) - entropy = pi.entropy() - return PiForward(pi, logp_a, entropy) - - -class GaussianDistribution(Normal): - def log_prob(self, a: torch.Tensor) -> torch.Tensor: - return super().log_prob(a).sum(axis=-1) - - def sample(self) -> torch.Tensor: - return self.rsample() - - -class GaussianActorHead(Actor): - def __init__( - self, - act_dim: int, - hidden_sizes: Sequence[int] = (32,), - activation: Type[nn.Module] = nn.Tanh, - init_layers_orthogonal: bool = True, - log_std_init: float = -0.5, - ) -> None: - super().__init__() - layer_sizes = tuple(hidden_sizes) + (act_dim,) - self.mu_net = mlp( - layer_sizes, - activation, - init_layers_orthogonal=init_layers_orthogonal, - final_layer_gain=0.01, - ) - self.log_std = nn.Parameter( - torch.ones(act_dim, dtype=torch.float32) * log_std_init - ) - - def _distribution(self, obs: torch.Tensor) -> Distribution: - mu = self.mu_net(obs) - std = torch.exp(self.log_std) - return GaussianDistribution(mu, std) - - def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward: - pi = self._distribution(obs) - logp_a = None - entropy = None - if a is not None: - logp_a = pi.log_prob(a) - entropy = pi.entropy() - return PiForward(pi, logp_a, entropy) - - class TanhBijector: def __init__(self, epsilon: float = 1e-6) -> None: self.epsilon = epsilon @@ -173,7 +87,8 @@ class 
StateDependentNoiseActorHead(Actor): def __init__( self, act_dim: int, - hidden_sizes: Sequence[int] = (32,), + in_dim: int, + hidden_sizes: Tuple[int, ...] = (32,), activation: Type[nn.Module] = nn.Tanh, init_layers_orthogonal: bool = True, log_std_init: float = -0.5, @@ -183,7 +98,7 @@ class StateDependentNoiseActorHead(Actor): ) -> None: super().__init__() self.act_dim = act_dim - layer_sizes = tuple(hidden_sizes) + (self.act_dim,) + layer_sizes = (in_dim,) + hidden_sizes + (act_dim,) if len(layer_sizes) == 2: self.latent_net = nn.Identity() elif len(layer_sizes) > 2: @@ -193,8 +108,6 @@ class StateDependentNoiseActorHead(Actor): output_activation=activation, init_layers_orthogonal=init_layers_orthogonal, ) - else: - raise ValueError("hidden_sizes must be of at least length 1") self.mu_net = mlp( layer_sizes[-2:], activation, @@ -202,7 +115,7 @@ class StateDependentNoiseActorHead(Actor): final_layer_gain=0.01, ) self.full_std = full_std - std_dim = (hidden_sizes[-1], act_dim if self.full_std else 1) + std_dim = (layer_sizes[-2], act_dim if self.full_std else 1) self.log_std = nn.Parameter( torch.ones(std_dim, dtype=torch.float32) * log_std_init ) @@ -249,14 +162,17 @@ class StateDependentNoiseActorHead(Actor): ones = ones.to(self.device) return ones * std - def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward: + def forward( + self, + obs: torch.Tensor, + actions: Optional[torch.Tensor] = None, + action_masks: Optional[torch.Tensor] = None, + ) -> PiForward: + assert ( + not action_masks + ), f"{self.__class__.__name__} does not support action_masks" pi = self._distribution(obs) - logp_a = None - entropy = None - if a is not None: - logp_a = pi.log_prob(a) - entropy = -logp_a if self.bijector else sum_independent_dims(pi.entropy()) - return PiForward(pi, logp_a, entropy) + return self.pi_forward(pi, actions) def sample_weights(self, batch_size: int = 1) -> None: std = self._get_std() @@ -265,46 +181,20 @@ class 
StateDependentNoiseActorHead(Actor): self.exploration_mat = weights_dist.rsample() self.exploration_matrices = weights_dist.rsample(torch.Size((batch_size,))) + @property + def action_shape(self) -> Tuple[int, ...]: + return (self.act_dim,) -def actor_head( - action_space: gym.Space, - hidden_sizes: Sequence[int], - init_layers_orthogonal: bool, - activation: Type[nn.Module], - log_std_init: float = -0.5, - use_sde: bool = False, - full_std: bool = True, - squash_output: bool = False, -) -> Actor: - assert not use_sde or isinstance( - action_space, Box - ), "use_sde only valid if Box action_space" - assert not squash_output or use_sde, "squash_output only valid if use_sde" - if isinstance(action_space, Discrete): - return CategoricalActorHead( - action_space.n, - hidden_sizes=hidden_sizes, - activation=activation, - init_layers_orthogonal=init_layers_orthogonal, - ) - elif isinstance(action_space, Box): - if use_sde: - return StateDependentNoiseActorHead( - action_space.shape[0], - hidden_sizes=hidden_sizes, - activation=activation, - init_layers_orthogonal=init_layers_orthogonal, - log_std_init=log_std_init, - full_std=full_std, - squash_output=squash_output, - ) - else: - return GaussianActorHead( - action_space.shape[0], - hidden_sizes=hidden_sizes, - activation=activation, - init_layers_orthogonal=init_layers_orthogonal, - log_std_init=log_std_init, + def pi_forward( + self, distribution: Distribution, actions: Optional[torch.Tensor] = None + ) -> PiForward: + logp_a = None + entropy = None + if actions is not None: + logp_a = distribution.log_prob(actions) + entropy = ( + -logp_a + if self.bijector + else sum_independent_dims(distribution.entropy()) ) - else: - raise ValueError(f"Unsupported action space: {action_space}") + return PiForward(distribution, logp_a, entropy) diff --git a/rl_algo_impls/shared/callbacks/eval_callback.py b/rl_algo_impls/shared/callbacks/eval_callback.py index 
f32b2d1c6dede8bf3c35e6086b26ca636d84958f..04b8ee1b24667c3767e57820ac0dfd9d4b01b1ce 100644 --- a/rl_algo_impls/shared/callbacks/eval_callback.py +++ b/rl_algo_impls/shared/callbacks/eval_callback.py @@ -1,14 +1,15 @@ import itertools -import numpy as np import os - from time import perf_counter +from typing import Dict, List, Optional, Union + +import numpy as np from torch.utils.tensorboard.writer import SummaryWriter -from typing import List, Optional, Union from rl_algo_impls.shared.callbacks.callback import Callback from rl_algo_impls.shared.policy.policy import Policy from rl_algo_impls.shared.stats import Episode, EpisodeAccumulator, EpisodesStats +from rl_algo_impls.wrappers.action_mask_wrapper import find_action_masker from rl_algo_impls.wrappers.vec_episode_recorder import VecEpisodeRecorder from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv @@ -20,6 +21,7 @@ class EvaluateAccumulator(EpisodeAccumulator): goal_episodes: int, print_returns: bool = True, ignore_first_episode: bool = False, + additional_keys_to_log: Optional[List[str]] = None, ): super().__init__(num_envs) self.completed_episodes_by_env_idx = [[] for _ in range(num_envs)] @@ -36,8 +38,11 @@ class EvaluateAccumulator(EpisodeAccumulator): self.should_record_done = should_record_done else: self.should_record_done = lambda idx: True + self.additional_keys_to_log = additional_keys_to_log - def on_done(self, ep_idx: int, episode: Episode) -> None: + def on_done(self, ep_idx: int, episode: Episode, info: Dict) -> None: + if self.additional_keys_to_log: + episode.info = {k: info[k] for k in self.additional_keys_to_log} if ( self.should_record_done(ep_idx) and len(self.completed_episodes_by_env_idx[ep_idx]) @@ -74,19 +79,29 @@ def evaluate( deterministic: bool = True, print_returns: bool = True, ignore_first_episode: bool = False, + additional_keys_to_log: Optional[List[str]] = None, ) -> EpisodesStats: policy.sync_normalization(env) policy.eval() episodes = EvaluateAccumulator( - 
env.num_envs, n_episodes, print_returns, ignore_first_episode + env.num_envs, + n_episodes, + print_returns, + ignore_first_episode, + additional_keys_to_log=additional_keys_to_log, ) obs = env.reset() + action_masker = find_action_masker(env) while not episodes.is_done(): - act = policy.act(obs, deterministic=deterministic) - obs, rew, done, _ = env.step(act) - episodes.step(rew, done) + act = policy.act( + obs, + deterministic=deterministic, + action_masks=action_masker.action_masks() if action_masker else None, + ) + obs, rew, done, info = env.step(act) + episodes.step(rew, done, info) if render: env.render() stats = EpisodesStats(episodes.episodes) @@ -111,6 +126,7 @@ class EvalCallback(Callback): best_video_dir: Optional[str] = None, max_video_length: int = 3600, ignore_first_episode: bool = False, + additional_keys_to_log: Optional[List[str]] = None, ) -> None: super().__init__() self.policy = policy @@ -133,8 +149,8 @@ class EvalCallback(Callback): os.makedirs(best_video_dir, exist_ok=True) self.max_video_length = max_video_length self.best_video_base_path = None - self.ignore_first_episode = ignore_first_episode + self.additional_keys_to_log = additional_keys_to_log def on_step(self, timesteps_elapsed: int = 1) -> bool: super().on_step(timesteps_elapsed) @@ -153,6 +169,7 @@ class EvalCallback(Callback): deterministic=self.deterministic, print_returns=print_returns or False, ignore_first_episode=self.ignore_first_episode, + additional_keys_to_log=self.additional_keys_to_log, ) end_time = perf_counter() self.tb_writer.add_scalar( diff --git a/rl_algo_impls/shared/encoder/__init__.py b/rl_algo_impls/shared/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf5a2c2c0bb28caee966aa4c9a100be5307396c --- /dev/null +++ b/rl_algo_impls/shared/encoder/__init__.py @@ -0,0 +1,2 @@ +from rl_algo_impls.shared.encoder.cnn import EncoderOutDim +from rl_algo_impls.shared.encoder.encoder import Encoder diff --git 
a/rl_algo_impls/shared/encoder/cnn.py b/rl_algo_impls/shared/encoder/cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..b4e324bdedc4d4ce8d82231e6b2b218cdca9b5ba --- /dev/null +++ b/rl_algo_impls/shared/encoder/cnn.py @@ -0,0 +1,72 @@ +from abc import ABC, abstractmethod +from typing import Optional, Tuple, Type, Union + +import gym +import numpy as np +import torch +import torch.nn as nn + +from rl_algo_impls.shared.module.module import layer_init + +EncoderOutDim = Union[int, Tuple[int, ...]] + + +class CnnEncoder(nn.Module, ABC): + @abstractmethod + def __init__( + self, + obs_space: gym.Space, + **kwargs, + ) -> None: + super().__init__() + self.range_size = np.max(obs_space.high) - np.min(obs_space.low) # type: ignore + + def preprocess(self, obs: torch.Tensor) -> torch.Tensor: + if len(obs.shape) == 3: + obs = obs.unsqueeze(0) + return obs.float() / self.range_size + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + return self.preprocess(obs) + + @property + @abstractmethod + def out_dim(self) -> EncoderOutDim: + ... 
+ + +class FlattenedCnnEncoder(CnnEncoder): + def __init__( + self, + obs_space: gym.Space, + activation: Type[nn.Module], + linear_init_layers_orthogonal: bool, + cnn_flatten_dim: int, + cnn: nn.Module, + **kwargs, + ) -> None: + super().__init__(obs_space, **kwargs) + self.cnn = cnn + self.flattened_dim = cnn_flatten_dim + with torch.no_grad(): + cnn_out = torch.flatten( + cnn(self.preprocess(torch.as_tensor(obs_space.sample()))), start_dim=1 + ) + self.fc = nn.Sequential( + nn.Flatten(), + layer_init( + nn.Linear(cnn_out.shape[1], cnn_flatten_dim), + linear_init_layers_orthogonal, + ), + activation(), + ) + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + x = super().forward(obs) + x = self.cnn(x) + x = self.fc(x) + return x + + @property + def out_dim(self) -> EncoderOutDim: + return self.flattened_dim diff --git a/rl_algo_impls/shared/encoder/encoder.py b/rl_algo_impls/shared/encoder/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..39dd5ba192f593c79ebe470ead65fae0ed83dd47 --- /dev/null +++ b/rl_algo_impls/shared/encoder/encoder.py @@ -0,0 +1,73 @@ +from typing import Dict, Optional, Sequence, Type + +import gym +import torch +import torch.nn as nn +import torch.nn.functional as F +from gym.spaces import Box, Discrete +from stable_baselines3.common.preprocessing import get_flattened_obs_dim + +from rl_algo_impls.shared.encoder.cnn import CnnEncoder +from rl_algo_impls.shared.encoder.gridnet_encoder import GridnetEncoder +from rl_algo_impls.shared.encoder.impala_cnn import ImpalaCnn +from rl_algo_impls.shared.encoder.microrts_cnn import MicrortsCnn +from rl_algo_impls.shared.encoder.nature_cnn import NatureCnn +from rl_algo_impls.shared.module.module import layer_init + +CNN_EXTRACTORS_BY_STYLE: Dict[str, Type[CnnEncoder]] = { + "nature": NatureCnn, + "impala": ImpalaCnn, + "microrts": MicrortsCnn, + "gridnet_encoder": GridnetEncoder, +} + + +class Encoder(nn.Module): + def __init__( + self, + obs_space: gym.Space, + 
activation: Type[nn.Module], + init_layers_orthogonal: bool = False, + cnn_flatten_dim: int = 512, + cnn_style: str = "nature", + cnn_layers_init_orthogonal: Optional[bool] = None, + impala_channels: Sequence[int] = (16, 32, 32), + ) -> None: + super().__init__() + if isinstance(obs_space, Box): + # Conv2D: (channels, height, width) + if len(obs_space.shape) == 3: # type: ignore + self.preprocess = None + cnn = CNN_EXTRACTORS_BY_STYLE[cnn_style]( + obs_space, + activation=activation, + cnn_init_layers_orthogonal=cnn_layers_init_orthogonal, + linear_init_layers_orthogonal=init_layers_orthogonal, + cnn_flatten_dim=cnn_flatten_dim, + impala_channels=impala_channels, + ) + self.feature_extractor = cnn + self.out_dim = cnn.out_dim + elif len(obs_space.shape) == 1: # type: ignore + + def preprocess(obs: torch.Tensor) -> torch.Tensor: + if len(obs.shape) == 1: + obs = obs.unsqueeze(0) + return obs.float() + + self.preprocess = preprocess + self.feature_extractor = nn.Flatten() + self.out_dim = get_flattened_obs_dim(obs_space) + else: + raise ValueError(f"Unsupported observation space: {obs_space}") + elif isinstance(obs_space, Discrete): + self.preprocess = lambda x: F.one_hot(x, obs_space.n).float() + self.feature_extractor = nn.Flatten() + self.out_dim = obs_space.n # type: ignore + else: + raise NotImplementedError + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + if self.preprocess: + obs = self.preprocess(obs) + return self.feature_extractor(obs) diff --git a/rl_algo_impls/shared/encoder/gridnet_encoder.py b/rl_algo_impls/shared/encoder/gridnet_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5930894e98992c90d78768f68844ba9abe9644b8 --- /dev/null +++ b/rl_algo_impls/shared/encoder/gridnet_encoder.py @@ -0,0 +1,64 @@ +from typing import Optional, Tuple, Type, Union + +import gym +import torch +import torch.nn as nn + +from rl_algo_impls.shared.encoder.cnn import CnnEncoder, EncoderOutDim +from rl_algo_impls.shared.module.module 
import layer_init + + +class GridnetEncoder(CnnEncoder): + """ + Encoder for encoder-decoder for Gym-MicroRTS + """ + + def __init__( + self, + obs_space: gym.Space, + activation: Type[nn.Module] = nn.ReLU, + cnn_init_layers_orthogonal: Optional[bool] = None, + **kwargs + ) -> None: + if cnn_init_layers_orthogonal is None: + cnn_init_layers_orthogonal = True + super().__init__(obs_space, **kwargs) + in_channels = obs_space.shape[0] # type: ignore + self.encoder = nn.Sequential( + layer_init( + nn.Conv2d(in_channels, 32, kernel_size=3, padding=1), + cnn_init_layers_orthogonal, + ), + nn.MaxPool2d(3, stride=2, padding=1), + activation(), + layer_init( + nn.Conv2d(32, 64, kernel_size=3, padding=1), + cnn_init_layers_orthogonal, + ), + nn.MaxPool2d(3, stride=2, padding=1), + activation(), + layer_init( + nn.Conv2d(64, 128, kernel_size=3, padding=1), + cnn_init_layers_orthogonal, + ), + nn.MaxPool2d(3, stride=2, padding=1), + activation(), + layer_init( + nn.Conv2d(128, 256, kernel_size=3, padding=1), + cnn_init_layers_orthogonal, + ), + nn.MaxPool2d(3, stride=2, padding=1), + activation(), + ) + with torch.no_grad(): + encoder_out = self.encoder( + self.preprocess(torch.as_tensor(obs_space.sample())) # type: ignore + ) + self._out_dim = encoder_out.shape[1:] + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + return self.encoder(super().forward(obs)) + + @property + def out_dim(self) -> EncoderOutDim: + return self._out_dim diff --git a/rl_algo_impls/shared/encoder/impala_cnn.py b/rl_algo_impls/shared/encoder/impala_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..d14a8a51776792eea647ccc727f77fa9c0991e2f --- /dev/null +++ b/rl_algo_impls/shared/encoder/impala_cnn.py @@ -0,0 +1,92 @@ +from typing import Optional, Sequence, Type + +import gym +import torch +import torch.nn as nn + +from rl_algo_impls.shared.encoder.cnn import FlattenedCnnEncoder +from rl_algo_impls.shared.module.module import layer_init + + +class 
ResidualBlock(nn.Module): + def __init__( + self, + channels: int, + activation: Type[nn.Module] = nn.ReLU, + init_layers_orthogonal: bool = False, + ) -> None: + super().__init__() + self.residual = nn.Sequential( + activation(), + layer_init( + nn.Conv2d(channels, channels, 3, padding=1), init_layers_orthogonal + ), + activation(), + layer_init( + nn.Conv2d(channels, channels, 3, padding=1), init_layers_orthogonal + ), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.residual(x) + + +class ConvSequence(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + activation: Type[nn.Module] = nn.ReLU, + init_layers_orthogonal: bool = False, + ) -> None: + super().__init__() + self.seq = nn.Sequential( + layer_init( + nn.Conv2d(in_channels, out_channels, 3, padding=1), + init_layers_orthogonal, + ), + nn.MaxPool2d(3, stride=2, padding=1), + ResidualBlock(out_channels, activation, init_layers_orthogonal), + ResidualBlock(out_channels, activation, init_layers_orthogonal), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.seq(x) + + +class ImpalaCnn(FlattenedCnnEncoder): + """ + IMPALA-style CNN architecture + """ + + def __init__( + self, + obs_space: gym.Space, + activation: Type[nn.Module], + cnn_init_layers_orthogonal: Optional[bool], + linear_init_layers_orthogonal: bool, + cnn_flatten_dim: int, + impala_channels: Sequence[int] = (16, 32, 32), + **kwargs, + ) -> None: + if cnn_init_layers_orthogonal is None: + cnn_init_layers_orthogonal = False + in_channels = obs_space.shape[0] # type: ignore + sequences = [] + for out_channels in impala_channels: + sequences.append( + ConvSequence( + in_channels, out_channels, activation, cnn_init_layers_orthogonal + ) + ) + in_channels = out_channels + sequences.append(activation()) + cnn = nn.Sequential(*sequences) + super().__init__( + obs_space, + activation, + linear_init_layers_orthogonal, + cnn_flatten_dim, + cnn, + **kwargs, + ) diff --git 
a/rl_algo_impls/shared/encoder/microrts_cnn.py b/rl_algo_impls/shared/encoder/microrts_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..29f18af52350308923bf9c51db1d25e4cbb49601 --- /dev/null +++ b/rl_algo_impls/shared/encoder/microrts_cnn.py @@ -0,0 +1,45 @@ +from typing import Optional, Type + +import gym +import torch +import torch.nn as nn + +from rl_algo_impls.shared.encoder.cnn import FlattenedCnnEncoder +from rl_algo_impls.shared.module.module import layer_init + + +class MicrortsCnn(FlattenedCnnEncoder): + """ + Base CNN architecture for Gym-MicroRTS + """ + + def __init__( + self, + obs_space: gym.Space, + activation: Type[nn.Module], + cnn_init_layers_orthogonal: Optional[bool], + linear_init_layers_orthogonal: bool, + cnn_flatten_dim: int, + **kwargs, + ) -> None: + if cnn_init_layers_orthogonal is None: + cnn_init_layers_orthogonal = True + in_channels = obs_space.shape[0] # type: ignore + cnn = nn.Sequential( + layer_init( + nn.Conv2d(in_channels, 16, kernel_size=3, stride=2), + cnn_init_layers_orthogonal, + ), + activation(), + layer_init(nn.Conv2d(16, 32, kernel_size=2), cnn_init_layers_orthogonal), + activation(), + nn.Flatten(), + ) + super().__init__( + obs_space, + activation, + linear_init_layers_orthogonal, + cnn_flatten_dim, + cnn, + **kwargs, + ) diff --git a/rl_algo_impls/shared/encoder/nature_cnn.py b/rl_algo_impls/shared/encoder/nature_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..21a77f9cdb0a4b1029b10a150ec81990bbfaff3a --- /dev/null +++ b/rl_algo_impls/shared/encoder/nature_cnn.py @@ -0,0 +1,53 @@ +from typing import Optional, Type + +import gym +import torch.nn as nn + +from rl_algo_impls.shared.encoder.cnn import FlattenedCnnEncoder +from rl_algo_impls.shared.module.module import layer_init + + +class NatureCnn(FlattenedCnnEncoder): + """ + CNN from DQN Nature paper: Mnih, Volodymyr, et al. + "Human-level control through deep reinforcement learning." 
+ Nature 518.7540 (2015): 529-533. + """ + + def __init__( + self, + obs_space: gym.Space, + activation: Type[nn.Module], + cnn_init_layers_orthogonal: Optional[bool], + linear_init_layers_orthogonal: bool, + cnn_flatten_dim: int, + **kwargs, + ) -> None: + if cnn_init_layers_orthogonal is None: + cnn_init_layers_orthogonal = True + in_channels = obs_space.shape[0] # type: ignore + cnn = nn.Sequential( + layer_init( + nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), + cnn_init_layers_orthogonal, + ), + activation(), + layer_init( + nn.Conv2d(32, 64, kernel_size=4, stride=2), + cnn_init_layers_orthogonal, + ), + activation(), + layer_init( + nn.Conv2d(64, 64, kernel_size=3, stride=1), + cnn_init_layers_orthogonal, + ), + activation(), + ) + super().__init__( + obs_space, + activation, + linear_init_layers_orthogonal, + cnn_flatten_dim, + cnn, + **kwargs, + ) diff --git a/rl_algo_impls/shared/gae.py b/rl_algo_impls/shared/gae.py index f4d02ded6524d99be6eacc79984e5349fa97af55..7b5cbacc2d82e7c262c91b52afa4a6929f4c439e 100644 --- a/rl_algo_impls/shared/gae.py +++ b/rl_algo_impls/shared/gae.py @@ -5,6 +5,7 @@ from typing import NamedTuple, Sequence from rl_algo_impls.shared.policy.on_policy import OnPolicy from rl_algo_impls.shared.trajectory import Trajectory +from rl_algo_impls.wrappers.vectorable_wrapper import VecEnvObs class RtgAdvantage(NamedTuple): @@ -19,7 +20,7 @@ def discounted_cumsum(x: np.ndarray, gamma: float) -> np.ndarray: return dc -def compute_advantage( +def compute_advantage_from_trajectories( trajectories: Sequence[Trajectory], policy: OnPolicy, gamma: float, @@ -40,7 +41,7 @@ def compute_advantage( ) -def compute_rtg_and_advantage( +def compute_rtg_and_advantage_from_trajectories( trajectories: Sequence[Trajectory], policy: OnPolicy, gamma: float, @@ -65,3 +66,29 @@ def compute_rtg_and_advantage( ), torch.as_tensor(np.concatenate(advantages), dtype=torch.float32, device=device), ) + + +def compute_advantages( + rewards: np.ndarray, + values: 
np.ndarray, + episode_starts: np.ndarray, + next_episode_starts: np.ndarray, + next_obs: VecEnvObs, + policy: OnPolicy, + gamma: float, + gae_lambda: float, +) -> np.ndarray: + advantages = np.zeros_like(rewards) + last_gae_lam = 0 + n_steps = advantages.shape[0] + for t in reversed(range(n_steps)): + if t == n_steps - 1: + next_nonterminal = 1.0 - next_episode_starts + next_value = policy.value(next_obs) + else: + next_nonterminal = 1.0 - episode_starts[t + 1] + next_value = values[t + 1] + delta = rewards[t] + gamma * next_value * next_nonterminal - values[t] + last_gae_lam = delta + gamma * gae_lambda * next_nonterminal * last_gae_lam + advantages[t] = last_gae_lam + return advantages diff --git a/rl_algo_impls/shared/module/feature_extractor.py b/rl_algo_impls/shared/module/feature_extractor.py deleted file mode 100644 index 16ccaeefae90377ca93aa67285e28d5bb136977e..0000000000000000000000000000000000000000 --- a/rl_algo_impls/shared/module/feature_extractor.py +++ /dev/null @@ -1,215 +0,0 @@ -import gym -import torch -import torch.nn as nn -import torch.nn.functional as F - -from abc import ABC, abstractmethod -from gym.spaces import Box, Discrete -from stable_baselines3.common.preprocessing import get_flattened_obs_dim -from typing import Dict, Optional, Sequence, Type - -from rl_algo_impls.shared.module.module import layer_init - - -class CnnFeatureExtractor(nn.Module, ABC): - @abstractmethod - def __init__( - self, - in_channels: int, - activation: Type[nn.Module] = nn.ReLU, - init_layers_orthogonal: Optional[bool] = None, - **kwargs, - ) -> None: - super().__init__() - - -class NatureCnn(CnnFeatureExtractor): - """ - CNN from DQN Nature paper: Mnih, Volodymyr, et al. - "Human-level control through deep reinforcement learning." - Nature 518.7540 (2015): 529-533. 
- """ - - def __init__( - self, - in_channels: int, - activation: Type[nn.Module] = nn.ReLU, - init_layers_orthogonal: Optional[bool] = None, - **kwargs, - ) -> None: - if init_layers_orthogonal is None: - init_layers_orthogonal = True - super().__init__(in_channels, activation, init_layers_orthogonal) - self.cnn = nn.Sequential( - layer_init( - nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), - init_layers_orthogonal, - ), - activation(), - layer_init( - nn.Conv2d(32, 64, kernel_size=4, stride=2), - init_layers_orthogonal, - ), - activation(), - layer_init( - nn.Conv2d(64, 64, kernel_size=3, stride=1), - init_layers_orthogonal, - ), - activation(), - nn.Flatten(), - ) - - def forward(self, obs: torch.Tensor) -> torch.Tensor: - return self.cnn(obs) - - -class ResidualBlock(nn.Module): - def __init__( - self, - channels: int, - activation: Type[nn.Module] = nn.ReLU, - init_layers_orthogonal: bool = False, - ) -> None: - super().__init__() - self.residual = nn.Sequential( - activation(), - layer_init( - nn.Conv2d(channels, channels, 3, padding=1), init_layers_orthogonal - ), - activation(), - layer_init( - nn.Conv2d(channels, channels, 3, padding=1), init_layers_orthogonal - ), - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return x + self.residual(x) - - -class ConvSequence(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - activation: Type[nn.Module] = nn.ReLU, - init_layers_orthogonal: bool = False, - ) -> None: - super().__init__() - self.seq = nn.Sequential( - layer_init( - nn.Conv2d(in_channels, out_channels, 3, padding=1), - init_layers_orthogonal, - ), - nn.MaxPool2d(3, stride=2, padding=1), - ResidualBlock(out_channels, activation, init_layers_orthogonal), - ResidualBlock(out_channels, activation, init_layers_orthogonal), - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.seq(x) - - -class ImpalaCnn(CnnFeatureExtractor): - """ - IMPALA-style CNN architecture - """ - - def __init__( - self, 
- in_channels: int, - activation: Type[nn.Module] = nn.ReLU, - init_layers_orthogonal: Optional[bool] = None, - impala_channels: Sequence[int] = (16, 32, 32), - **kwargs, - ) -> None: - if init_layers_orthogonal is None: - init_layers_orthogonal = False - super().__init__(in_channels, activation, init_layers_orthogonal) - sequences = [] - for out_channels in impala_channels: - sequences.append( - ConvSequence( - in_channels, out_channels, activation, init_layers_orthogonal - ) - ) - in_channels = out_channels - sequences.extend( - [ - activation(), - nn.Flatten(), - ] - ) - self.seq = nn.Sequential(*sequences) - - def forward(self, obs: torch.Tensor) -> torch.Tensor: - return self.seq(obs) - - -CNN_EXTRACTORS_BY_STYLE: Dict[str, Type[CnnFeatureExtractor]] = { - "nature": NatureCnn, - "impala": ImpalaCnn, -} - - -class FeatureExtractor(nn.Module): - def __init__( - self, - obs_space: gym.Space, - activation: Type[nn.Module], - init_layers_orthogonal: bool = False, - cnn_feature_dim: int = 512, - cnn_style: str = "nature", - cnn_layers_init_orthogonal: Optional[bool] = None, - impala_channels: Sequence[int] = (16, 32, 32), - ) -> None: - super().__init__() - if isinstance(obs_space, Box): - # Conv2D: (channels, height, width) - if len(obs_space.shape) == 3: - cnn = CNN_EXTRACTORS_BY_STYLE[cnn_style]( - obs_space.shape[0], - activation, - init_layers_orthogonal=cnn_layers_init_orthogonal, - impala_channels=impala_channels, - ) - - def preprocess(obs: torch.Tensor) -> torch.Tensor: - if len(obs.shape) == 3: - obs = obs.unsqueeze(0) - return obs.float() / 255.0 - - with torch.no_grad(): - cnn_out = cnn(preprocess(torch.as_tensor(obs_space.sample()))) - self.preprocess = preprocess - self.feature_extractor = nn.Sequential( - cnn, - layer_init( - nn.Linear(cnn_out.shape[1], cnn_feature_dim), - init_layers_orthogonal, - ), - activation(), - ) - self.out_dim = cnn_feature_dim - elif len(obs_space.shape) == 1: - - def preprocess(obs: torch.Tensor) -> torch.Tensor: - if 
len(obs.shape) == 1: - obs = obs.unsqueeze(0) - return obs.float() - - self.preprocess = preprocess - self.feature_extractor = nn.Flatten() - self.out_dim = get_flattened_obs_dim(obs_space) - else: - raise ValueError(f"Unsupported observation space: {obs_space}") - elif isinstance(obs_space, Discrete): - self.preprocess = lambda x: F.one_hot(x, obs_space.n).float() - self.feature_extractor = nn.Flatten() - self.out_dim = obs_space.n - else: - raise NotImplementedError - - def forward(self, obs: torch.Tensor) -> torch.Tensor: - if self.preprocess: - obs = self.preprocess(obs) - return self.feature_extractor(obs) diff --git a/rl_algo_impls/shared/module/module.py b/rl_algo_impls/shared/module/module.py index c579fb2a3808de47ec8d4c5233eea947b5cf0d28..2e1e9c8fff2a5539faaf2578f8d3de4ea05e9dc2 100644 --- a/rl_algo_impls/shared/module/module.py +++ b/rl_algo_impls/shared/module/module.py @@ -1,8 +1,8 @@ +from typing import Sequence, Type + import numpy as np import torch.nn as nn -from typing import Sequence, Type - def mlp( layer_sizes: Sequence[int], @@ -10,12 +10,15 @@ def mlp( output_activation: Type[nn.Module] = nn.Identity, init_layers_orthogonal: bool = False, final_layer_gain: float = np.sqrt(2), + hidden_layer_gain: float = np.sqrt(2), ) -> nn.Module: layers = [] for i in range(len(layer_sizes) - 2): layers.append( layer_init( - nn.Linear(layer_sizes[i], layer_sizes[i + 1]), init_layers_orthogonal + nn.Linear(layer_sizes[i], layer_sizes[i + 1]), + init_layers_orthogonal, + std=hidden_layer_gain, ) ) layers.append(activation()) diff --git a/rl_algo_impls/shared/policy/critic.py b/rl_algo_impls/shared/policy/critic.py index 8fceb1c3d31d7133fae14878f03662a504eccda7..ffb0752eeeab6de71e24a8ea5f716ae8921fa543 100644 --- a/rl_algo_impls/shared/policy/critic.py +++ b/rl_algo_impls/shared/policy/critic.py @@ -1,27 +1,39 @@ -import gym +from typing import Sequence, Type + +import numpy as np import torch import torch.nn as nn -from typing import Sequence, Type - +from 
rl_algo_impls.shared.encoder import EncoderOutDim from rl_algo_impls.shared.module.module import mlp class CriticHead(nn.Module): def __init__( self, - hidden_sizes: Sequence[int] = (32,), + in_dim: EncoderOutDim, + hidden_sizes: Sequence[int] = (), activation: Type[nn.Module] = nn.Tanh, init_layers_orthogonal: bool = True, ) -> None: super().__init__() - layer_sizes = tuple(hidden_sizes) + (1,) - self._fc = mlp( - layer_sizes, - activation, - init_layers_orthogonal=init_layers_orthogonal, - final_layer_gain=1.0, + seq = [] + if isinstance(in_dim, tuple): + seq.append(nn.Flatten()) + in_channels = int(np.prod(in_dim)) + else: + in_channels = in_dim + layer_sizes = (in_channels,) + tuple(hidden_sizes) + (1,) + seq.append( + mlp( + layer_sizes, + activation, + init_layers_orthogonal=init_layers_orthogonal, + final_layer_gain=1.0, + hidden_layer_gain=1.0, + ) ) + self._fc = nn.Sequential(*seq) def forward(self, obs: torch.Tensor) -> torch.Tensor: v = self._fc(obs) diff --git a/rl_algo_impls/shared/policy/on_policy.py b/rl_algo_impls/shared/policy/on_policy.py index 2c1fc3e11a31dc840903b92ec94c5e361f51ad99..4484c053eda3a17a2a575961cad235ba09e5bff7 100644 --- a/rl_algo_impls/shared/policy/on_policy.py +++ b/rl_algo_impls/shared/policy/on_policy.py @@ -1,24 +1,20 @@ +from abc import abstractmethod +from typing import NamedTuple, Optional, Sequence, Tuple, TypeVar + import gym import numpy as np import torch - -from abc import abstractmethod from gym.spaces import Box, Discrete, Space -from typing import NamedTuple, Optional, Sequence, Tuple, TypeVar -from rl_algo_impls.shared.module.feature_extractor import FeatureExtractor -from rl_algo_impls.shared.policy.actor import ( - PiForward, - StateDependentNoiseActorHead, - actor_head, -) +from rl_algo_impls.shared.actor import PiForward, actor_head +from rl_algo_impls.shared.encoder import Encoder from rl_algo_impls.shared.policy.critic import CriticHead from rl_algo_impls.shared.policy.policy import ACTIVATION, Policy from 
rl_algo_impls.wrappers.vectorable_wrapper import ( VecEnv, VecEnvObs, - single_observation_space, single_action_space, + single_observation_space, ) @@ -77,7 +73,12 @@ class OnPolicy(Policy): ... @abstractmethod - def step(self, obs: VecEnvObs) -> Step: + def step(self, obs: VecEnvObs, action_masks: Optional[np.ndarray] = None) -> Step: + ... + + @property + @abstractmethod + def action_shape(self) -> Tuple[int, ...]: ... @@ -94,10 +95,11 @@ class ActorCritic(OnPolicy): full_std: bool = True, squash_output: bool = False, share_features_extractor: bool = True, - cnn_feature_dim: int = 512, + cnn_flatten_dim: int = 512, cnn_style: str = "nature", cnn_layers_init_orthogonal: Optional[bool] = None, impala_channels: Sequence[int] = (16, 32, 32), + actor_head_style: str = "single", **kwargs, ) -> None: super().__init__(env, **kwargs) @@ -120,52 +122,56 @@ class ActorCritic(OnPolicy): self.action_space = action_space self.squash_output = squash_output self.share_features_extractor = share_features_extractor - self._feature_extractor = FeatureExtractor( + self._feature_extractor = Encoder( observation_space, activation, init_layers_orthogonal=init_layers_orthogonal, - cnn_feature_dim=cnn_feature_dim, + cnn_flatten_dim=cnn_flatten_dim, cnn_style=cnn_style, cnn_layers_init_orthogonal=cnn_layers_init_orthogonal, impala_channels=impala_channels, ) self._pi = actor_head( self.action_space, - (self._feature_extractor.out_dim,) + tuple(pi_hidden_sizes), + self._feature_extractor.out_dim, + tuple(pi_hidden_sizes), init_layers_orthogonal, activation, log_std_init=log_std_init, use_sde=use_sde, full_std=full_std, squash_output=squash_output, + actor_head_style=actor_head_style, ) if not share_features_extractor: - self._v_feature_extractor = FeatureExtractor( + self._v_feature_extractor = Encoder( observation_space, activation, init_layers_orthogonal=init_layers_orthogonal, - cnn_feature_dim=cnn_feature_dim, + cnn_flatten_dim=cnn_flatten_dim, cnn_style=cnn_style, 
cnn_layers_init_orthogonal=cnn_layers_init_orthogonal, ) - v_hidden_sizes = (self._v_feature_extractor.out_dim,) + tuple( - v_hidden_sizes - ) + critic_in_dim = self._v_feature_extractor.out_dim else: self._v_feature_extractor = None - v_hidden_sizes = (self._feature_extractor.out_dim,) + tuple(v_hidden_sizes) + critic_in_dim = self._feature_extractor.out_dim self._v = CriticHead( + in_dim=critic_in_dim, hidden_sizes=v_hidden_sizes, activation=activation, init_layers_orthogonal=init_layers_orthogonal, ) def _pi_forward( - self, obs: torch.Tensor, action: Optional[torch.Tensor] = None + self, + obs: torch.Tensor, + action_masks: Optional[torch.Tensor], + action: Optional[torch.Tensor] = None, ) -> Tuple[PiForward, torch.Tensor]: p_fe = self._feature_extractor(obs) - pi_forward = self._pi(p_fe, action) + pi_forward = self._pi(p_fe, actions=action, action_masks=action_masks) return pi_forward, p_fe @@ -173,8 +179,13 @@ class ActorCritic(OnPolicy): v_fe = self._v_feature_extractor(obs) if self._v_feature_extractor else p_fc return self._v(v_fe) - def forward(self, obs: torch.Tensor, action: torch.Tensor) -> ACForward: - (_, logp_a, entropy), p_fc = self._pi_forward(obs, action) + def forward( + self, + obs: torch.Tensor, + action: torch.Tensor, + action_masks: Optional[torch.Tensor] = None, + ) -> ACForward: + (_, logp_a, entropy), p_fc = self._pi_forward(obs, action_masks, action=action) v = self._v_forward(obs, p_fc) assert logp_a is not None @@ -192,10 +203,11 @@ class ActorCritic(OnPolicy): v = self._v(fe) return v.cpu().numpy() - def step(self, obs: VecEnvObs) -> Step: + def step(self, obs: VecEnvObs, action_masks: Optional[np.ndarray] = None) -> Step: o = self._as_tensor(obs) + a_masks = self._as_tensor(action_masks) if action_masks is not None else None with torch.no_grad(): - (pi, _, _), p_fc = self._pi_forward(o) + (pi, _, _), p_fc = self._pi_forward(o, action_masks=a_masks) a = pi.sample() logp_a = pi.log_prob(a) @@ -205,13 +217,21 @@ class 
ActorCritic(OnPolicy): clamped_a_np = clamp_actions(a_np, self.action_space, self.squash_output) return Step(a_np, v.cpu().numpy(), logp_a.cpu().numpy(), clamped_a_np) - def act(self, obs: np.ndarray, deterministic: bool = True) -> np.ndarray: + def act( + self, + obs: np.ndarray, + deterministic: bool = True, + action_masks: Optional[np.ndarray] = None, + ) -> np.ndarray: if not deterministic: - return self.step(obs).clamped_a + return self.step(obs, action_masks=action_masks).clamped_a else: o = self._as_tensor(obs) + a_masks = ( + self._as_tensor(action_masks) if action_masks is not None else None + ) with torch.no_grad(): - (pi, _, _), _ = self._pi_forward(o) + (pi, _, _), _ = self._pi_forward(o, action_masks=a_masks) a = pi.mode return clamp_actions(a.cpu().numpy(), self.action_space, self.squash_output) @@ -220,7 +240,10 @@ class ActorCritic(OnPolicy): self.reset_noise() def reset_noise(self, batch_size: Optional[int] = None) -> None: - if isinstance(self._pi, StateDependentNoiseActorHead): - self._pi.sample_weights( - batch_size=batch_size if batch_size else self.env.num_envs - ) + self._pi.sample_weights( + batch_size=batch_size if batch_size else self.env.num_envs + ) + + @property + def action_shape(self) -> Tuple[int, ...]: + return self._pi.action_shape diff --git a/rl_algo_impls/shared/policy/policy.py b/rl_algo_impls/shared/policy/policy.py index 41d49004aec251235744341890412db9ef7ce389..d84a9a5ec2f386ea540b9a05e733d5c29138fe4c 100644 --- a/rl_algo_impls/shared/policy/policy.py +++ b/rl_algo_impls/shared/policy/policy.py @@ -46,7 +46,12 @@ class Policy(nn.Module, ABC): return self @abstractmethod - def act(self, obs: VecEnvObs, deterministic: bool = True) -> np.ndarray: + def act( + self, + obs: VecEnvObs, + deterministic: bool = True, + action_masks: Optional[np.ndarray] = None, + ) -> np.ndarray: ... 
def save(self, path: str) -> None: diff --git a/rl_algo_impls/shared/schedule.py b/rl_algo_impls/shared/schedule.py index 1461a782341eff5d89a53f16aebdc268bf9f7f52..67f947b4585848fb9d68e274ff3ec89643ee0690 100644 --- a/rl_algo_impls/shared/schedule.py +++ b/rl_algo_impls/shared/schedule.py @@ -20,10 +20,38 @@ def constant_schedule(val: float) -> Schedule: return lambda f: val +def spike_schedule( + max_value: float, + start_fraction: float = 1e-2, + end_fraction: float = 1e-4, + peak_progress: float = 0.1, +) -> Schedule: + assert 0 < peak_progress < 1 + + def func(progress_fraction: float) -> float: + if progress_fraction < peak_progress: + fraction = ( + start_fraction + + (1 - start_fraction) * progress_fraction / peak_progress + ) + else: + fraction = 1 + (end_fraction - 1) * (progress_fraction - peak_progress) / ( + 1 - peak_progress + ) + return max_value * fraction + + return func + + def schedule(name: str, start_val: float) -> Schedule: if name == "linear": return linear_schedule(start_val, 0) - return constant_schedule(start_val) + elif name == "none": + return constant_schedule(start_val) + elif name == "spike": + return spike_schedule(start_val) + else: + raise ValueError(f"Schedule {name} not supported") def update_learning_rate(optimizer: Optimizer, learning_rate: float) -> None: diff --git a/rl_algo_impls/shared/stats.py b/rl_algo_impls/shared/stats.py index 2315e6bb0de04ee56ca577cb7444f17e93e88fc0..fe020c6d930ae4ed9b1cc98bb32fd72a35f0fd90 100644 --- a/rl_algo_impls/shared/stats.py +++ b/rl_algo_impls/shared/stats.py @@ -1,14 +1,17 @@ -import numpy as np - +import dataclasses +from collections import defaultdict from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union + +import numpy as np from torch.utils.tensorboard.writer import SummaryWriter -from typing import Dict, List, Optional, Sequence, Union, TypeVar @dataclass class Episode: score: float = 0 length: int = 0 + info: Dict[str, Dict[str, Any]] 
= dataclasses.field(default_factory=dict) StatisticSelf = TypeVar("StatisticSelf", bound="Statistic") @@ -75,12 +78,25 @@ class EpisodesStats: simple: bool score: Statistic length: Statistic + additional_stats: Dict[str, Statistic] def __init__(self, episodes: Sequence[Episode], simple: bool = False) -> None: self.episodes = episodes self.simple = simple self.score = Statistic(np.array([e.score for e in episodes])) self.length = Statistic(np.array([e.length for e in episodes]), round_digits=0) + additional_values = defaultdict(list) + for e in self.episodes: + if e.info: + for k, v in e.info.items(): + if isinstance(v, dict): + for k2, v2 in v.items(): + additional_values[f"{k}_{k2}"].append(v2) + else: + additional_values[k].append(v) + self.additional_stats = { + k: Statistic(np.array(values)) for k, values in additional_values.items() + } def __gt__(self: EpisodesStatsSelf, o: EpisodesStatsSelf) -> bool: return self.score > o.score @@ -118,6 +134,8 @@ class EpisodesStats: "length": self.length.mean, } ) + for k, addl_stats in self.additional_stats.items(): + stats[k] = addl_stats.mean for name, value in stats.items(): tb_writer.add_scalar(f"{main_tag}/{name}", value, global_step=global_step) @@ -131,19 +149,19 @@ class EpisodeAccumulator: def episodes(self) -> List[Episode]: return self._episodes - def step(self, reward: np.ndarray, done: np.ndarray) -> None: + def step(self, reward: np.ndarray, done: np.ndarray, info: List[Dict]) -> None: for idx, current in enumerate(self.current_episodes): current.score += reward[idx] current.length += 1 if done[idx]: self._episodes.append(current) self.current_episodes[idx] = Episode() - self.on_done(idx, current) + self.on_done(idx, current, info[idx]) def __len__(self) -> int: return len(self.episodes) - def on_done(self, ep_idx: int, episode: Episode) -> None: + def on_done(self, ep_idx: int, episode: Episode, info: Dict) -> None: pass def stats(self) -> EpisodesStats: diff --git a/rl_algo_impls/shared/vec_env/__init__.py 
b/rl_algo_impls/shared/vec_env/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc0b9ef22dfc40a559381b561c7c4a03a0721ba4 --- /dev/null +++ b/rl_algo_impls/shared/vec_env/__init__.py @@ -0,0 +1 @@ +from rl_algo_impls.shared.vec_env.make_env import make_env, make_eval_env diff --git a/rl_algo_impls/shared/vec_env/make_env.py b/rl_algo_impls/shared/vec_env/make_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f14baf2db982bf7e689313ed55d5d6d88a8eabfb --- /dev/null +++ b/rl_algo_impls/shared/vec_env/make_env.py @@ -0,0 +1,66 @@ +from dataclasses import asdict +from typing import Optional + +from torch.utils.tensorboard.writer import SummaryWriter + +from rl_algo_impls.runner.config import Config, EnvHyperparams +from rl_algo_impls.shared.vec_env.microrts import make_microrts_env +from rl_algo_impls.shared.vec_env.procgen import make_procgen_env +from rl_algo_impls.shared.vec_env.vec_env import make_vec_env +from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv + + +def make_env( + config: Config, + hparams: EnvHyperparams, + training: bool = True, + render: bool = False, + normalize_load_path: Optional[str] = None, + tb_writer: Optional[SummaryWriter] = None, +) -> VecEnv: + if hparams.env_type == "procgen": + return make_procgen_env( + config, + hparams, + training=training, + render=render, + normalize_load_path=normalize_load_path, + tb_writer=tb_writer, + ) + elif hparams.env_type in {"sb3vec", "gymvec"}: + return make_vec_env( + config, + hparams, + training=training, + render=render, + normalize_load_path=normalize_load_path, + tb_writer=tb_writer, + ) + elif hparams.env_type == "microrts": + return make_microrts_env( + config, + hparams, + training=training, + render=render, + normalize_load_path=normalize_load_path, + tb_writer=tb_writer, + ) + else: + raise ValueError(f"env_type {hparams.env_type} not supported") + + +def make_eval_env( + config: Config, + hparams: EnvHyperparams, + 
override_n_envs: Optional[int] = None, + **kwargs, +) -> VecEnv: + kwargs = kwargs.copy() + kwargs["training"] = False + if override_n_envs is not None: + hparams_kwargs = asdict(hparams) + hparams_kwargs["n_envs"] = override_n_envs + if override_n_envs == 1: + hparams_kwargs["vec_env_class"] = "sync" + hparams = EnvHyperparams(**hparams_kwargs) + return make_env(config, hparams, **kwargs) \ No newline at end of file diff --git a/rl_algo_impls/shared/vec_env/microrts.py b/rl_algo_impls/shared/vec_env/microrts.py new file mode 100644 index 0000000000000000000000000000000000000000..8d43dd0e0c299c58074679d671ccd3a15b08d8dd --- /dev/null +++ b/rl_algo_impls/shared/vec_env/microrts.py @@ -0,0 +1,94 @@ +from dataclasses import astuple +from typing import Optional + +import gym +import numpy as np +from torch.utils.tensorboard.writer import SummaryWriter + +from rl_algo_impls.runner.config import Config, EnvHyperparams +from rl_algo_impls.wrappers.action_mask_wrapper import MicrortsMaskWrapper +from rl_algo_impls.wrappers.episode_stats_writer import EpisodeStatsWriter +from rl_algo_impls.wrappers.hwc_to_chw_observation import HwcToChwObservation +from rl_algo_impls.wrappers.is_vector_env import IsVectorEnv +from rl_algo_impls.wrappers.microrts_stats_recorder import MicrortsStatsRecorder +from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv + + +def make_microrts_env( + config: Config, + hparams: EnvHyperparams, + training: bool = True, + render: bool = False, + normalize_load_path: Optional[str] = None, + tb_writer: Optional[SummaryWriter] = None, +) -> VecEnv: + import gym_microrts + from gym_microrts import microrts_ai + + from rl_algo_impls.shared.vec_env.microrts_compat import ( + MicroRTSGridModeVecEnvCompat, + ) + + ( + _, # env_type + n_envs, + _, # frame_stack + make_kwargs, + _, # no_reward_timeout_steps + _, # no_reward_fire_steps + _, # vec_env_class + _, # normalize + _, # normalize_kwargs, + rolling_length, + _, # train_record_video + _, # 
video_step_interval + _, # initial_steps_to_truncate + _, # clip_atari_rewards + _, # normalize_type + _, # mask_actions + bots, + ) = astuple(hparams) + + seed = config.seed(training=training) + + make_kwargs = make_kwargs or {} + if "num_selfplay_envs" not in make_kwargs: + make_kwargs["num_selfplay_envs"] = 0 + if "num_bot_envs" not in make_kwargs: + make_kwargs["num_bot_envs"] = n_envs - make_kwargs["num_selfplay_envs"] + if "reward_weight" in make_kwargs: + make_kwargs["reward_weight"] = np.array(make_kwargs["reward_weight"]) + if bots: + ai2s = [] + for ai_name, n in bots.items(): + for _ in range(n): + if len(ai2s) >= make_kwargs["num_bot_envs"]: + break + ai = getattr(microrts_ai, ai_name) + assert ai, f"{ai_name} not in microrts_ai" + ai2s.append(ai) + else: + ai2s = [microrts_ai.randomAI for _ in make_kwargs["num_bot_envs"]] + make_kwargs["ai2s"] = ai2s + envs = MicroRTSGridModeVecEnvCompat(**make_kwargs) + envs = HwcToChwObservation(envs) + envs = IsVectorEnv(envs) + envs = MicrortsMaskWrapper(envs) + + if seed is not None: + envs.action_space.seed(seed) + envs.observation_space.seed(seed) + + envs = gym.wrappers.RecordEpisodeStatistics(envs) + envs = MicrortsStatsRecorder(envs, config.algo_hyperparams.get("gamma", 0.99)) + if training: + assert tb_writer + envs = EpisodeStatsWriter( + envs, + tb_writer, + training=training, + rolling_length=rolling_length, + additional_keys_to_log=config.additional_keys_to_log, + ) + + return envs diff --git a/rl_algo_impls/shared/vec_env/microrts_compat.py b/rl_algo_impls/shared/vec_env/microrts_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..c49908771dda12ec666615b8bec62312dc5b0580 --- /dev/null +++ b/rl_algo_impls/shared/vec_env/microrts_compat.py @@ -0,0 +1,49 @@ +from typing import TypeVar + +import numpy as np +from gym_microrts.envs.vec_env import MicroRTSGridModeVecEnv +from jpype.types import JArray, JInt + +from rl_algo_impls.wrappers.vectorable_wrapper import VecEnvStepReturn + 
+MicroRTSGridModeVecEnvCompatSelf = TypeVar( + "MicroRTSGridModeVecEnvCompatSelf", bound="MicroRTSGridModeVecEnvCompat" +) + + +class MicroRTSGridModeVecEnvCompat(MicroRTSGridModeVecEnv): + def step(self, action: np.ndarray) -> VecEnvStepReturn: + indexed_actions = np.concatenate( + [ + np.expand_dims( + np.stack( + [np.arange(0, action.shape[1]) for i in range(self.num_envs)] + ), + axis=2, + ), + action, + ], + axis=2, + ) + action_mask = np.array(self.vec_client.getMasks(0), dtype=np.bool8).reshape( + indexed_actions.shape[:-1] + (-1,) + ) + valid_action_mask = action_mask[:, :, 0] + valid_actions_counts = valid_action_mask.sum(1) + valid_actions = indexed_actions[valid_action_mask] + valid_actions_idx = 0 + + all_valid_actions = [] + for env_act_cnt in valid_actions_counts: + env_valid_actions = [] + for _ in range(env_act_cnt): + env_valid_actions.append(JArray(JInt)(valid_actions[valid_actions_idx])) + valid_actions_idx += 1 + all_valid_actions.append(JArray(JArray(JInt))(env_valid_actions)) + return super().step(JArray(JArray(JArray(JInt)))(all_valid_actions)) # type: ignore + + @property + def unwrapped( + self: MicroRTSGridModeVecEnvCompatSelf, + ) -> MicroRTSGridModeVecEnvCompatSelf: + return self diff --git a/rl_algo_impls/shared/vec_env/procgen.py b/rl_algo_impls/shared/vec_env/procgen.py new file mode 100644 index 0000000000000000000000000000000000000000..d339799b75f8ec8724e3ef6b345a7e290d731c49 --- /dev/null +++ b/rl_algo_impls/shared/vec_env/procgen.py @@ -0,0 +1,81 @@ +from dataclasses import astuple +from typing import Optional + +import gym +import numpy as np +from torch.utils.tensorboard.writer import SummaryWriter + +from rl_algo_impls.runner.config import Config, EnvHyperparams +from rl_algo_impls.wrappers.episode_stats_writer import EpisodeStatsWriter +from rl_algo_impls.wrappers.hwc_to_chw_observation import HwcToChwObservation +from rl_algo_impls.wrappers.is_vector_env import IsVectorEnv +from rl_algo_impls.wrappers.vectorable_wrapper 
import VecEnv + + +def make_procgen_env( + config: Config, + hparams: EnvHyperparams, + training: bool = True, + render: bool = False, + normalize_load_path: Optional[str] = None, + tb_writer: Optional[SummaryWriter] = None, +) -> VecEnv: + from gym3 import ExtractDictObWrapper, ViewerWrapper + from procgen.env import ProcgenGym3Env, ToBaselinesVecEnv + + ( + _, # env_type + n_envs, + _, # frame_stack + make_kwargs, + _, # no_reward_timeout_steps + _, # no_reward_fire_steps + _, # vec_env_class + normalize, + normalize_kwargs, + rolling_length, + _, # train_record_video + _, # video_step_interval + _, # initial_steps_to_truncate + _, # clip_atari_rewards + _, # normalize_type + _, # mask_actions + _, # bots + ) = astuple(hparams) + + seed = config.seed(training=training) + + make_kwargs = make_kwargs or {} + make_kwargs["render_mode"] = "rgb_array" + if seed is not None: + make_kwargs["rand_seed"] = seed + + envs = ProcgenGym3Env(n_envs, config.env_id, **make_kwargs) + envs = ExtractDictObWrapper(envs, key="rgb") + if render: + envs = ViewerWrapper(envs, info_key="rgb") + envs = ToBaselinesVecEnv(envs) + envs = IsVectorEnv(envs) + # TODO: Handle Grayscale and/or FrameStack + envs = HwcToChwObservation(envs) + + envs = gym.wrappers.RecordEpisodeStatistics(envs) + + if seed is not None: + envs.action_space.seed(seed) + envs.observation_space.seed(seed) + + if training: + assert tb_writer + envs = EpisodeStatsWriter( + envs, tb_writer, training=training, rolling_length=rolling_length + ) + if normalize and training: + normalize_kwargs = normalize_kwargs or {} + envs = gym.wrappers.NormalizeReward(envs) + clip_obs = normalize_kwargs.get("clip_reward", 10.0) + envs = gym.wrappers.TransformReward( + envs, lambda r: np.clip(r, -clip_obs, clip_obs) + ) + + return envs # type: ignore diff --git a/rl_algo_impls/shared/vec_env/utils.py b/rl_algo_impls/shared/vec_env/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..af2e69ddd5e49be855f72f98e7c8a4a325f5c075 --- /dev/null +++ b/rl_algo_impls/shared/vec_env/utils.py @@ -0,0 +1,31 @@ +import gym + +from rl_algo_impls.runner.config import Config + + +def import_for_env_id(env_id: str) -> None: + if "BulletEnv" in env_id: + import pybullet_envs + if "Microrts" in env_id: + import gym_microrts + + +def is_atari(config: Config) -> bool: + spec = gym.spec(config.env_id) + return "AtariEnv" in str(spec.entry_point) + + +def is_bullet_env(config: Config) -> bool: + return "BulletEnv" in config.env_id + + +def is_car_racing(config: Config) -> bool: + return "CarRacing" in config.env_id + + +def is_gym_procgen(config: Config) -> bool: + return "procgen" in config.env_id + + +def is_microrts(config: Config) -> bool: + return "Microrts" in config.env_id \ No newline at end of file diff --git a/rl_algo_impls/runner/env.py b/rl_algo_impls/shared/vec_env/vec_env.py similarity index 60% rename from rl_algo_impls/runner/env.py rename to rl_algo_impls/shared/vec_env/vec_env.py index ccdd8343c9f8c853e839a030b858fda50e53a7cb..68079fc08cb3b45afd2de4710e3daa506d911c7b 100644 --- a/rl_algo_impls/runner/env.py +++ b/rl_algo_impls/shared/vec_env/vec_env.py @@ -1,32 +1,38 @@ -import gym -import numpy as np import os +from dataclasses import astuple +from typing import Callable, Optional -from dataclasses import asdict, astuple +import gym from gym.vector.async_vector_env import AsyncVectorEnv from gym.vector.sync_vector_env import SyncVectorEnv -from gym.wrappers.resize_observation import ResizeObservation -from gym.wrappers.gray_scale_observation import GrayScaleObservation from gym.wrappers.frame_stack import FrameStack -from stable_baselines3.common.atari_wrappers import ( - MaxAndSkipEnv, - NoopResetEnv, -) +from gym.wrappers.gray_scale_observation import GrayScaleObservation +from gym.wrappers.resize_observation import ResizeObservation +from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv, 
NoopResetEnv from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv from stable_baselines3.common.vec_env.vec_normalize import VecNormalize from torch.utils.tensorboard.writer import SummaryWriter -from typing import Callable, Optional from rl_algo_impls.runner.config import Config, EnvHyperparams from rl_algo_impls.shared.policy.policy import VEC_NORMALIZE_FILENAME +from rl_algo_impls.shared.vec_env.utils import ( + import_for_env_id, + is_atari, + is_bullet_env, + is_car_racing, + is_gym_procgen, + is_microrts, +) +from rl_algo_impls.wrappers.action_mask_wrapper import SingleActionMaskWrapper from rl_algo_impls.wrappers.atari_wrappers import ( + ClipRewardEnv, EpisodicLifeEnv, FireOnLifeStarttEnv, - ClipRewardEnv, ) from rl_algo_impls.wrappers.episode_record_video import EpisodeRecordVideo from rl_algo_impls.wrappers.episode_stats_writer import EpisodeStatsWriter +from rl_algo_impls.wrappers.hwc_to_chw_observation import HwcToChwObservation from rl_algo_impls.wrappers.initial_step_truncate_wrapper import ( InitialStepTruncateWrapper, ) @@ -37,59 +43,11 @@ from rl_algo_impls.wrappers.normalize import NormalizeObservation, NormalizeRewa from rl_algo_impls.wrappers.sync_vector_env_render_compat import ( SyncVectorEnvRenderCompat, ) -from rl_algo_impls.wrappers.transpose_image_observation import TransposeImageObservation from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv from rl_algo_impls.wrappers.video_compat_wrapper import VideoCompatWrapper -def make_env( - config: Config, - hparams: EnvHyperparams, - training: bool = True, - render: bool = False, - normalize_load_path: Optional[str] = None, - tb_writer: Optional[SummaryWriter] = None, -) -> VecEnv: - if hparams.env_type == "procgen": - return _make_procgen_env( - config, - hparams, - training=training, - render=render, - normalize_load_path=normalize_load_path, - tb_writer=tb_writer, - ) - elif hparams.env_type 
in {"sb3vec", "gymvec"}: - return _make_vec_env( - config, - hparams, - training=training, - render=render, - normalize_load_path=normalize_load_path, - tb_writer=tb_writer, - ) - else: - raise ValueError(f"env_type {hparams.env_type} not supported") - - -def make_eval_env( - config: Config, - hparams: EnvHyperparams, - override_n_envs: Optional[int] = None, - **kwargs, -) -> VecEnv: - kwargs = kwargs.copy() - kwargs["training"] = False - if override_n_envs is not None: - hparams_kwargs = asdict(hparams) - hparams_kwargs["n_envs"] = override_n_envs - if override_n_envs == 1: - hparams_kwargs["vec_env_class"] = "sync" - hparams = EnvHyperparams(**hparams_kwargs) - return make_env(config, hparams, **kwargs) - - -def _make_vec_env( +def make_vec_env( config: Config, hparams: EnvHyperparams, training: bool = True, @@ -112,22 +70,22 @@ def _make_vec_env( video_step_interval, initial_steps_to_truncate, clip_atari_rewards, + normalize_type, + mask_actions, + _, # bots ) = astuple(hparams) - if "BulletEnv" in config.env_id: - import pybullet_envs + import_for_env_id(config.env_id) - spec = gym.spec(config.env_id) seed = config.seed(training=training) make_kwargs = make_kwargs.copy() if make_kwargs is not None else {} - if "BulletEnv" in config.env_id and render: + if is_bullet_env(config) and render: make_kwargs["render"] = True - if "CarRacing" in config.env_id: + if is_car_racing(config): make_kwargs["verbose"] = 0 - if "procgen" in config.env_id: - if not render: - make_kwargs["render_mode"] = "rgb_array" + if is_gym_procgen(config) and not render: + make_kwargs["render_mode"] = "rgb_array" def make(idx: int) -> Callable[[], gym.Env]: def _make() -> gym.Env: @@ -145,7 +103,7 @@ def _make_vec_env( env = InitialStepTruncateWrapper( env, idx * initial_steps_to_truncate // n_envs ) - if "AtariEnv" in spec.entry_point: # type: ignore + if is_atari(config): # type: ignore env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) env = EpisodicLifeEnv(env, 
training=training) @@ -157,16 +115,18 @@ def _make_vec_env( env = ResizeObservation(env, (84, 84)) env = GrayScaleObservation(env, keep_dim=False) env = FrameStack(env, frame_stack) - elif "CarRacing" in config.env_id: + elif is_car_racing(config): env = ResizeObservation(env, (64, 64)) env = GrayScaleObservation(env, keep_dim=False) env = FrameStack(env, frame_stack) - elif "procgen" in config.env_id: + elif is_gym_procgen(config): # env = GrayScaleObservation(env, keep_dim=False) env = NoopEnvSeed(env) - env = TransposeImageObservation(env) + env = HwcToChwObservation(env) if frame_stack > 1: env = FrameStack(env, frame_stack) + elif is_microrts(config): + env = HwcToChwObservation(env) if no_reward_timeout_steps: env = NoRewardTimeout( @@ -191,14 +151,20 @@ def _make_vec_env( envs = VecEnvClass([make(i) for i in range(n_envs)]) if env_type == "gymvec" and vec_env_class == "sync": envs = SyncVectorEnvRenderCompat(envs) + if env_type == "sb3vec": + envs = IsVectorEnv(envs) + if mask_actions: + envs = SingleActionMaskWrapper(envs) if training: assert tb_writer envs = EpisodeStatsWriter( envs, tb_writer, training=training, rolling_length=rolling_length ) if normalize: + if normalize_type is None: + normalize_type = "sb3" if env_type == "sb3vec" else "gymlike" normalize_kwargs = normalize_kwargs or {} - if env_type == "sb3vec": + if normalize_type == "sb3": if normalize_load_path: envs = VecNormalize.load( os.path.join(normalize_load_path, VEC_NORMALIZE_FILENAME), @@ -212,7 +178,7 @@ def _make_vec_env( ) if not training: envs.norm_reward = False - else: + elif normalize_type == "gymlike": if normalize_kwargs.get("norm_obs", True): envs = NormalizeObservation( envs, training=training, clip=normalize_kwargs.get("clip_obs", 10.0) @@ -223,70 +189,8 @@ def _make_vec_env( training=training, clip=normalize_kwargs.get("clip_reward", 10.0), ) + else: + raise ValueError( + f"normalize_type {normalize_type} not supported (sb3 or gymlike)" + ) return envs - - -def 
_make_procgen_env( - config: Config, - hparams: EnvHyperparams, - training: bool = True, - render: bool = False, - normalize_load_path: Optional[str] = None, - tb_writer: Optional[SummaryWriter] = None, -) -> VecEnv: - from gym3 import ViewerWrapper, ExtractDictObWrapper - from procgen.env import ProcgenGym3Env, ToBaselinesVecEnv - - ( - _, # env_type - n_envs, - _, # frame_stack - make_kwargs, - _, # no_reward_timeout_steps - _, # no_reward_fire_steps - _, # vec_env_class - normalize, - normalize_kwargs, - rolling_length, - _, # train_record_video - _, # video_step_interval - _, # initial_steps_to_truncate - _, # clip_atari_rewards - ) = astuple(hparams) - - seed = config.seed(training=training) - - make_kwargs = make_kwargs or {} - make_kwargs["render_mode"] = "rgb_array" - if seed is not None: - make_kwargs["rand_seed"] = seed - - envs = ProcgenGym3Env(n_envs, config.env_id, **make_kwargs) - envs = ExtractDictObWrapper(envs, key="rgb") - if render: - envs = ViewerWrapper(envs, info_key="rgb") - envs = ToBaselinesVecEnv(envs) - envs = IsVectorEnv(envs) - # TODO: Handle Grayscale and/or FrameStack - envs = TransposeImageObservation(envs) - - envs = gym.wrappers.RecordEpisodeStatistics(envs) - - if seed is not None: - envs.action_space.seed(seed) - envs.observation_space.seed(seed) - - if training: - assert tb_writer - envs = EpisodeStatsWriter( - envs, tb_writer, training=training, rolling_length=rolling_length - ) - if normalize and training: - normalize_kwargs = normalize_kwargs or {} - envs = gym.wrappers.NormalizeReward(envs) - clip_obs = normalize_kwargs.get("clip_reward", 10.0) - envs = gym.wrappers.TransformReward( - envs, lambda r: np.clip(r, -clip_obs, clip_obs) - ) - - return envs # type: ignore diff --git a/rl_algo_impls/train.py b/rl_algo_impls/train.py index b1518ca570e1aade189c976fcfc654dbfb4ffc18..afbbc2fc63d9784befe308279dd85731dc36033c 100644 --- a/rl_algo_impls/train.py +++ b/rl_algo_impls/train.py @@ -6,7 +6,8 @@ 
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" from multiprocessing import Pool from rl_algo_impls.runner.running_utils import base_parser -from rl_algo_impls.runner.train import train as runner_train, TrainArgs +from rl_algo_impls.runner.train import TrainArgs +from rl_algo_impls.runner.train import train as runner_train def train() -> None: diff --git a/rl_algo_impls/vpg/policy.py b/rl_algo_impls/vpg/policy.py index e016592fd079ba614ae40f51c70e0119a29113bc..65d29e7d0bc4cb90866cb4c293d4092e313c765a 100644 --- a/rl_algo_impls/vpg/policy.py +++ b/rl_algo_impls/vpg/policy.py @@ -1,21 +1,15 @@ +from typing import Optional, Sequence, Tuple + import numpy as np import torch import torch.nn as nn -from typing import Optional, Sequence - -from rl_algo_impls.shared.module.feature_extractor import FeatureExtractor -from rl_algo_impls.shared.policy.actor import ( - PiForward, - Actor, - StateDependentNoiseActorHead, - actor_head, -) +from rl_algo_impls.shared.actor import Actor, PiForward, actor_head +from rl_algo_impls.shared.encoder import Encoder from rl_algo_impls.shared.policy.critic import CriticHead from rl_algo_impls.shared.policy.on_policy import ( - Step, - ACForward, OnPolicy, + Step, clamp_actions, default_hidden_sizes, ) @@ -23,8 +17,8 @@ from rl_algo_impls.shared.policy.policy import ACTIVATION from rl_algo_impls.wrappers.vectorable_wrapper import ( VecEnv, VecEnvObs, - single_observation_space, single_action_space, + single_observation_space, ) PI_FILE_NAME = "pi.pt" @@ -32,7 +26,7 @@ V_FILE_NAME = "v.pt" class VPGActor(Actor): - def __init__(self, feature_extractor: FeatureExtractor, head: Actor) -> None: + def __init__(self, feature_extractor: Encoder, head: Actor) -> None: super().__init__() self.feature_extractor = feature_extractor self.head = head @@ -41,6 +35,13 @@ class VPGActor(Actor): fe = self.feature_extractor(obs) return self.head(fe, a) + def sample_weights(self, batch_size: int = 1) -> None: + self.head.sample_weights(batch_size=batch_size) + + 
@property + def action_shape(self) -> Tuple[int, ...]: + return self.head.action_shape + class VPGActorCritic(OnPolicy): def __init__( @@ -68,12 +69,13 @@ class VPGActorCritic(OnPolicy): else default_hidden_sizes(obs_space) ) - pi_feature_extractor = FeatureExtractor( + pi_feature_extractor = Encoder( obs_space, activation, init_layers_orthogonal=init_layers_orthogonal ) pi_head = actor_head( self.action_space, - (pi_feature_extractor.out_dim,) + tuple(hidden_sizes), + pi_feature_extractor.out_dim, + tuple(hidden_sizes), init_layers_orthogonal, activation, log_std_init=log_std_init, @@ -83,11 +85,12 @@ class VPGActorCritic(OnPolicy): ) self.pi = VPGActor(pi_feature_extractor, pi_head) - v_feature_extractor = FeatureExtractor( + v_feature_extractor = Encoder( obs_space, activation, init_layers_orthogonal=init_layers_orthogonal ) v_head = CriticHead( - (v_feature_extractor.out_dim,) + tuple(hidden_sizes), + v_feature_extractor.out_dim, + tuple(hidden_sizes), activation=activation, init_layers_orthogonal=init_layers_orthogonal, ) @@ -99,7 +102,10 @@ class VPGActorCritic(OnPolicy): v = self.v(o) return v.cpu().numpy() - def step(self, obs: VecEnvObs) -> Step: + def step(self, obs: VecEnvObs, action_masks: Optional[np.ndarray] = None) -> Step: + assert ( + action_masks is None + ), f"action_masks not currently supported in {self.__class__.__name__}" o = self._as_tensor(obs) with torch.no_grad(): pi, _, _ = self.pi(o) @@ -112,7 +118,15 @@ class VPGActorCritic(OnPolicy): clamped_a_np = clamp_actions(a_np, self.action_space, self.squash_output) return Step(a_np, v.cpu().numpy(), logp_a.cpu().numpy(), clamped_a_np) - def act(self, obs: np.ndarray, deterministic: bool = True) -> np.ndarray: + def act( + self, + obs: np.ndarray, + deterministic: bool = True, + action_masks: Optional[np.ndarray] = None, + ) -> np.ndarray: + assert ( + action_masks is None + ), f"action_masks not currently supported in {self.__class__.__name__}" if not deterministic: return 
self.step(obs).clamped_a else: @@ -127,7 +141,10 @@ class VPGActorCritic(OnPolicy): self.reset_noise() def reset_noise(self, batch_size: Optional[int] = None) -> None: - if isinstance(self.pi.head, StateDependentNoiseActorHead): - self.pi.head.sample_weights( - batch_size=batch_size if batch_size else self.env.num_envs - ) + self.pi.sample_weights( + batch_size=batch_size if batch_size else self.env.num_envs + ) + + @property + def action_shape(self) -> Tuple[int, ...]: + return self.pi.action_shape diff --git a/rl_algo_impls/vpg/vpg.py b/rl_algo_impls/vpg/vpg.py index 9605efb23097a11d4cfd50b6e86b67162b10873e..9a61a860a4a4be66ed2747bb799fa36903832bae 100644 --- a/rl_algo_impls/vpg/vpg.py +++ b/rl_algo_impls/vpg/vpg.py @@ -10,7 +10,7 @@ from typing import Optional, Sequence, TypeVar from rl_algo_impls.shared.algorithm import Algorithm from rl_algo_impls.shared.callbacks.callback import Callback -from rl_algo_impls.shared.gae import compute_rtg_and_advantage, compute_advantage +from rl_algo_impls.shared.gae import compute_rtg_and_advantage_from_trajectories from rl_algo_impls.shared.trajectory import Trajectory, TrajectoryAccumulator from rl_algo_impls.vpg.policy import VPGActorCritic from rl_algo_impls.wrappers.vectorable_wrapper import VecEnv @@ -58,7 +58,6 @@ class VanillaPolicyGradient(Algorithm): max_grad_norm: float = 10.0, n_steps: int = 4_000, sde_sample_freq: int = -1, - update_rtg_between_v_iters: bool = False, ent_coef: float = 0.0, ) -> None: super().__init__(policy, env, device, tb_writer) @@ -73,7 +72,6 @@ class VanillaPolicyGradient(Algorithm): self.n_steps = n_steps self.train_v_iters = train_v_iters self.sde_sample_freq = sde_sample_freq - self.update_rtg_between_v_iters = update_rtg_between_v_iters self.ent_coef = ent_coef @@ -118,7 +116,7 @@ class VanillaPolicyGradient(Algorithm): act = torch.as_tensor( np.concatenate([np.array(t.act) for t in trajectories]), device=self.device ) - rtg, adv = compute_rtg_and_advantage( + rtg, adv = 
compute_rtg_and_advantage_from_trajectories( trajectories, self.policy, self.gamma, self.gae_lambda, self.device ) @@ -135,10 +133,6 @@ class VanillaPolicyGradient(Algorithm): v_loss = 0 for _ in range(self.train_v_iters): - if self.update_rtg_between_v_iters: - rtg = compute_advantage( - trajectories, self.policy, self.gamma, self.gae_lambda, self.device - ) v = self.policy.v(obs) v_loss = ((v - rtg) ** 2).mean() diff --git a/rl_algo_impls/wrappers/action_mask_wrapper.py b/rl_algo_impls/wrappers/action_mask_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..fda16444611a9cc6f348638bd193ce36bddb7db0 --- /dev/null +++ b/rl_algo_impls/wrappers/action_mask_wrapper.py @@ -0,0 +1,42 @@ +from typing import Optional, Union + +import numpy as np + +from rl_algo_impls.wrappers.vectorable_wrapper import ( + VecEnv, + VecotarableWrapper, + find_wrapper, +) + + +class IncompleteArrayError(Exception): + pass + + +class SingleActionMaskWrapper(VecotarableWrapper): + def action_masks(self) -> Optional[np.ndarray]: + envs = getattr(self.env.unwrapped, "envs") + assert ( + envs + ), f"{self.__class__.__name__} expects to wrap synchronous vectorized env" + masks = [getattr(e.unwrapped, "action_mask") for e in envs] + assert all(m is not None for m in masks) + return np.array(masks, dtype=np.bool8) + + +class MicrortsMaskWrapper(VecotarableWrapper): + def action_masks(self) -> np.ndarray: + microrts_env = self.env.unwrapped # type: ignore + vec_client = getattr(microrts_env, "vec_client") + assert ( + vec_client + ), f"{microrts_env.__class__.__name__} must have vec_client property (as MicroRTSVecEnv does)" + return np.array(vec_client.getMasks(0), dtype=np.bool8) + + +def find_action_masker( + env: VecEnv, +) -> Optional[Union[SingleActionMaskWrapper, MicrortsMaskWrapper]]: + return find_wrapper(env, SingleActionMaskWrapper) or find_wrapper( + env, MicrortsMaskWrapper + ) diff --git a/rl_algo_impls/wrappers/episode_stats_writer.py 
b/rl_algo_impls/wrappers/episode_stats_writer.py index 21018e3e03e9f98a3948df5a93c889bb0e17d42f..e82c6643134c8ce0694cd660a57192031591d376 100644 --- a/rl_algo_impls/wrappers/episode_stats_writer.py +++ b/rl_algo_impls/wrappers/episode_stats_writer.py @@ -1,14 +1,14 @@ -import numpy as np - from collections import deque +from typing import Any, Dict, List, Optional + +import numpy as np from torch.utils.tensorboard.writer import SummaryWriter -from typing import Any, Dict, List from rl_algo_impls.shared.stats import Episode, EpisodesStats from rl_algo_impls.wrappers.vectorable_wrapper import ( - VecotarableWrapper, - VecEnvStepReturn, VecEnvObs, + VecEnvStepReturn, + VecotarableWrapper, ) @@ -19,6 +19,7 @@ class EpisodeStatsWriter(VecotarableWrapper): tb_writer: SummaryWriter, training: bool = True, rolling_length=100, + additional_keys_to_log: Optional[List[str]] = None, ): super().__init__(env) self.training = training @@ -28,6 +29,9 @@ class EpisodeStatsWriter(VecotarableWrapper): self.total_steps = 0 self.episode_cnt = 0 self.last_episode_cnt_print = 0 + self.additional_keys_to_log = ( + additional_keys_to_log if additional_keys_to_log is not None else [] + ) def step(self, actions: np.ndarray) -> VecEnvStepReturn: obs, rews, dones, infos = self.env.step(actions) @@ -46,7 +50,8 @@ class EpisodeStatsWriter(VecotarableWrapper): for info in infos: ep_info = info.get("episode") if ep_info: - episode = Episode(ep_info["r"], ep_info["l"]) + additional_info = {k: info[k] for k in self.additional_keys_to_log} + episode = Episode(ep_info["r"], ep_info["l"], info=additional_info) step_episodes.append(episode) self.episodes.append(episode) if step_episodes: diff --git a/rl_algo_impls/wrappers/transpose_image_observation.py b/rl_algo_impls/wrappers/hwc_to_chw_observation.py similarity index 95% rename from rl_algo_impls/wrappers/transpose_image_observation.py rename to rl_algo_impls/wrappers/hwc_to_chw_observation.py index 
7076c9146fa28cd266a2466b58ac6a6b6555d59b..fa570dca9c955d1411a8432d3369b19566e1ab8d 100644 --- a/rl_algo_impls/wrappers/transpose_image_observation.py +++ b/rl_algo_impls/wrappers/hwc_to_chw_observation.py @@ -5,7 +5,7 @@ from gym import ObservationWrapper from gym.spaces import Box -class TransposeImageObservation(ObservationWrapper): +class HwcToChwObservation(ObservationWrapper): def __init__(self, env: gym.Env) -> None: super().__init__(env) diff --git a/rl_algo_impls/wrappers/microrts_stats_recorder.py b/rl_algo_impls/wrappers/microrts_stats_recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..7e90a845eaf09420f05e82551e55267eac08792d --- /dev/null +++ b/rl_algo_impls/wrappers/microrts_stats_recorder.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, List + +import numpy as np + +from rl_algo_impls.wrappers.vectorable_wrapper import ( + VecEnvObs, + VecEnvStepReturn, + VecotarableWrapper, +) + + +class MicrortsStatsRecorder(VecotarableWrapper): + def __init__(self, env, gamma: float) -> None: + super().__init__(env) + self.gamma = gamma + self.raw_rewards = [[] for _ in range(self.num_envs)] + + def reset(self) -> VecEnvObs: + obs = super().reset() + self.raw_rewards = [[] for _ in range(self.num_envs)] + return obs + + def step(self, actions: np.ndarray) -> VecEnvStepReturn: + obs, rews, dones, infos = self.env.step(actions) + self._update_infos(infos, dones) + return obs, rews, dones, infos + + def _update_infos(self, infos: List[Dict[str, Any]], dones: np.ndarray) -> None: + for idx, info in enumerate(infos): + self.raw_rewards[idx].append(info["raw_rewards"]) + for idx, (info, done) in enumerate(zip(infos, dones)): + if done: + raw_rewards = np.array(self.raw_rewards[idx]).sum(0) + raw_names = [str(rf) for rf in self.env.unwrapped.rfs] + info["microrts_stats"] = dict(zip(raw_names, raw_rewards)) + self.raw_rewards[idx] = [] diff --git a/rl_algo_impls/wrappers/vectorable_wrapper.py b/rl_algo_impls/wrappers/vectorable_wrapper.py 
index 03df8d1400ab84242353f5dc1288a4394158b941..e2ae459698ae80aacecfb48977feeef1eee25c8a 100644 --- a/rl_algo_impls/wrappers/vectorable_wrapper.py +++ b/rl_algo_impls/wrappers/vectorable_wrapper.py @@ -1,8 +1,8 @@ +from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union + import numpy as np from gym import Env, Space, Wrapper - from stable_baselines3.common.vec_env import VecEnv as SB3VecEnv -from typing import Dict, List, Optional, Type, TypeVar, Tuple, Union VecEnvObs = Union[np.ndarray, Dict[str, np.ndarray], Tuple[np.ndarray, ...]] VecEnvStepReturn = Tuple[VecEnvObs, np.ndarray, np.ndarray, List[Dict]] diff --git a/saved_models/ppo-BipedalWalker-v3-S1-best/model.pth b/saved_models/ppo-BipedalWalker-v3-S1-best/model.pth deleted file mode 100644 index b7730ff84f463066c7ca8180c38f22f0c9ebe5a1..0000000000000000000000000000000000000000 --- a/saved_models/ppo-BipedalWalker-v3-S1-best/model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c52d6b0b055f9eb72d1e2ad30b46251fa2ca31cacf456da75aef38a1a0f5679b -size 51232 diff --git a/saved_models/ppo-BipedalWalker-v3-S1-best/norm_obs.npz b/saved_models/ppo-BipedalWalker-v3-S1-best/norm_obs.npz deleted file mode 100644 index 9d5de8e511bd164303429447cee63a7295c08510..0000000000000000000000000000000000000000 --- a/saved_models/ppo-BipedalWalker-v3-S1-best/norm_obs.npz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d59dca3f8c1f7274eae367a669bc8214f12982b3090c35074ad71c95ba8effe -size 974 diff --git a/saved_models/ppo-BipedalWalker-v3-S1-best/norm_reward.npz b/saved_models/ppo-BipedalWalker-v3-S1-best/norm_reward.npz deleted file mode 100644 index 6f4b2b9eed59f09d327793cd33b29afb41717468..0000000000000000000000000000000000000000 --- a/saved_models/ppo-BipedalWalker-v3-S1-best/norm_reward.npz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:af1f0061efcb6ae1e439584790e2717b086a347dec17bb6f33ca34ff5f8fb50a -size 581 diff --git a/saved_models/ppo-BipedalWalker-v3-S3-best/model.pth b/saved_models/ppo-BipedalWalker-v3-S3-best/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..79b1d1befdf23e43c214974623ef62311b12c288 --- /dev/null +++ b/saved_models/ppo-BipedalWalker-v3-S3-best/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49678f9ff4f809a7ae1ec14f9113c2a961c7b0e94e4f8195ec2a03da729409fb +size 51296 diff --git a/saved_models/ppo-BipedalWalker-v3-S3-best/norm_obs.npz b/saved_models/ppo-BipedalWalker-v3-S3-best/norm_obs.npz new file mode 100644 index 0000000000000000000000000000000000000000..2da83aab5ae6b5c2c6c6ef126afbfdea22de36ce --- /dev/null +++ b/saved_models/ppo-BipedalWalker-v3-S3-best/norm_obs.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:646e54e590818d9ffe025d4c1a7019a1e0b78d220a3b00c81364f23b774f7052 +size 974 diff --git a/saved_models/ppo-BipedalWalker-v3-S3-best/norm_reward.npz b/saved_models/ppo-BipedalWalker-v3-S3-best/norm_reward.npz new file mode 100644 index 0000000000000000000000000000000000000000..03656e7b56a79ca98c356ba52387a6e7865439ee --- /dev/null +++ b/saved_models/ppo-BipedalWalker-v3-S3-best/norm_reward.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc0881f75ed5f9a0081f0c59cedac6417b42d237c42e787fdb4ed6cd4b7d5151 +size 581 diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh index edb726cfd0e2e8fcc7cce9aa5cfd03bad82492fd..e42b20dbee4b107c095acae7a8588c5b8f99012e 100644 --- a/scripts/benchmark.sh +++ b/scripts/benchmark.sh @@ -6,23 +6,26 @@ do -p) project_name=$2 ;; -s) seeds=$2 ;; -e) envs=$2 ;; - --procgen) procgen=t + --procgen) procgen=t ;; + --microrts) microrts=t ;; + --no-mask-microrts) no_mask_microrts=t ;; + --microrts-ai) microrts_ai=t ;; esac shift done -algos="${algos:-ppo}" +algos="${algos:-ppo a2c dqn vpg}" n_jobs="${n_jobs:-6}" 
project_name="${project_name:-rl-algo-impls-benchmarks}" seeds="${seeds:-1 2 3}" -DISCRETE_ENVS=( - # Basic +BASIC_ENVS=( "CartPole-v1" "MountainCar-v0" "Acrobot-v1" "LunarLander-v2" - # Atari +) +ATARI_ENVS=( "PongNoFrameskip-v4" "BreakoutNoFrameskip-v4" "SpaceInvadersNoFrameskip-v4" @@ -42,25 +45,45 @@ BOX_ENVS=( ) for algo in $(echo $algos); do - if [ "$algo" = "dqn" ]; then - BENCHMARK_ENVS="${DISCRETE_ENVS[*]}" - else - BENCHMARK_ENVS="${DISCRETE_ENVS[*]} ${BOX_ENVS[*]}" - fi - algo_envs=$envs - if [ -z $algo_envs ]; then - echo "-e unspecified; therefore, benchmark training on ${BENCHMARK_ENVS[*]}" - algo_envs=${BENCHMARK_ENVS[*]} - fi - - PROCGEN_ENVS=( - "procgen-coinrun-easy" - "procgen-starpilot-easy" - "procgen-bossfight-easy" - "procgen-bigfish-easy" - ) if [ "$procgen" = "t" ]; then + PROCGEN_ENVS=( + "procgen-coinrun-easy" + "procgen-starpilot-easy" + "procgen-bossfight-easy" + "procgen-bigfish-easy" + ) algo_envs=${PROCGEN_ENVS[*]} + elif [ "$microrts" = "t" ]; then + MICRORTS_ENVS=( + "MicrortsMining-v1" + "MicrortsAttackShapedReward-v1" + "MicrortsRandomEnemyShapedReward3-v1" + ) + algo_envs=${MICRORTS_ENVS[*]} + elif [ "$no_mask_microrts" = "t" ]; then + NO_MASK_MICRORTS_ENVS=( + "MicrortsMining-v1-NoMask" + "MicrortsAttackShapedReward-v1-NoMask" + "MicrortsRandomEnemyShapedReward3-v1-NoMask" + ) + algo_envs=${NO_MASK_MICRORTS_ENVS[*]} + elif [ "$microrts_ai" == "t" ]; then + MICRORTS_AI_ENVS=( + "MicrortsDefeatCoacAIShaped-v3" + "MicrortsDefeatCoacAIShaped-v3-diverseBots" + ) + algo_envs=${MICRORTS_AI_ENVS[*]} + elif [ -z "$envs" ]; then + if [ "$algo" = "dqn" ]; then + BENCHMARK_ENVS="${BASIC_ENVS[*]} ${ATARI_ENVS[*]}" + elif [ "$algo" = "vpg" ]; then + BENCHMARK_ENVS="${BASIC_ENVS[*]} ${BOX_ENVS[*]}" + else + BENCHMARK_ENVS="${BASIC_ENVS[*]} ${BOX_ENVS[*]} ${ATARI_ENVS[*]}" + fi + algo_envs=${BENCHMARK_ENVS[*]} + else + algo_envs=$envs fi bash scripts/train_loop.sh -a $algo -e "$algo_envs" -p $project_name -s "$seeds" | xargs -I CMD -P $n_jobs 
bash -c CMD diff --git a/scripts/setup.sh b/scripts/setup.sh index 46862d85eb13670353bca988b85207d3f68bb023..1d77a35da518a682dda22cadb825eb255009489c 100644 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -3,8 +3,9 @@ sudo apt install -y python-opengl sudo apt install -y ffmpeg sudo apt install -y xvfb sudo apt install -y swig +sudo apt install -y default-jdk python3 -m pip install --upgrade pip pip install --upgrade torch torchvision torchaudio -python -m pip install --upgrade '.[test,procgen]' \ No newline at end of file +python -m pip install --upgrade '.[all]' \ No newline at end of file diff --git a/scripts/tags_benchmark.sh b/scripts/tags_benchmark.sh index cfde37478f2663ccc41681906ae25cc550bcb814..b44b6404cc9d7ca8fa7fdc1295d234d7dc661ccd 100644 --- a/scripts/tags_benchmark.sh +++ b/scripts/tags_benchmark.sh @@ -1 +1,5 @@ -echo "benchmark_$(git rev-parse --short HEAD) host_$(hostname)" \ No newline at end of file +commit="benchmark_$(git rev-parse --short HEAD)" +host="host_$(hostname)" +branch="branch_$(git rev-parse --abbrev-ref HEAD)" +version="v$(pip show rl_algo_impls | grep Version | sed -e 's#Version:\ ##')" +echo "$commit $host $branch $version" \ No newline at end of file