sgoodfriend committed
Commit 9d36d7e
Parent(s): 0589ae3
PPO playing MicrortsDefeatCoacAIShaped-v3 from https://github.com/sgoodfriend/rl-algo-impls/tree/9ba0ab50894e5cea207289f4af8b53cbafa47748
- README.md +16 -15
- pyproject.toml +1 -1
- replay.meta.json +1 -1
- replay.mp4 +0 -0
- rl_algo_impls/a2c/optimize.py +9 -5
- rl_algo_impls/huggingface_publish.py +1 -1
- rl_algo_impls/hyperparams/a2c.yml +13 -12
- rl_algo_impls/hyperparams/ppo.yml +5 -5
- rl_algo_impls/optimize.py +2 -2
- rl_algo_impls/ppo/ppo.py +1 -1
- rl_algo_impls/runner/config.py +1 -0
- rl_algo_impls/runner/evaluate.py +1 -1
- rl_algo_impls/runner/selfplay_evaluate.py +142 -0
- rl_algo_impls/runner/train.py +6 -8
- rl_algo_impls/selfplay_enjoy.py +53 -0
- rl_algo_impls/shared/actor/state_dependent_noise.py +5 -5
- rl_algo_impls/shared/callbacks/eval_callback.py +13 -1
- rl_algo_impls/shared/policy/actor_critic.py +1 -1
- rl_algo_impls/shared/vec_env/make_env.py +7 -6
- rl_algo_impls/shared/vec_env/microrts.py +6 -2
- rl_algo_impls/shared/vec_env/procgen.py +2 -0
- rl_algo_impls/shared/vec_env/vec_env.py +2 -0
- rl_algo_impls/wrappers/action_mask_wrapper.py +2 -2
- rl_algo_impls/wrappers/microrts_stats_recorder.py +26 -2
- rl_algo_impls/wrappers/self_play_wrapper.py +42 -3
- rl_algo_impls/wrappers/vec_episode_recorder.py +16 -5
- saved_models/ppo-Microrts-selfplay-unet-decay-S1-best/model.pth +1 -1
- selfplay_enjoy.py +4 -0
README.md
CHANGED
@@ -10,7 +10,7 @@ model-index:
   results:
   - metrics:
     - type: mean_reward
-      value: 0.
+      value: 0.77 +/- 0.64
       name: mean_reward
     task:
       type: reinforcement-learning
@@ -27,13 +27,13 @@ All models trained at this commit can be found at https://api.wandb.ai/links/sgo
 
 ## Training Results
 
-This model was trained from 3 trainings of **PPO** agents using different initial seeds. These agents were trained by checking out [
+This model was trained from 3 trainings of **PPO** agents using different initial seeds. These agents were trained by checking out [9ba0ab5](https://github.com/sgoodfriend/rl-algo-impls/tree/9ba0ab50894e5cea207289f4af8b53cbafa47748). The best and last models were kept from each training. This submission has loaded the best models from each training, reevaluates them, and selects the best model from these latest evaluations (mean - std).
 
 | algo | env | seed | reward_mean | reward_std | eval_episodes | best | wandb_url |
 |:-------|:------------------------------|-------:|--------------:|-------------:|----------------:|:-------|:-----------------------------------------------------------------------------|
-| ppo | MicrortsDefeatCoacAIShaped-v3 | 1 | 0.
-| ppo | MicrortsDefeatCoacAIShaped-v3 | 2 | 0.
-| ppo | MicrortsDefeatCoacAIShaped-v3 | 3 | 0.
+| ppo | MicrortsDefeatCoacAIShaped-v3 | 1 | 0.769231 | 0.638971 | 26 | * | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/a0smxvhw) |
+| ppo | MicrortsDefeatCoacAIShaped-v3 | 2 | 0.692308 | 0.721602 | 26 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/8ees317u) |
+| ppo | MicrortsDefeatCoacAIShaped-v3 | 3 | 0.423077 | 0.884615 | 26 | | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/ifj50v2t) |
 
 
 ### Prerequisites: Weights & Biases (WandB)
@@ -53,10 +53,10 @@ login`.
 Note: While the model state dictionary and hyperaparameters are saved, the latest
 implementation could be sufficiently different to not be able to reproduce similar
 results. You might need to checkout the commit the agent was trained on:
-[
+[9ba0ab5](https://github.com/sgoodfriend/rl-algo-impls/tree/9ba0ab50894e5cea207289f4af8b53cbafa47748).
 ```
 # Downloads the model, sets hyperparameters, and runs agent for 3 episodes
-python enjoy.py --wandb-run-path=sgoodfriend/rl-algo-impls-benchmarks/
+python enjoy.py --wandb-run-path=sgoodfriend/rl-algo-impls-benchmarks/a0smxvhw
 ```
 
 Setup hasn't been completely worked out yet, so you might be best served by using Google
@@ -68,7 +68,7 @@ notebook.
 
 ## Training
 If you want the highest chance to reproduce these results, you'll want to checkout the
-commit the agent was trained on: [
+commit the agent was trained on: [9ba0ab5](https://github.com/sgoodfriend/rl-algo-impls/tree/9ba0ab50894e5cea207289f4af8b53cbafa47748). While
 training is deterministic, different hardware will give different results.
 
 ```
@@ -107,6 +107,7 @@ close and has some additional data:
 ```
 additional_keys_to_log:
 - microrts_stats
+- microrts_results
 algo: ppo
 algo_hyperparams:
   batch_size: 3072
@@ -129,7 +130,7 @@ env_hyperparams:
   make_kwargs:
     map_paths:
     - maps/16x16/basesWorkers16x16.xml
-    max_steps:
+    max_steps: 4000
     num_selfplay_envs: 36
     render_theme: 2
     reward_weight:
@@ -142,10 +143,10 @@ env_hyperparams:
   n_envs: 24
   self_play_kwargs:
     num_old_policies: 12
-    save_steps:
+    save_steps: 300000
-    swap_steps:
+    swap_steps: 6000
     swap_window_size: 4
-    window:
+    window: 33
 env_id: MicrortsDefeatCoacAIShaped-v3
 eval_hyperparams:
   deterministic: false
@@ -199,9 +200,9 @@ wandb_entity: null
 wandb_group: null
 wandb_project_name: rl-algo-impls-benchmarks
 wandb_tags:
--
+- benchmark_9ba0ab5
-- host_192-9-
+- host_192-9-155-233
--
+- branch_main
 - v0.0.9
 
 ```
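As a sanity check on the "mean - std" selection described in the README change above, the starred row and the reported 0.77 +/- 0.64 follow directly from the table values; a minimal sketch, not code from this repository:

```
# seed -> (reward_mean, reward_std), copied from the table above
evals = {
    1: (0.769231, 0.638971),
    2: (0.692308, 0.721602),
    3: (0.423077, 0.884615),
}
best_seed = max(evals, key=lambda s: evals[s][0] - evals[s][1])
print(best_seed)  # 1 -> the starred row; its mean/std round to the reported 0.77 +/- 0.64
```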
pyproject.toml
CHANGED
@@ -26,7 +26,7 @@ dependencies = [
     "stable-baselines3[extra] >= 1.7.0, < 1.8",
     "gym[box2d] >= 0.21.0, < 0.22",
     "pyglet == 1.5.27",
-    "wandb",
+    "wandb == 0.13.10",
     "pyvirtualdisplay",
    "pybullet",
    "tabulate",
replay.meta.json
CHANGED
@@ -1 +1 @@
-{"content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\\nbuilt with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\\nlibavutil 56. 31.100 / 56. 31.100\\nlibavcodec 58. 54.100 / 58. 54.100\\nlibavformat 58. 29.100 / 58. 29.100\\nlibavdevice 58. 8.100 / 58. 8.100\\nlibavfilter 7. 57.100 / 7. 57.100\\nlibavresample 4. 0. 0 / 4. 0. 0\\nlibswscale 5. 5.100 / 5. 5.100\\nlibswresample 3. 5.100 / 3. 5.100\\nlibpostproc 55. 5.100 / 55. 5.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-f", "rawvideo", "-s:v", "640x640", "-pix_fmt", "rgb24", "-framerate", "150", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "-r", "150", "/tmp/
+{"content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\\nbuilt with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\\nlibavutil 56. 31.100 / 56. 31.100\\nlibavcodec 58. 54.100 / 58. 54.100\\nlibavformat 58. 29.100 / 58. 29.100\\nlibavdevice 58. 8.100 / 58. 8.100\\nlibavfilter 7. 57.100 / 7. 57.100\\nlibavresample 4. 0. 0 / 4. 0. 0\\nlibswscale 5. 5.100 / 5. 5.100\\nlibswresample 3. 5.100 / 3. 5.100\\nlibpostproc 55. 5.100 / 55. 5.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-f", "rawvideo", "-s:v", "640x640", "-pix_fmt", "rgb24", "-framerate", "150", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "-r", "150", "/tmp/tmpo54rvbdq/ppo-Microrts-selfplay-unet-decay/replay.mp4"]}, "episodes": [{"r": 1.0, "l": 740, "t": 9.992017}]}
replay.mp4
CHANGED
Binary files a/replay.mp4 and b/replay.mp4 differ
rl_algo_impls/a2c/optimize.py
CHANGED
@@ -1,10 +1,10 @@
-import optuna
-
 from copy import deepcopy
 
-
-
+import optuna
+
+from rl_algo_impls.runner.config import Config, EnvHyperparams, Hyperparams
 from rl_algo_impls.shared.policy.optimize_on_policy import sample_on_policy_hyperparams
+from rl_algo_impls.shared.vec_env import make_eval_env
 from rl_algo_impls.tuning.optimize_env import sample_env_hyperparams
 
 
@@ -16,7 +16,11 @@ def sample_params(
     hyperparams = deepcopy(base_hyperparams)
 
     base_env_hyperparams = EnvHyperparams(**hyperparams.env_hyperparams)
-    env = make_eval_env(
+    env = make_eval_env(
+        base_config,
+        base_env_hyperparams,
+        override_hparams={"n_envs": 1},
+    )
 
     # env_hyperparams
     env_hyperparams = sample_env_hyperparams(trial, hyperparams.env_hyperparams, env)
rl_algo_impls/huggingface_publish.py
CHANGED
@@ -133,7 +133,7 @@ def publish(
         make_eval_env(
             config,
             EnvHyperparams(**config.env_hyperparams),
-
+            override_hparams={"n_envs": 1},
             normalize_load_path=model_path,
         ),
         os.path.join(repo_dir_path, "replay"),
rl_algo_impls/hyperparams/a2c.yml
CHANGED
@@ -101,31 +101,32 @@ HopperBulletEnv-v0:
 CarRacing-v0:
   n_timesteps: !!float 4e6
   env_hyperparams:
-    n_envs:
+    n_envs: 4
     frame_stack: 4
     normalize: true
     normalize_kwargs:
       norm_obs: false
       norm_reward: true
   policy_hyperparams:
-    use_sde:
-    log_std_init: -
+    use_sde: true
+    log_std_init: -4.839609092563
     init_layers_orthogonal: true
     activation_fn: tanh
     share_features_extractor: false
     cnn_flatten_dim: 256
     hidden_sizes: [256]
   algo_hyperparams:
-    n_steps:
-    learning_rate: 0.
-
-
-
-
-    vf_coef: 0.
-    max_grad_norm:
-    normalize_advantage:
+    n_steps: 64
+    learning_rate: 0.000018971962220405576
+    gamma: 0.9942776405534832
+    gae_lambda: 0.9549244758833236
+    ent_coef: 0.0000015666550584860516
+    ent_coef_decay: linear
+    vf_coef: 0.12164696385898476
+    max_grad_norm: 2.2574480552177127
+    normalize_advantage: false
     use_rms_prop: false
+    sde_sample_freq: 16
 
 _atari: &atari-defaults
   n_timesteps: !!float 1e7
rl_algo_impls/hyperparams/ppo.yml
CHANGED
@@ -252,13 +252,13 @@ MicrortsRandomEnemyShapedReward3-v1-NoMask:
 _microrts_ai: &microrts-ai-defaults
   <<: *microrts-defaults
   n_timesteps: !!float 100e6
-  additional_keys_to_log: ["microrts_stats"]
+  additional_keys_to_log: ["microrts_stats", "microrts_results"]
   env_hyperparams: &microrts-ai-env-defaults
     n_envs: 24
     env_type: microrts
     make_kwargs: &microrts-ai-env-make-kwargs-defaults
       num_selfplay_envs: 0
-      max_steps:
+      max_steps: 4000
       render_theme: 2
       map_paths: [maps/16x16/basesWorkers16x16.xml]
       reward_weight: [10.0, 1.0, 1.0, 0.2, 1.0, 4.0]
@@ -399,10 +399,10 @@ Microrts-selfplay-unet: &microrts-selfplay-defaults
       num_selfplay_envs: 36
   self_play_kwargs:
     num_old_policies: 12
-    save_steps:
+    save_steps: 300000
-    swap_steps:
+    swap_steps: 6000
     swap_window_size: 4
-    window:
+    window: 33
   eval_hyperparams: &microrts-selfplay-eval-defaults
     <<: *microrts-coacai-eval-defaults
     env_overrides: &microrts-selfplay-eval-env-overrides
rl_algo_impls/optimize.py
CHANGED
@@ -211,7 +211,7 @@ def simple_optimize(trial: optuna.Trial, args: RunArgs, study_args: StudyArgs) -
     eval_env = make_eval_env(
         config,
         EnvHyperparams(**config.env_hyperparams),
-
+        override_hparams={"n_envs": study_args.n_eval_envs},
     )
     optimize_callback = OptimizeCallback(
         policy,
@@ -331,7 +331,7 @@ def stepwise_optimize(
             config,
             EnvHyperparams(**config.env_hyperparams),
             normalize_load_path=config.model_dir_path() if i > 0 else None,
-
+            override_hparams={"n_envs": study_args.n_eval_envs},
         )
 
         start_timesteps = int(i * config.n_timesteps / study_args.n_evaluations)
rl_algo_impls/ppo/ppo.py
CHANGED
@@ -110,7 +110,7 @@ class PPO(Algorithm):
     ) -> None:
         super().__init__(policy, env, device, tb_writer)
         self.policy = policy
-        self.get_action_mask = getattr(env, "get_action_mask")
+        self.get_action_mask = getattr(env, "get_action_mask", None)
 
         self.gamma_schedule = (
             linear_schedule(gamma, gamma_end)
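The added `None` default lets PPO run against vector envs that expose no `get_action_mask` attribute; a tiny illustrative sketch of the pattern, not repository code:

```
class EnvWithoutMask:  # stand-in for a vec env without action masking
    pass

# With the explicit default the lookup yields None instead of raising AttributeError.
get_action_mask = getattr(EnvWithoutMask(), "get_action_mask", None)
assert get_action_mask is None
```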
rl_algo_impls/runner/config.py
CHANGED
@@ -52,6 +52,7 @@ class EnvHyperparams:
     mask_actions: bool = False
     bots: Optional[Dict[str, int]] = None
     self_play_kwargs: Optional[Dict[str, Any]] = None
+    selfplay_bots: Optional[Dict[str, int]] = None
 
 
 HyperparamsSelf = TypeVar("HyperparamsSelf", bound="Hyperparams")
rl_algo_impls/runner/evaluate.py
CHANGED
@@ -70,7 +70,7 @@ def evaluate_model(args: EvalArgs, root_dir: str) -> Evaluation:
     env = make_eval_env(
         config,
         EnvHyperparams(**config.env_hyperparams),
-
+        override_hparams={"n_envs": args.n_envs} if args.n_envs else None,
         render=args.render,
         normalize_load_path=model_path,
     )
rl_algo_impls/runner/selfplay_evaluate.py
ADDED
@@ -0,0 +1,142 @@
+import copy
+import dataclasses
+import os
+import shutil
+from dataclasses import dataclass
+from typing import List, NamedTuple, Optional
+
+import numpy as np
+
+import wandb
+from rl_algo_impls.runner.config import Config, EnvHyperparams, Hyperparams, RunArgs
+from rl_algo_impls.runner.evaluate import Evaluation
+from rl_algo_impls.runner.running_utils import (
+    get_device,
+    load_hyperparams,
+    make_policy,
+    set_seeds,
+)
+from rl_algo_impls.shared.callbacks.eval_callback import evaluate
+from rl_algo_impls.shared.vec_env import make_eval_env
+from rl_algo_impls.wrappers.vec_episode_recorder import VecEpisodeRecorder
+
+
+@dataclass
+class SelfplayEvalArgs(RunArgs):
+    # Either wandb_run_paths or model_file_paths must have 2 elements in it.
+    wandb_run_paths: List[str] = dataclasses.field(default_factory=list)
+    model_file_paths: List[str] = dataclasses.field(default_factory=list)
+    render: bool = False
+    best: bool = True
+    n_envs: int = 1
+    n_episodes: int = 1
+    deterministic_eval: Optional[bool] = None
+    no_print_returns: bool = False
+    video_path: Optional[str] = None
+
+
+def selfplay_evaluate(args: SelfplayEvalArgs, root_dir: str) -> Evaluation:
+    if args.wandb_run_paths:
+        api = wandb.Api()
+        args, config, player_1_model_path = load_player(
+            api, args.wandb_run_paths[0], args, root_dir
+        )
+        _, _, player_2_model_path = load_player(
+            api, args.wandb_run_paths[1], args, root_dir
+        )
+    elif args.model_file_paths:
+        hyperparams = load_hyperparams(args.algo, args.env)
+
+        config = Config(args, hyperparams, root_dir)
+        player_1_model_path, player_2_model_path = args.model_file_paths
+    else:
+        raise ValueError("Must specify 2 wandb_run_paths or 2 model_file_paths")
+
+    print(args)
+
+    set_seeds(args.seed, args.use_deterministic_algorithms)
+
+    env_make_kwargs = (
+        config.eval_hyperparams.get("env_overrides", {}).get("make_kwargs", {}).copy()
+    )
+    env_make_kwargs["num_selfplay_envs"] = args.n_envs * 2
+    env = make_eval_env(
+        config,
+        EnvHyperparams(**config.env_hyperparams),
+        override_hparams={
+            "n_envs": args.n_envs,
+            "selfplay_bots": {
+                player_2_model_path: args.n_envs,
+            },
+            "self_play_kwargs": {
+                "num_old_policies": 0,
+                "save_steps": np.inf,
+                "swap_steps": np.inf,
+                "bot_always_player_2": True,
+            },
+            "bots": None,
+            "make_kwargs": env_make_kwargs,
+        },
+        render=args.render,
+        normalize_load_path=player_1_model_path,
+    )
+    if args.video_path:
+        env = VecEpisodeRecorder(
+            env, args.video_path, max_video_length=18000, num_episodes=args.n_episodes
+        )
+    device = get_device(config, env)
+    policy = make_policy(
+        args.algo,
+        env,
+        device,
+        load_path=player_1_model_path,
+        **config.policy_hyperparams,
+    ).eval()
+
+    deterministic = (
+        args.deterministic_eval
+        if args.deterministic_eval is not None
+        else config.eval_hyperparams.get("deterministic", True)
+    )
+    return Evaluation(
+        policy,
+        evaluate(
+            env,
+            policy,
+            args.n_episodes,
+            render=args.render,
+            deterministic=deterministic,
+            print_returns=not args.no_print_returns,
+        ),
+        config,
+    )
+
+
+class PlayerData(NamedTuple):
+    args: SelfplayEvalArgs
+    config: Config
+    model_path: str
+
+
+def load_player(
+    api: wandb.Api, run_path: str, args: SelfplayEvalArgs, root_dir: str
+) -> PlayerData:
+    args = copy.copy(args)
+
+    run = api.run(run_path)
+    params = run.config
+    args.algo = params["algo"]
+    args.env = params["env"]
+    args.seed = params.get("seed", None)
+    args.use_deterministic_algorithms = params.get("use_deterministic_algorithms", True)
+    config = Config(args, Hyperparams.from_dict_with_extra_fields(params), root_dir)
+    model_path = config.model_dir_path(best=args.best, downloaded=True)
+
+    model_archive_name = config.model_dir_name(best=args.best, extension=".zip")
+    run.file(model_archive_name).download()
+    if os.path.isdir(model_path):
+        shutil.rmtree(model_path)
+    shutil.unpack_archive(model_archive_name, model_path)
+    os.remove(model_archive_name)
+
+    return PlayerData(args, config, model_path)
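A hypothetical driver for this new entry point, pitting two locally downloaded checkpoints against each other. The model paths mirror the commented-out defaults in selfplay_enjoy.py, and the keyword arguments assume `RunArgs` exposes the `algo`, `env`, and `seed` fields used above:

```
import os

from rl_algo_impls.runner.selfplay_evaluate import SelfplayEvalArgs, selfplay_evaluate

args = SelfplayEvalArgs(
    algo="ppo",
    env="Microrts-selfplay-unet-decay",
    seed=1,
    model_file_paths=[
        "downloaded_models/ppo-Microrts-selfplay-unet-decay-S3-best",
        "downloaded_models/ppo-Microrts-selfplay-unet-decay-S2-best",
    ],
    n_episodes=10,
    video_path="videos/decay3-vs-decay2",  # optional; placeholder path
)
selfplay_evaluate(args, os.getcwd())
```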
rl_algo_impls/runner/train.py
CHANGED
@@ -49,7 +49,7 @@ def train(args: TrainArgs):
     print(hyperparams)
     config = Config(args, hyperparams, os.getcwd())
 
-    wandb_enabled = args.wandb_project_name
+    wandb_enabled = bool(args.wandb_project_name)
     if wandb_enabled:
         wandb.tensorboard.patch(
             root_logdir=config.tensorboard_summary_path, pytorch=True
@@ -100,12 +100,15 @@ def train(args: TrainArgs):
         best_model_path=config.model_dir_path(best=True),
         **config.eval_callback_params(),
         video_env=make_eval_env(
-            config,
+            config,
+            EnvHyperparams(**config.env_hyperparams),
+            override_hparams={"n_envs": 1},
         )
         if record_best_videos
         else None,
         best_video_dir=config.best_videos_dir,
         additional_keys_to_log=config.additional_keys_to_log,
+        wandb_enabled=wandb_enabled,
     )
     callbacks: List[Callback] = [eval_callback]
     if config.hyperparams.microrts_reward_decay_callback:
@@ -149,13 +152,8 @@ def train(args: TrainArgs):
 
     if wandb_enabled:
         shutil.make_archive(
-            os.path.join(wandb.run.dir, config.model_dir_name()),
+            os.path.join(wandb.run.dir, config.model_dir_name()),  # type: ignore
             "zip",
             config.model_dir_path(),
         )
-        shutil.make_archive(
-            os.path.join(wandb.run.dir, config.model_dir_name(best=True)),
-            "zip",
-            config.model_dir_path(best=True),
-        )
         wandb.finish()
rl_algo_impls/selfplay_enjoy.py
ADDED
@@ -0,0 +1,53 @@
+# Support for PyTorch mps mode (https://pytorch.org/docs/stable/notes/mps.html)
+import os
+
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+from rl_algo_impls.runner.running_utils import base_parser
+from rl_algo_impls.runner.selfplay_evaluate import SelfplayEvalArgs, selfplay_evaluate
+
+
+def selfplay_enjoy() -> None:
+    parser = base_parser(multiple=False)
+    parser.add_argument(
+        "--wandb-run-paths",
+        type=str,
+        nargs="*",
+        help="WandB run paths to load players from. Must be 0 or 2",
+    )
+    parser.add_argument(
+        "--model-file-paths",
+        type=str,
+        help="File paths to load players from. Must be 0 or 2",
+    )
+    parser.add_argument("--render", action="store_true")
+    parser.add_argument("--n-envs", default=1, type=int)
+    parser.add_argument("--n-episodes", default=1, type=int)
+    parser.add_argument("--deterministic-eval", default=None, type=bool)
+    parser.add_argument(
+        "--no-print-returns", action="store_true", help="Limit printing"
+    )
+    parser.add_argument(
+        "--video-path", type=str, help="Path to save video of all plays"
+    )
+    # parser.set_defaults(
+    #     algo=["ppo"],
+    #     env=["Microrts-selfplay-unet-decay"],
+    #     n_episodes=10,
+    #     model_file_paths=[
+    #         "downloaded_models/ppo-Microrts-selfplay-unet-decay-S3-best",
+    #         "downloaded_models/ppo-Microrts-selfplay-unet-decay-S2-best",
+    #     ],
+    #     video_path="/Users/sgoodfriend/Desktop/decay3-vs-decay2",
+    # )
+    args = parser.parse_args()
+    args.algo = args.algo[0]
+    args.env = args.env[0]
+    args.seed = args.seed[0]
+    args = SelfplayEvalArgs(**vars(args))
+
+    selfplay_evaluate(args, os.getcwd())
+
+
+if __name__ == "__main__":
+    selfplay_enjoy()
rl_algo_impls/shared/actor/state_dependent_noise.py
CHANGED
@@ -172,7 +172,7 @@ class StateDependentNoiseActorHead(Actor):
             not action_masks
         ), f"{self.__class__.__name__} does not support action_masks"
         pi = self._distribution(obs)
-        return pi_forward(pi, actions)
+        return pi_forward(pi, actions, self.bijector)
 
     def sample_weights(self, batch_size: int = 1) -> None:
         std = self._get_std()
@@ -187,13 +187,13 @@ class StateDependentNoiseActorHead(Actor):
 
 
 def pi_forward(
-    distribution: Distribution,
+    distribution: Distribution,
+    actions: Optional[torch.Tensor] = None,
+    bijector: Optional[TanhBijector] = None,
 ) -> PiForward:
     logp_a = None
     entropy = None
     if actions is not None:
         logp_a = distribution.log_prob(actions)
-        entropy = (
-            -logp_a if self.bijector else sum_independent_dims(distribution.entropy())
-        )
+        entropy = -logp_a if bijector else sum_independent_dims(distribution.entropy())
     return PiForward(distribution, logp_a, entropy)
rl_algo_impls/shared/callbacks/eval_callback.py
CHANGED
@@ -1,5 +1,6 @@
 import itertools
 import os
+import shutil
 from time import perf_counter
 from typing import Dict, List, Optional, Union
 
@@ -94,7 +95,7 @@ def evaluate(
     )
 
     obs = env.reset()
-    get_action_mask = getattr(env, "get_action_mask")
+    get_action_mask = getattr(env, "get_action_mask", None)
     while not episodes.is_done():
         act = policy.act(
             obs,
@@ -132,6 +133,7 @@ class EvalCallback(Callback):
         ignore_first_episode: bool = False,
         additional_keys_to_log: Optional[List[str]] = None,
         score_function: str = "mean-std",
+        wandb_enabled: bool = False,
     ) -> None:
         super().__init__()
         self.policy = policy
@@ -157,6 +159,7 @@ class EvalCallback(Callback):
         self.ignore_first_episode = ignore_first_episode
         self.additional_keys_to_log = additional_keys_to_log
         self.score_function = score_function
+        self.wandb_enabled = wandb_enabled
 
     def on_step(self, timesteps_elapsed: int = 1) -> bool:
         super().on_step(timesteps_elapsed)
@@ -196,6 +199,15 @@ class EvalCallback(Callback):
                 assert self.best_model_path
                 self.policy.save(self.best_model_path)
                 print("Saved best model")
+                if self.wandb_enabled:
+                    import wandb
+
+                    best_model_name = os.path.split(self.best_model_path)[-1]
+                    shutil.make_archive(
+                        os.path.join(wandb.run.dir, best_model_name),  # type: ignore
+                        "zip",
+                        self.best_model_path,
+                    )
             self.best.write_to_tensorboard(
                 self.tb_writer, "best_eval", self.timesteps_elapsed
             )
rl_algo_impls/shared/policy/actor_critic.py
CHANGED
@@ -93,7 +93,7 @@ class ActorCritic(OnPolicy):
 
         observation_space = single_observation_space(env)
         action_space = single_action_space(env)
-        action_plane_space = getattr(env, "action_plane_space")
+        action_plane_space = getattr(env, "action_plane_space", None)
 
         self.action_space = action_space
         self.squash_output = squash_output
rl_algo_impls/shared/vec_env/make_env.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import asdict
-from typing import Optional
+from typing import Any, Dict, Optional
 
 from torch.utils.tensorboard.writer import SummaryWriter
 
@@ -52,7 +52,7 @@ def make_env(
 def make_eval_env(
     config: Config,
     hparams: EnvHyperparams,
-
+    override_hparams: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> VecEnv:
     kwargs = kwargs.copy()
@@ -62,10 +62,11 @@ def make_eval_env(
         hparams_kwargs = asdict(hparams)
         hparams_kwargs.update(env_overrides)
         hparams = EnvHyperparams(**hparams_kwargs)
-    if
+    if override_hparams:
         hparams_kwargs = asdict(hparams)
-
-
-
+        for k, v in override_hparams.items():
+            hparams_kwargs[k] = v
+            if k == "n_envs" and v == 1:
+                hparams_kwargs["vec_env_class"] = "sync"
         hparams = EnvHyperparams(**hparams_kwargs)
     return make_env(config, hparams, **kwargs)
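The callers updated in this commit use the new `override_hparams` argument mostly to force a single synchronous env for video recording and publishing; a sketch of that call shape, assuming an existing `Config` instance named `config`:

```
from rl_algo_impls.runner.config import EnvHyperparams
from rl_algo_impls.shared.vec_env import make_eval_env


def single_eval_env(config):
    # Overriding n_envs to 1 also flips vec_env_class to "sync" inside make_eval_env.
    return make_eval_env(
        config,
        EnvHyperparams(**config.env_hyperparams),
        override_hparams={"n_envs": 1},
    )
```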
rl_algo_impls/shared/vec_env/microrts.py
CHANGED
@@ -50,6 +50,7 @@ def make_microrts_env(
         _,  # mask_actions
         bots,
         self_play_kwargs,
+        selfplay_bots,
     ) = astuple(hparams)
 
     seed = config.seed(training=training)
@@ -65,6 +66,7 @@ def make_microrts_env(
             n_envs
             - make_kwargs["num_selfplay_envs"]
             + self_play_kwargs.get("num_old_policies", 0)
+            + (len(selfplay_bots) if selfplay_bots else 0)
         )
     else:
         num_bot_envs = n_envs
@@ -100,14 +102,16 @@ def make_microrts_env(
         envs = MicrortsMaskWrapper(envs)
 
     if self_play_kwargs:
-
+        if selfplay_bots:
+            self_play_kwargs["selfplay_bots"] = selfplay_bots
+        envs = SelfPlayWrapper(envs, config, **self_play_kwargs)
 
     if seed is not None:
         envs.action_space.seed(seed)
         envs.observation_space.seed(seed)
 
     envs = gym.wrappers.RecordEpisodeStatistics(envs)
-    envs = MicrortsStatsRecorder(envs, config.algo_hyperparams.get("gamma", 0.99))
+    envs = MicrortsStatsRecorder(envs, config.algo_hyperparams.get("gamma", 0.99), bots)
     if training:
         assert tb_writer
         envs = EpisodeStatsWriter(
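For reference, the extra `selfplay_bots` term changes the bot-env bookkeeping as follows (values taken from the Microrts selfplay config in this commit; plain arithmetic, not repository code):

```
n_envs = 24              # env_hyperparams.n_envs
num_selfplay_envs = 36   # make_kwargs.num_selfplay_envs
num_old_policies = 12    # self_play_kwargs.num_old_policies
selfplay_bots = {}       # none during training

num_bot_envs = (
    n_envs
    - num_selfplay_envs
    + num_old_policies
    + (len(selfplay_bots) if selfplay_bots else 0)
)
print(num_bot_envs)  # 0 scripted-bot envs for this configuration
```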
rl_algo_impls/shared/vec_env/procgen.py
CHANGED
@@ -41,6 +41,8 @@ def make_procgen_env(
         _,  # normalize_type
         _,  # mask_actions
         _,  # bots
+        _,  # self_play_kwargs
+        _,  # selfplay_bots
     ) = astuple(hparams)
 
     seed = config.seed(training=training)
rl_algo_impls/shared/vec_env/vec_env.py
CHANGED
@@ -73,6 +73,8 @@ def make_vec_env(
         normalize_type,
         mask_actions,
         _,  # bots
+        _,  # self_play_kwargs
+        _,  # selfplay_bots
     ) = astuple(hparams)
 
     import_for_env_id(config.env_id)
rl_algo_impls/wrappers/action_mask_wrapper.py
CHANGED
@@ -16,11 +16,11 @@ class IncompleteArrayError(Exception):
 
 class SingleActionMaskWrapper(VecotarableWrapper):
     def get_action_mask(self) -> Optional[np.ndarray]:
-        envs = getattr(self.env.unwrapped, "envs")  # type: ignore
+        envs = getattr(self.env.unwrapped, "envs", None)  # type: ignore
         assert (
             envs
         ), f"{self.__class__.__name__} expects to wrap synchronous vectorized env"
-        masks = [getattr(e.unwrapped, "action_mask") for e in envs]
+        masks = [getattr(e.unwrapped, "action_mask", None) for e in envs]
         assert all(m is not None for m in masks)
         return np.array(masks, dtype=np.bool_)
 
rl_algo_impls/wrappers/microrts_stats_recorder.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 
@@ -10,10 +10,19 @@ from rl_algo_impls.wrappers.vectorable_wrapper import (
 
 
 class MicrortsStatsRecorder(VecotarableWrapper):
-    def __init__(
+    def __init__(
+        self, env, gamma: float, bots: Optional[Dict[str, int]] = None
+    ) -> None:
         super().__init__(env)
         self.gamma = gamma
         self.raw_rewards = [[] for _ in range(self.num_envs)]
+        self.bots = bots
+        if self.bots:
+            self.bot_at_index = [None] * (env.num_envs - sum(self.bots.values()))
+            for b, n in self.bots.items():
+                self.bot_at_index.extend([b] * n)
+        else:
+            self.bot_at_index = [None] * env.num_envs
 
     def reset(self) -> VecEnvObs:
         obs = super().reset()
@@ -33,4 +42,19 @@ class MicrortsStatsRecorder(VecotarableWrapper):
         raw_rewards = np.array(self.raw_rewards[idx]).sum(0)
         raw_names = [str(rf) for rf in self.env.unwrapped.rfs]
         info["microrts_stats"] = dict(zip(raw_names, raw_rewards))
+
+        winloss = raw_rewards[raw_names.index("WinLossRewardFunction")]
+        microrts_results = {
+            "win": int(winloss == 1),
+            "draw": int(winloss == 0),
+            "loss": int(winloss == -1),
+        }
+        bot = self.bot_at_index[idx]
+        if bot:
+            microrts_results.update(
+                {f"{k}_{bot}": v for k, v in microrts_results.items()}
+            )
+
+        info["microrts_results"] = microrts_results
+
         self.raw_rewards[idx] = []
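A minimal sketch of how the new `microrts_results` entry falls out of the per-episode raw reward sums; the numbers and the second reward-function name are illustrative, only `WinLossRewardFunction` comes from the code above:

```
raw_names = ["WinLossRewardFunction", "ResourceGatherRewardFunction"]
raw_rewards = [1.0, 7.0]  # made-up per-reward-function episode totals

winloss = raw_rewards[raw_names.index("WinLossRewardFunction")]
microrts_results = {
    "win": int(winloss == 1),
    "draw": int(winloss == 0),
    "loss": int(winloss == -1),
}
print(microrts_results)  # {'win': 1, 'draw': 0, 'loss': 0}
```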
rl_algo_impls/wrappers/self_play_wrapper.py
CHANGED
@@ -1,10 +1,11 @@
 import copy
 import random
 from collections import deque
-from typing import Deque, List, Optional
+from typing import Any, Deque, Dict, List, Optional
 
 import numpy as np
 
+from rl_algo_impls.runner.config import Config
 from rl_algo_impls.shared.policy.policy import Policy
 from rl_algo_impls.wrappers.action_mask_wrapper import find_action_masker
 from rl_algo_impls.wrappers.vectorable_wrapper import (
@@ -21,11 +22,14 @@ class SelfPlayWrapper(VecotarableWrapper):
     def __init__(
         self,
         env,
+        config: Config,
         num_old_policies: int = 0,
         save_steps: int = 20_000,
         swap_steps: int = 10_000,
         window: int = 10,
         swap_window_size: int = 2,
+        selfplay_bots: Optional[Dict[str, Any]] = None,
+        bot_always_player_2: bool = False,
     ) -> None:
         super().__init__(env)
         assert num_old_policies % 2 == 0, f"num_old_policies must be even"
@@ -33,17 +37,26 @@ class SelfPlayWrapper(VecotarableWrapper):
             num_old_policies % swap_window_size == 0
         ), f"num_old_policies must be a multiple of swap_window_size"
 
+        self.config = config
         self.num_old_policies = num_old_policies
         self.save_steps = save_steps
         self.swap_steps = swap_steps
         self.swap_window_size = swap_window_size
+        self.selfplay_bots = selfplay_bots
+        self.bot_always_player_2 = bot_always_player_2
 
         self.policies: Deque[Policy] = deque(maxlen=window)
         self.policy_assignments: List[Optional[Policy]] = [None] * env.num_envs
         self.steps_since_swap = np.zeros(env.num_envs)
 
+        self.selfplay_policies: Dict[str, Policy] = {}
+
         self.num_envs = env.num_envs - num_old_policies
 
+        if self.selfplay_bots:
+            self.num_envs -= sum(self.selfplay_bots.values())
+            self.initialize_selfplay_bots()
+
     def get_action_mask(self) -> Optional[np.ndarray]:
         return self.env.get_action_mask()[self.learner_indexes()]
 
@@ -54,10 +67,12 @@ class SelfPlayWrapper(VecotarableWrapper):
         copied_policy.train(False)
         self.policies.append(copied_policy)
 
-        if all(p is None for p in self.policy_assignments):
+        if all(p is None for p in self.policy_assignments[: 2 * self.num_old_policies]):
             for i in range(self.num_old_policies):
                 # Switch between player 1 and 2
-                self.policy_assignments[
+                self.policy_assignments[
+                    2 * i + (i % 2 if not self.bot_always_player_2 else 1)
+                ] = copied_policy
 
     def swap_policy(self, idx: int, swap_window_size: int = 1) -> None:
         policy = random.choice(self.policies)
@@ -69,6 +84,30 @@ class SelfPlayWrapper(VecotarableWrapper):
             swap_window_size * 2
         )
 
+    def initialize_selfplay_bots(self) -> None:
+        if not self.selfplay_bots:
+            return
+        from rl_algo_impls.runner.running_utils import get_device, make_policy
+
+        env = self.env  # Type: ignore
+        device = get_device(self.config, env)
+        start_idx = 2 * self.num_old_policies
+        for model_path, n in self.selfplay_bots.items():
+            policy = make_policy(
+                self.config.algo,
+                env,
+                device,
+                load_path=model_path,
+                **self.config.policy_hyperparams,
+            ).eval()
+            self.selfplay_policies["model_path"] = policy
+            for idx in range(start_idx, start_idx + 2 * n, 2):
+                bot_idx = (
+                    (idx + 1) if self.bot_always_player_2 else (idx + idx // 2 % 2)
+                )
+                self.policy_assignments[bot_idx] = policy
+            start_idx += 2 * n
+
     def step(self, actions: np.ndarray) -> VecEnvStepReturn:
         env = self.env  # type: ignore
         all_actions = np.zeros((env.num_envs,) + actions.shape[1:], dtype=actions.dtype)
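The seat-assignment expression above interleaves frozen policies across env pairs; a small illustrative calculation (not repository code) for num_old_policies=4 with bot_always_player_2=False:

```
num_old_policies = 4
bot_always_player_2 = False

old_policy_seats = [
    2 * i + (i % 2 if not bot_always_player_2 else 1)
    for i in range(num_old_policies)
]
print(old_policy_seats)  # [0, 3, 4, 7]: frozen policies alternate player 1/2 seats per pair
```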
rl_algo_impls/wrappers/vec_episode_recorder.py
CHANGED
@@ -1,21 +1,24 @@
 import numpy as np
-
 from gym.wrappers.monitoring.video_recorder import VideoRecorder
 
 from rl_algo_impls.wrappers.vectorable_wrapper import (
-    VecotarableWrapper,
     VecEnvObs,
     VecEnvStepReturn,
+    VecotarableWrapper,
 )
 
 
 class VecEpisodeRecorder(VecotarableWrapper):
-    def __init__(
+    def __init__(
+        self, env, base_path: str, max_video_length: int = 3600, num_episodes: int = 1
+    ):
         super().__init__(env)
         self.base_path = base_path
         self.max_video_length = max_video_length
+        self.num_episodes = num_episodes
         self.video_recorder = None
         self.recorded_frames = 0
+        self.num_completed = 0
 
     def step(self, actions: np.ndarray) -> VecEnvStepReturn:
         obs, rew, dones, infos = self.env.step(actions)
@@ -23,13 +26,21 @@ class VecEpisodeRecorder(VecotarableWrapper):
         if self.video_recorder:
             self.video_recorder.capture_frame()
             self.recorded_frames += 1
+            if dones[0]:
+                self.num_completed += 1
             if dones[0] and infos[0].get("episode"):
                 episode_info = {
                     k: v.item() if hasattr(v, "item") else v
                     for k, v in infos[0]["episode"].items()
                 }
-
-
+
+                if "episodes" not in self.video_recorder.metadata:
+                    self.video_recorder.metadata["episodes"] = []
+                self.video_recorder.metadata["episodes"].append(episode_info)
+            if (
+                self.num_completed == self.num_episodes
+                or self.recorded_frames > self.max_video_length
+            ):
                 self._close_video_recorder()
         return obs, rew, dones, infos
 
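Usage sketch for the extended recorder, assuming `env` is an existing eval VecEnv; the output path is a placeholder:

```
from rl_algo_impls.wrappers.vec_episode_recorder import VecEpisodeRecorder

# Record up to two complete episodes (or max_video_length frames) into one video.
env = VecEpisodeRecorder(
    env, "videos/selfplay-demo", max_video_length=18000, num_episodes=2
)
```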
saved_models/ppo-Microrts-selfplay-unet-decay-S1-best/model.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7bee4122bcaffdea46193740a46d983e016e5b71d837ee1221fbc4b21f15cc39
 size 15323895
selfplay_enjoy.py
ADDED
@@ -0,0 +1,4 @@
+from rl_algo_impls.selfplay_enjoy import selfplay_enjoy
+
+if __name__ == "__main__":
+    selfplay_enjoy()