In [1]:
%%capture
!pip install stable-baselines3[extra]
!pip install moviepy

In [2]:
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList
from stable_baselines3.common.logger import Video, HParam, TensorBoardOutputFormat
from stable_baselines3.common.evaluation import evaluate_policy

from typing import Any, Dict

import gymnasium as gym
import torch as th
import numpy as np

# ===== Output file names =====
MODEL_FILE_NAME = "ALE-Pacman-v5"
POLICY_FILE_NAME = "a2c_policy_pacman_v2"

# ===== Training configuration =====
# Total number of timesteps requested from model.learn().
NUM_TIMESTEPS = 1_500_000
# Frameskip handed to gym.make() for both environments.
FRAMESKIP = 4
# Evaluate in 20ths of the run (every 75_000 steps).
EVAL_CALLBACK_FREQ = NUM_TIMESTEPS // 20
# Record a video in quarters of the run (every 375_000 steps).
# Author's note: the last one won't record and has to be done manually.
VIDEO_CALLBACK_FREQ = NUM_TIMESTEPS // 4

# ===== Hyperparameters =====
# Hyperparameters would normally be defined here; this run deliberately uses the defaults.

2024-05-14 14:39:16.039893: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 14:39:16.039990: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 14:39:16.182583: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# VideoRecorderCallback
# Records a video of the agent in the evaluation environment every `render_freq`
# calls of the callback, capturing `n_eval_episodes` episodes (one by default).
# NOTE: there is no end-of-training hook, so a final video is only recorded if the
# last callback call happens to fall on a multiple of `render_freq`.

class VideoRecorderCallback(BaseCallback):
    """
    Periodically rolls out the current policy in ``eval_env`` and logs the rendered
    frames to TensorBoard as a video under the ``trajectory/video`` tag.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        render_freq: int,
        n_eval_episodes: int = 1,
        deterministic: bool = True,
        fps: int = 60,
    ):
        """
        :param eval_env: A gym environment from which the trajectory is recorded. Its
            ``render()`` must return an H x W x C image array (e.g. an environment
            created with ``render_mode="rgb_array"``).
        :param render_freq: Record a video every ``render_freq`` calls of the callback.
        :param n_eval_episodes: Number of episodes to render per recording.
        :param deterministic: Whether to use deterministic or stochastic policy.
        :param fps: Frame rate attached to the logged video. Defaults to 60, the value
            that used to be hard-coded.
        """
        super().__init__()
        self._eval_env = eval_env
        self._render_freq = render_freq
        self._n_eval_episodes = n_eval_episodes
        self._deterministic = deterministic
        self._fps = fps

    def _on_step(self) -> bool:
        """
        Record and log a video when the call counter hits a multiple of ``render_freq``.

        :return: Always ``True``.
        :raises ValueError: If ``eval_env.render()`` returns ``None`` instead of a frame.
        """
        if self.n_calls % self._render_freq != 0:
            return True

        screens = []

        def grab_screens(_locals: Dict[str, Any], _globals: Dict[str, Any]) -> None:
            """
            Renders the environment in its current state and appends the frame to the
            captured ``screens`` list.

            :param _locals: Local variables handed over by ``evaluate_policy`` (unused)
            :param _globals: Global variables handed over by ``evaluate_policy`` (unused)
            """
            screen = self._eval_env.render()
            if screen is None:
                # Without this check the failure is an opaque AttributeError on `.transpose`.
                raise ValueError(
                    "eval_env.render() returned None; create the environment with "
                    'render_mode="rgb_array" so frames can be recorded.'
                )
            # PyTorch uses CxHxW vs HxWxC gym (and tensorflow) image convention
            screens.append(screen.transpose(2, 0, 1))

        evaluate_policy(
            self.model,
            self._eval_env,
            callback=grab_screens,
            n_eval_episodes=self._n_eval_episodes,
            deterministic=self._deterministic,
        )
        # Wrapping `screens` in an outer list adds a leading axis of size one in front
        # of the frame axis.
        self.logger.record(
            "trajectory/video",
            Video(th.from_numpy(np.array([screens])), fps=self._fps),
            exclude=("stdout", "log", "json", "csv"),
        )
        return True

In [4]:
# HParamCallback
# This should log the hyperparameters specified and map the metrics that are logged to 
# the appropriate run.
class HParamCallback(BaseCallback):
    """
    Logs the model's hyperparameters, together with the metric tags to show next to
    them, to TensorBoard once at the start of training.
    """

    def __init__(self):
        super().__init__()

    def _on_step(self) -> bool:
        # No per-step work; always returns True.
        return True

    def _on_training_start(self) -> None:
        model = self.model

        # rms_prop_eps and use_rms_prop are intentionally left out: reading them from
        # the model threw an error (the model has no such attributes).
        hyperparams = {
            "algorithm": type(model).__name__,
            "policy": type(model.policy).__name__,
            "environment": type(model.env).__name__,
            "learning_rate": model.learning_rate,
            "n_steps": model.n_steps,
            "gamma": model.gamma,
            "gae_lambda": model.gae_lambda,
            "ent_coef": model.ent_coef,
            "vf_coef": model.vf_coef,
            "max_grad_norm": model.max_grad_norm,
            "use_sde": model.use_sde,
            "sde_sample_freq": model.sde_sample_freq,
        }

        # Metric tags that should appear in the `HPARAMS` TensorBoard tab. TensorBoard
        # looks the tags up among the metrics of the `SCALARS` tab; the values given
        # here are only placeholders (0 for the first group, 0.0 for the second).
        int_placeholder_tags = (
            "eval/mean_ep_length",
            "eval/mean_reward",
            "rollout/ep_len_mean",
            "rollout/ep_rew_mean",
            "rollout/exploration_rate",
            "time/_episode_num",
            "time/fps",
            "time/total_timesteps",
        )
        float_placeholder_tags = (
            "train/learning_rate",
            "train/loss",
            "train/n_updates",
            "locals/rewards",
            "locals/infos_0_lives",
            "locals/num_collected_steps",
            "locals/num_collected_episodes",
        )
        tracked_metrics = {
            **dict.fromkeys(int_placeholder_tags, 0),
            **dict.fromkeys(float_placeholder_tags, 0.0),
        }

        # Only the TensorBoard output should receive the HParam record.
        skipped_outputs = ("stdout", "log", "json", "csv")
        self.logger.record(
            "hparams",
            HParam(hyperparams, tracked_metrics),
            exclude=skipped_outputs,
        )

In [5]:
# PlotTensorboardValuesCallback
# This callback should log values to tensorboard on every step. 
# The self.logger class should plot a new scalar value when recording.

class PlotTensorboardValuesCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.

    On every step it records the episode count, the number of updates, the mean step
    reward and the remaining lives reported by the first environment. At the start and
    at the end of training it writes the metadata of both environments as text.

    :param eval_env: Evaluation environment whose ``metadata`` is written as text.
    :param train_env: Training environment whose ``metadata`` is written as text.
    :param model: The model being trained. Stored for reference only; the recorded
        values are read from ``self.model``.
    :param verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    # Output formats that must NOT receive the extra values (TensorBoard only).
    _EXCLUDED_OUTPUTS = ("stdout", "log", "json", "csv")

    def __init__(self, eval_env: gym.Env, train_env: gym.Env, model: A2C, verbose=0):
        super().__init__(verbose)
        self._eval_env = eval_env
        self._train_env = train_env
        self._model = model
        # Set in _on_training_start; stays None when no TensorBoard output is configured.
        self.tb_formatter = None

    def _write_env_metadata(self) -> None:
        """Write both environments' metadata as TensorBoard text, if a writer exists."""
        if self.tb_formatter is None:
            return
        writer = self.tb_formatter.writer
        writer.add_text("metadata/eval_env", str(self._eval_env.metadata), self.num_timesteps)
        writer.add_text("metadata/train_env", str(self._train_env.metadata), self.num_timesteps)
        writer.flush()

    def _on_training_start(self) -> None:
        """Locate the TensorBoard formatter and write the environment metadata."""
        output_formats = self.logger.output_formats
        try:
            # Save reference to tensorboard formatter object
            self.tb_formatter = next(
                formatter for formatter in output_formats if isinstance(formatter, TensorBoardOutputFormat)
            )
        except StopIteration:
            # No TensorBoard output configured: report it and skip the metadata writes
            # instead of failing later on a missing attribute.
            self.tb_formatter = None
            print("Exception thrown in tb_formatter initialization.")
            return

        self._write_env_metadata()

    def _on_step(self) -> bool:
        """
        Record the extra values for the current step.

        :return: Always ``True``.
        """
        exclude = self._EXCLUDED_OUTPUTS
        self.logger.record("time/_episode_num", self.model._episode_num, exclude=exclude)
        self.logger.record("train/n_updates", self.model._n_updates, exclude=exclude)

        # Record the rewards as one scalar (the mean over the rewards array) rather than
        # the raw array, so the tag holds a plain float. With a single environment this
        # is simply that environment's reward.
        step_rewards = self.locals["rewards"]
        self.logger.record("locals/rewards", float(np.mean(step_rewards)), exclude=exclude)

        # "lives" is read from the first info dict; skip the record when the key is absent.
        lives = self.locals["infos"][0].get("lives")
        if lives is not None:
            self.logger.record("locals/infos_0_lives", lives, exclude=exclude)

        return True

    def _on_training_end(self) -> None:
        """Write the environment metadata again, stamped with the final timestep."""
        self._write_env_metadata()

In [6]:
# Build the evaluation and training environments. Both are the same ALE Pacman
# environment rendered to RGB arrays with the notebook-wide frameskip; only the
# evaluation environment is wrapped in a Monitor here.
ENV_ID = "ALE/Pacman-v5"
env_kwargs = dict(render_mode="rgb_array", frameskip=FRAMESKIP)

eval_env = Monitor(gym.make(ENV_ID, **env_kwargs))
train_env = gym.make(ENV_ID, **env_kwargs)

# Create the A2C model on the training environment; every hyperparameter that is not
# passed here is left at its default. TensorBoard logs are rooted at the working directory.
model = A2C(
    policy="CnnPolicy",
    env=train_env,
    tensorboard_log="./",
    verbose=1,
)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [7]:
# Assemble the callbacks used during training.

# Periodic evaluation on eval_env: 10 deterministic episodes every EVAL_CALLBACK_FREQ
# calls, with ./best_model/ and ./evals/ as the save and log locations.
eval_callback = EvalCallback(
    eval_env,
    n_eval_episodes=10,
    eval_freq=EVAL_CALLBACK_FREQ,
    deterministic=True,
    render=False,
    log_path="./evals/",
    best_model_save_path="./best_model/",
)

# Hyperparameter logging, video recording and extra TensorBoard values.
hparam_callback = HParamCallback()
video_callback = VideoRecorderCallback(eval_env=eval_env, render_freq=VIDEO_CALLBACK_FREQ)
tbplot_callback = PlotTensorboardValuesCallback(eval_env, train_env, model)

# The list order is the order given to CallbackList.
callback_list = CallbackList(
    [
        hparam_callback,
        eval_callback,
        video_callback,
        tbplot_callback,
    ]
)

In [8]:
# Train for the configured number of timesteps with all callbacks attached.
# The TensorBoard run name is "./tb/".
model.learn(
    total_timesteps=NUM_TIMESTEPS,
    tb_log_name="./tb/",
    callback=callback_list,
)

Logging to ././tb/_1




------------------------------------
| rollout/ | |
| ep_len_mean | 392 |
| ep_rew_mean | 11 |
| time/ | |
| fps | 106 |
| iterations | 100 |
| time_elapsed | 4 |
| total_timesteps | 500 |
| train/ | |
| entropy_loss | -1.56 |
| explained_variance | -0.0132 |
| learning_rate | 0.0007 |
| policy_loss | -0.011 |
| value_loss | 5.01e-05 |
------------------------------------
------------------------------------
| rollout/ | |
| ep_len_mean | 455 |
| ep_rew_mean | 14.5 |
| time/ | |
| fps | 137 |
| iterations | 200 |
| time_elapsed | 7 |
| total_timesteps | 1000 |
| train/ | |
| entropy_loss | -1.56 |
| explained_variance | -0.774 |
| learning_rate | 0.0007 |
| policy_loss | -0.0162 |
| value_loss | 0.000132 |
------------------------------------
------------------------------------
| rollout/ | |
| ep_len_mean | 407 |
| ep_rew_mean | 12.7 |
| time/ | |
| fps | 152 |
| iterations | 300 |
| time_elapsed | 9 |
| total_timesteps | 1500 |
| train/ | |
| entropy_loss | -1.59 |
| explained_varia

 logger.warn(


---------------------------------
| rollout/ | |
| ep_len_mean | 508 |
| ep_rew_mean | 15.1 |
| time/ | |
| fps | 186 |
| iterations | 75000 |
| time_elapsed | 2010 |
| total_timesteps | 375000 |
---------------------------------
------------------------------------
| rollout/ | |
| ep_len_mean | 509 |
| ep_rew_mean | 15.2 |
| time/ | |
| fps | 186 |
| iterations | 75100 |
| time_elapsed | 2016 |
| total_timesteps | 375500 |
| train/ | |
| entropy_loss | -1.23 |
| explained_variance | 0 |
| learning_rate | 0.0007 |
| policy_loss | -0.0603 |
| value_loss | 0.00163 |
------------------------------------
-------------------------------------
| rollout/ | |
| ep_len_mean | 510 |
| ep_rew_mean | 15.2 |
| time/ | |
| fps | 186 |
| iterations | 75200 |
| time_elapsed | 2019 |
| total_timesteps | 376000 |
| train/ | |
| entropy_loss | -1.25 |
| explained_variance | -1.19e-07 |
| learning_rate | 0.0007 |
| policy_loss | -0.0521 |
| value_loss | 0.00145 |
-------------------------------------
--



In [9]:
# Persist the training result for future loading and training:
# the full model first, then its policy on its own.
model.save(path=MODEL_FILE_NAME)
model.policy.save(path=POLICY_FILE_NAME)