In [1]:
%%capture
!pip install stable-baselines3[extra]
!pip install moviepy

In [2]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList
from stable_baselines3.common.logger import Video, HParam, TensorBoardOutputFormat
from stable_baselines3.common.evaluation import evaluate_policy

from typing import Any, Dict

import gymnasium as gym
import torch as th
import numpy as np

# ===== File names =====
# Names used when the model, replay buffer and policy are saved.
MODEL_FILE_NAME = "ALE-Pacman-v5"
BUFFER_FILE_NAME = "dqn_replay_buffer_pacman_v2-7"
POLICY_FILE_NAME = "dqn_policy_pacman_v2-7"

# ===== Run configuration =====
# Number of timesteps requested from model.learn().
NUM_TIMESTEPS = 1_500_000
# Frame skip handed to gym.make() for both environments.
FRAMESKIP = 4
# Evaluation period: 1_500_000 / 50_000 gives 30 evaluations over the run.
EVAL_CALLBACK_FREQ = 50_000
# Video period: roughly quarters of the run (350k, 700k, 1.05M and 1.4M steps).
# The last recording lands a little before the final timestep rather than exactly
# at the end of training.
VIDEO_CALLBACK_FREQ = 350_000

# ===== Hyperparameters =====
EXPLORATION_FRACTION = 0.3
FINAL_EPSILON = 0.01
# Earlier runs kept the buffer below roughly 60k so that it could be saved from a
# Kaggle instance; 70k is expected to still be storable.
BUFFER_SIZE = 70_000
BATCH_SIZE = 64
LEARNING_STARTS = 100_000
LEARNING_RATE = 5e-5
GAMMA = 0.999
# The library default for the target update interval is 10k, whereas the Nature
# paper appears to list 4. That is a large discrepancy.
# NOTE(review): the 4 in the paper may refer to a different setting (how often a
# gradient update is performed) rather than the target-network update -- confirm.
TARGET_UPDATE_INTERVAL = 1_000

# ===== Overrides applied to the saved model when it is loaded =====
CUSTOM_OBJECTS = dict(
    exploration_fraction=EXPLORATION_FRACTION,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    learning_starts=LEARNING_STARTS,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    target_update_interval=TARGET_UPDATE_INTERVAL,
    exploration_final_eps=FINAL_EPSILON,
    tensorboard_log="./",
    verbose=1,
)

2024-05-16 13:08:34.276877: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-16 13:08:34.276975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-16 13:08:34.377172: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# VideoRecorderCallback
# The VideoRecorderCallback records a video of the agent in the evaluation environment
# every render_freq callback steps. By default it records one episode per video.
# Note: nothing is recorded specifically when training completes; the last video is the
# one triggered by the final multiple of render_freq reached during training.

class VideoRecorderCallback(BaseCallback):
    """
    Periodically records the agent acting in ``eval_env`` and logs the video to TensorBoard.

    Every ``render_freq`` calls of the callback, ``evaluate_policy`` is run for
    ``n_eval_episodes`` episodes; the frames rendered during that evaluation are
    stacked into a single video recorded under the ``trajectory/video`` tag.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        render_freq: int,
        n_eval_episodes: int = 1,
        deterministic: bool = True,
        fps: int = 60,
    ):
        """
        :param eval_env: A gym environment from which the trajectory is recorded
        :param render_freq: Record a video every ``render_freq`` calls of the callback
        :param n_eval_episodes: Number of episodes to render
        :param deterministic: Whether to use deterministic or stochastic policy
        :param fps: Frame rate of the logged video (defaults to the previously hard-coded 60)
        """
        super().__init__()
        self._eval_env = eval_env
        self._render_freq = render_freq
        self._n_eval_episodes = n_eval_episodes
        self._deterministic = deterministic
        self._fps = fps

    def _on_step(self) -> bool:
        """
        Record and log a video when the call counter hits a multiple of ``render_freq``.

        :return: Always ``True`` so that training continues.
        """
        if self.n_calls % self._render_freq != 0:
            return True

        screens = []

        def grab_screens(_locals: Dict[str, Any], _globals: Dict[str, Any]) -> None:
            """
            Renders the environment in its current state, recording the screen in the captured `screens` list
            :param _locals: A dictionary containing all local variables of the callback's scope
            :param _globals: A dictionary containing all global variables of the callback's scope
            """
            screen = self._eval_env.render()
            if screen is None:
                # render() yields no frame (e.g. the env was not created with an
                # image render mode); skip instead of failing on ``.transpose``.
                return
            # PyTorch uses CxHxW vs HxWxC gym (and tensorflow) image convention
            screens.append(np.asarray(screen).transpose(2, 0, 1))

        evaluate_policy(
            self.model,
            self._eval_env,
            callback=grab_screens,
            n_eval_episodes=self._n_eval_episodes,
            deterministic=self._deterministic,
        )

        if not screens:
            # Nothing was captured, so there is no video to build; keep training.
            import warnings

            warnings.warn(
                "VideoRecorderCallback captured no frames; skipping video logging."
            )
            return True

        # The extra leading axis wraps all captured frames into one video.
        self.logger.record(
            "trajectory/video",
            Video(th.from_numpy(np.array([screens])), fps=self._fps),
            exclude=("stdout", "log", "json", "csv"),
        )
        return True

In [4]:
# HParamCallback
# This callback logs the model's hyperparameters and declares which logged metrics
# should be associated with the run.
class HParamCallback(BaseCallback):
    """
    Records the model's hyperparameters, together with the metric tags to show
    next to them, once at the start of training.
    """

    def __init__(self):
        super().__init__()

    def _on_training_start(self) -> None:
        """Collect the hyperparameters from ``self.model`` and record them under ``hparams``."""
        model = self.model

        # Descriptive entries first, then the plain attributes read directly off the model.
        hparam_dict = {
            "algorithm": model.__class__.__name__,
            "policy": model.policy.__class__.__name__,
            "environment": model.env.__class__.__name__,
        }
        for attr_name in (
            "buffer_size",
            "batch_size",
            "tau",
            "gradient_steps",
            "target_update_interval",
            "exploration_fraction",
            "exploration_initial_eps",
            "exploration_final_eps",
            "max_grad_norm",
            "tensorboard_log",
            "seed",
        ):
            hparam_dict[attr_name] = getattr(model, attr_name)
        hparam_dict["learning rate"] = model.learning_rate
        hparam_dict["gamma"] = model.gamma

        # Metrics to show in the `HPARAMS` TensorBoard tab, referenced by their tag.
        # TensorBoard will find and display the values from the `SCALARS` tab; the
        # numbers given here are only placeholders.
        integer_tags = (
            "eval/mean_ep_length",
            "eval/mean_reward",
            "rollout/ep_len_mean",
            "rollout/ep_rew_mean",
            "rollout/exploration_rate",
            "time/_episode_num",
            "time/fps",
            "time/total_timesteps",
        )
        float_tags = (
            "train/learning_rate",
            "train/loss",
            "train/n_updates",
            "locals/rewards",
            "locals/infos_0_lives",
            "locals/num_collected_steps",
            "locals/num_collected_episodes",
        )
        metric_dict = {**dict.fromkeys(integer_tags, 0), **dict.fromkeys(float_tags, 0.0)}

        self.logger.record(
            "hparams",
            HParam(hparam_dict, metric_dict),
            exclude=("stdout", "log", "json", "csv"),
        )

    def _on_step(self) -> bool:
        """Nothing to do per step; returning ``True`` keeps training running."""
        return True

In [5]:
# PlotTensorboardValuesCallback
# This callback records additional values for TensorBoard on every step.
# Each value is recorded through self.logger so that it is plotted as a scalar.

class PlotTensorboardValuesCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.

    At the start and end of training it writes the environments' metadata and the
    printed Q-networks as text; on every step it records a handful of extra values
    through ``self.logger``.
    """

    # Output formats that must not receive the extra values (they are TensorBoard-only).
    _EXCLUDED_FORMATS = ("stdout", "log", "json", "csv")

    def __init__(self, eval_env: gym.Env, train_env: gym.Env, model: DQN, verbose=0):
        """
        :param eval_env: Evaluation environment whose ``metadata`` is written as text
        :param train_env: Training environment whose ``metadata`` is written as text
        :param model: Model whose ``q_net`` and ``q_net_target`` are written as text
        :param verbose: Verbosity level forwarded to ``BaseCallback``
        """
        super().__init__(verbose)
        self._eval_env = eval_env
        self._train_env = train_env
        self._model = model
        # Set in _on_training_start; stays None when no TensorBoard output is configured.
        self.tb_formatter = None

    def _write_text_summaries(self) -> None:
        """Write env metadata and network descriptions as text at the current timestep."""
        if self.tb_formatter is None:
            # No TensorBoard writer available: skip rather than fail on a missing attribute.
            return
        writer = self.tb_formatter.writer
        step = self.num_timesteps
        summaries = (
            ("metadata/eval_env", str(self._eval_env.metadata)),
            ("metadata/train_env", str(self._train_env.metadata)),
            ("model/q_net", str(self._model.q_net)),
            ("model/q_net_target", str(self._model.q_net_target)),
        )
        for tag, text in summaries:
            writer.add_text(tag, text, step)
        writer.flush()

    def _record_scalar(self, tag: str, value) -> None:
        """Record ``value`` under ``tag`` for every output format except the excluded ones."""
        self.logger.record(tag, value, exclude=self._EXCLUDED_FORMATS)

    def _on_training_start(self) -> None:
        """Locate the TensorBoard output format of the logger and write the text summaries."""
        # Save a reference to the TensorBoard formatter object, or None when the
        # logger has no TensorBoard output.
        self.tb_formatter = next(
            (
                formatter
                for formatter in self.logger.output_formats
                if isinstance(formatter, TensorBoardOutputFormat)
            ),
            None,
        )
        if self.tb_formatter is None:
            print("No TensorBoard output format found; text summaries will not be written.")
        self._write_text_summaries()

    def _on_step(self) -> bool:
        """
        Record the extra per-step values.

        :return: Always ``True`` so that training continues.
        """
        self._record_scalar("time/_episode_num", self.model._episode_num)
        self._record_scalar("train/n_updates", self.model._n_updates)
        # "rewards" is array-like (presumably one entry per environment -- confirm);
        # reduce it to a single float so that it is logged as a scalar.
        self._record_scalar("locals/rewards", float(np.mean(self.locals["rewards"])))
        # Only the first info dict is inspected; the value is skipped when the
        # environment does not report "lives".
        lives = self.locals["infos"][0].get("lives")
        if lives is not None:
            self._record_scalar("locals/infos_0_lives", lives)
        self._record_scalar("locals/num_collected_steps", self.locals["num_collected_steps"])
        self._record_scalar("locals/num_collected_episodes", self.locals["num_collected_episodes"])

        return True

    def _on_training_end(self) -> None:
        """Write the text summaries once more, stamped with the final timestep."""
        self._write_text_summaries()

In [6]:
def _make_pacman_env() -> gym.Env:
    """Create one ALE Pacman environment with rgb_array rendering and the configured frame skip."""
    return gym.make("ALE/Pacman-v5", render_mode="rgb_array", frameskip=FRAMESKIP)


# Evaluation environment (wrapped in Monitor) first, then the training environment.
eval_env = Monitor(_make_pacman_env())
train_env = _make_pacman_env()

# Restore the model saved by the previous run. The hyperparameters still have to be
# supplied at load time, which is what CUSTOM_OBJECTS is for.
model = DQN.load(
    "/kaggle/input/dqn-pacmanv5-run2v6/ALE-Pacman-v5.zip",
    env=train_env,
    custom_objects=CUSTOM_OBJECTS,
)

# The replay buffer is stored separately from the model and is restored afterwards.
# NOTE(review): the loaded buffer presumably keeps the capacity it was saved with, in
# which case BUFFER_SIZE would not apply to it -- check model.replay_buffer.buffer_size.
model.load_replay_buffer("/kaggle/input/dqn-pacmanv5-run2v6/dqn_replay_buffer_pacman_v2-6")

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [7]:
# Build each callback, then bundle them into a single CallbackList.

# Logs the hyperparameters once, at the start of training.
hparam_callback = HParamCallback()

# Evaluates the agent every EVAL_CALLBACK_FREQ steps on eval_env, keeping the best
# model under ./best_model/ and the evaluation results under ./evals/.
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=10,
    eval_freq=EVAL_CALLBACK_FREQ,
    log_path="./evals/",
    best_model_save_path="./best_model/",
    deterministic=True,
    render=False,
)

# Records a video on eval_env every VIDEO_CALLBACK_FREQ steps.
video_callback = VideoRecorderCallback(eval_env=eval_env, render_freq=VIDEO_CALLBACK_FREQ)

# Records the extra TensorBoard values and text summaries.
tbplot_callback = PlotTensorboardValuesCallback(eval_env, train_env, model)

callback_list = CallbackList(
    [
        hparam_callback,
        eval_callback,
        video_callback,
        tbplot_callback,
    ]
)

In [8]:
# Continue training the loaded model for NUM_TIMESTEPS steps with all callbacks attached.
# reset_num_timesteps=False keeps the timestep counter of the loaded model instead of
# restarting it from zero.
# NOTE(review): tb_log_name is given as a path fragment; together with
# tensorboard_log="./" the run is written to "././tb/_0" -- confirm that is intended.
model.learn(
    NUM_TIMESTEPS,
    callback_list,
    tb_log_name="./tb/",
    reset_num_timesteps=False,
)

Logging to ././tb/_0




----------------------------------
| rollout/ | |
| ep_len_mean | 1.21e+03 |
| ep_rew_mean | 308 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 6880 |
| fps | 110 |
| time_elapsed | 44 |
| total_timesteps | 7504846 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.0931 |
----------------------------------
----------------------------------
| rollout/ | |
| ep_len_mean | 1.22e+03 |
| ep_rew_mean | 312 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 6884 |
| fps | 111 |
| time_elapsed | 95 |
| total_timesteps | 7510594 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.0725 |
----------------------------------
----------------------------------
| rollout/ | |
| ep_len_mean | 1.22e+03 |
| ep_rew_mean | 312 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 6888 |
| fps | 111 |
| time_elapsed | 138 |
| total_timesteps | 7515488 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.0945 |
----------------------------------
----------------------------------
| rollout/ | 

 logger.warn(


----------------------------------
| rollout/ | |
| ep_len_mean | 1.43e+03 |
| ep_rew_mean | 374 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 7116 |
| fps | 104 |
| time_elapsed | 3472 |
| total_timesteps | 7863417 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.32 |
----------------------------------
----------------------------------
| rollout/ | |
| ep_len_mean | 1.42e+03 |
| ep_rew_mean | 368 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 7120 |
| fps | 104 |
| time_elapsed | 3511 |
| total_timesteps | 7867344 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.18 |
----------------------------------
----------------------------------
| rollout/ | |
| ep_len_mean | 1.41e+03 |
| ep_rew_mean | 358 |
| exploration_rate | 0.01 |
| time/ | |
| episodes | 7124 |
| fps | 104 |
| time_elapsed | 3542 |
| total_timesteps | 7871173 |
| train/ | |
| learning_rate | 5e-05 |
| loss | 0.0378 |
----------------------------------
----------------------------------
| rollout/ |



In [9]:
# Persist the model, its replay buffer and its policy (in that order) so that a later
# session can load them and continue training.
for save_artifact, file_name in (
    (model.save, MODEL_FILE_NAME),
    (model.save_replay_buffer, BUFFER_FILE_NAME),
    (model.policy.save, POLICY_FILE_NAME),
):
    save_artifact(file_name)