# Install

In [None]:
# COURSE: https://huggingface.co/deep-rl-course/unit1/hands-on?fw=pt
# SB3: https://stable-baselines3.readthedocs.io/en/master/

# The first step is to install the dependencies; we'll install several of them.

# gym[box2D]: Contains the LunarLander-v2 environment 🌛 (we use gym==0.21).
# swig and cmake are system packages installed first
# (presumably build prerequisites for the Box2D bindings -- TODO confirm).
!apt install swig cmake
# stable-baselines3[extra]: The deep reinforcement learning library.
# huggingface_sb3: Additional code for Stable-Baselines3 to load and upload models from the Hugging Face 🤗 Hub.
# The Python dependencies are installed from the course's requirements file.
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt

# During the notebook, we'll need to generate a replay video. To do so in Colab, we need a virtual screen to be able to render the environment (and thus record the frames).
# Hence the following commands install the virtual screen libraries (OpenGL bindings, ffmpeg, Xvfb, pyvirtualdisplay) 🖥
!sudo apt-get update
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# To make sure the newly installed libraries are used, it is sometimes required to restart the notebook runtime.
# The lines below force the runtime to crash by sending signal 9 (SIGKILL) to the kernel's own process,
# so you'll need to reconnect and continue running the notebook from the next cell.
# Thanks to this trick, we will be able to run our virtual screen.
import os
os.kill(os.getpid(), 9)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
cmake is already the newest version (3.10.2-1ubuntu2.18.04.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 1s (844 kB/s)
Selecting previously unselected package swig3.0.
(Reading database ... 124016 files and directories currently installed.)
Preparing to unpack .../swig3.0_3.0.12-1_amd64.deb ...
Unpack

# 101 RL

In [None]:
# Virtual display: start a headless display so the environment can be rendered
# (and frames recorded) without a physical screen.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

# gym: provides the training environments
import gym

# Hugging Face helpers to commit and upload models to the Hub
from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import (
    notebook_login,
)  # To log to our Hugging Face account to be able to upload models to the Hub.

# Stable-Baselines3 (SB3) imports: algorithms, evaluation helper, vectorized-env factory
from stable_baselines3 import PPO,DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [None]:
# We create our environment with gym.make("<name_of_the_environment>")
env = gym.make("LunarLander-v2")
# Reset the environment to an initial state before inspecting it
env.reset()
print("_____OBSERVATION SPACE_____ \n")
# Shape of a single observation vector
print("Observation Space Shape", env.observation_space.shape)
# Note: this draws a random point from the observation *space*; it is not a state
# actually produced by the environment.
print("Sample observation", env.observation_space.sample())  # Get a random observation

_____OBSERVATION SPACE_____ 

Observation Space Shape (8,)
Sample observation [-1.1949229   1.0401516  -1.5713538   0.6417865  -0.99068826  0.9180964
  0.6063702  -0.7064557 ]


1. Horizontal pad coordinate (x)
2. Vertical pad coordinate (y)
3. Horizontal speed (x)
4. Vertical speed (y)
5. Angle
6. Angular speed
7. Whether the left leg is in contact with the ground (boolean)
8. Whether the right leg is in contact with the ground (boolean)

The action space (the set of possible actions the agent can take) is discrete with 4 actions available 🎮:

1. Do nothing,
2. Fire left orientation engine,
3. Fire the main engine,
4. Fire right orientation engine.

## Example in a gym environment

In [None]:
import gym

# First, we create our environment called LunarLander-v2
env = gym.make("LunarLander-v2")

# Then we reset this environment to obtain the initial observation
observation = env.reset()

# Run 20 steps with a random (untrained) policy to illustrate the
# observation / action / reward loop.
for _ in range(20):
    # Take a random action sampled uniformly from the action space
    action = env.action_space.sample()
    print("Action taken:", action)

    # Apply this action in the environment and get
    # next_state, reward, done and info
    # (4-tuple step API, as used by the gym version this notebook targets)
    observation, reward, done, info = env.step(action)
    print('Observation Space: ', observation)
    print('Reward: ', reward)

    # If the episode is done (in our case we landed, crashed or timed out)
    if done:
        # Reset the environment so the next step starts a new episode
        print("Environment is reset")
        observation = env.reset()

Action taken: 3
Observation Space:  [-0.01036568  1.4191318  -0.5172001   0.16945073  0.00962685  0.07120871
  0.          0.        ]
Reward:  1.269672058751156
Action taken: 2
Observation Space:  [-0.01563349  1.4233158  -0.52976215  0.18593512  0.01258225  0.05911319
  0.          0.        ]
Reward:  -2.738042432763211
Action taken: 2
Observation Space:  [-0.02082663  1.4282496  -0.52270544  0.21925125  0.01595003  0.06736136
  0.          0.        ]
Reward:  -1.6749531743428634
Action taken: 2
Observation Space:  [-0.0260541   1.4340469  -0.52605623  0.2576173   0.01922631  0.06553184
  0.          0.        ]
Reward:  -3.108104401038008
Action taken: 2
Observation Space:  [-0.03140211  1.4405689  -0.53762066  0.28983435  0.0220068   0.05561444
  0.          0.        ]
Reward:  -3.742918376601284
Action taken: 0
Observation Space:  [-0.03675032  1.4464911  -0.5376283   0.2631606   0.0247872   0.05561341
  0.          0.        ]
Reward:  0.3363140354737766
Action taken: 1
Observ

## Tutorial in SB3 - CartPole

In [None]:
#https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_getting_started.ipynb
#IMPORTS
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
#@title Evaluate untrained model
# Create the gym environment the model will be trained on (CartPole-v1)
env = gym.make('CartPole-v1')

# Create the model from scratch (MLP policy, verbose logging)
# doc: https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html#parameters
model = PPO('MlpPolicy',env,verbose=1)

# Use a separate environment for evaluation
env_eval = gym.make('CartPole-v1')

# Evaluate the untrained agent over 100 episodes to get a baseline
mean_reward, std_reward = evaluate_policy(model,env_eval,n_eval_episodes=100)

# Print the mean and standard deviation of the episode reward
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




mean_reward:36.41 +/- 13.44


In [None]:
#@title train agent and evaluate it
# Train the PPO agent for 10,000 timesteps
model.learn(total_timesteps=10000);

# Evaluate the trained agent over 100 episodes on the separate evaluation env
# and report mean +/- standard deviation of the episode reward
mean_rwd, std_rwd = evaluate_policy(model,env_eval,n_eval_episodes=100)
print(f'reward: {mean_rwd:.2f} +/- {std_rwd:.2f}')

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.8     |
|    ep_rew_mean     | 23.8     |
| time/              |          |
|    fps             | 939      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 26.2         |
|    ep_rew_mean          | 26.2         |
| time/                   |              |
|    fps                  | 680          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0073210318 |
|    clip_fraction        | 0.0722       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | 0.0085       |
|    learning_r



reward: 396.51 +/- 142.18224185882005


In [None]:
#@title Record episode
# Set up fake display; otherwise rendering will fail
import os
# Launch an Xvfb server in the background ('&') as display :1
# with a single 1024x768, 24-bit screen.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point programs started from this kernel at that display
os.environ['DISPLAY'] = ':1'

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a freshly created ``env_id`` environment and save the
  rollout as a video file.

  :param env_id: (str) id of the gym environment to create for the recording
  :param model: (RL model) agent whose ``predict`` method selects the actions
  :param video_length: (int) number of environment steps to record
  :param prefix: (str) name prefix of the generated video file
  :param video_folder: (str) folder the video is written to
  """
  # Build the environment from `env_id`. It used to be hard-coded to
  # 'CartPole-v1', which silently ignored the argument.
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Always close the video recorder (and the env it wraps), even if the
    # rollout raises.
    eval_env.close()

In [None]:
# Record 500 steps of the trained PPO agent; the file name starts with 'ppo-cartpole'
record_video('CartPole-v1', model, video_length=500, prefix='ppo-cartpole')

Saving video to /content/videos/ppo-cartpole-step-0-to-step-500.mp4


In [None]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Display every matching .mp4 file inline in the notebook.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Only videos whose file name starts with this prefix are shown
  """
  # HTML snippet for one video; the file is embedded as a base64 data URI
  template = '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''
  tags = [
      template.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
      for clip in Path(video_path).glob(f"{prefix}*.mp4")
  ]
  # Render all players in one HTML output, separated by line breaks
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

In [None]:
# Display the recorded videos in 'videos' whose file name starts with 'ppo'
show_videos('videos', prefix='ppo')

## Tutorial in SB3 - Lunar Lander

In [None]:
#https://stable-baselines3.readthedocs.io/en/master/guide/examples.html#id4
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env  #used to build vectorized enviroment

# Create the training environment (a single, non-vectorized gym environment)
env = gym.make('LunarLander-v2')

# Create the DQN model and train it for 10,000 timesteps (learn() returns the model)
model = DQN('MlpPolicy',env,verbose=0).learn(total_timesteps=int(1e4),progress_bar=True)

# Save the agent
# model.save('dqn_lunar')

# del model  # delete trained model to demonstrate loading

# # Load the trained agent
# # NOTE: if you have loading issue, you can pass `print_system_info=True`
# # to compare the system on which the model was trained vs the current one
# # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
# model = DQN.load("dqn_lunar", env=env)

# Create a separate vectorized environment for evaluation
# (a method for stacking multiple independent environments into a single
# environment); here 5 copies of LunarLander-v2.
env_eval = make_vec_env("LunarLander-v2", n_envs=5)

# Evaluate on the dedicated evaluation env. Previously `env_eval` was created
# but never used, and the evaluation ran on the training env instead.
mean_rwd,std_rwd = evaluate_policy(model,env_eval,n_eval_episodes=10)

# Print the mean and standard deviation of the episode reward
print(f"mean_reward:{mean_rwd:.2f} +/- {std_rwd:.2f}")

Output()

mean_reward:-147.89 +/- 27.79


In [None]:
# Enjoy trained agent: run the policy for 1000 steps in the model's own environment
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    # deterministic=True: do not sample the action stochastically
    action, _states = model.predict(obs, deterministic=True)
    # `dones` is not checked here; presumably the vectorized env resets itself
    # when an episode ends -- TODO confirm against the SB3 VecEnv docs.
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render()

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a freshly created ``env_id`` environment and save the
  rollout as a video file.

  :param env_id: (str) id of the gym environment to create for the recording
  :param model: (RL model) agent whose ``predict`` method selects the actions
  :param video_length: (int) number of environment steps to record
  :param prefix: (str) name prefix of the generated video file
  :param video_folder: (str) folder the video is written to
  """
  # Record in a dedicated environment built from `env_id`. The previous version
  # wrapped `model.get_env()` and then closed it, which ignored `env_id` and
  # left the model's own training environment closed after the call.
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Always close the video recorder (and the env it wraps), even if the
    # rollout raises.
    eval_env.close()

import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Embed the matching .mp4 files of a folder in the notebook output.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Only videos whose file name starts with this prefix are shown
  """
  def _as_tag(clip):
    # One <video> element with the file inlined as a base64 data URI
    encoded = base64.b64encode(clip.read_bytes()).decode('ascii')
    return '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(clip, encoded)

  clips = Path(video_path).glob(f"{prefix}*.mp4")
  markup = "<br>".join(map(_as_tag, clips))
  ipythondisplay.display(ipythondisplay.HTML(data=markup))

In [None]:
# Record 500 steps of the trained DQN agent; the file name starts with 'dqn-ll'
record_video('LunarLander-v2', model, video_length=500, prefix='dqn-ll')

Saving video to /content/videos/dqn-ll-step-0-to-step-500.mp4


In [None]:
# Display the recorded videos in 'videos' whose file name starts with 'dqn'
show_videos('videos', prefix='dqn')

# MODEL

In [1]:
#@title imports
#HF tutorial
#https://huggingface.co/deep-rl-course/unit1/hands-on

#imports
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import (
    notebook_login,
)

# Virtual display
from pyvirtualdisplay import Display

# Start a headless 1400x900 virtual display
virtual_display = Display(visible=0, size=(1400, 900));
virtual_display.start();

# Seed value for the experiment
seed=11

In [2]:
%%time
#@title create and train the model
#create enviroment
env = gym.make('LunarLander-v2')

#reset enviroment ot initial state
env.reset()

#create vectorized enviroment
env = make_vec_env("LunarLander-v2", n_envs=16)

#instanciate the agent
#params: https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html#parameters
model = PPO('MlpPolicy',env,verbose=1,
            learning_rate=0.0003,
            # n_steps=2048,
            n_steps=1024,
            batch_size=64,
            # n_epochs=10,
            n_epochs=4,
            # gamma=0.99,
            gamma=0.999,
            # gae_lambda=0.95,
            gae_lambda=0.98,
            clip_range=0.2,
            clip_range_vf=None,
            normalize_advantage=True,
            # ent_coef=0.0,
            ent_coef=0.01
            # vf_coef=0.5,
            # max_grad_norm=0.5,
            # use_sde=False,
            # sde_sample_freq=-1,
            # target_kl=None,
            # tensorboard_log=None,
            # policy_kwargs=None,
            # verbose=0,
            # seed=seed,
            # device='auto',
            # _init_setup_model=True           
            )

#train model
model.learn(total_timesteps=int(1e6))

# Save the model
# model_name = "ppo_notrain-LunarLander-v2"
# model.save(model_name)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 96.5     |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 1563     |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 89.2         |
|    ep_rew_mean          | -137         |
| time/                   |              |
|    fps                  | 1667         |
|    iterations           | 2            |
|    time_elapsed         | 19           |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0072074104 |
|    clip_fraction        | 0.0691       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.0007      

<stable_baselines3.ppo.ppo.PPO at 0x7fa593c28a60>

In [3]:
#@title evaluate model

# Create a new environment for evaluation, separate from the training env
eval_env = gym.make('LunarLander-v2')
eval_env.reset()

# Evaluate the model with 10 evaluation episodes and deterministic=True
# (returns the mean and standard deviation of the episode reward)
mean_reward, std_reward = evaluate_policy(model,eval_env,n_eval_episodes=10,deterministic=True)

# Print the results
print(f'reward: {mean_reward:.2f} +/- {std_reward:.2f}')



reward: 242.08 +/- 17.93


In [5]:
#https://huggingface.co/deep-rl-course/unit1/hands-on?fw=pt#publish-our-trained-model-on-the-hub
#@title upload model and video in HF hub
# To log to our Hugging Face account to be able to upload models to the Hub.
notebook_login()  #copy and paste the token
!git config --global credential.helper store

# TODO: Define the name of the environment
env_id = 'LunarLander-v2'

# Create the evaluation env
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

# Define the model architecture we used
model_architecture = "PPO"

model_name = "ppo-LunarLander-v2"

## repo_id is the id of the model repository from the Hugging Face Hub (repo_id =  {username}/{model_architecture}-{env_id} for instance ThomasSimonini/ppo-LunarLander-v2
repo_id = f'asuzuki/{model_architecture}-{env_id}'

## TODO: Define the commit message
commit_message = "first commit - model PPO performing good"

# Create the evaluation env
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

# method save, evaluate, generate a model card and record a replay video of your agent before pushing the repo to the hub
package_to_hub(model=model, # Our trained model
               model_name=model_name, # The name of our trained model
               model_architecture=model_architecture, # The model architecture we used: in our case PPO
               env_id=env_id, # Name of the environment
               eval_env=eval_env, # Evaluation Environment
               repo_id=repo_id, # id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
               commit_message=commit_message)

# Note: if after running the package_to_hub function and it gives an issue of rebasing, please run the following code
# cd <path_to_repo> && git add . && git commit -m "Add message" && git pull
# And don't forget to do a "git push" at the end to push the change to the hub.

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


# LOAD MODEL

In [7]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from huggingface_sb3 import load_from_hub

# Name of the environment
env_id = 'LunarLander-v2'

# Model architecture we used
model_architecture = "PPO"

# Hub repository and the checkpoint file stored in it
# (model_name used to be assigned twice with the same value; once is enough)
repo_id = f'asuzuki/{model_architecture}-{env_id}'
model_name = "ppo-LunarLander-v2"
filename = f'{model_name}.zip'

# When the model was trained on Python 3.8 the pickle protocol is 5,
# but Python 3.6 and 3.7 use protocol 4.
# To stay compatible across versions we need to:
# 1. Install pickle5 on the older interpreters (NOTE: this notebook does not
#    install it; the system info printed below shows whether it is needed)
# 2. Pass replacement objects to PPO.load() through `custom_objects`
custom_objects = {
    "learning_rate": 0.0,
    "lr_schedule": lambda _: 0.0,
    "clip_range": lambda _: 0.0,
}

# Fetch the checkpoint from the Hub and load it; print_system_info=True prints
# the current and saved-model system info for comparison
checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(checkpoint, custom_objects=custom_objects, print_system_info=True)

# Evaluate the loaded agent on a fresh environment
eval_env = gym.make(env_id)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
# The std is now formatted to two decimals, like every other report in the notebook
print(f"\n\nmean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

== CURRENT SYSTEM INFO ==
OS: Linux-5.10.147+-x86_64-with-glibc2.27 #1 SMP Sat Dec 10 16:00:40 UTC 2022
Python: 3.8.16
Stable-Baselines3: 1.6.2
PyTorch: 1.13.0+cu116
GPU Enabled: True
Numpy: 1.21.6
Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
OS: Linux-5.10.147+-x86_64-with-glibc2.27 #1 SMP Sat Dec 10 16:00:40 UTC 2022
Python: 3.8.16
Stable-Baselines3: 1.6.2
PyTorch: 1.13.0+cu116
GPU Enabled: True
Numpy: 1.21.6
Gym: 0.21.0



mean_reward=251.52 +/- 26.518009760699584
