import numpy as np
from collections import deque

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
import gym_pygame

from huggingface_hub import notebook_login
import imageio

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

env_id = "CartPole-v1"

# Training environment
env = gym.make(env_id, render_mode="rgb_array")

# Separate environment for evaluation
eval_env = gym.make(env_id, render_mode="rgb_array")

# State and action space sizes
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())

print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())

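
# Policy network: a two-layer MLP that maps a state to a probability
# distribution over the discrete actions (softmax output)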
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        # Sample an action from the policy distribution and return it
        # together with its log-probability
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # forward returns shape (1, a_size); indexing [0] gives a 1-D
        # probability vector, so m.log_prob(action) below is a 0-dim tensor
        probs = self.forward(state).cpu()[0]
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

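
# REINFORCE (Monte Carlo policy gradient): roll out one episode with the
# current policy, compute the discounted return G_t at every timestep, then
# ascend the gradient of sum_t log pi(a_t|s_t) * G_t (the optimizer
# minimizes the negated sum)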
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()[0]

        # Generate one episode with the current policy
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            # gym >= 0.26 returns (obs, reward, terminated, truncated, info)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
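
        # Compute the discounted return for every timestep, iterating
        # backwards so each G_t reuses the already-computed G_{t+1}:
        #   G_t = r_t + gamma * G_{t+1}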
        for t in range(n_steps)[::-1]:
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(gamma * disc_return_t + rewards[t])

        # Standardize the returns to stabilize training (eps avoids
        # division by zero when the std is 0)
        eps = np.finfo(np.float32).eps.item()
        returns = torch.tensor(returns)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        # Policy gradient loss: -sum_t log pi(a_t|s_t) * G_t
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        # the log-probs are 0-dim tensors, so stack (not cat) before summing
        policy_loss = torch.stack(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))

    return scores

cartpole_hyperparameters = {
    "h_size": 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 1000,
    "gamma": 1.0,
    "lr": 1e-2,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}
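
# Note: gamma = 1.0 means the returns are undiscounted episode reward sums;
# CartPole-v1 is truncated at 500 steps by its TimeLimit wrapper, so the
# max_t of 1000 is never the binding limit here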

cartpole_policy = Policy(
    cartpole_hyperparameters["state_space"],
    cartpole_hyperparameters["action_space"],
    cartpole_hyperparameters["h_size"],
).to(device)
cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])

scores = reinforce(
    cartpole_policy,
    cartpole_optimizer,
    cartpole_hyperparameters["n_training_episodes"],
    cartpole_hyperparameters["max_t"],
    cartpole_hyperparameters["gamma"],
    100,
)
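
# matplotlib is imported above but otherwise unused; a minimal sketch of how
# the training scores could be visualized (the 100-episode window matches
# scores_deque; the plotting details are an assumption, not course code):
window = 100
rolling = [np.mean(scores[max(0, i - window + 1):i + 1]) for i in range(len(scores))]
plt.plot(scores, alpha=0.3, label="episode score")
plt.plot(rolling, label="rolling mean (100 episodes)")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.legend()
plt.show()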


def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the mean and std of the reward.
    :param env: The evaluation environment
    :param max_steps: Maximum number of steps per episode
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param policy: The Reinforce agent
    """
    episode_rewards = []
    for episode in range(n_eval_episodes):
        state = env.reset()[0]
        total_rewards_ep = 0

        for step in range(max_steps):
            action, _ = policy.act(state)
            new_state, reward, terminated, truncated, info = env.step(action)
            total_rewards_ep += reward

            if terminated or truncated:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward


mean_reward, std_reward = evaluate_agent(
    eval_env, cartpole_hyperparameters["max_t"], cartpole_hyperparameters["n_evaluation_episodes"], cartpole_policy
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json
import imageio

import tempfile
import os


def record_video(env, policy, out_directory, fps=30):
    """
    Generate a replay video of the agent
    :param env: environment used to record the episode
    :param policy: the Reinforce agent
    :param out_directory: path of the output video file
    :param fps: how many frames per second (with taxi-v3 and frozenlake-v1 we use 1)
    """
    images = []
    done = False
    state = env.reset()[0]
    img = env.render()
    images.append(img)
    while not done:
        action, _ = policy.act(state)
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        img = env.render()
        images.append(img)
    imageio.mimsave(out_directory, [np.array(img) for img in images], fps=fps)


def push_to_hub(repo_id,
                model,
                hyperparameters,
                eval_env,
                video_fps=30
                ):
    """
    Evaluate, generate a video, and upload a model to the Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the Hub

    :param repo_id: id of the model repository on the Hugging Face Hub
    :param model: the pytorch model we want to save
    :param hyperparameters: training hyperparameters
    :param eval_env: evaluation environment
    :param video_fps: how many frames per second to record in the video replay
    """

    _, repo_name = repo_id.split("/")
    api = HfApi()

    # Step 1: Create the repo (no-op if it already exists)
    repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
    )

    local_dir = "./cartpole-v1"
    os.makedirs(local_dir, exist_ok=True)

    # Step 2: Save the model
    torch.save(model, os.path.join(local_dir, "model.pt"))

    # Step 3: Save the hyperparameters to JSON
    hyper_path = os.path.join(local_dir, "hyperparameters.json")
    with open(hyper_path, "w") as outfile:
        json.dump(hyperparameters, outfile)

    # Step 4: Evaluate the model and record the metrics
    mean_reward, std_reward = evaluate_agent(eval_env,
                                             hyperparameters["max_t"],
                                             hyperparameters["n_evaluation_episodes"],
                                             model)

    eval_datetime = datetime.datetime.now()
    eval_form_datetime = eval_datetime.isoformat()

    evaluate_data = {
        "env_id": hyperparameters["env_id"],
        "mean_reward": mean_reward,
        "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
        "eval_datetime": eval_form_datetime,
    }

    result_path = os.path.join(local_dir, "results.json")
    with open(result_path, "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Build the model card and its metadata
    env_name = hyperparameters["env_id"]

    metadata = {}
    metadata["tags"] = [
        env_name,
        "reinforce",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-class",
    ]

    eval_results = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
    )

    metadata = {**metadata, **eval_results}

    model_card = f"""
# **Reinforce** Agent playing **{env_name}**
This is a trained model of a **Reinforce** agent playing **{env_name}**.
To learn to use this model and train yours, check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
"""
    readme_path = Path(local_dir) / "README.md"

    # Keep an existing README, otherwise start from the generated model card
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
            readme = f.read()
    else:
        readme = model_card

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(readme)

    metadata_save(readme_path, metadata)

    # Step 6: Record a replay video with the evaluation environment
    video_path = os.path.join(local_dir, "replay.mp4")
    record_video(eval_env, model, video_path, video_fps)

    # Step 7: Push everything to the Hub
    api.upload_folder(
        repo_id=repo_id,
        folder_path=local_dir,
        path_in_repo=".",
    )

    print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")

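# Pushing requires an authenticated session: run notebook_login() (imported
# above) in a notebook, or `huggingface-cli login` in a terminal, before
# calling push_to_hub
# notebook_login()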
repo_id = "dlwlgus53/Reinforce_cartpol-v1"
push_to_hub(
    repo_id,
    cartpole_policy,
    cartpole_hyperparameters,
    eval_env,
    video_fps=30,
)
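
# A minimal reload sketch (an assumption, not part of the course code):
# torch.save above stored the full module, so the Policy class must be
# defined when loading, and torch >= 2.6 may require weights_only=False.
# loaded_policy = torch.load("./cartpole-v1/model.pt", map_location=device)
# loaded_policy.eval()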