Upload folder using huggingface_hub
- .vscode/settings.json +7 -0
- cartpole-v1/README.md +27 -0
- cartpole-v1/hyperparameters.json +1 -0
- cartpole-v1/model.pt +3 -0
- cartpole-v1/replay.mp4 +0 -0
- cartpole-v1/results.json +1 -0
- example1.py +4 -0
- hyperparameters.json +1 -0
- model.pt +3 -0
- policy_gradient.py +383 -0
- ppo.py +194 -0
.vscode/settings.json
ADDED
@@ -0,0 +1,7 @@
{
    "workbench.colorCustomizations": {
        "activityBar.background": "#561625",
        "titleBar.activeBackground": "#781F34",
        "titleBar.activeForeground": "#FEFBFC"
    }
}
cartpole-v1/README.md
ADDED
@@ -0,0 +1,27 @@
---
tags:
- CartPole-v1
- reinforce
- reinforcement-learning
- custom-implementation
- deep-rl-class
model-index:
- name: Reinforce_cartpol-v1
  results:
  - task:
      type: reinforcement-learning
      name: reinforcement-learning
    dataset:
      name: CartPole-v1
      type: CartPole-v1
    metrics:
    - type: mean_reward
      value: 0.19 +/- 0.03
      name: mean_reward
      verified: false
---

# **Reinforce** Agent playing **CartPole-v1**
This is a trained model of a **Reinforce** agent playing **CartPole-v1**.
To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
cartpole-v1/hyperparameters.json
ADDED
@@ -0,0 +1 @@
{"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
cartpole-v1/model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee315b44d3dbc05cd7ba1d04fc0b2b0c49605d506c21e34be0e3239656b0aaf9
size 3264
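Note that `model.pt` was written with `torch.save(model, ...)` in the training script below, i.e. it is a pickled `nn.Module`, so loading it requires a matching `Policy` class definition. A minimal loading sketch (illustrative only, not part of the uploaded files; it assumes the repo is cloned locally and the class matches policy_gradient.py):

import json
import torch
import torch.nn as nn
import torch.nn.functional as F

# Must match the class used at save time (see policy_gradient.py below).
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)

with open("cartpole-v1/hyperparameters.json") as f:
    hp = json.load(f)

# Loads the full pickled module; on recent PyTorch you may need weights_only=False.
policy = torch.load("cartpole-v1/model.pt", map_location="cpu")
print(policy, hp["env_id"])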
cartpole-v1/replay.mp4
ADDED
Binary file (2.02 kB)
cartpole-v1/results.json
ADDED
@@ -0,0 +1 @@
{"env_id": "CartPole-v1", "mean_reward": 0.1875143900513649, "n_evaluation_episodes": 10, "eval_datetime": "2024-03-09T00:13:58.880496"}
example1.py
ADDED
@@ -0,0 +1,4 @@
# Start a virtual display so that Gym environments can render off-screen
# (e.g. on a headless server or in Colab).
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
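The snippet above only starts the display; a quick sanity check that off-screen rendering then works (a sketch, assuming a Gym release with the `render_mode="rgb_array"` API used in policy_gradient.py below) could look like this:

import gym
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

env = gym.make("CartPole-v1", render_mode="rgb_array")
env.reset()
frame = env.render()  # with the display running, this returns an RGB numpy array
print(frame.shape)    # e.g. (400, 600, 3) for CartPole-v1
env.close()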
hyperparameters.json
ADDED
@@ -0,0 +1 @@
{"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b290c70b9fb42b69640d1b651339c462d74379e36a1ef938e6119f30c2d0747
size 3264
policy_gradient.py
ADDED
@@ -0,0 +1,383 @@
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
import pdb

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

# Hugging Face Hub
from huggingface_hub import notebook_login  # To log in to our Hugging Face account so we can upload models to the Hub.
import imageio

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

env_id = "CartPole-v1"
# Create the env
env = gym.make(env_id, render_mode="rgb_array")

# Create the evaluation env
eval_env = gym.make(env_id)

# Get the state space and action space
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
# cart position, cart velocity, pole angle, pole velocity at tip
print("Sample observation", env.observation_space.sample())  # Get a random observation

print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())  # Take a random action


# Policy Gradient Network

class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Keep the batch dimension so log_prob below has shape (1,),
        # which lets torch.cat concatenate the per-step losses in reinforce().
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()

        return action.item(), m.log_prob(action)


def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=100)
    scores = []
    # Line 3 of pseudocode
    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()[0]
        # Line 4 of pseudocode
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Line 6 of pseudocode: calculate the return
        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
        # Compute the discounted returns at each timestep,
        # as
        #      the sum of the gamma-discounted return at time t (G_t) + the reward at time t
        #
        # In O(N) time, where N is the number of time steps
        # (this definition of the discounted return G_t follows the definition of this quantity
        # shown at page 44 of Sutton&Barto 2017 2nd draft)
        # G_t = r_(t+1) + r_(t+2) + ...

        # Given this formulation, the returns at each timestep t can be computed
        # by re-using the computed future returns G_(t+1) to compute the current return G_t
        # G_t = r_(t+1) + gamma*G_(t+1)
        # G_(t-1) = r_t + gamma*G_t
        # (this follows a dynamic programming approach, with which we memorize solutions in order
        # to avoid computing them multiple times)

        # This is correct since the above is equivalent to (see also page 46 of Sutton&Barto 2017 2nd draft)
        # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...

        ## Given the above, we calculate the returns at timestep t as:
        #               gamma[t] * return[t] + reward[t]
        #
        ## We compute this starting from the last timestep to the first, in order
        ## to employ the formula presented above and avoid redundant computations that would be needed
        ## if we were to do it from first to last.

        ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps
        ## thanks to the appendleft() function which allows to append to the position 0 in constant time O(1)
        ## a normal python list would instead require O(N) to do this.
        for t in range(n_steps)[::-1]:
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(gamma * disc_return_t + rewards[t])

        ## standardization of the returns is employed to make training more stable
        eps = np.finfo(np.float32).eps.item()
        ## eps is the smallest representable float, which is
        # added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        # Line 7:
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        if len(policy_loss) > 1:
            policy_loss = torch.cat(policy_loss).sum()
        else:
            policy_loss = policy_loss[0]

        # Line 8: PyTorch prefers gradient descent
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))

    return scores


cartpole_hyperparameters = {
    "h_size": 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 1000,
    "gamma": 1.0,
    "lr": 1e-2,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

# Create the policy and place it on the device
cartpole_policy = Policy(
    cartpole_hyperparameters["state_space"],
    cartpole_hyperparameters["action_space"],
    cartpole_hyperparameters["h_size"],
).to(device)
cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])

scores = reinforce(
    cartpole_policy,
    cartpole_optimizer,
    cartpole_hyperparameters["n_training_episodes"],
    cartpole_hyperparameters["max_t"],
    cartpole_hyperparameters["gamma"],
    100,
)


def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the average reward and the standard deviation of the reward.
    :param env: The evaluation environment
    :param n_eval_episodes: Number of episodes used to evaluate the agent
    :param policy: The Reinforce agent
    """
    episode_rewards = []
    for episode in range(n_eval_episodes):
        state = env.reset()[0]
        step = 0
        done = False
        total_rewards_ep = 0

        for step in range(max_steps):
            action, _ = policy.act(state)
            new_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            total_rewards_ep += reward

            if done:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward


evaluate_agent(
    eval_env, cartpole_hyperparameters["max_t"], cartpole_hyperparameters["n_evaluation_episodes"], cartpole_policy
)


from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json
import imageio

import tempfile

import os


def record_video(env, policy, out_directory, fps=30):
    """
    Generate a replay video of the agent
    :param env: the environment used to generate frames (created with render_mode="rgb_array")
    :param policy: the policy whose actions we record
    :param out_directory: path of the output video file
    :param fps: how many frames per second (with taxi-v3 and frozenlake-v1 we use 1)
    """
    images = []
    done = False
    state = env.reset()[0]
    img = env.render()
    images.append(img)
    while not done:
        # Sample an action from the policy given the current state
        action, _ = policy.act(state)
        state, reward, terminated, truncated, info = env.step(action)  # We directly put next_state = state for recording logic
        done = terminated or truncated
        img = env.render()
        images.append(img)
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)


def push_to_hub(repo_id,
                model,
                hyperparameters,
                eval_env,
                video_fps=30
                ):
    """
    Evaluate, generate a video and upload a model to the Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the Hub

    :param repo_id: id of the model repository on the Hugging Face Hub
    :param model: the pytorch model we want to save
    :param hyperparameters: training hyperparameters
    :param eval_env: evaluation environment
    :param video_fps: how many frames per second to record in our video replay
    """

    _, repo_name = repo_id.split("/")
    api = HfApi()

    # Step 1: Create the repo
    repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
    )

    local_dir = "./cartpole-v1"

    # Step 2: Save the model
    torch.save(model, os.path.join(local_dir, "model.pt"))

    # Step 3: Save the hyperparameters to JSON
    hyper_path = os.path.join(local_dir, "hyperparameters.json")
    with open(hyper_path, "w") as outfile:
        json.dump(hyperparameters, outfile)

    # Step 4: Evaluate the model and build JSON
    mean_reward, std_reward = evaluate_agent(eval_env,
                                             hyperparameters["max_t"],
                                             hyperparameters["n_evaluation_episodes"],
                                             model)
    # Get datetime
    eval_datetime = datetime.datetime.now()
    eval_form_datetime = eval_datetime.isoformat()

    evaluate_data = {
        "env_id": hyperparameters["env_id"],
        "mean_reward": mean_reward,
        "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
        "eval_datetime": eval_form_datetime,
    }

    # Write a JSON file
    result_path = os.path.join(local_dir, "results.json")
    with open(result_path, "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Create the model card
    env_name = hyperparameters["env_id"]

    metadata = {}
    metadata["tags"] = [
        env_name,
        "reinforce",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-class"
    ]

    # Add metrics
    eval = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
    )

    # Merge both dictionaries
    metadata = {**metadata, **eval}

    model_card = f"""
# **Reinforce** Agent playing **{env_id}**
This is a trained model of a **Reinforce** agent playing **{env_id}**.
To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
"""
    readme_path = Path(local_dir) / "README.md"

    readme = ""
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
            readme = f.read()
    else:
        readme = model_card

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(readme)

    # Save our metrics to the README metadata
    metadata_save(readme_path, metadata)

    # Step 6: Record a video
    video_path = os.path.join(local_dir, "replay.mp4")
    record_video(env, model, video_path, video_fps)

    # Step 7: Push everything to the Hub
    # Note: folder_path="./" uploads the whole working directory (not just local_dir),
    # which is why files such as .vscode/ and ppo.py end up in the repo.
    api.upload_folder(
        repo_id=repo_id,
        folder_path="./",
        path_in_repo=".",
    )

    print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")


repo_id = "dlwlgus53/Reinforce_cartpol-v1"
push_to_hub(
    repo_id,
    cartpole_policy,  # The model we want to save
    cartpole_hyperparameters,  # Hyperparameters
    eval_env,  # Evaluation environment
    video_fps=30
)
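The comment block inside `reinforce` about computing returns backwards is easiest to see on a toy episode. A standalone sketch (not part of the uploaded files) of the same accumulation, G_t = rewards[t] + gamma * G_(t+1):

from collections import deque

gamma = 0.9
rewards = [1.0, 1.0, 1.0]  # toy 3-step episode

# Backward pass: G_t = rewards[t] + gamma * G_(t+1), with the return after the last step = 0.
returns = deque()
for t in reversed(range(len(rewards))):
    disc_return_t = returns[0] if len(returns) > 0 else 0.0
    returns.appendleft(gamma * disc_return_t + rewards[t])

print(list(returns))  # -> [2.71, 1.9, 1.0] up to floating-point rounding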
ppo.py
ADDED
@@ -0,0 +1,194 @@
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
python examples/scripts/ppo.py \
    --log_with=wandb
"""
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from trl.import_utils import is_npu_available, is_xpu_available


tqdm.pandas()


@dataclass
class ScriptArguments:
    use_seq2seq: bool = field(default=False, metadata={"help": "whether to use seq2seq"})
    trust_remote_code: bool = field(default=False, metadata={"help": "Enable `trust_remote_code`"})

    # LoraConfig
    use_peft: bool = field(default=False, metadata={"help": "whether to use peft"})
    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})


parser = HfArgumentParser((ScriptArguments, PPOConfig))
args, ppo_config = parser.parse_args_into_dataclasses()

# We then define the arguments to pass to the sentiment analysis pipeline.
# We set `return_all_scores` to True to get the score for every class, not just the top one.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

trl_model_class = AutoModelForCausalLMWithValueHead if not args.use_seq2seq else AutoModelForSeq2SeqLMWithValueHead


# Below is an example function to build the dataset. In our case, we use the IMDB dataset
# from the `datasets` library. One should customize this function to train the model on
# its own dataset.
def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        query_dataset (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(query_dataset, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds


# We retrieve the dataloader by calling the `build_dataset` function.
dataset = build_dataset(ppo_config, ppo_config.query_dataset)


def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}


# set seed before initializing value head for deterministic eval
set_seed(ppo_config.seed)

# Now let's build the model, the reference model, and the tokenizer.
if not args.use_peft:
    ref_model = trl_model_class.from_pretrained(ppo_config.model_name, trust_remote_code=args.trust_remote_code)
    device_map = None
    peft_config = None
else:
    peft_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        bias="none",
        task_type="CAUSAL_LM",
    )
    ref_model = None
    # Copy the model to each device
    device_map = {"": Accelerator().local_process_index}

model = trl_model_class.from_pretrained(
    ppo_config.model_name,
    trust_remote_code=args.trust_remote_code,
    device_map=device_map,
    peft_config=peft_config,
)


tokenizer = AutoTokenizer.from_pretrained(ppo_config.model_name)

# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
tokenizer.pad_token_id = tokenizer.eos_token_id

# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = PPOTrainer(ppo_config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

# We then build the sentiment analysis pipeline, passing the model name and the
# sentiment analysis pipeline arguments. Let's also make sure to set the device
# to the same device as the PPOTrainer.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    if is_xpu_available():
        device = "xpu:0"
    elif is_npu_available():
        device = "npu:0"
    else:
        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
task, model_name = ppo_config.reward_model.split(":")
if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
    with ds_plugin.zero3_init_context_manager(enable=False):
        sentiment_pipe = pipeline(task, model=model_name, device=device)
else:
    sentiment_pipe = pipeline(task, model=model_name, device=device)

# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
if sentiment_pipe.tokenizer.pad_token_id is None:
    sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id

if sentiment_pipe.model.config.pad_token_id is None:
    sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id

# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 32,
}

for _epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from gpt2
    response_tensors, ref_response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
    )
    batch["response"] = tokenizer.batch_decode(response_tensors)
    batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

    # Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
    ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
    ref_rewards = [torch.tensor(output[1]["score"]) for output in ref_pipe_outputs]
    batch["ref_rewards"] = ref_rewards

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])
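The reward in the loop above is `output[1]["score"]`, i.e. the raw (unnormalized, since `function_to_apply="none"`) score of the second label returned by the sentiment pipeline. A standalone sketch of that assumption, using `lvwerra/distilbert-imdb` as the reward model (the usual default for `PPOConfig.reward_model` in this example; treat the model name and label order as assumptions to verify against your config):

from transformers import pipeline

# Assumed reward model; PPOConfig.reward_model is "task:model_name" and is split on ":" above.
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none"}
outputs = sentiment_pipe(["this movie was really good!!"], **sent_kwargs)

# outputs[0] is a list with one dict per label, e.g.
# [{"label": "NEGATIVE", "score": -2.3}, {"label": "POSITIVE", "score": 2.3}]
# The training loop takes index 1 (the positive-class score) as the scalar reward.
reward = outputs[0][1]["score"]
print(reward)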