dlwlgus53 committed
Commit 888331d
1 Parent(s): b6d5610

Upload folder using huggingface_hub

.vscode/settings.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "workbench.colorCustomizations": {
+         "activityBar.background": "#561625",
+         "titleBar.activeBackground": "#781F34",
+         "titleBar.activeForeground": "#FEFBFC"
+     }
+ }
cartpole-v1/README.md ADDED
@@ -0,0 +1,27 @@
+ ---
+ tags:
+ - CartPole-v1
+ - reinforce
+ - reinforcement-learning
+ - custom-implementation
+ - deep-rl-class
+ model-index:
+ - name: Reinforce_cartpol-v1
+   results:
+   - task:
+       type: reinforcement-learning
+       name: reinforcement-learning
+     dataset:
+       name: CartPole-v1
+       type: CartPole-v1
+     metrics:
+     - type: mean_reward
+       value: 0.19 +/- 0.03
+       name: mean_reward
+       verified: false
+ ---
+ 
+ # **Reinforce** Agent playing **CartPole-v1**
+ This is a trained model of a **Reinforce** agent playing **CartPole-v1**.
+ To learn to use this model and train yours, check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+ 
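As a quick usage sketch (an illustration, assuming the `Policy` architecture from `policy_gradient.py` in this commit, a gym release with the 5-tuple `step` API, and a checkpoint produced by running the training script directly): because `push_to_hub` pickles the entire module with `torch.save(model, ...)`, the class must be re-declared in the loading script before `torch.load` can unpickle it.

```python
# Hypothetical loading sketch; not part of this commit.
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from huggingface_hub import hf_hub_download

device = torch.device("cpu")


class Policy(nn.Module):  # same architecture as policy_gradient.py (h_size=16)
    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()[0]
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)


# The whole module was pickled, so weights_only must stay False.
checkpoint = hf_hub_download(repo_id="dlwlgus53/Reinforce_cartpol-v1", filename="model.pt")
policy = torch.load(checkpoint, map_location="cpu", weights_only=False)

env = gym.make("CartPole-v1")
state = env.reset()[0]
done, episode_return = False, 0.0
while not done:
    action, _ = policy.act(state)
    state, reward, terminated, truncated, _ = env.step(action)
    episode_return += reward
    done = terminated or truncated
print("episode return:", episode_return)
```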
cartpole-v1/hyperparameters.json ADDED
@@ -0,0 +1 @@
+ {"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
cartpole-v1/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee315b44d3dbc05cd7ba1d04fc0b2b0c49605d506c21e34be0e3239656b0aaf9
+ size 3264
cartpole-v1/replay.mp4 ADDED
Binary file (2.02 kB).
 
cartpole-v1/results.json ADDED
@@ -0,0 +1 @@
+ {"env_id": "CartPole-v1", "mean_reward": 0.1875143900513649, "n_evaluation_episodes": 10, "eval_datetime": "2024-03-09T00:13:58.880496"}
example1.py ADDED
@@ -0,0 +1,4 @@
+ from pyvirtualdisplay import Display
+
+ virtual_display = Display(visible=0, size=(1400, 900))
+ virtual_display.start()
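The snippet above only starts a virtual X display; its purpose, as in `record_video` in `policy_gradient.py`, is to let `env.render()` return RGB frames on a headless machine. A hypothetical sketch of the two pieces together (the output file name and episode length here are illustrative, not taken from the repo):

```python
# Illustration only: render random-agent frames under the virtual display.
from pyvirtualdisplay import Display

import gym
import imageio
import numpy as np

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

env = gym.make("CartPole-v1", render_mode="rgb_array")
env.reset()
frames = [env.render()]
for _ in range(50):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    frames.append(env.render())
    if terminated or truncated:
        break

imageio.mimsave("random_replay.mp4", [np.array(f) for f in frames], fps=30)
```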
hyperparameters.json ADDED
@@ -0,0 +1 @@
+ {"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b290c70b9fb42b69640d1b651339c462d74379e36a1ef938e6119f30c2d0747
+ size 3264
policy_gradient.py ADDED
@@ -0,0 +1,383 @@
+ import numpy as np
+
+ from collections import deque
+
+ import matplotlib.pyplot as plt
+ import pdb
+ # PyTorch
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from torch.distributions import Categorical
+
+ # Gym
+ import gym
+ import gym_pygame
+
+ # Hugging Face Hub
+ from huggingface_hub import notebook_login  # To log in to our Hugging Face account so we can upload models to the Hub.
+ import imageio
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print(device)
+
+ env_id = "CartPole-v1"
+ # Create the env
+ env = gym.make(env_id, render_mode="rgb_array")
+
+ # Create the evaluation env
+ eval_env = gym.make(env_id)
+
+ # Get the state space and action space
+ s_size = env.observation_space.shape[0]
+ a_size = env.action_space.n
+
+ print("_____OBSERVATION SPACE_____ \n")
+ print("The State Space is: ", s_size)
+ # cart position, cart velocity, pole angle, pole velocity at tip
+ print("Sample observation", env.observation_space.sample())  # Get a random observation
+
+ print("\n _____ACTION SPACE_____ \n")
+ print("The Action Space is: ", a_size)
+ print("Action Space Sample", env.action_space.sample())  # Take a random action
+
+
+ # Policy Gradient Network
+
+ class Policy(nn.Module):
+     def __init__(self, s_size, a_size, h_size):
+         super(Policy, self).__init__()
+         self.fc1 = nn.Linear(s_size, h_size)
+         self.fc2 = nn.Linear(h_size, a_size)
+
+     def forward(self, x):
+         x = F.relu(self.fc1(x))
+         x = self.fc2(x)
+         return F.softmax(x, dim=1)
+
+     def act(self, state):
+         state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+         probs = self.forward(state).cpu()[0]
+         m = Categorical(probs)
+         action = m.sample()
+
+         return action.item(), m.log_prob(action)
+
+
+ def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
+     # Helps us to calculate the score during training
+     scores_deque = deque(maxlen=100)
+     scores = []
+     # Line 3 of pseudocode
+     for i_episode in range(1, n_training_episodes + 1):
+         saved_log_probs = []
+         rewards = []
+         state = env.reset()[0]
+         # Line 4 of pseudocode
+         for t in range(max_t):
+             action, log_prob = policy.act(state)
+             saved_log_probs.append(log_prob)
+             # gym >= 0.26 returns (obs, reward, terminated, truncated, info)
+             state, reward, terminated, truncated, _ = env.step(action)
+             done = terminated or truncated
+             rewards.append(reward)
+             if done:
+                 break
+
+         scores_deque.append(sum(rewards))
+         scores.append(sum(rewards))
+
+         # Line 6 of pseudocode: calculate the return
+         returns = deque(maxlen=max_t)
+         n_steps = len(rewards)
+         # Compute the discounted returns at each timestep,
+         # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t,
+         # in O(N) time, where N is the number of time steps
+         # (this definition of the discounted return G_t follows the definition of this quantity
+         # shown at page 44 of Sutton & Barto 2017, 2nd draft)
+         # G_t = r_(t+1) + r_(t+2) + ...
+
+         # Given this formulation, the returns at each timestep t can be computed
+         # by re-using the computed future returns G_(t+1) to compute the current return G_t:
+         # G_t = r_(t+1) + gamma*G_(t+1)
+         # G_(t-1) = r_t + gamma*G_t
+         # (this follows a dynamic programming approach, with which we memoize solutions in order
+         # to avoid computing them multiple times)
+
+         # This is correct since the above is equivalent to (see also page 46 of Sutton & Barto 2017, 2nd draft)
+         # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...
+
+         ## Given the above, we calculate the returns at timestep t as:
+         #     gamma[t] * return[t] + reward[t]
+         #
+         ## We compute this starting from the last timestep to the first, in order
+         ## to employ the formula presented above and avoid redundant computations that would be needed
+         ## if we were to do it from first to last.
+
+         ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps,
+         ## thanks to the appendleft() function which allows appending to position 0 in constant time O(1);
+         ## a normal Python list would instead require O(N) to do this.
+         for t in range(n_steps)[::-1]:
+             disc_return_t = returns[0] if len(returns) > 0 else 0
+             returns.appendleft(gamma * disc_return_t + rewards[t])
+
+         ## standardization of the returns is employed to make training more stable
+         eps = np.finfo(np.float32).eps.item()
+         ## eps is the smallest representable float, which is
+         ## added to the standard deviation of the returns to avoid numerical instabilities
+         returns = torch.tensor(returns)
+         if len(returns) > 1:
+             returns = (returns - returns.mean()) / (returns.std() + eps)
+
+         # Line 7:
+         policy_loss = []
+         for log_prob, disc_return in zip(saved_log_probs, returns):
+             policy_loss.append(-log_prob * disc_return)
+         if len(policy_loss) > 1:
+             policy_loss = torch.cat(policy_loss).sum()
+         else:
+             policy_loss = policy_loss[0]
+
+         # Line 8: PyTorch prefers gradient descent
+         optimizer.zero_grad()
+         policy_loss.backward()
+         optimizer.step()
+
+         if i_episode % print_every == 0:
+             print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))
+
+     return scores
+
+ cartpole_hyperparameters = {
+     "h_size": 16,
+     "n_training_episodes": 1000,
+     "n_evaluation_episodes": 10,
+     "max_t": 1000,
+     "gamma": 1.0,
+     "lr": 1e-2,
+     "env_id": env_id,
+     "state_space": s_size,
+     "action_space": a_size,
+ }
+
+ # Create the policy and place it on the device
+ cartpole_policy = Policy(
+     cartpole_hyperparameters["state_space"],
+     cartpole_hyperparameters["action_space"],
+     cartpole_hyperparameters["h_size"],
+ ).to(device)
+ cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])
+
+ scores = reinforce(
+     cartpole_policy,
+     cartpole_optimizer,
+     cartpole_hyperparameters["n_training_episodes"],
+     cartpole_hyperparameters["max_t"],
+     cartpole_hyperparameters["gamma"],
+     100,
+ )
+
+
+ def evaluate_agent(env, max_steps, n_eval_episodes, policy):
+     """
+     Evaluate the agent for ``n_eval_episodes`` episodes and return the average reward and the std of the reward.
+     :param env: The evaluation environment
+     :param max_steps: Maximum number of steps per episode
+     :param n_eval_episodes: Number of episodes to evaluate the agent
+     :param policy: The Reinforce agent
+     """
+     episode_rewards = []
+     for episode in range(n_eval_episodes):
+         state = env.reset()[0]
+         step = 0
+         done = False
+         total_rewards_ep = 0
+
+         for step in range(max_steps):
+             action, _ = policy.act(state)
+             new_state, reward, terminated, truncated, info = env.step(action)
+             done = terminated or truncated
+             total_rewards_ep += reward
+
+             if done:
+                 break
+             state = new_state
+         episode_rewards.append(total_rewards_ep)
+     mean_reward = np.mean(episode_rewards)
+     std_reward = np.std(episode_rewards)
+
+     return mean_reward, std_reward
+
+
+ evaluate_agent(
+     eval_env, cartpole_hyperparameters["max_t"], cartpole_hyperparameters["n_evaluation_episodes"], cartpole_policy
+ )
+
+
+ from huggingface_hub import HfApi, snapshot_download
+ from huggingface_hub.repocard import metadata_eval_result, metadata_save
+
+ from pathlib import Path
+ import datetime
+ import json
+ import imageio
+
+ import tempfile
+
+ import os
+
+
+ def record_video(env, policy, out_directory, fps=30):
+     """
+     Generate a replay video of the agent.
+     :param env: the environment to record
+     :param policy: the policy of our agent
+     :param out_directory: path of the output video file
+     :param fps: how many frames per second (with taxi-v3 and frozenlake-v1 we use 1)
+     """
+     images = []
+     done = False
+     state = env.reset()[0]
+     img = env.render()
+     images.append(img)
+     while not done:
+         # Sample an action from the policy given the current state
+         action, _ = policy.act(state)
+         state, reward, terminated, truncated, info = env.step(action)  # We directly put next_state = state for recording logic
+         done = terminated or truncated
+         img = env.render()
+         images.append(img)
+     imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
+
+
+ def push_to_hub(repo_id,
+                 model,
+                 hyperparameters,
+                 eval_env,
+                 video_fps=30
+                 ):
+     """
+     Evaluate, generate a video, and upload a model to the Hugging Face Hub.
+     This method does the complete pipeline:
+     - It evaluates the model
+     - It generates the model card
+     - It generates a replay video of the agent
+     - It pushes everything to the Hub
+
+     :param repo_id: id of the model repository on the Hugging Face Hub
+     :param model: the pytorch model we want to save
+     :param hyperparameters: training hyperparameters
+     :param eval_env: evaluation environment
+     :param video_fps: how many frames per second to record our video replay
+     """
+
+     _, repo_name = repo_id.split("/")
+     api = HfApi()
+
+     # Step 1: Create the repo
+     repo_url = api.create_repo(
+         repo_id=repo_id,
+         exist_ok=True,
+     )
+
+     local_dir = "./cartpole-v1"
+
+     # Step 2: Save the model
+     torch.save(model, os.path.join(local_dir, "model.pt"))
+
+     # Step 3: Save the hyperparameters to JSON
+     hyper_path = os.path.join(local_dir, "hyperparameters.json")
+     with open(hyper_path, "w") as outfile:
+         json.dump(hyperparameters, outfile)
+
+     # Step 4: Evaluate the model and build JSON
+     mean_reward, std_reward = evaluate_agent(eval_env,
+                                              hyperparameters["max_t"],
+                                              hyperparameters["n_evaluation_episodes"],
+                                              model)
+     # Get datetime
+     eval_datetime = datetime.datetime.now()
+     eval_form_datetime = eval_datetime.isoformat()
+
+     evaluate_data = {
+         "env_id": hyperparameters["env_id"],
+         "mean_reward": mean_reward,
+         "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
+         "eval_datetime": eval_form_datetime,
+     }
+
+     # Write a JSON file
+     result_path = os.path.join(local_dir, "results.json")
+     with open(result_path, "w") as outfile:
+         json.dump(evaluate_data, outfile)
+
+     # Step 5: Create the model card
+     env_name = hyperparameters["env_id"]
+
+     metadata = {}
+     metadata["tags"] = [
+         env_name,
+         "reinforce",
+         "reinforcement-learning",
+         "custom-implementation",
+         "deep-rl-class",
+     ]
+
+     # Add metrics
+     eval = metadata_eval_result(
+         model_pretty_name=repo_name,
+         task_pretty_name="reinforcement-learning",
+         task_id="reinforcement-learning",
+         metrics_pretty_name="mean_reward",
+         metrics_id="mean_reward",
+         metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
+         dataset_pretty_name=env_name,
+         dataset_id=env_name,
+     )
+
+     # Merge both dictionaries
+     metadata = {**metadata, **eval}
+
+     model_card = f"""
+ # **Reinforce** Agent playing **{env_id}**
+ This is a trained model of a **Reinforce** agent playing **{env_id}**.
+ To learn to use this model and train yours, check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+ """
+     readme_path = Path(local_dir) / "README.md"
+
+     readme = ""
+     if readme_path.exists():
+         with readme_path.open("r", encoding="utf8") as f:
+             readme = f.read()
+     else:
+         readme = model_card
+
+     with readme_path.open("w", encoding="utf-8") as f:
+         f.write(readme)
+
+     # Save our metrics to the README metadata
+     metadata_save(readme_path, metadata)
+
+     # Step 6: Record a video
+     video_path = os.path.join(local_dir, "replay.mp4")
+     record_video(env, model, video_path, video_fps)
+
+     # Step 7: Push everything to the Hub
+     api.upload_folder(
+         repo_id=repo_id,
+         folder_path="./",
+         path_in_repo=".",
+     )
+
+     print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")
+
+
+ repo_id = "dlwlgus53/Reinforce_cartpol-v1"
+ push_to_hub(
+     repo_id,
+     cartpole_policy,  # The model we want to save
+     cartpole_hyperparameters,  # Hyperparameters
+     eval_env,  # Evaluation environment
+     video_fps=30,
+ )
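The comment block inside `reinforce()` above argues that building the returns backwards with `G_t = r_t + gamma * G_(t+1)` reproduces the full discounted sums in O(N). A tiny standalone check of that recursion against the direct definition, using made-up rewards rather than anything from this training run:

```python
# Standalone check of the backward return recursion used in reinforce()
# (illustrative numbers; not taken from the actual training run).
from collections import deque

rewards = [1.0, 1.0, 1.0, 1.0]  # hypothetical episode rewards
gamma = 0.9

# Backward pass, as in reinforce(): G_t = r_t + gamma * G_{t+1}
returns = deque()
for t in reversed(range(len(rewards))):
    disc_return_t = returns[0] if len(returns) > 0 else 0.0
    returns.appendleft(rewards[t] + gamma * disc_return_t)

# Direct definition: G_t = sum_k gamma^k * r_{t+k}
direct = [
    sum(gamma**k * r for k, r in enumerate(rewards[t:]))
    for t in range(len(rewards))
]

print(list(returns))  # approximately [3.439, 2.71, 1.9, 1.0]
print(direct)         # same values, confirming the recursion
```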
ppo.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ python examples/scripts/ppo.py \
+     --log_with=wandb
+ """
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ import torch
+ from accelerate import Accelerator
+ from datasets import load_dataset
+ from peft import LoraConfig
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, HfArgumentParser, pipeline
+
+ from trl import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead, PPOConfig, PPOTrainer, set_seed
+ from trl.core import LengthSampler
+ from trl.import_utils import is_npu_available, is_xpu_available
+
+
+ tqdm.pandas()
+
+
+ @dataclass
+ class ScriptArguments:
+     use_seq2seq: bool = field(default=False, metadata={"help": "whether to use seq2seq"})
+     trust_remote_code: bool = field(default=False, metadata={"help": "Enable `trust_remote_code`"})
+
+     # LoraConfig
+     use_peft: bool = field(default=False, metadata={"help": "whether to use peft"})
+     lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
+     lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})
+
+
+ parser = HfArgumentParser((ScriptArguments, PPOConfig))
+ args, ppo_config = parser.parse_args_into_dataclasses()
+
+ # We then define the arguments to pass to the sentiment analysis pipeline.
+ # We set `return_all_scores` to True to get the sentiment score for each token.
+ sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}
+
+ trl_model_class = AutoModelForCausalLMWithValueHead if not args.use_seq2seq else AutoModelForSeq2SeqLMWithValueHead
+
+
+ # Below is an example function to build the dataset. In our case, we use the IMDB dataset
+ # from the `datasets` library. One should customize this function to train the model on
+ # its own dataset.
+ def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text_length=8):
+     """
+     Build dataset for training. This builds the dataset from `load_dataset`, one should
+     customize this function to train the model on its own dataset.
+
+     Args:
+         query_dataset (`str`):
+             The name of the dataset to be loaded.
+
+     Returns:
+         dataloader (`torch.utils.data.DataLoader`):
+             The dataloader for the dataset.
+     """
+     tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+     tokenizer.pad_token = tokenizer.eos_token
+     # load imdb with datasets
+     ds = load_dataset(query_dataset, split="train")
+     ds = ds.rename_columns({"text": "review"})
+     ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
+
+     input_size = LengthSampler(input_min_text_length, input_max_text_length)
+
+     def tokenize(sample):
+         sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
+         sample["query"] = tokenizer.decode(sample["input_ids"])
+         return sample
+
+     ds = ds.map(tokenize, batched=False)
+     ds.set_format(type="torch")
+     return ds
+
+
+ # We retrieve the dataloader by calling the `build_dataset` function.
+ dataset = build_dataset(ppo_config, ppo_config.query_dataset)
+
+
+ def collator(data):
+     return {key: [d[key] for d in data] for key in data[0]}
+
+
+ # set seed before initializing value head for deterministic eval
+ set_seed(ppo_config.seed)
+
+ # Now let's build the model, the reference model, and the tokenizer.
+ if not args.use_peft:
+     ref_model = trl_model_class.from_pretrained(ppo_config.model_name, trust_remote_code=args.trust_remote_code)
+     device_map = None
+     peft_config = None
+ else:
+     peft_config = LoraConfig(
+         r=args.lora_r,
+         lora_alpha=args.lora_alpha,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+     ref_model = None
+     # Copy the model to each device
+     device_map = {"": Accelerator().local_process_index}
+
+ model = trl_model_class.from_pretrained(
+     ppo_config.model_name,
+     trust_remote_code=args.trust_remote_code,
+     device_map=device_map,
+     peft_config=peft_config,
+ )
+
+
+ tokenizer = AutoTokenizer.from_pretrained(ppo_config.model_name)
+
+ # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+
+ # We then build the PPOTrainer, passing the model, the reference model, the tokenizer
+ ppo_trainer = PPOTrainer(ppo_config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)
+
+ # We then build the sentiment analysis pipeline, passing the model name and the
+ # sentiment analysis pipeline arguments. Let's also make sure to set the device
+ # to the same device as the PPOTrainer.
+ device = ppo_trainer.accelerator.device
+ if ppo_trainer.accelerator.num_processes == 1:
+     if is_xpu_available():
+         device = "xpu:0"
+     elif is_npu_available():
+         device = "npu:0"
+     else:
+         device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
+ ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
+ task, model_name = ppo_config.reward_model.split(":")
+ if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
+     with ds_plugin.zero3_init_context_manager(enable=False):
+         sentiment_pipe = pipeline(task, model=model_name, device=device)
+ else:
+     sentiment_pipe = pipeline(task, model=model_name, device=device)
+
+ # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+ if sentiment_pipe.tokenizer.pad_token_id is None:
+     sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id
+
+ if sentiment_pipe.model.config.pad_token_id is None:
+     sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
+
+ # We then define the arguments to pass to the `generate` function. These arguments
+ # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
+ # the `generate` function of the trained model.
+ generation_kwargs = {
+     "min_length": -1,
+     "top_k": 0.0,
+     "top_p": 1.0,
+     "do_sample": True,
+     "pad_token_id": tokenizer.eos_token_id,
+     "max_new_tokens": 32,
+ }
+
+ for _epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
+     query_tensors = batch["input_ids"]
+
+     # Get response from gpt2
+     response_tensors, ref_response_tensors = ppo_trainer.generate(
+         query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
+     )
+     batch["response"] = tokenizer.batch_decode(response_tensors)
+     batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)
+
+     # Compute sentiment score
+     texts = [q + r for q, r in zip(batch["query"], batch["response"])]
+     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
+     rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
+     ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
+     ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
+     ref_rewards = [torch.tensor(output[1]["score"]) for output in ref_pipe_outputs]
+     batch["ref_rewards"] = ref_rewards
+
+     # Run PPO step
+     stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+     ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])
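A note on the reward line `rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]`: with `return_all_scores=True` and `function_to_apply="none"`, the text-classification pipeline returns one `{label, score}` dict per class for each text, and index 1 is taken as the scalar PPO reward. A hedged illustration, assuming a two-class sentiment reward model such as the `lvwerra/distilbert-imdb` default used by TRL's PPO example (the model name, example texts, and printed values are assumptions, not outputs from this script):

```python
# Illustrative only; assumes a two-class sentiment-analysis reward model.
import torch
from transformers import pipeline

sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

texts = ["This movie was surprisingly good!", "A complete waste of time."]
pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
# pipe_outputs is shaped roughly like:
# [[{"label": "NEGATIVE", "score": -2.3}, {"label": "POSITIVE", "score": 2.5}],
#  [{"label": "NEGATIVE", "score":  2.8}, {"label": "POSITIVE", "score": -2.9}]]

# output[1]["score"] therefore picks the raw logit of the second label
# (POSITIVE for this model), which ppo.py feeds directly to ppo_trainer.step().
rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
print(rewards)
```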