TTTXXX01 commited on
Commit
4cf8663
·
verified ·
1 Parent(s): 5170d34

Model save

Browse files
README.md CHANGED
@@ -2,15 +2,10 @@
2
  license: mit
3
  base_model: HuggingFaceH4/mistral-7b-sft-beta
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - dpo
8
- - generated_from_trainer
9
  - trl
10
  - dpo
 
11
  - generated_from_trainer
12
- datasets:
13
- - HuggingFaceH4/ultrafeedback_binarized
14
  model-index:
15
  - name: DPO-Zephyr-7B
16
  results: []
@@ -21,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # DPO-Zephyr-7B
23
 
24
- This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the HuggingFaceH4/ultrafeedback_binarized dataset.
25
 
26
  ## Model description
27
 
@@ -41,14 +36,13 @@ More information needed
41
 
42
  The following hyperparameters were used during training:
43
  - learning_rate: 5e-07
44
- - train_batch_size: 8
45
- - eval_batch_size: 8
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 3
49
- - gradient_accumulation_steps: 4
50
- - total_train_batch_size: 96
51
- - total_eval_batch_size: 24
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
 
2
  license: mit
3
  base_model: HuggingFaceH4/mistral-7b-sft-beta
4
  tags:
 
 
 
 
5
  - trl
6
  - dpo
7
+ - alignment-handbook
8
  - generated_from_trainer
 
 
9
  model-index:
10
  - name: DPO-Zephyr-7B
11
  results: []
 
16
 
17
  # DPO-Zephyr-7B
18
 
19
+ This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unknown dataset.
20
 
21
  ## Model description
22
 
 
36
 
37
  The following hyperparameters were used during training:
38
  - learning_rate: 5e-07
39
+ - train_batch_size: 3
40
+ - eval_batch_size: 4
41
  - seed: 42
42
  - distributed_type: multi-GPU
43
  - num_devices: 3
44
+ - total_train_batch_size: 9
45
+ - total_eval_batch_size: 12
 
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.1
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
- "train_loss": 0.6094696866641255,
5
- "train_runtime": 4741.6763,
6
  "train_samples": 15283,
7
- "train_samples_per_second": 3.223,
8
- "train_steps_per_second": 0.034
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.3818549155440398,
5
+ "train_runtime": 4714.6907,
6
  "train_samples": 15283,
7
+ "train_samples_per_second": 3.242,
8
+ "train_steps_per_second": 0.135
9
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7a71fd86983200a0e9da467d2d7f8faa5ae8e9b0300040f8833bb139b00c1e2
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac26dbef8ff942f79ddbc3c5b131cdc9fa2d10ae43fab47f2377c9b7de2fccef
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b32a56026769897ec6a84df6af5e995510cdf30f166051e86cf2731a21e2760
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c927851423f252a01b03cf8090580227fde5515df1c249c6f6198324520d072c
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16c9c1d861d6afe007e669a5c048323be406c2e23f14ccde55c7157235e33b86
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f784959e0b5911b50bf26eb3e5c2da47b1203fe94661e2c520a44209a9af3fc1
3
  size 4540516344
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
- "train_loss": 0.6094696866641255,
5
- "train_runtime": 4741.6763,
6
  "train_samples": 15283,
7
- "train_samples_per_second": 3.223,
8
- "train_steps_per_second": 0.034
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.3818549155440398,
5
+ "train_runtime": 4714.6907,
6
  "train_samples": 15283,
7
+ "train_samples_per_second": 3.242,
8
+ "train_steps_per_second": 0.135
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9984301412872841,
5
  "eval_steps": 500,
6
- "global_step": 159,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -174,92 +174,812 @@
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.6907378335949764,
178
- "grad_norm": 17.53131220916881,
179
- "learning_rate": 1.3139467229135998e-07,
180
- "logits/chosen": -2.6724138259887695,
181
- "logits/rejected": -2.6650776863098145,
182
- "logps/chosen": -285.9563293457031,
183
- "logps/rejected": -335.96673583984375,
184
- "loss": 0.5746,
185
- "rewards/accuracies": 0.6968749761581421,
186
- "rewards/chosen": -0.47776883840560913,
187
- "rewards/margins": 0.4206571578979492,
188
- "rewards/rejected": -0.8984260559082031,
189
  "step": 110
190
  },
191
  {
192
- "epoch": 0.7535321821036107,
193
- "grad_norm": 20.93530860525305,
194
- "learning_rate": 8.628481651367875e-08,
195
- "logits/chosen": -2.7068216800689697,
196
- "logits/rejected": -2.685987949371338,
197
- "logps/chosen": -314.00042724609375,
198
- "logps/rejected": -346.78546142578125,
199
- "loss": 0.5883,
200
- "rewards/accuracies": 0.7250000238418579,
201
- "rewards/chosen": -0.5427230596542358,
202
- "rewards/margins": 0.36296552419662476,
203
- "rewards/rejected": -0.9056886434555054,
204
  "step": 120
205
  },
206
  {
207
- "epoch": 0.8163265306122449,
208
- "grad_norm": 27.215171241370864,
209
- "learning_rate": 4.904486005914027e-08,
210
- "logits/chosen": -2.725468635559082,
211
- "logits/rejected": -2.6890971660614014,
212
- "logps/chosen": -312.6821594238281,
213
- "logps/rejected": -338.29144287109375,
214
- "loss": 0.5549,
215
- "rewards/accuracies": 0.753125011920929,
216
- "rewards/chosen": -0.45934683084487915,
217
- "rewards/margins": 0.4836532175540924,
218
- "rewards/rejected": -0.9429999589920044,
219
  "step": 130
220
  },
221
  {
222
- "epoch": 0.8791208791208791,
223
- "grad_norm": 21.30706575112927,
224
- "learning_rate": 2.1464952759020856e-08,
225
- "logits/chosen": -2.7268600463867188,
226
- "logits/rejected": -2.698526382446289,
227
- "logps/chosen": -334.44989013671875,
228
- "logps/rejected": -353.6800842285156,
229
- "loss": 0.5646,
230
- "rewards/accuracies": 0.734375,
231
- "rewards/chosen": -0.4807310998439789,
232
- "rewards/margins": 0.43861979246139526,
233
- "rewards/rejected": -0.9193509221076965,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.9419152276295133,
238
- "grad_norm": 20.93813177233855,
239
- "learning_rate": 4.8708793644441086e-09,
240
- "logits/chosen": -2.690723419189453,
241
- "logits/rejected": -2.6735761165618896,
242
- "logps/chosen": -309.25262451171875,
243
- "logps/rejected": -340.45733642578125,
244
- "loss": 0.5617,
245
- "rewards/accuracies": 0.7124999761581421,
246
- "rewards/chosen": -0.47407713532447815,
247
- "rewards/margins": 0.4926603436470032,
248
- "rewards/rejected": -0.9667374491691589,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.9984301412872841,
253
- "step": 159,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  "total_flos": 0.0,
255
- "train_loss": 0.6094696866641255,
256
- "train_runtime": 4741.6763,
257
- "train_samples_per_second": 3.223,
258
- "train_steps_per_second": 0.034
259
  }
260
  ],
261
  "logging_steps": 10,
262
- "max_steps": 159,
263
  "num_input_tokens_seen": 0,
264
  "num_train_epochs": 1,
265
  "save_steps": 100,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 637,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.1726844583987441,
178
+ "grad_norm": 31.95579631076908,
179
+ "learning_rate": 4.920911573406924e-07,
180
+ "logits/chosen": -2.737246513366699,
181
+ "logits/rejected": -2.694401979446411,
182
+ "logps/chosen": -324.00128173828125,
183
+ "logps/rejected": -355.02178955078125,
184
+ "loss": 0.4353,
185
+ "rewards/accuracies": 0.887499988079071,
186
+ "rewards/chosen": -0.3587312698364258,
187
+ "rewards/margins": 0.7477051615715027,
188
+ "rewards/rejected": -1.1064363718032837,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.18838304552590268,
193
+ "grad_norm": 35.71890678681308,
194
+ "learning_rate": 4.883087164434672e-07,
195
+ "logits/chosen": -2.626753568649292,
196
+ "logits/rejected": -2.6289310455322266,
197
+ "logps/chosen": -319.3797302246094,
198
+ "logps/rejected": -375.67864990234375,
199
+ "loss": 0.4172,
200
+ "rewards/accuracies": 0.8374999761581421,
201
+ "rewards/chosen": -0.4024926722049713,
202
+ "rewards/margins": 0.8855185508728027,
203
+ "rewards/rejected": -1.2880111932754517,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.20408163265306123,
208
+ "grad_norm": 44.90656496644609,
209
+ "learning_rate": 4.838100964592904e-07,
210
+ "logits/chosen": -2.5899486541748047,
211
+ "logits/rejected": -2.5745112895965576,
212
+ "logps/chosen": -272.218994140625,
213
+ "logps/rejected": -347.6167297363281,
214
+ "loss": 0.4832,
215
+ "rewards/accuracies": 0.7875000238418579,
216
+ "rewards/chosen": -0.4248787760734558,
217
+ "rewards/margins": 0.8124195337295532,
218
+ "rewards/rejected": -1.2372982501983643,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.21978021978021978,
223
+ "grad_norm": 76.16900405992267,
224
+ "learning_rate": 4.786088169001671e-07,
225
+ "logits/chosen": -2.5720114707946777,
226
+ "logits/rejected": -2.5637192726135254,
227
+ "logps/chosen": -289.96759033203125,
228
+ "logps/rejected": -435.85308837890625,
229
+ "loss": 0.3981,
230
+ "rewards/accuracies": 0.824999988079071,
231
+ "rewards/chosen": -0.3887278437614441,
232
+ "rewards/margins": 0.9565574526786804,
233
+ "rewards/rejected": -1.3452852964401245,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.23547880690737832,
238
+ "grad_norm": 70.20692587613685,
239
+ "learning_rate": 4.727205089511466e-07,
240
+ "logits/chosen": -2.5425446033477783,
241
+ "logits/rejected": -2.525315523147583,
242
+ "logps/chosen": -313.8908996582031,
243
+ "logps/rejected": -377.4201354980469,
244
+ "loss": 0.4486,
245
+ "rewards/accuracies": 0.8374999761581421,
246
+ "rewards/chosen": -0.7373483180999756,
247
+ "rewards/margins": 0.8309048414230347,
248
+ "rewards/rejected": -1.5682532787322998,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.25117739403453687,
253
+ "grad_norm": 49.06765782836144,
254
+ "learning_rate": 4.661628684945851e-07,
255
+ "logits/chosen": -2.573646068572998,
256
+ "logits/rejected": -2.5104339122772217,
257
+ "logps/chosen": -370.089599609375,
258
+ "logps/rejected": -390.0255126953125,
259
+ "loss": 0.4023,
260
+ "rewards/accuracies": 0.8500000238418579,
261
+ "rewards/chosen": -0.8807455897331238,
262
+ "rewards/margins": 0.7309385538101196,
263
+ "rewards/rejected": -1.6116840839385986,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 0.2668759811616955,
268
+ "grad_norm": 54.18143326196735,
269
+ "learning_rate": 4.5895560292945996e-07,
270
+ "logits/chosen": -2.5094046592712402,
271
+ "logits/rejected": -2.519862413406372,
272
+ "logps/chosen": -328.95098876953125,
273
+ "logps/rejected": -409.6293029785156,
274
+ "loss": 0.4229,
275
+ "rewards/accuracies": 0.887499988079071,
276
+ "rewards/chosen": -0.8252763748168945,
277
+ "rewards/margins": 0.8684083223342896,
278
+ "rewards/rejected": -1.6936845779418945,
279
+ "step": 170
280
+ },
281
+ {
282
+ "epoch": 0.282574568288854,
283
+ "grad_norm": 47.86539813882727,
284
+ "learning_rate": 4.5112037194555876e-07,
285
+ "logits/chosen": -2.4061732292175293,
286
+ "logits/rejected": -2.336141586303711,
287
+ "logps/chosen": -340.967529296875,
288
+ "logps/rejected": -416.18438720703125,
289
+ "loss": 0.4129,
290
+ "rewards/accuracies": 0.862500011920929,
291
+ "rewards/chosen": -0.9469585418701172,
292
+ "rewards/margins": 1.003477692604065,
293
+ "rewards/rejected": -1.9504363536834717,
294
+ "step": 180
295
+ },
296
+ {
297
+ "epoch": 0.29827315541601257,
298
+ "grad_norm": 47.40203923720411,
299
+ "learning_rate": 4.426807224305315e-07,
300
+ "logits/chosen": -2.434738874435425,
301
+ "logits/rejected": -2.4182467460632324,
302
+ "logps/chosen": -390.0809631347656,
303
+ "logps/rejected": -541.5980224609375,
304
+ "loss": 0.39,
305
+ "rewards/accuracies": 0.925000011920929,
306
+ "rewards/chosen": -0.907140851020813,
307
+ "rewards/margins": 1.477752447128296,
308
+ "rewards/rejected": -2.3848931789398193,
309
+ "step": 190
310
+ },
311
+ {
312
+ "epoch": 0.3139717425431711,
313
+ "grad_norm": 59.480022233916614,
314
+ "learning_rate": 4.3366201770542687e-07,
315
+ "logits/chosen": -2.3925621509552,
316
+ "logits/rejected": -2.3594958782196045,
317
+ "logps/chosen": -399.2648010253906,
318
+ "logps/rejected": -485.82061767578125,
319
+ "loss": 0.3722,
320
+ "rewards/accuracies": 0.8500000238418579,
321
+ "rewards/chosen": -1.242919683456421,
322
+ "rewards/margins": 1.0100038051605225,
323
+ "rewards/rejected": -2.2529234886169434,
324
+ "step": 200
325
+ },
326
+ {
327
+ "epoch": 0.32967032967032966,
328
+ "grad_norm": 54.533637965345,
329
+ "learning_rate": 4.2409136130137845e-07,
330
+ "logits/chosen": -2.391516923904419,
331
+ "logits/rejected": -2.3367998600006104,
332
+ "logps/chosen": -410.27752685546875,
333
+ "logps/rejected": -488.3935546875,
334
+ "loss": 0.3723,
335
+ "rewards/accuracies": 0.824999988079071,
336
+ "rewards/chosen": -1.2089805603027344,
337
+ "rewards/margins": 1.3162455558776855,
338
+ "rewards/rejected": -2.52522611618042,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.3453689167974882,
343
+ "grad_norm": 134.3137263377747,
344
+ "learning_rate": 4.1399751550651084e-07,
345
+ "logits/chosen": -2.2474474906921387,
346
+ "logits/rejected": -2.273486852645874,
347
+ "logps/chosen": -422.23419189453125,
348
+ "logps/rejected": -493.7196350097656,
349
+ "loss": 0.4143,
350
+ "rewards/accuracies": 0.7875000238418579,
351
+ "rewards/chosen": -1.3034000396728516,
352
+ "rewards/margins": 1.1734570264816284,
353
+ "rewards/rejected": -2.4768571853637695,
354
+ "step": 220
355
+ },
356
+ {
357
+ "epoch": 0.36106750392464676,
358
+ "grad_norm": 51.06322946002842,
359
+ "learning_rate": 4.034108149278543e-07,
360
+ "logits/chosen": -2.246279716491699,
361
+ "logits/rejected": -2.218812942504883,
362
+ "logps/chosen": -396.13433837890625,
363
+ "logps/rejected": -494.9988708496094,
364
+ "loss": 0.3583,
365
+ "rewards/accuracies": 0.8999999761581421,
366
+ "rewards/chosen": -1.4342329502105713,
367
+ "rewards/margins": 1.3676201105117798,
368
+ "rewards/rejected": -2.8018529415130615,
369
+ "step": 230
370
+ },
371
+ {
372
+ "epoch": 0.37676609105180536,
373
+ "grad_norm": 53.106943652386406,
374
+ "learning_rate": 3.923630753280357e-07,
375
+ "logits/chosen": -2.296950578689575,
376
+ "logits/rejected": -2.193643808364868,
377
+ "logps/chosen": -446.6080017089844,
378
+ "logps/rejected": -504.3912658691406,
379
+ "loss": 0.439,
380
+ "rewards/accuracies": 0.7875000238418579,
381
+ "rewards/chosen": -1.5059573650360107,
382
+ "rewards/margins": 1.111060380935669,
383
+ "rewards/rejected": -2.617017984390259,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.3924646781789639,
388
+ "grad_norm": 31.823002355070333,
389
+ "learning_rate": 3.8088749801071496e-07,
390
+ "logits/chosen": -2.179251194000244,
391
+ "logits/rejected": -2.139425277709961,
392
+ "logps/chosen": -373.7418518066406,
393
+ "logps/rejected": -501.843505859375,
394
+ "loss": 0.369,
395
+ "rewards/accuracies": 0.8500000238418579,
396
+ "rewards/chosen": -1.2382627725601196,
397
+ "rewards/margins": 1.2532708644866943,
398
+ "rewards/rejected": -2.4915337562561035,
399
+ "step": 250
400
+ },
401
+ {
402
+ "epoch": 0.40816326530612246,
403
+ "grad_norm": 57.958160333752495,
404
+ "learning_rate": 3.6901857004211443e-07,
405
+ "logits/chosen": -2.2685585021972656,
406
+ "logits/rejected": -2.1836700439453125,
407
+ "logps/chosen": -438.3352966308594,
408
+ "logps/rejected": -474.6566467285156,
409
+ "loss": 0.3729,
410
+ "rewards/accuracies": 0.8999999761581421,
411
+ "rewards/chosen": -1.387054443359375,
412
+ "rewards/margins": 1.218056559562683,
413
+ "rewards/rejected": -2.6051113605499268,
414
+ "step": 260
415
+ },
416
+ {
417
+ "epoch": 0.423861852433281,
418
+ "grad_norm": 78.46346160026032,
419
+ "learning_rate": 3.5679196060850034e-07,
420
+ "logits/chosen": -2.194472312927246,
421
+ "logits/rejected": -2.1565122604370117,
422
+ "logps/chosen": -470.94024658203125,
423
+ "logps/rejected": -557.4560546875,
424
+ "loss": 0.3988,
425
+ "rewards/accuracies": 0.800000011920929,
426
+ "rewards/chosen": -1.681726098060608,
427
+ "rewards/margins": 1.0995349884033203,
428
+ "rewards/rejected": -2.7812609672546387,
429
+ "step": 270
430
+ },
431
+ {
432
+ "epoch": 0.43956043956043955,
433
+ "grad_norm": 73.42817989092451,
434
+ "learning_rate": 3.4424441382108826e-07,
435
+ "logits/chosen": -2.0928378105163574,
436
+ "logits/rejected": -2.080376148223877,
437
+ "logps/chosen": -437.2099609375,
438
+ "logps/rejected": -575.6405029296875,
439
+ "loss": 0.3881,
440
+ "rewards/accuracies": 0.7749999761581421,
441
+ "rewards/chosen": -1.617456078529358,
442
+ "rewards/margins": 1.3479641675949097,
443
+ "rewards/rejected": -2.9654202461242676,
444
+ "step": 280
445
+ },
446
+ {
447
+ "epoch": 0.4552590266875981,
448
+ "grad_norm": 62.6558696654114,
449
+ "learning_rate": 3.314136382905234e-07,
450
+ "logits/chosen": -2.0131213665008545,
451
+ "logits/rejected": -1.9851818084716797,
452
+ "logps/chosen": -512.6057739257812,
453
+ "logps/rejected": -646.832275390625,
454
+ "loss": 0.4216,
455
+ "rewards/accuracies": 0.7875000238418579,
456
+ "rewards/chosen": -2.3018593788146973,
457
+ "rewards/margins": 1.346274971961975,
458
+ "rewards/rejected": -3.648134708404541,
459
+ "step": 290
460
+ },
461
+ {
462
+ "epoch": 0.47095761381475665,
463
+ "grad_norm": 78.12598055564801,
464
+ "learning_rate": 3.1833819380279023e-07,
465
+ "logits/chosen": -2.0949795246124268,
466
+ "logits/rejected": -1.9874067306518555,
467
+ "logps/chosen": -488.44384765625,
468
+ "logps/rejected": -571.2259521484375,
469
+ "loss": 0.386,
470
+ "rewards/accuracies": 0.8125,
471
+ "rewards/chosen": -2.06516695022583,
472
+ "rewards/margins": 1.3252384662628174,
473
+ "rewards/rejected": -3.3904056549072266,
474
+ "step": 300
475
+ },
476
+ {
477
+ "epoch": 0.48665620094191525,
478
+ "grad_norm": 48.25836081307725,
479
+ "learning_rate": 3.0505737543712275e-07,
480
+ "logits/chosen": -2.1759631633758545,
481
+ "logits/rejected": -2.095853805541992,
482
+ "logps/chosen": -437.9419860839844,
483
+ "logps/rejected": -518.4359130859375,
484
+ "loss": 0.3903,
485
+ "rewards/accuracies": 0.9125000238418579,
486
+ "rewards/chosen": -1.5794050693511963,
487
+ "rewards/margins": 1.1850736141204834,
488
+ "rewards/rejected": -2.7644786834716797,
489
+ "step": 310
490
+ },
491
+ {
492
+ "epoch": 0.5023547880690737,
493
+ "grad_norm": 60.78980701157502,
494
+ "learning_rate": 2.9161109547416667e-07,
495
+ "logits/chosen": -2.1254727840423584,
496
+ "logits/rejected": -2.0983071327209473,
497
+ "logps/chosen": -487.24383544921875,
498
+ "logps/rejected": -561.2527465820312,
499
+ "loss": 0.3585,
500
+ "rewards/accuracies": 0.862500011920929,
501
+ "rewards/chosen": -1.5642211437225342,
502
+ "rewards/margins": 1.3017809391021729,
503
+ "rewards/rejected": -2.866001844406128,
504
+ "step": 320
505
+ },
506
+ {
507
+ "epoch": 0.5180533751962323,
508
+ "grad_norm": 59.58938057948085,
509
+ "learning_rate": 2.780397634492949e-07,
510
+ "logits/chosen": -2.1095099449157715,
511
+ "logits/rejected": -2.0623815059661865,
512
+ "logps/chosen": -420.601318359375,
513
+ "logps/rejected": -561.5743408203125,
514
+ "loss": 0.4054,
515
+ "rewards/accuracies": 0.8500000238418579,
516
+ "rewards/chosen": -1.7670698165893555,
517
+ "rewards/margins": 1.3623578548431396,
518
+ "rewards/rejected": -3.129427671432495,
519
+ "step": 330
520
+ },
521
+ {
522
+ "epoch": 0.533751962323391,
523
+ "grad_norm": 59.60421047453409,
524
+ "learning_rate": 2.6438416471154273e-07,
525
+ "logits/chosen": -2.105985641479492,
526
+ "logits/rejected": -2.1100611686706543,
527
+ "logps/chosen": -455.4019470214844,
528
+ "logps/rejected": -597.48583984375,
529
+ "loss": 0.3559,
530
+ "rewards/accuracies": 0.7875000238418579,
531
+ "rewards/chosen": -1.826162338256836,
532
+ "rewards/margins": 1.1689088344573975,
533
+ "rewards/rejected": -2.9950711727142334,
534
+ "step": 340
535
+ },
536
+ {
537
+ "epoch": 0.5494505494505495,
538
+ "grad_norm": 64.80810897640455,
539
+ "learning_rate": 2.5068533785312666e-07,
540
+ "logits/chosen": -2.1501102447509766,
541
+ "logits/rejected": -2.098646640777588,
542
+ "logps/chosen": -398.44293212890625,
543
+ "logps/rejected": -512.3727416992188,
544
+ "loss": 0.3817,
545
+ "rewards/accuracies": 0.8374999761581421,
546
+ "rewards/chosen": -1.6598011255264282,
547
+ "rewards/margins": 1.3402740955352783,
548
+ "rewards/rejected": -3.000075101852417,
549
+ "step": 350
550
+ },
551
+ {
552
+ "epoch": 0.565149136577708,
553
+ "grad_norm": 66.30622337743506,
554
+ "learning_rate": 2.3698445137790258e-07,
555
+ "logits/chosen": -2.183788776397705,
556
+ "logits/rejected": -2.1399548053741455,
557
+ "logps/chosen": -450.28790283203125,
558
+ "logps/rejected": -570.9381103515625,
559
+ "loss": 0.3441,
560
+ "rewards/accuracies": 0.8999999761581421,
561
+ "rewards/chosen": -1.7852160930633545,
562
+ "rewards/margins": 1.4079563617706299,
563
+ "rewards/rejected": -3.1931726932525635,
564
+ "step": 360
565
+ },
566
+ {
567
+ "epoch": 0.5808477237048666,
568
+ "grad_norm": 74.385047879014,
569
+ "learning_rate": 2.2332267997940513e-07,
570
+ "logits/chosen": -1.9558976888656616,
571
+ "logits/rejected": -1.8889505863189697,
572
+ "logps/chosen": -472.29046630859375,
573
+ "logps/rejected": -564.1910400390625,
574
+ "loss": 0.4144,
575
+ "rewards/accuracies": 0.8125,
576
+ "rewards/chosen": -2.128002405166626,
577
+ "rewards/margins": 1.2228187322616577,
578
+ "rewards/rejected": -3.3508212566375732,
579
+ "step": 370
580
+ },
581
+ {
582
+ "epoch": 0.5965463108320251,
583
+ "grad_norm": 98.38326705187197,
584
+ "learning_rate": 2.0974108080028692e-07,
585
+ "logits/chosen": -2.1048827171325684,
586
+ "logits/rejected": -2.014101266860962,
587
+ "logps/chosen": -472.6429138183594,
588
+ "logps/rejected": -552.962646484375,
589
+ "loss": 0.362,
590
+ "rewards/accuracies": 0.887499988079071,
591
+ "rewards/chosen": -1.7709888219833374,
592
+ "rewards/margins": 1.5231949090957642,
593
+ "rewards/rejected": -3.2941837310791016,
594
+ "step": 380
595
+ },
596
+ {
597
+ "epoch": 0.6122448979591837,
598
+ "grad_norm": 28.798007236746653,
599
+ "learning_rate": 1.962804700450265e-07,
600
+ "logits/chosen": -2.229259490966797,
601
+ "logits/rejected": -2.136399269104004,
602
+ "logps/chosen": -404.88970947265625,
603
+ "logps/rejected": -605.4417724609375,
604
+ "loss": 0.2913,
605
+ "rewards/accuracies": 0.9125000238418579,
606
+ "rewards/chosen": -1.31179940700531,
607
+ "rewards/margins": 2.022050142288208,
608
+ "rewards/rejected": -3.3338496685028076,
609
+ "step": 390
610
+ },
611
+ {
612
+ "epoch": 0.6279434850863422,
613
+ "grad_norm": 47.93461485983235,
614
+ "learning_rate": 1.8298130031671972e-07,
615
+ "logits/chosen": -2.066481351852417,
616
+ "logits/rejected": -2.0052218437194824,
617
+ "logps/chosen": -461.60711669921875,
618
+ "logps/rejected": -593.0902099609375,
619
+ "loss": 0.3182,
620
+ "rewards/accuracies": 0.887499988079071,
621
+ "rewards/chosen": -1.50743567943573,
622
+ "rewards/margins": 1.5388964414596558,
623
+ "rewards/rejected": -3.0463321208953857,
624
+ "step": 400
625
+ },
626
+ {
627
+ "epoch": 0.6436420722135008,
628
+ "grad_norm": 81.64181086704542,
629
+ "learning_rate": 1.6988353904658492e-07,
630
+ "logits/chosen": -2.079695224761963,
631
+ "logits/rejected": -2.0234522819519043,
632
+ "logps/chosen": -435.9869079589844,
633
+ "logps/rejected": -548.7467041015625,
634
+ "loss": 0.5391,
635
+ "rewards/accuracies": 0.7124999761581421,
636
+ "rewards/chosen": -1.8887078762054443,
637
+ "rewards/margins": 1.0673691034317017,
638
+ "rewards/rejected": -2.9560768604278564,
639
+ "step": 410
640
+ },
641
+ {
642
+ "epoch": 0.6593406593406593,
643
+ "grad_norm": 99.80504939653643,
644
+ "learning_rate": 1.570265483815364e-07,
645
+ "logits/chosen": -2.146688222885132,
646
+ "logits/rejected": -2.1056787967681885,
647
+ "logps/chosen": -441.99456787109375,
648
+ "logps/rejected": -486.8045959472656,
649
+ "loss": 0.6064,
650
+ "rewards/accuracies": 0.625,
651
+ "rewards/chosen": -1.862978219985962,
652
+ "rewards/margins": 0.5048335790634155,
653
+ "rewards/rejected": -2.367811679840088,
654
+ "step": 420
655
+ },
656
+ {
657
+ "epoch": 0.6750392464678179,
658
+ "grad_norm": 85.01436657258617,
659
+ "learning_rate": 1.444489668907914e-07,
660
+ "logits/chosen": -2.0182952880859375,
661
+ "logits/rejected": -2.0226969718933105,
662
+ "logps/chosen": -346.7263488769531,
663
+ "logps/rejected": -505.09619140625,
664
+ "loss": 0.5198,
665
+ "rewards/accuracies": 0.800000011920929,
666
+ "rewards/chosen": -1.6009681224822998,
667
+ "rewards/margins": 1.235584020614624,
668
+ "rewards/rejected": -2.836552143096924,
669
+ "step": 430
670
+ },
671
+ {
672
+ "epoch": 0.6907378335949764,
673
+ "grad_norm": 75.62069078299841,
674
+ "learning_rate": 1.3218859344701632e-07,
675
+ "logits/chosen": -2.2084121704101562,
676
+ "logits/rejected": -2.1915016174316406,
677
+ "logps/chosen": -412.80224609375,
678
+ "logps/rejected": -515.23095703125,
679
+ "loss": 0.5238,
680
+ "rewards/accuracies": 0.7875000238418579,
681
+ "rewards/chosen": -1.4952690601348877,
682
+ "rewards/margins": 1.0581085681915283,
683
+ "rewards/rejected": -2.553377628326416,
684
+ "step": 440
685
+ },
686
+ {
687
+ "epoch": 0.706436420722135,
688
+ "grad_norm": 150.50609049436147,
689
+ "learning_rate": 1.202822736309758e-07,
690
+ "logits/chosen": -2.1147706508636475,
691
+ "logits/rejected": -2.0880684852600098,
692
+ "logps/chosen": -442.6726989746094,
693
+ "logps/rejected": -475.3531799316406,
694
+ "loss": 0.5647,
695
+ "rewards/accuracies": 0.75,
696
+ "rewards/chosen": -1.6886441707611084,
697
+ "rewards/margins": 0.8048279881477356,
698
+ "rewards/rejected": -2.4934723377227783,
699
+ "step": 450
700
+ },
701
+ {
702
+ "epoch": 0.7221350078492935,
703
+ "grad_norm": 76.56164006248804,
704
+ "learning_rate": 1.0876578900107053e-07,
705
+ "logits/chosen": -2.2221930027008057,
706
+ "logits/rejected": -2.1485908031463623,
707
+ "logps/chosen": -464.043701171875,
708
+ "logps/rejected": -508.6534118652344,
709
+ "loss": 0.522,
710
+ "rewards/accuracies": 0.6875,
711
+ "rewards/chosen": -1.763662338256836,
712
+ "rewards/margins": 0.7875553965568542,
713
+ "rewards/rejected": -2.551217555999756,
714
+ "step": 460
715
+ },
716
+ {
717
+ "epoch": 0.7378335949764521,
718
+ "grad_norm": 74.89323927902005,
719
+ "learning_rate": 9.767374956053584e-08,
720
+ "logits/chosen": -2.205277919769287,
721
+ "logits/rejected": -2.18218994140625,
722
+ "logps/chosen": -412.97686767578125,
723
+ "logps/rejected": -543.1309204101562,
724
+ "loss": 0.5596,
725
+ "rewards/accuracies": 0.7749999761581421,
726
+ "rewards/chosen": -1.553432822227478,
727
+ "rewards/margins": 0.9423764944076538,
728
+ "rewards/rejected": -2.495809316635132,
729
+ "step": 470
730
+ },
731
+ {
732
+ "epoch": 0.7535321821036107,
733
+ "grad_norm": 87.37555939932336,
734
+ "learning_rate": 8.70394897454659e-08,
735
+ "logits/chosen": -2.2252628803253174,
736
+ "logits/rejected": -2.1848928928375244,
737
+ "logps/chosen": -382.27752685546875,
738
+ "logps/rejected": -457.525390625,
739
+ "loss": 0.547,
740
+ "rewards/accuracies": 0.612500011920929,
741
+ "rewards/chosen": -1.6248447895050049,
742
+ "rewards/margins": 0.4326212406158447,
743
+ "rewards/rejected": -2.0574657917022705,
744
+ "step": 480
745
+ },
746
+ {
747
+ "epoch": 0.7692307692307693,
748
+ "grad_norm": 93.79826007492892,
749
+ "learning_rate": 7.689496824624525e-08,
750
+ "logits/chosen": -2.2677321434020996,
751
+ "logits/rejected": -2.212900161743164,
752
+ "logps/chosen": -403.1687316894531,
753
+ "logps/rejected": -456.62615966796875,
754
+ "loss": 0.5459,
755
+ "rewards/accuracies": 0.7124999761581421,
756
+ "rewards/chosen": -1.4313546419143677,
757
+ "rewards/margins": 0.6928239464759827,
758
+ "rewards/rejected": -2.124178647994995,
759
+ "step": 490
760
+ },
761
+ {
762
+ "epoch": 0.7849293563579278,
763
+ "grad_norm": 56.75850068630019,
764
+ "learning_rate": 6.727067196345099e-08,
765
+ "logits/chosen": -2.3276655673980713,
766
+ "logits/rejected": -2.2189202308654785,
767
+ "logps/chosen": -438.74267578125,
768
+ "logps/rejected": -478.62298583984375,
769
+ "loss": 0.5453,
770
+ "rewards/accuracies": 0.75,
771
+ "rewards/chosen": -1.3515713214874268,
772
+ "rewards/margins": 1.0120915174484253,
773
+ "rewards/rejected": -2.3636624813079834,
774
+ "step": 500
775
+ },
776
+ {
777
+ "epoch": 0.8006279434850864,
778
+ "grad_norm": 97.05649014340479,
779
+ "learning_rate": 5.8195524386862374e-08,
780
+ "logits/chosen": -2.311316967010498,
781
+ "logits/rejected": -2.242011547088623,
782
+ "logps/chosen": -362.28399658203125,
783
+ "logps/rejected": -465.5743103027344,
784
+ "loss": 0.4852,
785
+ "rewards/accuracies": 0.800000011920929,
786
+ "rewards/chosen": -1.409887433052063,
787
+ "rewards/margins": 0.9168035387992859,
788
+ "rewards/rejected": -2.326690912246704,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 0.8163265306122449,
793
+ "grad_norm": 127.5887173764836,
794
+ "learning_rate": 4.969679867292276e-08,
795
+ "logits/chosen": -2.2007040977478027,
796
+ "logits/rejected": -2.138988971710205,
797
+ "logps/chosen": -413.65447998046875,
798
+ "logps/rejected": -489.22540283203125,
799
+ "loss": 0.4985,
800
+ "rewards/accuracies": 0.762499988079071,
801
+ "rewards/chosen": -1.3157858848571777,
802
+ "rewards/margins": 1.0105135440826416,
803
+ "rewards/rejected": -2.3262994289398193,
804
+ "step": 520
805
+ },
806
+ {
807
+ "epoch": 0.8320251177394035,
808
+ "grad_norm": 92.31568060854485,
809
+ "learning_rate": 4.180003568187776e-08,
810
+ "logits/chosen": -2.2751996517181396,
811
+ "logits/rejected": -2.2204182147979736,
812
+ "logps/chosen": -437.9576110839844,
813
+ "logps/rejected": -478.0186462402344,
814
+ "loss": 0.5193,
815
+ "rewards/accuracies": 0.699999988079071,
816
+ "rewards/chosen": -1.3832905292510986,
817
+ "rewards/margins": 0.656178891658783,
818
+ "rewards/rejected": -2.0394692420959473,
819
+ "step": 530
820
+ },
821
+ {
822
+ "epoch": 0.847723704866562,
823
+ "grad_norm": 93.73800900430595,
824
+ "learning_rate": 3.452896722091128e-08,
825
+ "logits/chosen": -2.2862913608551025,
826
+ "logits/rejected": -2.1965749263763428,
827
+ "logps/chosen": -418.68218994140625,
828
+ "logps/rejected": -488.07208251953125,
829
+ "loss": 0.549,
830
+ "rewards/accuracies": 0.762499988079071,
831
+ "rewards/chosen": -1.4643903970718384,
832
+ "rewards/margins": 0.8227276802062988,
833
+ "rewards/rejected": -2.2871181964874268,
834
+ "step": 540
835
+ },
836
+ {
837
+ "epoch": 0.8634222919937206,
838
+ "grad_norm": 74.68277239397396,
839
+ "learning_rate": 2.7905444723949762e-08,
840
+ "logits/chosen": -2.2326064109802246,
841
+ "logits/rejected": -2.1876773834228516,
842
+ "logps/chosen": -434.93377685546875,
843
+ "logps/rejected": -486.3379821777344,
844
+ "loss": 0.5464,
845
+ "rewards/accuracies": 0.7124999761581421,
846
+ "rewards/chosen": -1.5688042640686035,
847
+ "rewards/margins": 0.6943384408950806,
848
+ "rewards/rejected": -2.2631428241729736,
849
+ "step": 550
850
+ },
851
+ {
852
+ "epoch": 0.8791208791208791,
853
+ "grad_norm": 98.95496114421825,
854
+ "learning_rate": 2.194937358247506e-08,
855
+ "logits/chosen": -2.314251661300659,
856
+ "logits/rejected": -2.282222032546997,
857
+ "logps/chosen": -444.4891662597656,
858
+ "logps/rejected": -482.2521057128906,
859
+ "loss": 0.5058,
860
+ "rewards/accuracies": 0.7875000238418579,
861
+ "rewards/chosen": -1.4890687465667725,
862
+ "rewards/margins": 0.7982088327407837,
863
+ "rewards/rejected": -2.2872776985168457,
864
+ "step": 560
865
+ },
866
+ {
867
+ "epoch": 0.8948194662480377,
868
+ "grad_norm": 93.93456697380334,
869
+ "learning_rate": 1.6678653324693787e-08,
870
+ "logits/chosen": -2.271334171295166,
871
+ "logits/rejected": -2.243417263031006,
872
+ "logps/chosen": -379.41131591796875,
873
+ "logps/rejected": -474.8644104003906,
874
+ "loss": 0.5554,
875
+ "rewards/accuracies": 0.737500011920929,
876
+ "rewards/chosen": -1.2463254928588867,
877
+ "rewards/margins": 1.0109448432922363,
878
+ "rewards/rejected": -2.257270336151123,
879
+ "step": 570
880
+ },
881
+ {
882
+ "epoch": 0.9105180533751962,
883
+ "grad_norm": 63.908735732483215,
884
+ "learning_rate": 1.2109123822844653e-08,
885
+ "logits/chosen": -2.2195041179656982,
886
+ "logits/rejected": -2.152104139328003,
887
+ "logps/chosen": -424.376220703125,
888
+ "logps/rejected": -493.041748046875,
889
+ "loss": 0.5407,
890
+ "rewards/accuracies": 0.75,
891
+ "rewards/chosen": -1.484989047050476,
892
+ "rewards/margins": 0.8357736468315125,
893
+ "rewards/rejected": -2.3207626342773438,
894
+ "step": 580
895
+ },
896
+ {
897
+ "epoch": 0.9262166405023547,
898
+ "grad_norm": 77.29245256601979,
899
+ "learning_rate": 8.254517690300944e-09,
900
+ "logits/chosen": -2.1921186447143555,
901
+ "logits/rejected": -2.160266876220703,
902
+ "logps/chosen": -358.38665771484375,
903
+ "logps/rejected": -419.24053955078125,
904
+ "loss": 0.5065,
905
+ "rewards/accuracies": 0.699999988079071,
906
+ "rewards/chosen": -1.4891427755355835,
907
+ "rewards/margins": 0.6564360857009888,
908
+ "rewards/rejected": -2.1455788612365723,
909
+ "step": 590
910
+ },
911
+ {
912
+ "epoch": 0.9419152276295133,
913
+ "grad_norm": 85.49976100967837,
914
+ "learning_rate": 5.126419011529992e-09,
915
+ "logits/chosen": -2.247999668121338,
916
+ "logits/rejected": -2.1932740211486816,
917
+ "logps/chosen": -437.494384765625,
918
+ "logps/rejected": -485.1595764160156,
919
+ "loss": 0.4877,
920
+ "rewards/accuracies": 0.737500011920929,
921
+ "rewards/chosen": -1.302431583404541,
922
+ "rewards/margins": 0.9456756711006165,
923
+ "rewards/rejected": -2.2481071949005127,
924
+ "step": 600
925
+ },
926
+ {
927
+ "epoch": 0.957613814756672,
928
+ "grad_norm": 107.98710002534753,
929
+ "learning_rate": 2.734228528934679e-09,
930
+ "logits/chosen": -2.124533176422119,
931
+ "logits/rejected": -2.1044251918792725,
932
+ "logps/chosen": -403.1171569824219,
933
+ "logps/rejected": -488.7078552246094,
934
+ "loss": 0.5816,
935
+ "rewards/accuracies": 0.6875,
936
+ "rewards/chosen": -1.370384931564331,
937
+ "rewards/margins": 0.671573281288147,
938
+ "rewards/rejected": -2.0419580936431885,
939
+ "step": 610
940
+ },
941
+ {
942
+ "epoch": 0.9733124018838305,
943
+ "grad_norm": 62.944406149509454,
944
+ "learning_rate": 1.0851353912008642e-09,
945
+ "logits/chosen": -2.2792160511016846,
946
+ "logits/rejected": -2.1857845783233643,
947
+ "logps/chosen": -420.2891540527344,
948
+ "logps/rejected": -431.2037658691406,
949
+ "loss": 0.5459,
950
+ "rewards/accuracies": 0.737500011920929,
951
+ "rewards/chosen": -1.3576745986938477,
952
+ "rewards/margins": 0.5345790982246399,
953
+ "rewards/rejected": -1.8922535181045532,
954
+ "step": 620
955
+ },
956
+ {
957
+ "epoch": 0.989010989010989,
958
+ "grad_norm": 81.15926557617168,
959
+ "learning_rate": 1.840955480532924e-10,
960
+ "logits/chosen": -2.2514820098876953,
961
+ "logits/rejected": -2.2048940658569336,
962
+ "logps/chosen": -372.85504150390625,
963
+ "logps/rejected": -425.1669921875,
964
+ "loss": 0.5177,
965
+ "rewards/accuracies": 0.762499988079071,
966
+ "rewards/chosen": -1.1938732862472534,
967
+ "rewards/margins": 0.8971937894821167,
968
+ "rewards/rejected": -2.091067314147949,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 1.0,
973
+ "step": 637,
974
  "total_flos": 0.0,
975
+ "train_loss": 0.3818549155440398,
976
+ "train_runtime": 4714.6907,
977
+ "train_samples_per_second": 3.242,
978
+ "train_steps_per_second": 0.135
979
  }
980
  ],
981
  "logging_steps": 10,
982
+ "max_steps": 637,
983
  "num_input_tokens_seen": 0,
984
  "num_train_epochs": 1,
985
  "save_steps": 100,