wzhouad commited on
Commit
3e15dd4
1 Parent(s): a71d566

Model save

Browse files
README.md CHANGED
@@ -35,7 +35,7 @@ The following hyperparameters were used during training:
35
  - learning_rate: 1e-06
36
  - train_batch_size: 2
37
  - eval_batch_size: 8
38
- - seed: 2
39
  - distributed_type: multi-GPU
40
  - num_devices: 8
41
  - gradient_accumulation_steps: 8
 
35
  - learning_rate: 1e-06
36
  - train_batch_size: 2
37
  - eval_batch_size: 8
38
+ - seed: 1
39
  - distributed_type: multi-GPU
40
  - num_devices: 8
41
  - gradient_accumulation_steps: 8
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.5407548575334146,
4
- "train_runtime": 10382.7821,
5
- "train_samples": 45548,
6
- "train_samples_per_second": 8.774,
7
- "train_steps_per_second": 0.068
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.050850671487596796,
4
+ "train_runtime": 12712.7589,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 9.618,
7
+ "train_steps_per_second": 0.075
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5fe4288e0a548624dde43de0a52827f920f75036371a401f4802c6559686b85
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7618f7999999ef60b2bfe93ea1451f00269f72eacb4df7c6c113ae154adca33a
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05a91ac86fed729b227a85c23a32b80f447f76f43e88b8fc71ab73e747c36331
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e1ad781186cb8bf880a39e7274512b7b6981f9be52ef8007de07ccca61620c
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3527625878dae3b5d2ff8b6e7b96aa2f5fd8b0df8ba1c760622ef3f9fc8f4e5
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2443b4320da98c5727b93698fb94167c01ff8f426fb8289a970af245b8f93022
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cb34bcd04c59efbf8ba150596a40f797ad20adebd35f851806967b3528b4756
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60ade571941bdba1e5f8c4d7b6dd59bc051248e9fca85c5e2cff4d393bd5ead
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.5407548575334146,
4
- "train_runtime": 10382.7821,
5
- "train_samples": 45548,
6
- "train_samples_per_second": 8.774,
7
- "train_steps_per_second": 0.068
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.050850671487596796,
4
+ "train_runtime": 12712.7589,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 9.618,
7
+ "train_steps_per_second": 0.075
8
  }
trainer_state.json CHANGED
@@ -1,1019 +1,1355 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9950825430277486,
5
  "eval_steps": 10000,
6
- "global_step": 710,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "learning_rate": 1.4084507042253522e-07,
14
- "logits/chosen": -0.024068962782621384,
15
- "logits/rejected": 0.04506533965468407,
16
- "logps/chosen": -317.1720275878906,
17
- "logps/rejected": -207.3963623046875,
18
- "loss": 0.6933,
19
- "rewards/accuracies": 0.4124999940395355,
20
- "rewards/chosen": -0.0009543737396597862,
21
- "rewards/margins": -0.0014725492801517248,
22
- "rewards/rejected": 0.0005181756569072604,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.06,
27
- "learning_rate": 2.8169014084507043e-07,
28
- "logits/chosen": -0.04674551263451576,
29
- "logits/rejected": -0.010908829048275948,
30
- "logps/chosen": -294.0361328125,
31
- "logps/rejected": -201.79061889648438,
32
- "loss": 0.6914,
33
- "rewards/accuracies": 0.5874999761581421,
34
- "rewards/chosen": 0.0003093578852713108,
35
- "rewards/margins": 0.004586204886436462,
36
- "rewards/rejected": -0.004276847001165152,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.08,
41
- "learning_rate": 4.225352112676056e-07,
42
- "logits/chosen": -0.04512999951839447,
43
- "logits/rejected": 0.007197662256658077,
44
- "logps/chosen": -361.146240234375,
45
- "logps/rejected": -249.73080444335938,
46
- "loss": 0.6841,
47
- "rewards/accuracies": 0.5625,
48
- "rewards/chosen": -0.001304158358834684,
49
- "rewards/margins": 0.015663906931877136,
50
- "rewards/rejected": -0.01696806587278843,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.11,
55
- "learning_rate": 5.633802816901409e-07,
56
- "logits/chosen": -0.04949279874563217,
57
- "logits/rejected": 0.00024662315263412893,
58
- "logps/chosen": -291.42913818359375,
59
- "logps/rejected": -196.7120361328125,
60
- "loss": 0.6759,
61
- "rewards/accuracies": 0.5874999761581421,
62
- "rewards/chosen": -0.015403245575726032,
63
- "rewards/margins": 0.05249834060668945,
64
- "rewards/rejected": -0.06790158897638321,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.14,
69
- "learning_rate": 7.04225352112676e-07,
70
- "logits/chosen": 0.02841530181467533,
71
- "logits/rejected": 0.0736595019698143,
72
- "logps/chosen": -373.4976806640625,
73
- "logps/rejected": -227.70068359375,
74
- "loss": 0.665,
75
- "rewards/accuracies": 0.606249988079071,
76
- "rewards/chosen": -0.05390547588467598,
77
- "rewards/margins": 0.11358483880758286,
78
- "rewards/rejected": -0.16749031841754913,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.17,
83
- "learning_rate": 8.450704225352112e-07,
84
- "logits/chosen": -0.03540544956922531,
85
- "logits/rejected": 0.025740886107087135,
86
- "logps/chosen": -323.0583801269531,
87
- "logps/rejected": -230.3257293701172,
88
- "loss": 0.6567,
89
- "rewards/accuracies": 0.5874999761581421,
90
- "rewards/chosen": -0.03229106217622757,
91
- "rewards/margins": 0.1074444055557251,
92
- "rewards/rejected": -0.13973546028137207,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.2,
97
- "learning_rate": 9.859154929577465e-07,
98
- "logits/chosen": -0.06292358785867691,
99
- "logits/rejected": -0.009262708015739918,
100
- "logps/chosen": -354.81341552734375,
101
- "logps/rejected": -255.65560913085938,
102
- "loss": 0.6477,
103
- "rewards/accuracies": 0.6187499761581421,
104
- "rewards/chosen": -0.03358080983161926,
105
- "rewards/margins": 0.17669352889060974,
106
- "rewards/rejected": -0.210274338722229,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.22,
111
- "learning_rate": 9.995106132599868e-07,
112
- "logits/chosen": 0.03767090290784836,
113
- "logits/rejected": 0.09468533098697662,
114
- "logps/chosen": -319.8956298828125,
115
- "logps/rejected": -235.5646514892578,
116
- "loss": 0.667,
117
- "rewards/accuracies": 0.5562499761581421,
118
- "rewards/chosen": -0.07292340695858002,
119
- "rewards/margins": 0.1298426389694214,
120
- "rewards/rejected": -0.2027660608291626,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.25,
125
- "learning_rate": 9.978201358980644e-07,
126
- "logits/chosen": -0.016046693548560143,
127
- "logits/rejected": 0.04123395308852196,
128
- "logps/chosen": -361.1316223144531,
129
- "logps/rejected": -279.39520263671875,
130
- "loss": 0.6491,
131
- "rewards/accuracies": 0.625,
132
- "rewards/chosen": 0.030158694833517075,
133
- "rewards/margins": 0.1393767148256302,
134
- "rewards/rejected": -0.10921802371740341,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.28,
139
- "learning_rate": 9.949266103908894e-07,
140
- "logits/chosen": -0.01002789568156004,
141
- "logits/rejected": 0.055082082748413086,
142
- "logps/chosen": -275.36773681640625,
143
- "logps/rejected": -211.86721801757812,
144
- "loss": 0.6472,
145
- "rewards/accuracies": 0.53125,
146
- "rewards/chosen": 0.04291190952062607,
147
- "rewards/margins": 0.11093775182962418,
148
- "rewards/rejected": -0.06802584230899811,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.31,
153
- "learning_rate": 9.908370293252287e-07,
154
- "logits/chosen": -0.008449924178421497,
155
- "logits/rejected": 0.018720177933573723,
156
- "logps/chosen": -323.53802490234375,
157
- "logps/rejected": -244.490966796875,
158
- "loss": 0.6407,
159
- "rewards/accuracies": 0.59375,
160
- "rewards/chosen": 0.07049892842769623,
161
- "rewards/margins": 0.1456976681947708,
162
- "rewards/rejected": -0.07519873231649399,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.34,
167
- "learning_rate": 9.855612757141654e-07,
168
- "logits/chosen": -0.07508296519517899,
169
- "logits/rejected": -0.012953217141330242,
170
- "logps/chosen": -336.1876525878906,
171
- "logps/rejected": -208.52011108398438,
172
- "loss": 0.6333,
173
- "rewards/accuracies": 0.65625,
174
- "rewards/chosen": 0.03278186544775963,
175
- "rewards/margins": 0.26016712188720703,
176
- "rewards/rejected": -0.2273852527141571,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.37,
181
- "learning_rate": 9.791120991134902e-07,
182
- "logits/chosen": -0.10753818601369858,
183
- "logits/rejected": -0.01082658488303423,
184
- "logps/chosen": -376.611083984375,
185
- "logps/rejected": -251.7242889404297,
186
- "loss": 0.6491,
187
- "rewards/accuracies": 0.668749988079071,
188
- "rewards/chosen": -0.026872599497437477,
189
- "rewards/margins": 0.2888794541358948,
190
- "rewards/rejected": -0.3157520890235901,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.39,
195
- "learning_rate": 9.715050848107168e-07,
196
- "logits/chosen": -0.17280681431293488,
197
- "logits/rejected": -0.0874427855014801,
198
- "logps/chosen": -342.14593505859375,
199
- "logps/rejected": -229.78652954101562,
200
- "loss": 0.6346,
201
- "rewards/accuracies": 0.643750011920929,
202
- "rewards/chosen": -0.014674955978989601,
203
- "rewards/margins": 0.2581936717033386,
204
- "rewards/rejected": -0.2728686034679413,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.42,
209
- "learning_rate": 9.627586161611731e-07,
210
- "logits/chosen": -0.09239596873521805,
211
- "logits/rejected": -0.035311758518218994,
212
- "logps/chosen": -282.2192077636719,
213
- "logps/rejected": -214.80810546875,
214
- "loss": 0.6501,
215
- "rewards/accuracies": 0.6499999761581421,
216
- "rewards/chosen": -0.03641432523727417,
217
- "rewards/margins": 0.19125035405158997,
218
- "rewards/rejected": -0.22766467928886414,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.45,
223
- "learning_rate": 9.528938301621955e-07,
224
- "logits/chosen": -0.168453648686409,
225
- "logits/rejected": -0.1073572039604187,
226
- "logps/chosen": -341.64599609375,
227
- "logps/rejected": -223.13101196289062,
228
- "loss": 0.6342,
229
- "rewards/accuracies": 0.6312500238418579,
230
- "rewards/chosen": 0.01668923906981945,
231
- "rewards/margins": 0.19806070625782013,
232
- "rewards/rejected": -0.18137145042419434,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.48,
237
- "learning_rate": 9.419345663727804e-07,
238
- "logits/chosen": -0.13118991255760193,
239
- "logits/rejected": -0.0702921599149704,
240
- "logps/chosen": -344.593505859375,
241
- "logps/rejected": -231.71804809570312,
242
- "loss": 0.6221,
243
- "rewards/accuracies": 0.65625,
244
- "rewards/chosen": 0.004311258438974619,
245
- "rewards/margins": 0.2375575602054596,
246
- "rewards/rejected": -0.23324629664421082,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.51,
251
- "learning_rate": 9.299073093021404e-07,
252
- "logits/chosen": -0.05861334875226021,
253
- "logits/rejected": 0.004674489144235849,
254
- "logps/chosen": -355.00872802734375,
255
- "logps/rejected": -255.2248992919922,
256
- "loss": 0.6285,
257
- "rewards/accuracies": 0.6499999761581421,
258
- "rewards/chosen": -0.13683992624282837,
259
- "rewards/margins": 0.26262304186820984,
260
- "rewards/rejected": -0.3994629979133606,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 0.53,
265
- "learning_rate": 9.168411244063861e-07,
266
- "logits/chosen": -0.15201464295387268,
267
- "logits/rejected": -0.08699367940425873,
268
- "logps/chosen": -369.53936767578125,
269
- "logps/rejected": -261.6515197753906,
270
- "loss": 0.6253,
271
- "rewards/accuracies": 0.637499988079071,
272
- "rewards/chosen": -0.18763788044452667,
273
- "rewards/margins": 0.2461237609386444,
274
- "rewards/rejected": -0.4337615966796875,
275
  "step": 190
276
  },
277
  {
278
- "epoch": 0.56,
279
- "learning_rate": 9.02767587848013e-07,
280
- "logits/chosen": -0.1659875363111496,
281
- "logits/rejected": -0.11425626277923584,
282
- "logps/chosen": -322.26971435546875,
283
- "logps/rejected": -269.41571044921875,
284
- "loss": 0.6377,
285
- "rewards/accuracies": 0.5874999761581421,
286
- "rewards/chosen": -0.1545395851135254,
287
- "rewards/margins": 0.16578510403633118,
288
- "rewards/rejected": -0.32032471895217896,
289
  "step": 200
290
  },
291
  {
292
- "epoch": 0.59,
293
- "learning_rate": 8.877207101879301e-07,
294
- "logits/chosen": -0.12081719934940338,
295
- "logits/rejected": -0.055270951241254807,
296
- "logps/chosen": -303.1000061035156,
297
- "logps/rejected": -226.3379364013672,
298
- "loss": 0.6265,
299
- "rewards/accuracies": 0.643750011920929,
300
- "rewards/chosen": -0.1504133641719818,
301
- "rewards/margins": 0.2184509038925171,
302
- "rewards/rejected": -0.3688642680644989,
303
  "step": 210
304
  },
305
  {
306
- "epoch": 0.62,
307
- "learning_rate": 8.717368541944452e-07,
308
- "logits/chosen": -0.14912042021751404,
309
- "logits/rejected": -0.11508848518133163,
310
- "logps/chosen": -313.95745849609375,
311
- "logps/rejected": -232.803955078125,
312
- "loss": 0.6245,
313
- "rewards/accuracies": 0.606249988079071,
314
- "rewards/chosen": -0.17989172041416168,
315
- "rewards/margins": 0.21268931031227112,
316
- "rewards/rejected": -0.3925810158252716,
317
  "step": 220
318
  },
319
  {
320
- "epoch": 0.65,
321
- "learning_rate": 8.54854646967831e-07,
322
- "logits/chosen": -0.20850150287151337,
323
- "logits/rejected": -0.14839352667331696,
324
- "logps/chosen": -351.56768798828125,
325
- "logps/rejected": -286.0954284667969,
326
- "loss": 0.6232,
327
- "rewards/accuracies": 0.6937500238418579,
328
- "rewards/chosen": -0.20482465624809265,
329
- "rewards/margins": 0.2566435933113098,
330
- "rewards/rejected": -0.46146830916404724,
331
  "step": 230
332
  },
333
  {
334
- "epoch": 0.67,
335
- "learning_rate": 8.371148865928318e-07,
336
- "logits/chosen": -0.13414549827575684,
337
- "logits/rejected": -0.09452676773071289,
338
- "logps/chosen": -334.4315490722656,
339
- "logps/rejected": -271.9152526855469,
340
- "loss": 0.6104,
341
- "rewards/accuracies": 0.612500011920929,
342
- "rewards/chosen": -0.18954138457775116,
343
- "rewards/margins": 0.24784322082996368,
344
- "rewards/rejected": -0.43738460540771484,
345
  "step": 240
346
  },
347
  {
348
- "epoch": 0.7,
349
- "learning_rate": 8.185604435447001e-07,
350
- "logits/chosen": -0.08637334406375885,
351
- "logits/rejected": -0.03546728193759918,
352
- "logps/chosen": -349.38787841796875,
353
- "logps/rejected": -278.61761474609375,
354
- "loss": 0.6238,
355
- "rewards/accuracies": 0.6625000238418579,
356
- "rewards/chosen": -0.2417462319135666,
357
- "rewards/margins": 0.2916719615459442,
358
- "rewards/rejected": -0.5334181785583496,
359
  "step": 250
360
  },
361
  {
362
- "epoch": 0.73,
363
- "learning_rate": 7.992361570870287e-07,
364
- "logits/chosen": -0.15081752836704254,
365
- "logits/rejected": -0.11529238522052765,
366
- "logps/chosen": -311.26507568359375,
367
- "logps/rejected": -253.03659057617188,
368
- "loss": 0.625,
369
- "rewards/accuracies": 0.668749988079071,
370
- "rewards/chosen": -0.34321683645248413,
371
- "rewards/margins": 0.19419686496257782,
372
- "rewards/rejected": -0.5374137163162231,
373
  "step": 260
374
  },
375
  {
376
- "epoch": 0.76,
377
- "learning_rate": 7.791887269117441e-07,
378
- "logits/chosen": -0.08736265450716019,
379
- "logits/rejected": 0.010209694504737854,
380
- "logps/chosen": -350.77197265625,
381
- "logps/rejected": -249.4473114013672,
382
- "loss": 0.6198,
383
- "rewards/accuracies": 0.668749988079071,
384
- "rewards/chosen": -0.33235999941825867,
385
- "rewards/margins": 0.3289110064506531,
386
- "rewards/rejected": -0.6612709760665894,
387
  "step": 270
388
  },
389
  {
390
- "epoch": 0.79,
391
- "learning_rate": 7.584666002831294e-07,
392
- "logits/chosen": -0.17468181252479553,
393
- "logits/rejected": -0.10544611513614655,
394
- "logps/chosen": -393.7838439941406,
395
- "logps/rejected": -275.46636962890625,
396
- "loss": 0.6284,
397
- "rewards/accuracies": 0.7250000238418579,
398
- "rewards/chosen": -0.20970232784748077,
399
- "rewards/margins": 0.31893718242645264,
400
- "rewards/rejected": -0.5286394357681274,
401
  "step": 280
402
  },
403
  {
404
- "epoch": 0.81,
405
- "learning_rate": 7.37119854958609e-07,
406
- "logits/chosen": -0.18520912528038025,
407
- "logits/rejected": -0.12915001809597015,
408
- "logps/chosen": -372.53961181640625,
409
- "logps/rejected": -301.4205017089844,
410
- "loss": 0.6426,
411
- "rewards/accuracies": 0.675000011920929,
412
- "rewards/chosen": -0.3479081690311432,
413
- "rewards/margins": 0.25431200861930847,
414
- "rewards/rejected": -0.6022201776504517,
415
  "step": 290
416
  },
417
  {
418
- "epoch": 0.84,
419
- "learning_rate": 7.152000781692285e-07,
420
- "logits/chosen": -0.20307299494743347,
421
- "logits/rejected": -0.1757517158985138,
422
- "logps/chosen": -368.33392333984375,
423
- "logps/rejected": -302.0033874511719,
424
- "loss": 0.6219,
425
- "rewards/accuracies": 0.6499999761581421,
426
- "rewards/chosen": -0.2545304298400879,
427
- "rewards/margins": 0.2229403257369995,
428
- "rewards/rejected": -0.4774707853794098,
429
  "step": 300
430
  },
431
  {
432
- "epoch": 0.87,
433
- "learning_rate": 6.927602419522946e-07,
434
- "logits/chosen": -0.15039893984794617,
435
- "logits/rejected": -0.1383979469537735,
436
- "logps/chosen": -319.64349365234375,
437
- "logps/rejected": -265.85198974609375,
438
- "loss": 0.624,
439
- "rewards/accuracies": 0.59375,
440
- "rewards/chosen": -0.3667163848876953,
441
- "rewards/margins": 0.18039202690124512,
442
- "rewards/rejected": -0.5471083521842957,
443
  "step": 310
444
  },
445
  {
446
- "epoch": 0.9,
447
- "learning_rate": 6.698545751374463e-07,
448
- "logits/chosen": -0.25044673681259155,
449
- "logits/rejected": -0.12538060545921326,
450
- "logps/chosen": -413.25946044921875,
451
- "logps/rejected": -303.199951171875,
452
- "loss": 0.6216,
453
- "rewards/accuracies": 0.6937500238418579,
454
- "rewards/chosen": -0.3125981390476227,
455
- "rewards/margins": 0.38619184494018555,
456
- "rewards/rejected": -0.6987899541854858,
457
  "step": 320
458
  },
459
  {
460
- "epoch": 0.93,
461
- "learning_rate": 6.465384322955224e-07,
462
- "logits/chosen": -0.1757752150297165,
463
- "logits/rejected": -0.11078636348247528,
464
- "logps/chosen": -362.4681091308594,
465
- "logps/rejected": -267.5201416015625,
466
- "loss": 0.6074,
467
- "rewards/accuracies": 0.7250000238418579,
468
- "rewards/chosen": -0.33040475845336914,
469
- "rewards/margins": 0.37607377767562866,
470
- "rewards/rejected": -0.7064785957336426,
471
  "step": 330
472
  },
473
  {
474
- "epoch": 0.96,
475
- "learning_rate": 6.228681599669248e-07,
476
- "logits/chosen": -0.19059506058692932,
477
- "logits/rejected": -0.09184812009334564,
478
- "logps/chosen": -430.0191345214844,
479
- "logps/rejected": -296.02557373046875,
480
- "loss": 0.617,
481
- "rewards/accuracies": 0.6875,
482
- "rewards/chosen": -0.3559810519218445,
483
- "rewards/margins": 0.3992285132408142,
484
- "rewards/rejected": -0.7552096247673035,
485
  "step": 340
486
  },
487
  {
488
- "epoch": 0.98,
489
- "learning_rate": 5.989009604927586e-07,
490
- "logits/chosen": -0.1337263584136963,
491
- "logits/rejected": -0.01602357253432274,
492
- "logps/chosen": -391.5013122558594,
493
- "logps/rejected": -306.56982421875,
494
- "loss": 0.5985,
495
- "rewards/accuracies": 0.7124999761581421,
496
- "rewards/chosen": -0.3254554867744446,
497
- "rewards/margins": 0.4386405944824219,
498
- "rewards/rejected": -0.7640960812568665,
499
  "step": 350
500
  },
501
  {
502
- "epoch": 1.01,
503
- "learning_rate": 5.74694753777815e-07,
504
- "logits/chosen": -0.13974535465240479,
505
- "logits/rejected": -0.11217441409826279,
506
- "logps/chosen": -327.7983703613281,
507
- "logps/rejected": -279.513916015625,
508
- "loss": 0.545,
509
- "rewards/accuracies": 0.7437499761581421,
510
- "rewards/chosen": -0.42400461435317993,
511
- "rewards/margins": 0.4070449769496918,
512
- "rewards/rejected": -0.8310495615005493,
513
  "step": 360
514
  },
515
  {
516
- "epoch": 1.04,
517
- "learning_rate": 5.503080373194666e-07,
518
- "logits/chosen": -0.2235272228717804,
519
- "logits/rejected": -0.12439509481191635,
520
- "logps/chosen": -374.24261474609375,
521
- "logps/rejected": -333.127685546875,
522
- "loss": 0.4827,
523
- "rewards/accuracies": 0.862500011920929,
524
- "rewards/chosen": -0.4394102096557617,
525
- "rewards/margins": 0.6707636713981628,
526
- "rewards/rejected": -1.1101738214492798,
527
  "step": 370
528
  },
529
  {
530
- "epoch": 1.07,
531
- "learning_rate": 5.257997448407366e-07,
532
- "logits/chosen": -0.12490369379520416,
533
- "logits/rejected": 0.020254041999578476,
534
- "logps/chosen": -408.45367431640625,
535
- "logps/rejected": -308.180908203125,
536
- "loss": 0.4897,
537
- "rewards/accuracies": 0.824999988079071,
538
- "rewards/chosen": -0.4755324423313141,
539
- "rewards/margins": 0.6596079468727112,
540
- "rewards/rejected": -1.1351404190063477,
541
  "step": 380
542
  },
543
  {
544
- "epoch": 1.1,
545
- "learning_rate": 5.012291038691665e-07,
546
- "logits/chosen": -0.15194548666477203,
547
- "logits/rejected": -0.03352439031004906,
548
- "logps/chosen": -419.17730712890625,
549
- "logps/rejected": -391.656982421875,
550
- "loss": 0.4746,
551
- "rewards/accuracies": 0.8999999761581421,
552
- "rewards/chosen": -0.6772257089614868,
553
- "rewards/margins": 0.8299336433410645,
554
- "rewards/rejected": -1.5071594715118408,
555
  "step": 390
556
  },
557
  {
558
- "epoch": 1.12,
559
- "learning_rate": 4.7665549260567063e-07,
560
- "logits/chosen": -0.20493462681770325,
561
- "logits/rejected": -0.13634856045246124,
562
- "logps/chosen": -411.919189453125,
563
- "logps/rejected": -344.78759765625,
564
- "loss": 0.4713,
565
- "rewards/accuracies": 0.8500000238418579,
566
- "rewards/chosen": -0.7057473063468933,
567
- "rewards/margins": 0.824458122253418,
568
- "rewards/rejected": -1.530205249786377,
569
  "step": 400
570
  },
571
  {
572
- "epoch": 1.15,
573
- "learning_rate": 4.521382964292663e-07,
574
- "logits/chosen": -0.14551517367362976,
575
- "logits/rejected": -0.07351706176996231,
576
- "logps/chosen": -394.6050109863281,
577
- "logps/rejected": -345.28082275390625,
578
- "loss": 0.4741,
579
- "rewards/accuracies": 0.84375,
580
- "rewards/chosen": -0.6415594816207886,
581
- "rewards/margins": 0.7878659963607788,
582
- "rewards/rejected": -1.4294254779815674,
583
  "step": 410
584
  },
585
  {
586
- "epoch": 1.18,
587
- "learning_rate": 4.277367643844574e-07,
588
- "logits/chosen": -0.1973930448293686,
589
- "logits/rejected": -0.10258449614048004,
590
- "logps/chosen": -446.4418029785156,
591
- "logps/rejected": -366.457763671875,
592
- "loss": 0.4605,
593
- "rewards/accuracies": 0.862500011920929,
594
- "rewards/chosen": -0.8334900736808777,
595
- "rewards/margins": 0.7750234603881836,
596
- "rewards/rejected": -1.608513593673706,
597
  "step": 420
598
  },
599
  {
600
- "epoch": 1.21,
601
- "learning_rate": 4.035098659980891e-07,
602
- "logits/chosen": -0.1986023634672165,
603
- "logits/rejected": -0.06475992500782013,
604
- "logps/chosen": -419.17010498046875,
605
- "logps/rejected": -395.53509521484375,
606
- "loss": 0.465,
607
- "rewards/accuracies": 0.8062499761581421,
608
- "rewards/chosen": -1.0268628597259521,
609
- "rewards/margins": 0.7243725657463074,
610
- "rewards/rejected": -1.7512352466583252,
611
  "step": 430
612
  },
613
  {
614
- "epoch": 1.24,
615
- "learning_rate": 3.795161487716928e-07,
616
- "logits/chosen": -0.20193979144096375,
617
- "logits/rejected": -0.09314943850040436,
618
- "logps/chosen": -491.21405029296875,
619
- "logps/rejected": -423.76654052734375,
620
- "loss": 0.4524,
621
- "rewards/accuracies": 0.8500000238418579,
622
- "rewards/chosen": -1.0318418741226196,
623
- "rewards/margins": 0.9399174451828003,
624
- "rewards/rejected": -1.9717592000961304,
625
  "step": 440
626
  },
627
  {
628
- "epoch": 1.26,
629
- "learning_rate": 3.5581359669371223e-07,
630
- "logits/chosen": -0.11288833618164062,
631
- "logits/rejected": -0.1004786491394043,
632
- "logps/chosen": -417.3036193847656,
633
- "logps/rejected": -372.6753845214844,
634
- "loss": 0.4561,
635
- "rewards/accuracies": 0.831250011920929,
636
- "rewards/chosen": -0.969109058380127,
637
- "rewards/margins": 0.6991706490516663,
638
- "rewards/rejected": -1.6682794094085693,
639
  "step": 450
640
  },
641
  {
642
- "epoch": 1.29,
643
- "learning_rate": 3.324594901135326e-07,
644
- "logits/chosen": -0.17446324229240417,
645
- "logits/rejected": -0.0842059999704361,
646
- "logps/chosen": -438.217041015625,
647
- "logps/rejected": -390.52874755859375,
648
- "loss": 0.459,
649
- "rewards/accuracies": 0.824999988079071,
650
- "rewards/chosen": -1.0395996570587158,
651
- "rewards/margins": 0.7797072529792786,
652
- "rewards/rejected": -1.8193069696426392,
653
  "step": 460
654
  },
655
  {
656
- "epoch": 1.32,
657
- "learning_rate": 3.095102673159463e-07,
658
- "logits/chosen": -0.13965001702308655,
659
- "logits/rejected": -0.09614251554012299,
660
- "logps/chosen": -477.21337890625,
661
- "logps/rejected": -420.8619079589844,
662
- "loss": 0.4336,
663
- "rewards/accuracies": 0.84375,
664
- "rewards/chosen": -1.1968367099761963,
665
- "rewards/margins": 0.8374635577201843,
666
- "rewards/rejected": -2.0343000888824463,
667
  "step": 470
668
  },
669
  {
670
- "epoch": 1.35,
671
- "learning_rate": 2.870213881305802e-07,
672
- "logits/chosen": -0.06153956800699234,
673
- "logits/rejected": 0.04337477311491966,
674
- "logps/chosen": -457.060302734375,
675
- "logps/rejected": -406.51300048828125,
676
- "loss": 0.4411,
677
- "rewards/accuracies": 0.793749988079071,
678
- "rewards/chosen": -1.1784467697143555,
679
- "rewards/margins": 0.8234208226203918,
680
- "rewards/rejected": -2.0018675327301025,
681
  "step": 480
682
  },
683
  {
684
- "epoch": 1.38,
685
- "learning_rate": 2.6504719990588745e-07,
686
- "logits/chosen": -0.05881907790899277,
687
- "logits/rejected": 0.013253748416900635,
688
- "logps/chosen": -455.1761169433594,
689
- "logps/rejected": -401.30133056640625,
690
- "loss": 0.4396,
691
- "rewards/accuracies": 0.831250011920929,
692
- "rewards/chosen": -1.2112154960632324,
693
- "rewards/margins": 0.8642939329147339,
694
- "rewards/rejected": -2.0755093097686768,
695
  "step": 490
696
  },
697
  {
698
- "epoch": 1.4,
699
- "learning_rate": 2.436408061715988e-07,
700
- "logits/chosen": 0.04037608206272125,
701
- "logits/rejected": 0.047458432614803314,
702
- "logps/chosen": -382.39141845703125,
703
- "logps/rejected": -422.6522521972656,
704
- "loss": 0.4409,
705
- "rewards/accuracies": 0.84375,
706
- "rewards/chosen": -1.066866159439087,
707
- "rewards/margins": 0.8686450719833374,
708
- "rewards/rejected": -1.9355109930038452,
709
  "step": 500
710
  },
711
  {
712
- "epoch": 1.43,
713
- "learning_rate": 2.22853938307025e-07,
714
- "logits/chosen": 0.09550214558839798,
715
- "logits/rejected": 0.1434255838394165,
716
- "logps/chosen": -393.2830810546875,
717
- "logps/rejected": -355.2184753417969,
718
- "loss": 0.4469,
719
- "rewards/accuracies": 0.8187500238418579,
720
- "rewards/chosen": -1.153294324874878,
721
- "rewards/margins": 0.6956819891929626,
722
- "rewards/rejected": -1.8489763736724854,
723
  "step": 510
724
  },
725
  {
726
- "epoch": 1.46,
727
- "learning_rate": 2.0273683052534173e-07,
728
- "logits/chosen": 0.14489802718162537,
729
- "logits/rejected": 0.22391514480113983,
730
- "logps/chosen": -465.29595947265625,
731
- "logps/rejected": -435.1458435058594,
732
- "loss": 0.4414,
733
- "rewards/accuracies": 0.800000011920929,
734
- "rewards/chosen": -1.2600550651550293,
735
- "rewards/margins": 0.9257229566574097,
736
- "rewards/rejected": -2.1857781410217285,
737
  "step": 520
738
  },
739
  {
740
- "epoch": 1.49,
741
- "learning_rate": 1.833380984759764e-07,
742
- "logits/chosen": 0.07251317799091339,
743
- "logits/rejected": 0.17206279933452606,
744
- "logps/chosen": -410.1485900878906,
745
- "logps/rejected": -438.0814514160156,
746
- "loss": 0.4487,
747
- "rewards/accuracies": 0.8062499761581421,
748
- "rewards/chosen": -1.2980937957763672,
749
- "rewards/margins": 0.8264884948730469,
750
- "rewards/rejected": -2.124582290649414,
751
  "step": 530
752
  },
753
  {
754
- "epoch": 1.52,
755
- "learning_rate": 1.6470462175846606e-07,
756
- "logits/chosen": 0.09354039281606674,
757
- "logits/rejected": 0.13454324007034302,
758
- "logps/chosen": -464.46990966796875,
759
- "logps/rejected": -444.4037170410156,
760
- "loss": 0.4383,
761
- "rewards/accuracies": 0.793749988079071,
762
- "rewards/chosen": -1.0875951051712036,
763
- "rewards/margins": 0.913548469543457,
764
- "rewards/rejected": -2.001143455505371,
765
  "step": 540
766
  },
767
  {
768
- "epoch": 1.55,
769
- "learning_rate": 1.468814306317092e-07,
770
- "logits/chosen": 0.12879224121570587,
771
- "logits/rejected": 0.1979069709777832,
772
- "logps/chosen": -426.343505859375,
773
- "logps/rejected": -380.24493408203125,
774
- "loss": 0.4398,
775
- "rewards/accuracies": 0.856249988079071,
776
- "rewards/chosen": -1.0752108097076416,
777
- "rewards/margins": 0.9150044322013855,
778
- "rewards/rejected": -1.9902150630950928,
779
  "step": 550
780
  },
781
  {
782
- "epoch": 1.57,
783
- "learning_rate": 1.299115971923958e-07,
784
- "logits/chosen": 0.12415747344493866,
785
- "logits/rejected": 0.20912082493305206,
786
- "logps/chosen": -448.64031982421875,
787
- "logps/rejected": -442.22576904296875,
788
- "loss": 0.4317,
789
- "rewards/accuracies": 0.8687499761581421,
790
- "rewards/chosen": -1.2348341941833496,
791
- "rewards/margins": 0.9574505090713501,
792
- "rewards/rejected": -2.1922848224639893,
793
  "step": 560
794
  },
795
  {
796
- "epoch": 1.6,
797
- "learning_rate": 1.1383613128559305e-07,
798
- "logits/chosen": 0.024107109755277634,
799
- "logits/rejected": 0.1119920164346695,
800
- "logps/chosen": -505.82928466796875,
801
- "logps/rejected": -505.4091796875,
802
- "loss": 0.426,
803
- "rewards/accuracies": 0.8687499761581421,
804
- "rewards/chosen": -1.2975566387176514,
805
- "rewards/margins": 1.085707187652588,
806
- "rewards/rejected": -2.3832640647888184,
807
  "step": 570
808
  },
809
  {
810
- "epoch": 1.63,
811
- "learning_rate": 9.869388139903495e-08,
812
- "logits/chosen": 0.012987576425075531,
813
- "logits/rejected": 0.06353282183408737,
814
- "logps/chosen": -541.9244995117188,
815
- "logps/rejected": -499.44110107421875,
816
- "loss": 0.4104,
817
- "rewards/accuracies": 0.875,
818
- "rewards/chosen": -1.3797237873077393,
819
- "rewards/margins": 0.9557130932807922,
820
- "rewards/rejected": -2.335437059402466,
821
  "step": 580
822
  },
823
  {
824
- "epoch": 1.66,
825
- "learning_rate": 8.452144078061818e-08,
826
- "logits/chosen": 0.14748625457286835,
827
- "logits/rejected": 0.21033573150634766,
828
- "logps/chosen": -428.7500915527344,
829
- "logps/rejected": -418.6878356933594,
830
- "loss": 0.4248,
831
- "rewards/accuracies": 0.8062499761581421,
832
- "rewards/chosen": -1.361243724822998,
833
- "rewards/margins": 0.893297553062439,
834
- "rewards/rejected": -2.2545411586761475,
835
  "step": 590
836
  },
837
  {
838
- "epoch": 1.69,
839
- "learning_rate": 7.135305900598321e-08,
840
- "logits/chosen": 0.012422094121575356,
841
- "logits/rejected": 0.08027663081884384,
842
- "logps/chosen": -506.3257751464844,
843
- "logps/rejected": -460.52777099609375,
844
- "loss": 0.4245,
845
- "rewards/accuracies": 0.9125000238418579,
846
- "rewards/chosen": -1.5179449319839478,
847
- "rewards/margins": 1.0011281967163086,
848
- "rewards/rejected": -2.5190727710723877,
849
  "step": 600
850
  },
851
  {
852
- "epoch": 1.71,
853
- "learning_rate": 5.9220559209888166e-08,
854
- "logits/chosen": 0.12760373950004578,
855
- "logits/rejected": 0.20078882575035095,
856
- "logps/chosen": -452.77734375,
857
- "logps/rejected": -487.36016845703125,
858
- "loss": 0.4266,
859
- "rewards/accuracies": 0.862500011920929,
860
- "rewards/chosen": -1.444874882698059,
861
- "rewards/margins": 1.0283567905426025,
862
- "rewards/rejected": -2.473231792449951,
863
  "step": 610
864
  },
865
  {
866
- "epoch": 1.74,
867
- "learning_rate": 4.815326118139812e-08,
868
- "logits/chosen": 0.14059333503246307,
869
- "logits/rejected": 0.2606312930583954,
870
- "logps/chosen": -415.8949279785156,
871
- "logps/rejected": -418.4481506347656,
872
- "loss": 0.4339,
873
- "rewards/accuracies": 0.8187500238418579,
874
- "rewards/chosen": -1.5133248567581177,
875
- "rewards/margins": 0.8190656900405884,
876
- "rewards/rejected": -2.332390546798706,
877
  "step": 620
878
  },
879
  {
880
- "epoch": 1.77,
881
- "learning_rate": 3.81779105087407e-08,
882
- "logits/chosen": 0.08177526295185089,
883
- "logits/rejected": 0.21824178099632263,
884
- "logps/chosen": -476.4537658691406,
885
- "logps/rejected": -467.0603942871094,
886
- "loss": 0.4436,
887
- "rewards/accuracies": 0.8125,
888
- "rewards/chosen": -1.559093713760376,
889
- "rewards/margins": 0.9111778140068054,
890
- "rewards/rejected": -2.470271587371826,
891
  "step": 630
892
  },
893
  {
894
- "epoch": 1.8,
895
- "learning_rate": 2.9318613945057637e-08,
896
- "logits/chosen": 0.12043829262256622,
897
- "logits/rejected": 0.19589121639728546,
898
- "logps/chosen": -504.67608642578125,
899
- "logps/rejected": -489.88330078125,
900
- "loss": 0.4181,
901
- "rewards/accuracies": 0.856249988079071,
902
- "rewards/chosen": -1.4812028408050537,
903
- "rewards/margins": 1.078580617904663,
904
- "rewards/rejected": -2.559783458709717,
905
  "step": 640
906
  },
907
  {
908
- "epoch": 1.83,
909
- "learning_rate": 2.1596781151249523e-08,
910
- "logits/chosen": 0.15832188725471497,
911
- "logits/rejected": 0.2765315771102905,
912
- "logps/chosen": -461.1602478027344,
913
- "logps/rejected": -456.65826416015625,
914
- "loss": 0.4321,
915
- "rewards/accuracies": 0.8187500238418579,
916
- "rewards/chosen": -1.4598538875579834,
917
- "rewards/margins": 0.8676374554634094,
918
- "rewards/rejected": -2.327491283416748,
919
  "step": 650
920
  },
921
  {
922
- "epoch": 1.85,
923
- "learning_rate": 1.5031072956701695e-08,
924
- "logits/chosen": 0.13437607884407043,
925
- "logits/rejected": 0.19320687651634216,
926
- "logps/chosen": -503.2486267089844,
927
- "logps/rejected": -498.3321228027344,
928
- "loss": 0.4196,
929
- "rewards/accuracies": 0.856249988079071,
930
- "rewards/chosen": -1.5257575511932373,
931
- "rewards/margins": 1.106710433959961,
932
- "rewards/rejected": -2.632467746734619,
933
  "step": 660
934
  },
935
  {
936
- "epoch": 1.88,
937
- "learning_rate": 9.637356262923723e-09,
938
- "logits/chosen": 0.24336127936840057,
939
- "logits/rejected": 0.2697839140892029,
940
- "logps/chosen": -468.03289794921875,
941
- "logps/rejected": -444.1060485839844,
942
- "loss": 0.4387,
943
- "rewards/accuracies": 0.856249988079071,
944
- "rewards/chosen": -1.5293312072753906,
945
- "rewards/margins": 0.842644214630127,
946
- "rewards/rejected": -2.3719754219055176,
947
  "step": 670
948
  },
949
  {
950
- "epoch": 1.91,
951
- "learning_rate": 5.428665699084789e-09,
952
- "logits/chosen": 0.10800081491470337,
953
- "logits/rejected": 0.1402808427810669,
954
- "logps/chosen": -492.41448974609375,
955
- "logps/rejected": -494.878173828125,
956
- "loss": 0.4088,
957
- "rewards/accuracies": 0.84375,
958
- "rewards/chosen": -1.4665706157684326,
959
- "rewards/margins": 1.0468090772628784,
960
- "rewards/rejected": -2.5133795738220215,
961
  "step": 680
962
  },
963
  {
964
- "epoch": 1.94,
965
- "learning_rate": 2.415172122110343e-09,
966
- "logits/chosen": 0.13986334204673767,
967
- "logits/rejected": 0.26917481422424316,
968
- "logps/chosen": -483.06024169921875,
969
- "logps/rejected": -482.5399475097656,
970
- "loss": 0.4382,
971
- "rewards/accuracies": 0.84375,
972
- "rewards/chosen": -1.4039775133132935,
973
- "rewards/margins": 1.0037094354629517,
974
- "rewards/rejected": -2.407686948776245,
975
  "step": 690
976
  },
977
  {
978
- "epoch": 1.97,
979
- "learning_rate": 6.041580374618327e-10,
980
- "logits/chosen": 0.105182945728302,
981
- "logits/rejected": 0.17750731110572815,
982
- "logps/chosen": -485.490478515625,
983
- "logps/rejected": -456.46063232421875,
984
- "loss": 0.4411,
985
- "rewards/accuracies": 0.84375,
986
- "rewards/chosen": -1.5071966648101807,
987
- "rewards/margins": 0.8808605074882507,
988
- "rewards/rejected": -2.388056993484497,
989
  "step": 700
990
  },
991
  {
992
- "epoch": 2.0,
993
- "learning_rate": 0.0,
994
- "logits/chosen": 0.1599053144454956,
995
- "logits/rejected": 0.2173675298690796,
996
- "logps/chosen": -413.2229919433594,
997
- "logps/rejected": -438.8427734375,
998
- "loss": 0.4277,
999
- "rewards/accuracies": 0.875,
1000
- "rewards/chosen": -1.4848761558532715,
1001
- "rewards/margins": 0.8370813131332397,
1002
- "rewards/rejected": -2.3219573497772217,
1003
  "step": 710
1004
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1005
  {
1006
  "epoch": 2.0,
1007
- "step": 710,
1008
  "total_flos": 0.0,
1009
- "train_loss": 0.5407548575334146,
1010
- "train_runtime": 10382.7821,
1011
- "train_samples_per_second": 8.774,
1012
- "train_steps_per_second": 0.068
1013
  }
1014
  ],
1015
  "logging_steps": 10,
1016
- "max_steps": 710,
1017
  "num_train_epochs": 2,
1018
  "save_steps": 10000,
1019
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9973828840617638,
5
  "eval_steps": 10000,
6
+ "global_step": 954,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
+ "learning_rate": 1.0416666666666667e-07,
14
+ "logits/chosen": 0.17704486846923828,
15
+ "logits/rejected": 0.25409135222435,
16
+ "logps/chosen": -354.4068603515625,
17
+ "logps/rejected": -305.2366638183594,
18
+ "loss": 0.1821,
19
+ "rewards/accuracies": 0.4312500059604645,
20
+ "rewards/chosen": -6.60312725813128e-05,
21
+ "rewards/margins": 0.00012125837383791804,
22
+ "rewards/rejected": -0.00018728969735093415,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.04,
27
+ "learning_rate": 2.0833333333333333e-07,
28
+ "logits/chosen": 0.07091161608695984,
29
+ "logits/rejected": 0.1985362321138382,
30
+ "logps/chosen": -316.65069580078125,
31
+ "logps/rejected": -276.1200866699219,
32
+ "loss": 0.182,
33
+ "rewards/accuracies": 0.5375000238418579,
34
+ "rewards/chosen": 0.0008458361262455583,
35
+ "rewards/margins": 0.0016920112539082766,
36
+ "rewards/rejected": -0.0008461751276627183,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.06,
41
+ "learning_rate": 3.1249999999999997e-07,
42
+ "logits/chosen": 0.17787829041481018,
43
+ "logits/rejected": 0.2488478720188141,
44
+ "logps/chosen": -294.9706115722656,
45
+ "logps/rejected": -298.59521484375,
46
+ "loss": 0.1822,
47
+ "rewards/accuracies": 0.625,
48
+ "rewards/chosen": -3.700423985719681e-05,
49
+ "rewards/margins": 0.0029355171136558056,
50
+ "rewards/rejected": -0.0029725211206823587,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.08,
55
+ "learning_rate": 4.1666666666666667e-07,
56
+ "logits/chosen": 0.09609868377447128,
57
+ "logits/rejected": 0.21795693039894104,
58
+ "logps/chosen": -347.44097900390625,
59
+ "logps/rejected": -320.9972839355469,
60
+ "loss": 0.1877,
61
+ "rewards/accuracies": 0.643750011920929,
62
+ "rewards/chosen": 0.0013125470140948892,
63
+ "rewards/margins": 0.00661453977227211,
64
+ "rewards/rejected": -0.005301993805915117,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.1,
69
+ "learning_rate": 5.208333333333334e-07,
70
+ "logits/chosen": 0.1497882902622223,
71
+ "logits/rejected": 0.240590900182724,
72
+ "logps/chosen": -311.1229553222656,
73
+ "logps/rejected": -286.51702880859375,
74
+ "loss": 0.1814,
75
+ "rewards/accuracies": 0.6937500238418579,
76
+ "rewards/chosen": -0.005703258328139782,
77
+ "rewards/margins": 0.022644545882940292,
78
+ "rewards/rejected": -0.02834780514240265,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.13,
83
+ "learning_rate": 6.249999999999999e-07,
84
+ "logits/chosen": 0.13869214057922363,
85
+ "logits/rejected": 0.28307411074638367,
86
+ "logps/chosen": -295.9754638671875,
87
+ "logps/rejected": -281.43798828125,
88
+ "loss": 0.1766,
89
+ "rewards/accuracies": 0.6187499761581421,
90
+ "rewards/chosen": -0.03096725046634674,
91
+ "rewards/margins": 0.028959080576896667,
92
+ "rewards/rejected": -0.059926338493824005,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.15,
97
+ "learning_rate": 7.291666666666666e-07,
98
+ "logits/chosen": 0.18460798263549805,
99
+ "logits/rejected": 0.2718513607978821,
100
+ "logps/chosen": -335.46148681640625,
101
+ "logps/rejected": -330.33404541015625,
102
+ "loss": 0.174,
103
+ "rewards/accuracies": 0.59375,
104
+ "rewards/chosen": -0.057377688586711884,
105
+ "rewards/margins": 0.05648452043533325,
106
+ "rewards/rejected": -0.11386220157146454,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.17,
111
+ "learning_rate": 8.333333333333333e-07,
112
+ "logits/chosen": 0.29816848039627075,
113
+ "logits/rejected": 0.4011983871459961,
114
+ "logps/chosen": -330.4580383300781,
115
+ "logps/rejected": -311.96490478515625,
116
+ "loss": 0.159,
117
+ "rewards/accuracies": 0.731249988079071,
118
+ "rewards/chosen": -0.11794394254684448,
119
+ "rewards/margins": 0.13102997839450836,
120
+ "rewards/rejected": -0.24897389113903046,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.19,
125
+ "learning_rate": 9.374999999999999e-07,
126
+ "logits/chosen": 0.2283201515674591,
127
+ "logits/rejected": 0.37335914373397827,
128
+ "logps/chosen": -358.6737365722656,
129
+ "logps/rejected": -304.0804138183594,
130
+ "loss": 0.1421,
131
+ "rewards/accuracies": 0.6312500238418579,
132
+ "rewards/chosen": -0.21732211112976074,
133
+ "rewards/margins": 0.15273679792881012,
134
+ "rewards/rejected": -0.37005892395973206,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.21,
139
+ "learning_rate": 9.999463737538052e-07,
140
+ "logits/chosen": 0.2938156723976135,
141
+ "logits/rejected": 0.46553492546081543,
142
+ "logps/chosen": -361.78338623046875,
143
+ "logps/rejected": -343.25750732421875,
144
+ "loss": 0.1217,
145
+ "rewards/accuracies": 0.668749988079071,
146
+ "rewards/chosen": -0.27221935987472534,
147
+ "rewards/margins": 0.23653486371040344,
148
+ "rewards/rejected": -0.5087541937828064,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.23,
153
+ "learning_rate": 9.993432105822034e-07,
154
+ "logits/chosen": 0.31155580282211304,
155
+ "logits/rejected": 0.3508353531360626,
156
+ "logps/chosen": -353.184814453125,
157
+ "logps/rejected": -366.32720947265625,
158
+ "loss": 0.106,
159
+ "rewards/accuracies": 0.6312500238418579,
160
+ "rewards/chosen": -0.40565404295921326,
161
+ "rewards/margins": 0.2631165683269501,
162
+ "rewards/rejected": -0.6687706708908081,
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.25,
167
+ "learning_rate": 9.980706626858607e-07,
168
+ "logits/chosen": 0.26659709215164185,
169
+ "logits/rejected": 0.3288796842098236,
170
+ "logps/chosen": -374.50274658203125,
171
+ "logps/rejected": -403.8424377441406,
172
+ "loss": 0.0951,
173
+ "rewards/accuracies": 0.625,
174
+ "rewards/chosen": -0.5394914150238037,
175
+ "rewards/margins": 0.28696924448013306,
176
+ "rewards/rejected": -0.8264607191085815,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.27,
181
+ "learning_rate": 9.961304359538434e-07,
182
+ "logits/chosen": 0.1616436094045639,
183
+ "logits/rejected": 0.2970871031284332,
184
+ "logps/chosen": -396.555419921875,
185
+ "logps/rejected": -362.3848876953125,
186
+ "loss": 0.0934,
187
+ "rewards/accuracies": 0.625,
188
+ "rewards/chosen": -0.5805934071540833,
189
+ "rewards/margins": 0.19475166499614716,
190
+ "rewards/rejected": -0.775344967842102,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.29,
195
+ "learning_rate": 9.935251313189563e-07,
196
+ "logits/chosen": 0.1485656201839447,
197
+ "logits/rejected": 0.2714545428752899,
198
+ "logps/chosen": -384.0659484863281,
199
+ "logps/rejected": -346.6048278808594,
200
+ "loss": 0.0933,
201
+ "rewards/accuracies": 0.59375,
202
+ "rewards/chosen": -0.5202253460884094,
203
+ "rewards/margins": 0.24675369262695312,
204
+ "rewards/rejected": -0.766978919506073,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.31,
209
+ "learning_rate": 9.902582412711118e-07,
210
+ "logits/chosen": 0.12988325953483582,
211
+ "logits/rejected": 0.1523539423942566,
212
+ "logps/chosen": -379.16839599609375,
213
+ "logps/rejected": -395.9466552734375,
214
+ "loss": 0.1019,
215
+ "rewards/accuracies": 0.6875,
216
+ "rewards/chosen": -0.4386775493621826,
217
+ "rewards/margins": 0.37129276990890503,
218
+ "rewards/rejected": -0.8099702596664429,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.33,
223
+ "learning_rate": 9.86334145175542e-07,
224
+ "logits/chosen": 0.06655962765216827,
225
+ "logits/rejected": 0.09024105966091156,
226
+ "logps/chosen": -341.7105407714844,
227
+ "logps/rejected": -360.19805908203125,
228
+ "loss": 0.0937,
229
+ "rewards/accuracies": 0.699999988079071,
230
+ "rewards/chosen": -0.3964901566505432,
231
+ "rewards/margins": 0.3985019028186798,
232
+ "rewards/rejected": -0.7949920892715454,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.36,
237
+ "learning_rate": 9.817581034021272e-07,
238
+ "logits/chosen": 0.16973164677619934,
239
+ "logits/rejected": 0.21836213767528534,
240
+ "logps/chosen": -398.22369384765625,
241
+ "logps/rejected": -417.8206481933594,
242
+ "loss": 0.081,
243
+ "rewards/accuracies": 0.675000011920929,
244
+ "rewards/chosen": -0.6185532808303833,
245
+ "rewards/margins": 0.4811604917049408,
246
+ "rewards/rejected": -1.0997138023376465,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.38,
251
+ "learning_rate": 9.765362502737097e-07,
252
+ "logits/chosen": 0.09212584793567657,
253
+ "logits/rejected": 0.23974208533763885,
254
+ "logps/chosen": -388.64910888671875,
255
+ "logps/rejected": -411.5782775878906,
256
+ "loss": 0.0713,
257
+ "rewards/accuracies": 0.706250011920929,
258
+ "rewards/chosen": -0.6261709928512573,
259
+ "rewards/margins": 0.4908596873283386,
260
+ "rewards/rejected": -1.1170307397842407,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.4,
265
+ "learning_rate": 9.706755858428485e-07,
266
+ "logits/chosen": 0.1811675727367401,
267
+ "logits/rejected": 0.27236208319664,
268
+ "logps/chosen": -419.11376953125,
269
+ "logps/rejected": -437.33843994140625,
270
+ "loss": 0.0681,
271
+ "rewards/accuracies": 0.6625000238418579,
272
+ "rewards/chosen": -0.8445426225662231,
273
+ "rewards/margins": 0.4015916883945465,
274
+ "rewards/rejected": -1.2461342811584473,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.42,
279
+ "learning_rate": 9.641839665080363e-07,
280
+ "logits/chosen": 0.14256766438484192,
281
+ "logits/rejected": 0.2711044251918793,
282
+ "logps/chosen": -414.55975341796875,
283
+ "logps/rejected": -416.9037170410156,
284
+ "loss": 0.0675,
285
+ "rewards/accuracies": 0.6875,
286
+ "rewards/chosen": -0.7406997680664062,
287
+ "rewards/margins": 0.48706990480422974,
288
+ "rewards/rejected": -1.2277696132659912,
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.44,
293
+ "learning_rate": 9.570700944819582e-07,
294
+ "logits/chosen": 0.23208096623420715,
295
+ "logits/rejected": 0.35697174072265625,
296
+ "logps/chosen": -382.19970703125,
297
+ "logps/rejected": -386.50701904296875,
298
+ "loss": 0.0708,
299
+ "rewards/accuracies": 0.6625000238418579,
300
+ "rewards/chosen": -0.6804240942001343,
301
+ "rewards/margins": 0.48590850830078125,
302
+ "rewards/rejected": -1.166332721710205,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.46,
307
+ "learning_rate": 9.493435061259129e-07,
308
+ "logits/chosen": 0.13639363646507263,
309
+ "logits/rejected": 0.23731064796447754,
310
+ "logps/chosen": -382.42022705078125,
311
+ "logps/rejected": -369.6554870605469,
312
+ "loss": 0.0763,
313
+ "rewards/accuracies": 0.6625000238418579,
314
+ "rewards/chosen": -0.6574115753173828,
315
+ "rewards/margins": 0.40243881940841675,
316
+ "rewards/rejected": -1.0598504543304443,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.48,
321
+ "learning_rate": 9.4101455916603e-07,
322
+ "logits/chosen": 0.1799091249704361,
323
+ "logits/rejected": 0.2304597645998001,
324
+ "logps/chosen": -416.672607421875,
325
+ "logps/rejected": -420.39862060546875,
326
+ "loss": 0.0668,
327
+ "rewards/accuracies": 0.643750011920929,
328
+ "rewards/chosen": -0.9061130285263062,
329
+ "rewards/margins": 0.46666598320007324,
330
+ "rewards/rejected": -1.3727790117263794,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.5,
335
+ "learning_rate": 9.320944188084241e-07,
336
+ "logits/chosen": 0.08318189531564713,
337
+ "logits/rejected": 0.13486048579216003,
338
+ "logps/chosen": -408.77545166015625,
339
+ "logps/rejected": -427.9566345214844,
340
+ "loss": 0.0639,
341
+ "rewards/accuracies": 0.6187499761581421,
342
+ "rewards/chosen": -0.968237042427063,
343
+ "rewards/margins": 0.2922549843788147,
344
+ "rewards/rejected": -1.260491967201233,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.52,
349
+ "learning_rate": 9.225950427718974e-07,
350
+ "logits/chosen": 0.051157813519239426,
351
+ "logits/rejected": 0.1319509893655777,
352
+ "logps/chosen": -385.2474670410156,
353
+ "logps/rejected": -402.11126708984375,
354
+ "loss": 0.0631,
355
+ "rewards/accuracies": 0.668749988079071,
356
+ "rewards/chosen": -0.7319932579994202,
357
+ "rewards/margins": 0.468679815530777,
358
+ "rewards/rejected": -1.2006731033325195,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.54,
363
+ "learning_rate": 9.125291652582547e-07,
364
+ "logits/chosen": 0.013853952288627625,
365
+ "logits/rejected": 0.10071275383234024,
366
+ "logps/chosen": -445.53607177734375,
367
+ "logps/rejected": -434.2711486816406,
368
+ "loss": 0.0641,
369
+ "rewards/accuracies": 0.612500011920929,
370
+ "rewards/chosen": -0.9089228510856628,
371
+ "rewards/margins": 0.4331666827201843,
372
+ "rewards/rejected": -1.3420894145965576,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.57,
377
+ "learning_rate": 9.019102798817195e-07,
378
+ "logits/chosen": 0.1297096163034439,
379
+ "logits/rejected": 0.1613592505455017,
380
+ "logps/chosen": -403.47393798828125,
381
+ "logps/rejected": -446.1951599121094,
382
+ "loss": 0.0685,
383
+ "rewards/accuracies": 0.6937500238418579,
384
+ "rewards/chosen": -0.7434005737304688,
385
+ "rewards/margins": 0.6140644550323486,
386
+ "rewards/rejected": -1.357465147972107,
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 0.59,
391
+ "learning_rate": 8.90752621580335e-07,
392
+ "logits/chosen": 0.16231071949005127,
393
+ "logits/rejected": 0.1873283088207245,
394
+ "logps/chosen": -362.4006652832031,
395
+ "logps/rejected": -398.279296875,
396
+ "loss": 0.0751,
397
+ "rewards/accuracies": 0.7124999761581421,
398
+ "rewards/chosen": -0.6591774225234985,
399
+ "rewards/margins": 0.41294485330581665,
400
+ "rewards/rejected": -1.07212233543396,
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 0.61,
405
+ "learning_rate": 8.79071147533597e-07,
406
+ "logits/chosen": 0.14204099774360657,
407
+ "logits/rejected": 0.20997166633605957,
408
+ "logps/chosen": -424.5856018066406,
409
+ "logps/rejected": -456.9698181152344,
410
+ "loss": 0.0642,
411
+ "rewards/accuracies": 0.7250000238418579,
412
+ "rewards/chosen": -0.7815448045730591,
413
+ "rewards/margins": 0.5602203011512756,
414
+ "rewards/rejected": -1.34176504611969,
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 0.63,
419
+ "learning_rate": 8.668815171119019e-07,
420
+ "logits/chosen": 0.2026984989643097,
421
+ "logits/rejected": 0.23374077677726746,
422
+ "logps/chosen": -380.8060607910156,
423
+ "logps/rejected": -468.7802734375,
424
+ "loss": 0.0554,
425
+ "rewards/accuracies": 0.699999988079071,
426
+ "rewards/chosen": -0.8905105590820312,
427
+ "rewards/margins": 0.5638677477836609,
428
+ "rewards/rejected": -1.454378366470337,
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 0.65,
433
+ "learning_rate": 8.54200070884685e-07,
434
+ "logits/chosen": 0.23336808383464813,
435
+ "logits/rejected": 0.25176650285720825,
436
+ "logps/chosen": -385.24676513671875,
437
+ "logps/rejected": -462.87322998046875,
438
+ "loss": 0.0565,
439
+ "rewards/accuracies": 0.706250011920929,
440
+ "rewards/chosen": -0.8951492309570312,
441
+ "rewards/margins": 0.6165014505386353,
442
+ "rewards/rejected": -1.5116506814956665,
443
  "step": 310
444
  },
445
  {
446
+ "epoch": 0.67,
447
+ "learning_rate": 8.410438087153911e-07,
448
+ "logits/chosen": 0.22913236916065216,
449
+ "logits/rejected": 0.3360585570335388,
450
+ "logps/chosen": -383.767578125,
451
+ "logps/rejected": -424.25067138671875,
452
+ "loss": 0.0641,
453
+ "rewards/accuracies": 0.7250000238418579,
454
+ "rewards/chosen": -0.6813658475875854,
455
+ "rewards/margins": 0.6591276526451111,
456
+ "rewards/rejected": -1.3404934406280518,
457
  "step": 320
458
  },
459
  {
460
+ "epoch": 0.69,
461
+ "learning_rate": 8.274303669726426e-07,
462
+ "logits/chosen": 0.22990348935127258,
463
+ "logits/rejected": 0.3006184697151184,
464
+ "logps/chosen": -366.43499755859375,
465
+ "logps/rejected": -444.06536865234375,
466
+ "loss": 0.0636,
467
+ "rewards/accuracies": 0.675000011920929,
468
+ "rewards/chosen": -0.6766657829284668,
469
+ "rewards/margins": 0.6564770936965942,
470
+ "rewards/rejected": -1.333142876625061,
471
  "step": 330
472
  },
473
  {
474
+ "epoch": 0.71,
475
+ "learning_rate": 8.133779948881513e-07,
476
+ "logits/chosen": 0.22257550060749054,
477
+ "logits/rejected": 0.3241097033023834,
478
+ "logps/chosen": -360.141845703125,
479
+ "logps/rejected": -405.85711669921875,
480
+ "loss": 0.0662,
481
+ "rewards/accuracies": 0.78125,
482
+ "rewards/chosen": -0.7344536781311035,
483
+ "rewards/margins": 0.7157880067825317,
484
+ "rewards/rejected": -1.4502416849136353,
485
  "step": 340
486
  },
487
  {
488
+ "epoch": 0.73,
489
+ "learning_rate": 7.989055300930704e-07,
490
+ "logits/chosen": 0.1499968320131302,
491
+ "logits/rejected": 0.15372925996780396,
492
+ "logps/chosen": -388.67559814453125,
493
+ "logps/rejected": -462.0445251464844,
494
+ "loss": 0.0644,
495
+ "rewards/accuracies": 0.6499999761581421,
496
+ "rewards/chosen": -0.8717344403266907,
497
+ "rewards/margins": 0.6429644227027893,
498
+ "rewards/rejected": -1.51469886302948,
499
  "step": 350
500
  },
501
  {
502
+ "epoch": 0.75,
503
+ "learning_rate": 7.840323733655778e-07,
504
+ "logits/chosen": 0.08885981142520905,
505
+ "logits/rejected": 0.19541098177433014,
506
+ "logps/chosen": -407.87286376953125,
507
+ "logps/rejected": -420.4515686035156,
508
+ "loss": 0.0583,
509
+ "rewards/accuracies": 0.7124999761581421,
510
+ "rewards/chosen": -0.797155499458313,
511
+ "rewards/margins": 0.5855330228805542,
512
+ "rewards/rejected": -1.3826884031295776,
513
  "step": 360
514
  },
515
  {
516
+ "epoch": 0.77,
517
+ "learning_rate": 7.687784626235447e-07,
518
+ "logits/chosen": 0.05912008136510849,
519
+ "logits/rejected": 0.17702099680900574,
520
+ "logps/chosen": -428.82354736328125,
521
+ "logps/rejected": -466.0895080566406,
522
+ "loss": 0.0599,
523
+ "rewards/accuracies": 0.6937500238418579,
524
+ "rewards/chosen": -0.803920567035675,
525
+ "rewards/margins": 0.7507921457290649,
526
+ "rewards/rejected": -1.5547125339508057,
527
  "step": 370
528
  },
529
  {
530
+ "epoch": 0.8,
531
+ "learning_rate": 7.531642461971514e-07,
532
+ "logits/chosen": 0.11388075351715088,
533
+ "logits/rejected": 0.1931450068950653,
534
+ "logps/chosen": -388.9282531738281,
535
+ "logps/rejected": -427.1614685058594,
536
+ "loss": 0.0578,
537
+ "rewards/accuracies": 0.6812499761581421,
538
+ "rewards/chosen": -0.9585503339767456,
539
+ "rewards/margins": 0.5912213325500488,
540
+ "rewards/rejected": -1.5497716665267944,
541
  "step": 380
542
  },
543
  {
544
+ "epoch": 0.82,
545
+ "learning_rate": 7.372106554172801e-07,
546
+ "logits/chosen": -0.049389470368623734,
547
+ "logits/rejected": 0.10218650102615356,
548
+ "logps/chosen": -443.7737731933594,
549
+ "logps/rejected": -484.5735778808594,
550
+ "loss": 0.0446,
551
+ "rewards/accuracies": 0.731249988079071,
552
+ "rewards/chosen": -1.0208237171173096,
553
+ "rewards/margins": 0.8150562047958374,
554
+ "rewards/rejected": -1.835879921913147,
555
  "step": 390
556
  },
557
  {
558
+ "epoch": 0.84,
559
+ "learning_rate": 7.209390765564318e-07,
560
+ "logits/chosen": 0.07526848465204239,
561
+ "logits/rejected": 0.1457681804895401,
562
+ "logps/chosen": -430.77130126953125,
563
+ "logps/rejected": -478.53118896484375,
564
+ "loss": 0.0488,
565
+ "rewards/accuracies": 0.706250011920929,
566
+ "rewards/chosen": -1.137662410736084,
567
+ "rewards/margins": 0.6997725963592529,
568
+ "rewards/rejected": -1.837435007095337,
569
  "step": 400
570
  },
571
  {
572
+ "epoch": 0.86,
573
+ "learning_rate": 7.043713221597773e-07,
574
+ "logits/chosen": -0.014962440356612206,
575
+ "logits/rejected": 0.049673158675432205,
576
+ "logps/chosen": -394.35980224609375,
577
+ "logps/rejected": -455.79168701171875,
578
+ "loss": 0.0469,
579
+ "rewards/accuracies": 0.5874999761581421,
580
+ "rewards/chosen": -1.0516221523284912,
581
+ "rewards/margins": 0.6002627015113831,
582
+ "rewards/rejected": -1.65188467502594,
583
  "step": 410
584
  },
585
  {
586
+ "epoch": 0.88,
587
+ "learning_rate": 6.875296018047809e-07,
588
+ "logits/chosen": 0.1113734096288681,
589
+ "logits/rejected": 0.17297616600990295,
590
+ "logps/chosen": -371.1769104003906,
591
+ "logps/rejected": -433.82763671875,
592
+ "loss": 0.057,
593
+ "rewards/accuracies": 0.731249988079071,
594
+ "rewards/chosen": -0.7784308791160583,
595
+ "rewards/margins": 0.7032991647720337,
596
+ "rewards/rejected": -1.4817302227020264,
597
  "step": 420
598
  },
599
  {
600
+ "epoch": 0.9,
601
+ "learning_rate": 6.704364923285857e-07,
602
+ "logits/chosen": 0.08021976053714752,
603
+ "logits/rejected": 0.09611347317695618,
604
+ "logps/chosen": -433.26898193359375,
605
+ "logps/rejected": -482.2544860839844,
606
+ "loss": 0.0623,
607
+ "rewards/accuracies": 0.7124999761581421,
608
+ "rewards/chosen": -0.9919212460517883,
609
+ "rewards/margins": 0.5928072333335876,
610
+ "rewards/rejected": -1.584728479385376,
611
  "step": 430
612
  },
613
  {
614
+ "epoch": 0.92,
615
+ "learning_rate": 6.531149075630796e-07,
616
+ "logits/chosen": 0.06492827087640762,
617
+ "logits/rejected": 0.09372309595346451,
618
+ "logps/chosen": -369.0657958984375,
619
+ "logps/rejected": -427.1637268066406,
620
+ "loss": 0.0602,
621
+ "rewards/accuracies": 0.6812499761581421,
622
+ "rewards/chosen": -0.8450859785079956,
623
+ "rewards/margins": 0.6487796902656555,
624
+ "rewards/rejected": -1.4938656091690063,
625
  "step": 440
626
  },
627
  {
628
+ "epoch": 0.94,
629
+ "learning_rate": 6.355880676182085e-07,
630
+ "logits/chosen": 0.015085640363395214,
631
+ "logits/rejected": 0.1697283238172531,
632
+ "logps/chosen": -454.42071533203125,
633
+ "logps/rejected": -461.6656799316406,
634
+ "loss": 0.0537,
635
+ "rewards/accuracies": 0.699999988079071,
636
+ "rewards/chosen": -1.0446925163269043,
637
+ "rewards/margins": 0.7324589490890503,
638
+ "rewards/rejected": -1.7771514654159546,
639
  "step": 450
640
  },
641
  {
642
+ "epoch": 0.96,
643
+ "learning_rate": 6.178794677547137e-07,
644
+ "logits/chosen": 0.052903078496456146,
645
+ "logits/rejected": 0.21909013390541077,
646
+ "logps/chosen": -389.771728515625,
647
+ "logps/rejected": -432.63311767578125,
648
+ "loss": 0.0475,
649
+ "rewards/accuracies": 0.71875,
650
+ "rewards/chosen": -0.918341338634491,
651
+ "rewards/margins": 0.7504295706748962,
652
+ "rewards/rejected": -1.6687707901000977,
653
  "step": 460
654
  },
655
  {
656
+ "epoch": 0.98,
657
+ "learning_rate": 6.000128468880222e-07,
658
+ "logits/chosen": 0.0020152360666543245,
659
+ "logits/rejected": 0.10528425872325897,
660
+ "logps/chosen": -439.73016357421875,
661
+ "logps/rejected": -486.3055114746094,
662
+ "loss": 0.0531,
663
+ "rewards/accuracies": 0.75,
664
+ "rewards/chosen": -1.0058103799819946,
665
+ "rewards/margins": 0.8824182748794556,
666
+ "rewards/rejected": -1.8882286548614502,
667
  "step": 470
668
  },
669
  {
670
+ "epoch": 1.0,
671
+ "learning_rate": 5.820121557655108e-07,
672
+ "logits/chosen": 0.03267590329051018,
673
+ "logits/rejected": 0.10403893887996674,
674
+ "logps/chosen": -426.3312072753906,
675
+ "logps/rejected": -521.575439453125,
676
+ "loss": 0.0497,
677
+ "rewards/accuracies": 0.7875000238418579,
678
+ "rewards/chosen": -0.897496223449707,
679
+ "rewards/margins": 1.0473217964172363,
680
+ "rewards/rejected": -1.9448179006576538,
681
  "step": 480
682
  },
683
  {
684
+ "epoch": 1.03,
685
+ "learning_rate": 5.639015248598023e-07,
686
+ "logits/chosen": -0.05066138505935669,
687
+ "logits/rejected": 0.0016520231729373336,
688
+ "logps/chosen": -459.2066955566406,
689
+ "logps/rejected": -572.3805541992188,
690
+ "loss": 0.0254,
691
+ "rewards/accuracies": 0.768750011920929,
692
+ "rewards/chosen": -1.404326319694519,
693
+ "rewards/margins": 1.2682745456695557,
694
+ "rewards/rejected": -2.6726012229919434,
695
  "step": 490
696
  },
697
  {
698
+ "epoch": 1.05,
699
+ "learning_rate": 5.457052320211339e-07,
700
+ "logits/chosen": 0.10663177818059921,
701
+ "logits/rejected": 0.143524631857872,
702
+ "logps/chosen": -454.5547790527344,
703
+ "logps/rejected": -574.3235473632812,
704
+ "loss": 0.0198,
705
+ "rewards/accuracies": 0.71875,
706
+ "rewards/chosen": -1.592284083366394,
707
+ "rewards/margins": 1.2184875011444092,
708
+ "rewards/rejected": -2.8107717037200928,
709
  "step": 500
710
  },
711
  {
712
+ "epoch": 1.07,
713
+ "learning_rate": 5.274476699321637e-07,
714
+ "logits/chosen": -0.019788045436143875,
715
+ "logits/rejected": 0.12656378746032715,
716
+ "logps/chosen": -488.24627685546875,
717
+ "logps/rejected": -596.00537109375,
718
+ "loss": 0.015,
719
+ "rewards/accuracies": 0.800000011920929,
720
+ "rewards/chosen": -1.8213142156600952,
721
+ "rewards/margins": 1.3653538227081299,
722
+ "rewards/rejected": -3.1866683959960938,
723
  "step": 510
724
  },
725
  {
726
+ "epoch": 1.09,
727
+ "learning_rate": 5.091533134088387e-07,
728
+ "logits/chosen": -0.0814504474401474,
729
+ "logits/rejected": 0.05524957925081253,
730
+ "logps/chosen": -552.7730712890625,
731
+ "logps/rejected": -634.5548095703125,
732
+ "loss": 0.0147,
733
+ "rewards/accuracies": 0.731249988079071,
734
+ "rewards/chosen": -2.0995850563049316,
735
+ "rewards/margins": 1.1655638217926025,
736
+ "rewards/rejected": -3.2651493549346924,
737
  "step": 520
738
  },
739
  {
740
+ "epoch": 1.11,
741
+ "learning_rate": 4.908466865911614e-07,
742
+ "logits/chosen": 0.03363295644521713,
743
+ "logits/rejected": 0.043015364557504654,
744
+ "logps/chosen": -468.89593505859375,
745
+ "logps/rejected": -560.2864990234375,
746
+ "loss": 0.0174,
747
+ "rewards/accuracies": 0.7875000238418579,
748
+ "rewards/chosen": -1.5512639284133911,
749
+ "rewards/margins": 1.2513355016708374,
750
+ "rewards/rejected": -2.8025994300842285,
751
  "step": 530
752
  },
753
  {
754
+ "epoch": 1.13,
755
+ "learning_rate": 4.7255233006783624e-07,
756
+ "logits/chosen": -0.03754299506545067,
757
+ "logits/rejected": 0.08725563436746597,
758
+ "logps/chosen": -456.68243408203125,
759
+ "logps/rejected": -549.9105224609375,
760
+ "loss": 0.0178,
761
+ "rewards/accuracies": 0.706250011920929,
762
+ "rewards/chosen": -1.657478928565979,
763
+ "rewards/margins": 1.0530353784561157,
764
+ "rewards/rejected": -2.7105140686035156,
765
  "step": 540
766
  },
767
  {
768
+ "epoch": 1.15,
769
+ "learning_rate": 4.5429476797886617e-07,
770
+ "logits/chosen": 0.0340617299079895,
771
+ "logits/rejected": 0.1264275759458542,
772
+ "logps/chosen": -469.5687561035156,
773
+ "logps/rejected": -592.4705810546875,
774
+ "loss": 0.0185,
775
+ "rewards/accuracies": 0.800000011920929,
776
+ "rewards/chosen": -1.408406138420105,
777
+ "rewards/margins": 1.4667712450027466,
778
+ "rewards/rejected": -2.8751769065856934,
779
  "step": 550
780
  },
781
  {
782
+ "epoch": 1.17,
783
+ "learning_rate": 4.3609847514019763e-07,
784
+ "logits/chosen": 0.0167356226593256,
785
+ "logits/rejected": 0.032135289162397385,
786
+ "logps/chosen": -480.41278076171875,
787
+ "logps/rejected": -577.2174072265625,
788
+ "loss": 0.0165,
789
+ "rewards/accuracies": 0.7875000238418579,
790
+ "rewards/chosen": -1.5578255653381348,
791
+ "rewards/margins": 1.0947318077087402,
792
+ "rewards/rejected": -2.652557611465454,
793
  "step": 560
794
  },
795
  {
796
+ "epoch": 1.19,
797
+ "learning_rate": 4.179878442344892e-07,
798
+ "logits/chosen": 0.10041844844818115,
799
+ "logits/rejected": 0.16732005774974823,
800
+ "logps/chosen": -453.9161071777344,
801
+ "logps/rejected": -615.6796875,
802
+ "loss": 0.0153,
803
+ "rewards/accuracies": 0.762499988079071,
804
+ "rewards/chosen": -1.7070415019989014,
805
+ "rewards/margins": 1.4755295515060425,
806
+ "rewards/rejected": -3.1825711727142334,
807
  "step": 570
808
  },
809
  {
810
+ "epoch": 1.21,
811
+ "learning_rate": 3.9998715311197783e-07,
812
+ "logits/chosen": 0.1310591995716095,
813
+ "logits/rejected": 0.20585906505584717,
814
+ "logps/chosen": -493.8118591308594,
815
+ "logps/rejected": -631.4963989257812,
816
+ "loss": 0.015,
817
+ "rewards/accuracies": 0.731249988079071,
818
+ "rewards/chosen": -1.7850983142852783,
819
+ "rewards/margins": 1.443263292312622,
820
+ "rewards/rejected": -3.228361129760742,
821
  "step": 580
822
  },
823
  {
824
+ "epoch": 1.24,
825
+ "learning_rate": 3.821205322452863e-07,
826
+ "logits/chosen": 0.22954685986042023,
827
+ "logits/rejected": 0.2483092099428177,
828
+ "logps/chosen": -473.4378967285156,
829
+ "logps/rejected": -605.134033203125,
830
+ "loss": 0.0149,
831
+ "rewards/accuracies": 0.7562500238418579,
832
+ "rewards/chosen": -1.700280785560608,
833
+ "rewards/margins": 1.460669755935669,
834
+ "rewards/rejected": -3.1609506607055664,
835
  "step": 590
836
  },
837
  {
838
+ "epoch": 1.26,
839
+ "learning_rate": 3.6441193238179146e-07,
840
+ "logits/chosen": 0.13607949018478394,
841
+ "logits/rejected": 0.1680508852005005,
842
+ "logps/chosen": -451.55340576171875,
843
+ "logps/rejected": -627.7686157226562,
844
+ "loss": 0.0147,
845
+ "rewards/accuracies": 0.75,
846
+ "rewards/chosen": -1.6148862838745117,
847
+ "rewards/margins": 1.678989052772522,
848
+ "rewards/rejected": -3.2938759326934814,
849
  "step": 600
850
  },
851
  {
852
+ "epoch": 1.28,
853
+ "learning_rate": 3.4688509243692034e-07,
854
+ "logits/chosen": 0.04345204681158066,
855
+ "logits/rejected": 0.13040025532245636,
856
+ "logps/chosen": -461.54095458984375,
857
+ "logps/rejected": -684.9581909179688,
858
+ "loss": 0.0153,
859
+ "rewards/accuracies": 0.7437499761581421,
860
+ "rewards/chosen": -1.6480602025985718,
861
+ "rewards/margins": 1.6946277618408203,
862
+ "rewards/rejected": -3.3426880836486816,
863
  "step": 610
864
  },
865
  {
866
+ "epoch": 1.3,
867
+ "learning_rate": 3.295635076714144e-07,
868
+ "logits/chosen": 0.18233785033226013,
869
+ "logits/rejected": 0.19972297549247742,
870
+ "logps/chosen": -408.9209899902344,
871
+ "logps/rejected": -547.9658813476562,
872
+ "loss": 0.0143,
873
+ "rewards/accuracies": 0.75,
874
+ "rewards/chosen": -1.6356074810028076,
875
+ "rewards/margins": 1.3703811168670654,
876
+ "rewards/rejected": -3.005988597869873,
877
  "step": 620
878
  },
879
  {
880
+ "epoch": 1.32,
881
+ "learning_rate": 3.12470398195219e-07,
882
+ "logits/chosen": 0.15017299354076385,
883
+ "logits/rejected": 0.07167269289493561,
884
+ "logps/chosen": -474.58172607421875,
885
+ "logps/rejected": -649.4796142578125,
886
+ "loss": 0.0129,
887
+ "rewards/accuracies": 0.7749999761581421,
888
+ "rewards/chosen": -1.6831333637237549,
889
+ "rewards/margins": 1.4837870597839355,
890
+ "rewards/rejected": -3.1669201850891113,
891
  "step": 630
892
  },
893
  {
894
+ "epoch": 1.34,
895
+ "learning_rate": 2.956286778402226e-07,
896
+ "logits/chosen": 0.03866753727197647,
897
+ "logits/rejected": 0.20129835605621338,
898
+ "logps/chosen": -546.3468017578125,
899
+ "logps/rejected": -608.462646484375,
900
+ "loss": 0.0126,
901
+ "rewards/accuracies": 0.75,
902
+ "rewards/chosen": -1.7091865539550781,
903
+ "rewards/margins": 1.3178246021270752,
904
+ "rewards/rejected": -3.0270111560821533,
905
  "step": 640
906
  },
907
  {
908
+ "epoch": 1.36,
909
+ "learning_rate": 2.7906092344356826e-07,
910
+ "logits/chosen": 0.2127591073513031,
911
+ "logits/rejected": 0.24179625511169434,
912
+ "logps/chosen": -462.47412109375,
913
+ "logps/rejected": -581.084228515625,
914
+ "loss": 0.014,
915
+ "rewards/accuracies": 0.7437499761581421,
916
+ "rewards/chosen": -1.751960039138794,
917
+ "rewards/margins": 1.4448457956314087,
918
+ "rewards/rejected": -3.196805953979492,
919
  "step": 650
920
  },
921
  {
922
+ "epoch": 1.38,
923
+ "learning_rate": 2.6278934458271996e-07,
924
+ "logits/chosen": 0.09269841015338898,
925
+ "logits/rejected": 0.2964209318161011,
926
+ "logps/chosen": -479.434326171875,
927
+ "logps/rejected": -605.9524536132812,
928
+ "loss": 0.0123,
929
+ "rewards/accuracies": 0.7250000238418579,
930
+ "rewards/chosen": -1.8022867441177368,
931
+ "rewards/margins": 1.3753817081451416,
932
+ "rewards/rejected": -3.177668333053589,
933
  "step": 660
934
  },
935
  {
936
+ "epoch": 1.4,
937
+ "learning_rate": 2.468357538028487e-07,
938
+ "logits/chosen": 0.16141146421432495,
939
+ "logits/rejected": 0.18542757630348206,
940
+ "logps/chosen": -487.90277099609375,
941
+ "logps/rejected": -652.5034790039062,
942
+ "loss": 0.0107,
943
+ "rewards/accuracies": 0.75,
944
+ "rewards/chosen": -1.9332258701324463,
945
+ "rewards/margins": 1.736053705215454,
946
+ "rewards/rejected": -3.6692795753479004,
947
  "step": 670
948
  },
949
  {
950
+ "epoch": 1.42,
951
+ "learning_rate": 2.312215373764551e-07,
952
+ "logits/chosen": 0.07799498736858368,
953
+ "logits/rejected": 0.17718131840229034,
954
+ "logps/chosen": -603.2567138671875,
955
+ "logps/rejected": -699.2156372070312,
956
+ "loss": 0.0101,
957
+ "rewards/accuracies": 0.7749999761581421,
958
+ "rewards/chosen": -2.1482930183410645,
959
+ "rewards/margins": 1.3787685632705688,
960
+ "rewards/rejected": -3.5270614624023438,
961
  "step": 680
962
  },
963
  {
964
+ "epoch": 1.44,
965
+ "learning_rate": 2.1596762663442213e-07,
966
+ "logits/chosen": 0.2014874666929245,
967
+ "logits/rejected": 0.3246391713619232,
968
+ "logps/chosen": -489.08349609375,
969
+ "logps/rejected": -607.5847778320312,
970
+ "loss": 0.0096,
971
+ "rewards/accuracies": 0.762499988079071,
972
+ "rewards/chosen": -2.083740711212158,
973
+ "rewards/margins": 1.446257472038269,
974
+ "rewards/rejected": -3.5299980640411377,
975
  "step": 690
976
  },
977
  {
978
+ "epoch": 1.47,
979
+ "learning_rate": 2.0109446990692963e-07,
980
+ "logits/chosen": 0.09734896570444107,
981
+ "logits/rejected": 0.16283641755580902,
982
+ "logps/chosen": -540.1688232421875,
983
+ "logps/rejected": -701.462890625,
984
+ "loss": 0.0094,
985
+ "rewards/accuracies": 0.8125,
986
+ "rewards/chosen": -2.07643985748291,
987
+ "rewards/margins": 1.7090556621551514,
988
+ "rewards/rejected": -3.7854957580566406,
989
  "step": 700
990
  },
991
  {
992
+ "epoch": 1.49,
993
+ "learning_rate": 1.8662200511184872e-07,
994
+ "logits/chosen": 0.07912759482860565,
995
+ "logits/rejected": 0.19963078200817108,
996
+ "logps/chosen": -491.30426025390625,
997
+ "logps/rejected": -630.0563354492188,
998
+ "loss": 0.0099,
999
+ "rewards/accuracies": 0.800000011920929,
1000
+ "rewards/chosen": -1.9977525472640991,
1001
+ "rewards/margins": 1.5802443027496338,
1002
+ "rewards/rejected": -3.5779967308044434,
1003
  "step": 710
1004
  },
1005
+ {
1006
+ "epoch": 1.51,
1007
+ "learning_rate": 1.725696330273575e-07,
1008
+ "logits/chosen": 0.14783975481987,
1009
+ "logits/rejected": 0.27563345432281494,
1010
+ "logps/chosen": -530.8796997070312,
1011
+ "logps/rejected": -640.3440551757812,
1012
+ "loss": 0.0107,
1013
+ "rewards/accuracies": 0.7562500238418579,
1014
+ "rewards/chosen": -2.036653518676758,
1015
+ "rewards/margins": 1.323557734489441,
1016
+ "rewards/rejected": -3.3602116107940674,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 1.53,
1021
+ "learning_rate": 1.589561912846089e-07,
1022
+ "logits/chosen": 0.16717246174812317,
1023
+ "logits/rejected": 0.2920343279838562,
1024
+ "logps/chosen": -499.3802795410156,
1025
+ "logps/rejected": -612.64892578125,
1026
+ "loss": 0.012,
1027
+ "rewards/accuracies": 0.731249988079071,
1028
+ "rewards/chosen": -2.0618550777435303,
1029
+ "rewards/margins": 1.435462236404419,
1030
+ "rewards/rejected": -3.4973175525665283,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 1.55,
1035
+ "learning_rate": 1.4579992911531496e-07,
1036
+ "logits/chosen": 0.1249130517244339,
1037
+ "logits/rejected": 0.23616066575050354,
1038
+ "logps/chosen": -575.0750732421875,
1039
+ "logps/rejected": -649.9669189453125,
1040
+ "loss": 0.0106,
1041
+ "rewards/accuracies": 0.762499988079071,
1042
+ "rewards/chosen": -2.2815146446228027,
1043
+ "rewards/margins": 1.226216197013855,
1044
+ "rewards/rejected": -3.5077309608459473,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 1.57,
1049
+ "learning_rate": 1.3311848288809813e-07,
1050
+ "logits/chosen": 0.21837782859802246,
1051
+ "logits/rejected": 0.31546956300735474,
1052
+ "logps/chosen": -510.7059020996094,
1053
+ "logps/rejected": -609.2933959960938,
1054
+ "loss": 0.0119,
1055
+ "rewards/accuracies": 0.737500011920929,
1056
+ "rewards/chosen": -1.790372610092163,
1057
+ "rewards/margins": 1.2426694631576538,
1058
+ "rewards/rejected": -3.0330421924591064,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 1.59,
1063
+ "learning_rate": 1.209288524664029e-07,
1064
+ "logits/chosen": 0.14562873542308807,
1065
+ "logits/rejected": 0.3084864318370819,
1066
+ "logps/chosen": -622.6912841796875,
1067
+ "logps/rejected": -749.8731689453125,
1068
+ "loss": 0.0131,
1069
+ "rewards/accuracies": 0.71875,
1070
+ "rewards/chosen": -2.2252538204193115,
1071
+ "rewards/margins": 1.5818650722503662,
1072
+ "rewards/rejected": -3.8071188926696777,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 1.61,
1077
+ "learning_rate": 1.0924737841966497e-07,
1078
+ "logits/chosen": 0.1799144446849823,
1079
+ "logits/rejected": 0.354133278131485,
1080
+ "logps/chosen": -585.0472412109375,
1081
+ "logps/rejected": -712.3133544921875,
1082
+ "loss": 0.0107,
1083
+ "rewards/accuracies": 0.737500011920929,
1084
+ "rewards/chosen": -2.1570990085601807,
1085
+ "rewards/margins": 1.6586040258407593,
1086
+ "rewards/rejected": -3.8157036304473877,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 1.63,
1091
+ "learning_rate": 9.808972011828054e-08,
1092
+ "logits/chosen": 0.20896565914154053,
1093
+ "logits/rejected": 0.1832619458436966,
1094
+ "logps/chosen": -474.9366149902344,
1095
+ "logps/rejected": -665.3892822265625,
1096
+ "loss": 0.0099,
1097
+ "rewards/accuracies": 0.8125,
1098
+ "rewards/chosen": -1.9308887720108032,
1099
+ "rewards/margins": 1.5281493663787842,
1100
+ "rewards/rejected": -3.459038257598877,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 1.65,
1105
+ "learning_rate": 8.747083474174527e-08,
1106
+ "logits/chosen": 0.25221484899520874,
1107
+ "logits/rejected": 0.3025228679180145,
1108
+ "logps/chosen": -486.76678466796875,
1109
+ "logps/rejected": -610.9810791015625,
1110
+ "loss": 0.01,
1111
+ "rewards/accuracies": 0.7250000238418579,
1112
+ "rewards/chosen": -1.9139289855957031,
1113
+ "rewards/margins": 1.4173685312271118,
1114
+ "rewards/rejected": -3.3312973976135254,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 1.67,
1119
+ "learning_rate": 7.740495722810269e-08,
1120
+ "logits/chosen": 0.12703558802604675,
1121
+ "logits/rejected": 0.25433093309402466,
1122
+ "logps/chosen": -528.8013916015625,
1123
+ "logps/rejected": -645.4374389648438,
1124
+ "loss": 0.01,
1125
+ "rewards/accuracies": 0.768750011920929,
1126
+ "rewards/chosen": -2.129984140396118,
1127
+ "rewards/margins": 1.322923183441162,
1128
+ "rewards/rejected": -3.452907085418701,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 1.7,
1133
+ "learning_rate": 6.790558119157597e-08,
1134
+ "logits/chosen": 0.1941952407360077,
1135
+ "logits/rejected": 0.36538344621658325,
1136
+ "logps/chosen": -536.0458374023438,
1137
+ "logps/rejected": -630.6697387695312,
1138
+ "loss": 0.0111,
1139
+ "rewards/accuracies": 0.7875000238418579,
1140
+ "rewards/chosen": -1.9618316888809204,
1141
+ "rewards/margins": 1.3840124607086182,
1142
+ "rewards/rejected": -3.34584379196167,
1143
+ "step": 810
1144
+ },
1145
+ {
1146
+ "epoch": 1.72,
1147
+ "learning_rate": 5.898544083397e-08,
1148
+ "logits/chosen": 0.1936766654253006,
1149
+ "logits/rejected": 0.22626741230487823,
1150
+ "logps/chosen": -482.18902587890625,
1151
+ "logps/rejected": -640.9258422851562,
1152
+ "loss": 0.0113,
1153
+ "rewards/accuracies": 0.800000011920929,
1154
+ "rewards/chosen": -1.822951078414917,
1155
+ "rewards/margins": 1.679091215133667,
1156
+ "rewards/rejected": -3.502042055130005,
1157
+ "step": 820
1158
+ },
1159
+ {
1160
+ "epoch": 1.74,
1161
+ "learning_rate": 5.065649387408705e-08,
1162
+ "logits/chosen": 0.16037659347057343,
1163
+ "logits/rejected": 0.23867423832416534,
1164
+ "logps/chosen": -536.796630859375,
1165
+ "logps/rejected": -645.6795654296875,
1166
+ "loss": 0.0119,
1167
+ "rewards/accuracies": 0.7562500238418579,
1168
+ "rewards/chosen": -2.09273624420166,
1169
+ "rewards/margins": 1.3475998640060425,
1170
+ "rewards/rejected": -3.440336227416992,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 1.76,
1175
+ "learning_rate": 4.292990551804171e-08,
1176
+ "logits/chosen": 0.11955185234546661,
1177
+ "logits/rejected": 0.2987907826900482,
1178
+ "logps/chosen": -521.8675537109375,
1179
+ "logps/rejected": -622.3560791015625,
1180
+ "loss": 0.0115,
1181
+ "rewards/accuracies": 0.800000011920929,
1182
+ "rewards/chosen": -1.9727070331573486,
1183
+ "rewards/margins": 1.207002878189087,
1184
+ "rewards/rejected": -3.1797099113464355,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 1.78,
1189
+ "learning_rate": 3.581603349196371e-08,
1190
+ "logits/chosen": 0.12183141708374023,
1191
+ "logits/rejected": 0.24950018525123596,
1192
+ "logps/chosen": -529.2427978515625,
1193
+ "logps/rejected": -662.9299926757812,
1194
+ "loss": 0.0112,
1195
+ "rewards/accuracies": 0.8374999761581421,
1196
+ "rewards/chosen": -2.024509906768799,
1197
+ "rewards/margins": 1.5907318592071533,
1198
+ "rewards/rejected": -3.615241289138794,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 1.8,
1203
+ "learning_rate": 2.9324414157151367e-08,
1204
+ "logits/chosen": 0.11247365176677704,
1205
+ "logits/rejected": 0.28803473711013794,
1206
+ "logps/chosen": -538.6015625,
1207
+ "logps/rejected": -616.6097412109375,
1208
+ "loss": 0.0105,
1209
+ "rewards/accuracies": 0.7437499761581421,
1210
+ "rewards/chosen": -2.005286693572998,
1211
+ "rewards/margins": 1.320533037185669,
1212
+ "rewards/rejected": -3.325819492340088,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 1.82,
1217
+ "learning_rate": 2.3463749726290284e-08,
1218
+ "logits/chosen": 0.09726160764694214,
1219
+ "logits/rejected": 0.3085189759731293,
1220
+ "logps/chosen": -527.7420043945312,
1221
+ "logps/rejected": -666.7064208984375,
1222
+ "loss": 0.0114,
1223
+ "rewards/accuracies": 0.78125,
1224
+ "rewards/chosen": -1.963595986366272,
1225
+ "rewards/margins": 1.6061077117919922,
1226
+ "rewards/rejected": -3.5697035789489746,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 1.84,
1231
+ "learning_rate": 1.824189659787284e-08,
1232
+ "logits/chosen": 0.19652321934700012,
1233
+ "logits/rejected": 0.2885872423648834,
1234
+ "logps/chosen": -515.560546875,
1235
+ "logps/rejected": -641.10791015625,
1236
+ "loss": 0.0111,
1237
+ "rewards/accuracies": 0.762499988079071,
1238
+ "rewards/chosen": -1.9605176448822021,
1239
+ "rewards/margins": 1.3721264600753784,
1240
+ "rewards/rejected": -3.33264422416687,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 1.86,
1245
+ "learning_rate": 1.3665854824458035e-08,
1246
+ "logits/chosen": 0.16733339428901672,
1247
+ "logits/rejected": 0.3634529113769531,
1248
+ "logps/chosen": -542.18505859375,
1249
+ "logps/rejected": -629.7310791015625,
1250
+ "loss": 0.0115,
1251
+ "rewards/accuracies": 0.768750011920929,
1252
+ "rewards/chosen": -2.0391831398010254,
1253
+ "rewards/margins": 1.1835925579071045,
1254
+ "rewards/rejected": -3.2227752208709717,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 1.88,
1259
+ "learning_rate": 9.741758728888217e-09,
1260
+ "logits/chosen": 0.08950433880090714,
1261
+ "logits/rejected": 0.2665843069553375,
1262
+ "logps/chosen": -533.1641845703125,
1263
+ "logps/rejected": -621.0523681640625,
1264
+ "loss": 0.0113,
1265
+ "rewards/accuracies": 0.675000011920929,
1266
+ "rewards/chosen": -1.9605424404144287,
1267
+ "rewards/margins": 1.1125773191452026,
1268
+ "rewards/rejected": -3.073119640350342,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 1.91,
1273
+ "learning_rate": 6.474868681043577e-09,
1274
+ "logits/chosen": 0.13345034420490265,
1275
+ "logits/rejected": 0.2458508014678955,
1276
+ "logps/chosen": -523.0572509765625,
1277
+ "logps/rejected": -666.5548706054688,
1278
+ "loss": 0.0107,
1279
+ "rewards/accuracies": 0.75,
1280
+ "rewards/chosen": -2.094968557357788,
1281
+ "rewards/margins": 1.4136923551559448,
1282
+ "rewards/rejected": -3.5086607933044434,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 1.93,
1287
+ "learning_rate": 3.869564046156459e-09,
1288
+ "logits/chosen": 0.17636564373970032,
1289
+ "logits/rejected": 0.24904970824718475,
1290
+ "logps/chosen": -521.7586669921875,
1291
+ "logps/rejected": -661.547119140625,
1292
+ "loss": 0.0115,
1293
+ "rewards/accuracies": 0.7875000238418579,
1294
+ "rewards/chosen": -2.0953400135040283,
1295
+ "rewards/margins": 1.3953152894973755,
1296
+ "rewards/rejected": -3.4906551837921143,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 1.95,
1301
+ "learning_rate": 1.929337314139412e-09,
1302
+ "logits/chosen": 0.1708141714334488,
1303
+ "logits/rejected": 0.2874212861061096,
1304
+ "logps/chosen": -481.3929138183594,
1305
+ "logps/rejected": -591.492431640625,
1306
+ "loss": 0.0107,
1307
+ "rewards/accuracies": 0.737500011920929,
1308
+ "rewards/chosen": -1.8482071161270142,
1309
+ "rewards/margins": 1.3176212310791016,
1310
+ "rewards/rejected": -3.165828227996826,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 1.97,
1315
+ "learning_rate": 6.567894177967325e-10,
1316
+ "logits/chosen": 0.1810809224843979,
1317
+ "logits/rejected": 0.3499010503292084,
1318
+ "logps/chosen": -509.21966552734375,
1319
+ "logps/rejected": -619.0591430664062,
1320
+ "loss": 0.0119,
1321
+ "rewards/accuracies": 0.737500011920929,
1322
+ "rewards/chosen": -1.7878868579864502,
1323
+ "rewards/margins": 1.3797376155853271,
1324
+ "rewards/rejected": -3.1676242351531982,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 1.99,
1329
+ "learning_rate": 5.3626246194704575e-11,
1330
+ "logits/chosen": 0.12432925403118134,
1331
+ "logits/rejected": 0.1847553700208664,
1332
+ "logps/chosen": -471.4737854003906,
1333
+ "logps/rejected": -620.7115478515625,
1334
+ "loss": 0.0121,
1335
+ "rewards/accuracies": 0.800000011920929,
1336
+ "rewards/chosen": -1.8229620456695557,
1337
+ "rewards/margins": 1.5415856838226318,
1338
+ "rewards/rejected": -3.3645477294921875,
1339
+ "step": 950
1340
+ },
1341
  {
1342
  "epoch": 2.0,
1343
+ "step": 954,
1344
  "total_flos": 0.0,
1345
+ "train_loss": 0.050850671487596796,
1346
+ "train_runtime": 12712.7589,
1347
+ "train_samples_per_second": 9.618,
1348
+ "train_steps_per_second": 0.075
1349
  }
1350
  ],
1351
  "logging_steps": 10,
1352
+ "max_steps": 954,
1353
  "num_train_epochs": 2,
1354
  "save_steps": 10000,
1355
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1091900189f2bf20317d25f99163db88a112c122ef6171e24251c1551cf023ed
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bcefccada842dee8d69c04461efc22f005902c17e1d735f351279d99f9c2e09
3
  size 6648