wzhouad committed on
Commit
c475147
1 Parent(s): 414bcf1

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,18 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.2611
21
- - Rewards/chosen: -1.1614
22
- - Rewards/rejected: -2.0289
23
- - Rewards/accuracies: 0.7461
24
- - Rewards/margins: 0.8674
25
- - Logps/rejected: -460.2387
26
- - Logps/chosen: -373.1811
27
- - Logits/rejected: -2.1811
28
- - Logits/chosen: -2.2167
 
 
 
29
 
30
  ## Model description
31
 
@@ -60,12 +63,12 @@ The following hyperparameters were used during training:
60
 
61
  ### Training results
62
 
63
- | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
- |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.2925 | 0.21 | 100 | 0.3029 | -0.7549 | -1.2071 | 0.7031 | 0.4522 | -378.0666 | -332.5286 | -2.6453 | -2.6608 |
66
- | 0.2756 | 0.42 | 200 | 0.2765 | -1.0186 | -1.7149 | 0.7148 | 0.6963 | -428.8432 | -358.9036 | -2.3729 | -2.3947 |
67
- | 0.2684 | 0.63 | 300 | 0.2669 | -1.2042 | -2.0211 | 0.7422 | 0.8169 | -459.4592 | -377.4594 | -2.2540 | -2.2836 |
68
- | 0.2654 | 0.84 | 400 | 0.2611 | -1.1614 | -2.0289 | 0.7461 | 0.8674 | -460.2387 | -373.1811 | -2.1811 | -2.2167 |
69
 
70
 
71
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.1314
21
+ - Rewards/chosen: -1.5200
22
+ - Rewards/rejected: -2.4344
23
+ - Rewards/accuracies: 0.75
24
+ - Rewards/margins: 0.9144
25
+ - Logps/rejected: -500.7934
26
+ - Logps/chosen: -409.0388
27
+ - Logits/rejected: -2.1508
28
+ - Logits/chosen: -2.1830
29
+ - Debug/policy Weights: 0.2589
30
+ - Debug/losses: 0.1297
31
+ - Debug/raw Losses: 0.4817
32
 
33
  ## Model description
34
 
 
63
 
64
  ### Training results
65
 
66
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Debug/policy Weights | Debug/losses | Debug/raw Losses |
67
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:--------------------:|:------------:|:----------------:|
68
+ | 0.2168 | 0.21 | 100 | 0.2150 | -0.5440 | -1.0580 | 0.7383 | 0.5141 | -363.1571 | -311.4377 | -2.6827 | -2.6979 | 0.3735 | 0.2082 | 0.5529 |
69
+ | 0.1396 | 0.42 | 200 | 0.1416 | -1.3480 | -2.1286 | 0.7656 | 0.7807 | -470.2158 | -391.8350 | -2.2733 | -2.2968 | 0.2687 | 0.1390 | 0.5030 |
70
+ | 0.1294 | 0.63 | 300 | 0.1309 | -1.6003 | -2.4486 | 0.7383 | 0.8483 | -502.2112 | -417.0714 | -2.1589 | -2.1885 | 0.2545 | 0.1284 | 0.4935 |
71
+ | 0.1329 | 0.84 | 400 | 0.1314 | -1.5200 | -2.4344 | 0.75 | 0.9144 | -500.7934 | -409.0388 | -2.1508 | -2.1830 | 0.2589 | 0.1297 | 0.4817 |
72
 
73
 
74
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.28123937291580264,
4
- "train_runtime": 4275.6877,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 14.298,
7
- "train_steps_per_second": 0.112
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17621936496331603,
4
+ "train_runtime": 4510.4366,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.554,
7
+ "train_steps_per_second": 0.106
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f818db06c2df0daed1173424964a76bf104c6880c792c7504a1b3c8b482a1c3b
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cc7ca4a2419236436e6c007340b593bba2a32c8ae2632430bedae57caa0e7d
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de40456c8404b1e8967e7e253ec7b090cd274d4aa84367faa3c245eb1094aa0a
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a18ed3e61ca7e185538cfad27e95e49182c44b8e7cf83e0a3e9e75dfa4402b4
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5329c6f04ca5c32ed2b8ff84211932e80751ed496ad55cdc96bffbad28dc7f55
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe431dd8a696ee319a6385010068d99a7950c45d95d27f41443264941966e681
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.28123937291580264,
4
- "train_runtime": 4275.6877,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 14.298,
7
- "train_steps_per_second": 0.112
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17621936496331603,
4
+ "train_runtime": 4510.4366,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.554,
7
+ "train_steps_per_second": 0.106
8
  }
trainer_state.json CHANGED
@@ -9,13 +9,16 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
 
 
 
12
  "epoch": 0.0,
13
  "learning_rate": 1.0416666666666666e-08,
14
  "logits/chosen": -2.8099329471588135,
15
  "logits/rejected": -2.7572641372680664,
16
  "logps/chosen": -241.48843383789062,
17
  "logps/rejected": -197.4517822265625,
18
- "loss": 0.3419,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,735 +26,888 @@
23
  "step": 1
24
  },
25
  {
 
 
 
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-07,
28
- "logits/chosen": -2.8319878578186035,
29
- "logits/rejected": -2.8086318969726562,
30
- "logps/chosen": -292.67718505859375,
31
- "logps/rejected": -278.6147766113281,
32
- "loss": 0.357,
33
- "rewards/accuracies": 0.4166666567325592,
34
- "rewards/chosen": 0.000136316564748995,
35
- "rewards/margins": 0.00042209154344163835,
36
- "rewards/rejected": -0.0002857750514522195,
37
  "step": 10
38
  },
39
  {
 
 
 
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333333e-07,
42
- "logits/chosen": -2.8108882904052734,
43
- "logits/rejected": -2.782381057739258,
44
- "logps/chosen": -290.30352783203125,
45
- "logps/rejected": -290.828125,
46
- "loss": 0.3442,
47
- "rewards/accuracies": 0.59375,
48
- "rewards/chosen": 0.0009612235007807612,
49
- "rewards/margins": 0.0021439972333610058,
50
- "rewards/rejected": -0.0011827738489955664,
51
  "step": 20
52
  },
53
  {
 
 
 
54
  "epoch": 0.06,
55
  "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": -2.753178834915161,
57
- "logits/rejected": -2.722262382507324,
58
- "logps/chosen": -246.39126586914062,
59
- "logps/rejected": -227.2058868408203,
60
- "loss": 0.3474,
61
- "rewards/accuracies": 0.643750011920929,
62
- "rewards/chosen": -0.0002911156916525215,
63
- "rewards/margins": 0.008968978188931942,
64
- "rewards/rejected": -0.009260093793272972,
65
  "step": 30
66
  },
67
  {
 
 
 
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
- "logits/chosen": -2.7772631645202637,
71
- "logits/rejected": -2.7496306896209717,
72
- "logps/chosen": -299.43017578125,
73
- "logps/rejected": -264.92559814453125,
74
- "loss": 0.3437,
75
- "rewards/accuracies": 0.706250011920929,
76
- "rewards/chosen": 0.0009144862997345626,
77
- "rewards/margins": 0.037536174058914185,
78
- "rewards/rejected": -0.03662168234586716,
79
  "step": 40
80
  },
81
  {
 
 
 
82
  "epoch": 0.1,
83
  "learning_rate": 4.999733114418725e-07,
84
- "logits/chosen": -2.6782639026641846,
85
- "logits/rejected": -2.6587812900543213,
86
- "logps/chosen": -279.86041259765625,
87
- "logps/rejected": -272.60748291015625,
88
- "loss": 0.3357,
89
  "rewards/accuracies": 0.65625,
90
- "rewards/chosen": -0.024299880489706993,
91
- "rewards/margins": 0.06480460613965988,
92
- "rewards/rejected": -0.08910447359085083,
93
  "step": 50
94
  },
95
  {
 
 
 
96
  "epoch": 0.13,
97
  "learning_rate": 4.990398100856366e-07,
98
- "logits/chosen": -2.684187173843384,
99
- "logits/rejected": -2.6644179821014404,
100
- "logps/chosen": -266.75103759765625,
101
- "logps/rejected": -249.3052215576172,
102
- "loss": 0.3212,
103
- "rewards/accuracies": 0.59375,
104
- "rewards/chosen": -0.07674752175807953,
105
- "rewards/margins": 0.09015476703643799,
106
- "rewards/rejected": -0.1669023185968399,
107
  "step": 60
108
  },
109
  {
 
 
 
110
  "epoch": 0.15,
111
  "learning_rate": 4.967775735898179e-07,
112
- "logits/chosen": -2.661471366882324,
113
- "logits/rejected": -2.6346983909606934,
114
- "logps/chosen": -283.1821594238281,
115
- "logps/rejected": -263.30267333984375,
116
- "loss": 0.3188,
117
- "rewards/accuracies": 0.6312500238418579,
118
- "rewards/chosen": -0.14759597182273865,
119
- "rewards/margins": 0.13659381866455078,
120
- "rewards/rejected": -0.28418979048728943,
121
  "step": 70
122
  },
123
  {
 
 
 
124
  "epoch": 0.17,
125
  "learning_rate": 4.931986719649298e-07,
126
- "logits/chosen": -2.6391332149505615,
127
- "logits/rejected": -2.63655948638916,
128
- "logps/chosen": -301.9135437011719,
129
- "logps/rejected": -322.2628479003906,
130
- "loss": 0.3009,
131
- "rewards/accuracies": 0.675000011920929,
132
- "rewards/chosen": -0.311817467212677,
133
- "rewards/margins": 0.2355634719133377,
134
- "rewards/rejected": -0.5473809242248535,
135
  "step": 80
136
  },
137
  {
 
 
 
138
  "epoch": 0.19,
139
  "learning_rate": 4.883222001996351e-07,
140
- "logits/chosen": -2.695049285888672,
141
- "logits/rejected": -2.6640732288360596,
142
- "logps/chosen": -298.87469482421875,
143
- "logps/rejected": -303.5993957519531,
144
- "loss": 0.3033,
145
- "rewards/accuracies": 0.6625000238418579,
146
- "rewards/chosen": -0.43910256028175354,
147
- "rewards/margins": 0.24994739890098572,
148
- "rewards/rejected": -0.6890498995780945,
149
  "step": 90
150
  },
151
  {
 
 
 
152
  "epoch": 0.21,
153
  "learning_rate": 4.821741763807186e-07,
154
- "logits/chosen": -2.7090964317321777,
155
- "logits/rejected": -2.6778430938720703,
156
- "logps/chosen": -359.72955322265625,
157
- "logps/rejected": -371.3741760253906,
158
- "loss": 0.2925,
159
- "rewards/accuracies": 0.65625,
160
- "rewards/chosen": -0.7029050588607788,
161
- "rewards/margins": 0.3322359025478363,
162
- "rewards/rejected": -1.0351407527923584,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
- "eval_logits/chosen": -2.660813570022583,
168
- "eval_logits/rejected": -2.6453118324279785,
169
- "eval_logps/chosen": -332.528564453125,
170
- "eval_logps/rejected": -378.06658935546875,
171
- "eval_loss": 0.30285975337028503,
172
- "eval_rewards/accuracies": 0.703125,
173
- "eval_rewards/chosen": -0.7548891305923462,
174
- "eval_rewards/margins": 0.452243834733963,
175
- "eval_rewards/rejected": -1.2071329355239868,
176
- "eval_runtime": 53.3188,
177
- "eval_samples_per_second": 37.51,
178
- "eval_steps_per_second": 0.6,
 
 
 
179
  "step": 100
180
  },
181
  {
 
 
 
182
  "epoch": 0.23,
183
  "learning_rate": 4.747874028753375e-07,
184
- "logits/chosen": -2.526386260986328,
185
- "logits/rejected": -2.497170925140381,
186
- "logps/chosen": -353.86492919921875,
187
- "logps/rejected": -355.2405700683594,
188
- "loss": 0.2989,
189
- "rewards/accuracies": 0.706250011920929,
190
- "rewards/chosen": -0.8376742601394653,
191
- "rewards/margins": 0.4966367185115814,
192
- "rewards/rejected": -1.3343110084533691,
193
  "step": 110
194
  },
195
  {
 
 
 
196
  "epoch": 0.25,
197
  "learning_rate": 4.662012913161997e-07,
198
- "logits/chosen": -2.6284193992614746,
199
- "logits/rejected": -2.597144603729248,
200
- "logps/chosen": -328.7850341796875,
201
- "logps/rejected": -369.0627746582031,
202
- "loss": 0.2923,
203
- "rewards/accuracies": 0.699999988079071,
204
- "rewards/chosen": -0.5993393659591675,
205
- "rewards/margins": 0.46277889609336853,
206
- "rewards/rejected": -1.0621182918548584,
207
  "step": 120
208
  },
209
  {
 
 
 
210
  "epoch": 0.27,
211
  "learning_rate": 4.5646165232345103e-07,
212
- "logits/chosen": -2.5967652797698975,
213
- "logits/rejected": -2.580361843109131,
214
- "logps/chosen": -338.71331787109375,
215
- "logps/rejected": -352.9530334472656,
216
- "loss": 0.2846,
217
- "rewards/accuracies": 0.6625000238418579,
218
- "rewards/chosen": -0.6421850323677063,
219
- "rewards/margins": 0.38081812858581543,
220
- "rewards/rejected": -1.023003101348877,
221
  "step": 130
222
  },
223
  {
 
 
 
224
  "epoch": 0.29,
225
  "learning_rate": 4.456204510851956e-07,
226
- "logits/chosen": -2.546405792236328,
227
- "logits/rejected": -2.5155086517333984,
228
- "logps/chosen": -361.4484558105469,
229
- "logps/rejected": -406.66717529296875,
230
- "loss": 0.2882,
231
- "rewards/accuracies": 0.706250011920929,
232
- "rewards/chosen": -0.8081401586532593,
233
- "rewards/margins": 0.5508965253829956,
234
- "rewards/rejected": -1.3590366840362549,
235
  "step": 140
236
  },
237
  {
 
 
 
238
  "epoch": 0.31,
239
  "learning_rate": 4.337355301007335e-07,
240
- "logits/chosen": -2.4885222911834717,
241
- "logits/rejected": -2.513378620147705,
242
- "logps/chosen": -290.5274963378906,
243
- "logps/rejected": -359.549560546875,
244
- "loss": 0.271,
245
- "rewards/accuracies": 0.731249988079071,
246
- "rewards/chosen": -0.6384859681129456,
247
- "rewards/margins": 0.5941485166549683,
248
- "rewards/rejected": -1.232634425163269,
249
  "step": 150
250
  },
251
  {
 
 
 
252
  "epoch": 0.33,
253
  "learning_rate": 4.2087030056579986e-07,
254
- "logits/chosen": -2.461970806121826,
255
- "logits/rejected": -2.4330391883850098,
256
- "logps/chosen": -348.7262878417969,
257
- "logps/rejected": -383.38458251953125,
258
- "loss": 0.2854,
259
- "rewards/accuracies": 0.668749988079071,
260
- "rewards/chosen": -0.9275447130203247,
261
- "rewards/margins": 0.4475277364253998,
262
- "rewards/rejected": -1.3750722408294678,
263
  "step": 160
264
  },
265
  {
 
 
 
266
  "epoch": 0.36,
267
  "learning_rate": 4.070934040463998e-07,
268
- "logits/chosen": -2.380707263946533,
269
- "logits/rejected": -2.3646743297576904,
270
- "logps/chosen": -368.96636962890625,
271
- "logps/rejected": -414.83905029296875,
272
- "loss": 0.2767,
273
- "rewards/accuracies": 0.7562500238418579,
274
- "rewards/chosen": -0.8679953813552856,
275
- "rewards/margins": 0.6787124872207642,
276
- "rewards/rejected": -1.5467078685760498,
277
  "step": 170
278
  },
279
  {
 
 
 
280
  "epoch": 0.38,
281
  "learning_rate": 3.9247834624635404e-07,
282
- "logits/chosen": -2.3971316814422607,
283
- "logits/rejected": -2.4019041061401367,
284
- "logps/chosen": -348.81622314453125,
285
- "logps/rejected": -395.7445068359375,
286
- "loss": 0.2742,
287
- "rewards/accuracies": 0.737500011920929,
288
- "rewards/chosen": -0.7963830232620239,
289
- "rewards/margins": 0.6072254776954651,
290
- "rewards/rejected": -1.4036084413528442,
291
  "step": 180
292
  },
293
  {
 
 
 
294
  "epoch": 0.4,
295
  "learning_rate": 3.7710310482256523e-07,
296
- "logits/chosen": -2.389106512069702,
297
- "logits/rejected": -2.3477344512939453,
298
- "logps/chosen": -376.10321044921875,
299
- "logps/rejected": -420.30364990234375,
300
- "loss": 0.2739,
301
- "rewards/accuracies": 0.762499988079071,
302
- "rewards/chosen": -0.8470739126205444,
303
- "rewards/margins": 0.7152097821235657,
304
- "rewards/rejected": -1.5622835159301758,
305
  "step": 190
306
  },
307
  {
 
 
 
308
  "epoch": 0.42,
309
  "learning_rate": 3.610497133404795e-07,
310
- "logits/chosen": -2.3799221515655518,
311
- "logits/rejected": -2.389249563217163,
312
- "logps/chosen": -386.76690673828125,
313
- "logps/rejected": -418.9993591308594,
314
- "loss": 0.2756,
315
- "rewards/accuracies": 0.71875,
316
- "rewards/chosen": -0.905994713306427,
317
- "rewards/margins": 0.6408411860466003,
318
- "rewards/rejected": -1.5468358993530273,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
- "eval_logits/chosen": -2.3946902751922607,
324
- "eval_logits/rejected": -2.3728690147399902,
325
- "eval_logps/chosen": -358.903564453125,
326
- "eval_logps/rejected": -428.84320068359375,
327
- "eval_loss": 0.276460200548172,
328
- "eval_rewards/accuracies": 0.71484375,
329
- "eval_rewards/chosen": -1.018639087677002,
330
- "eval_rewards/margins": 0.6962600946426392,
331
- "eval_rewards/rejected": -1.7148993015289307,
332
- "eval_runtime": 53.23,
333
- "eval_samples_per_second": 37.573,
334
- "eval_steps_per_second": 0.601,
 
 
 
335
  "step": 200
336
  },
337
  {
 
 
 
338
  "epoch": 0.44,
339
  "learning_rate": 3.4440382358952115e-07,
340
- "logits/chosen": -2.332733631134033,
341
- "logits/rejected": -2.316524028778076,
342
- "logps/chosen": -363.67376708984375,
343
- "logps/rejected": -420.559814453125,
344
- "loss": 0.273,
345
- "rewards/accuracies": 0.737500011920929,
346
- "rewards/chosen": -0.9064055681228638,
347
- "rewards/margins": 0.7798347473144531,
348
- "rewards/rejected": -1.6862401962280273,
349
  "step": 210
350
  },
351
  {
 
 
 
352
  "epoch": 0.46,
353
  "learning_rate": 3.272542485937368e-07,
354
- "logits/chosen": -2.33485746383667,
355
- "logits/rejected": -2.284698247909546,
356
- "logps/chosen": -357.8499755859375,
357
- "logps/rejected": -403.9745788574219,
358
- "loss": 0.2701,
359
- "rewards/accuracies": 0.737500011920929,
360
- "rewards/chosen": -0.8288432359695435,
361
- "rewards/margins": 0.5878358483314514,
362
- "rewards/rejected": -1.41667902469635,
363
  "step": 220
364
  },
365
  {
 
 
 
366
  "epoch": 0.48,
367
  "learning_rate": 3.096924887558854e-07,
368
- "logits/chosen": -2.3702471256256104,
369
- "logits/rejected": -2.314202308654785,
370
- "logps/chosen": -374.31231689453125,
371
- "logps/rejected": -429.17718505859375,
372
- "loss": 0.2771,
373
  "rewards/accuracies": 0.699999988079071,
374
- "rewards/chosen": -0.9976439476013184,
375
- "rewards/margins": 0.5939726829528809,
376
- "rewards/rejected": -1.5916167497634888,
377
  "step": 230
378
  },
379
  {
 
 
 
380
  "epoch": 0.5,
381
  "learning_rate": 2.9181224366319943e-07,
382
- "logits/chosen": -2.323148250579834,
383
- "logits/rejected": -2.2833874225616455,
384
- "logps/chosen": -378.198974609375,
385
- "logps/rejected": -421.3106994628906,
386
- "loss": 0.2546,
387
- "rewards/accuracies": 0.731249988079071,
388
- "rewards/chosen": -0.9202890396118164,
389
- "rewards/margins": 0.7598530650138855,
390
- "rewards/rejected": -1.6801420450210571,
391
  "step": 240
392
  },
393
  {
 
 
 
394
  "epoch": 0.52,
395
  "learning_rate": 2.7370891215954565e-07,
396
- "logits/chosen": -2.332274913787842,
397
- "logits/rejected": -2.2773728370666504,
398
- "logps/chosen": -363.24713134765625,
399
- "logps/rejected": -417.0872497558594,
400
- "loss": 0.2847,
401
- "rewards/accuracies": 0.737500011920929,
402
- "rewards/chosen": -0.9167481660842896,
403
- "rewards/margins": 0.709892988204956,
404
- "rewards/rejected": -1.6266412734985352,
405
  "step": 250
406
  },
407
  {
 
 
 
408
  "epoch": 0.54,
409
  "learning_rate": 2.55479083351317e-07,
410
- "logits/chosen": -2.330303907394409,
411
- "logits/rejected": -2.3061676025390625,
412
- "logps/chosen": -344.08203125,
413
- "logps/rejected": -414.8096618652344,
414
- "loss": 0.2653,
415
  "rewards/accuracies": 0.7124999761581421,
416
- "rewards/chosen": -0.849345862865448,
417
- "rewards/margins": 0.7162947058677673,
418
- "rewards/rejected": -1.5656404495239258,
419
  "step": 260
420
  },
421
  {
 
 
 
422
  "epoch": 0.56,
423
  "learning_rate": 2.3722002126275822e-07,
424
- "logits/chosen": -2.3339719772338867,
425
- "logits/rejected": -2.3088550567626953,
426
- "logps/chosen": -378.90631103515625,
427
- "logps/rejected": -447.6788635253906,
428
- "loss": 0.2631,
429
- "rewards/accuracies": 0.793749988079071,
430
- "rewards/chosen": -0.9045342206954956,
431
- "rewards/margins": 0.9216111302375793,
432
- "rewards/rejected": -1.8261455297470093,
433
  "step": 270
434
  },
435
  {
 
 
 
436
  "epoch": 0.59,
437
  "learning_rate": 2.19029145890313e-07,
438
- "logits/chosen": -2.2406225204467773,
439
- "logits/rejected": -2.229895830154419,
440
- "logps/chosen": -422.8426208496094,
441
- "logps/rejected": -476.47314453125,
442
- "loss": 0.2625,
443
- "rewards/accuracies": 0.8062499761581421,
444
- "rewards/chosen": -1.1509960889816284,
445
- "rewards/margins": 0.9580610394477844,
446
- "rewards/rejected": -2.1090569496154785,
447
  "step": 280
448
  },
449
  {
 
 
 
450
  "epoch": 0.61,
451
  "learning_rate": 2.0100351342479216e-07,
452
- "logits/chosen": -2.2886314392089844,
453
- "logits/rejected": -2.2553632259368896,
454
- "logps/chosen": -388.92877197265625,
455
- "logps/rejected": -442.0311584472656,
456
- "loss": 0.2692,
457
- "rewards/accuracies": 0.75,
458
- "rewards/chosen": -1.1116503477096558,
459
- "rewards/margins": 0.7756324410438538,
460
- "rewards/rejected": -1.8872827291488647,
461
  "step": 290
462
  },
463
  {
 
 
 
464
  "epoch": 0.63,
465
  "learning_rate": 1.8323929841460178e-07,
466
- "logits/chosen": -2.3207592964172363,
467
- "logits/rejected": -2.2983639240264893,
468
- "logps/chosen": -368.4107360839844,
469
- "logps/rejected": -431.96209716796875,
470
- "loss": 0.2684,
471
- "rewards/accuracies": 0.7124999761581421,
472
- "rewards/chosen": -1.168041467666626,
473
- "rewards/margins": 0.7282370924949646,
474
- "rewards/rejected": -1.896278738975525,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
- "eval_logits/chosen": -2.2835628986358643,
480
- "eval_logits/rejected": -2.2539806365966797,
481
- "eval_logps/chosen": -377.4593505859375,
482
- "eval_logps/rejected": -459.45916748046875,
483
- "eval_loss": 0.26686665415763855,
484
- "eval_rewards/accuracies": 0.7421875,
485
- "eval_rewards/chosen": -1.2041971683502197,
486
- "eval_rewards/margins": 0.8168618083000183,
487
- "eval_rewards/rejected": -2.021059036254883,
488
- "eval_runtime": 53.1986,
489
- "eval_samples_per_second": 37.595,
490
- "eval_steps_per_second": 0.602,
 
 
 
491
  "step": 300
492
  },
493
  {
 
 
 
494
  "epoch": 0.65,
495
  "learning_rate": 1.6583128063291573e-07,
496
- "logits/chosen": -2.257838010787964,
497
- "logits/rejected": -2.2076783180236816,
498
- "logps/chosen": -403.1900329589844,
499
- "logps/rejected": -454.09649658203125,
500
- "loss": 0.2691,
501
- "rewards/accuracies": 0.762499988079071,
502
- "rewards/chosen": -1.1548385620117188,
503
- "rewards/margins": 0.8283940553665161,
504
- "rewards/rejected": -1.9832324981689453,
505
  "step": 310
506
  },
507
  {
 
 
 
508
  "epoch": 0.67,
509
  "learning_rate": 1.488723393865766e-07,
510
- "logits/chosen": -2.218254804611206,
511
- "logits/rejected": -2.1774368286132812,
512
- "logps/chosen": -386.1689147949219,
513
- "logps/rejected": -453.9234924316406,
514
- "loss": 0.2502,
515
  "rewards/accuracies": 0.78125,
516
- "rewards/chosen": -1.003075361251831,
517
- "rewards/margins": 0.8829510807991028,
518
- "rewards/rejected": -1.886026382446289,
519
  "step": 320
520
  },
521
  {
 
 
 
522
  "epoch": 0.69,
523
  "learning_rate": 1.3245295796480788e-07,
524
- "logits/chosen": -2.2131848335266113,
525
- "logits/rejected": -2.1682944297790527,
526
- "logps/chosen": -386.908203125,
527
- "logps/rejected": -459.43939208984375,
528
- "loss": 0.2736,
529
- "rewards/accuracies": 0.737500011920929,
530
- "rewards/chosen": -1.1287882328033447,
531
- "rewards/margins": 0.8232651948928833,
532
- "rewards/rejected": -1.9520530700683594,
533
  "step": 330
534
  },
535
  {
 
 
 
536
  "epoch": 0.71,
537
  "learning_rate": 1.1666074087171627e-07,
538
- "logits/chosen": -2.238731622695923,
539
- "logits/rejected": -2.2161293029785156,
540
- "logps/chosen": -358.34014892578125,
541
- "logps/rejected": -442.72186279296875,
542
- "loss": 0.2605,
543
  "rewards/accuracies": 0.75,
544
- "rewards/chosen": -1.0374398231506348,
545
- "rewards/margins": 0.860522449016571,
546
- "rewards/rejected": -1.8979623317718506,
547
  "step": 340
548
  },
549
  {
 
 
 
550
  "epoch": 0.73,
551
  "learning_rate": 1.0157994641835734e-07,
552
- "logits/chosen": -2.2104313373565674,
553
- "logits/rejected": -2.178367853164673,
554
- "logps/chosen": -402.49273681640625,
555
- "logps/rejected": -467.699462890625,
556
- "loss": 0.2795,
557
- "rewards/accuracies": 0.737500011920929,
558
- "rewards/chosen": -1.2603912353515625,
559
- "rewards/margins": 0.7859227061271667,
560
- "rewards/rejected": -2.046314001083374,
561
  "step": 350
562
  },
563
  {
 
 
 
564
  "epoch": 0.75,
565
  "learning_rate": 8.729103716819111e-08,
566
- "logits/chosen": -2.172868013381958,
567
- "logits/rejected": -2.1350009441375732,
568
- "logps/chosen": -366.4203796386719,
569
- "logps/rejected": -455.3935546875,
570
- "loss": 0.2549,
571
- "rewards/accuracies": 0.7749999761581421,
572
- "rewards/chosen": -1.0803582668304443,
573
- "rewards/margins": 1.0399386882781982,
574
- "rewards/rejected": -2.1202971935272217,
575
  "step": 360
576
  },
577
  {
 
 
 
578
  "epoch": 0.77,
579
  "learning_rate": 7.387025063449081e-08,
580
- "logits/chosen": -2.226396322250366,
581
- "logits/rejected": -2.206636905670166,
582
- "logps/chosen": -384.2486572265625,
583
- "logps/rejected": -450.4458923339844,
584
- "loss": 0.2594,
585
- "rewards/accuracies": 0.7250000238418579,
586
- "rewards/chosen": -1.061886191368103,
587
- "rewards/margins": 0.8060476183891296,
588
- "rewards/rejected": -1.8679338693618774,
589
  "step": 370
590
  },
591
  {
 
 
 
592
  "epoch": 0.79,
593
  "learning_rate": 6.138919252022435e-08,
594
- "logits/chosen": -2.1346209049224854,
595
- "logits/rejected": -2.1192917823791504,
596
- "logps/chosen": -359.8027038574219,
597
- "logps/rejected": -446.1529235839844,
598
- "loss": 0.2575,
599
- "rewards/accuracies": 0.737500011920929,
600
- "rewards/chosen": -1.3696153163909912,
601
- "rewards/margins": 0.6622845530509949,
602
- "rewards/rejected": -2.031899929046631,
603
  "step": 380
604
  },
605
  {
 
 
 
606
  "epoch": 0.82,
607
  "learning_rate": 4.991445467064689e-08,
608
- "logits/chosen": -2.193723440170288,
609
- "logits/rejected": -2.1565771102905273,
610
- "logps/chosen": -371.32269287109375,
611
- "logps/rejected": -441.592041015625,
612
- "loss": 0.2676,
613
- "rewards/accuracies": 0.7749999761581421,
614
- "rewards/chosen": -1.1976280212402344,
615
- "rewards/margins": 0.8632476925849915,
616
- "rewards/rejected": -2.060875654220581,
617
  "step": 390
618
  },
619
  {
 
 
 
620
  "epoch": 0.84,
621
  "learning_rate": 3.9507259776993954e-08,
622
- "logits/chosen": -2.171326160430908,
623
- "logits/rejected": -2.108442783355713,
624
- "logps/chosen": -409.718994140625,
625
- "logps/rejected": -448.8550720214844,
626
- "loss": 0.2654,
627
- "rewards/accuracies": 0.800000011920929,
628
- "rewards/chosen": -1.2401525974273682,
629
- "rewards/margins": 0.7730957269668579,
630
- "rewards/rejected": -2.0132482051849365,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
- "eval_logits/chosen": -2.2167153358459473,
636
- "eval_logits/rejected": -2.1810660362243652,
637
- "eval_logps/chosen": -373.1811218261719,
638
- "eval_logps/rejected": -460.23870849609375,
639
- "eval_loss": 0.26106157898902893,
640
- "eval_rewards/accuracies": 0.74609375,
641
- "eval_rewards/chosen": -1.1614149808883667,
642
- "eval_rewards/margins": 0.8674393892288208,
643
- "eval_rewards/rejected": -2.0288543701171875,
644
- "eval_runtime": 53.1796,
645
- "eval_samples_per_second": 37.608,
646
- "eval_steps_per_second": 0.602,
 
 
 
647
  "step": 400
648
  },
649
  {
 
 
 
650
  "epoch": 0.86,
651
  "learning_rate": 3.022313472693447e-08,
652
- "logits/chosen": -2.215372085571289,
653
- "logits/rejected": -2.137930393218994,
654
- "logps/chosen": -390.9686279296875,
655
- "logps/rejected": -485.49688720703125,
656
- "loss": 0.2677,
657
- "rewards/accuracies": 0.737500011920929,
658
- "rewards/chosen": -1.2737891674041748,
659
- "rewards/margins": 0.7665437459945679,
660
- "rewards/rejected": -2.0403332710266113,
661
  "step": 410
662
  },
663
  {
 
 
 
664
  "epoch": 0.88,
665
  "learning_rate": 2.2111614344599684e-08,
666
- "logits/chosen": -2.21181058883667,
667
- "logits/rejected": -2.1742496490478516,
668
- "logps/chosen": -413.1441955566406,
669
- "logps/rejected": -491.66424560546875,
670
- "loss": 0.2569,
671
- "rewards/accuracies": 0.6812499761581421,
672
- "rewards/chosen": -1.263311505317688,
673
- "rewards/margins": 0.7292413711547852,
674
- "rewards/rejected": -1.9925527572631836,
675
  "step": 420
676
  },
677
  {
 
 
 
678
  "epoch": 0.9,
679
  "learning_rate": 1.521597710086439e-08,
680
- "logits/chosen": -2.1671276092529297,
681
- "logits/rejected": -2.1266417503356934,
682
- "logps/chosen": -410.6739807128906,
683
- "logps/rejected": -491.3251953125,
684
- "loss": 0.2689,
685
  "rewards/accuracies": 0.768750011920929,
686
- "rewards/chosen": -1.2042081356048584,
687
- "rewards/margins": 0.9651464223861694,
688
- "rewards/rejected": -2.1693546772003174,
689
  "step": 430
690
  },
691
  {
 
 
 
692
  "epoch": 0.92,
693
  "learning_rate": 9.57301420397924e-09,
694
- "logits/chosen": -2.2078280448913574,
695
- "logits/rejected": -2.1698689460754395,
696
- "logps/chosen": -446.28106689453125,
697
- "logps/rejected": -456.0550231933594,
698
- "loss": 0.2585,
699
- "rewards/accuracies": 0.6812499761581421,
700
- "rewards/chosen": -1.3123003244400024,
701
- "rewards/margins": 0.5923169851303101,
702
- "rewards/rejected": -1.9046173095703125,
703
  "step": 440
704
  },
705
  {
 
 
 
706
  "epoch": 0.94,
707
  "learning_rate": 5.212833302556258e-09,
708
- "logits/chosen": -2.163405179977417,
709
- "logits/rejected": -2.1259548664093018,
710
- "logps/chosen": -396.20977783203125,
711
- "logps/rejected": -447.4117126464844,
712
- "loss": 0.2577,
713
- "rewards/accuracies": 0.7437499761581421,
714
- "rewards/chosen": -1.1535217761993408,
715
- "rewards/margins": 0.8187162280082703,
716
- "rewards/rejected": -1.9722381830215454,
717
  "step": 450
718
  },
719
  {
 
 
 
720
  "epoch": 0.96,
721
  "learning_rate": 2.158697848236607e-09,
722
- "logits/chosen": -2.188662052154541,
723
- "logits/rejected": -2.1400952339172363,
724
- "logps/chosen": -417.8199157714844,
725
- "logps/rejected": -463.55078125,
726
- "loss": 0.2717,
727
- "rewards/accuracies": 0.7437499761581421,
728
- "rewards/chosen": -1.174331545829773,
729
- "rewards/margins": 0.7548145055770874,
730
- "rewards/rejected": -1.9291460514068604,
731
  "step": 460
732
  },
733
  {
 
 
 
734
  "epoch": 0.98,
735
  "learning_rate": 4.269029751107489e-10,
736
- "logits/chosen": -2.211068630218506,
737
- "logits/rejected": -2.1270031929016113,
738
- "logps/chosen": -402.3449401855469,
739
- "logps/rejected": -420.40814208984375,
740
- "loss": 0.2596,
741
  "rewards/accuracies": 0.699999988079071,
742
- "rewards/chosen": -1.041385531425476,
743
- "rewards/margins": 0.8322860598564148,
744
- "rewards/rejected": -1.8736717700958252,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 478,
750
  "total_flos": 0.0,
751
- "train_loss": 0.28123937291580264,
752
- "train_runtime": 4275.6877,
753
- "train_samples_per_second": 14.298,
754
- "train_steps_per_second": 0.112
755
  }
756
  ],
757
  "logging_steps": 10,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "debug/losses": 0.34217238426208496,
13
+ "debug/policy_weights": 0.4936503767967224,
14
+ "debug/raw_losses": 0.6931471824645996,
15
  "epoch": 0.0,
16
  "learning_rate": 1.0416666666666666e-08,
17
  "logits/chosen": -2.8099329471588135,
18
  "logits/rejected": -2.7572641372680664,
19
  "logps/chosen": -241.48843383789062,
20
  "logps/rejected": -197.4517822265625,
21
+ "loss": 0.3561,
22
  "rewards/accuracies": 0.0,
23
  "rewards/chosen": 0.0,
24
  "rewards/margins": 0.0,
 
26
  "step": 1
27
  },
28
  {
29
+ "debug/losses": 0.3613118529319763,
30
+ "debug/policy_weights": 0.5213115215301514,
31
+ "debug/raw_losses": 0.6931909918785095,
32
  "epoch": 0.02,
33
  "learning_rate": 1.0416666666666667e-07,
34
+ "logits/chosen": -2.8320045471191406,
35
+ "logits/rejected": -2.8085670471191406,
36
+ "logps/chosen": -292.685546875,
37
+ "logps/rejected": -278.5729064941406,
38
+ "loss": 0.3674,
39
+ "rewards/accuracies": 0.4236111044883728,
40
+ "rewards/chosen": 5.248460729490034e-05,
41
+ "rewards/margins": -7.99686458776705e-05,
42
+ "rewards/rejected": 0.00013245324953459203,
43
  "step": 10
44
  },
45
  {
46
+ "debug/losses": 0.3490375578403473,
47
+ "debug/policy_weights": 0.5044432878494263,
48
+ "debug/raw_losses": 0.6918557286262512,
49
  "epoch": 0.04,
50
  "learning_rate": 2.0833333333333333e-07,
51
+ "logits/chosen": -2.811972141265869,
52
+ "logits/rejected": -2.78340482711792,
53
+ "logps/chosen": -290.2806396484375,
54
+ "logps/rejected": -290.8512268066406,
55
+ "loss": 0.3549,
56
+ "rewards/accuracies": 0.612500011920929,
57
+ "rewards/chosen": 0.0011896035866811872,
58
+ "rewards/margins": 0.0026031401939690113,
59
+ "rewards/rejected": -0.0014135364908725023,
60
  "step": 20
61
  },
62
  {
63
+ "debug/losses": 0.3571945130825043,
64
+ "debug/policy_weights": 0.518287181854248,
65
+ "debug/raw_losses": 0.6891354322433472,
66
  "epoch": 0.06,
67
  "learning_rate": 3.1249999999999997e-07,
68
+ "logits/chosen": -2.759937047958374,
69
+ "logits/rejected": -2.7286112308502197,
70
+ "logps/chosen": -246.35159301757812,
71
+ "logps/rejected": -227.08651733398438,
72
+ "loss": 0.3602,
73
+ "rewards/accuracies": 0.637499988079071,
74
+ "rewards/chosen": 0.00010563675459707156,
75
+ "rewards/margins": 0.008171903900802135,
76
+ "rewards/rejected": -0.008066266775131226,
77
  "step": 30
78
  },
79
  {
80
+ "debug/losses": 0.3552504777908325,
81
+ "debug/policy_weights": 0.52684086561203,
82
+ "debug/raw_losses": 0.6755487322807312,
83
  "epoch": 0.08,
84
  "learning_rate": 4.1666666666666667e-07,
85
+ "logits/chosen": -2.7985873222351074,
86
+ "logits/rejected": -2.7698562145233154,
87
+ "logps/chosen": -298.9928283691406,
88
+ "logps/rejected": -264.44781494140625,
89
+ "loss": 0.3544,
90
+ "rewards/accuracies": 0.7124999761581421,
91
+ "rewards/chosen": 0.005288169719278812,
92
+ "rewards/margins": 0.03713225945830345,
93
+ "rewards/rejected": -0.031844086945056915,
94
  "step": 40
95
  },
96
  {
97
+ "debug/losses": 0.3372410833835602,
98
+ "debug/policy_weights": 0.50788414478302,
99
+ "debug/raw_losses": 0.6635575890541077,
100
  "epoch": 0.1,
101
  "learning_rate": 4.999733114418725e-07,
102
+ "logits/chosen": -2.719874143600464,
103
+ "logits/rejected": -2.698538064956665,
104
+ "logps/chosen": -279.17694091796875,
105
+ "logps/rejected": -272.10687255859375,
106
+ "loss": 0.3418,
107
  "rewards/accuracies": 0.65625,
108
+ "rewards/chosen": -0.017464958131313324,
109
+ "rewards/margins": 0.06663360446691513,
110
+ "rewards/rejected": -0.08409856259822845,
111
  "step": 50
112
  },
113
  {
114
+ "debug/losses": 0.32032984495162964,
115
+ "debug/policy_weights": 0.49015456438064575,
116
+ "debug/raw_losses": 0.6468743085861206,
117
  "epoch": 0.13,
118
  "learning_rate": 4.990398100856366e-07,
119
+ "logits/chosen": -2.741781234741211,
120
+ "logits/rejected": -2.718924045562744,
121
+ "logps/chosen": -264.25994873046875,
122
+ "logps/rejected": -249.49978637695312,
123
+ "loss": 0.3195,
124
+ "rewards/accuracies": 0.6499999761581421,
125
+ "rewards/chosen": -0.05183681100606918,
126
+ "rewards/margins": 0.11701079457998276,
127
+ "rewards/rejected": -0.16884759068489075,
128
  "step": 60
129
  },
130
  {
131
+ "debug/losses": 0.2948753237724304,
132
+ "debug/policy_weights": 0.4617387652397156,
133
+ "debug/raw_losses": 0.6441112756729126,
134
  "epoch": 0.15,
135
  "learning_rate": 4.967775735898179e-07,
136
+ "logits/chosen": -2.7274184226989746,
137
+ "logits/rejected": -2.697922706604004,
138
+ "logps/chosen": -283.56719970703125,
139
+ "logps/rejected": -265.3944091796875,
140
+ "loss": 0.2944,
141
+ "rewards/accuracies": 0.6499999761581421,
142
+ "rewards/chosen": -0.15144629776477814,
143
+ "rewards/margins": 0.15366096794605255,
144
+ "rewards/rejected": -0.3051072657108307,
145
  "step": 70
146
  },
147
  {
148
+ "debug/losses": 0.24030272662639618,
149
+ "debug/policy_weights": 0.39300116896629333,
150
+ "debug/raw_losses": 0.61235511302948,
151
  "epoch": 0.17,
152
  "learning_rate": 4.931986719649298e-07,
153
+ "logits/chosen": -2.6943717002868652,
154
+ "logits/rejected": -2.6917672157287598,
155
+ "logps/chosen": -306.8209533691406,
156
+ "logps/rejected": -331.4168701171875,
157
+ "loss": 0.2366,
158
+ "rewards/accuracies": 0.6875,
159
+ "rewards/chosen": -0.36089134216308594,
160
+ "rewards/margins": 0.27803000807762146,
161
+ "rewards/rejected": -0.638921320438385,
162
  "step": 80
163
  },
164
  {
165
+ "debug/losses": 0.21284589171409607,
166
+ "debug/policy_weights": 0.36208364367485046,
167
+ "debug/raw_losses": 0.6084356307983398,
168
  "epoch": 0.19,
169
  "learning_rate": 4.883222001996351e-07,
170
+ "logits/chosen": -2.742755889892578,
171
+ "logits/rejected": -2.7089426517486572,
172
+ "logps/chosen": -306.11700439453125,
173
+ "logps/rejected": -313.5643005371094,
174
+ "loss": 0.2153,
175
+ "rewards/accuracies": 0.675000011920929,
176
+ "rewards/chosen": -0.5115253925323486,
177
+ "rewards/margins": 0.27717387676239014,
178
+ "rewards/rejected": -0.7886992692947388,
179
  "step": 90
180
  },
181
  {
182
+ "debug/losses": 0.23982000350952148,
183
+ "debug/policy_weights": 0.39243510365486145,
184
+ "debug/raw_losses": 0.625305712223053,
185
  "epoch": 0.21,
186
  "learning_rate": 4.821741763807186e-07,
187
+ "logits/chosen": -2.754852533340454,
188
+ "logits/rejected": -2.7216262817382812,
189
+ "logps/chosen": -346.60498046875,
190
+ "logps/rejected": -359.0435485839844,
191
+ "loss": 0.2168,
192
+ "rewards/accuracies": 0.637499988079071,
193
+ "rewards/chosen": -0.5716595649719238,
194
+ "rewards/margins": 0.3401753306388855,
195
+ "rewards/rejected": -0.9118350148200989,
196
  "step": 100
197
  },
198
  {
199
  "epoch": 0.21,
200
+ "eval_debug/losses": 0.2082262486219406,
201
+ "eval_debug/policy_weights": 0.37346428632736206,
202
+ "eval_debug/raw_losses": 0.5528886318206787,
203
+ "eval_logits/chosen": -2.697880268096924,
204
+ "eval_logits/rejected": -2.6826982498168945,
205
+ "eval_logps/chosen": -311.4377136230469,
206
+ "eval_logps/rejected": -363.1571350097656,
207
+ "eval_loss": 0.21503373980522156,
208
+ "eval_rewards/accuracies": 0.73828125,
209
+ "eval_rewards/chosen": -0.5439806580543518,
210
+ "eval_rewards/margins": 0.5140582323074341,
211
+ "eval_rewards/rejected": -1.0580389499664307,
212
+ "eval_runtime": 53.0291,
213
+ "eval_samples_per_second": 37.715,
214
+ "eval_steps_per_second": 0.603,
215
  "step": 100
216
  },
217
  {
218
+ "debug/losses": 0.1748097836971283,
219
+ "debug/policy_weights": 0.3250483572483063,
220
+ "debug/raw_losses": 0.5473231077194214,
221
  "epoch": 0.23,
222
  "learning_rate": 4.747874028753375e-07,
223
+ "logits/chosen": -2.5498530864715576,
224
+ "logits/rejected": -2.5195746421813965,
225
+ "logps/chosen": -338.24639892578125,
226
+ "logps/rejected": -351.0300598144531,
227
+ "loss": 0.2041,
228
+ "rewards/accuracies": 0.6875,
229
+ "rewards/chosen": -0.6814893484115601,
230
+ "rewards/margins": 0.6107165813446045,
231
+ "rewards/rejected": -1.292205810546875,
232
  "step": 110
233
  },
234
  {
235
+ "debug/losses": 0.1997809112071991,
236
+ "debug/policy_weights": 0.35038530826568604,
237
+ "debug/raw_losses": 0.5690494775772095,
238
  "epoch": 0.25,
239
  "learning_rate": 4.662012913161997e-07,
240
+ "logits/chosen": -2.6230885982513428,
241
+ "logits/rejected": -2.5906565189361572,
242
+ "logps/chosen": -337.9828186035156,
243
+ "logps/rejected": -383.8708801269531,
244
+ "loss": 0.1885,
245
+ "rewards/accuracies": 0.7250000238418579,
246
+ "rewards/chosen": -0.6913173794746399,
247
+ "rewards/margins": 0.5188819169998169,
248
+ "rewards/rejected": -1.2101994752883911,
249
  "step": 120
250
  },
251
  {
252
+ "debug/losses": 0.17902129888534546,
253
+ "debug/policy_weights": 0.3178775906562805,
254
+ "debug/raw_losses": 0.5685083866119385,
255
  "epoch": 0.27,
256
  "learning_rate": 4.5646165232345103e-07,
257
+ "logits/chosen": -2.591015100479126,
258
+ "logits/rejected": -2.576317548751831,
259
+ "logps/chosen": -350.18768310546875,
260
+ "logps/rejected": -370.1181640625,
261
+ "loss": 0.1792,
262
+ "rewards/accuracies": 0.6937500238418579,
263
+ "rewards/chosen": -0.7569286227226257,
264
+ "rewards/margins": 0.4377259314060211,
265
+ "rewards/rejected": -1.1946544647216797,
266
  "step": 130
267
  },
268
  {
269
+ "debug/losses": 0.18843333423137665,
270
+ "debug/policy_weights": 0.3299049437046051,
271
+ "debug/raw_losses": 0.550617516040802,
272
  "epoch": 0.29,
273
  "learning_rate": 4.456204510851956e-07,
274
+ "logits/chosen": -2.570383071899414,
275
+ "logits/rejected": -2.5405478477478027,
276
+ "logps/chosen": -350.4234924316406,
277
+ "logps/rejected": -398.32403564453125,
278
+ "loss": 0.1848,
279
+ "rewards/accuracies": 0.7250000238418579,
280
+ "rewards/chosen": -0.69789057970047,
281
+ "rewards/margins": 0.577714741230011,
282
+ "rewards/rejected": -1.275605320930481,
283
  "step": 140
284
  },
285
  {
286
+ "debug/losses": 0.18008050322532654,
287
+ "debug/policy_weights": 0.34716594219207764,
288
+ "debug/raw_losses": 0.5200980305671692,
289
  "epoch": 0.31,
290
  "learning_rate": 4.337355301007335e-07,
291
+ "logits/chosen": -2.520066738128662,
292
+ "logits/rejected": -2.5489494800567627,
293
+ "logps/chosen": -280.36163330078125,
294
+ "logps/rejected": -348.47869873046875,
295
+ "loss": 0.1859,
296
+ "rewards/accuracies": 0.793749988079071,
297
+ "rewards/chosen": -0.5368272066116333,
298
+ "rewards/margins": 0.585098147392273,
299
+ "rewards/rejected": -1.1219253540039062,
300
  "step": 150
301
  },
302
  {
303
+ "debug/losses": 0.17896616458892822,
304
+ "debug/policy_weights": 0.3156106472015381,
305
+ "debug/raw_losses": 0.599802553653717,
306
  "epoch": 0.33,
307
  "learning_rate": 4.2087030056579986e-07,
308
+ "logits/chosen": -2.5032947063446045,
309
+ "logits/rejected": -2.476680278778076,
310
+ "logps/chosen": -341.430908203125,
311
+ "logps/rejected": -376.94244384765625,
312
+ "loss": 0.1759,
313
+ "rewards/accuracies": 0.675000011920929,
314
+ "rewards/chosen": -0.8545902967453003,
315
+ "rewards/margins": 0.4560604691505432,
316
+ "rewards/rejected": -1.3106508255004883,
317
  "step": 160
318
  },
319
  {
320
+ "debug/losses": 0.12902560830116272,
321
+ "debug/policy_weights": 0.25470516085624695,
322
+ "debug/raw_losses": 0.5344475507736206,
323
  "epoch": 0.36,
324
  "learning_rate": 4.070934040463998e-07,
325
+ "logits/chosen": -2.3815720081329346,
326
+ "logits/rejected": -2.365286350250244,
327
+ "logps/chosen": -394.91192626953125,
328
+ "logps/rejected": -446.85321044921875,
329
+ "loss": 0.1496,
330
+ "rewards/accuracies": 0.7437499761581421,
331
+ "rewards/chosen": -1.1274508237838745,
332
+ "rewards/margins": 0.7393988966941833,
333
+ "rewards/rejected": -1.866849660873413,
334
  "step": 170
335
  },
336
  {
337
+ "debug/losses": 0.12488824129104614,
338
+ "debug/policy_weights": 0.24605941772460938,
339
+ "debug/raw_losses": 0.5098173022270203,
340
  "epoch": 0.38,
341
  "learning_rate": 3.9247834624635404e-07,
342
+ "logits/chosen": -2.3833303451538086,
343
+ "logits/rejected": -2.389314889907837,
344
+ "logps/chosen": -389.9188232421875,
345
+ "logps/rejected": -450.25140380859375,
346
+ "loss": 0.1436,
347
+ "rewards/accuracies": 0.768750011920929,
348
+ "rewards/chosen": -1.2074089050292969,
349
+ "rewards/margins": 0.7412688136100769,
350
+ "rewards/rejected": -1.9486777782440186,
351
  "step": 180
352
  },
353
  {
354
+ "debug/losses": 0.1292407363653183,
355
+ "debug/policy_weights": 0.26770901679992676,
356
+ "debug/raw_losses": 0.48357778787612915,
357
  "epoch": 0.4,
358
  "learning_rate": 3.7710310482256523e-07,
359
+ "logits/chosen": -2.3599636554718018,
360
+ "logits/rejected": -2.3223559856414795,
361
+ "logps/chosen": -414.10699462890625,
362
+ "logps/rejected": -469.5603942871094,
363
+ "loss": 0.1421,
364
+ "rewards/accuracies": 0.793749988079071,
365
+ "rewards/chosen": -1.227112054824829,
366
+ "rewards/margins": 0.8277386426925659,
367
+ "rewards/rejected": -2.0548505783081055,
368
  "step": 190
369
  },
370
  {
371
+ "debug/losses": 0.15382704138755798,
372
+ "debug/policy_weights": 0.2786335349082947,
373
+ "debug/raw_losses": 0.5533261299133301,
374
  "epoch": 0.42,
375
  "learning_rate": 3.610497133404795e-07,
376
+ "logits/chosen": -2.2979869842529297,
377
+ "logits/rejected": -2.312802791595459,
378
+ "logps/chosen": -419.5997009277344,
379
+ "logps/rejected": -456.7002868652344,
380
+ "loss": 0.1396,
381
+ "rewards/accuracies": 0.731249988079071,
382
+ "rewards/chosen": -1.2343220710754395,
383
+ "rewards/margins": 0.6895232200622559,
384
+ "rewards/rejected": -1.9238452911376953,
385
  "step": 200
386
  },
387
  {
388
  "epoch": 0.42,
389
+ "eval_debug/losses": 0.13899114727973938,
390
+ "eval_debug/policy_weights": 0.2686985731124878,
391
+ "eval_debug/raw_losses": 0.5029721260070801,
392
+ "eval_logits/chosen": -2.2968499660491943,
393
+ "eval_logits/rejected": -2.273340940475464,
394
+ "eval_logps/chosen": -391.8349609375,
395
+ "eval_logps/rejected": -470.2158203125,
396
+ "eval_loss": 0.14160528779029846,
397
+ "eval_rewards/accuracies": 0.765625,
398
+ "eval_rewards/chosen": -1.3479530811309814,
399
+ "eval_rewards/margins": 0.7806724309921265,
400
+ "eval_rewards/rejected": -2.1286253929138184,
401
+ "eval_runtime": 52.9895,
402
+ "eval_samples_per_second": 37.743,
403
+ "eval_steps_per_second": 0.604,
404
  "step": 200
405
  },
406
  {
407
+ "debug/losses": 0.12412895262241364,
408
+ "debug/policy_weights": 0.26891231536865234,
409
+ "debug/raw_losses": 0.47236162424087524,
410
  "epoch": 0.44,
411
  "learning_rate": 3.4440382358952115e-07,
412
+ "logits/chosen": -2.2376811504364014,
413
+ "logits/rejected": -2.2135262489318848,
414
+ "logps/chosen": -395.38421630859375,
415
+ "logps/rejected": -459.84210205078125,
416
+ "loss": 0.1418,
417
+ "rewards/accuracies": 0.762499988079071,
418
+ "rewards/chosen": -1.2235102653503418,
419
+ "rewards/margins": 0.8555533289909363,
420
+ "rewards/rejected": -2.0790634155273438,
421
  "step": 210
422
  },
423
  {
424
+ "debug/losses": 0.13483984768390656,
425
+ "debug/policy_weights": 0.2588108479976654,
426
+ "debug/raw_losses": 0.5093666911125183,
427
  "epoch": 0.46,
428
  "learning_rate": 3.272542485937368e-07,
429
+ "logits/chosen": -2.2282791137695312,
430
+ "logits/rejected": -2.1731905937194824,
431
+ "logps/chosen": -391.88104248046875,
432
+ "logps/rejected": -457.2334899902344,
433
+ "loss": 0.1418,
434
+ "rewards/accuracies": 0.7562500238418579,
435
+ "rewards/chosen": -1.1691535711288452,
436
+ "rewards/margins": 0.7801142930984497,
437
+ "rewards/rejected": -1.9492677450180054,
438
  "step": 220
439
  },
440
  {
441
+ "debug/losses": 0.14488555490970612,
442
+ "debug/policy_weights": 0.26247432827949524,
443
+ "debug/raw_losses": 0.5395208597183228,
444
  "epoch": 0.48,
445
  "learning_rate": 3.096924887558854e-07,
446
+ "logits/chosen": -2.2395882606506348,
447
+ "logits/rejected": -2.1690821647644043,
448
+ "logps/chosen": -417.9415588378906,
449
+ "logps/rejected": -489.2323303222656,
450
+ "loss": 0.1451,
451
  "rewards/accuracies": 0.699999988079071,
452
+ "rewards/chosen": -1.433936595916748,
453
+ "rewards/margins": 0.758230984210968,
454
+ "rewards/rejected": -2.1921677589416504,
455
  "step": 230
456
  },
457
  {
458
+ "debug/losses": 0.1316806524991989,
459
+ "debug/policy_weights": 0.2545274794101715,
460
+ "debug/raw_losses": 0.5126517415046692,
461
  "epoch": 0.5,
462
  "learning_rate": 2.9181224366319943e-07,
463
+ "logits/chosen": -2.171607732772827,
464
+ "logits/rejected": -2.1234748363494873,
465
+ "logps/chosen": -426.31719970703125,
466
+ "logps/rejected": -475.4518127441406,
467
+ "loss": 0.1325,
468
+ "rewards/accuracies": 0.75,
469
+ "rewards/chosen": -1.4014716148376465,
470
+ "rewards/margins": 0.8200809359550476,
471
+ "rewards/rejected": -2.2215523719787598,
472
  "step": 240
473
  },
474
  {
475
+ "debug/losses": 0.14006611704826355,
476
+ "debug/policy_weights": 0.2591710388660431,
477
+ "debug/raw_losses": 0.5314095616340637,
478
  "epoch": 0.52,
479
  "learning_rate": 2.7370891215954565e-07,
480
+ "logits/chosen": -2.1962928771972656,
481
+ "logits/rejected": -2.1328587532043457,
482
+ "logps/chosen": -403.8672790527344,
483
+ "logps/rejected": -458.2118225097656,
484
+ "loss": 0.1446,
485
+ "rewards/accuracies": 0.762499988079071,
486
+ "rewards/chosen": -1.3229501247406006,
487
+ "rewards/margins": 0.7149368524551392,
488
+ "rewards/rejected": -2.03788685798645,
489
  "step": 250
490
  },
491
  {
492
+ "debug/losses": 0.14360225200653076,
493
+ "debug/policy_weights": 0.26045817136764526,
494
+ "debug/raw_losses": 0.5265286564826965,
495
  "epoch": 0.54,
496
  "learning_rate": 2.55479083351317e-07,
497
+ "logits/chosen": -2.1760964393615723,
498
+ "logits/rejected": -2.150712013244629,
499
+ "logps/chosen": -401.54754638671875,
500
+ "logps/rejected": -479.643310546875,
501
+ "loss": 0.1352,
502
  "rewards/accuracies": 0.7124999761581421,
503
+ "rewards/chosen": -1.4240009784698486,
504
+ "rewards/margins": 0.7899759411811829,
505
+ "rewards/rejected": -2.2139768600463867,
506
  "step": 260
507
  },
508
  {
509
+ "debug/losses": 0.11131677776575089,
510
+ "debug/policy_weights": 0.24807500839233398,
511
+ "debug/raw_losses": 0.47175589203834534,
512
  "epoch": 0.56,
513
  "learning_rate": 2.3722002126275822e-07,
514
+ "logits/chosen": -2.186707019805908,
515
+ "logits/rejected": -2.1590161323547363,
516
+ "logps/chosen": -427.5020446777344,
517
+ "logps/rejected": -495.8419494628906,
518
+ "loss": 0.1267,
519
+ "rewards/accuracies": 0.75,
520
+ "rewards/chosen": -1.3904914855957031,
521
+ "rewards/margins": 0.9172846078872681,
522
+ "rewards/rejected": -2.3077759742736816,
523
  "step": 270
524
  },
525
  {
526
+ "debug/losses": 0.11321704089641571,
527
+ "debug/policy_weights": 0.24243195354938507,
528
+ "debug/raw_losses": 0.4841908812522888,
529
  "epoch": 0.59,
530
  "learning_rate": 2.19029145890313e-07,
531
+ "logits/chosen": -2.1000428199768066,
532
+ "logits/rejected": -2.089947462081909,
533
+ "logps/chosen": -469.45867919921875,
534
+ "logps/rejected": -520.9786987304688,
535
+ "loss": 0.1249,
536
+ "rewards/accuracies": 0.8187500238418579,
537
+ "rewards/chosen": -1.617157220840454,
538
+ "rewards/margins": 0.9369556307792664,
539
+ "rewards/rejected": -2.554112672805786,
540
  "step": 280
541
  },
542
  {
543
+ "debug/losses": 0.11086218059062958,
544
+ "debug/policy_weights": 0.22184400260448456,
545
+ "debug/raw_losses": 0.5123878717422485,
546
  "epoch": 0.61,
547
  "learning_rate": 2.0100351342479216e-07,
548
+ "logits/chosen": -2.1559338569641113,
549
+ "logits/rejected": -2.1183762550354004,
550
+ "logps/chosen": -447.17138671875,
551
+ "logps/rejected": -499.66143798828125,
552
+ "loss": 0.1214,
553
+ "rewards/accuracies": 0.71875,
554
+ "rewards/chosen": -1.6940768957138062,
555
+ "rewards/margins": 0.7695088386535645,
556
+ "rewards/rejected": -2.46358585357666,
557
  "step": 290
558
  },
559
  {
560
+ "debug/losses": 0.13476888835430145,
561
+ "debug/policy_weights": 0.23854057490825653,
562
+ "debug/raw_losses": 0.5516811013221741,
563
  "epoch": 0.63,
564
  "learning_rate": 1.8323929841460178e-07,
565
+ "logits/chosen": -2.199371099472046,
566
+ "logits/rejected": -2.178723096847534,
567
+ "logps/chosen": -422.80450439453125,
568
+ "logps/rejected": -485.37890625,
569
+ "loss": 0.1294,
570
+ "rewards/accuracies": 0.7250000238418579,
571
+ "rewards/chosen": -1.7119791507720947,
572
+ "rewards/margins": 0.7184675335884094,
573
+ "rewards/rejected": -2.4304463863372803,
574
  "step": 300
575
  },
576
  {
577
  "epoch": 0.63,
578
+ "eval_debug/losses": 0.12840886414051056,
579
+ "eval_debug/policy_weights": 0.25453710556030273,
580
+ "eval_debug/raw_losses": 0.4935261309146881,
581
+ "eval_logits/chosen": -2.1884968280792236,
582
+ "eval_logits/rejected": -2.158949851989746,
583
+ "eval_logps/chosen": -417.07135009765625,
584
+ "eval_logps/rejected": -502.21124267578125,
585
+ "eval_loss": 0.13086578249931335,
586
+ "eval_rewards/accuracies": 0.73828125,
587
+ "eval_rewards/chosen": -1.6003175973892212,
588
+ "eval_rewards/margins": 0.8482623100280762,
589
+ "eval_rewards/rejected": -2.448579788208008,
590
+ "eval_runtime": 53.0489,
591
+ "eval_samples_per_second": 37.701,
592
+ "eval_steps_per_second": 0.603,
593
  "step": 300
594
  },
595
  {
596
+ "debug/losses": 0.12721626460552216,
597
+ "debug/policy_weights": 0.25444597005844116,
598
+ "debug/raw_losses": 0.49146708846092224,
599
  "epoch": 0.65,
600
  "learning_rate": 1.6583128063291573e-07,
601
+ "logits/chosen": -2.169189453125,
602
+ "logits/rejected": -2.12001895904541,
603
+ "logps/chosen": -451.0543518066406,
604
+ "logps/rejected": -501.00830078125,
605
+ "loss": 0.1339,
606
+ "rewards/accuracies": 0.7749999761581421,
607
+ "rewards/chosen": -1.6334816217422485,
608
+ "rewards/margins": 0.8188700675964355,
609
+ "rewards/rejected": -2.4523518085479736,
610
  "step": 310
611
  },
612
  {
613
+ "debug/losses": 0.1101643294095993,
614
+ "debug/policy_weights": 0.24654574692249298,
615
+ "debug/raw_losses": 0.46620503067970276,
616
  "epoch": 0.67,
617
  "learning_rate": 1.488723393865766e-07,
618
+ "logits/chosen": -2.1372463703155518,
619
+ "logits/rejected": -2.0980920791625977,
620
+ "logps/chosen": -443.265625,
621
+ "logps/rejected": -513.0931396484375,
622
+ "loss": 0.1228,
623
  "rewards/accuracies": 0.78125,
624
+ "rewards/chosen": -1.5740420818328857,
625
+ "rewards/margins": 0.9036803245544434,
626
+ "rewards/rejected": -2.477722644805908,
627
  "step": 320
628
  },
629
  {
630
+ "debug/losses": 0.12689927220344543,
631
+ "debug/policy_weights": 0.25536665320396423,
632
+ "debug/raw_losses": 0.49713826179504395,
633
  "epoch": 0.69,
634
  "learning_rate": 1.3245295796480788e-07,
635
+ "logits/chosen": -2.1500933170318604,
636
+ "logits/rejected": -2.1095988750457764,
637
+ "logps/chosen": -424.4521484375,
638
+ "logps/rejected": -503.61163330078125,
639
+ "loss": 0.1324,
640
+ "rewards/accuracies": 0.793749988079071,
641
+ "rewards/chosen": -1.5042273998260498,
642
+ "rewards/margins": 0.8895484805107117,
643
+ "rewards/rejected": -2.3937759399414062,
644
  "step": 330
645
  },
646
  {
647
+ "debug/losses": 0.11528172343969345,
648
+ "debug/policy_weights": 0.24339346587657928,
649
+ "debug/raw_losses": 0.48817843198776245,
650
  "epoch": 0.71,
651
  "learning_rate": 1.1666074087171627e-07,
652
+ "logits/chosen": -2.1724162101745605,
653
+ "logits/rejected": -2.1479129791259766,
654
+ "logps/chosen": -403.76806640625,
655
+ "logps/rejected": -496.89703369140625,
656
+ "loss": 0.1239,
657
  "rewards/accuracies": 0.75,
658
+ "rewards/chosen": -1.4917190074920654,
659
+ "rewards/margins": 0.9479940533638,
660
+ "rewards/rejected": -2.4397130012512207,
661
  "step": 340
662
  },
663
  {
664
+ "debug/losses": 0.11798451095819473,
665
+ "debug/policy_weights": 0.23595662415027618,
666
+ "debug/raw_losses": 0.5302962064743042,
667
  "epoch": 0.73,
668
  "learning_rate": 1.0157994641835734e-07,
669
+ "logits/chosen": -2.148632526397705,
670
+ "logits/rejected": -2.1165192127227783,
671
+ "logps/chosen": -442.53057861328125,
672
+ "logps/rejected": -513.9363403320312,
673
+ "loss": 0.1374,
674
+ "rewards/accuracies": 0.7124999761581421,
675
+ "rewards/chosen": -1.6607694625854492,
676
+ "rewards/margins": 0.8479129672050476,
677
+ "rewards/rejected": -2.5086822509765625,
678
  "step": 350
679
  },
680
  {
681
+ "debug/losses": 0.10292885452508926,
682
+ "debug/policy_weights": 0.22015142440795898,
683
+ "debug/raw_losses": 0.4573485255241394,
684
  "epoch": 0.75,
685
  "learning_rate": 8.729103716819111e-08,
686
+ "logits/chosen": -2.1121697425842285,
687
+ "logits/rejected": -2.0813915729522705,
688
+ "logps/chosen": -401.754150390625,
689
+ "logps/rejected": -494.1514587402344,
690
+ "loss": 0.1311,
691
+ "rewards/accuracies": 0.762499988079071,
692
+ "rewards/chosen": -1.4336955547332764,
693
+ "rewards/margins": 1.074181079864502,
694
+ "rewards/rejected": -2.5078768730163574,
695
  "step": 360
696
  },
697
  {
698
+ "debug/losses": 0.13746492564678192,
699
+ "debug/policy_weights": 0.25476521253585815,
700
+ "debug/raw_losses": 0.5358820557594299,
701
  "epoch": 0.77,
702
  "learning_rate": 7.387025063449081e-08,
703
+ "logits/chosen": -2.175912380218506,
704
+ "logits/rejected": -2.1576006412506104,
705
+ "logps/chosen": -416.6788024902344,
706
+ "logps/rejected": -477.13153076171875,
707
+ "loss": 0.1253,
708
+ "rewards/accuracies": 0.7124999761581421,
709
+ "rewards/chosen": -1.3861879110336304,
710
+ "rewards/margins": 0.7486017346382141,
711
+ "rewards/rejected": -2.1347897052764893,
712
  "step": 370
713
  },
714
  {
715
+ "debug/losses": 0.14446747303009033,
716
+ "debug/policy_weights": 0.2367408275604248,
717
+ "debug/raw_losses": 0.5707719326019287,
718
  "epoch": 0.79,
719
  "learning_rate": 6.138919252022435e-08,
720
+ "logits/chosen": -2.084808111190796,
721
+ "logits/rejected": -2.070844888687134,
722
+ "logps/chosen": -394.0305480957031,
723
+ "logps/rejected": -481.216552734375,
724
+ "loss": 0.1285,
725
+ "rewards/accuracies": 0.699999988079071,
726
+ "rewards/chosen": -1.7118937969207764,
727
+ "rewards/margins": 0.6706421375274658,
728
+ "rewards/rejected": -2.3825364112854004,
729
  "step": 380
730
  },
731
  {
732
+ "debug/losses": 0.12546047568321228,
733
+ "debug/policy_weights": 0.24170584976673126,
734
+ "debug/raw_losses": 0.4937317967414856,
735
  "epoch": 0.82,
736
  "learning_rate": 4.991445467064689e-08,
737
+ "logits/chosen": -2.1555511951446533,
738
+ "logits/rejected": -2.1209189891815186,
739
+ "logps/chosen": -406.2367248535156,
740
+ "logps/rejected": -479.50567626953125,
741
+ "loss": 0.1302,
742
+ "rewards/accuracies": 0.7875000238418579,
743
+ "rewards/chosen": -1.5467679500579834,
744
+ "rewards/margins": 0.8932439684867859,
745
+ "rewards/rejected": -2.440011501312256,
746
  "step": 390
747
  },
748
  {
749
+ "debug/losses": 0.13418573141098022,
750
+ "debug/policy_weights": 0.24948246777057648,
751
+ "debug/raw_losses": 0.5159034132957458,
752
  "epoch": 0.84,
753
  "learning_rate": 3.9507259776993954e-08,
754
+ "logits/chosen": -2.1299073696136475,
755
+ "logits/rejected": -2.0702521800994873,
756
+ "logps/chosen": -448.74298095703125,
757
+ "logps/rejected": -487.4234313964844,
758
+ "loss": 0.1329,
759
+ "rewards/accuracies": 0.7562500238418579,
760
+ "rewards/chosen": -1.6303924322128296,
761
+ "rewards/margins": 0.7685388326644897,
762
+ "rewards/rejected": -2.3989315032958984,
763
  "step": 400
764
  },
765
  {
766
  "epoch": 0.84,
767
+ "eval_debug/losses": 0.12970629334449768,
768
+ "eval_debug/policy_weights": 0.25886857509613037,
769
+ "eval_debug/raw_losses": 0.48170554637908936,
770
+ "eval_logits/chosen": -2.18296217918396,
771
+ "eval_logits/rejected": -2.1507883071899414,
772
+ "eval_logps/chosen": -409.0387878417969,
773
+ "eval_logps/rejected": -500.7933654785156,
774
+ "eval_loss": 0.1314304769039154,
775
+ "eval_rewards/accuracies": 0.75,
776
+ "eval_rewards/chosen": -1.519991159439087,
777
+ "eval_rewards/margins": 0.9144098162651062,
778
+ "eval_rewards/rejected": -2.434401035308838,
779
+ "eval_runtime": 53.0316,
780
+ "eval_samples_per_second": 37.713,
781
+ "eval_steps_per_second": 0.603,
782
  "step": 400
783
  },
784
  {
785
+ "debug/losses": 0.13898980617523193,
786
+ "debug/policy_weights": 0.24470162391662598,
787
+ "debug/raw_losses": 0.5698193907737732,
788
  "epoch": 0.86,
789
  "learning_rate": 3.022313472693447e-08,
790
+ "logits/chosen": -2.1896605491638184,
791
+ "logits/rejected": -2.1111254692077637,
792
+ "logps/chosen": -431.5157165527344,
793
+ "logps/rejected": -519.735595703125,
794
+ "loss": 0.1335,
795
+ "rewards/accuracies": 0.71875,
796
+ "rewards/chosen": -1.67926025390625,
797
+ "rewards/margins": 0.7034608125686646,
798
+ "rewards/rejected": -2.382721185684204,
799
  "step": 410
800
  },
801
  {
802
+ "debug/losses": 0.13722026348114014,
803
+ "debug/policy_weights": 0.25542253255844116,
804
+ "debug/raw_losses": 0.553850531578064,
805
  "epoch": 0.88,
806
  "learning_rate": 2.2111614344599684e-08,
807
+ "logits/chosen": -2.1816725730895996,
808
+ "logits/rejected": -2.1436891555786133,
809
+ "logps/chosen": -450.2588806152344,
810
+ "logps/rejected": -532.4340209960938,
811
+ "loss": 0.132,
812
+ "rewards/accuracies": 0.668749988079071,
813
+ "rewards/chosen": -1.6344585418701172,
814
+ "rewards/margins": 0.765791654586792,
815
+ "rewards/rejected": -2.40024995803833,
816
  "step": 420
817
  },
818
  {
819
+ "debug/losses": 0.11242847144603729,
820
+ "debug/policy_weights": 0.23566405475139618,
821
+ "debug/raw_losses": 0.474797785282135,
822
  "epoch": 0.9,
823
  "learning_rate": 1.521597710086439e-08,
824
+ "logits/chosen": -2.1367993354797363,
825
+ "logits/rejected": -2.101963520050049,
826
+ "logps/chosen": -447.2886657714844,
827
+ "logps/rejected": -528.4554443359375,
828
+ "loss": 0.135,
829
  "rewards/accuracies": 0.768750011920929,
830
+ "rewards/chosen": -1.57035493850708,
831
+ "rewards/margins": 0.9703021049499512,
832
+ "rewards/rejected": -2.5406570434570312,
833
  "step": 430
834
  },
835
  {
836
+ "debug/losses": 0.14394986629486084,
837
+ "debug/policy_weights": 0.2594669461250305,
838
+ "debug/raw_losses": 0.551374614238739,
839
  "epoch": 0.92,
840
  "learning_rate": 9.57301420397924e-09,
841
+ "logits/chosen": -2.1795363426208496,
842
+ "logits/rejected": -2.1459131240844727,
843
+ "logps/chosen": -479.01873779296875,
844
+ "logps/rejected": -490.289794921875,
845
+ "loss": 0.128,
846
+ "rewards/accuracies": 0.7250000238418579,
847
+ "rewards/chosen": -1.6396774053573608,
848
+ "rewards/margins": 0.6072880625724792,
849
+ "rewards/rejected": -2.2469656467437744,
850
  "step": 440
851
  },
852
  {
853
+ "debug/losses": 0.12136085331439972,
854
+ "debug/policy_weights": 0.25487110018730164,
855
+ "debug/raw_losses": 0.5141120553016663,
856
  "epoch": 0.94,
857
  "learning_rate": 5.212833302556258e-09,
858
+ "logits/chosen": -2.131383180618286,
859
+ "logits/rejected": -2.101245880126953,
860
+ "logps/chosen": -428.8694763183594,
861
+ "logps/rejected": -482.65380859375,
862
+ "loss": 0.1317,
863
+ "rewards/accuracies": 0.762499988079071,
864
+ "rewards/chosen": -1.4801188707351685,
865
+ "rewards/margins": 0.8445402383804321,
866
+ "rewards/rejected": -2.3246593475341797,
867
  "step": 450
868
  },
869
  {
870
+ "debug/losses": 0.13340520858764648,
871
+ "debug/policy_weights": 0.26456892490386963,
872
+ "debug/raw_losses": 0.5068139433860779,
873
  "epoch": 0.96,
874
  "learning_rate": 2.158697848236607e-09,
875
+ "logits/chosen": -2.1625778675079346,
876
+ "logits/rejected": -2.118445873260498,
877
+ "logps/chosen": -446.69012451171875,
878
+ "logps/rejected": -497.6316833496094,
879
+ "loss": 0.1415,
880
+ "rewards/accuracies": 0.7749999761581421,
881
+ "rewards/chosen": -1.46303391456604,
882
+ "rewards/margins": 0.8069203495979309,
883
+ "rewards/rejected": -2.2699544429779053,
884
  "step": 460
885
  },
886
  {
887
+ "debug/losses": 0.13318563997745514,
888
+ "debug/policy_weights": 0.2579984962940216,
889
+ "debug/raw_losses": 0.5174868702888489,
890
  "epoch": 0.98,
891
  "learning_rate": 4.269029751107489e-10,
892
+ "logits/chosen": -2.1812849044799805,
893
+ "logits/rejected": -2.100320816040039,
894
+ "logps/chosen": -435.08428955078125,
895
+ "logps/rejected": -452.8404235839844,
896
+ "loss": 0.1302,
897
  "rewards/accuracies": 0.699999988079071,
898
+ "rewards/chosen": -1.3687784671783447,
899
+ "rewards/margins": 0.8292155265808105,
900
+ "rewards/rejected": -2.1979942321777344,
901
  "step": 470
902
  },
903
  {
904
  "epoch": 1.0,
905
  "step": 478,
906
  "total_flos": 0.0,
907
+ "train_loss": 0.17621936496331603,
908
+ "train_runtime": 4510.4366,
909
+ "train_samples_per_second": 13.554,
910
+ "train_steps_per_second": 0.106
911
  }
912
  ],
913
  "logging_steps": 10,