RikkiXu committed on
Commit 4dbbd2f
1 Parent(s): 8692c3b

Model save

README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model was trained from scratch on the None dataset.
 It achieves the following results on the evaluation set:
- - Loss: 0.1420
- - Rewards/chosen: 5.3389
- - Rewards/rejected: -11.5415
- - Rewards/accuracies: 0.9258
- - Rewards/margins: 16.8803
- - Logps/rejected: -131.7123
- - Logps/chosen: -119.9761
- - Logits/rejected: -1.8130
- - Logits/chosen: -1.8134
+ - Loss: 0.1390
+ - Rewards/chosen: 3.4895
+ - Rewards/rejected: -9.2522
+ - Rewards/accuracies: 0.9297
+ - Rewards/margins: 12.7417
+ - Logps/rejected: -139.5015
+ - Logps/chosen: -120.3246
+ - Logits/rejected: -1.8106
+ - Logits/chosen: -1.8098
 
 ## Model description
 
@@ -60,10 +60,10 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
- | 0.1496 | 0.21 | 100 | 0.1356 | 4.1320 | -11.2810 | 0.9414 | 15.4129 | -130.8439 | -123.9990 | -1.7910 | -1.8010 |
- | 0.1795 | 0.42 | 200 | 0.1364 | 5.2675 | -11.0420 | 0.9336 | 16.3095 | -130.0476 | -120.2140 | -1.8607 | -1.8614 |
- | 0.1585 | 0.63 | 300 | 0.1425 | 5.1387 | -11.7029 | 0.9258 | 16.8416 | -132.2504 | -120.6432 | -1.7960 | -1.7980 |
- | 0.2005 | 0.84 | 400 | 0.1420 | 5.3389 | -11.5415 | 0.9258 | 16.8803 | -131.7123 | -119.9761 | -1.8130 | -1.8134 |
+ | 0.1523 | 0.21 | 100 | 0.1399 | 2.5441 | -8.9516 | 0.9375 | 11.4956 | -137.9985 | -125.0519 | -1.8014 | -1.8101 |
+ | 0.176 | 0.42 | 200 | 0.1358 | 3.3974 | -8.7531 | 0.9375 | 12.1505 | -137.0064 | -120.7853 | -1.8762 | -1.8764 |
+ | 0.1509 | 0.63 | 300 | 0.1403 | 3.3534 | -9.3163 | 0.9336 | 12.6696 | -139.8221 | -121.0054 | -1.7873 | -1.7875 |
+ | 0.2009 | 0.84 | 400 | 0.1390 | 3.4895 | -9.2522 | 0.9297 | 12.7417 | -139.5015 | -120.3246 | -1.8106 | -1.8098 |
 
 
 ### Framework versions
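
The reward figures in the updated card are internally consistent: the margin is simply the chosen reward minus the rejected reward. A minimal check in Python against the new evaluation numbers above (a sketch, using only values reported in this README):

```python
# Sanity-check the updated evaluation metrics from README.md:
# rewards/margins should equal rewards/chosen - rewards/rejected.
rewards_chosen = 3.4895
rewards_rejected = -9.2522
rewards_margins = 12.7417

assert abs((rewards_chosen - rewards_rejected) - rewards_margins) < 1e-3
```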
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
   "epoch": 1.0,
-  "train_loss": 0.17749474911510196,
-  "train_runtime": 7645.2484,
+  "train_loss": 0.17478099153630405,
+  "train_runtime": 7645.1097,
   "train_samples": 61135,
-  "train_samples_per_second": 7.996,
+  "train_samples_per_second": 7.997,
   "train_steps_per_second": 0.063
 }
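
The throughput fields in all_results.json follow directly from the other values: train_samples / train_runtime gives samples per second, and the 478 optimizer steps recorded in trainer_state.json below give steps per second. A quick check of the updated numbers (a sketch, assuming those are the only inputs):

```python
# Recompute the derived throughput fields reported in all_results.json.
train_samples = 61135
train_runtime = 7645.1097   # seconds
total_steps = 478           # final "step" value in trainer_state.json below

print(round(train_samples / train_runtime, 3))  # 7.997 -> train_samples_per_second
print(round(total_steps / train_runtime, 3))    # 0.063 -> train_steps_per_second
```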
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:5505459ad98890a3203705be3e3b4413c1fdb1bed021f564a6052d2b7286ba53
+ oid sha256:c8fe36024d6aef3b53eef0e9ec42db4750fe5b59749cc0d952e83d3dc0c53701
 size 4943178720
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:f9cd4bbebcca95695bc07db64291f4364758bb91a1c6301d838900b964109691
+ oid sha256:27281dd482564b4b96afafdbf09d408b28d4f3d5fae15c627e97b427e5e9846f
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:6603cc6955b683f2147a8107dd2423db37551bb18757806d3089f527c647290d
+ oid sha256:3b852db6837b7ef3c2835e6c040e79055a89cbb91a5e93609a246227dc052c44
 size 4540532728
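
Each .safetensors entry above is a Git LFS pointer (version, oid sha256, size), so a downloaded shard can be verified locally by hashing it and comparing against the oid in this commit. A minimal sketch, assuming the first shard sits next to the script (path is illustrative):

```python
import hashlib

# Verify a downloaded shard against the Git LFS pointer shown above.
expected_oid = "c8fe36024d6aef3b53eef0e9ec42db4750fe5b59749cc0d952e83d3dc0c53701"
expected_size = 4943178720

sha = hashlib.sha256()
size = 0
with open("model-00001-of-00003.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
        size += len(chunk)

assert sha.hexdigest() == expected_oid and size == expected_size
```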
runs/May13_12-29-41_n136-129-074/events.out.tfevents.1715574720.n136-129-074.1898043.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:16d91bcfd42684acb3c691ead1f7c83163f47315ea3bc893779f9fcbf4cdd06c
- size 35913
+ oid sha256:73cbd01c36e5321599ce0320582bd802ecd27f095f002acc1abc694f14aa3849
+ size 41083
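
The updated events.out.tfevents file is the TensorBoard log for this run; its scalars can be inspected with TensorBoard's event accumulator. A sketch, assuming the tensorboard package is installed and that the tag names match whatever the trainer actually logged:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point the accumulator at the run directory containing the events file above.
ea = EventAccumulator("runs/May13_12-29-41_n136-129-074")
ea.Reload()

print(ea.Tags()["scalars"])  # list the scalar tags written during training
# ea.Scalars("<some_tag>") then returns (wall_time, step, value) records
# for whichever loss/reward tag the trainer wrote.
```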
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
   "epoch": 1.0,
-  "train_loss": 0.17749474911510196,
-  "train_runtime": 7645.2484,
+  "train_loss": 0.17478099153630405,
+  "train_runtime": 7645.1097,
   "train_samples": 61135,
-  "train_samples_per_second": 7.996,
+  "train_samples_per_second": 7.997,
   "train_steps_per_second": 0.063
 }
trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 1316.2617480695828,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -1.689455509185791,
16
  "logits/rejected": -1.4794573783874512,
@@ -25,780 +25,780 @@
25
  },
26
  {
27
  "epoch": 0.02,
28
- "grad_norm": 1084.7724692148897,
29
  "learning_rate": 1.0416666666666667e-07,
30
- "logits/chosen": -1.707624912261963,
31
- "logits/rejected": -1.6101186275482178,
32
- "logps/chosen": -139.66224670410156,
33
- "logps/rejected": -91.32621002197266,
34
- "loss": 0.686,
35
- "rewards/accuracies": 0.5,
36
- "rewards/chosen": 0.01818913221359253,
37
- "rewards/margins": 0.027222516015172005,
38
- "rewards/rejected": -0.009033381938934326,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
- "grad_norm": 372.5890585979663,
44
  "learning_rate": 2.0833333333333333e-07,
45
- "logits/chosen": -1.6384038925170898,
46
- "logits/rejected": -1.6487312316894531,
47
- "logps/chosen": -130.37515258789062,
48
- "logps/rejected": -93.99095153808594,
49
- "loss": 0.4495,
50
- "rewards/accuracies": 0.84375,
51
- "rewards/chosen": 0.6898423433303833,
52
- "rewards/margins": 0.9020320177078247,
53
- "rewards/rejected": -0.21218962967395782,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
- "grad_norm": 363.4016752055454,
59
  "learning_rate": 3.1249999999999997e-07,
60
- "logits/chosen": -1.6896642446517944,
61
- "logits/rejected": -1.6273959875106812,
62
- "logps/chosen": -130.80935668945312,
63
- "logps/rejected": -106.37054443359375,
64
- "loss": 0.2556,
65
- "rewards/accuracies": 0.9437500238418579,
66
- "rewards/chosen": 2.216484785079956,
67
- "rewards/margins": 3.602745771408081,
68
- "rewards/rejected": -1.386260986328125,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
- "grad_norm": 254.07955635631413,
74
  "learning_rate": 4.1666666666666667e-07,
75
- "logits/chosen": -1.6251739263534546,
76
- "logits/rejected": -1.5512189865112305,
77
- "logps/chosen": -142.7510528564453,
78
- "logps/rejected": -113.31219482421875,
79
- "loss": 0.1925,
80
- "rewards/accuracies": 0.925000011920929,
81
- "rewards/chosen": 2.6909842491149902,
82
- "rewards/margins": 7.113295078277588,
83
- "rewards/rejected": -4.422310829162598,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
- "grad_norm": 450.1083647442556,
89
  "learning_rate": 4.999733114418725e-07,
90
- "logits/chosen": -1.5827839374542236,
91
- "logits/rejected": -1.6095731258392334,
92
- "logps/chosen": -127.47169494628906,
93
- "logps/rejected": -124.7472152709961,
94
- "loss": 0.187,
95
- "rewards/accuracies": 0.8687499761581421,
96
- "rewards/chosen": 2.2808213233947754,
97
- "rewards/margins": 9.9701509475708,
98
- "rewards/rejected": -7.6893310546875,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
- "grad_norm": 256.12553340364644,
104
  "learning_rate": 4.990398100856366e-07,
105
- "logits/chosen": -1.6854372024536133,
106
- "logits/rejected": -1.6347172260284424,
107
- "logps/chosen": -143.6377410888672,
108
- "logps/rejected": -138.22506713867188,
109
- "loss": 0.1745,
110
- "rewards/accuracies": 0.9437500238418579,
111
- "rewards/chosen": 2.208740711212158,
112
- "rewards/margins": 13.148699760437012,
113
- "rewards/rejected": -10.939959526062012,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
- "grad_norm": 266.6931182384945,
119
  "learning_rate": 4.967775735898179e-07,
120
- "logits/chosen": -1.6533622741699219,
121
- "logits/rejected": -1.6828343868255615,
122
- "logps/chosen": -136.77774047851562,
123
- "logps/rejected": -139.76748657226562,
124
- "loss": 0.1719,
125
- "rewards/accuracies": 0.9125000238418579,
126
- "rewards/chosen": 2.496314525604248,
127
- "rewards/margins": 15.036949157714844,
128
- "rewards/rejected": -12.540634155273438,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
- "grad_norm": 311.4093156505733,
134
  "learning_rate": 4.931986719649298e-07,
135
- "logits/chosen": -1.6512008905410767,
136
- "logits/rejected": -1.642218828201294,
137
- "logps/chosen": -129.7277374267578,
138
- "logps/rejected": -130.1470489501953,
139
- "loss": 0.1638,
140
- "rewards/accuracies": 0.918749988079071,
141
- "rewards/chosen": 3.2196338176727295,
142
- "rewards/margins": 14.145663261413574,
143
- "rewards/rejected": -10.926031112670898,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
- "grad_norm": 353.08827705610673,
149
  "learning_rate": 4.883222001996351e-07,
150
- "logits/chosen": -1.7057621479034424,
151
- "logits/rejected": -1.7253615856170654,
152
- "logps/chosen": -136.57843017578125,
153
- "logps/rejected": -141.9429931640625,
154
- "loss": 0.1501,
155
  "rewards/accuracies": 0.918749988079071,
156
- "rewards/chosen": 3.7895398139953613,
157
- "rewards/margins": 15.804595947265625,
158
- "rewards/rejected": -12.015056610107422,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
- "grad_norm": 173.53718581228586,
164
  "learning_rate": 4.821741763807186e-07,
165
- "logits/chosen": -1.7514231204986572,
166
- "logits/rejected": -1.7408148050308228,
167
- "logps/chosen": -116.7274169921875,
168
- "logps/rejected": -125.6600570678711,
169
- "loss": 0.1496,
170
- "rewards/accuracies": 0.90625,
171
- "rewards/chosen": 3.375108242034912,
172
- "rewards/margins": 13.876774787902832,
173
- "rewards/rejected": -10.501666069030762,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
- "eval_logits/chosen": -1.8009788990020752,
179
- "eval_logits/rejected": -1.790999174118042,
180
- "eval_logps/chosen": -123.99901580810547,
181
- "eval_logps/rejected": -130.8439178466797,
182
- "eval_loss": 0.13556738197803497,
183
- "eval_rewards/accuracies": 0.94140625,
184
- "eval_rewards/chosen": 4.131972789764404,
185
- "eval_rewards/margins": 15.41292953491211,
186
- "eval_rewards/rejected": -11.280956268310547,
187
- "eval_runtime": 97.6442,
188
- "eval_samples_per_second": 20.483,
189
  "eval_steps_per_second": 0.328,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
- "grad_norm": 220.3790872760113,
195
  "learning_rate": 4.747874028753375e-07,
196
- "logits/chosen": -1.6608690023422241,
197
- "logits/rejected": -1.7199954986572266,
198
- "logps/chosen": -120.6917724609375,
199
- "logps/rejected": -133.25762939453125,
200
- "loss": 0.1546,
201
  "rewards/accuracies": 0.9375,
202
- "rewards/chosen": 3.4064812660217285,
203
- "rewards/margins": 12.731483459472656,
204
- "rewards/rejected": -9.325002670288086,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
- "grad_norm": 179.12119430014053,
210
  "learning_rate": 4.662012913161997e-07,
211
- "logits/chosen": -1.7208821773529053,
212
- "logits/rejected": -1.7148889303207397,
213
- "logps/chosen": -118.548583984375,
214
- "logps/rejected": -133.46463012695312,
215
- "loss": 0.1456,
216
- "rewards/accuracies": 0.875,
217
- "rewards/chosen": 4.286909580230713,
218
- "rewards/margins": 13.92347240447998,
219
- "rewards/rejected": -9.636563301086426,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
- "grad_norm": 119.5811123757762,
225
  "learning_rate": 4.5646165232345103e-07,
226
- "logits/chosen": -1.688997507095337,
227
- "logits/rejected": -1.7153244018554688,
228
- "logps/chosen": -124.4625244140625,
229
- "logps/rejected": -129.4587860107422,
230
- "loss": 0.1477,
231
- "rewards/accuracies": 0.918749988079071,
232
- "rewards/chosen": 4.8756022453308105,
233
- "rewards/margins": 15.727206230163574,
234
- "rewards/rejected": -10.851605415344238,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
- "grad_norm": 293.67492501764275,
240
  "learning_rate": 4.456204510851956e-07,
241
- "logits/chosen": -1.6008217334747314,
242
- "logits/rejected": -1.5764399766921997,
243
- "logps/chosen": -121.84040832519531,
244
- "logps/rejected": -125.70499420166016,
245
- "loss": 0.1534,
246
- "rewards/accuracies": 0.90625,
247
- "rewards/chosen": 4.718628406524658,
248
- "rewards/margins": 14.510149955749512,
249
- "rewards/rejected": -9.791521072387695,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
- "grad_norm": 250.47040288737202,
255
  "learning_rate": 4.337355301007335e-07,
256
- "logits/chosen": -1.7394781112670898,
257
- "logits/rejected": -1.7636123895645142,
258
- "logps/chosen": -119.86863708496094,
259
- "logps/rejected": -122.49459075927734,
260
- "loss": 0.1403,
261
  "rewards/accuracies": 0.8999999761581421,
262
- "rewards/chosen": 4.755049705505371,
263
- "rewards/margins": 14.378946304321289,
264
- "rewards/rejected": -9.623896598815918,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
- "grad_norm": 394.54185792168863,
270
  "learning_rate": 4.2087030056579986e-07,
271
- "logits/chosen": -1.6095482110977173,
272
- "logits/rejected": -1.5893223285675049,
273
- "logps/chosen": -127.0114517211914,
274
- "logps/rejected": -128.70870971679688,
275
- "loss": 0.1577,
276
- "rewards/accuracies": 0.925000011920929,
277
- "rewards/chosen": 4.486311435699463,
278
- "rewards/margins": 13.415287971496582,
279
- "rewards/rejected": -8.928976058959961,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
- "grad_norm": 336.28954028277553,
285
  "learning_rate": 4.070934040463998e-07,
286
- "logits/chosen": -1.7846260070800781,
287
- "logits/rejected": -1.7679131031036377,
288
- "logps/chosen": -121.9030990600586,
289
- "logps/rejected": -130.14794921875,
290
- "loss": 0.1516,
291
- "rewards/accuracies": 0.9312499761581421,
292
- "rewards/chosen": 5.171528339385986,
293
- "rewards/margins": 15.514431953430176,
294
- "rewards/rejected": -10.342904090881348,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
- "grad_norm": 266.7294156199543,
300
  "learning_rate": 3.9247834624635404e-07,
301
- "logits/chosen": -1.7311102151870728,
302
- "logits/rejected": -1.725064992904663,
303
- "logps/chosen": -125.84059143066406,
304
- "logps/rejected": -125.44111633300781,
305
- "loss": 0.1616,
306
- "rewards/accuracies": 0.90625,
307
- "rewards/chosen": 5.19378137588501,
308
- "rewards/margins": 15.565200805664062,
309
- "rewards/rejected": -10.371419906616211,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
- "grad_norm": 243.44642169675112,
315
  "learning_rate": 3.7710310482256523e-07,
316
- "logits/chosen": -1.764722466468811,
317
- "logits/rejected": -1.7476263046264648,
318
- "logps/chosen": -114.77116394042969,
319
- "logps/rejected": -139.33917236328125,
320
- "loss": 0.1684,
321
  "rewards/accuracies": 0.887499988079071,
322
- "rewards/chosen": 5.108515739440918,
323
- "rewards/margins": 16.28169822692871,
324
- "rewards/rejected": -11.173181533813477,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
- "grad_norm": 258.83830985573707,
330
  "learning_rate": 3.610497133404795e-07,
331
- "logits/chosen": -1.7498699426651,
332
- "logits/rejected": -1.7545902729034424,
333
- "logps/chosen": -120.42120361328125,
334
- "logps/rejected": -123.0103988647461,
335
- "loss": 0.1795,
336
- "rewards/accuracies": 0.9375,
337
- "rewards/chosen": 5.614555835723877,
338
- "rewards/margins": 16.115243911743164,
339
- "rewards/rejected": -10.500688552856445,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
- "eval_logits/chosen": -1.86138117313385,
345
- "eval_logits/rejected": -1.8606913089752197,
346
- "eval_logps/chosen": -120.21395111083984,
347
- "eval_logps/rejected": -130.0475616455078,
348
- "eval_loss": 0.13641399145126343,
349
- "eval_rewards/accuracies": 0.93359375,
350
- "eval_rewards/chosen": 5.267488479614258,
351
- "eval_rewards/margins": 16.30953025817871,
352
- "eval_rewards/rejected": -11.042043685913086,
353
- "eval_runtime": 97.5652,
354
- "eval_samples_per_second": 20.499,
355
  "eval_steps_per_second": 0.328,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
- "grad_norm": 258.76531038458563,
361
  "learning_rate": 3.4440382358952115e-07,
362
- "logits/chosen": -1.689432144165039,
363
- "logits/rejected": -1.632364273071289,
364
- "logps/chosen": -117.7408447265625,
365
- "logps/rejected": -117.53926086425781,
366
- "loss": 0.1494,
367
- "rewards/accuracies": 0.887499988079071,
368
- "rewards/chosen": 4.45497989654541,
369
- "rewards/margins": 12.836206436157227,
370
- "rewards/rejected": -8.381224632263184,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
- "grad_norm": 163.97448768931915,
376
  "learning_rate": 3.272542485937368e-07,
377
- "logits/chosen": -1.8162240982055664,
378
- "logits/rejected": -1.8604834079742432,
379
- "logps/chosen": -118.64457702636719,
380
- "logps/rejected": -118.34297180175781,
381
- "loss": 0.1524,
382
- "rewards/accuracies": 0.925000011920929,
383
- "rewards/chosen": 5.34921407699585,
384
- "rewards/margins": 14.593510627746582,
385
- "rewards/rejected": -9.244296073913574,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
- "grad_norm": 122.96583033580265,
391
  "learning_rate": 3.096924887558854e-07,
392
- "logits/chosen": -1.7572132349014282,
393
- "logits/rejected": -1.708581566810608,
394
- "logps/chosen": -131.16842651367188,
395
- "logps/rejected": -137.84829711914062,
396
- "loss": 0.1815,
397
  "rewards/accuracies": 0.9437500238418579,
398
- "rewards/chosen": 4.962122440338135,
399
- "rewards/margins": 16.372663497924805,
400
- "rewards/rejected": -11.410540580749512,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
- "grad_norm": 300.9054437339817,
406
  "learning_rate": 2.9181224366319943e-07,
407
- "logits/chosen": -1.768376111984253,
408
- "logits/rejected": -1.7853620052337646,
409
- "logps/chosen": -118.08064270019531,
410
- "logps/rejected": -129.52337646484375,
411
- "loss": 0.1638,
412
- "rewards/accuracies": 0.8999999761581421,
413
- "rewards/chosen": 4.15334415435791,
414
- "rewards/margins": 13.721521377563477,
415
- "rewards/rejected": -9.568175315856934,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
- "grad_norm": 279.7087004166346,
421
  "learning_rate": 2.7370891215954565e-07,
422
- "logits/chosen": -1.7569881677627563,
423
- "logits/rejected": -1.7871322631835938,
424
- "logps/chosen": -121.43367767333984,
425
- "logps/rejected": -130.93099975585938,
426
- "loss": 0.1737,
427
- "rewards/accuracies": 0.9312499761581421,
428
- "rewards/chosen": 4.7944416999816895,
429
- "rewards/margins": 14.978727340698242,
430
- "rewards/rejected": -10.184286117553711,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
- "grad_norm": 163.84527157544522,
436
  "learning_rate": 2.55479083351317e-07,
437
- "logits/chosen": -1.7968614101409912,
438
- "logits/rejected": -1.8222767114639282,
439
- "logps/chosen": -125.540283203125,
440
- "logps/rejected": -119.97703552246094,
441
- "loss": 0.1484,
442
- "rewards/accuracies": 0.90625,
443
- "rewards/chosen": 5.486014366149902,
444
- "rewards/margins": 16.016239166259766,
445
- "rewards/rejected": -10.530224800109863,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
- "grad_norm": 150.71761898213634,
451
  "learning_rate": 2.3722002126275822e-07,
452
- "logits/chosen": -1.787398099899292,
453
- "logits/rejected": -1.723350167274475,
454
- "logps/chosen": -115.22535705566406,
455
- "logps/rejected": -123.31414794921875,
456
- "loss": 0.139,
457
- "rewards/accuracies": 0.90625,
458
- "rewards/chosen": 5.049933433532715,
459
- "rewards/margins": 14.674034118652344,
460
- "rewards/rejected": -9.624099731445312,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
- "grad_norm": 267.05255152770263,
466
  "learning_rate": 2.19029145890313e-07,
467
- "logits/chosen": -1.6403396129608154,
468
- "logits/rejected": -1.728877067565918,
469
- "logps/chosen": -121.6760482788086,
470
- "logps/rejected": -137.2602996826172,
471
- "loss": 0.1837,
472
  "rewards/accuracies": 0.918749988079071,
473
- "rewards/chosen": 4.614184379577637,
474
- "rewards/margins": 15.387414932250977,
475
- "rewards/rejected": -10.773229598999023,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
- "grad_norm": 355.7773679008215,
481
  "learning_rate": 2.0100351342479216e-07,
482
- "logits/chosen": -1.7196556329727173,
483
- "logits/rejected": -1.7061948776245117,
484
- "logps/chosen": -112.95915222167969,
485
- "logps/rejected": -124.29833984375,
486
- "loss": 0.1648,
487
  "rewards/accuracies": 0.887499988079071,
488
- "rewards/chosen": 4.5611090660095215,
489
- "rewards/margins": 15.115681648254395,
490
- "rewards/rejected": -10.554571151733398,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
- "grad_norm": 387.9028451533635,
496
  "learning_rate": 1.8323929841460178e-07,
497
- "logits/chosen": -1.6907150745391846,
498
- "logits/rejected": -1.6335220336914062,
499
- "logps/chosen": -130.05313110351562,
500
- "logps/rejected": -141.36476135253906,
501
- "loss": 0.1585,
502
- "rewards/accuracies": 0.9312499761581421,
503
- "rewards/chosen": 4.916111946105957,
504
- "rewards/margins": 15.453518867492676,
505
- "rewards/rejected": -10.537405967712402,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
- "eval_logits/chosen": -1.7980220317840576,
511
- "eval_logits/rejected": -1.7959610223770142,
512
- "eval_logps/chosen": -120.6431655883789,
513
- "eval_logps/rejected": -132.25042724609375,
514
- "eval_loss": 0.14247241616249084,
515
- "eval_rewards/accuracies": 0.92578125,
516
- "eval_rewards/chosen": 5.138728141784668,
517
- "eval_rewards/margins": 16.841632843017578,
518
- "eval_rewards/rejected": -11.702906608581543,
519
- "eval_runtime": 97.7011,
520
- "eval_samples_per_second": 20.471,
521
- "eval_steps_per_second": 0.328,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
- "grad_norm": 308.7683125090126,
527
  "learning_rate": 1.6583128063291573e-07,
528
- "logits/chosen": -1.6941922903060913,
529
- "logits/rejected": -1.7201515436172485,
530
- "logps/chosen": -121.898193359375,
531
- "logps/rejected": -132.00962829589844,
532
- "loss": 0.1619,
533
  "rewards/accuracies": 0.9375,
534
- "rewards/chosen": 5.231846809387207,
535
- "rewards/margins": 15.2730131149292,
536
- "rewards/rejected": -10.041168212890625,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
- "grad_norm": 208.88097429038612,
542
  "learning_rate": 1.488723393865766e-07,
543
- "logits/chosen": -1.7199640274047852,
544
- "logits/rejected": -1.6923316717147827,
545
- "logps/chosen": -111.40040588378906,
546
- "logps/rejected": -131.6649627685547,
547
- "loss": 0.154,
548
- "rewards/accuracies": 0.8812500238418579,
549
- "rewards/chosen": 5.818942070007324,
550
- "rewards/margins": 16.441822052001953,
551
- "rewards/rejected": -10.62287712097168,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
- "grad_norm": 167.39144709959334,
557
  "learning_rate": 1.3245295796480788e-07,
558
- "logits/chosen": -1.65103018283844,
559
- "logits/rejected": -1.7277615070343018,
560
- "logps/chosen": -121.6876449584961,
561
- "logps/rejected": -127.0235366821289,
562
- "loss": 0.1507,
563
- "rewards/accuracies": 0.90625,
564
- "rewards/chosen": 4.948590278625488,
565
- "rewards/margins": 14.931228637695312,
566
- "rewards/rejected": -9.982640266418457,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
- "grad_norm": 156.90849982717606,
572
  "learning_rate": 1.1666074087171627e-07,
573
- "logits/chosen": -1.602086067199707,
574
- "logits/rejected": -1.6703577041625977,
575
- "logps/chosen": -115.71211242675781,
576
- "logps/rejected": -116.56599426269531,
577
- "loss": 0.1908,
578
  "rewards/accuracies": 0.8812500238418579,
579
- "rewards/chosen": 4.433460235595703,
580
- "rewards/margins": 13.112295150756836,
581
- "rewards/rejected": -8.678834915161133,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
- "grad_norm": 295.768622683131,
587
  "learning_rate": 1.0157994641835734e-07,
588
- "logits/chosen": -1.6467043161392212,
589
- "logits/rejected": -1.659854531288147,
590
- "logps/chosen": -115.77754974365234,
591
- "logps/rejected": -137.26531982421875,
592
- "loss": 0.1326,
593
- "rewards/accuracies": 0.9375,
594
- "rewards/chosen": 4.292226314544678,
595
- "rewards/margins": 15.004185676574707,
596
- "rewards/rejected": -10.711957931518555,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
- "grad_norm": 201.50715466732544,
602
  "learning_rate": 8.729103716819111e-08,
603
- "logits/chosen": -1.6420570611953735,
604
- "logits/rejected": -1.663745641708374,
605
- "logps/chosen": -122.47066497802734,
606
- "logps/rejected": -131.9073486328125,
607
- "loss": 0.1543,
608
- "rewards/accuracies": 0.8687499761581421,
609
- "rewards/chosen": 4.703906059265137,
610
- "rewards/margins": 14.480855941772461,
611
- "rewards/rejected": -9.776951789855957,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
- "grad_norm": 398.4959635020424,
617
  "learning_rate": 7.387025063449081e-08,
618
- "logits/chosen": -1.7505977153778076,
619
- "logits/rejected": -1.6930338144302368,
620
- "logps/chosen": -112.33251953125,
621
- "logps/rejected": -121.57230377197266,
622
- "loss": 0.1443,
623
- "rewards/accuracies": 0.9437500238418579,
624
- "rewards/chosen": 4.217165470123291,
625
- "rewards/margins": 12.93463134765625,
626
- "rewards/rejected": -8.7174654006958,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
- "grad_norm": 239.63863129657855,
632
  "learning_rate": 6.138919252022435e-08,
633
- "logits/chosen": -1.7603282928466797,
634
- "logits/rejected": -1.7833961248397827,
635
- "logps/chosen": -119.14958190917969,
636
- "logps/rejected": -136.85977172851562,
637
- "loss": 0.1576,
638
  "rewards/accuracies": 0.949999988079071,
639
- "rewards/chosen": 5.251246452331543,
640
- "rewards/margins": 16.546478271484375,
641
- "rewards/rejected": -11.295232772827148,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
- "grad_norm": 197.58750581294214,
647
  "learning_rate": 4.991445467064689e-08,
648
- "logits/chosen": -1.656961441040039,
649
- "logits/rejected": -1.6608669757843018,
650
- "logps/chosen": -115.59519958496094,
651
- "logps/rejected": -124.886474609375,
652
- "loss": 0.1242,
653
- "rewards/accuracies": 0.9624999761581421,
654
- "rewards/chosen": 5.070186614990234,
655
- "rewards/margins": 15.444877624511719,
656
- "rewards/rejected": -10.3746919631958,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
- "grad_norm": 266.2162184394133,
662
  "learning_rate": 3.9507259776993954e-08,
663
- "logits/chosen": -1.636885643005371,
664
- "logits/rejected": -1.713200330734253,
665
- "logps/chosen": -117.88565826416016,
666
- "logps/rejected": -124.86863708496094,
667
- "loss": 0.2005,
668
- "rewards/accuracies": 0.9125000238418579,
669
- "rewards/chosen": 4.46274471282959,
670
- "rewards/margins": 14.449310302734375,
671
- "rewards/rejected": -9.986566543579102,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
- "eval_logits/chosen": -1.8133598566055298,
677
- "eval_logits/rejected": -1.8130455017089844,
678
- "eval_logps/chosen": -119.97606658935547,
679
- "eval_logps/rejected": -131.71229553222656,
680
- "eval_loss": 0.14202240109443665,
681
- "eval_rewards/accuracies": 0.92578125,
682
- "eval_rewards/chosen": 5.338851451873779,
683
- "eval_rewards/margins": 16.880319595336914,
684
- "eval_rewards/rejected": -11.541468620300293,
685
- "eval_runtime": 97.6019,
686
- "eval_samples_per_second": 20.491,
687
  "eval_steps_per_second": 0.328,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
- "grad_norm": 300.6588760835356,
693
  "learning_rate": 3.022313472693447e-08,
694
- "logits/chosen": -1.735870599746704,
695
- "logits/rejected": -1.7638452053070068,
696
- "logps/chosen": -128.670166015625,
697
- "logps/rejected": -129.5255126953125,
698
- "loss": 0.1508,
699
- "rewards/accuracies": 0.925000011920929,
700
- "rewards/chosen": 5.593460559844971,
701
- "rewards/margins": 16.819358825683594,
702
- "rewards/rejected": -11.225897789001465,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
- "grad_norm": 184.26802651594184,
708
  "learning_rate": 2.2111614344599684e-08,
709
- "logits/chosen": -1.7033697366714478,
710
- "logits/rejected": -1.704993486404419,
711
- "logps/chosen": -113.72819519042969,
712
- "logps/rejected": -124.34715270996094,
713
- "loss": 0.1419,
714
- "rewards/accuracies": 0.9125000238418579,
715
- "rewards/chosen": 4.3744401931762695,
716
- "rewards/margins": 14.479223251342773,
717
- "rewards/rejected": -10.104782104492188,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
- "grad_norm": 269.33105520831003,
723
  "learning_rate": 1.521597710086439e-08,
724
- "logits/chosen": -1.6633046865463257,
725
- "logits/rejected": -1.6779390573501587,
726
- "logps/chosen": -129.4523162841797,
727
- "logps/rejected": -124.72953796386719,
728
- "loss": 0.154,
729
  "rewards/accuracies": 0.918749988079071,
730
- "rewards/chosen": 4.997079372406006,
731
- "rewards/margins": 14.370283126831055,
732
- "rewards/rejected": -9.373201370239258,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
- "grad_norm": 181.5022378731684,
738
  "learning_rate": 9.57301420397924e-09,
739
- "logits/chosen": -1.7851531505584717,
740
- "logits/rejected": -1.790157675743103,
741
- "logps/chosen": -120.8554458618164,
742
- "logps/rejected": -131.34410095214844,
743
- "loss": 0.144,
744
- "rewards/accuracies": 0.96875,
745
- "rewards/chosen": 5.435299873352051,
746
- "rewards/margins": 16.81759262084961,
747
- "rewards/rejected": -11.382290840148926,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
- "grad_norm": 227.09412853715097,
753
  "learning_rate": 5.212833302556258e-09,
754
- "logits/chosen": -1.7998348474502563,
755
- "logits/rejected": -1.7618439197540283,
756
- "logps/chosen": -116.46590423583984,
757
- "logps/rejected": -127.42796325683594,
758
- "loss": 0.159,
759
- "rewards/accuracies": 0.8999999761581421,
760
- "rewards/chosen": 5.652300834655762,
761
- "rewards/margins": 17.369140625,
762
- "rewards/rejected": -11.716839790344238,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
- "grad_norm": 225.69247261851913,
768
  "learning_rate": 2.158697848236607e-09,
769
- "logits/chosen": -1.6827681064605713,
770
- "logits/rejected": -1.7046699523925781,
771
- "logps/chosen": -120.79095458984375,
772
- "logps/rejected": -130.98043823242188,
773
- "loss": 0.1335,
774
- "rewards/accuracies": 0.925000011920929,
775
- "rewards/chosen": 5.568859100341797,
776
- "rewards/margins": 16.174407958984375,
777
- "rewards/rejected": -10.605547904968262,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
- "grad_norm": 199.48903785359678,
783
  "learning_rate": 4.269029751107489e-10,
784
- "logits/chosen": -1.6656444072723389,
785
- "logits/rejected": -1.6796722412109375,
786
- "logps/chosen": -114.7574462890625,
787
- "logps/rejected": -139.92117309570312,
788
- "loss": 0.1514,
789
- "rewards/accuracies": 0.90625,
790
- "rewards/chosen": 4.486257553100586,
791
- "rewards/margins": 15.330111503601074,
792
- "rewards/rejected": -10.843853950500488,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
- "train_loss": 0.17749474911510196,
800
- "train_runtime": 7645.2484,
801
- "train_samples_per_second": 7.996,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 877.3754021335358,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -1.689455509185791,
16
  "logits/rejected": -1.4794573783874512,
 
25
  },
26
  {
27
  "epoch": 0.02,
28
+ "grad_norm": 774.0071219589337,
29
  "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -1.7073653936386108,
31
+ "logits/rejected": -1.610068678855896,
32
+ "logps/chosen": -139.72607421875,
33
+ "logps/rejected": -91.37654113769531,
34
+ "loss": 0.6859,
35
+ "rewards/accuracies": 0.5138888955116272,
36
+ "rewards/chosen": -0.0006362733547575772,
37
+ "rewards/margins": 0.015451871789991856,
38
+ "rewards/rejected": -0.016088144853711128,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
+ "grad_norm": 288.7199289028814,
44
  "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -1.6388952732086182,
46
+ "logits/rejected": -1.6494001150131226,
47
+ "logps/chosen": -130.20901489257812,
48
+ "logps/rejected": -94.0677490234375,
49
+ "loss": 0.491,
50
+ "rewards/accuracies": 0.856249988079071,
51
+ "rewards/chosen": 0.4931251108646393,
52
+ "rewards/margins": 0.6499455571174622,
53
+ "rewards/rejected": -0.15682044625282288,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
+ "grad_norm": 224.69456938109306,
59
  "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -1.6837990283966064,
61
+ "logits/rejected": -1.6247406005859375,
62
+ "logps/chosen": -130.29306030273438,
63
+ "logps/rejected": -107.59431457519531,
64
+ "loss": 0.2826,
65
+ "rewards/accuracies": 0.925000011920929,
66
+ "rewards/chosen": 1.5809106826782227,
67
+ "rewards/margins": 2.7498364448547363,
68
+ "rewards/rejected": -1.1689256429672241,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
+ "grad_norm": 183.79856437398223,
74
  "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -1.6083917617797852,
76
+ "logits/rejected": -1.5386857986450195,
77
+ "logps/chosen": -142.89288330078125,
78
+ "logps/rejected": -118.29256439208984,
79
+ "loss": 0.2069,
80
+ "rewards/accuracies": 0.9312499761581421,
81
+ "rewards/chosen": 1.7656170129776,
82
+ "rewards/margins": 5.709896087646484,
83
+ "rewards/rejected": -3.944279432296753,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
+ "grad_norm": 299.30898027438695,
89
  "learning_rate": 4.999733114418725e-07,
90
+ "logits/chosen": -1.5618069171905518,
91
+ "logits/rejected": -1.589414119720459,
92
+ "logps/chosen": -129.36355590820312,
93
+ "logps/rejected": -135.2159881591797,
94
+ "loss": 0.1965,
95
+ "rewards/accuracies": 0.8812500238418579,
96
+ "rewards/chosen": 1.1421730518341064,
97
+ "rewards/margins": 8.362150192260742,
98
+ "rewards/rejected": -7.219976902008057,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
+ "grad_norm": 245.51415645278104,
104
  "learning_rate": 4.990398100856366e-07,
105
+ "logits/chosen": -1.6810020208358765,
106
+ "logits/rejected": -1.6362440586090088,
107
+ "logps/chosen": -146.06399536132812,
108
+ "logps/rejected": -148.35720825195312,
109
+ "loss": 0.1815,
110
+ "rewards/accuracies": 0.9624999761581421,
111
+ "rewards/chosen": 0.9872430562973022,
112
+ "rewards/margins": 10.306981086730957,
113
+ "rewards/rejected": -9.319738388061523,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
+ "grad_norm": 184.4357866731278,
119
  "learning_rate": 4.967775735898179e-07,
120
+ "logits/chosen": -1.6720081567764282,
121
+ "logits/rejected": -1.706272840499878,
122
+ "logps/chosen": -138.72039794921875,
123
+ "logps/rejected": -148.6911163330078,
124
+ "loss": 0.1754,
125
+ "rewards/accuracies": 0.90625,
126
+ "rewards/chosen": 1.2756741046905518,
127
+ "rewards/margins": 11.420818328857422,
128
+ "rewards/rejected": -10.14514446258545,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
+ "grad_norm": 147.5148614226922,
134
  "learning_rate": 4.931986719649298e-07,
135
+ "logits/chosen": -1.6803735494613647,
136
+ "logits/rejected": -1.677125334739685,
137
+ "logps/chosen": -131.1448211669922,
138
+ "logps/rejected": -137.2668914794922,
139
+ "loss": 0.1602,
140
+ "rewards/accuracies": 0.9125000238418579,
141
+ "rewards/chosen": 1.8630034923553467,
142
+ "rewards/margins": 10.570993423461914,
143
+ "rewards/rejected": -8.707988739013672,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
+ "grad_norm": 231.45195159349998,
149
  "learning_rate": 4.883222001996351e-07,
150
+ "logits/chosen": -1.7273483276367188,
151
+ "logits/rejected": -1.7491174936294556,
152
+ "logps/chosen": -137.68093872070312,
153
+ "logps/rejected": -149.29808044433594,
154
+ "loss": 0.1529,
155
  "rewards/accuracies": 0.918749988079071,
156
+ "rewards/chosen": 2.305860757827759,
157
+ "rewards/margins": 11.786917686462402,
158
+ "rewards/rejected": -9.481058120727539,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
+ "grad_norm": 125.1418915635804,
164
  "learning_rate": 4.821741763807186e-07,
165
+ "logits/chosen": -1.7639491558074951,
166
+ "logits/rejected": -1.7565345764160156,
167
+ "logps/chosen": -118.08821105957031,
168
+ "logps/rejected": -132.42239379882812,
169
+ "loss": 0.1523,
170
+ "rewards/accuracies": 0.925000011920929,
171
+ "rewards/chosen": 1.9779144525527954,
172
+ "rewards/margins": 10.331491470336914,
173
+ "rewards/rejected": -8.35357666015625,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
+ "eval_logits/chosen": -1.810097336769104,
179
+ "eval_logits/rejected": -1.8013982772827148,
180
+ "eval_logps/chosen": -125.05193328857422,
181
+ "eval_logps/rejected": -137.99850463867188,
182
+ "eval_loss": 0.13986752927303314,
183
+ "eval_rewards/accuracies": 0.9375,
184
+ "eval_rewards/chosen": 2.5440609455108643,
185
+ "eval_rewards/margins": 11.49561595916748,
186
+ "eval_rewards/rejected": -8.951555252075195,
187
+ "eval_runtime": 97.654,
188
+ "eval_samples_per_second": 20.48,
189
  "eval_steps_per_second": 0.328,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
+ "grad_norm": 145.2147024977398,
195
  "learning_rate": 4.747874028753375e-07,
196
+ "logits/chosen": -1.6646867990493774,
197
+ "logits/rejected": -1.7233518362045288,
198
+ "logps/chosen": -122.03011322021484,
199
+ "logps/rejected": -140.46389770507812,
200
+ "loss": 0.1545,
201
  "rewards/accuracies": 0.9375,
202
+ "rewards/chosen": 2.0033175945281982,
203
+ "rewards/margins": 9.661236763000488,
204
+ "rewards/rejected": -7.657918453216553,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
+ "grad_norm": 129.9317141592677,
210
  "learning_rate": 4.662012913161997e-07,
211
+ "logits/chosen": -1.7326112985610962,
212
+ "logits/rejected": -1.7273359298706055,
213
+ "logps/chosen": -119.6495132446289,
214
+ "logps/rejected": -140.51266479492188,
215
+ "loss": 0.1492,
216
+ "rewards/accuracies": 0.8687499761581421,
217
+ "rewards/chosen": 2.637754440307617,
218
+ "rewards/margins": 10.471738815307617,
219
+ "rewards/rejected": -7.833985805511475,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
+ "grad_norm": 81.32565764107251,
225
  "learning_rate": 4.5646165232345103e-07,
226
+ "logits/chosen": -1.6919190883636475,
227
+ "logits/rejected": -1.7214174270629883,
228
+ "logps/chosen": -124.76143646240234,
229
+ "logps/rejected": -136.91891479492188,
230
+ "loss": 0.144,
231
+ "rewards/accuracies": 0.925000011920929,
232
+ "rewards/chosen": 3.1906180381774902,
233
+ "rewards/margins": 11.917051315307617,
234
+ "rewards/rejected": -8.726433753967285,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
+ "grad_norm": 233.71107343479284,
240
  "learning_rate": 4.456204510851956e-07,
241
+ "logits/chosen": -1.6101233959197998,
242
+ "logits/rejected": -1.5833767652511597,
243
+ "logps/chosen": -121.59381103515625,
244
+ "logps/rejected": -132.78977966308594,
245
+ "loss": 0.1501,
246
+ "rewards/accuracies": 0.8999999761581421,
247
+ "rewards/chosen": 3.1950697898864746,
248
+ "rewards/margins": 11.13970947265625,
249
+ "rewards/rejected": -7.944640159606934,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
+ "grad_norm": 177.51912736620577,
255
  "learning_rate": 4.337355301007335e-07,
256
+ "logits/chosen": -1.7467788457870483,
257
+ "logits/rejected": -1.7698566913604736,
258
+ "logps/chosen": -119.5277328491211,
259
+ "logps/rejected": -128.22744750976562,
260
+ "loss": 0.1433,
261
  "rewards/accuracies": 0.8999999761581421,
262
+ "rewards/chosen": 3.2382194995880127,
263
+ "rewards/margins": 10.8007173538208,
264
+ "rewards/rejected": -7.562497615814209,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
+ "grad_norm": 248.7150598871683,
270
  "learning_rate": 4.2087030056579986e-07,
271
+ "logits/chosen": -1.6178503036499023,
272
+ "logits/rejected": -1.5939674377441406,
273
+ "logps/chosen": -127.01255798339844,
274
+ "logps/rejected": -134.47132873535156,
275
+ "loss": 0.1508,
276
+ "rewards/accuracies": 0.918749988079071,
277
+ "rewards/chosen": 2.990654468536377,
278
+ "rewards/margins": 10.095832824707031,
279
+ "rewards/rejected": -7.105177879333496,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
+ "grad_norm": 201.08598013628404,
285
  "learning_rate": 4.070934040463998e-07,
286
+ "logits/chosen": -1.7953569889068604,
287
+ "logits/rejected": -1.7801055908203125,
288
+ "logps/chosen": -122.10235595703125,
289
+ "logps/rejected": -136.7244873046875,
290
+ "loss": 0.1455,
291
+ "rewards/accuracies": 0.925000011920929,
292
+ "rewards/chosen": 3.407832622528076,
293
+ "rewards/margins": 11.618408203125,
294
+ "rewards/rejected": -8.210575103759766,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
+ "grad_norm": 169.66216294230117,
300
  "learning_rate": 3.9247834624635404e-07,
301
+ "logits/chosen": -1.7412538528442383,
302
+ "logits/rejected": -1.734012246131897,
303
+ "logps/chosen": -126.0450439453125,
304
+ "logps/rejected": -131.69639587402344,
305
+ "loss": 0.1565,
306
+ "rewards/accuracies": 0.925000011920929,
307
+ "rewards/chosen": 3.421628952026367,
308
+ "rewards/margins": 11.586966514587402,
309
+ "rewards/rejected": -8.165337562561035,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
+ "grad_norm": 165.65601380930994,
315
  "learning_rate": 3.7710310482256523e-07,
316
+ "logits/chosen": -1.7725350856781006,
317
+ "logits/rejected": -1.7596399784088135,
318
+ "logps/chosen": -115.76643371582031,
319
+ "logps/rejected": -146.38294982910156,
320
+ "loss": 0.1609,
321
  "rewards/accuracies": 0.887499988079071,
322
+ "rewards/chosen": 3.206624984741211,
323
+ "rewards/margins": 12.064164161682129,
324
+ "rewards/rejected": -8.857539176940918,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
+ "grad_norm": 164.38352614855714,
330
  "learning_rate": 3.610497133404795e-07,
331
+ "logits/chosen": -1.7693474292755127,
332
+ "logits/rejected": -1.7811864614486694,
333
+ "logps/chosen": -121.21810150146484,
334
+ "logps/rejected": -129.79788208007812,
335
+ "loss": 0.176,
336
+ "rewards/accuracies": 0.9312499761581421,
337
+ "rewards/chosen": 3.5836567878723145,
338
+ "rewards/margins": 11.941611289978027,
339
+ "rewards/rejected": -8.357954025268555,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
+ "eval_logits/chosen": -1.8763718605041504,
345
+ "eval_logits/rejected": -1.8761746883392334,
346
+ "eval_logps/chosen": -120.78527069091797,
347
+ "eval_logps/rejected": -137.00637817382812,
348
+ "eval_loss": 0.13582415878772736,
349
+ "eval_rewards/accuracies": 0.9375,
350
+ "eval_rewards/chosen": 3.3973963260650635,
351
+ "eval_rewards/margins": 12.150527954101562,
352
+ "eval_rewards/rejected": -8.753131866455078,
353
+ "eval_runtime": 97.4856,
354
+ "eval_samples_per_second": 20.516,
355
  "eval_steps_per_second": 0.328,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
+ "grad_norm": 177.62042791365712,
361
  "learning_rate": 3.4440382358952115e-07,
362
+ "logits/chosen": -1.708108901977539,
363
+ "logits/rejected": -1.654348611831665,
364
+ "logps/chosen": -118.52095794677734,
365
+ "logps/rejected": -123.7565689086914,
366
+ "loss": 0.1417,
367
+ "rewards/accuracies": 0.8999999761581421,
368
+ "rewards/chosen": 2.8139612674713135,
369
+ "rewards/margins": 9.644906997680664,
370
+ "rewards/rejected": -6.830945014953613,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
+ "grad_norm": 106.72676059840462,
376
  "learning_rate": 3.272542485937368e-07,
377
+ "logits/chosen": -1.8347289562225342,
378
+ "logits/rejected": -1.8747615814208984,
379
+ "logps/chosen": -118.0849609375,
380
+ "logps/rejected": -125.06263732910156,
381
+ "loss": 0.1482,
382
+ "rewards/accuracies": 0.9312499761581421,
383
+ "rewards/chosen": 3.678068161010742,
384
+ "rewards/margins": 11.184865951538086,
385
+ "rewards/rejected": -7.506799221038818,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
+ "grad_norm": 105.28694347115214,
391
  "learning_rate": 3.096924887558854e-07,
392
+ "logits/chosen": -1.7717214822769165,
393
+ "logits/rejected": -1.720580816268921,
394
+ "logps/chosen": -131.0685577392578,
395
+ "logps/rejected": -143.81317138671875,
396
+ "loss": 0.1684,
397
  "rewards/accuracies": 0.9437500238418579,
398
+ "rewards/chosen": 3.328054904937744,
399
+ "rewards/margins": 12.128056526184082,
400
+ "rewards/rejected": -8.800003051757812,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
+ "grad_norm": 190.93999332658404,
406
  "learning_rate": 2.9181224366319943e-07,
407
+ "logits/chosen": -1.7859230041503906,
408
+ "logits/rejected": -1.802541971206665,
409
+ "logps/chosen": -117.80191802978516,
410
+ "logps/rejected": -135.9575653076172,
411
+ "loss": 0.1559,
412
+ "rewards/accuracies": 0.9125000238418579,
413
+ "rewards/chosen": 2.8246419429779053,
414
+ "rewards/margins": 10.490262031555176,
415
+ "rewards/rejected": -7.665619850158691,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
+ "grad_norm": 200.5301056579919,
421
  "learning_rate": 2.7370891215954565e-07,
422
+ "logits/chosen": -1.7719627618789673,
423
+ "logits/rejected": -1.8007062673568726,
424
+ "logps/chosen": -120.75521087646484,
425
+ "logps/rejected": -136.2689971923828,
426
+ "loss": 0.1635,
427
+ "rewards/accuracies": 0.9375,
428
+ "rewards/chosen": 3.3319907188415527,
429
+ "rewards/margins": 11.189115524291992,
430
+ "rewards/rejected": -7.857124328613281,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
+ "grad_norm": 109.42764564797797,
436
  "learning_rate": 2.55479083351317e-07,
437
+ "logits/chosen": -1.8106673955917358,
438
+ "logits/rejected": -1.8370100259780884,
439
+ "logps/chosen": -125.1119613647461,
440
+ "logps/rejected": -127.0175552368164,
441
+ "loss": 0.1411,
442
+ "rewards/accuracies": 0.918749988079071,
443
+ "rewards/chosen": 3.7430100440979004,
444
+ "rewards/margins": 12.171258926391602,
445
+ "rewards/rejected": -8.428248405456543,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
+ "grad_norm": 107.44062116926116,
451
  "learning_rate": 2.3722002126275822e-07,
452
+ "logits/chosen": -1.795784592628479,
453
+ "logits/rejected": -1.726580262184143,
454
+ "logps/chosen": -114.95989990234375,
455
+ "logps/rejected": -129.06564331054688,
456
+ "loss": 0.1393,
457
+ "rewards/accuracies": 0.918749988079071,
458
+ "rewards/chosen": 3.4197163581848145,
459
+ "rewards/margins": 10.986078262329102,
460
+ "rewards/rejected": -7.566361427307129,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
+ "grad_norm": 191.88979341493172,
466
  "learning_rate": 2.19029145890313e-07,
467
+ "logits/chosen": -1.6350713968276978,
468
+ "logits/rejected": -1.7275726795196533,
469
+ "logps/chosen": -121.347412109375,
470
+ "logps/rejected": -143.2399444580078,
471
+ "loss": 0.1733,
472
  "rewards/accuracies": 0.918749988079071,
473
+ "rewards/chosen": 3.141850233078003,
474
+ "rewards/margins": 11.51993179321289,
475
+ "rewards/rejected": -8.378082275390625,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
+ "grad_norm": 187.2847938998864,
481
  "learning_rate": 2.0100351342479216e-07,
482
+ "logits/chosen": -1.7116390466690063,
483
+ "logits/rejected": -1.6921733617782593,
484
+ "logps/chosen": -112.99186706542969,
485
+ "logps/rejected": -130.45889282226562,
486
+ "loss": 0.1534,
487
  "rewards/accuracies": 0.887499988079071,
488
+ "rewards/chosen": 3.034196376800537,
489
+ "rewards/margins": 11.30268669128418,
490
+ "rewards/rejected": -8.268491744995117,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
+ "grad_norm": 427.8567531133513,
496
  "learning_rate": 1.8323929841460178e-07,
497
+ "logits/chosen": -1.6803079843521118,
498
+ "logits/rejected": -1.6214721202850342,
499
+ "logps/chosen": -130.09739685058594,
500
+ "logps/rejected": -148.15475463867188,
501
+ "loss": 0.1509,
502
+ "rewards/accuracies": 0.925000011920929,
503
+ "rewards/chosen": 3.2685546875,
504
+ "rewards/margins": 11.6514892578125,
505
+ "rewards/rejected": -8.382935523986816,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
+ "eval_logits/chosen": -1.7874743938446045,
511
+ "eval_logits/rejected": -1.7873148918151855,
512
+ "eval_logps/chosen": -121.00535583496094,
513
+ "eval_logps/rejected": -139.82205200195312,
514
+ "eval_loss": 0.14029568433761597,
515
+ "eval_rewards/accuracies": 0.93359375,
516
+ "eval_rewards/chosen": 3.353379249572754,
517
+ "eval_rewards/margins": 12.669642448425293,
518
+ "eval_rewards/rejected": -9.316261291503906,
519
+ "eval_runtime": 97.7473,
520
+ "eval_samples_per_second": 20.461,
521
+ "eval_steps_per_second": 0.327,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
+ "grad_norm": 209.36571305183378,
527
  "learning_rate": 1.6583128063291573e-07,
528
+ "logits/chosen": -1.6824207305908203,
529
+ "logits/rejected": -1.7118114233016968,
530
+ "logps/chosen": -122.2557144165039,
531
+ "logps/rejected": -138.8484344482422,
532
+ "loss": 0.1544,
533
  "rewards/accuracies": 0.9375,
534
+ "rewards/chosen": 3.416393280029297,
535
+ "rewards/margins": 11.478261947631836,
536
+ "rewards/rejected": -8.061868667602539,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
+ "grad_norm": 115.39699943542352,
542
  "learning_rate": 1.488723393865766e-07,
543
+ "logits/chosen": -1.7209253311157227,
544
+ "logits/rejected": -1.6945152282714844,
545
+ "logps/chosen": -111.72103118896484,
546
+ "logps/rejected": -138.40509033203125,
547
+ "loss": 0.144,
548
+ "rewards/accuracies": 0.8999999761581421,
549
+ "rewards/chosen": 3.8151679039001465,
550
+ "rewards/margins": 12.245112419128418,
551
+ "rewards/rejected": -8.42994499206543,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
+ "grad_norm": 101.70209071929943,
557
  "learning_rate": 1.3245295796480788e-07,
558
+ "logits/chosen": -1.6543442010879517,
559
+ "logits/rejected": -1.7321891784667969,
560
+ "logps/chosen": -122.24208068847656,
561
+ "logps/rejected": -133.8969268798828,
562
+ "loss": 0.1435,
563
+ "rewards/accuracies": 0.8999999761581421,
564
+ "rewards/chosen": 3.18817400932312,
565
+ "rewards/margins": 11.217942237854004,
566
+ "rewards/rejected": -8.029767990112305,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
+ "grad_norm": 123.95909153688254,
572
  "learning_rate": 1.1666074087171627e-07,
573
+ "logits/chosen": -1.6039130687713623,
574
+ "logits/rejected": -1.6741559505462646,
575
+ "logps/chosen": -116.26409912109375,
576
+ "logps/rejected": -122.7804946899414,
577
+ "loss": 0.1746,
578
  "rewards/accuracies": 0.8812500238418579,
579
+ "rewards/chosen": 2.8452401161193848,
580
+ "rewards/margins": 9.874032974243164,
581
+ "rewards/rejected": -7.028792381286621,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
+ "grad_norm": 294.0385926355559,
587
  "learning_rate": 1.0157994641835734e-07,
588
+ "logits/chosen": -1.6423372030258179,
589
+ "logits/rejected": -1.6570653915405273,
590
+ "logps/chosen": -116.1936264038086,
591
+ "logps/rejected": -144.1549072265625,
592
+ "loss": 0.1291,
593
+ "rewards/accuracies": 0.949999988079071,
594
+ "rewards/chosen": 2.7782697677612305,
595
+ "rewards/margins": 11.297493934631348,
596
+ "rewards/rejected": -8.519224166870117,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
+ "grad_norm": 147.62661016854165,
602
  "learning_rate": 8.729103716819111e-08,
603
+ "logits/chosen": -1.6380598545074463,
604
+ "logits/rejected": -1.6626033782958984,
605
+ "logps/chosen": -122.89555358886719,
606
+ "logps/rejected": -138.44113159179688,
607
+ "loss": 0.1469,
608
+ "rewards/accuracies": 0.862500011920929,
609
+ "rewards/chosen": 3.050956964492798,
610
+ "rewards/margins": 10.875678062438965,
611
+ "rewards/rejected": -7.824721336364746,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
+ "grad_norm": 250.3877641186192,
617
  "learning_rate": 7.387025063449081e-08,
618
+ "logits/chosen": -1.749659776687622,
619
+ "logits/rejected": -1.6942018270492554,
620
+ "logps/chosen": -113.17866516113281,
621
+ "logps/rejected": -127.69132232666016,
622
+ "loss": 0.1488,
623
+ "rewards/accuracies": 0.9125000238418579,
624
+ "rewards/chosen": 2.642218828201294,
625
+ "rewards/margins": 9.677666664123535,
626
+ "rewards/rejected": -7.035447597503662,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
+ "grad_norm": 179.11776110792954,
632
  "learning_rate": 6.138919252022435e-08,
633
+ "logits/chosen": -1.7493488788604736,
634
+ "logits/rejected": -1.777907133102417,
635
+ "logps/chosen": -119.62767028808594,
636
+ "logps/rejected": -144.23379516601562,
637
+ "loss": 0.1547,
638
  "rewards/accuracies": 0.949999988079071,
639
+ "rewards/chosen": 3.405210494995117,
640
+ "rewards/margins": 12.410168647766113,
641
+ "rewards/rejected": -9.004958152770996,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
+ "grad_norm": 105.79142677310188,
647
  "learning_rate": 4.991445467064689e-08,
648
+ "logits/chosen": -1.6522718667984009,
649
+ "logits/rejected": -1.657579779624939,
650
+ "logps/chosen": -115.9373779296875,
651
+ "logps/rejected": -131.8628387451172,
652
+ "loss": 0.1191,
653
+ "rewards/accuracies": 0.956250011920929,
654
+ "rewards/chosen": 3.3116908073425293,
655
+ "rewards/margins": 11.62342643737793,
656
+ "rewards/rejected": -8.311735153198242,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
+ "grad_norm": 160.26625520373477,
662
  "learning_rate": 3.9507259776993954e-08,
663
+ "logits/chosen": -1.6317713260650635,
664
+ "logits/rejected": -1.717016577720642,
665
+ "logps/chosen": -118.45027923583984,
666
+ "logps/rejected": -131.49853515625,
667
+ "loss": 0.2009,
668
+ "rewards/accuracies": 0.90625,
669
+ "rewards/chosen": 2.862239360809326,
670
+ "rewards/margins": 10.845926284790039,
671
+ "rewards/rejected": -7.983686923980713,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
+ "eval_logits/chosen": -1.8097559213638306,
677
+ "eval_logits/rejected": -1.8106379508972168,
678
+ "eval_logps/chosen": -120.3245849609375,
679
+ "eval_logps/rejected": -139.5015411376953,
680
+ "eval_loss": 0.1389647275209427,
681
+ "eval_rewards/accuracies": 0.9296875,
682
+ "eval_rewards/chosen": 3.489531993865967,
683
+ "eval_rewards/margins": 12.741693496704102,
684
+ "eval_rewards/rejected": -9.252161026000977,
685
+ "eval_runtime": 97.6986,
686
+ "eval_samples_per_second": 20.471,
687
  "eval_steps_per_second": 0.328,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
+ "grad_norm": 192.9545547193168,
693
  "learning_rate": 3.022313472693447e-08,
694
+ "logits/chosen": -1.7275054454803467,
695
+ "logits/rejected": -1.756888747215271,
696
+ "logps/chosen": -129.18605041503906,
697
+ "logps/rejected": -136.9790496826172,
698
+ "loss": 0.1378,
699
+ "rewards/accuracies": 0.9375,
700
+ "rewards/chosen": 3.625795841217041,
701
+ "rewards/margins": 12.600431442260742,
702
+ "rewards/rejected": -8.974635124206543,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
+ "grad_norm": 135.89148969120785,
708
  "learning_rate": 2.2111614344599684e-08,
709
+ "logits/chosen": -1.697636365890503,
710
+ "logits/rejected": -1.6976314783096313,
711
+ "logps/chosen": -114.35809326171875,
712
+ "logps/rejected": -131.75164794921875,
713
+ "loss": 0.1286,
714
+ "rewards/accuracies": 0.925000011920929,
715
+ "rewards/chosen": 2.790315866470337,
716
+ "rewards/margins": 11.007737159729004,
717
+ "rewards/rejected": -8.21742057800293,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
+ "grad_norm": 163.87841082901525,
723
  "learning_rate": 1.521597710086439e-08,
724
+ "logits/chosen": -1.6538774967193604,
725
+ "logits/rejected": -1.667636513710022,
726
+ "logps/chosen": -129.87210083007812,
727
+ "logps/rejected": -131.872802734375,
728
+ "loss": 0.1437,
729
  "rewards/accuracies": 0.918749988079071,
730
+ "rewards/chosen": 3.2474303245544434,
731
+ "rewards/margins": 10.924890518188477,
732
+ "rewards/rejected": -7.677459716796875,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
+ "grad_norm": 117.88131682798563,
738
  "learning_rate": 9.57301420397924e-09,
739
+ "logits/chosen": -1.7787322998046875,
740
+ "logits/rejected": -1.7886247634887695,
741
+ "logps/chosen": -121.3507080078125,
742
+ "logps/rejected": -139.0587921142578,
743
+ "loss": 0.142,
744
+ "rewards/accuracies": 0.9624999761581421,
745
+ "rewards/chosen": 3.5244851112365723,
746
+ "rewards/margins": 12.655617713928223,
747
+ "rewards/rejected": -9.131133079528809,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
+ "grad_norm": 179.270368210056,
753
  "learning_rate": 5.212833302556258e-09,
754
+ "logits/chosen": -1.792822241783142,
755
+ "logits/rejected": -1.7586778402328491,
756
+ "logps/chosen": -116.64122009277344,
757
+ "logps/rejected": -135.26832580566406,
758
+ "loss": 0.1538,
759
+ "rewards/accuracies": 0.90625,
760
+ "rewards/chosen": 3.7331383228302,
761
+ "rewards/margins": 13.112436294555664,
762
+ "rewards/rejected": -9.379298210144043,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
+ "grad_norm": 118.73739536412977,
768
  "learning_rate": 2.158697848236607e-09,
769
+ "logits/chosen": -1.6758760213851929,
770
+ "logits/rejected": -1.6996933221817017,
771
+ "logps/chosen": -120.97281646728516,
772
+ "logps/rejected": -138.555419921875,
773
+ "loss": 0.1252,
774
+ "rewards/accuracies": 0.9375,
775
+ "rewards/chosen": 3.676201581954956,
776
+ "rewards/margins": 12.26156234741211,
777
+ "rewards/rejected": -8.585359573364258,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
+ "grad_norm": 152.28679005488013,
783
  "learning_rate": 4.269029751107489e-10,
784
+ "logits/chosen": -1.6590917110443115,
785
+ "logits/rejected": -1.6743139028549194,
786
+ "logps/chosen": -115.36566162109375,
787
+ "logps/rejected": -147.09939575195312,
788
+ "loss": 0.1469,
789
+ "rewards/accuracies": 0.925000011920929,
790
+ "rewards/chosen": 2.869194269180298,
791
+ "rewards/margins": 11.534073829650879,
792
+ "rewards/rejected": -8.66487979888916,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
+ "train_loss": 0.17478099153630405,
800
+ "train_runtime": 7645.1097,
801
+ "train_samples_per_second": 7.997,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],
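
The trainer_state.json diff above only swaps per-step logging values; the structure (a log_history list of dicts keyed by step) is unchanged, so the evaluation curve can be read straight out of the updated file. A minimal sketch, assuming it is loaded from the repository root:

```python
import json

# Extract the evaluation-loss curve from the updated trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

eval_curve = [
    (entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]
print(eval_curve)  # [(100, 0.1398...), (200, 0.1358...), (300, 0.1402...), (400, 0.1389...)]
```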