Wenboz commited on
Commit
de33a6d
·
verified ·
1 Parent(s): 1c1e317

Model save

Browse files
Files changed (4) hide show
  1. README.md +17 -17
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +525 -525
README.md CHANGED
@@ -16,20 +16,20 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model was trained from scratch on the None dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.9861
20
- - Rewards/chosen: -0.7020
21
- - Rewards/rejected: -0.9079
22
- - Rewards/gen: -2.6099
23
- - Rewards/accuracies: 0.6020
24
- - Rewards/margins: 0.2059
25
- - Logps/rejected: -280.7023
26
- - Logps/chosen: -315.6724
27
- - Logps/response: -196.7418
28
- - Logits/rejected: 0.8408
29
- - Logits/chosen: 0.8746
30
- - Logits/response: 0.9388
31
- - Improvement: 0.3049
32
- - Penalty: 0.6913
33
 
34
  ## Model description
35
 
@@ -66,9 +66,9 @@ The following hyperparameters were used during training:
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/gen | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logps/response | Logits/rejected | Logits/chosen | Logits/response | Improvement | Penalty |
68
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:-----------:|:------------------:|:---------------:|:--------------:|:------------:|:--------------:|:---------------:|:-------------:|:---------------:|:-----------:|:-------:|
69
- | 1.0506 | 0.3140 | 100 | 1.0429 | -0.3597 | -0.4629 | -1.7879 | 0.5620 | 0.1032 | -276.2524 | -312.2498 | -188.5220 | 0.8938 | 0.9341 | 0.9707 | 0.3487 | 0.7045 |
70
- | 0.9432 | 0.6281 | 200 | 0.9939 | -0.6203 | -0.8053 | -2.4531 | 0.5920 | 0.1850 | -279.6760 | -314.8556 | -195.1740 | 0.8562 | 0.8910 | 0.9488 | 0.3102 | 0.6945 |
71
- | 0.9766 | 0.9421 | 300 | 0.9861 | -0.7020 | -0.9079 | -2.6099 | 0.6020 | 0.2059 | -280.7023 | -315.6724 | -196.7418 | 0.8408 | 0.8746 | 0.9388 | 0.3049 | 0.6913 |
72
 
73
 
74
  ### Framework versions
 
16
 
17
  This model was trained from scratch on the None dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 1.3955
20
+ - Rewards/chosen: -0.7655
21
+ - Rewards/rejected: -1.1037
22
+ - Rewards/gen: -0.9813
23
+ - Rewards/accuracies: 0.6480
24
+ - Rewards/margins: 0.3382
25
+ - Logps/rejected: -291.7406
26
+ - Logps/chosen: -323.3212
27
+ - Logps/response: -206.5588
28
+ - Logits/rejected: 1.0075
29
+ - Logits/chosen: 1.0518
30
+ - Logits/response: 1.1119
31
+ - Improvement: 0.7646
32
+ - Penalty: 0.6340
33
 
34
  ## Model description
35
 
 
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/gen | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logps/response | Logits/rejected | Logits/chosen | Logits/response | Improvement | Penalty |
68
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:-----------:|:------------------:|:---------------:|:--------------:|:------------:|:--------------:|:---------------:|:-------------:|:---------------:|:-----------:|:-------:|
69
+ | 0.6875 | 0.3140 | 100 | 1.3506 | -0.4754 | -0.6595 | -0.7565 | 0.6060 | 0.1841 | -287.2987 | -320.4206 | -204.3113 | 1.0762 | 1.1203 | 1.1703 | 0.6764 | 0.6691 |
70
+ | 0.6367 | 0.6281 | 200 | 1.4008 | -0.6776 | -1.0030 | -0.8686 | 0.6420 | 0.3254 | -290.7334 | -322.4421 | -205.4324 | 1.0335 | 1.0745 | 1.1297 | 0.7678 | 0.6372 |
71
+ | 0.6335 | 0.9421 | 300 | 1.3955 | -0.7655 | -1.1037 | -0.9813 | 0.6480 | 0.3382 | -291.7406 | -323.3212 | -206.5588 | 1.0075 | 1.0518 | 1.1119 | 0.7646 | 0.6340 |
72
 
73
 
74
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9986261040235525,
3
  "total_flos": 0.0,
4
- "train_loss": 1.0325740605780163,
5
- "train_runtime": 3343.8277,
6
  "train_samples": 20378,
7
- "train_samples_per_second": 6.094,
8
- "train_steps_per_second": 0.095
9
  }
 
1
  {
2
  "epoch": 0.9986261040235525,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.7174485007172111,
5
+ "train_runtime": 3896.582,
6
  "train_samples": 20378,
7
+ "train_samples_per_second": 5.23,
8
+ "train_steps_per_second": 0.082
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9986261040235525,
3
  "total_flos": 0.0,
4
- "train_loss": 1.0325740605780163,
5
- "train_runtime": 3343.8277,
6
  "train_samples": 20378,
7
- "train_samples_per_second": 6.094,
8
- "train_steps_per_second": 0.095
9
  }
 
1
  {
2
  "epoch": 0.9986261040235525,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.7174485007172111,
5
+ "train_runtime": 3896.582,
6
  "train_samples": 20378,
7
+ "train_samples_per_second": 5.23,
8
+ "train_steps_per_second": 0.082
9
  }
trainer_state.json CHANGED
@@ -10,15 +10,15 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003140333660451423,
13
- "grad_norm": 90.99359893798828,
14
  "improvement": 0.6931471824645996,
15
  "learning_rate": 1.5625e-08,
16
- "logits/chosen": 1.3514697551727295,
17
- "logits/rejected": 1.4558440446853638,
18
- "logits/response": 1.3213673830032349,
19
- "logps/chosen": -256.549560546875,
20
- "logps/rejected": -246.02056884765625,
21
- "logps/response": -111.43739318847656,
22
  "loss": 1.3863,
23
  "penalty": 0.6931471824645996,
24
  "rewards/accuracies": 0.0,
@@ -30,695 +30,695 @@
30
  },
31
  {
32
  "epoch": 0.03140333660451423,
33
- "grad_norm": 88.48505401611328,
34
- "improvement": 0.6908075213432312,
35
  "learning_rate": 1.5624999999999999e-07,
36
- "logits/chosen": 1.0925401449203491,
37
- "logits/rejected": 1.2437928915023804,
38
- "logits/response": 1.1619499921798706,
39
- "logps/chosen": -286.2481994628906,
40
- "logps/rejected": -304.1895751953125,
41
- "logps/response": -179.84832763671875,
42
- "loss": 1.3863,
43
- "penalty": 0.6960046291351318,
44
- "rewards/accuracies": 0.4097222089767456,
45
- "rewards/chosen": 0.0026971669867634773,
46
- "rewards/gen": -0.0007864322979003191,
47
- "rewards/margins": -0.004067544359713793,
48
- "rewards/rejected": 0.006764710880815983,
49
  "step": 10
50
  },
51
  {
52
  "epoch": 0.06280667320902845,
53
- "grad_norm": 81.83534240722656,
54
- "improvement": 0.6806281805038452,
55
  "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": 1.1330327987670898,
57
- "logits/rejected": 1.1094038486480713,
58
- "logits/response": 1.12299382686615,
59
- "logps/chosen": -265.14605712890625,
60
- "logps/rejected": -268.95941162109375,
61
- "logps/response": -156.56362915039062,
62
- "loss": 1.3741,
63
- "penalty": 0.6888696551322937,
64
- "rewards/accuracies": 0.5625,
65
- "rewards/chosen": 0.004303407855331898,
66
- "rewards/gen": -0.027122128754854202,
67
- "rewards/margins": 0.01006249524652958,
68
- "rewards/rejected": -0.005759088322520256,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.09421000981354269,
73
- "grad_norm": 77.37152099609375,
74
- "improvement": 0.6238707304000854,
75
  "learning_rate": 4.6874999999999996e-07,
76
- "logits/chosen": 1.2423532009124756,
77
- "logits/rejected": 1.0369462966918945,
78
- "logits/response": 1.284911870956421,
79
- "logps/chosen": -366.21832275390625,
80
- "logps/rejected": -271.5616149902344,
81
- "logps/response": -191.85791015625,
82
- "loss": 1.3185,
83
- "penalty": 0.6943666934967041,
84
- "rewards/accuracies": 0.5,
85
- "rewards/chosen": -0.013009263202548027,
86
- "rewards/gen": -0.16623075306415558,
87
- "rewards/margins": 0.0020996746607124805,
88
- "rewards/rejected": -0.015108938328921795,
89
  "step": 30
90
  },
91
  {
92
  "epoch": 0.1256133464180569,
93
- "grad_norm": 61.32085037231445,
94
- "improvement": 0.5091749429702759,
95
  "learning_rate": 4.990353313429303e-07,
96
- "logits/chosen": 1.388942837715149,
97
- "logits/rejected": 1.2175312042236328,
98
- "logits/response": 1.2681037187576294,
99
- "logps/chosen": -350.94537353515625,
100
- "logps/rejected": -293.6703186035156,
101
- "logps/response": -179.65731811523438,
102
- "loss": 1.2024,
103
- "penalty": 0.6836713552474976,
104
- "rewards/accuracies": 0.6000000238418579,
105
- "rewards/chosen": -0.07153712213039398,
106
- "rewards/gen": -0.587524950504303,
107
- "rewards/margins": 0.04365935176610947,
108
- "rewards/rejected": -0.11519646644592285,
109
  "step": 40
110
  },
111
  {
112
  "epoch": 0.15701668302257116,
113
- "grad_norm": 52.960693359375,
114
- "improvement": 0.43595728278160095,
115
  "learning_rate": 4.951291206355559e-07,
116
- "logits/chosen": 1.4330917596817017,
117
- "logits/rejected": 1.3490939140319824,
118
- "logits/response": 1.453476905822754,
119
- "logps/chosen": -284.6172180175781,
120
- "logps/rejected": -279.71728515625,
121
- "logps/response": -189.18643188476562,
122
- "loss": 1.1356,
123
- "penalty": 0.7028869986534119,
124
- "rewards/accuracies": 0.581250011920929,
125
- "rewards/chosen": -0.20936663448810577,
126
- "rewards/gen": -1.1179193258285522,
127
- "rewards/margins": 0.05230969190597534,
128
- "rewards/rejected": -0.2616763114929199,
129
  "step": 50
130
  },
131
  {
132
  "epoch": 0.18842001962708538,
133
- "grad_norm": 59.61778259277344,
134
- "improvement": 0.3726009428501129,
135
  "learning_rate": 4.882681251368548e-07,
136
- "logits/chosen": 1.404496192932129,
137
- "logits/rejected": 1.3673858642578125,
138
- "logits/response": 1.3688445091247559,
139
- "logps/chosen": -329.80340576171875,
140
- "logps/rejected": -319.18133544921875,
141
- "logps/response": -194.2289581298828,
142
- "loss": 1.1305,
143
- "penalty": 0.7743828892707825,
144
- "rewards/accuracies": 0.48750001192092896,
145
- "rewards/chosen": -0.4070376753807068,
146
- "rewards/gen": -1.5880019664764404,
147
- "rewards/margins": -0.050781238824129105,
148
- "rewards/rejected": -0.3562564253807068,
149
  "step": 60
150
  },
151
  {
152
  "epoch": 0.2198233562315996,
153
- "grad_norm": 50.95313262939453,
154
- "improvement": 0.3971942067146301,
155
  "learning_rate": 4.785350472409791e-07,
156
- "logits/chosen": 1.4829801321029663,
157
- "logits/rejected": 1.3746827840805054,
158
- "logits/response": 1.4223880767822266,
159
- "logps/chosen": -297.8169860839844,
160
- "logps/rejected": -259.929443359375,
161
- "logps/response": -165.90725708007812,
162
- "loss": 1.0959,
163
- "penalty": 0.6936275959014893,
164
- "rewards/accuracies": 0.5249999761581421,
165
- "rewards/chosen": -0.3267964720726013,
166
- "rewards/gen": -1.5778385400772095,
167
- "rewards/margins": 0.11241116374731064,
168
- "rewards/rejected": -0.43920764327049255,
169
  "step": 70
170
  },
171
  {
172
  "epoch": 0.2512266928361138,
173
- "grad_norm": 50.096981048583984,
174
- "improvement": 0.3729608952999115,
175
  "learning_rate": 4.6604720940421207e-07,
176
- "logits/chosen": 1.3744746446609497,
177
- "logits/rejected": 1.2923038005828857,
178
- "logits/response": 1.3090846538543701,
179
- "logps/chosen": -313.1800537109375,
180
- "logps/rejected": -258.7066955566406,
181
- "logps/response": -175.9040985107422,
182
- "loss": 1.1115,
183
- "penalty": 0.7144418358802795,
184
- "rewards/accuracies": 0.5874999761581421,
185
- "rewards/chosen": -0.3523169457912445,
186
- "rewards/gen": -1.8431284427642822,
187
- "rewards/margins": 0.10072679817676544,
188
- "rewards/rejected": -0.45304378867149353,
189
  "step": 80
190
  },
191
  {
192
  "epoch": 0.2826300294406281,
193
- "grad_norm": 51.301666259765625,
194
- "improvement": 0.3327198624610901,
195
  "learning_rate": 4.5095513994085974e-07,
196
- "logits/chosen": 1.3864705562591553,
197
- "logits/rejected": 1.2353299856185913,
198
- "logits/response": 1.3959097862243652,
199
- "logps/chosen": -324.7561340332031,
200
- "logps/rejected": -244.14535522460938,
201
- "logps/response": -200.8162078857422,
202
- "loss": 1.0443,
203
- "penalty": 0.6970937252044678,
204
- "rewards/accuracies": 0.5375000238418579,
205
- "rewards/chosen": -0.30312663316726685,
206
- "rewards/gen": -1.87189519405365,
207
- "rewards/margins": 0.10124621540307999,
208
- "rewards/rejected": -0.4043728709220886,
209
  "step": 90
210
  },
211
  {
212
  "epoch": 0.3140333660451423,
213
- "grad_norm": 46.86961364746094,
214
- "improvement": 0.3321753144264221,
215
  "learning_rate": 4.3344075855595097e-07,
216
- "logits/chosen": 1.1061931848526,
217
- "logits/rejected": 1.1866940259933472,
218
- "logits/response": 1.2408634424209595,
219
- "logps/chosen": -272.04541015625,
220
- "logps/rejected": -263.6874084472656,
221
- "logps/response": -188.56777954101562,
222
- "loss": 1.0506,
223
- "penalty": 0.6661034226417542,
224
- "rewards/accuracies": 0.59375,
225
- "rewards/chosen": -0.30777376890182495,
226
- "rewards/gen": -1.7470309734344482,
227
- "rewards/margins": 0.17530158162117004,
228
- "rewards/rejected": -0.4830753207206726,
229
  "step": 100
230
  },
231
  {
232
  "epoch": 0.3140333660451423,
233
- "eval_improvement": 0.34868791699409485,
234
- "eval_logits/chosen": 0.9340688586235046,
235
- "eval_logits/rejected": 0.8938046097755432,
236
- "eval_logits/response": 0.9706884026527405,
237
- "eval_logps/chosen": -312.2497863769531,
238
- "eval_logps/rejected": -276.25238037109375,
239
- "eval_logps/response": -188.52197265625,
240
- "eval_loss": 1.042933702468872,
241
- "eval_penalty": 0.704525351524353,
242
- "eval_rewards/accuracies": 0.5619999766349792,
243
- "eval_rewards/chosen": -0.35974380373954773,
244
- "eval_rewards/gen": -1.787913203239441,
245
- "eval_rewards/margins": 0.10319552570581436,
246
- "eval_rewards/rejected": -0.4629393219947815,
247
- "eval_runtime": 133.7047,
248
- "eval_samples_per_second": 14.958,
249
- "eval_steps_per_second": 1.87,
250
  "step": 100
251
  },
252
  {
253
  "epoch": 0.34543670264965654,
254
- "grad_norm": 47.566768646240234,
255
- "improvement": 0.310733437538147,
256
  "learning_rate": 4.137151834863213e-07,
257
- "logits/chosen": 1.0738614797592163,
258
- "logits/rejected": 1.086809515953064,
259
- "logits/response": 1.1392853260040283,
260
- "logps/chosen": -295.4719543457031,
261
- "logps/rejected": -279.11383056640625,
262
- "logps/response": -192.47640991210938,
263
- "loss": 1.0474,
264
- "penalty": 0.6999384760856628,
265
- "rewards/accuracies": 0.581250011920929,
266
- "rewards/chosen": -0.3630922734737396,
267
- "rewards/gen": -1.9901072978973389,
268
- "rewards/margins": 0.14921404421329498,
269
- "rewards/rejected": -0.5123063325881958,
270
  "step": 110
271
  },
272
  {
273
  "epoch": 0.37684003925417076,
274
- "grad_norm": 47.566810607910156,
275
- "improvement": 0.2977697253227234,
276
  "learning_rate": 3.920161866827889e-07,
277
- "logits/chosen": 1.3284032344818115,
278
- "logits/rejected": 1.2398704290390015,
279
- "logits/response": 1.488197922706604,
280
- "logps/chosen": -347.5028991699219,
281
- "logps/rejected": -263.88165283203125,
282
- "logps/response": -211.24081420898438,
283
- "loss": 0.9844,
284
- "penalty": 0.6048867702484131,
285
- "rewards/accuracies": 0.675000011920929,
286
- "rewards/chosen": -0.2518952190876007,
287
- "rewards/gen": -2.0901896953582764,
288
- "rewards/margins": 0.3502174913883209,
289
- "rewards/rejected": -0.6021127104759216,
290
  "step": 120
291
  },
292
  {
293
  "epoch": 0.408243375858685,
294
- "grad_norm": 45.997779846191406,
295
- "improvement": 0.3221287727355957,
296
  "learning_rate": 3.6860532770864005e-07,
297
- "logits/chosen": 1.2325729131698608,
298
- "logits/rejected": 1.1537472009658813,
299
- "logits/response": 1.307138204574585,
300
- "logps/chosen": -286.8653564453125,
301
- "logps/rejected": -271.20159912109375,
302
- "logps/response": -182.6356658935547,
303
- "loss": 0.9735,
304
- "penalty": 0.6481137275695801,
305
- "rewards/accuracies": 0.6312500238418579,
306
- "rewards/chosen": -0.3536524772644043,
307
- "rewards/gen": -1.9381176233291626,
308
- "rewards/margins": 0.21084347367286682,
309
- "rewards/rejected": -0.5644959211349487,
310
  "step": 130
311
  },
312
  {
313
  "epoch": 0.4396467124631992,
314
- "grad_norm": 42.91578674316406,
315
- "improvement": 0.3166593611240387,
316
  "learning_rate": 3.4376480090239047e-07,
317
- "logits/chosen": 1.259896993637085,
318
- "logits/rejected": 1.2792214155197144,
319
- "logits/response": 1.3045755624771118,
320
- "logps/chosen": -300.57342529296875,
321
- "logps/rejected": -275.1106262207031,
322
- "logps/response": -190.06996154785156,
323
- "loss": 0.9986,
324
- "penalty": 0.6845930814743042,
325
- "rewards/accuracies": 0.581250011920929,
326
- "rewards/chosen": -0.44455188512802124,
327
- "rewards/gen": -2.133059501647949,
328
- "rewards/margins": 0.1889132261276245,
329
- "rewards/rejected": -0.6334651708602905,
330
  "step": 140
331
  },
332
  {
333
  "epoch": 0.47105004906771347,
334
- "grad_norm": 43.6589241027832,
335
- "improvement": 0.3184241056442261,
336
  "learning_rate": 3.1779403380910425e-07,
337
- "logits/chosen": 1.1337382793426514,
338
- "logits/rejected": 1.1580231189727783,
339
- "logits/response": 1.1197118759155273,
340
- "logps/chosen": -277.9029846191406,
341
- "logps/rejected": -250.3749542236328,
342
- "logps/response": -191.55584716796875,
343
- "loss": 0.9929,
344
- "penalty": 0.6434761881828308,
345
- "rewards/accuracies": 0.6312500238418579,
346
- "rewards/chosen": -0.38797393441200256,
347
- "rewards/gen": -2.0863261222839355,
348
- "rewards/margins": 0.23583289980888367,
349
- "rewards/rejected": -0.6238068342208862,
350
  "step": 150
351
  },
352
  {
353
  "epoch": 0.5024533856722276,
354
- "grad_norm": 49.80767059326172,
355
- "improvement": 0.3020181953907013,
356
  "learning_rate": 2.910060778827554e-07,
357
- "logits/chosen": 1.080392599105835,
358
- "logits/rejected": 1.1482455730438232,
359
- "logits/response": 1.1770050525665283,
360
- "logps/chosen": -302.5500183105469,
361
- "logps/rejected": -283.81341552734375,
362
- "logps/response": -196.9333953857422,
363
- "loss": 0.9816,
364
- "penalty": 0.6278713345527649,
365
  "rewards/accuracies": 0.668749988079071,
366
- "rewards/chosen": -0.4150875210762024,
367
- "rewards/gen": -2.412015199661255,
368
- "rewards/margins": 0.31777530908584595,
369
- "rewards/rejected": -0.7328628301620483,
370
  "step": 160
371
  },
372
  {
373
  "epoch": 0.5338567222767419,
374
- "grad_norm": 62.087135314941406,
375
- "improvement": 0.3313751220703125,
376
  "learning_rate": 2.6372383496608186e-07,
377
- "logits/chosen": 1.2298156023025513,
378
- "logits/rejected": 1.187656044960022,
379
- "logits/response": 1.1628153324127197,
380
- "logps/chosen": -319.54022216796875,
381
- "logps/rejected": -319.6458740234375,
382
- "logps/response": -189.41993713378906,
383
- "loss": 0.9651,
384
- "penalty": 0.7312244176864624,
385
- "rewards/accuracies": 0.5687500238418579,
386
- "rewards/chosen": -0.5790542364120483,
387
- "rewards/gen": -2.1839423179626465,
388
- "rewards/margins": 0.11392831802368164,
389
- "rewards/rejected": -0.69298255443573,
390
  "step": 170
391
  },
392
  {
393
  "epoch": 0.5652600588812562,
394
- "grad_norm": 48.49706268310547,
395
- "improvement": 0.34473639726638794,
396
  "learning_rate": 2.3627616503391812e-07,
397
- "logits/chosen": 1.1173150539398193,
398
- "logits/rejected": 1.1408215761184692,
399
- "logits/response": 1.1199573278427124,
400
- "logps/chosen": -283.539306640625,
401
- "logps/rejected": -250.66598510742188,
402
- "logps/response": -179.87173461914062,
403
- "loss": 0.9908,
404
- "penalty": 0.6365185379981995,
405
- "rewards/accuracies": 0.643750011920929,
406
- "rewards/chosen": -0.39007699489593506,
407
- "rewards/gen": -2.085050106048584,
408
- "rewards/margins": 0.3132508099079132,
409
- "rewards/rejected": -0.7033277750015259,
410
  "step": 180
411
  },
412
  {
413
  "epoch": 0.5966633954857704,
414
- "grad_norm": 52.51746368408203,
415
- "improvement": 0.3269854485988617,
416
  "learning_rate": 2.089939221172446e-07,
417
- "logits/chosen": 1.2556055784225464,
418
- "logits/rejected": 1.26613450050354,
419
- "logits/response": 1.2274242639541626,
420
- "logps/chosen": -280.4549255371094,
421
- "logps/rejected": -285.5497741699219,
422
- "logps/response": -159.73428344726562,
423
- "loss": 0.9521,
424
- "penalty": 0.6330380439758301,
425
- "rewards/accuracies": 0.6312500238418579,
426
- "rewards/chosen": -0.3872259557247162,
427
- "rewards/gen": -2.0674312114715576,
428
- "rewards/margins": 0.30792227387428284,
429
- "rewards/rejected": -0.6951482892036438,
430
  "step": 190
431
  },
432
  {
433
  "epoch": 0.6280667320902846,
434
- "grad_norm": 50.308650970458984,
435
- "improvement": 0.29781386256217957,
436
  "learning_rate": 1.8220596619089573e-07,
437
- "logits/chosen": 1.2265857458114624,
438
- "logits/rejected": 1.2468065023422241,
439
- "logits/response": 1.3728151321411133,
440
- "logps/chosen": -271.88836669921875,
441
- "logps/rejected": -263.320068359375,
442
- "logps/response": -174.91575622558594,
443
- "loss": 0.9432,
444
- "penalty": 0.6788309812545776,
445
- "rewards/accuracies": 0.6312500238418579,
446
- "rewards/chosen": -0.5131560564041138,
447
- "rewards/gen": -2.364034652709961,
448
- "rewards/margins": 0.21325425803661346,
449
- "rewards/rejected": -0.7264103293418884,
450
  "step": 200
451
  },
452
  {
453
  "epoch": 0.6280667320902846,
454
- "eval_improvement": 0.31024378538131714,
455
- "eval_logits/chosen": 0.890977680683136,
456
- "eval_logits/rejected": 0.8562321662902832,
457
- "eval_logits/response": 0.9488180875778198,
458
- "eval_logps/chosen": -314.8556213378906,
459
- "eval_logps/rejected": -279.6759948730469,
460
- "eval_logps/response": -195.17404174804688,
461
- "eval_loss": 0.9939214587211609,
462
- "eval_penalty": 0.6945350170135498,
463
- "eval_rewards/accuracies": 0.5920000076293945,
464
- "eval_rewards/chosen": -0.6203244924545288,
465
- "eval_rewards/gen": -2.453124523162842,
466
- "eval_rewards/margins": 0.18497607111930847,
467
- "eval_rewards/rejected": -0.8053005337715149,
468
- "eval_runtime": 131.2858,
469
- "eval_samples_per_second": 15.234,
470
- "eval_steps_per_second": 1.904,
471
  "step": 200
472
  },
473
  {
474
  "epoch": 0.6594700686947988,
475
- "grad_norm": 46.08613967895508,
476
- "improvement": 0.3185407519340515,
477
  "learning_rate": 1.562351990976095e-07,
478
- "logits/chosen": 1.124887228012085,
479
- "logits/rejected": 1.2165838479995728,
480
- "logits/response": 1.2256819009780884,
481
- "logps/chosen": -318.11102294921875,
482
- "logps/rejected": -313.8572082519531,
483
- "logps/response": -191.56932067871094,
484
- "loss": 0.9183,
485
- "penalty": 0.6234660148620605,
486
- "rewards/accuracies": 0.6187499761581421,
487
- "rewards/chosen": -0.4947032034397125,
488
- "rewards/gen": -2.4716310501098633,
489
- "rewards/margins": 0.3273914158344269,
490
- "rewards/rejected": -0.8220946192741394,
491
  "step": 210
492
  },
493
  {
494
  "epoch": 0.6908734052993131,
495
- "grad_norm": 49.8296012878418,
496
- "improvement": 0.2562139630317688,
497
  "learning_rate": 1.3139467229135998e-07,
498
- "logits/chosen": 1.0995049476623535,
499
- "logits/rejected": 1.1437008380889893,
500
- "logits/response": 1.1871978044509888,
501
- "logps/chosen": -310.8740539550781,
502
- "logps/rejected": -332.66162109375,
503
- "logps/response": -202.8819580078125,
504
- "loss": 0.9309,
505
- "penalty": 0.6312650442123413,
506
- "rewards/accuracies": 0.6625000238418579,
507
- "rewards/chosen": -0.5009514093399048,
508
- "rewards/gen": -2.697756052017212,
509
- "rewards/margins": 0.3737282156944275,
510
- "rewards/rejected": -0.874679684638977,
511
  "step": 220
512
  },
513
  {
514
  "epoch": 0.7222767419038273,
515
- "grad_norm": 43.834163665771484,
516
- "improvement": 0.30367541313171387,
517
  "learning_rate": 1.0798381331721107e-07,
518
- "logits/chosen": 1.1779167652130127,
519
- "logits/rejected": 1.0963248014450073,
520
- "logits/response": 1.1921260356903076,
521
- "logps/chosen": -292.7645568847656,
522
- "logps/rejected": -265.25018310546875,
523
- "logps/response": -173.6116485595703,
524
- "loss": 0.9371,
525
- "penalty": 0.6615164279937744,
526
- "rewards/accuracies": 0.6312500238418579,
527
- "rewards/chosen": -0.5470173358917236,
528
- "rewards/gen": -2.3808140754699707,
529
- "rewards/margins": 0.3109579086303711,
530
- "rewards/rejected": -0.85797518491745,
531
  "step": 230
532
  },
533
  {
534
  "epoch": 0.7536800785083415,
535
- "grad_norm": 48.935142517089844,
536
- "improvement": 0.2968369424343109,
537
  "learning_rate": 8.628481651367875e-08,
538
- "logits/chosen": 1.1252264976501465,
539
- "logits/rejected": 1.0409650802612305,
540
- "logits/response": 1.1756417751312256,
541
- "logps/chosen": -312.1551208496094,
542
- "logps/rejected": -272.1668395996094,
543
- "logps/response": -187.1859893798828,
544
- "loss": 0.9424,
545
- "penalty": 0.6408424377441406,
546
- "rewards/accuracies": 0.612500011920929,
547
- "rewards/chosen": -0.6089781522750854,
548
- "rewards/gen": -2.672689437866211,
549
- "rewards/margins": 0.27286165952682495,
550
- "rewards/rejected": -0.8818397521972656,
551
  "step": 240
552
  },
553
  {
554
  "epoch": 0.7850834151128557,
555
- "grad_norm": 48.9501838684082,
556
- "improvement": 0.29138535261154175,
557
  "learning_rate": 6.655924144404906e-08,
558
- "logits/chosen": 1.2226974964141846,
559
- "logits/rejected": 1.2076431512832642,
560
- "logits/response": 1.2697948217391968,
561
- "logps/chosen": -296.0802917480469,
562
- "logps/rejected": -281.9087829589844,
563
- "logps/response": -191.81581115722656,
564
- "loss": 0.9396,
565
- "penalty": 0.6523431539535522,
566
- "rewards/accuracies": 0.668749988079071,
567
- "rewards/chosen": -0.5890125036239624,
568
- "rewards/gen": -2.5938172340393066,
569
- "rewards/margins": 0.2846723198890686,
570
- "rewards/rejected": -0.8736848831176758,
571
  "step": 250
572
  },
573
  {
574
  "epoch": 0.81648675171737,
575
- "grad_norm": 50.87124252319336,
576
- "improvement": 0.2962267994880676,
577
  "learning_rate": 4.904486005914027e-08,
578
- "logits/chosen": 1.247997760772705,
579
- "logits/rejected": 1.2229816913604736,
580
- "logits/response": 1.220820426940918,
581
- "logps/chosen": -325.2000427246094,
582
- "logps/rejected": -324.00360107421875,
583
- "logps/response": -192.4630126953125,
584
- "loss": 0.9512,
585
- "penalty": 0.6405457258224487,
586
- "rewards/accuracies": 0.6312500238418579,
587
- "rewards/chosen": -0.5452840924263,
588
- "rewards/gen": -2.5521068572998047,
589
- "rewards/margins": 0.31265154480934143,
590
- "rewards/rejected": -0.8579355478286743,
591
  "step": 260
592
  },
593
  {
594
  "epoch": 0.8478900883218842,
595
- "grad_norm": 49.64020538330078,
596
- "improvement": 0.2779327929019928,
597
  "learning_rate": 3.3952790595787986e-08,
598
- "logits/chosen": 1.181114912033081,
599
- "logits/rejected": 1.0998259782791138,
600
- "logits/response": 1.2433074712753296,
601
- "logps/chosen": -349.3531494140625,
602
- "logps/rejected": -303.54290771484375,
603
- "logps/response": -204.0261688232422,
604
- "loss": 0.9736,
605
- "penalty": 0.6398892402648926,
606
  "rewards/accuracies": 0.6499999761581421,
607
- "rewards/chosen": -0.5694975256919861,
608
- "rewards/gen": -2.798640489578247,
609
- "rewards/margins": 0.3327513337135315,
610
- "rewards/rejected": -0.9022488594055176,
611
  "step": 270
612
  },
613
  {
614
  "epoch": 0.8792934249263984,
615
- "grad_norm": 41.62852096557617,
616
- "improvement": 0.2895236015319824,
617
  "learning_rate": 2.1464952759020856e-08,
618
- "logits/chosen": 1.3301640748977661,
619
- "logits/rejected": 1.3078454732894897,
620
- "logits/response": 1.366155743598938,
621
- "logps/chosen": -290.0614013671875,
622
- "logps/rejected": -289.795654296875,
623
- "logps/response": -183.56405639648438,
624
- "loss": 0.9523,
625
- "penalty": 0.6176464557647705,
626
- "rewards/accuracies": 0.606249988079071,
627
- "rewards/chosen": -0.517002284526825,
628
- "rewards/gen": -2.5910143852233887,
629
- "rewards/margins": 0.3448910117149353,
630
- "rewards/rejected": -0.861893355846405,
631
  "step": 280
632
  },
633
  {
634
  "epoch": 0.9106967615309126,
635
- "grad_norm": 44.526798248291016,
636
- "improvement": 0.27915069460868835,
637
  "learning_rate": 1.1731874863145142e-08,
638
- "logits/chosen": 1.107360601425171,
639
- "logits/rejected": 1.115980863571167,
640
- "logits/response": 1.1596436500549316,
641
- "logps/chosen": -331.88482666015625,
642
- "logps/rejected": -284.38092041015625,
643
- "logps/response": -202.94888305664062,
644
- "loss": 0.9388,
645
- "penalty": 0.5815633535385132,
646
- "rewards/accuracies": 0.65625,
647
- "rewards/chosen": -0.5069407224655151,
648
- "rewards/gen": -2.5722098350524902,
649
- "rewards/margins": 0.4680548310279846,
650
- "rewards/rejected": -0.974995493888855,
651
  "step": 290
652
  },
653
  {
654
  "epoch": 0.9421000981354269,
655
- "grad_norm": 48.79948425292969,
656
- "improvement": 0.276977002620697,
657
  "learning_rate": 4.8708793644441086e-09,
658
- "logits/chosen": 1.1860829591751099,
659
- "logits/rejected": 1.1684296131134033,
660
- "logits/response": 1.3256675004959106,
661
- "logps/chosen": -323.58575439453125,
662
- "logps/rejected": -287.38873291015625,
663
- "logps/response": -207.91500854492188,
664
- "loss": 0.9766,
665
- "penalty": 0.7038360834121704,
666
- "rewards/accuracies": 0.612500011920929,
667
- "rewards/chosen": -0.6609238982200623,
668
- "rewards/gen": -2.6817288398742676,
669
- "rewards/margins": 0.17689001560211182,
670
- "rewards/rejected": -0.8378139734268188,
671
  "step": 300
672
  },
673
  {
674
  "epoch": 0.9421000981354269,
675
- "eval_improvement": 0.30491918325424194,
676
- "eval_logits/chosen": 0.8745693564414978,
677
- "eval_logits/rejected": 0.8407858610153198,
678
- "eval_logits/response": 0.9387850165367126,
679
- "eval_logps/chosen": -315.67236328125,
680
- "eval_logps/rejected": -280.70233154296875,
681
- "eval_logps/response": -196.74180603027344,
682
- "eval_loss": 0.9861029982566833,
683
- "eval_penalty": 0.6912589073181152,
684
- "eval_rewards/accuracies": 0.6019999980926514,
685
- "eval_rewards/chosen": -0.7020009160041809,
686
- "eval_rewards/gen": -2.609898328781128,
687
- "eval_rewards/margins": 0.20593461394309998,
688
- "eval_rewards/rejected": -0.9079356789588928,
689
- "eval_runtime": 130.1224,
690
- "eval_samples_per_second": 15.37,
691
- "eval_steps_per_second": 1.921,
692
  "step": 300
693
  },
694
  {
695
  "epoch": 0.9735034347399412,
696
- "grad_norm": 48.872318267822266,
697
- "improvement": 0.27313023805618286,
698
  "learning_rate": 9.64668657069706e-10,
699
- "logits/chosen": 1.1439440250396729,
700
- "logits/rejected": 1.109357237815857,
701
- "logits/response": 1.206132173538208,
702
- "logps/chosen": -325.14862060546875,
703
- "logps/rejected": -304.11956787109375,
704
- "logps/response": -201.95692443847656,
705
- "loss": 0.9496,
706
- "penalty": 0.6938046216964722,
707
- "rewards/accuracies": 0.5874999761581421,
708
- "rewards/chosen": -0.654919445514679,
709
- "rewards/gen": -2.729240894317627,
710
- "rewards/margins": 0.16051390767097473,
711
- "rewards/rejected": -0.8154333233833313,
712
  "step": 310
713
  },
714
  {
715
  "epoch": 0.9986261040235525,
716
  "step": 318,
717
  "total_flos": 0.0,
718
- "train_loss": 1.0325740605780163,
719
- "train_runtime": 3343.8277,
720
- "train_samples_per_second": 6.094,
721
- "train_steps_per_second": 0.095
722
  }
723
  ],
724
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003140333660451423,
13
+ "grad_norm": 200.00250244140625,
14
  "improvement": 0.6931471824645996,
15
  "learning_rate": 1.5625e-08,
16
+ "logits/chosen": 1.1002808809280396,
17
+ "logits/rejected": 1.2185591459274292,
18
+ "logits/response": 1.4746066331863403,
19
+ "logps/chosen": -266.4275207519531,
20
+ "logps/rejected": -258.69293212890625,
21
+ "logps/response": -758.1343994140625,
22
  "loss": 1.3863,
23
  "penalty": 0.6931471824645996,
24
  "rewards/accuracies": 0.0,
 
30
  },
31
  {
32
  "epoch": 0.03140333660451423,
33
+ "grad_norm": 167.06924438476562,
34
+ "improvement": 0.6249845623970032,
35
  "learning_rate": 1.5624999999999999e-07,
36
+ "logits/chosen": 1.2528102397918701,
37
+ "logits/rejected": 1.1278939247131348,
38
+ "logits/response": 1.543837308883667,
39
+ "logps/chosen": -338.7834167480469,
40
+ "logps/rejected": -262.37567138671875,
41
+ "logps/response": -920.6803588867188,
42
+ "loss": 1.3201,
43
+ "penalty": 0.6959547996520996,
44
+ "rewards/accuracies": 0.4305555522441864,
45
+ "rewards/chosen": 0.010056542232632637,
46
+ "rewards/gen": -0.14015476405620575,
47
+ "rewards/margins": -0.004349019844084978,
48
+ "rewards/rejected": 0.014405561611056328,
49
  "step": 10
50
  },
51
  {
52
  "epoch": 0.06280667320902845,
53
+ "grad_norm": 74.37409973144531,
54
+ "improvement": 0.2921519875526428,
55
  "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": 1.2214809656143188,
57
+ "logits/rejected": 1.1795613765716553,
58
+ "logits/response": 1.5969406366348267,
59
+ "logps/chosen": -302.14825439453125,
60
+ "logps/rejected": -276.88629150390625,
61
+ "logps/response": -1001.9197387695312,
62
+ "loss": 1.0297,
63
+ "penalty": 0.7252423763275146,
64
+ "rewards/accuracies": 0.4000000059604645,
65
+ "rewards/chosen": 0.050227295607328415,
66
+ "rewards/gen": -1.1749426126480103,
67
+ "rewards/margins": -0.05469644069671631,
68
+ "rewards/rejected": 0.10492374747991562,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.09421000981354269,
73
+ "grad_norm": 52.853057861328125,
74
+ "improvement": 0.08411481231451035,
75
  "learning_rate": 4.6874999999999996e-07,
76
+ "logits/chosen": 1.488523244857788,
77
+ "logits/rejected": 1.3755172491073608,
78
+ "logits/response": 1.854111909866333,
79
+ "logps/chosen": -329.28521728515625,
80
+ "logps/rejected": -283.4335021972656,
81
+ "logps/response": -953.9349365234375,
82
+ "loss": 0.8522,
83
+ "penalty": 0.7766379714012146,
84
+ "rewards/accuracies": 0.34375,
85
+ "rewards/chosen": 0.03275767341256142,
86
+ "rewards/gen": -3.0305302143096924,
87
+ "rewards/margins": -0.12535743415355682,
88
+ "rewards/rejected": 0.15811510384082794,
89
  "step": 30
90
  },
91
  {
92
  "epoch": 0.1256133464180569,
93
+ "grad_norm": 55.16160583496094,
94
+ "improvement": 0.044371988624334335,
95
  "learning_rate": 4.990353313429303e-07,
96
+ "logits/chosen": 1.4546756744384766,
97
+ "logits/rejected": 1.3696863651275635,
98
+ "logits/response": 2.0380258560180664,
99
+ "logps/chosen": -291.39862060546875,
100
+ "logps/rejected": -258.83160400390625,
101
+ "logps/response": -974.2811279296875,
102
+ "loss": 0.8206,
103
+ "penalty": 0.794792890548706,
104
+ "rewards/accuracies": 0.41874998807907104,
105
+ "rewards/chosen": -0.042534660547971725,
106
+ "rewards/gen": -4.516351222991943,
107
+ "rewards/margins": -0.1319345235824585,
108
+ "rewards/rejected": 0.08939988166093826,
109
  "step": 40
110
  },
111
  {
112
  "epoch": 0.15701668302257116,
113
+ "grad_norm": 52.997276306152344,
114
+ "improvement": 0.039832912385463715,
115
  "learning_rate": 4.951291206355559e-07,
116
+ "logits/chosen": 1.4147427082061768,
117
+ "logits/rejected": 1.3478193283081055,
118
+ "logits/response": 1.897972822189331,
119
+ "logps/chosen": -316.64971923828125,
120
+ "logps/rejected": -287.1830749511719,
121
+ "logps/response": -962.357421875,
122
+ "loss": 0.8111,
123
+ "penalty": 0.75470370054245,
124
+ "rewards/accuracies": 0.4625000059604645,
125
+ "rewards/chosen": -0.02022326923906803,
126
+ "rewards/gen": -4.572790145874023,
127
+ "rewards/margins": -0.057704925537109375,
128
+ "rewards/rejected": 0.03748166561126709,
129
  "step": 50
130
  },
131
  {
132
  "epoch": 0.18842001962708538,
133
+ "grad_norm": 51.460330963134766,
134
+ "improvement": 0.04238835722208023,
135
  "learning_rate": 4.882681251368548e-07,
136
+ "logits/chosen": 1.493253469467163,
137
+ "logits/rejected": 1.5756161212921143,
138
+ "logits/response": 2.1205339431762695,
139
+ "logps/chosen": -271.7294616699219,
140
+ "logps/rejected": -284.16998291015625,
141
+ "logps/response": -965.3502807617188,
142
+ "loss": 0.7701,
143
+ "penalty": 0.6884164214134216,
144
+ "rewards/accuracies": 0.5687500238418579,
145
+ "rewards/chosen": 0.01879013516008854,
146
+ "rewards/gen": -4.314782619476318,
147
+ "rewards/margins": 0.05694611743092537,
148
+ "rewards/rejected": -0.03815598413348198,
149
  "step": 60
150
  },
151
  {
152
  "epoch": 0.2198233562315996,
153
+ "grad_norm": 47.554962158203125,
154
+ "improvement": 0.05140012502670288,
155
  "learning_rate": 4.785350472409791e-07,
156
+ "logits/chosen": 1.4707852602005005,
157
+ "logits/rejected": 1.3398946523666382,
158
+ "logits/response": 1.9091663360595703,
159
+ "logps/chosen": -293.58685302734375,
160
+ "logps/rejected": -277.17047119140625,
161
+ "logps/response": -1005.49560546875,
162
+ "loss": 0.7433,
163
+ "penalty": 0.6779423952102661,
164
+ "rewards/accuracies": 0.59375,
165
+ "rewards/chosen": -0.21125507354736328,
166
+ "rewards/gen": -4.562561511993408,
167
+ "rewards/margins": 0.09027546644210815,
168
+ "rewards/rejected": -0.30153053998947144,
169
  "step": 70
170
  },
171
  {
172
  "epoch": 0.2512266928361138,
173
+ "grad_norm": 52.48830795288086,
174
+ "improvement": 0.05445639044046402,
175
  "learning_rate": 4.6604720940421207e-07,
176
+ "logits/chosen": 1.3364737033843994,
177
+ "logits/rejected": 1.348964810371399,
178
+ "logits/response": 1.8606315851211548,
179
+ "logps/chosen": -330.0252685546875,
180
+ "logps/rejected": -295.9020080566406,
181
+ "logps/response": -994.1389770507812,
182
+ "loss": 0.7334,
183
+ "penalty": 0.6729768514633179,
184
+ "rewards/accuracies": 0.581250011920929,
185
+ "rewards/chosen": -0.43410032987594604,
186
+ "rewards/gen": -4.849533557891846,
187
+ "rewards/margins": 0.1275758445262909,
188
+ "rewards/rejected": -0.5616761445999146,
189
  "step": 80
190
  },
191
  {
192
  "epoch": 0.2826300294406281,
193
+ "grad_norm": 48.90186309814453,
194
+ "improvement": 0.03790447860956192,
195
  "learning_rate": 4.5095513994085974e-07,
196
+ "logits/chosen": 1.3569543361663818,
197
+ "logits/rejected": 1.298161506652832,
198
+ "logits/response": 1.9093761444091797,
199
+ "logps/chosen": -333.34332275390625,
200
+ "logps/rejected": -314.8207092285156,
201
+ "logps/response": -995.0445556640625,
202
+ "loss": 0.6955,
203
+ "penalty": 0.6896320581436157,
204
+ "rewards/accuracies": 0.550000011920929,
205
+ "rewards/chosen": -0.5641836524009705,
206
+ "rewards/gen": -5.513597011566162,
207
+ "rewards/margins": 0.11063194274902344,
208
+ "rewards/rejected": -0.6748155355453491,
209
  "step": 90
210
  },
211
  {
212
  "epoch": 0.3140333660451423,
213
+ "grad_norm": 43.70199966430664,
214
+ "improvement": 0.045615434646606445,
215
  "learning_rate": 4.3344075855595097e-07,
216
+ "logits/chosen": 1.3917421102523804,
217
+ "logits/rejected": 1.3425309658050537,
218
+ "logits/response": 1.9128587245941162,
219
+ "logps/chosen": -337.1028747558594,
220
+ "logps/rejected": -292.86566162109375,
221
+ "logps/response": -978.150390625,
222
+ "loss": 0.6875,
223
+ "penalty": 0.6265669465065002,
224
+ "rewards/accuracies": 0.6499999761581421,
225
+ "rewards/chosen": -0.48066097497940063,
226
+ "rewards/gen": -5.6218461990356445,
227
+ "rewards/margins": 0.2668846547603607,
228
+ "rewards/rejected": -0.747545599937439,
229
  "step": 100
230
  },
231
  {
232
  "epoch": 0.3140333660451423,
233
+ "eval_improvement": 0.6763742566108704,
234
+ "eval_logits/chosen": 1.120324730873108,
235
+ "eval_logits/rejected": 1.0761666297912598,
236
+ "eval_logits/response": 1.170290231704712,
237
+ "eval_logps/chosen": -320.4205627441406,
238
+ "eval_logps/rejected": -287.2986755371094,
239
+ "eval_logps/response": -204.3113250732422,
240
+ "eval_loss": 1.3505975008010864,
241
+ "eval_penalty": 0.6691088080406189,
242
+ "eval_rewards/accuracies": 0.6060000061988831,
243
+ "eval_rewards/chosen": -0.47539979219436646,
244
+ "eval_rewards/gen": -0.756511926651001,
245
+ "eval_rewards/margins": 0.18411880731582642,
246
+ "eval_rewards/rejected": -0.6595185995101929,
247
+ "eval_runtime": 141.2797,
248
+ "eval_samples_per_second": 14.156,
249
+ "eval_steps_per_second": 1.77,
250
  "step": 100
251
  },
252
  {
253
  "epoch": 0.34543670264965654,
254
+ "grad_norm": 50.130496978759766,
255
+ "improvement": 0.03334973007440567,
256
  "learning_rate": 4.137151834863213e-07,
257
+ "logits/chosen": 1.4784291982650757,
258
+ "logits/rejected": 1.4504770040512085,
259
+ "logits/response": 2.0669291019439697,
260
+ "logps/chosen": -282.5090026855469,
261
+ "logps/rejected": -292.196533203125,
262
+ "logps/response": -916.5537109375,
263
+ "loss": 0.6779,
264
+ "penalty": 0.6117620468139648,
265
+ "rewards/accuracies": 0.6875,
266
+ "rewards/chosen": -0.41435232758522034,
267
+ "rewards/gen": -6.181846618652344,
268
+ "rewards/margins": 0.31640559434890747,
269
+ "rewards/rejected": -0.730758011341095,
270
  "step": 110
271
  },
272
  {
273
  "epoch": 0.37684003925417076,
274
+ "grad_norm": 52.74955749511719,
275
+ "improvement": 0.058611877262592316,
276
  "learning_rate": 3.920161866827889e-07,
277
+ "logits/chosen": 1.3459668159484863,
278
+ "logits/rejected": 1.2685140371322632,
279
+ "logits/response": 1.8824383020401,
280
+ "logps/chosen": -362.947509765625,
281
+ "logps/rejected": -320.85797119140625,
282
+ "logps/response": -1031.55517578125,
283
+ "loss": 0.7052,
284
+ "penalty": 0.6450746655464172,
285
+ "rewards/accuracies": 0.606249988079071,
286
+ "rewards/chosen": -0.8527549505233765,
287
+ "rewards/gen": -6.363945007324219,
288
+ "rewards/margins": 0.3166798949241638,
289
+ "rewards/rejected": -1.169434905052185,
290
  "step": 120
291
  },
292
  {
293
  "epoch": 0.408243375858685,
294
+ "grad_norm": 44.321163177490234,
295
+ "improvement": 0.036691777408123016,
296
  "learning_rate": 3.6860532770864005e-07,
297
+ "logits/chosen": 1.4267325401306152,
298
+ "logits/rejected": 1.4107747077941895,
299
+ "logits/response": 1.9945790767669678,
300
+ "logps/chosen": -290.82928466796875,
301
+ "logps/rejected": -272.4383239746094,
302
+ "logps/response": -988.95263671875,
303
+ "loss": 0.6601,
304
+ "penalty": 0.6543839573860168,
305
+ "rewards/accuracies": 0.606249988079071,
306
+ "rewards/chosen": -0.7737542390823364,
307
+ "rewards/gen": -6.266350269317627,
308
+ "rewards/margins": 0.24219664931297302,
309
+ "rewards/rejected": -1.0159507989883423,
310
  "step": 130
311
  },
312
  {
313
  "epoch": 0.4396467124631992,
314
+ "grad_norm": 46.13864517211914,
315
+ "improvement": 0.03532712906599045,
316
  "learning_rate": 3.4376480090239047e-07,
317
+ "logits/chosen": 1.4152085781097412,
318
+ "logits/rejected": 1.2808418273925781,
319
+ "logits/response": 1.9436925649642944,
320
+ "logps/chosen": -276.03021240234375,
321
+ "logps/rejected": -248.2073211669922,
322
+ "logps/response": -1000.24462890625,
323
+ "loss": 0.6838,
324
+ "penalty": 0.6066700220108032,
325
+ "rewards/accuracies": 0.668749988079071,
326
+ "rewards/chosen": -0.6766573190689087,
327
+ "rewards/gen": -6.422157287597656,
328
+ "rewards/margins": 0.367643803358078,
329
+ "rewards/rejected": -1.044301152229309,
330
  "step": 140
331
  },
332
  {
333
  "epoch": 0.47105004906771347,
334
+ "grad_norm": 63.85268783569336,
335
+ "improvement": 0.03399471566081047,
336
  "learning_rate": 3.1779403380910425e-07,
337
+ "logits/chosen": 1.3651916980743408,
338
+ "logits/rejected": 1.3227938413619995,
339
+ "logits/response": 1.9420855045318604,
340
+ "logps/chosen": -297.1623840332031,
341
+ "logps/rejected": -276.270751953125,
342
+ "logps/response": -1037.3096923828125,
343
+ "loss": 0.668,
344
+ "penalty": 0.6184068918228149,
345
+ "rewards/accuracies": 0.7124999761581421,
346
+ "rewards/chosen": -0.6357800960540771,
347
+ "rewards/gen": -7.107823371887207,
348
+ "rewards/margins": 0.3691752851009369,
349
+ "rewards/rejected": -1.0049554109573364,
350
  "step": 150
351
  },
352
  {
353
  "epoch": 0.5024533856722276,
354
+ "grad_norm": 55.31007766723633,
355
+ "improvement": 0.04934271052479744,
356
  "learning_rate": 2.910060778827554e-07,
357
+ "logits/chosen": 1.3888250589370728,
358
+ "logits/rejected": 1.2889330387115479,
359
+ "logits/response": 1.8870728015899658,
360
+ "logps/chosen": -378.60809326171875,
361
+ "logps/rejected": -333.920654296875,
362
+ "logps/response": -1007.1419067382812,
363
+ "loss": 0.6667,
364
+ "penalty": 0.6215599775314331,
365
  "rewards/accuracies": 0.668749988079071,
366
+ "rewards/chosen": -0.671469509601593,
367
+ "rewards/gen": -6.274459362030029,
368
+ "rewards/margins": 0.35334306955337524,
369
+ "rewards/rejected": -1.0248124599456787,
370
  "step": 160
371
  },
372
  {
373
  "epoch": 0.5338567222767419,
374
+ "grad_norm": 52.735923767089844,
375
+ "improvement": 0.03530705347657204,
376
  "learning_rate": 2.6372383496608186e-07,
377
+ "logits/chosen": 1.3521907329559326,
378
+ "logits/rejected": 1.3136357069015503,
379
+ "logits/response": 1.958385705947876,
380
+ "logps/chosen": -309.96441650390625,
381
+ "logps/rejected": -290.8580627441406,
382
+ "logps/response": -1000.8353271484375,
383
+ "loss": 0.6716,
384
+ "penalty": 0.6742448210716248,
385
+ "rewards/accuracies": 0.581250011920929,
386
+ "rewards/chosen": -0.5993863940238953,
387
+ "rewards/gen": -6.6541900634765625,
388
+ "rewards/margins": 0.18541964888572693,
389
+ "rewards/rejected": -0.7848061323165894,
390
  "step": 170
391
  },
392
  {
393
  "epoch": 0.5652600588812562,
394
+ "grad_norm": 59.980445861816406,
395
+ "improvement": 0.047461725771427155,
396
  "learning_rate": 2.3627616503391812e-07,
397
+ "logits/chosen": 1.546133041381836,
398
+ "logits/rejected": 1.4397121667861938,
399
+ "logits/response": 2.062786340713501,
400
+ "logps/chosen": -307.1769104003906,
401
+ "logps/rejected": -260.46331787109375,
402
+ "logps/response": -957.2349853515625,
403
+ "loss": 0.6516,
404
+ "penalty": 0.5987246632575989,
405
+ "rewards/accuracies": 0.699999988079071,
406
+ "rewards/chosen": -0.5737658739089966,
407
+ "rewards/gen": -6.288596153259277,
408
+ "rewards/margins": 0.36477407813072205,
409
+ "rewards/rejected": -0.938539981842041,
410
  "step": 180
411
  },
412
  {
413
  "epoch": 0.5966633954857704,
414
+ "grad_norm": 48.051780700683594,
415
+ "improvement": 0.03724605590105057,
416
  "learning_rate": 2.089939221172446e-07,
417
+ "logits/chosen": 1.3898411989212036,
418
+ "logits/rejected": 1.3639791011810303,
419
+ "logits/response": 1.9668989181518555,
420
+ "logps/chosen": -292.2279052734375,
421
+ "logps/rejected": -282.55889892578125,
422
+ "logps/response": -946.9928588867188,
423
+ "loss": 0.6857,
424
+ "penalty": 0.660628616809845,
425
+ "rewards/accuracies": 0.581250011920929,
426
+ "rewards/chosen": -0.6309083700180054,
427
+ "rewards/gen": -6.227112770080566,
428
+ "rewards/margins": 0.27697476744651794,
429
+ "rewards/rejected": -0.9078830480575562,
430
  "step": 190
431
  },
432
  {
433
  "epoch": 0.6280667320902846,
434
+ "grad_norm": 50.111351013183594,
435
+ "improvement": 0.04991893097758293,
436
  "learning_rate": 1.8220596619089573e-07,
437
+ "logits/chosen": 1.3693121671676636,
438
+ "logits/rejected": 1.301695466041565,
439
+ "logits/response": 1.932735800743103,
440
+ "logps/chosen": -292.2115478515625,
441
+ "logps/rejected": -260.68133544921875,
442
+ "logps/response": -974.4039916992188,
443
+ "loss": 0.6367,
444
+ "penalty": 0.6076933741569519,
445
+ "rewards/accuracies": 0.6499999761581421,
446
+ "rewards/chosen": -0.6337991952896118,
447
+ "rewards/gen": -6.308970928192139,
448
+ "rewards/margins": 0.385838121175766,
449
+ "rewards/rejected": -1.0196373462677002,
450
  "step": 200
451
  },
452
  {
453
  "epoch": 0.6280667320902846,
454
+ "eval_improvement": 0.7678172588348389,
455
+ "eval_logits/chosen": 1.0745381116867065,
456
+ "eval_logits/rejected": 1.0335055589675903,
457
+ "eval_logits/response": 1.1297274827957153,
458
+ "eval_logps/chosen": -322.44207763671875,
459
+ "eval_logps/rejected": -290.7333679199219,
460
+ "eval_logps/response": -205.4324493408203,
461
+ "eval_loss": 1.400818109512329,
462
+ "eval_penalty": 0.637177050113678,
463
+ "eval_rewards/accuracies": 0.6420000195503235,
464
+ "eval_rewards/chosen": -0.6775563359260559,
465
+ "eval_rewards/gen": -0.8686242699623108,
466
+ "eval_rewards/margins": 0.32543402910232544,
467
+ "eval_rewards/rejected": -1.0029902458190918,
468
+ "eval_runtime": 131.9039,
469
+ "eval_samples_per_second": 15.163,
470
+ "eval_steps_per_second": 1.895,
471
  "step": 200
472
  },
473
  {
474
  "epoch": 0.6594700686947988,
475
+ "grad_norm": 50.02403259277344,
476
+ "improvement": 0.03354328125715256,
477
  "learning_rate": 1.562351990976095e-07,
478
+ "logits/chosen": 1.250815749168396,
479
+ "logits/rejected": 1.2162940502166748,
480
+ "logits/response": 1.792128562927246,
481
+ "logps/chosen": -315.5356750488281,
482
+ "logps/rejected": -285.4361267089844,
483
+ "logps/response": -1000.24658203125,
484
+ "loss": 0.643,
485
+ "penalty": 0.6080543994903564,
486
+ "rewards/accuracies": 0.6625000238418579,
487
+ "rewards/chosen": -0.6579871773719788,
488
+ "rewards/gen": -6.532958984375,
489
+ "rewards/margins": 0.366300493478775,
490
+ "rewards/rejected": -1.0242877006530762,
491
  "step": 210
492
  },
493
  {
494
  "epoch": 0.6908734052993131,
495
+ "grad_norm": 48.42168045043945,
496
+ "improvement": 0.03473372012376785,
497
  "learning_rate": 1.3139467229135998e-07,
498
+ "logits/chosen": 1.3695895671844482,
499
+ "logits/rejected": 1.2674177885055542,
500
+ "logits/response": 1.913102149963379,
501
+ "logps/chosen": -317.5673522949219,
502
+ "logps/rejected": -282.95709228515625,
503
+ "logps/response": -1023.9091796875,
504
+ "loss": 0.6433,
505
+ "penalty": 0.580784797668457,
506
+ "rewards/accuracies": 0.699999988079071,
507
+ "rewards/chosen": -0.6436624526977539,
508
+ "rewards/gen": -6.871123313903809,
509
+ "rewards/margins": 0.46090683341026306,
510
+ "rewards/rejected": -1.1045693159103394,
511
  "step": 220
512
  },
513
  {
514
  "epoch": 0.7222767419038273,
515
+ "grad_norm": 48.59343338012695,
516
+ "improvement": 0.047753967344760895,
517
  "learning_rate": 1.0798381331721107e-07,
518
+ "logits/chosen": 1.3397281169891357,
519
+ "logits/rejected": 1.1965497732162476,
520
+ "logits/response": 1.9877132177352905,
521
+ "logps/chosen": -329.905029296875,
522
+ "logps/rejected": -280.7567138671875,
523
+ "logps/response": -942.7415771484375,
524
+ "loss": 0.6473,
525
+ "penalty": 0.6103171110153198,
526
+ "rewards/accuracies": 0.6937500238418579,
527
+ "rewards/chosen": -0.839415431022644,
528
+ "rewards/gen": -6.383160591125488,
529
+ "rewards/margins": 0.3692413568496704,
530
+ "rewards/rejected": -1.2086567878723145,
531
  "step": 230
532
  },
533
  {
534
  "epoch": 0.7536800785083415,
535
+ "grad_norm": 50.30048751831055,
536
+ "improvement": 0.03364395350217819,
537
  "learning_rate": 8.628481651367875e-08,
538
+ "logits/chosen": 1.3906304836273193,
539
+ "logits/rejected": 1.307106614112854,
540
+ "logits/response": 1.8935962915420532,
541
+ "logps/chosen": -333.33880615234375,
542
+ "logps/rejected": -319.1541748046875,
543
+ "logps/response": -1009.0065307617188,
544
+ "loss": 0.6684,
545
+ "penalty": 0.6743585467338562,
546
+ "rewards/accuracies": 0.5874999761581421,
547
+ "rewards/chosen": -0.7209497094154358,
548
+ "rewards/gen": -6.688617706298828,
549
+ "rewards/margins": 0.20695701241493225,
550
+ "rewards/rejected": -0.9279066920280457,
551
  "step": 240
552
  },
553
  {
554
  "epoch": 0.7850834151128557,
555
+ "grad_norm": 51.985145568847656,
556
+ "improvement": 0.025273319333791733,
557
  "learning_rate": 6.655924144404906e-08,
558
+ "logits/chosen": 1.3345959186553955,
559
+ "logits/rejected": 1.3350740671157837,
560
+ "logits/response": 1.926537275314331,
561
+ "logps/chosen": -320.46905517578125,
562
+ "logps/rejected": -313.0986633300781,
563
+ "logps/response": -1047.0450439453125,
564
+ "loss": 0.6355,
565
+ "penalty": 0.5920716524124146,
566
+ "rewards/accuracies": 0.6625000238418579,
567
+ "rewards/chosen": -0.5942948460578918,
568
+ "rewards/gen": -7.248889923095703,
569
+ "rewards/margins": 0.43543609976768494,
570
+ "rewards/rejected": -1.029731035232544,
571
  "step": 250
572
  },
573
  {
574
  "epoch": 0.81648675171737,
575
+ "grad_norm": 52.067352294921875,
576
+ "improvement": 0.0384710431098938,
577
  "learning_rate": 4.904486005914027e-08,
578
+ "logits/chosen": 1.3885962963104248,
579
+ "logits/rejected": 1.286679983139038,
580
+ "logits/response": 2.0341262817382812,
581
+ "logps/chosen": -302.25225830078125,
582
+ "logps/rejected": -276.5771484375,
583
+ "logps/response": -1033.217529296875,
584
+ "loss": 0.6409,
585
+ "penalty": 0.6347585916519165,
586
+ "rewards/accuracies": 0.6625000238418579,
587
+ "rewards/chosen": -0.6630799174308777,
588
+ "rewards/gen": -7.127025604248047,
589
+ "rewards/margins": 0.33273279666900635,
590
+ "rewards/rejected": -0.9958127737045288,
591
  "step": 260
592
  },
593
  {
594
  "epoch": 0.8478900883218842,
595
+ "grad_norm": 52.939659118652344,
596
+ "improvement": 0.032857220619916916,
597
  "learning_rate": 3.3952790595787986e-08,
598
+ "logits/chosen": 1.3275368213653564,
599
+ "logits/rejected": 1.3410176038742065,
600
+ "logits/response": 1.9151852130889893,
601
+ "logps/chosen": -300.0262451171875,
602
+ "logps/rejected": -301.26348876953125,
603
+ "logps/response": -984.4091796875,
604
+ "loss": 0.6556,
605
+ "penalty": 0.616500973701477,
606
  "rewards/accuracies": 0.6499999761581421,
607
+ "rewards/chosen": -0.6747496724128723,
608
+ "rewards/gen": -7.032576560974121,
609
+ "rewards/margins": 0.3772509694099426,
610
+ "rewards/rejected": -1.052000641822815,
611
  "step": 270
612
  },
613
  {
614
  "epoch": 0.8792934249263984,
615
+ "grad_norm": 56.18733215332031,
616
+ "improvement": 0.04124368727207184,
617
  "learning_rate": 2.1464952759020856e-08,
618
+ "logits/chosen": 1.3870338201522827,
619
+ "logits/rejected": 1.2385810613632202,
620
+ "logits/response": 1.9627625942230225,
621
+ "logps/chosen": -339.856201171875,
622
+ "logps/rejected": -290.2395935058594,
623
+ "logps/response": -957.0828247070312,
624
+ "loss": 0.6484,
625
+ "penalty": 0.5773279666900635,
626
+ "rewards/accuracies": 0.6625000238418579,
627
+ "rewards/chosen": -0.6688198447227478,
628
+ "rewards/gen": -6.297931671142578,
629
+ "rewards/margins": 0.4349055290222168,
630
+ "rewards/rejected": -1.103725552558899,
631
  "step": 280
632
  },
633
  {
634
  "epoch": 0.9106967615309126,
635
+ "grad_norm": 50.07227325439453,
636
+ "improvement": 0.0390530601143837,
637
  "learning_rate": 1.1731874863145142e-08,
638
+ "logits/chosen": 1.4286466836929321,
639
+ "logits/rejected": 1.3211020231246948,
640
+ "logits/response": 1.9781084060668945,
641
+ "logps/chosen": -288.7481384277344,
642
+ "logps/rejected": -284.15789794921875,
643
+ "logps/response": -978.9161987304688,
644
+ "loss": 0.6264,
645
+ "penalty": 0.5874598026275635,
646
+ "rewards/accuracies": 0.6875,
647
+ "rewards/chosen": -0.5895588994026184,
648
+ "rewards/gen": -6.995156764984131,
649
+ "rewards/margins": 0.39368805289268494,
650
+ "rewards/rejected": -0.983246922492981,
651
  "step": 290
652
  },
653
  {
654
  "epoch": 0.9421000981354269,
655
+ "grad_norm": 47.96150207519531,
656
+ "improvement": 0.026903603225946426,
657
  "learning_rate": 4.8708793644441086e-09,
658
+ "logits/chosen": 1.3613035678863525,
659
+ "logits/rejected": 1.3257195949554443,
660
+ "logits/response": 1.956418752670288,
661
+ "logps/chosen": -295.4048156738281,
662
+ "logps/rejected": -294.7374267578125,
663
+ "logps/response": -1010.1788940429688,
664
+ "loss": 0.6335,
665
+ "penalty": 0.5789179801940918,
666
+ "rewards/accuracies": 0.6937500238418579,
667
+ "rewards/chosen": -0.6945083737373352,
668
+ "rewards/gen": -7.097268581390381,
669
+ "rewards/margins": 0.49477386474609375,
670
+ "rewards/rejected": -1.1892822980880737,
671
  "step": 300
672
  },
673
  {
674
  "epoch": 0.9421000981354269,
675
+ "eval_improvement": 0.7646235227584839,
676
+ "eval_logits/chosen": 1.051769733428955,
677
+ "eval_logits/rejected": 1.007547378540039,
678
+ "eval_logits/response": 1.1118723154067993,
679
+ "eval_logps/chosen": -323.3211975097656,
680
+ "eval_logps/rejected": -291.7405700683594,
681
+ "eval_logps/response": -206.5587921142578,
682
+ "eval_loss": 1.3955390453338623,
683
+ "eval_penalty": 0.6340250372886658,
684
+ "eval_rewards/accuracies": 0.6480000019073486,
685
+ "eval_rewards/chosen": -0.7654658555984497,
686
+ "eval_rewards/gen": -0.9812589287757874,
687
+ "eval_rewards/margins": 0.3382430076599121,
688
+ "eval_rewards/rejected": -1.1037088632583618,
689
+ "eval_runtime": 130.0193,
690
+ "eval_samples_per_second": 15.382,
691
+ "eval_steps_per_second": 1.923,
692
  "step": 300
693
  },
694
  {
695
  "epoch": 0.9735034347399412,
696
+ "grad_norm": 46.95528030395508,
697
+ "improvement": 0.053758006542921066,
698
  "learning_rate": 9.64668657069706e-10,
699
+ "logits/chosen": 1.3695099353790283,
700
+ "logits/rejected": 1.280415415763855,
701
+ "logits/response": 1.937442421913147,
702
+ "logps/chosen": -311.23272705078125,
703
+ "logps/rejected": -276.55255126953125,
704
+ "logps/response": -1002.0380859375,
705
+ "loss": 0.6576,
706
+ "penalty": 0.6296704411506653,
707
+ "rewards/accuracies": 0.637499988079071,
708
+ "rewards/chosen": -0.7500702738761902,
709
+ "rewards/gen": -6.900748252868652,
710
+ "rewards/margins": 0.36686232686042786,
711
+ "rewards/rejected": -1.1169326305389404,
712
  "step": 310
713
  },
714
  {
715
  "epoch": 0.9986261040235525,
716
  "step": 318,
717
  "total_flos": 0.0,
718
+ "train_loss": 0.7174485007172111,
719
+ "train_runtime": 3896.582,
720
+ "train_samples_per_second": 5.23,
721
+ "train_steps_per_second": 0.082
722
  }
723
  ],
724
  "logging_steps": 10,