RikkiXu committed
Commit 44dd9c1 (parent: 20aac80)

Model save

README.md CHANGED
@@ -2,15 +2,9 @@
  license: apache-2.0
  base_model: alignment-handbook/zephyr-7b-sft-full
  tags:
- - alignment-handbook
  - trl
  - dpo
  - generated_from_trainer
- - trl
- - dpo
- - generated_from_trainer
- datasets:
- - HuggingFaceH4/ultrafeedback_binarized
  model-index:
  - name: zephyr-7b-dpo-full
  results: []
@@ -21,17 +15,17 @@ should probably proofread and complete it, then remove this comment. -->

  # zephyr-7b-dpo-full

- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.2784
- - Rewards/chosen: 31.9054
- - Rewards/rejected: 27.5130
- - Rewards/accuracies: 0.5742
- - Rewards/margins: 4.3924
- - Logps/rejected: -235.1542
- - Logps/chosen: -230.6881
- - Logits/rejected: -2.6646
- - Logits/chosen: -2.6896
+ - Loss: 0.4917
+ - Rewards/chosen: 21.2230
+ - Rewards/rejected: 6.3692
+ - Rewards/accuracies: 0.7344
+ - Rewards/margins: 14.8539
+ - Logps/rejected: -256.2980
+ - Logps/chosen: -241.3705
+ - Logits/rejected: -2.7037
+ - Logits/chosen: -2.7315

  ## Model description

@@ -68,10 +62,10 @@ The following hyperparameters were used during training:

  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
- | 0.3163 | 0.21 | 100 | 0.3129 | 27.2147 | 24.2237 | 0.5820 | 2.9910 | -238.4435 | -235.3788 | -2.6701 | -2.6969 |
- | 0.2918 | 0.42 | 200 | 0.2923 | 30.3522 | 26.4568 | 0.5820 | 3.8954 | -236.2104 | -232.2412 | -2.6826 | -2.7085 |
- | 0.286 | 0.63 | 300 | 0.2921 | 31.4394 | 27.2385 | 0.5820 | 4.2009 | -235.4286 | -231.1540 | -2.6717 | -2.6957 |
- | 0.2819 | 0.84 | 400 | 0.2788 | 31.8547 | 27.4761 | 0.5781 | 4.3786 | -235.1911 | -230.7387 | -2.6619 | -2.6869 |
+ | 0.5187 | 0.21 | 100 | 0.5296 | 19.0644 | 9.0310 | 0.7227 | 10.0334 | -253.6362 | -243.5290 | -2.7384 | -2.7638 |
+ | 0.508 | 0.42 | 200 | 0.5006 | 20.6504 | 7.0237 | 0.7266 | 13.6267 | -255.6435 | -241.9431 | -2.7569 | -2.7826 |
+ | 0.4808 | 0.63 | 300 | 0.4966 | 20.8183 | 6.9540 | 0.7227 | 13.8643 | -255.7132 | -241.7751 | -2.7115 | -2.7378 |
+ | 0.4835 | 0.84 | 400 | 0.4917 | 21.2230 | 6.3692 | 0.7344 | 14.8539 | -256.2980 | -241.3705 | -2.7037 | -2.7315 |


  ### Framework versions
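
Note: the metrics above are the per-sequence rewards logged for this run (the card is tagged trl and dpo, so these presumably come from TRL's DPOTrainer), and the reported Rewards/margins is simply Rewards/chosen minus Rewards/rejected. A minimal Python sanity check against the new evaluation numbers:

```python
# Values copied from the updated README above; the margin should equal
# chosen - rejected up to rounding in the card.
rewards_chosen = 21.2230
rewards_rejected = 6.3692
reported_margin = 14.8539

margin = rewards_chosen - rewards_rejected
print(f"computed margin: {margin:.4f}")                      # 14.8538
print(f"consistent with card: {abs(margin - reported_margin) < 1e-3}")
```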
all_results.json CHANGED
@@ -1,21 +1,8 @@
  {
  "epoch": 1.0,
- "eval_logits/chosen": -2.689567804336548,
- "eval_logits/rejected": -2.664588451385498,
- "eval_logps/chosen": -230.68809509277344,
- "eval_logps/rejected": -235.1542205810547,
- "eval_loss": 0.27835845947265625,
- "eval_rewards/accuracies": 0.57421875,
- "eval_rewards/chosen": 31.905372619628906,
- "eval_rewards/margins": 4.392405033111572,
- "eval_rewards/rejected": 27.51296615600586,
- "eval_runtime": 96.4814,
- "eval_samples": 2000,
- "eval_samples_per_second": 20.729,
- "eval_steps_per_second": 0.332,
- "train_loss": 0.31381568898715734,
- "train_runtime": 7749.4814,
+ "train_loss": 0.5147256711536871,
+ "train_runtime": 7551.132,
  "train_samples": 61134,
- "train_samples_per_second": 7.889,
- "train_steps_per_second": 0.062
+ "train_samples_per_second": 8.096,
+ "train_steps_per_second": 0.063
  }
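
The new throughput figures are internally consistent: one epoch over 61134 samples in 7551.132 s is about 8.096 samples per second, and the 478 optimizer steps recorded at the end of trainer_state.json below give about 0.063 steps per second. A quick check:

```python
# Sanity check of the reported training throughput (figures taken from
# all_results.json and the final entry of trainer_state.json in this commit).
train_samples = 61134
train_runtime = 7551.132  # seconds
total_steps = 478         # final "step" in trainer_state.json

print(round(train_samples / train_runtime, 3))  # 8.096 samples/s
print(round(total_steps / train_runtime, 3))    # 0.063 steps/s
```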
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aeaa284c42a7973b56bb02672fa19944707a1761c29070843c6f9f62dd72e4fe
+ oid sha256:cc366785bb090b61844d3334a997422ad3d771ce8be204a936bc25dbbdca80c7
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5fe9c0ae7edc563803fced44889ee13e00331e230ffe85265ea71dd173dd7599
+ oid sha256:300478fcbb7886772c950cffb296e8aa0f8100dee2533cd0f29f38b9e71b79e0
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4eb8c28dbe053ab22cc46c14d5c8cb9815cb514a15461a39bb7903e6e94dd96a
+ oid sha256:d61731df17ec5159cd05d998bcdbadb621478295c6553a07ec6f60275d8223e4
  size 4540516344
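
Only the LFS pointers for the three safetensors shards change in this commit. A downloaded shard can be checked against its new oid with a short script; the local path below is an assumption for illustration, not something recorded in the commit:

```python
import hashlib

# Verify a downloaded shard against the sha256 oid in its LFS pointer.
# The path is hypothetical; the digest is the new oid for shard 1 of 3 above.
expected = "cc366785bb090b61844d3334a997422ad3d771ce8be204a936bc25dbbdca80c7"
path = "model-00001-of-00003.safetensors"  # assumed local download

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

print(sha.hexdigest() == expected)
```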
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
  "epoch": 1.0,
- "train_loss": 0.31381568898715734,
- "train_runtime": 7749.4814,
+ "train_loss": 0.5147256711536871,
+ "train_runtime": 7551.132,
  "train_samples": 61134,
- "train_samples_per_second": 7.889,
- "train_steps_per_second": 0.062
+ "train_samples_per_second": 8.096,
+ "train_steps_per_second": 0.063
  }
trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 74.50819179863889,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.7660439014434814,
16
  "logits/rejected": -2.717564582824707,
@@ -25,781 +25,781 @@
25
  },
26
  {
27
  "epoch": 0.02,
28
- "grad_norm": 71.5827858042053,
29
  "learning_rate": 1.0416666666666667e-07,
30
- "logits/chosen": -2.592801809310913,
31
- "logits/rejected": -2.5633366107940674,
32
- "logps/chosen": -264.5331726074219,
33
- "logps/rejected": -251.33367919921875,
34
- "loss": 0.6884,
35
  "rewards/accuracies": 0.4444444477558136,
36
- "rewards/chosen": 0.2647041380405426,
37
- "rewards/margins": 0.0454571396112442,
38
- "rewards/rejected": 0.2192470282316208,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
- "grad_norm": 33.37630632393394,
44
  "learning_rate": 2.0833333333333333e-07,
45
- "logits/chosen": -2.6635663509368896,
46
- "logits/rejected": -2.6177525520324707,
47
- "logps/chosen": -275.1928405761719,
48
- "logps/rejected": -290.4365539550781,
49
- "loss": 0.5763,
50
- "rewards/accuracies": 0.5,
51
- "rewards/chosen": 6.3604888916015625,
52
- "rewards/margins": -0.009852093644440174,
53
- "rewards/rejected": 6.370340824127197,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
- "grad_norm": 22.1278736890366,
59
  "learning_rate": 3.1249999999999997e-07,
60
- "logits/chosen": -2.7272486686706543,
61
- "logits/rejected": -2.667067527770996,
62
- "logps/chosen": -285.1613464355469,
63
- "logps/rejected": -249.3108367919922,
64
- "loss": 0.4416,
65
- "rewards/accuracies": 0.5562499761581421,
66
- "rewards/chosen": 15.510467529296875,
67
- "rewards/margins": 0.8711569905281067,
68
- "rewards/rejected": 14.639310836791992,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
- "grad_norm": 17.071895487907064,
74
  "learning_rate": 4.1666666666666667e-07,
75
- "logits/chosen": -2.6888694763183594,
76
- "logits/rejected": -2.6701016426086426,
77
- "logps/chosen": -247.84716796875,
78
- "logps/rejected": -227.38131713867188,
79
- "loss": 0.3982,
80
- "rewards/accuracies": 0.543749988079071,
81
- "rewards/chosen": 19.278215408325195,
82
- "rewards/margins": 2.267552137374878,
83
- "rewards/rejected": 17.010662078857422,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
- "grad_norm": 14.78162706214556,
89
  "learning_rate": 4.999733114418725e-07,
90
- "logits/chosen": -2.659508466720581,
91
- "logits/rejected": -2.6249804496765137,
92
- "logps/chosen": -259.9454650878906,
93
- "logps/rejected": -272.14227294921875,
94
- "loss": 0.3676,
95
- "rewards/accuracies": 0.4625000059604645,
96
- "rewards/chosen": 19.786420822143555,
97
- "rewards/margins": -0.8553922772407532,
98
- "rewards/rejected": 20.64181137084961,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
- "grad_norm": 14.285832773490087,
104
  "learning_rate": 4.990398100856366e-07,
105
- "logits/chosen": -2.6977388858795166,
106
- "logits/rejected": -2.654181957244873,
107
- "logps/chosen": -247.1780242919922,
108
- "logps/rejected": -275.7373962402344,
109
- "loss": 0.3521,
110
- "rewards/accuracies": 0.5625,
111
- "rewards/chosen": 24.428516387939453,
112
- "rewards/margins": 2.0845706462860107,
113
- "rewards/rejected": 22.343944549560547,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
- "grad_norm": 14.416469937136577,
119
  "learning_rate": 4.967775735898179e-07,
120
- "logits/chosen": -2.6118428707122803,
121
- "logits/rejected": -2.625479221343994,
122
- "logps/chosen": -239.4540252685547,
123
- "logps/rejected": -232.90463256835938,
124
- "loss": 0.3304,
125
- "rewards/accuracies": 0.5874999761581421,
126
- "rewards/chosen": 26.162424087524414,
127
- "rewards/margins": 2.349818706512451,
128
- "rewards/rejected": 23.812606811523438,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
- "grad_norm": 15.840881084472352,
134
  "learning_rate": 4.931986719649298e-07,
135
- "logits/chosen": -2.7612788677215576,
136
- "logits/rejected": -2.7243030071258545,
137
- "logps/chosen": -295.0336608886719,
138
- "logps/rejected": -240.8730010986328,
139
- "loss": 0.3248,
140
- "rewards/accuracies": 0.612500011920929,
141
- "rewards/chosen": 27.784252166748047,
142
- "rewards/margins": 4.598628997802734,
143
- "rewards/rejected": 23.185623168945312,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
- "grad_norm": 13.661268677283298,
149
  "learning_rate": 4.883222001996351e-07,
150
- "logits/chosen": -2.6661014556884766,
151
- "logits/rejected": -2.645249128341675,
152
- "logps/chosen": -231.57553100585938,
153
- "logps/rejected": -228.09091186523438,
154
- "loss": 0.3223,
155
- "rewards/accuracies": 0.574999988079071,
156
- "rewards/chosen": 27.535770416259766,
157
- "rewards/margins": 3.228619337081909,
158
- "rewards/rejected": 24.30714988708496,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
- "grad_norm": 11.61288143003843,
164
  "learning_rate": 4.821741763807186e-07,
165
- "logits/chosen": -2.6386702060699463,
166
- "logits/rejected": -2.6339759826660156,
167
- "logps/chosen": -233.39047241210938,
168
- "logps/rejected": -232.5922393798828,
169
- "loss": 0.3163,
170
- "rewards/accuracies": 0.550000011920929,
171
- "rewards/chosen": 26.968032836914062,
172
- "rewards/margins": 2.5318057537078857,
173
- "rewards/rejected": 24.436227798461914,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
- "eval_logits/chosen": -2.6968541145324707,
179
- "eval_logits/rejected": -2.670072555541992,
180
- "eval_logps/chosen": -235.37875366210938,
181
- "eval_logps/rejected": -238.44345092773438,
182
- "eval_loss": 0.31289389729499817,
183
- "eval_rewards/accuracies": 0.58203125,
184
- "eval_rewards/chosen": 27.21471405029297,
185
- "eval_rewards/margins": 2.99098801612854,
186
- "eval_rewards/rejected": 24.223726272583008,
187
- "eval_runtime": 96.735,
188
- "eval_samples_per_second": 20.675,
189
- "eval_steps_per_second": 0.331,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
- "grad_norm": 11.688620320219954,
195
  "learning_rate": 4.747874028753375e-07,
196
- "logits/chosen": -2.7125041484832764,
197
- "logits/rejected": -2.6624934673309326,
198
- "logps/chosen": -276.029052734375,
199
- "logps/rejected": -234.1141815185547,
200
- "loss": 0.3136,
201
- "rewards/accuracies": 0.625,
202
- "rewards/chosen": 28.551036834716797,
203
- "rewards/margins": 4.829342842102051,
204
- "rewards/rejected": 23.72169303894043,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
- "grad_norm": 14.849649400244427,
210
  "learning_rate": 4.662012913161997e-07,
211
- "logits/chosen": -2.6516470909118652,
212
- "logits/rejected": -2.647688865661621,
213
- "logps/chosen": -253.4019317626953,
214
- "logps/rejected": -234.5045623779297,
215
- "loss": 0.3065,
216
- "rewards/accuracies": 0.606249988079071,
217
- "rewards/chosen": 27.765233993530273,
218
- "rewards/margins": 2.4132068157196045,
219
- "rewards/rejected": 25.352027893066406,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
- "grad_norm": 12.095477452171375,
225
  "learning_rate": 4.5646165232345103e-07,
226
- "logits/chosen": -2.679412364959717,
227
- "logits/rejected": -2.6742541790008545,
228
- "logps/chosen": -249.6054229736328,
229
- "logps/rejected": -241.8912811279297,
230
- "loss": 0.2993,
231
- "rewards/accuracies": 0.606249988079071,
232
- "rewards/chosen": 32.39772415161133,
233
- "rewards/margins": 5.853152275085449,
234
- "rewards/rejected": 26.544570922851562,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
- "grad_norm": 13.237989201417717,
240
  "learning_rate": 4.456204510851956e-07,
241
- "logits/chosen": -2.7010607719421387,
242
- "logits/rejected": -2.689103603363037,
243
- "logps/chosen": -284.6669921875,
244
- "logps/rejected": -270.44970703125,
245
- "loss": 0.3016,
246
- "rewards/accuracies": 0.5249999761581421,
247
- "rewards/chosen": 31.298425674438477,
248
- "rewards/margins": 1.071274995803833,
249
- "rewards/rejected": 30.22715187072754,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
- "grad_norm": 11.533759549255185,
255
  "learning_rate": 4.337355301007335e-07,
256
- "logits/chosen": -2.6910300254821777,
257
- "logits/rejected": -2.6623480319976807,
258
- "logps/chosen": -251.215576171875,
259
- "logps/rejected": -248.98348999023438,
260
- "loss": 0.2985,
261
- "rewards/accuracies": 0.5687500238418579,
262
- "rewards/chosen": 32.008628845214844,
263
- "rewards/margins": 4.783123970031738,
264
- "rewards/rejected": 27.225509643554688,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
- "grad_norm": 13.117822478323479,
270
  "learning_rate": 4.2087030056579986e-07,
271
- "logits/chosen": -2.721895217895508,
272
- "logits/rejected": -2.675842523574829,
273
- "logps/chosen": -242.4053192138672,
274
- "logps/rejected": -230.8060302734375,
275
- "loss": 0.3009,
276
- "rewards/accuracies": 0.612500011920929,
277
- "rewards/chosen": 30.662723541259766,
278
- "rewards/margins": 4.044883728027344,
279
- "rewards/rejected": 26.61783790588379,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
- "grad_norm": 11.340151801902158,
285
  "learning_rate": 4.070934040463998e-07,
286
- "logits/chosen": -2.670436382293701,
287
- "logits/rejected": -2.632450819015503,
288
- "logps/chosen": -220.5222625732422,
289
- "logps/rejected": -204.80908203125,
290
- "loss": 0.2938,
291
- "rewards/accuracies": 0.59375,
292
- "rewards/chosen": 28.81294822692871,
293
- "rewards/margins": 2.497253179550171,
294
- "rewards/rejected": 26.31569480895996,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
- "grad_norm": 11.477634324684333,
300
  "learning_rate": 3.9247834624635404e-07,
301
- "logits/chosen": -2.646768093109131,
302
- "logits/rejected": -2.6306955814361572,
303
- "logps/chosen": -225.45016479492188,
304
- "logps/rejected": -200.42015075683594,
305
- "loss": 0.2914,
306
- "rewards/accuracies": 0.643750011920929,
307
- "rewards/chosen": 30.908817291259766,
308
- "rewards/margins": 3.7578415870666504,
309
- "rewards/rejected": 27.150976181030273,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
- "grad_norm": 13.566633133843082,
315
  "learning_rate": 3.7710310482256523e-07,
316
- "logits/chosen": -2.679771900177002,
317
- "logits/rejected": -2.6499440670013428,
318
- "logps/chosen": -241.45156860351562,
319
- "logps/rejected": -231.2630615234375,
320
- "loss": 0.2963,
321
- "rewards/accuracies": 0.518750011920929,
322
- "rewards/chosen": 29.79128646850586,
323
- "rewards/margins": 1.2995483875274658,
324
- "rewards/rejected": 28.49173927307129,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
- "grad_norm": 16.736011308973627,
330
  "learning_rate": 3.610497133404795e-07,
331
- "logits/chosen": -2.630007028579712,
332
- "logits/rejected": -2.6183559894561768,
333
- "logps/chosen": -230.09048461914062,
334
- "logps/rejected": -223.8180694580078,
335
- "loss": 0.2918,
336
- "rewards/accuracies": 0.5062500238418579,
337
- "rewards/chosen": 29.806177139282227,
338
- "rewards/margins": 1.575269341468811,
339
- "rewards/rejected": 28.230907440185547,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
- "eval_logits/chosen": -2.708475112915039,
345
- "eval_logits/rejected": -2.682575225830078,
346
- "eval_logps/chosen": -232.24124145507812,
347
- "eval_logps/rejected": -236.21038818359375,
348
- "eval_loss": 0.29230329394340515,
349
- "eval_rewards/accuracies": 0.58203125,
350
- "eval_rewards/chosen": 30.35222816467285,
351
- "eval_rewards/margins": 3.8954334259033203,
352
- "eval_rewards/rejected": 26.45679473876953,
353
- "eval_runtime": 96.829,
354
- "eval_samples_per_second": 20.655,
355
- "eval_steps_per_second": 0.33,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
- "grad_norm": 11.417465496451523,
361
  "learning_rate": 3.4440382358952115e-07,
362
- "logits/chosen": -2.6330389976501465,
363
- "logits/rejected": -2.6055209636688232,
364
- "logps/chosen": -257.6673889160156,
365
- "logps/rejected": -225.943359375,
366
- "loss": 0.2902,
367
- "rewards/accuracies": 0.606249988079071,
368
- "rewards/chosen": 32.02475357055664,
369
- "rewards/margins": 6.720486640930176,
370
- "rewards/rejected": 25.304264068603516,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
- "grad_norm": 12.04727391696027,
376
  "learning_rate": 3.272542485937368e-07,
377
- "logits/chosen": -2.5957412719726562,
378
- "logits/rejected": -2.5795822143554688,
379
- "logps/chosen": -233.29476928710938,
380
- "logps/rejected": -217.3531951904297,
381
- "loss": 0.2919,
382
- "rewards/accuracies": 0.643750011920929,
383
- "rewards/chosen": 32.082313537597656,
384
- "rewards/margins": 3.7717392444610596,
385
- "rewards/rejected": 28.310577392578125,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
- "grad_norm": 11.505656123665526,
391
  "learning_rate": 3.096924887558854e-07,
392
- "logits/chosen": -2.6124305725097656,
393
- "logits/rejected": -2.5944228172302246,
394
- "logps/chosen": -217.5354461669922,
395
- "logps/rejected": -220.5460205078125,
396
- "loss": 0.3047,
397
- "rewards/accuracies": 0.574999988079071,
398
- "rewards/chosen": 31.32999038696289,
399
- "rewards/margins": 4.138183116912842,
400
- "rewards/rejected": 27.19180679321289,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
- "grad_norm": 11.083392566284138,
406
  "learning_rate": 2.9181224366319943e-07,
407
- "logits/chosen": -2.660727024078369,
408
- "logits/rejected": -2.6385245323181152,
409
- "logps/chosen": -232.0665740966797,
410
- "logps/rejected": -219.62210083007812,
411
- "loss": 0.2834,
412
- "rewards/accuracies": 0.6000000238418579,
413
- "rewards/chosen": 31.633642196655273,
414
- "rewards/margins": 2.1873562335968018,
415
- "rewards/rejected": 29.446285247802734,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
- "grad_norm": 11.463127161742676,
421
  "learning_rate": 2.7370891215954565e-07,
422
- "logits/chosen": -2.6206917762756348,
423
- "logits/rejected": -2.576387405395508,
424
- "logps/chosen": -264.06439208984375,
425
- "logps/rejected": -229.7786865234375,
426
- "loss": 0.2818,
427
- "rewards/accuracies": 0.612500011920929,
428
- "rewards/chosen": 34.12608337402344,
429
- "rewards/margins": 4.382205009460449,
430
- "rewards/rejected": 29.743881225585938,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
- "grad_norm": 10.661524920447267,
436
  "learning_rate": 2.55479083351317e-07,
437
- "logits/chosen": -2.6774675846099854,
438
- "logits/rejected": -2.668527364730835,
439
- "logps/chosen": -260.33514404296875,
440
- "logps/rejected": -225.80810546875,
441
- "loss": 0.2858,
442
- "rewards/accuracies": 0.6187499761581421,
443
- "rewards/chosen": 33.976402282714844,
444
- "rewards/margins": 5.804098606109619,
445
- "rewards/rejected": 28.17230224609375,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
- "grad_norm": 11.916616915089687,
451
  "learning_rate": 2.3722002126275822e-07,
452
- "logits/chosen": -2.6731224060058594,
453
- "logits/rejected": -2.6551766395568848,
454
- "logps/chosen": -245.6435089111328,
455
- "logps/rejected": -228.1649932861328,
456
- "loss": 0.2808,
457
- "rewards/accuracies": 0.512499988079071,
458
- "rewards/chosen": 31.35245704650879,
459
- "rewards/margins": 1.8731645345687866,
460
- "rewards/rejected": 29.479290008544922,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
- "grad_norm": 11.982078860289866,
466
  "learning_rate": 2.19029145890313e-07,
467
- "logits/chosen": -2.6452529430389404,
468
- "logits/rejected": -2.6127915382385254,
469
- "logps/chosen": -229.02554321289062,
470
- "logps/rejected": -215.188720703125,
471
- "loss": 0.2835,
472
- "rewards/accuracies": 0.6312500238418579,
473
- "rewards/chosen": 32.651554107666016,
474
- "rewards/margins": 5.653929233551025,
475
- "rewards/rejected": 26.99761962890625,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
- "grad_norm": 11.17239233559609,
481
  "learning_rate": 2.0100351342479216e-07,
482
- "logits/chosen": -2.675553321838379,
483
- "logits/rejected": -2.662069082260132,
484
- "logps/chosen": -219.8170928955078,
485
- "logps/rejected": -211.7806396484375,
486
- "loss": 0.2849,
487
- "rewards/accuracies": 0.518750011920929,
488
- "rewards/chosen": 31.27024269104004,
489
- "rewards/margins": 1.0949894189834595,
490
- "rewards/rejected": 30.175247192382812,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
- "grad_norm": 9.847053265544167,
496
  "learning_rate": 1.8323929841460178e-07,
497
- "logits/chosen": -2.65397572517395,
498
- "logits/rejected": -2.6134414672851562,
499
- "logps/chosen": -268.84588623046875,
500
- "logps/rejected": -232.80752563476562,
501
- "loss": 0.286,
502
- "rewards/accuracies": 0.612500011920929,
503
- "rewards/chosen": 32.15021514892578,
504
- "rewards/margins": 4.852233409881592,
505
- "rewards/rejected": 27.297988891601562,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
- "eval_logits/chosen": -2.695726156234741,
511
- "eval_logits/rejected": -2.6716713905334473,
512
- "eval_logps/chosen": -231.15402221679688,
513
- "eval_logps/rejected": -235.42864990234375,
514
- "eval_loss": 0.29209351539611816,
515
- "eval_rewards/accuracies": 0.58203125,
516
- "eval_rewards/chosen": 31.439437866210938,
517
- "eval_rewards/margins": 4.200903415679932,
518
- "eval_rewards/rejected": 27.238534927368164,
519
- "eval_runtime": 96.789,
520
- "eval_samples_per_second": 20.664,
521
- "eval_steps_per_second": 0.331,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
- "grad_norm": 11.299461074514115,
527
  "learning_rate": 1.6583128063291573e-07,
528
- "logits/chosen": -2.6087942123413086,
529
- "logits/rejected": -2.607959270477295,
530
- "logps/chosen": -263.2939758300781,
531
- "logps/rejected": -229.5752716064453,
532
- "loss": 0.2804,
533
- "rewards/accuracies": 0.581250011920929,
534
- "rewards/chosen": 32.041908264160156,
535
- "rewards/margins": 3.100654363632202,
536
- "rewards/rejected": 28.941247940063477,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
- "grad_norm": 11.979925902064297,
542
  "learning_rate": 1.488723393865766e-07,
543
- "logits/chosen": -2.652468204498291,
544
- "logits/rejected": -2.6433398723602295,
545
- "logps/chosen": -260.83233642578125,
546
- "logps/rejected": -216.2664337158203,
547
- "loss": 0.2788,
548
- "rewards/accuracies": 0.5874999761581421,
549
- "rewards/chosen": 32.8377571105957,
550
- "rewards/margins": 4.280916213989258,
551
- "rewards/rejected": 28.556838989257812,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
- "grad_norm": 10.289416601586245,
557
  "learning_rate": 1.3245295796480788e-07,
558
- "logits/chosen": -2.678496837615967,
559
- "logits/rejected": -2.634920835494995,
560
- "logps/chosen": -229.55624389648438,
561
- "logps/rejected": -231.64407348632812,
562
- "loss": 0.2812,
563
- "rewards/accuracies": 0.5562499761581421,
564
- "rewards/chosen": 32.6539306640625,
565
- "rewards/margins": 4.799349784851074,
566
- "rewards/rejected": 27.854583740234375,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
- "grad_norm": 12.940304501019066,
572
  "learning_rate": 1.1666074087171627e-07,
573
- "logits/chosen": -2.687782049179077,
574
- "logits/rejected": -2.6474757194519043,
575
- "logps/chosen": -258.529541015625,
576
- "logps/rejected": -247.69125366210938,
577
- "loss": 0.2752,
578
- "rewards/accuracies": 0.53125,
579
- "rewards/chosen": 31.464908599853516,
580
- "rewards/margins": -0.8856052160263062,
581
- "rewards/rejected": 32.35051727294922,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
- "grad_norm": 13.446019747621028,
587
  "learning_rate": 1.0157994641835734e-07,
588
- "logits/chosen": -2.6681811809539795,
589
- "logits/rejected": -2.6358139514923096,
590
- "logps/chosen": -227.58425903320312,
591
- "logps/rejected": -212.9467010498047,
592
- "loss": 0.2866,
593
- "rewards/accuracies": 0.5562499761581421,
594
- "rewards/chosen": 30.626983642578125,
595
- "rewards/margins": 2.8648905754089355,
596
- "rewards/rejected": 27.7620906829834,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
- "grad_norm": 10.212615361555141,
602
  "learning_rate": 8.729103716819111e-08,
603
- "logits/chosen": -2.691338300704956,
604
- "logits/rejected": -2.6329030990600586,
605
- "logps/chosen": -269.2547302246094,
606
- "logps/rejected": -233.14053344726562,
607
- "loss": 0.2785,
608
- "rewards/accuracies": 0.625,
609
- "rewards/chosen": 33.437278747558594,
610
- "rewards/margins": 5.27285623550415,
611
- "rewards/rejected": 28.1644287109375,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
- "grad_norm": 12.701608094493194,
617
  "learning_rate": 7.387025063449081e-08,
618
- "logits/chosen": -2.6507325172424316,
619
- "logits/rejected": -2.6226696968078613,
620
- "logps/chosen": -243.0960693359375,
621
- "logps/rejected": -207.664794921875,
622
- "loss": 0.2854,
623
- "rewards/accuracies": 0.518750011920929,
624
- "rewards/chosen": 32.23695373535156,
625
- "rewards/margins": 1.6676933765411377,
626
- "rewards/rejected": 30.569263458251953,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
- "grad_norm": 11.004484883830752,
632
  "learning_rate": 6.138919252022435e-08,
633
- "logits/chosen": -2.592874526977539,
634
- "logits/rejected": -2.5939741134643555,
635
- "logps/chosen": -206.689697265625,
636
- "logps/rejected": -228.67898559570312,
637
- "loss": 0.2774,
638
- "rewards/accuracies": 0.550000011920929,
639
- "rewards/chosen": 32.79497146606445,
640
- "rewards/margins": 2.7575299739837646,
641
- "rewards/rejected": 30.037445068359375,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
- "grad_norm": 12.608909298282311,
647
  "learning_rate": 4.991445467064689e-08,
648
- "logits/chosen": -2.6360385417938232,
649
- "logits/rejected": -2.6261894702911377,
650
- "logps/chosen": -270.9910888671875,
651
- "logps/rejected": -252.8332977294922,
652
- "loss": 0.276,
653
- "rewards/accuracies": 0.5249999761581421,
654
- "rewards/chosen": 34.645816802978516,
655
- "rewards/margins": 3.508648633956909,
656
- "rewards/rejected": 31.137165069580078,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
- "grad_norm": 11.347134923103408,
662
  "learning_rate": 3.9507259776993954e-08,
663
- "logits/chosen": -2.632523775100708,
664
- "logits/rejected": -2.594832181930542,
665
- "logps/chosen": -236.8807830810547,
666
- "logps/rejected": -237.6399688720703,
667
- "loss": 0.2819,
668
- "rewards/accuracies": 0.625,
669
- "rewards/chosen": 33.6544189453125,
670
- "rewards/margins": 4.281933784484863,
671
- "rewards/rejected": 29.372488021850586,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
- "eval_logits/chosen": -2.6868975162506104,
677
- "eval_logits/rejected": -2.66192626953125,
678
- "eval_logps/chosen": -230.7387237548828,
679
- "eval_logps/rejected": -235.19105529785156,
680
- "eval_loss": 0.2787904143333435,
681
- "eval_rewards/accuracies": 0.578125,
682
- "eval_rewards/chosen": 31.854747772216797,
683
- "eval_rewards/margins": 4.3786234855651855,
684
- "eval_rewards/rejected": 27.476125717163086,
685
- "eval_runtime": 96.6885,
686
- "eval_samples_per_second": 20.685,
687
- "eval_steps_per_second": 0.331,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
- "grad_norm": 12.175943173191595,
693
  "learning_rate": 3.022313472693447e-08,
694
- "logits/chosen": -2.6695199012756348,
695
- "logits/rejected": -2.626798152923584,
696
- "logps/chosen": -263.4989318847656,
697
- "logps/rejected": -240.9721221923828,
698
- "loss": 0.2806,
699
- "rewards/accuracies": 0.6499999761581421,
700
- "rewards/chosen": 35.418556213378906,
701
- "rewards/margins": 7.573515892028809,
702
- "rewards/rejected": 27.845043182373047,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
- "grad_norm": 11.7624491150407,
708
  "learning_rate": 2.2111614344599684e-08,
709
- "logits/chosen": -2.6308817863464355,
710
- "logits/rejected": -2.620222568511963,
711
- "logps/chosen": -264.280517578125,
712
- "logps/rejected": -247.2097625732422,
713
- "loss": 0.2882,
714
- "rewards/accuracies": 0.574999988079071,
715
- "rewards/chosen": 32.79326248168945,
716
- "rewards/margins": 5.5407843589782715,
717
- "rewards/rejected": 27.252477645874023,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
- "grad_norm": 11.16296113559481,
723
  "learning_rate": 1.521597710086439e-08,
724
- "logits/chosen": -2.577580213546753,
725
- "logits/rejected": -2.5429909229278564,
726
- "logps/chosen": -248.5481719970703,
727
- "logps/rejected": -228.4681396484375,
728
- "loss": 0.2851,
729
- "rewards/accuracies": 0.53125,
730
- "rewards/chosen": 30.489971160888672,
731
- "rewards/margins": 1.1781085729599,
732
- "rewards/rejected": 29.311859130859375,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
- "grad_norm": 10.453636294498436,
738
  "learning_rate": 9.57301420397924e-09,
739
- "logits/chosen": -2.654780864715576,
740
- "logits/rejected": -2.619481086730957,
741
- "logps/chosen": -251.1508026123047,
742
- "logps/rejected": -240.0060272216797,
743
- "loss": 0.2805,
744
- "rewards/accuracies": 0.581250011920929,
745
- "rewards/chosen": 33.18633270263672,
746
- "rewards/margins": 4.080627918243408,
747
- "rewards/rejected": 29.1057071685791,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
- "grad_norm": 10.779162534358996,
753
  "learning_rate": 5.212833302556258e-09,
754
- "logits/chosen": -2.598240375518799,
755
- "logits/rejected": -2.6028037071228027,
756
- "logps/chosen": -259.9753112792969,
757
- "logps/rejected": -276.95166015625,
758
- "loss": 0.2836,
759
- "rewards/accuracies": 0.550000011920929,
760
- "rewards/chosen": 33.70884323120117,
761
- "rewards/margins": 3.5860488414764404,
762
- "rewards/rejected": 30.122793197631836,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
- "grad_norm": 12.07874608208951,
768
  "learning_rate": 2.158697848236607e-09,
769
- "logits/chosen": -2.6384501457214355,
770
- "logits/rejected": -2.618943452835083,
771
- "logps/chosen": -240.47885131835938,
772
- "logps/rejected": -213.6422882080078,
773
- "loss": 0.2815,
774
- "rewards/accuracies": 0.5625,
775
- "rewards/chosen": 31.485698699951172,
776
- "rewards/margins": 2.44018292427063,
777
- "rewards/rejected": 29.045513153076172,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
- "grad_norm": 11.390948919388384,
783
  "learning_rate": 4.269029751107489e-10,
784
- "logits/chosen": -2.6327641010284424,
785
- "logits/rejected": -2.6079437732696533,
786
- "logps/chosen": -245.8006591796875,
787
- "logps/rejected": -253.76730346679688,
788
- "loss": 0.2778,
789
- "rewards/accuracies": 0.543749988079071,
790
- "rewards/chosen": 32.898033142089844,
791
- "rewards/margins": 4.314266204833984,
792
- "rewards/rejected": 28.58376121520996,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
- "train_loss": 0.31381568898715734,
800
- "train_runtime": 7749.4814,
801
- "train_samples_per_second": 7.889,
802
- "train_steps_per_second": 0.062
803
  }
804
  ],
805
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 45.900680687932606,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.7660439014434814,
16
  "logits/rejected": -2.717564582824707,
 
25
  },
26
  {
27
  "epoch": 0.02,
28
+ "grad_norm": 42.914729941865076,
29
  "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -2.592682361602783,
31
+ "logits/rejected": -2.5630006790161133,
32
+ "logps/chosen": -264.6473388671875,
33
+ "logps/rejected": -251.43508911132812,
34
+ "loss": 0.6923,
35
  "rewards/accuracies": 0.4444444477558136,
36
+ "rewards/chosen": 0.15051230788230896,
37
+ "rewards/margins": 0.03262672945857048,
38
+ "rewards/rejected": 0.11788560450077057,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
+ "grad_norm": 38.452837409916924,
44
  "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -2.6565070152282715,
46
+ "logits/rejected": -2.609382390975952,
47
+ "logps/chosen": -278.57049560546875,
48
+ "logps/rejected": -293.88580322265625,
49
+ "loss": 0.675,
50
+ "rewards/accuracies": 0.5375000238418579,
51
+ "rewards/chosen": 2.982862949371338,
52
+ "rewards/margins": 0.061783939599990845,
53
+ "rewards/rejected": 2.92107892036438,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
+ "grad_norm": 33.8019211383058,
59
  "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -2.682241916656494,
61
+ "logits/rejected": -2.6154792308807373,
62
+ "logps/chosen": -288.4586181640625,
63
+ "logps/rejected": -252.54623413085938,
64
+ "loss": 0.6263,
65
+ "rewards/accuracies": 0.574999988079071,
66
+ "rewards/chosen": 12.213181495666504,
67
+ "rewards/margins": 0.8092762231826782,
68
+ "rewards/rejected": 11.403905868530273,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
+ "grad_norm": 31.31645871670535,
74
  "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -2.6231982707977295,
76
+ "logits/rejected": -2.59993052482605,
77
+ "logps/chosen": -250.78829956054688,
78
+ "logps/rejected": -231.2698974609375,
79
+ "loss": 0.5948,
80
+ "rewards/accuracies": 0.668749988079071,
81
+ "rewards/chosen": 16.337059020996094,
82
+ "rewards/margins": 3.214967727661133,
83
+ "rewards/rejected": 13.122090339660645,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
+ "grad_norm": 28.69978279384899,
89
  "learning_rate": 4.999733114418725e-07,
90
+ "logits/chosen": -2.62522554397583,
91
+ "logits/rejected": -2.5902233123779297,
92
+ "logps/chosen": -264.43536376953125,
93
+ "logps/rejected": -280.06646728515625,
94
+ "loss": 0.5678,
95
+ "rewards/accuracies": 0.643750011920929,
96
+ "rewards/chosen": 15.29652214050293,
97
+ "rewards/margins": 2.578859806060791,
98
+ "rewards/rejected": 12.717663764953613,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
+ "grad_norm": 31.624733594392392,
104
  "learning_rate": 4.990398100856366e-07,
105
+ "logits/chosen": -2.714506149291992,
106
+ "logits/rejected": -2.663816213607788,
107
+ "logps/chosen": -253.0142364501953,
108
+ "logps/rejected": -284.220703125,
109
+ "loss": 0.5655,
110
+ "rewards/accuracies": 0.6812499761581421,
111
+ "rewards/chosen": 18.592296600341797,
112
+ "rewards/margins": 4.731665134429932,
113
+ "rewards/rejected": 13.860631942749023,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
+ "grad_norm": 31.114783845109457,
119
  "learning_rate": 4.967775735898179e-07,
120
+ "logits/chosen": -2.659493923187256,
121
+ "logits/rejected": -2.6703927516937256,
122
+ "logps/chosen": -245.8984375,
123
+ "logps/rejected": -243.549560546875,
124
+ "loss": 0.5399,
125
+ "rewards/accuracies": 0.75,
126
+ "rewards/chosen": 19.718013763427734,
127
+ "rewards/margins": 6.550329685211182,
128
+ "rewards/rejected": 13.167686462402344,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
+ "grad_norm": 41.80005183977875,
134
  "learning_rate": 4.931986719649298e-07,
135
+ "logits/chosen": -2.816267490386963,
136
+ "logits/rejected": -2.7780513763427734,
137
+ "logps/chosen": -303.81060791015625,
138
+ "logps/rejected": -252.63510131835938,
139
+ "loss": 0.5406,
140
+ "rewards/accuracies": 0.71875,
141
+ "rewards/chosen": 19.00731658935547,
142
+ "rewards/margins": 7.583803653717041,
143
+ "rewards/rejected": 11.42351245880127,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
+ "grad_norm": 29.109112028609335,
149
  "learning_rate": 4.883222001996351e-07,
150
+ "logits/chosen": -2.7375311851501465,
151
+ "logits/rejected": -2.7109663486480713,
152
+ "logps/chosen": -239.97073364257812,
153
+ "logps/rejected": -243.8080596923828,
154
+ "loss": 0.5261,
155
+ "rewards/accuracies": 0.737500011920929,
156
+ "rewards/chosen": 19.140594482421875,
157
+ "rewards/margins": 10.550561904907227,
158
+ "rewards/rejected": 8.590032577514648,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
+ "grad_norm": 27.121615894160126,
164
  "learning_rate": 4.821741763807186e-07,
165
+ "logits/chosen": -2.7122435569763184,
166
+ "logits/rejected": -2.710378646850586,
167
+ "logps/chosen": -242.0121307373047,
168
+ "logps/rejected": -249.0485382080078,
169
+ "loss": 0.5187,
170
+ "rewards/accuracies": 0.7250000238418579,
171
+ "rewards/chosen": 18.346372604370117,
172
+ "rewards/margins": 10.366473197937012,
173
+ "rewards/rejected": 7.979898929595947,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
+ "eval_logits/chosen": -2.7638356685638428,
179
+ "eval_logits/rejected": -2.7384395599365234,
180
+ "eval_logps/chosen": -243.52903747558594,
181
+ "eval_logps/rejected": -253.63619995117188,
182
+ "eval_loss": 0.5296456813812256,
183
+ "eval_rewards/accuracies": 0.72265625,
184
+ "eval_rewards/chosen": 19.06442642211914,
185
+ "eval_rewards/margins": 10.033439636230469,
186
+ "eval_rewards/rejected": 9.030986785888672,
187
+ "eval_runtime": 97.082,
188
+ "eval_samples_per_second": 20.601,
189
+ "eval_steps_per_second": 0.33,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
+ "grad_norm": 34.19751869568068,
195
  "learning_rate": 4.747874028753375e-07,
196
+ "logits/chosen": -2.7722010612487793,
197
+ "logits/rejected": -2.725037097930908,
198
+ "logps/chosen": -285.4516906738281,
199
+ "logps/rejected": -248.623046875,
200
+ "loss": 0.5339,
201
+ "rewards/accuracies": 0.731249988079071,
202
+ "rewards/chosen": 19.128398895263672,
203
+ "rewards/margins": 9.91553783416748,
204
+ "rewards/rejected": 9.212862014770508,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
+ "grad_norm": 31.322721824556634,
210
  "learning_rate": 4.662012913161997e-07,
211
+ "logits/chosen": -2.70879864692688,
212
+ "logits/rejected": -2.7064578533172607,
213
+ "logps/chosen": -262.66522216796875,
214
+ "logps/rejected": -249.9930419921875,
215
+ "loss": 0.5035,
216
+ "rewards/accuracies": 0.71875,
217
+ "rewards/chosen": 18.501953125,
218
+ "rewards/margins": 8.638445854187012,
219
+ "rewards/rejected": 9.863507270812988,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
+ "grad_norm": 29.14222659930629,
225
  "learning_rate": 4.5646165232345103e-07,
226
+ "logits/chosen": -2.7430055141448975,
227
+ "logits/rejected": -2.7424817085266113,
228
+ "logps/chosen": -258.7245788574219,
229
+ "logps/rejected": -257.83563232421875,
230
+ "loss": 0.5095,
231
+ "rewards/accuracies": 0.675000011920929,
232
+ "rewards/chosen": 23.278545379638672,
233
+ "rewards/margins": 12.678305625915527,
234
+ "rewards/rejected": 10.600237846374512,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
+ "grad_norm": 31.79008837992398,
240
  "learning_rate": 4.456204510851956e-07,
241
+ "logits/chosen": -2.768493175506592,
242
+ "logits/rejected": -2.759028911590576,
243
+ "logps/chosen": -295.2568359375,
244
+ "logps/rejected": -290.87103271484375,
245
+ "loss": 0.5009,
246
+ "rewards/accuracies": 0.699999988079071,
247
+ "rewards/chosen": 20.70852279663086,
248
+ "rewards/margins": 10.902701377868652,
249
+ "rewards/rejected": 9.80582332611084,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
+ "grad_norm": 30.12291595525387,
255
  "learning_rate": 4.337355301007335e-07,
256
+ "logits/chosen": -2.757780075073242,
257
+ "logits/rejected": -2.7286086082458496,
258
+ "logps/chosen": -264.67327880859375,
259
+ "logps/rejected": -267.2817077636719,
260
+ "loss": 0.5095,
261
+ "rewards/accuracies": 0.7124999761581421,
262
+ "rewards/chosen": 18.550952911376953,
263
+ "rewards/margins": 9.623664855957031,
264
+ "rewards/rejected": 8.927289009094238,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
+ "grad_norm": 36.61347768159374,
270
  "learning_rate": 4.2087030056579986e-07,
271
+ "logits/chosen": -2.8047165870666504,
272
+ "logits/rejected": -2.7731175422668457,
273
+ "logps/chosen": -251.7733612060547,
274
+ "logps/rejected": -248.71923828125,
275
+ "loss": 0.5234,
276
+ "rewards/accuracies": 0.71875,
277
+ "rewards/chosen": 21.294713973999023,
278
+ "rewards/margins": 12.590093612670898,
279
+ "rewards/rejected": 8.704621315002441,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
+ "grad_norm": 35.61122377710651,
285
  "learning_rate": 4.070934040463998e-07,
286
+ "logits/chosen": -2.7708938121795654,
287
+ "logits/rejected": -2.7386412620544434,
288
+ "logps/chosen": -230.99172973632812,
289
+ "logps/rejected": -221.28964233398438,
290
+ "loss": 0.5068,
291
+ "rewards/accuracies": 0.6875,
292
+ "rewards/chosen": 18.34353256225586,
293
+ "rewards/margins": 8.508430480957031,
294
+ "rewards/rejected": 9.835103988647461,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
+ "grad_norm": 38.25232870150566,
300
  "learning_rate": 3.9247834624635404e-07,
301
+ "logits/chosen": -2.73944091796875,
302
+ "logits/rejected": -2.7309627532958984,
303
+ "logps/chosen": -237.39566040039062,
304
+ "logps/rejected": -219.0083770751953,
305
+ "loss": 0.4872,
306
+ "rewards/accuracies": 0.706250011920929,
307
+ "rewards/chosen": 18.96334457397461,
308
+ "rewards/margins": 10.400626182556152,
309
+ "rewards/rejected": 8.562715530395508,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
+ "grad_norm": 33.97866980743485,
315
  "learning_rate": 3.7710310482256523e-07,
316
+ "logits/chosen": -2.7516446113586426,
317
+ "logits/rejected": -2.7251124382019043,
318
+ "logps/chosen": -251.7504425048828,
319
+ "logps/rejected": -249.9357147216797,
320
+ "loss": 0.5079,
321
+ "rewards/accuracies": 0.675000011920929,
322
+ "rewards/chosen": 19.492395401000977,
323
+ "rewards/margins": 9.673317909240723,
324
+ "rewards/rejected": 9.819077491760254,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
+ "grad_norm": 134.1578718379133,
330
  "learning_rate": 3.610497133404795e-07,
331
+ "logits/chosen": -2.702623128890991,
332
+ "logits/rejected": -2.6973912715911865,
333
+ "logps/chosen": -240.9090118408203,
334
+ "logps/rejected": -245.4873046875,
335
+ "loss": 0.508,
336
+ "rewards/accuracies": 0.6875,
337
+ "rewards/chosen": 18.98764419555664,
338
+ "rewards/margins": 12.425970077514648,
339
+ "rewards/rejected": 6.561669826507568,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
+ "eval_logits/chosen": -2.782581090927124,
345
+ "eval_logits/rejected": -2.756884813308716,
346
+ "eval_logps/chosen": -241.94308471679688,
347
+ "eval_logps/rejected": -255.6435089111328,
348
+ "eval_loss": 0.5005597472190857,
349
+ "eval_rewards/accuracies": 0.7265625,
350
+ "eval_rewards/chosen": 20.650381088256836,
351
+ "eval_rewards/margins": 13.626703262329102,
352
+ "eval_rewards/rejected": 7.02367639541626,
353
+ "eval_runtime": 96.3977,
354
+ "eval_samples_per_second": 20.747,
355
+ "eval_steps_per_second": 0.332,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
+ "grad_norm": 29.796012700680272,
361
  "learning_rate": 3.4440382358952115e-07,
362
+ "logits/chosen": -2.7070722579956055,
363
+ "logits/rejected": -2.683690071105957,
364
+ "logps/chosen": -269.78045654296875,
365
+ "logps/rejected": -245.2332763671875,
366
+ "loss": 0.5103,
367
+ "rewards/accuracies": 0.706250011920929,
368
+ "rewards/chosen": 19.911705017089844,
369
+ "rewards/margins": 13.897372245788574,
370
+ "rewards/rejected": 6.0143327713012695,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
+ "grad_norm": 30.457889800742976,
376
  "learning_rate": 3.272542485937368e-07,
377
+ "logits/chosen": -2.6613078117370605,
378
+ "logits/rejected": -2.6541290283203125,
379
+ "logps/chosen": -245.76773071289062,
380
+ "logps/rejected": -238.1407470703125,
381
+ "loss": 0.5084,
382
+ "rewards/accuracies": 0.75,
383
+ "rewards/chosen": 19.60938835144043,
384
+ "rewards/margins": 12.086370468139648,
385
+ "rewards/rejected": 7.523016452789307,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
+ "grad_norm": 32.04472046652176,
391
  "learning_rate": 3.096924887558854e-07,
392
+ "logits/chosen": -2.682553768157959,
393
+ "logits/rejected": -2.6649653911590576,
394
+ "logps/chosen": -228.4560089111328,
395
+ "logps/rejected": -240.5241241455078,
396
+ "loss": 0.5079,
397
+ "rewards/accuracies": 0.706250011920929,
398
+ "rewards/chosen": 20.409427642822266,
399
+ "rewards/margins": 13.195713996887207,
400
+ "rewards/rejected": 7.2137131690979,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
+ "grad_norm": 33.84679720475086,
406
  "learning_rate": 2.9181224366319943e-07,
407
+ "logits/chosen": -2.7205722332000732,
408
+ "logits/rejected": -2.701112985610962,
409
+ "logps/chosen": -243.3650665283203,
410
+ "logps/rejected": -238.9823760986328,
411
+ "loss": 0.4938,
412
+ "rewards/accuracies": 0.6875,
413
+ "rewards/chosen": 20.33513069152832,
414
+ "rewards/margins": 10.249124526977539,
415
+ "rewards/rejected": 10.086007118225098,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
+ "grad_norm": 31.858527336933598,
421
  "learning_rate": 2.7370891215954565e-07,
422
+ "logits/chosen": -2.6577446460723877,
423
+ "logits/rejected": -2.6170592308044434,
424
+ "logps/chosen": -275.0615234375,
425
+ "logps/rejected": -250.7952880859375,
426
+ "loss": 0.486,
427
+ "rewards/accuracies": 0.7562500238418579,
428
+ "rewards/chosen": 23.128969192504883,
429
+ "rewards/margins": 14.401697158813477,
430
+ "rewards/rejected": 8.727272987365723,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
+ "grad_norm": 33.909893139050666,
436
  "learning_rate": 2.55479083351317e-07,
437
+ "logits/chosen": -2.7196850776672363,
438
+ "logits/rejected": -2.708618640899658,
439
+ "logps/chosen": -272.7483825683594,
440
+ "logps/rejected": -246.864013671875,
441
+ "loss": 0.4929,
442
+ "rewards/accuracies": 0.7437499761581421,
443
+ "rewards/chosen": 21.56315803527832,
444
+ "rewards/margins": 14.446769714355469,
445
+ "rewards/rejected": 7.116389274597168,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
+ "grad_norm": 32.106664429608195,
451
  "learning_rate": 2.3722002126275822e-07,
452
+ "logits/chosen": -2.7111871242523193,
453
+ "logits/rejected": -2.69014310836792,
454
+ "logps/chosen": -257.98822021484375,
455
+ "logps/rejected": -246.855712890625,
456
+ "loss": 0.5003,
457
+ "rewards/accuracies": 0.6187499761581421,
458
+ "rewards/chosen": 19.00775718688965,
459
+ "rewards/margins": 8.219191551208496,
460
+ "rewards/rejected": 10.788566589355469,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
+ "grad_norm": 36.46282639590835,
466
  "learning_rate": 2.19029145890313e-07,
467
+ "logits/chosen": -2.6879024505615234,
468
+ "logits/rejected": -2.6575160026550293,
469
+ "logps/chosen": -241.15469360351562,
470
+ "logps/rejected": -236.71749877929688,
471
+ "loss": 0.5092,
472
+ "rewards/accuracies": 0.762499988079071,
473
+ "rewards/chosen": 20.52240753173828,
474
+ "rewards/margins": 15.053570747375488,
475
+ "rewards/rejected": 5.468836784362793,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
+ "grad_norm": 32.938385675504726,
481
  "learning_rate": 2.0100351342479216e-07,
482
+ "logits/chosen": -2.730700969696045,
483
+ "logits/rejected": -2.7169148921966553,
484
+ "logps/chosen": -231.1471710205078,
485
+ "logps/rejected": -233.15213012695312,
486
+ "loss": 0.4989,
487
+ "rewards/accuracies": 0.706250011920929,
488
+ "rewards/chosen": 19.9401798248291,
489
+ "rewards/margins": 11.136396408081055,
490
+ "rewards/rejected": 8.803783416748047,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
+ "grad_norm": 40.46462898598811,
496
  "learning_rate": 1.8323929841460178e-07,
497
+ "logits/chosen": -2.699219226837158,
498
+ "logits/rejected": -2.6627821922302246,
499
+ "logps/chosen": -282.5068054199219,
500
+ "logps/rejected": -253.93002319335938,
501
+ "loss": 0.4808,
502
+ "rewards/accuracies": 0.675000011920929,
503
+ "rewards/chosen": 18.48929214477539,
504
+ "rewards/margins": 12.313825607299805,
505
+ "rewards/rejected": 6.175467491149902,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
+ "eval_logits/chosen": -2.737816095352173,
511
+ "eval_logits/rejected": -2.7115261554718018,
512
+ "eval_logps/chosen": -241.775146484375,
513
+ "eval_logps/rejected": -255.71316528320312,
514
+ "eval_loss": 0.4966064989566803,
515
+ "eval_rewards/accuracies": 0.72265625,
516
+ "eval_rewards/chosen": 20.818317413330078,
517
+ "eval_rewards/margins": 13.864299774169922,
518
+ "eval_rewards/rejected": 6.954016208648682,
519
+ "eval_runtime": 96.5223,
520
+ "eval_samples_per_second": 20.721,
521
+ "eval_steps_per_second": 0.332,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
+ "grad_norm": 34.221593161714,
527
  "learning_rate": 1.6583128063291573e-07,
528
+ "logits/chosen": -2.650965452194214,
529
+ "logits/rejected": -2.6548705101013184,
530
+ "logps/chosen": -275.9816589355469,
531
+ "logps/rejected": -252.0779571533203,
532
+ "loss": 0.4758,
533
+ "rewards/accuracies": 0.7437499761581421,
534
+ "rewards/chosen": 19.354223251342773,
535
+ "rewards/margins": 12.915657043457031,
536
+ "rewards/rejected": 6.438567161560059,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
+ "grad_norm": 33.001844259909745,
542
  "learning_rate": 1.488723393865766e-07,
543
+ "logits/chosen": -2.6921844482421875,
544
+ "logits/rejected": -2.691749095916748,
545
+ "logps/chosen": -272.9283142089844,
546
+ "logps/rejected": -237.22213745117188,
547
+ "loss": 0.4701,
548
+ "rewards/accuracies": 0.75,
549
+ "rewards/chosen": 20.741744995117188,
550
+ "rewards/margins": 13.140623092651367,
551
+ "rewards/rejected": 7.6011223793029785,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
+ "grad_norm": 29.49781014497435,
557
  "learning_rate": 1.3245295796480788e-07,
558
+ "logits/chosen": -2.7120227813720703,
559
+ "logits/rejected": -2.677337169647217,
560
+ "logps/chosen": -243.09988403320312,
561
+ "logps/rejected": -251.96121215820312,
562
+ "loss": 0.4909,
563
+ "rewards/accuracies": 0.6812499761581421,
564
+ "rewards/chosen": 19.110301971435547,
565
+ "rewards/margins": 11.57287883758545,
566
+ "rewards/rejected": 7.537426948547363,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
+ "grad_norm": 41.44497406324907,
572
  "learning_rate": 1.1666074087171627e-07,
573
+ "logits/chosen": -2.723417043685913,
574
+ "logits/rejected": -2.691898822784424,
575
+ "logps/chosen": -269.8048095703125,
576
+ "logps/rejected": -273.60491943359375,
577
+ "loss": 0.4884,
578
+ "rewards/accuracies": 0.6937500238418579,
579
+ "rewards/chosen": 20.189619064331055,
580
+ "rewards/margins": 13.752766609191895,
581
+ "rewards/rejected": 6.43685245513916,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
+ "grad_norm": 34.6350412976571,
587
  "learning_rate": 1.0157994641835734e-07,
588
+ "logits/chosen": -2.700378179550171,
589
+ "logits/rejected": -2.6705689430236816,
590
+ "logps/chosen": -238.4877166748047,
591
+ "logps/rejected": -235.2138214111328,
592
+ "loss": 0.4772,
593
+ "rewards/accuracies": 0.7437499761581421,
594
+ "rewards/chosen": 19.723506927490234,
595
+ "rewards/margins": 14.228517532348633,
596
+ "rewards/rejected": 5.494990825653076,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
+ "grad_norm": 42.44685611987456,
602
  "learning_rate": 8.729103716819111e-08,
603
+ "logits/chosen": -2.732093334197998,
604
+ "logits/rejected": -2.685842990875244,
605
+ "logps/chosen": -282.49151611328125,
606
+ "logps/rejected": -255.60482788085938,
607
+ "loss": 0.5041,
608
+ "rewards/accuracies": 0.75,
609
+ "rewards/chosen": 20.200531005859375,
610
+ "rewards/margins": 14.500404357910156,
611
+ "rewards/rejected": 5.700125694274902,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
+ "grad_norm": 33.10027222498208,
617
  "learning_rate": 7.387025063449081e-08,
618
+ "logits/chosen": -2.69976544380188,
619
+ "logits/rejected": -2.6724953651428223,
620
+ "logps/chosen": -256.20074462890625,
621
+ "logps/rejected": -229.0190887451172,
622
+ "loss": 0.5081,
623
+ "rewards/accuracies": 0.6625000238418579,
624
+ "rewards/chosen": 19.13229751586914,
625
+ "rewards/margins": 9.917332649230957,
626
+ "rewards/rejected": 9.2149658203125,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
+ "grad_norm": 30.0028500984578,
632
  "learning_rate": 6.138919252022435e-08,
633
+ "logits/chosen": -2.6289491653442383,
634
+ "logits/rejected": -2.6343891620635986,
635
+ "logps/chosen": -221.06399536132812,
636
+ "logps/rejected": -254.60464477539062,
637
+ "loss": 0.4847,
638
+ "rewards/accuracies": 0.7250000238418579,
639
+ "rewards/chosen": 18.420679092407227,
640
+ "rewards/margins": 14.3089017868042,
641
+ "rewards/rejected": 4.111776828765869,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
+ "grad_norm": 41.20295613426392,
647
  "learning_rate": 4.991445467064689e-08,
648
+ "logits/chosen": -2.673283338546753,
649
+ "logits/rejected": -2.668842315673828,
650
+ "logps/chosen": -285.0795593261719,
651
+ "logps/rejected": -274.74163818359375,
652
+ "loss": 0.4871,
653
+ "rewards/accuracies": 0.699999988079071,
654
+ "rewards/chosen": 20.55733871459961,
655
+ "rewards/margins": 11.32852840423584,
656
+ "rewards/rejected": 9.228808403015137,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
+ "grad_norm": 32.56833882197615,
662
  "learning_rate": 3.9507259776993954e-08,
663
+ "logits/chosen": -2.676893949508667,
664
+ "logits/rejected": -2.6494295597076416,
665
+ "logps/chosen": -250.10324096679688,
666
+ "logps/rejected": -261.3594055175781,
667
+ "loss": 0.4835,
668
+ "rewards/accuracies": 0.731249988079071,
669
+ "rewards/chosen": 20.431964874267578,
670
+ "rewards/margins": 14.77888011932373,
671
+ "rewards/rejected": 5.653082370758057,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
+ "eval_logits/chosen": -2.731470823287964,
677
+ "eval_logits/rejected": -2.703744649887085,
678
+ "eval_logps/chosen": -241.37046813964844,
679
+ "eval_logps/rejected": -256.2980041503906,
680
+ "eval_loss": 0.4916878044605255,
681
+ "eval_rewards/accuracies": 0.734375,
682
+ "eval_rewards/chosen": 21.223026275634766,
683
+ "eval_rewards/margins": 14.853860855102539,
684
+ "eval_rewards/rejected": 6.369164943695068,
685
+ "eval_runtime": 96.4786,
686
+ "eval_samples_per_second": 20.73,
687
+ "eval_steps_per_second": 0.332,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
+ "grad_norm": 33.3145791190938,
693
  "learning_rate": 3.022313472693447e-08,
694
+ "logits/chosen": -2.7175421714782715,
695
+ "logits/rejected": -2.6865715980529785,
696
+ "logps/chosen": -276.17120361328125,
697
+ "logps/rejected": -262.1912841796875,
698
+ "loss": 0.4963,
699
+ "rewards/accuracies": 0.78125,
700
+ "rewards/chosen": 22.746294021606445,
701
+ "rewards/margins": 16.12038803100586,
702
+ "rewards/rejected": 6.625903129577637,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
+ "grad_norm": 38.26789970081439,
708
  "learning_rate": 2.2111614344599684e-08,
709
+ "logits/chosen": -2.6725165843963623,
710
+ "logits/rejected": -2.6631102561950684,
711
+ "logps/chosen": -278.31524658203125,
712
+ "logps/rejected": -268.63787841796875,
713
+ "loss": 0.497,
714
+ "rewards/accuracies": 0.6625000238418579,
715
+ "rewards/chosen": 18.758575439453125,
716
+ "rewards/margins": 12.9342041015625,
717
+ "rewards/rejected": 5.824368953704834,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
+ "grad_norm": 29.576447372480786,
723
  "learning_rate": 1.521597710086439e-08,
724
+ "logits/chosen": -2.622992515563965,
725
+ "logits/rejected": -2.5860610008239746,
726
+ "logps/chosen": -260.331787109375,
727
+ "logps/rejected": -252.45510864257812,
728
+ "loss": 0.4834,
729
+ "rewards/accuracies": 0.768750011920929,
730
+ "rewards/chosen": 18.70633316040039,
731
+ "rewards/margins": 13.38142204284668,
732
+ "rewards/rejected": 5.3249101638793945,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
+ "grad_norm": 31.865322800676886,
738
  "learning_rate": 9.57301420397924e-09,
739
+ "logits/chosen": -2.696772813796997,
740
+ "logits/rejected": -2.6611225605010986,
741
+ "logps/chosen": -262.6088562011719,
742
+ "logps/rejected": -258.1268005371094,
743
+ "loss": 0.485,
744
+ "rewards/accuracies": 0.6875,
745
+ "rewards/chosen": 21.72829246520996,
746
+ "rewards/margins": 10.743375778198242,
747
+ "rewards/rejected": 10.984918594360352,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
+ "grad_norm": 29.684061550729,
753
  "learning_rate": 5.212833302556258e-09,
754
+ "logits/chosen": -2.6331818103790283,
755
+ "logits/rejected": -2.646833658218384,
756
+ "logps/chosen": -273.6771240234375,
757
+ "logps/rejected": -299.3607482910156,
758
+ "loss": 0.4824,
759
+ "rewards/accuracies": 0.6875,
760
+ "rewards/chosen": 20.00701904296875,
761
+ "rewards/margins": 12.293313980102539,
762
+ "rewards/rejected": 7.7137041091918945,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
+ "grad_norm": 35.062338944611746,
768
  "learning_rate": 2.158697848236607e-09,
769
+ "logits/chosen": -2.686453342437744,
770
+ "logits/rejected": -2.6655733585357666,
771
+ "logps/chosen": -252.51998901367188,
772
+ "logps/rejected": -234.3843231201172,
773
+ "loss": 0.4908,
774
+ "rewards/accuracies": 0.6875,
775
+ "rewards/chosen": 19.44460105895996,
776
+ "rewards/margins": 11.141111373901367,
777
+ "rewards/rejected": 8.303489685058594,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
+ "grad_norm": 32.367340987762965,
783
  "learning_rate": 4.269029751107489e-10,
784
+ "logits/chosen": -2.6681036949157715,
785
+ "logits/rejected": -2.640652656555176,
786
+ "logps/chosen": -258.857177734375,
787
+ "logps/rejected": -274.1594543457031,
788
+ "loss": 0.4653,
789
+ "rewards/accuracies": 0.699999988079071,
790
+ "rewards/chosen": 19.84146499633789,
791
+ "rewards/margins": 11.649839401245117,
792
+ "rewards/rejected": 8.191625595092773,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
+ "train_loss": 0.5147256711536871,
800
+ "train_runtime": 7551.132,
801
+ "train_samples_per_second": 8.096,
802
+ "train_steps_per_second": 0.063
803
  }
804
  ],
805
  "logging_steps": 10,