RikkiXu commited on
Commit
df68f81
1 Parent(s): adef70e

Model save

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.4292
19
- - Rewards/chosen: -1.8869
20
- - Rewards/rejected: -2.7914
21
- - Rewards/accuracies: 0.8242
22
- - Rewards/margins: 0.9045
23
- - Logps/rejected: -612.2493
24
- - Logps/chosen: -524.2042
25
- - Logits/rejected: -0.4436
26
- - Logits/chosen: -0.8025
27
 
28
  ## Model description
29
 
@@ -60,14 +60,14 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.5405 | 0.12 | 100 | 0.6086 | -0.8599 | -1.1867 | 0.6953 | 0.3268 | -451.7755 | -421.5048 | -1.6547 | -1.7462 |
64
- | 0.4371 | 0.23 | 200 | 0.5454 | -2.0208 | -2.5842 | 0.7422 | 0.5634 | -591.5291 | -537.5920 | -0.7151 | -0.8867 |
65
- | 0.4348 | 0.35 | 300 | 0.5012 | -2.0998 | -2.8410 | 0.7734 | 0.7413 | -617.2101 | -545.4883 | -0.3499 | -0.5939 |
66
- | 0.3733 | 0.46 | 400 | 0.4721 | -2.1506 | -2.9308 | 0.7773 | 0.7802 | -626.1902 | -550.5717 | -0.2280 | -0.5456 |
67
- | 0.3689 | 0.58 | 500 | 0.4484 | -2.0467 | -2.9485 | 0.7969 | 0.9018 | -627.9595 | -540.1826 | -0.1091 | -0.4774 |
68
- | 0.3829 | 0.69 | 600 | 0.4419 | -2.0265 | -2.9075 | 0.8086 | 0.8810 | -623.8541 | -538.1624 | -0.1412 | -0.5099 |
69
- | 0.3725 | 0.81 | 700 | 0.4329 | -1.9184 | -2.8079 | 0.8242 | 0.8895 | -613.8932 | -527.3496 | -0.3224 | -0.6920 |
70
- | 0.4052 | 0.92 | 800 | 0.4292 | -1.8869 | -2.7914 | 0.8242 | 0.9045 | -612.2493 | -524.2042 | -0.4436 | -0.8025 |
71
 
72
 
73
  ### Framework versions
 
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.2027
19
+ - Rewards/chosen: 0.6729
20
+ - Rewards/rejected: -2.3580
21
+ - Rewards/accuracies: 0.9141
22
+ - Rewards/margins: 3.0309
23
+ - Logps/rejected: -380.2658
24
+ - Logps/chosen: -322.0539
25
+ - Logits/rejected: -1.9204
26
+ - Logits/chosen: -1.9591
27
 
28
  ## Model description
29
 
 
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.4898 | 0.12 | 100 | 0.5505 | -0.1967 | -1.0051 | 0.6875 | 0.8085 | -353.2088 | -339.4445 | -1.7659 | -1.8469 |
64
+ | 0.4277 | 0.23 | 200 | 0.4655 | -0.4834 | -1.8836 | 0.7383 | 1.4002 | -370.7788 | -345.1795 | -1.7248 | -1.8009 |
65
+ | 0.4188 | 0.35 | 300 | 0.3922 | -0.0720 | -2.0263 | 0.7969 | 1.9544 | -373.6328 | -336.9513 | -1.6143 | -1.6899 |
66
+ | 0.3506 | 0.46 | 400 | 0.3457 | 0.2171 | -2.0472 | 0.8203 | 2.2643 | -374.0495 | -331.1692 | -1.9794 | -2.0296 |
67
+ | 0.3611 | 0.58 | 500 | 0.2959 | 0.2498 | -2.4347 | 0.8516 | 2.6844 | -381.7997 | -330.5164 | -1.8183 | -1.8592 |
68
+ | 0.3562 | 0.69 | 600 | 0.2513 | 0.3868 | -2.4732 | 0.8711 | 2.8600 | -382.5696 | -327.7753 | -1.9217 | -1.9736 |
69
+ | 0.3624 | 0.81 | 700 | 0.2194 | 0.6454 | -2.3556 | 0.9062 | 3.0010 | -380.2178 | -322.6031 | -1.9301 | -1.9717 |
70
+ | 0.4069 | 0.92 | 800 | 0.2027 | 0.6729 | -2.3580 | 0.9141 | 3.0309 | -380.2658 | -322.0539 | -1.9204 | -1.9591 |
71
 
72
 
73
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.42912535238925215,
4
- "train_runtime": 13911.1927,
5
  "train_samples": 111134,
6
- "train_samples_per_second": 7.989,
7
- "train_steps_per_second": 0.062
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.40559157427005504,
4
+ "train_runtime": 13777.3263,
5
  "train_samples": 111134,
6
+ "train_samples_per_second": 8.066,
7
+ "train_steps_per_second": 0.063
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0acb5813455afb8a2a593db5c3f70e6ce3a70a6dba71da0deed700bd92f8bae6
3
  size 4943178720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419678fed1e21fc0085a3e5460e53663c8575f3962e49b1af0e61f023fd4943e
3
  size 4943178720
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e900b3151695fca8285835cb5d4f82531dbf6d2368960fc224c9b92a979d4555
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf7dfdf8135658ed696d23c67a80c515ab05997271e3dfbd5a614d3783b327ce
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99b499766e278e51c9caa2c54b1e6bd1bae29c88bcb8dbb19fde2439c75638e4
3
  size 4540532728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae56d3cafd2deed309e3da51f8c4ba401fe0402f043f2bd51fb953c4cba99c36
3
  size 4540532728
runs/May20_13-41-32_n136-100-194/events.out.tfevents.1716184187.n136-100-194.1143920.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:667920d9cbf4473f8cb0820deda4b1afb243b3a22e0f80d95d25f7c3711f94e8
3
- size 66313
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41ae544c3e35fea0352946b6fd6b18f40dea31b26fd311ee418f65aaaba669a4
3
+ size 70795
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.42912535238925215,
4
- "train_runtime": 13911.1927,
5
  "train_samples": 111134,
6
- "train_samples_per_second": 7.989,
7
- "train_steps_per_second": 0.062
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.40559157427005504,
4
+ "train_runtime": 13777.3263,
5
  "train_samples": 111134,
6
+ "train_samples_per_second": 8.066,
7
+ "train_steps_per_second": 0.063
8
  }
trainer_state.json CHANGED
@@ -10,13 +10,13 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 23.51828299790517,
14
  "learning_rate": 5.747126436781609e-09,
15
- "logits/chosen": -1.865264654159546,
16
- "logits/rejected": -1.587956428527832,
17
- "logps/chosen": -204.58331298828125,
18
- "logps/rejected": -154.1517333984375,
19
- "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
@@ -25,1430 +25,1430 @@
25
  },
26
  {
27
  "epoch": 0.01,
28
- "grad_norm": 24.149515970375678,
29
  "learning_rate": 5.747126436781609e-08,
30
- "logits/chosen": -1.90481698513031,
31
- "logits/rejected": -1.8536584377288818,
32
- "logps/chosen": -213.41416931152344,
33
- "logps/rejected": -191.33694458007812,
34
- "loss": 0.6932,
35
- "rewards/accuracies": 0.4861111044883728,
36
- "rewards/chosen": -1.9929786503780633e-05,
37
- "rewards/margins": 0.00017105697770603,
38
- "rewards/rejected": -0.00019098672783002257,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.02,
43
- "grad_norm": 23.563731768256098,
44
  "learning_rate": 1.1494252873563217e-07,
45
- "logits/chosen": -1.9680726528167725,
46
- "logits/rejected": -1.798654317855835,
47
- "logps/chosen": -255.55111694335938,
48
- "logps/rejected": -189.6189727783203,
49
- "loss": 0.6921,
50
- "rewards/accuracies": 0.6312500238418579,
51
- "rewards/chosen": 0.0016116431215777993,
52
- "rewards/margins": 0.002336590550839901,
53
- "rewards/rejected": -0.0007249473710544407,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.03,
58
- "grad_norm": 23.038450073297746,
59
  "learning_rate": 1.7241379310344828e-07,
60
- "logits/chosen": -1.8938862085342407,
61
- "logits/rejected": -1.8228662014007568,
62
- "logps/chosen": -212.65322875976562,
63
- "logps/rejected": -194.4668426513672,
64
- "loss": 0.6878,
65
- "rewards/accuracies": 0.75,
66
- "rewards/chosen": 0.011502735316753387,
67
- "rewards/margins": 0.014704583212733269,
68
- "rewards/rejected": -0.003201847430318594,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.05,
73
- "grad_norm": 22.339093495440075,
74
  "learning_rate": 2.2988505747126435e-07,
75
- "logits/chosen": -1.8691730499267578,
76
- "logits/rejected": -1.810280442237854,
77
- "logps/chosen": -212.04031372070312,
78
- "logps/rejected": -189.72427368164062,
79
- "loss": 0.6773,
80
- "rewards/accuracies": 0.75,
81
- "rewards/chosen": 0.027534600347280502,
82
- "rewards/margins": 0.037894655019044876,
83
- "rewards/rejected": -0.010360054671764374,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.06,
88
- "grad_norm": 21.83120331543706,
89
  "learning_rate": 2.873563218390804e-07,
90
- "logits/chosen": -1.9792773723602295,
91
- "logits/rejected": -1.8856391906738281,
92
- "logps/chosen": -199.00392150878906,
93
- "logps/rejected": -184.42074584960938,
94
- "loss": 0.6637,
95
- "rewards/accuracies": 0.7250000238418579,
96
- "rewards/chosen": 0.02775971218943596,
97
- "rewards/margins": 0.08295276015996933,
98
- "rewards/rejected": -0.05519305542111397,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.07,
103
- "grad_norm": 21.94313336281609,
104
  "learning_rate": 3.4482758620689656e-07,
105
- "logits/chosen": -1.978032112121582,
106
- "logits/rejected": -1.8626216650009155,
107
- "logps/chosen": -263.13702392578125,
108
- "logps/rejected": -227.51931762695312,
109
- "loss": 0.6365,
110
- "rewards/accuracies": 0.7562500238418579,
111
- "rewards/chosen": -0.024905700236558914,
112
- "rewards/margins": 0.1394185870885849,
113
- "rewards/rejected": -0.1643243134021759,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.08,
118
- "grad_norm": 21.93834951114425,
119
  "learning_rate": 4.0229885057471266e-07,
120
- "logits/chosen": -1.923208236694336,
121
- "logits/rejected": -1.9092395305633545,
122
- "logps/chosen": -211.4084930419922,
123
- "logps/rejected": -216.09439086914062,
124
- "loss": 0.6127,
125
- "rewards/accuracies": 0.768750011920929,
126
- "rewards/chosen": -0.16204482316970825,
127
- "rewards/margins": 0.21120235323905945,
128
- "rewards/rejected": -0.3732471466064453,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.09,
133
- "grad_norm": 26.27963832031748,
134
  "learning_rate": 4.597701149425287e-07,
135
- "logits/chosen": -1.7020299434661865,
136
- "logits/rejected": -1.635000467300415,
137
- "logps/chosen": -229.10562133789062,
138
- "logps/rejected": -228.198486328125,
139
- "loss": 0.5888,
140
- "rewards/accuracies": 0.7250000238418579,
141
- "rewards/chosen": -0.3274237811565399,
142
- "rewards/margins": 0.26525241136550903,
143
- "rewards/rejected": -0.5926762819290161,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.1,
148
- "grad_norm": 35.47456739543052,
149
  "learning_rate": 4.999817969178237e-07,
150
- "logits/chosen": -1.768843412399292,
151
- "logits/rejected": -1.73134446144104,
152
- "logps/chosen": -271.71563720703125,
153
- "logps/rejected": -283.0465393066406,
154
- "loss": 0.5313,
155
- "rewards/accuracies": 0.7562500238418579,
156
- "rewards/chosen": -0.538571298122406,
157
- "rewards/margins": 0.47389060258865356,
158
- "rewards/rejected": -1.0124619007110596,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.12,
163
- "grad_norm": 38.67050237438448,
164
  "learning_rate": 4.996582603056428e-07,
165
- "logits/chosen": -1.7260372638702393,
166
- "logits/rejected": -1.6588356494903564,
167
- "logps/chosen": -285.2041320800781,
168
- "logps/rejected": -323.65692138671875,
169
- "loss": 0.5405,
170
- "rewards/accuracies": 0.75,
171
- "rewards/chosen": -0.6892239451408386,
172
- "rewards/margins": 0.5662633180618286,
173
- "rewards/rejected": -1.2554872035980225,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.12,
178
- "eval_logits/chosen": -1.746153473854065,
179
- "eval_logits/rejected": -1.6546903848648071,
180
- "eval_logps/chosen": -421.5047912597656,
181
- "eval_logps/rejected": -451.7755432128906,
182
- "eval_loss": 0.6086099743843079,
183
- "eval_rewards/accuracies": 0.6953125,
184
- "eval_rewards/chosen": -0.8599321246147156,
185
- "eval_rewards/margins": 0.3267643451690674,
186
- "eval_rewards/rejected": -1.1866965293884277,
187
- "eval_runtime": 98.2501,
188
- "eval_samples_per_second": 20.356,
189
- "eval_steps_per_second": 0.326,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.13,
194
- "grad_norm": 56.77623681367674,
195
  "learning_rate": 4.989308132738126e-07,
196
- "logits/chosen": -1.8324391841888428,
197
- "logits/rejected": -1.7346527576446533,
198
- "logps/chosen": -289.9622802734375,
199
- "logps/rejected": -307.9504699707031,
200
- "loss": 0.5032,
201
- "rewards/accuracies": 0.762499988079071,
202
- "rewards/chosen": -0.7402961850166321,
203
- "rewards/margins": 0.6292544007301331,
204
- "rewards/rejected": -1.3695508241653442,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.14,
209
- "grad_norm": 54.65739090602792,
210
  "learning_rate": 4.978006327248536e-07,
211
- "logits/chosen": -1.91842520236969,
212
- "logits/rejected": -1.849988579750061,
213
- "logps/chosen": -323.345703125,
214
- "logps/rejected": -366.32415771484375,
215
- "loss": 0.4966,
216
- "rewards/accuracies": 0.762499988079071,
217
- "rewards/chosen": -1.107177972793579,
218
- "rewards/margins": 0.7300722599029541,
219
- "rewards/rejected": -1.8372503519058228,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.15,
224
- "grad_norm": 40.66462467188264,
225
  "learning_rate": 4.962695471250032e-07,
226
- "logits/chosen": -1.7266982793807983,
227
- "logits/rejected": -1.6543283462524414,
228
- "logps/chosen": -320.31195068359375,
229
- "logps/rejected": -359.983154296875,
230
- "loss": 0.4886,
231
- "rewards/accuracies": 0.762499988079071,
232
- "rewards/chosen": -1.0283275842666626,
233
- "rewards/margins": 0.7512324452400208,
234
- "rewards/rejected": -1.7795600891113281,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.16,
239
- "grad_norm": 45.88018498600559,
240
  "learning_rate": 4.94340033546025e-07,
241
- "logits/chosen": -1.4110041856765747,
242
- "logits/rejected": -1.3973127603530884,
243
- "logps/chosen": -312.18145751953125,
244
- "logps/rejected": -390.5517578125,
245
- "loss": 0.4739,
246
- "rewards/accuracies": 0.7562500238418579,
247
- "rewards/chosen": -1.310011863708496,
248
- "rewards/margins": 0.8049423098564148,
249
- "rewards/rejected": -2.1149544715881348,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.17,
254
- "grad_norm": 79.78754356153908,
255
  "learning_rate": 4.920152136576705e-07,
256
- "logits/chosen": -1.2265546321868896,
257
- "logits/rejected": -1.1716219186782837,
258
- "logps/chosen": -357.737060546875,
259
- "logps/rejected": -431.76806640625,
260
- "loss": 0.4655,
261
- "rewards/accuracies": 0.7875000238418579,
262
- "rewards/chosen": -1.4461175203323364,
263
- "rewards/margins": 0.9848885536193848,
264
- "rewards/rejected": -2.4310059547424316,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.18,
269
- "grad_norm": 40.08268655919122,
270
  "learning_rate": 4.892988486772756e-07,
271
- "logits/chosen": -1.2588635683059692,
272
- "logits/rejected": -1.1425318717956543,
273
- "logps/chosen": -354.57867431640625,
274
- "logps/rejected": -432.987060546875,
275
- "loss": 0.4787,
276
- "rewards/accuracies": 0.8125,
277
- "rewards/chosen": -1.4544165134429932,
278
- "rewards/margins": 0.9601584672927856,
279
- "rewards/rejected": -2.4145748615264893,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.2,
284
- "grad_norm": 37.08844280081501,
285
  "learning_rate": 4.861953332846629e-07,
286
- "logits/chosen": -1.0948612689971924,
287
- "logits/rejected": -0.9797511100769043,
288
- "logps/chosen": -370.5609436035156,
289
- "logps/rejected": -417.10418701171875,
290
- "loss": 0.4741,
291
  "rewards/accuracies": 0.7875000238418579,
292
- "rewards/chosen": -1.4464932680130005,
293
- "rewards/margins": 0.8114526867866516,
294
- "rewards/rejected": -2.257946014404297,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.21,
299
- "grad_norm": 51.03369267010431,
300
  "learning_rate": 4.827096885121953e-07,
301
- "logits/chosen": -0.9882611036300659,
302
- "logits/rejected": -0.786241888999939,
303
- "logps/chosen": -403.01361083984375,
304
- "logps/rejected": -465.450439453125,
305
- "loss": 0.4518,
306
- "rewards/accuracies": 0.7562500238418579,
307
- "rewards/chosen": -1.6873054504394531,
308
- "rewards/margins": 0.8884965181350708,
309
- "rewards/rejected": -2.5758020877838135,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.22,
314
- "grad_norm": 40.75117386512369,
315
  "learning_rate": 4.788475536214821e-07,
316
- "logits/chosen": -0.6994659900665283,
317
- "logits/rejected": -0.57302325963974,
318
- "logps/chosen": -345.23858642578125,
319
- "logps/rejected": -434.90069580078125,
320
- "loss": 0.4305,
321
- "rewards/accuracies": 0.831250011920929,
322
- "rewards/chosen": -1.5364990234375,
323
- "rewards/margins": 1.0722037553787231,
324
- "rewards/rejected": -2.6087028980255127,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.23,
329
- "grad_norm": 50.385160508667006,
330
  "learning_rate": 4.746151769798818e-07,
331
- "logits/chosen": -0.46505388617515564,
332
- "logits/rejected": -0.32105451822280884,
333
- "logps/chosen": -395.0636901855469,
334
- "logps/rejected": -491.369873046875,
335
- "loss": 0.4371,
336
- "rewards/accuracies": 0.800000011920929,
337
- "rewards/chosen": -1.5998367071151733,
338
- "rewards/margins": 1.3637341260910034,
339
- "rewards/rejected": -2.9635708332061768,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.23,
344
- "eval_logits/chosen": -0.8866692185401917,
345
- "eval_logits/rejected": -0.715141236782074,
346
- "eval_logps/chosen": -537.5919799804688,
347
- "eval_logps/rejected": -591.529052734375,
348
- "eval_loss": 0.5454351305961609,
349
- "eval_rewards/accuracies": 0.7421875,
350
- "eval_rewards/chosen": -2.0208044052124023,
351
- "eval_rewards/margins": 0.5634276270866394,
352
- "eval_rewards/rejected": -2.5842318534851074,
353
- "eval_runtime": 98.1521,
354
- "eval_samples_per_second": 20.377,
355
  "eval_steps_per_second": 0.326,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.24,
360
- "grad_norm": 44.17462139523744,
361
  "learning_rate": 4.7001940595156055e-07,
362
- "logits/chosen": -0.5879951119422913,
363
- "logits/rejected": -0.31766843795776367,
364
- "logps/chosen": -347.45184326171875,
365
- "logps/rejected": -442.23291015625,
366
- "loss": 0.466,
367
  "rewards/accuracies": 0.8062499761581421,
368
- "rewards/chosen": -1.621807336807251,
369
- "rewards/margins": 1.1228187084197998,
370
- "rewards/rejected": -2.7446258068084717,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.25,
375
- "grad_norm": 46.80720748583798,
376
  "learning_rate": 4.650676758194623e-07,
377
- "logits/chosen": -0.5494168996810913,
378
- "logits/rejected": -0.3329974114894867,
379
- "logps/chosen": -386.22528076171875,
380
- "logps/rejected": -472.072998046875,
381
- "loss": 0.419,
382
- "rewards/accuracies": 0.78125,
383
- "rewards/chosen": -1.6599994897842407,
384
- "rewards/margins": 1.2505383491516113,
385
- "rewards/rejected": -2.9105377197265625,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.26,
390
- "grad_norm": 43.28959440159286,
391
  "learning_rate": 4.5976799775611215e-07,
392
- "logits/chosen": -0.6910772919654846,
393
- "logits/rejected": -0.4287993013858795,
394
- "logps/chosen": -385.10784912109375,
395
- "logps/rejected": -484.22314453125,
396
- "loss": 0.43,
397
- "rewards/accuracies": 0.831250011920929,
398
- "rewards/chosen": -1.7417128086090088,
399
- "rewards/margins": 1.4360835552215576,
400
- "rewards/rejected": -3.1777961254119873,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.28,
405
- "grad_norm": 48.21494711877692,
406
  "learning_rate": 4.5412894586271543e-07,
407
- "logits/chosen": -0.3966357111930847,
408
- "logits/rejected": -0.13579869270324707,
409
- "logps/chosen": -405.3009338378906,
410
- "logps/rejected": -484.6737365722656,
411
- "loss": 0.4083,
412
- "rewards/accuracies": 0.800000011920929,
413
- "rewards/chosen": -1.8974357843399048,
414
- "rewards/margins": 1.3567252159118652,
415
- "rewards/rejected": -3.2541611194610596,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.29,
420
- "grad_norm": 42.352515667816355,
421
  "learning_rate": 4.481596432975201e-07,
422
- "logits/chosen": -0.6702763438224792,
423
- "logits/rejected": -0.49778255820274353,
424
- "logps/chosen": -340.3480224609375,
425
- "logps/rejected": -434.61376953125,
426
- "loss": 0.425,
427
- "rewards/accuracies": 0.737500011920929,
428
- "rewards/chosen": -1.6621681451797485,
429
- "rewards/margins": 1.0998741388320923,
430
- "rewards/rejected": -2.762042284011841,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.3,
435
- "grad_norm": 51.54256095538614,
436
  "learning_rate": 4.41869747515886e-07,
437
- "logits/chosen": -0.6597603559494019,
438
- "logits/rejected": -0.5498248338699341,
439
- "logps/chosen": -365.7995910644531,
440
- "logps/rejected": -490.1622009277344,
441
- "loss": 0.4244,
442
- "rewards/accuracies": 0.824999988079071,
443
- "rewards/chosen": -1.4162827730178833,
444
- "rewards/margins": 1.2882452011108398,
445
- "rewards/rejected": -2.7045278549194336,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.31,
450
- "grad_norm": 48.71803198385668,
451
  "learning_rate": 4.352694346459396e-07,
452
- "logits/chosen": 0.04401933029294014,
453
- "logits/rejected": 0.16322588920593262,
454
- "logps/chosen": -363.21539306640625,
455
- "logps/rejected": -463.6495056152344,
456
- "loss": 0.4206,
457
- "rewards/accuracies": 0.831250011920929,
458
- "rewards/chosen": -1.5739765167236328,
459
- "rewards/margins": 1.1849424839019775,
460
- "rewards/rejected": -2.7589190006256104,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.32,
465
- "grad_norm": 38.68223370724194,
466
  "learning_rate": 4.2836938302509256e-07,
467
- "logits/chosen": -0.13973233103752136,
468
- "logits/rejected": 0.19283699989318848,
469
- "logps/chosen": -328.5007019042969,
470
- "logps/rejected": -440.18365478515625,
471
- "loss": 0.4456,
472
- "rewards/accuracies": 0.8062499761581421,
473
- "rewards/chosen": -1.279756784439087,
474
- "rewards/margins": 1.4430491924285889,
475
- "rewards/rejected": -2.7228057384490967,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.33,
480
- "grad_norm": 45.704934038680605,
481
  "learning_rate": 4.2118075592405874e-07,
482
- "logits/chosen": 0.20580144226551056,
483
- "logits/rejected": 0.34621715545654297,
484
- "logps/chosen": -407.57373046875,
485
- "logps/rejected": -517.0430908203125,
486
- "loss": 0.4242,
487
- "rewards/accuracies": 0.78125,
488
- "rewards/chosen": -1.8687858581542969,
489
- "rewards/margins": 1.2867904901504517,
490
- "rewards/rejected": -3.155576229095459,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.35,
495
- "grad_norm": 48.006993514366904,
496
  "learning_rate": 4.137151834863213e-07,
497
- "logits/chosen": 0.6578917503356934,
498
- "logits/rejected": 0.7554408311843872,
499
- "logps/chosen": -349.4103088378906,
500
- "logps/rejected": -480.834228515625,
501
- "loss": 0.4348,
502
- "rewards/accuracies": 0.7875000238418579,
503
- "rewards/chosen": -1.828386664390564,
504
- "rewards/margins": 1.3594980239868164,
505
- "rewards/rejected": -3.18788480758667,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.35,
510
- "eval_logits/chosen": -0.5939264297485352,
511
- "eval_logits/rejected": -0.34991100430488586,
512
- "eval_logps/chosen": -545.4883422851562,
513
- "eval_logps/rejected": -617.2100830078125,
514
- "eval_loss": 0.5011798739433289,
515
- "eval_rewards/accuracies": 0.7734375,
516
- "eval_rewards/chosen": -2.0997684001922607,
517
- "eval_rewards/margins": 0.7412738800048828,
518
- "eval_rewards/rejected": -2.8410420417785645,
519
- "eval_runtime": 98.127,
520
- "eval_samples_per_second": 20.382,
521
  "eval_steps_per_second": 0.326,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.36,
526
- "grad_norm": 43.38987414729455,
527
  "learning_rate": 4.059847439122671e-07,
528
- "logits/chosen": 0.5874438285827637,
529
- "logits/rejected": 0.8824877738952637,
530
- "logps/chosen": -419.9178771972656,
531
- "logps/rejected": -517.2019653320312,
532
- "loss": 0.4149,
533
- "rewards/accuracies": 0.8062499761581421,
534
- "rewards/chosen": -2.0750081539154053,
535
- "rewards/margins": 1.2572228908538818,
536
- "rewards/rejected": -3.332231044769287,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.37,
541
- "grad_norm": 56.605050092804255,
542
  "learning_rate": 3.98001943918432e-07,
543
- "logits/chosen": 0.6735237836837769,
544
- "logits/rejected": 1.019078254699707,
545
- "logps/chosen": -373.03009033203125,
546
- "logps/rejected": -483.0083923339844,
547
- "loss": 0.4049,
548
- "rewards/accuracies": 0.793749988079071,
549
- "rewards/chosen": -1.7667083740234375,
550
- "rewards/margins": 1.1942052841186523,
551
- "rewards/rejected": -2.960913896560669,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.38,
556
- "grad_norm": 57.81664075376147,
557
  "learning_rate": 3.8977969850346866e-07,
558
- "logits/chosen": 0.4839138090610504,
559
- "logits/rejected": 0.8274878263473511,
560
- "logps/chosen": -387.33673095703125,
561
- "logps/rejected": -499.78094482421875,
562
- "loss": 0.4004,
563
- "rewards/accuracies": 0.7875000238418579,
564
- "rewards/chosen": -1.7052650451660156,
565
- "rewards/margins": 1.477137565612793,
566
- "rewards/rejected": -3.1824028491973877,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.39,
571
- "grad_norm": 50.66567087546677,
572
  "learning_rate": 3.8133131005357465e-07,
573
- "logits/chosen": 0.23904335498809814,
574
- "logits/rejected": 0.6436888575553894,
575
- "logps/chosen": -374.50750732421875,
576
- "logps/rejected": -534.21435546875,
577
- "loss": 0.3943,
578
- "rewards/accuracies": 0.8374999761581421,
579
- "rewards/chosen": -1.705120325088501,
580
- "rewards/margins": 1.7923282384872437,
581
- "rewards/rejected": -3.497448444366455,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.4,
586
- "grad_norm": 41.43510772615216,
587
  "learning_rate": 3.7267044682118435e-07,
588
- "logits/chosen": 0.3483354449272156,
589
- "logits/rejected": 0.6899020075798035,
590
- "logps/chosen": -369.47418212890625,
591
- "logps/rejected": -496.38262939453125,
592
- "loss": 0.3884,
593
- "rewards/accuracies": 0.8374999761581421,
594
- "rewards/chosen": -1.8410135507583618,
595
- "rewards/margins": 1.4833061695098877,
596
- "rewards/rejected": -3.324319362640381,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.41,
601
- "grad_norm": 46.89248795203356,
602
  "learning_rate": 3.638111208117425e-07,
603
- "logits/chosen": 0.22267869114875793,
604
- "logits/rejected": 0.4508979916572571,
605
- "logps/chosen": -409.98974609375,
606
- "logps/rejected": -508.88055419921875,
607
- "loss": 0.4111,
608
- "rewards/accuracies": 0.75,
609
- "rewards/chosen": -2.0787599086761475,
610
- "rewards/margins": 1.0934727191925049,
611
- "rewards/rejected": -3.1722328662872314,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.43,
616
- "grad_norm": 43.02323311612351,
617
  "learning_rate": 3.5476766511433605e-07,
618
- "logits/chosen": 0.1800430715084076,
619
- "logits/rejected": 0.6425480842590332,
620
- "logps/chosen": -431.10736083984375,
621
- "logps/rejected": -516.4458618164062,
622
- "loss": 0.4194,
623
- "rewards/accuracies": 0.7749999761581421,
624
- "rewards/chosen": -1.9750921726226807,
625
- "rewards/margins": 1.3207170963287354,
626
- "rewards/rejected": -3.295809268951416,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.44,
631
- "grad_norm": 43.154999607698095,
632
  "learning_rate": 3.455547107128602e-07,
633
- "logits/chosen": 0.3740110993385315,
634
- "logits/rejected": 0.8220480680465698,
635
- "logps/chosen": -410.6556701660156,
636
- "logps/rejected": -515.9549560546875,
637
- "loss": 0.3767,
638
- "rewards/accuracies": 0.8500000238418579,
639
- "rewards/chosen": -1.596968412399292,
640
- "rewards/margins": 1.6267616748809814,
641
- "rewards/rejected": -3.2237300872802734,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.45,
646
- "grad_norm": 56.90068596534485,
647
  "learning_rate": 3.361871628152338e-07,
648
- "logits/chosen": 0.6576219797134399,
649
- "logits/rejected": 1.0373657941818237,
650
- "logps/chosen": -398.47906494140625,
651
- "logps/rejected": -556.7415771484375,
652
- "loss": 0.4239,
653
- "rewards/accuracies": 0.768750011920929,
654
- "rewards/chosen": -1.9783694744110107,
655
- "rewards/margins": 1.5746887922286987,
656
- "rewards/rejected": -3.55305814743042,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.46,
661
- "grad_norm": 41.49097538770333,
662
  "learning_rate": 3.2668017673896077e-07,
663
- "logits/chosen": 0.6066378355026245,
664
- "logits/rejected": 1.0441324710845947,
665
- "logps/chosen": -376.2064514160156,
666
- "logps/rejected": -497.462890625,
667
- "loss": 0.3733,
668
- "rewards/accuracies": 0.831250011920929,
669
- "rewards/chosen": -1.7407310009002686,
670
- "rewards/margins": 1.581956148147583,
671
- "rewards/rejected": -3.3226871490478516,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.46,
676
- "eval_logits/chosen": -0.5456388592720032,
677
- "eval_logits/rejected": -0.2280205935239792,
678
- "eval_logps/chosen": -550.5716552734375,
679
- "eval_logps/rejected": -626.190185546875,
680
- "eval_loss": 0.47210657596588135,
681
- "eval_rewards/accuracies": 0.77734375,
682
- "eval_rewards/chosen": -2.1506011486053467,
683
- "eval_rewards/margins": 0.7802413105964661,
684
- "eval_rewards/rejected": -2.930842399597168,
685
- "eval_runtime": 98.1161,
686
- "eval_samples_per_second": 20.384,
687
- "eval_steps_per_second": 0.326,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.47,
692
- "grad_norm": 47.55353494901972,
693
  "learning_rate": 3.1704913339205103e-07,
694
- "logits/chosen": 0.5084329843521118,
695
- "logits/rejected": 0.796318531036377,
696
- "logps/chosen": -409.43585205078125,
697
- "logps/rejected": -561.5556030273438,
698
- "loss": 0.3928,
699
- "rewards/accuracies": 0.824999988079071,
700
- "rewards/chosen": -1.992550253868103,
701
- "rewards/margins": 1.6422802209854126,
702
- "rewards/rejected": -3.6348299980163574,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.48,
707
- "grad_norm": 41.646877730648264,
708
  "learning_rate": 3.0730961438896885e-07,
709
- "logits/chosen": 0.4776241183280945,
710
- "logits/rejected": 0.7627217769622803,
711
- "logps/chosen": -482.1835021972656,
712
- "logps/rejected": -587.5792236328125,
713
- "loss": 0.3881,
714
- "rewards/accuracies": 0.793749988079071,
715
- "rewards/chosen": -2.093543767929077,
716
- "rewards/margins": 1.4904192686080933,
717
- "rewards/rejected": -3.583962917327881,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.5,
722
- "grad_norm": 68.32669660083764,
723
  "learning_rate": 2.9747737684186795e-07,
724
- "logits/chosen": 0.7197389602661133,
725
- "logits/rejected": 0.8317638635635376,
726
- "logps/chosen": -388.28656005859375,
727
- "logps/rejected": -509.2151794433594,
728
- "loss": 0.3841,
729
  "rewards/accuracies": 0.8125,
730
- "rewards/chosen": -1.7809364795684814,
731
- "rewards/margins": 1.5095723867416382,
732
- "rewards/rejected": -3.290508985519409,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.51,
737
- "grad_norm": 46.78192200543751,
738
  "learning_rate": 2.8756832786789663e-07,
739
- "logits/chosen": 0.3376988172531128,
740
- "logits/rejected": 0.8295138478279114,
741
- "logps/chosen": -403.0928649902344,
742
- "logps/rejected": -518.611083984375,
743
- "loss": 0.4029,
744
  "rewards/accuracies": 0.8187500238418579,
745
- "rewards/chosen": -1.744091272354126,
746
- "rewards/margins": 1.5630067586898804,
747
- "rewards/rejected": -3.307097911834717,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.52,
752
- "grad_norm": 49.72034219777285,
753
  "learning_rate": 2.7759849885381747e-07,
754
- "logits/chosen": 0.3917238414287567,
755
- "logits/rejected": 0.9007431268692017,
756
- "logps/chosen": -451.806884765625,
757
- "logps/rejected": -584.4218139648438,
758
- "loss": 0.3785,
759
- "rewards/accuracies": 0.8374999761581421,
760
- "rewards/chosen": -2.0996882915496826,
761
- "rewards/margins": 1.9295704364776611,
762
- "rewards/rejected": -4.029258728027344,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.53,
767
- "grad_norm": 38.3046078852496,
768
  "learning_rate": 2.675840195195762e-07,
769
- "logits/chosen": 0.1938302218914032,
770
- "logits/rejected": 0.7046247720718384,
771
- "logps/chosen": -375.27606201171875,
772
- "logps/rejected": -523.9801025390625,
773
- "loss": 0.3934,
774
- "rewards/accuracies": 0.8062499761581421,
775
- "rewards/chosen": -1.7863012552261353,
776
- "rewards/margins": 1.6471843719482422,
777
- "rewards/rejected": -3.433485507965088,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.54,
782
- "grad_norm": 39.056692194028,
783
  "learning_rate": 2.575410918227829e-07,
784
- "logits/chosen": 0.09105312079191208,
785
- "logits/rejected": 0.5196784138679504,
786
- "logps/chosen": -413.9867248535156,
787
- "logps/rejected": -532.4803466796875,
788
- "loss": 0.3755,
789
- "rewards/accuracies": 0.8062499761581421,
790
- "rewards/chosen": -1.720029592514038,
791
- "rewards/margins": 1.528271198272705,
792
- "rewards/rejected": -3.2483010292053223,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 0.55,
797
- "grad_norm": 46.6868254294557,
798
  "learning_rate": 2.474859637463226e-07,
799
- "logits/chosen": 0.21693472564220428,
800
- "logits/rejected": 0.8155421018600464,
801
- "logps/chosen": -418.37652587890625,
802
- "logps/rejected": -540.866455078125,
803
- "loss": 0.3846,
804
- "rewards/accuracies": 0.7749999761581421,
805
- "rewards/chosen": -1.9778916835784912,
806
- "rewards/margins": 1.7564996480941772,
807
- "rewards/rejected": -3.734391450881958,
808
  "step": 480
809
  },
810
  {
811
  "epoch": 0.56,
812
- "grad_norm": 45.512117273870444,
813
  "learning_rate": 2.3743490301150355e-07,
814
- "logits/chosen": 0.2570355236530304,
815
- "logits/rejected": 0.8997817039489746,
816
- "logps/chosen": -381.27801513671875,
817
- "logps/rejected": -525.5377807617188,
818
- "loss": 0.4012,
819
- "rewards/accuracies": 0.84375,
820
- "rewards/chosen": -1.6076080799102783,
821
- "rewards/margins": 1.8304884433746338,
822
- "rewards/rejected": -3.438096523284912,
823
  "step": 490
824
  },
825
  {
826
  "epoch": 0.58,
827
- "grad_norm": 46.09704078060399,
828
  "learning_rate": 2.274041707592724e-07,
829
- "logits/chosen": 0.7786660194396973,
830
- "logits/rejected": 1.2057403326034546,
831
- "logps/chosen": -416.14068603515625,
832
- "logps/rejected": -602.9859008789062,
833
- "loss": 0.3689,
834
- "rewards/accuracies": 0.800000011920929,
835
- "rewards/chosen": -2.2852025032043457,
836
- "rewards/margins": 1.9095999002456665,
837
- "rewards/rejected": -4.194802284240723,
838
  "step": 500
839
  },
840
  {
841
  "epoch": 0.58,
842
- "eval_logits/chosen": -0.4774431586265564,
843
- "eval_logits/rejected": -0.1090613454580307,
844
- "eval_logps/chosen": -540.1826171875,
845
- "eval_logps/rejected": -627.9595336914062,
846
- "eval_loss": 0.448412150144577,
847
- "eval_rewards/accuracies": 0.796875,
848
- "eval_rewards/chosen": -2.046710968017578,
849
- "eval_rewards/margins": 0.9018256068229675,
850
- "eval_rewards/rejected": -2.9485368728637695,
851
- "eval_runtime": 98.1848,
852
- "eval_samples_per_second": 20.37,
853
- "eval_steps_per_second": 0.326,
854
  "step": 500
855
  },
856
  {
857
  "epoch": 0.59,
858
- "grad_norm": 42.744213876119844,
859
  "learning_rate": 2.17409995242075e-07,
860
- "logits/chosen": 0.6994825005531311,
861
- "logits/rejected": 1.289393663406372,
862
- "logps/chosen": -405.2342224121094,
863
- "logps/rejected": -555.2643432617188,
864
- "loss": 0.3921,
865
- "rewards/accuracies": 0.8187500238418579,
866
- "rewards/chosen": -2.062455415725708,
867
- "rewards/margins": 1.8831449747085571,
868
- "rewards/rejected": -3.9456000328063965,
869
  "step": 510
870
  },
871
  {
872
  "epoch": 0.6,
873
- "grad_norm": 44.25862131066792,
874
  "learning_rate": 2.0746854556892544e-07,
875
- "logits/chosen": 0.7421714067459106,
876
- "logits/rejected": 0.9166728258132935,
877
- "logps/chosen": -363.72222900390625,
878
- "logps/rejected": -499.4908752441406,
879
- "loss": 0.4102,
880
- "rewards/accuracies": 0.78125,
881
- "rewards/chosen": -1.795539140701294,
882
- "rewards/margins": 1.4331713914871216,
883
- "rewards/rejected": -3.228710889816284,
884
  "step": 520
885
  },
886
  {
887
  "epoch": 0.61,
888
- "grad_norm": 40.42456029676201,
889
  "learning_rate": 1.9759590554616173e-07,
890
- "logits/chosen": 0.2788628935813904,
891
- "logits/rejected": 0.5978427529335022,
892
- "logps/chosen": -387.8989562988281,
893
- "logps/rejected": -499.9576110839844,
894
- "loss": 0.4053,
895
- "rewards/accuracies": 0.78125,
896
- "rewards/chosen": -1.6805702447891235,
897
- "rewards/margins": 1.3731516599655151,
898
- "rewards/rejected": -3.0537219047546387,
899
  "step": 530
900
  },
901
  {
902
  "epoch": 0.62,
903
- "grad_norm": 43.79592437572997,
904
  "learning_rate": 1.8780804765620746e-07,
905
- "logits/chosen": 0.37570881843566895,
906
- "logits/rejected": 0.5200439691543579,
907
- "logps/chosen": -394.23284912109375,
908
- "logps/rejected": -548.2333374023438,
909
- "loss": 0.384,
910
- "rewards/accuracies": 0.800000011920929,
911
- "rewards/chosen": -1.5941615104675293,
912
- "rewards/margins": 1.527552843093872,
913
- "rewards/rejected": -3.1217141151428223,
914
  "step": 540
915
  },
916
  {
917
  "epoch": 0.63,
918
- "grad_norm": 44.94669101797897,
919
  "learning_rate": 1.7812080721643973e-07,
920
- "logits/chosen": 0.6379637122154236,
921
- "logits/rejected": 1.1335102319717407,
922
- "logps/chosen": -422.62200927734375,
923
- "logps/rejected": -535.2354736328125,
924
- "loss": 0.3932,
925
- "rewards/accuracies": 0.8374999761581421,
926
- "rewards/chosen": -2.0112552642822266,
927
- "rewards/margins": 1.6570736169815063,
928
- "rewards/rejected": -3.6683287620544434,
929
  "step": 550
930
  },
931
  {
932
  "epoch": 0.64,
933
- "grad_norm": 48.51576878403802,
934
  "learning_rate": 1.6854985675997063e-07,
935
- "logits/chosen": 0.5151522755622864,
936
- "logits/rejected": 0.9227844476699829,
937
- "logps/chosen": -410.75244140625,
938
- "logps/rejected": -543.8304443359375,
939
- "loss": 0.3729,
940
- "rewards/accuracies": 0.8062499761581421,
941
- "rewards/chosen": -1.9362386465072632,
942
- "rewards/margins": 1.548099398612976,
943
- "rewards/rejected": -3.4843380451202393,
944
  "step": 560
945
  },
946
  {
947
  "epoch": 0.66,
948
- "grad_norm": 42.77055197730572,
949
  "learning_rate": 1.5911068067978818e-07,
950
- "logits/chosen": 0.7765737771987915,
951
- "logits/rejected": 0.9592781066894531,
952
- "logps/chosen": -391.6842041015625,
953
- "logps/rejected": -575.3435668945312,
954
- "loss": 0.3642,
955
- "rewards/accuracies": 0.7749999761581421,
956
- "rewards/chosen": -2.0532357692718506,
957
- "rewards/margins": 1.811832070350647,
958
- "rewards/rejected": -3.865067720413208,
959
  "step": 570
960
  },
961
  {
962
  "epoch": 0.67,
963
- "grad_norm": 51.09604434640814,
964
  "learning_rate": 1.4981855017728197e-07,
965
- "logits/chosen": 0.596177875995636,
966
- "logits/rejected": 0.7803729772567749,
967
- "logps/chosen": -459.51422119140625,
968
- "logps/rejected": -612.7260131835938,
969
- "loss": 0.388,
970
- "rewards/accuracies": 0.768750011920929,
971
- "rewards/chosen": -2.421908140182495,
972
- "rewards/margins": 1.5485522747039795,
973
- "rewards/rejected": -3.9704601764678955,
974
  "step": 580
975
  },
976
  {
977
  "epoch": 0.68,
978
- "grad_norm": 51.69715596466598,
979
  "learning_rate": 1.406884985556804e-07,
980
- "logits/chosen": 0.6335197687149048,
981
- "logits/rejected": 1.1092630624771118,
982
- "logps/chosen": -429.76690673828125,
983
- "logps/rejected": -580.2468872070312,
984
- "loss": 0.3807,
985
- "rewards/accuracies": 0.824999988079071,
986
- "rewards/chosen": -2.2301127910614014,
987
- "rewards/margins": 1.8223087787628174,
988
- "rewards/rejected": -4.052420616149902,
989
  "step": 590
990
  },
991
  {
992
  "epoch": 0.69,
993
- "grad_norm": 48.435911535292384,
994
  "learning_rate": 1.3173529689837354e-07,
995
- "logits/chosen": 0.5912660956382751,
996
- "logits/rejected": 1.1899088621139526,
997
- "logps/chosen": -393.476318359375,
998
- "logps/rejected": -521.782958984375,
999
- "loss": 0.3829,
1000
  "rewards/accuracies": 0.800000011920929,
1001
- "rewards/chosen": -1.8528053760528564,
1002
- "rewards/margins": 1.6730989217758179,
1003
- "rewards/rejected": -3.5259041786193848,
1004
  "step": 600
1005
  },
1006
  {
1007
  "epoch": 0.69,
1008
- "eval_logits/chosen": -0.509851336479187,
1009
- "eval_logits/rejected": -0.14121857285499573,
1010
- "eval_logps/chosen": -538.1624145507812,
1011
- "eval_logps/rejected": -623.8541259765625,
1012
- "eval_loss": 0.44193577766418457,
1013
- "eval_rewards/accuracies": 0.80859375,
1014
- "eval_rewards/chosen": -2.0265088081359863,
1015
- "eval_rewards/margins": 0.8809735774993896,
1016
- "eval_rewards/rejected": -2.907482147216797,
1017
- "eval_runtime": 98.167,
1018
- "eval_samples_per_second": 20.373,
1019
- "eval_steps_per_second": 0.326,
1020
  "step": 600
1021
  },
1022
  {
1023
  "epoch": 0.7,
1024
- "grad_norm": 48.985755457205066,
1025
  "learning_rate": 1.2297343017146726e-07,
1026
- "logits/chosen": 0.7694305181503296,
1027
- "logits/rejected": 1.232879877090454,
1028
- "logps/chosen": -402.1836853027344,
1029
- "logps/rejected": -533.408447265625,
1030
- "loss": 0.3929,
1031
- "rewards/accuracies": 0.831250011920929,
1032
- "rewards/chosen": -1.905542016029358,
1033
- "rewards/margins": 1.6221548318862915,
1034
- "rewards/rejected": -3.5276970863342285,
1035
  "step": 610
1036
  },
1037
  {
1038
  "epoch": 0.71,
1039
- "grad_norm": 45.28513242475784,
1040
  "learning_rate": 1.1441707378923474e-07,
1041
- "logits/chosen": 0.5253760814666748,
1042
- "logits/rejected": 1.0413273572921753,
1043
- "logps/chosen": -359.5643615722656,
1044
- "logps/rejected": -514.2081909179688,
1045
- "loss": 0.3806,
1046
- "rewards/accuracies": 0.8500000238418579,
1047
- "rewards/chosen": -1.579487919807434,
1048
- "rewards/margins": 1.767327070236206,
1049
- "rewards/rejected": -3.3468146324157715,
1050
  "step": 620
1051
  },
1052
  {
1053
  "epoch": 0.73,
1054
- "grad_norm": 47.72652227607087,
1055
  "learning_rate": 1.06080070680377e-07,
1056
- "logits/chosen": 0.4920094907283783,
1057
- "logits/rejected": 1.009433627128601,
1058
- "logps/chosen": -399.2576599121094,
1059
- "logps/rejected": -537.9578247070312,
1060
- "loss": 0.3821,
1061
- "rewards/accuracies": 0.8374999761581421,
1062
- "rewards/chosen": -1.7137682437896729,
1063
- "rewards/margins": 1.7276941537857056,
1064
- "rewards/rejected": -3.441462755203247,
1065
  "step": 630
1066
  },
1067
  {
1068
  "epoch": 0.74,
1069
- "grad_norm": 42.1168430015071,
1070
  "learning_rate": 9.797590889219587e-08,
1071
- "logits/chosen": 0.3111940026283264,
1072
- "logits/rejected": 0.8665814399719238,
1073
- "logps/chosen": -396.842529296875,
1074
- "logps/rejected": -543.9876098632812,
1075
- "loss": 0.3843,
1076
  "rewards/accuracies": 0.8374999761581421,
1077
- "rewards/chosen": -1.7231413125991821,
1078
- "rewards/margins": 1.8444896936416626,
1079
- "rewards/rejected": -3.567631244659424,
1080
  "step": 640
1081
  },
1082
  {
1083
  "epoch": 0.75,
1084
- "grad_norm": 47.41933670532933,
1085
  "learning_rate": 9.011769976891367e-08,
1086
- "logits/chosen": 0.4944031834602356,
1087
- "logits/rejected": 0.8744715452194214,
1088
- "logps/chosen": -398.05615234375,
1089
- "logps/rejected": -543.6096801757812,
1090
- "loss": 0.3763,
1091
- "rewards/accuracies": 0.8374999761581421,
1092
- "rewards/chosen": -1.8718990087509155,
1093
- "rewards/margins": 1.6193087100982666,
1094
- "rewards/rejected": -3.4912078380584717,
1095
  "step": 650
1096
  },
1097
  {
1098
  "epoch": 0.76,
1099
- "grad_norm": 52.75260796298546,
1100
  "learning_rate": 8.251815673944218e-08,
1101
- "logits/chosen": 0.5813334584236145,
1102
- "logits/rejected": 0.9786221385002136,
1103
- "logps/chosen": -443.66070556640625,
1104
- "logps/rejected": -576.3490600585938,
1105
- "loss": 0.3822,
1106
- "rewards/accuracies": 0.8062499761581421,
1107
- "rewards/chosen": -2.220869779586792,
1108
- "rewards/margins": 1.742889404296875,
1109
- "rewards/rejected": -3.963758945465088,
1110
  "step": 660
1111
  },
1112
  {
1113
  "epoch": 0.77,
1114
- "grad_norm": 46.64520061062158,
1115
  "learning_rate": 7.518957474892148e-08,
1116
- "logits/chosen": 0.6128578186035156,
1117
- "logits/rejected": 1.1231881380081177,
1118
- "logps/chosen": -427.1106872558594,
1119
- "logps/rejected": -589.3102416992188,
1120
- "loss": 0.3662,
1121
- "rewards/accuracies": 0.862500011920929,
1122
- "rewards/chosen": -2.0986740589141846,
1123
- "rewards/margins": 1.9974746704101562,
1124
- "rewards/rejected": -4.096148490905762,
1125
  "step": 670
1126
  },
1127
  {
1128
  "epoch": 0.78,
1129
- "grad_norm": 44.32719204523107,
1130
  "learning_rate": 6.814381036730274e-08,
1131
- "logits/chosen": 0.44363918900489807,
1132
- "logits/rejected": 0.8115978240966797,
1133
- "logps/chosen": -397.6707763671875,
1134
- "logps/rejected": -538.56591796875,
1135
- "loss": 0.3962,
1136
- "rewards/accuracies": 0.7875000238418579,
1137
- "rewards/chosen": -1.9131567478179932,
1138
- "rewards/margins": 1.6610631942749023,
1139
- "rewards/rejected": -3.5742194652557373,
1140
  "step": 680
1141
  },
1142
  {
1143
  "epoch": 0.79,
1144
- "grad_norm": 43.744460103075866,
1145
  "learning_rate": 6.139226260715872e-08,
1146
- "logits/chosen": 0.34574732184410095,
1147
- "logits/rejected": 0.7309020161628723,
1148
- "logps/chosen": -390.32464599609375,
1149
- "logps/rejected": -550.9197998046875,
1150
- "loss": 0.3747,
1151
- "rewards/accuracies": 0.84375,
1152
- "rewards/chosen": -1.7989364862442017,
1153
- "rewards/margins": 1.8234875202178955,
1154
- "rewards/rejected": -3.622424364089966,
1155
  "step": 690
1156
  },
1157
  {
1158
  "epoch": 0.81,
1159
- "grad_norm": 48.21671181557863,
1160
  "learning_rate": 5.4945854481754734e-08,
1161
- "logits/chosen": 0.4160235822200775,
1162
- "logits/rejected": 1.0240848064422607,
1163
- "logps/chosen": -393.590576171875,
1164
- "logps/rejected": -540.9241333007812,
1165
- "loss": 0.3725,
1166
- "rewards/accuracies": 0.768750011920929,
1167
- "rewards/chosen": -1.9948371648788452,
1168
- "rewards/margins": 1.689173936843872,
1169
- "rewards/rejected": -3.6840109825134277,
1170
  "step": 700
1171
  },
1172
  {
1173
  "epoch": 0.81,
1174
- "eval_logits/chosen": -0.6920372247695923,
1175
- "eval_logits/rejected": -0.3223564326763153,
1176
- "eval_logps/chosen": -527.349609375,
1177
- "eval_logps/rejected": -613.8932495117188,
1178
- "eval_loss": 0.43294557929039,
1179
- "eval_rewards/accuracies": 0.82421875,
1180
- "eval_rewards/chosen": -1.9183804988861084,
1181
- "eval_rewards/margins": 0.8894931077957153,
1182
- "eval_rewards/rejected": -2.8078737258911133,
1183
- "eval_runtime": 98.1374,
1184
- "eval_samples_per_second": 20.38,
1185
  "eval_steps_per_second": 0.326,
1186
  "step": 700
1187
  },
1188
  {
1189
  "epoch": 0.82,
1190
- "grad_norm": 42.53084626680963,
1191
  "learning_rate": 4.881501533321605e-08,
1192
- "logits/chosen": 0.6980074048042297,
1193
- "logits/rejected": 1.0298550128936768,
1194
- "logps/chosen": -367.0564880371094,
1195
- "logps/rejected": -539.99560546875,
1196
- "loss": 0.3547,
1197
- "rewards/accuracies": 0.84375,
1198
- "rewards/chosen": -1.8503217697143555,
1199
- "rewards/margins": 1.9031312465667725,
1200
- "rewards/rejected": -3.753452777862549,
1201
  "step": 710
1202
  },
1203
  {
1204
  "epoch": 0.83,
1205
- "grad_norm": 43.590506229310456,
1206
  "learning_rate": 4.300966395938377e-08,
1207
- "logits/chosen": 0.35197392106056213,
1208
- "logits/rejected": 0.8350766897201538,
1209
- "logps/chosen": -427.9037170410156,
1210
- "logps/rejected": -580.8751831054688,
1211
- "loss": 0.3788,
1212
- "rewards/accuracies": 0.793749988079071,
1213
- "rewards/chosen": -2.0454230308532715,
1214
- "rewards/margins": 1.8579833507537842,
1215
- "rewards/rejected": -3.9034061431884766,
1216
  "step": 720
1217
  },
1218
  {
1219
  "epoch": 0.84,
1220
- "grad_norm": 47.15415328548373,
1221
  "learning_rate": 3.7539192566655246e-08,
1222
- "logits/chosen": 0.3688026964664459,
1223
- "logits/rejected": 0.7924972772598267,
1224
- "logps/chosen": -387.2108459472656,
1225
- "logps/rejected": -532.4842529296875,
1226
- "loss": 0.3762,
1227
- "rewards/accuracies": 0.856249988079071,
1228
- "rewards/chosen": -1.6555856466293335,
1229
- "rewards/margins": 1.8027565479278564,
1230
- "rewards/rejected": -3.4583423137664795,
1231
  "step": 730
1232
  },
1233
  {
1234
  "epoch": 0.85,
1235
- "grad_norm": 41.72651096064494,
1236
  "learning_rate": 3.24124515747731e-08,
1237
- "logits/chosen": 0.4526204466819763,
1238
- "logits/rejected": 0.7684503793716431,
1239
- "logps/chosen": -406.00042724609375,
1240
- "logps/rejected": -571.0294189453125,
1241
- "loss": 0.3881,
1242
- "rewards/accuracies": 0.7875000238418579,
1243
- "rewards/chosen": -2.0768213272094727,
1244
- "rewards/margins": 1.7967207431793213,
1245
- "rewards/rejected": -3.8735415935516357,
1246
  "step": 740
1247
  },
1248
  {
1249
  "epoch": 0.86,
1250
- "grad_norm": 47.004010938683734,
1251
  "learning_rate": 2.763773529814506e-08,
1252
- "logits/chosen": 0.24592173099517822,
1253
- "logits/rejected": 0.5948923826217651,
1254
- "logps/chosen": -437.3650817871094,
1255
- "logps/rejected": -581.8604125976562,
1256
- "loss": 0.3772,
1257
- "rewards/accuracies": 0.831250011920929,
1258
- "rewards/chosen": -1.974538803100586,
1259
- "rewards/margins": 1.9196981191635132,
1260
- "rewards/rejected": -3.8942363262176514,
1261
  "step": 750
1262
  },
1263
  {
1264
  "epoch": 0.88,
1265
- "grad_norm": 56.33205281532714,
1266
  "learning_rate": 2.3222768526860698e-08,
1267
- "logits/chosen": 0.2990577220916748,
1268
- "logits/rejected": 0.7854124903678894,
1269
- "logps/chosen": -404.5032653808594,
1270
- "logps/rejected": -561.688720703125,
1271
- "loss": 0.3938,
1272
- "rewards/accuracies": 0.8374999761581421,
1273
- "rewards/chosen": -1.9624149799346924,
1274
- "rewards/margins": 1.855332374572754,
1275
- "rewards/rejected": -3.8177475929260254,
1276
  "step": 760
1277
  },
1278
  {
1279
  "epoch": 0.89,
1280
- "grad_norm": 43.51724396608159,
1281
  "learning_rate": 1.9174694029115146e-08,
1282
- "logits/chosen": 0.18542930483818054,
1283
- "logits/rejected": 0.5257433652877808,
1284
- "logps/chosen": -424.1546325683594,
1285
- "logps/rejected": -532.9678344726562,
1286
- "loss": 0.3879,
1287
- "rewards/accuracies": 0.768750011920929,
1288
- "rewards/chosen": -1.8116706609725952,
1289
- "rewards/margins": 1.6489944458007812,
1290
- "rewards/rejected": -3.460665225982666,
1291
  "step": 770
1292
  },
1293
  {
1294
  "epoch": 0.9,
1295
- "grad_norm": 51.298202533295,
1296
  "learning_rate": 1.5500060995258134e-08,
1297
- "logits/chosen": 0.3892073333263397,
1298
- "logits/rejected": 0.8499504327774048,
1299
- "logps/chosen": -402.9557189941406,
1300
- "logps/rejected": -541.4577026367188,
1301
- "loss": 0.349,
1302
- "rewards/accuracies": 0.8687499761581421,
1303
- "rewards/chosen": -1.8981506824493408,
1304
- "rewards/margins": 1.7807424068450928,
1305
- "rewards/rejected": -3.6788933277130127,
1306
  "step": 780
1307
  },
1308
  {
1309
  "epoch": 0.91,
1310
- "grad_norm": 56.6017962844276,
1311
  "learning_rate": 1.2204814442165812e-08,
1312
- "logits/chosen": 0.3551040589809418,
1313
- "logits/rejected": 0.8326929807662964,
1314
- "logps/chosen": -402.6451416015625,
1315
- "logps/rejected": -552.5445556640625,
1316
- "loss": 0.386,
1317
- "rewards/accuracies": 0.831250011920929,
1318
- "rewards/chosen": -1.9424476623535156,
1319
- "rewards/margins": 1.9533637762069702,
1320
- "rewards/rejected": -3.8958117961883545,
1321
  "step": 790
1322
  },
1323
  {
1324
  "epoch": 0.92,
1325
- "grad_norm": 49.36333315496645,
1326
  "learning_rate": 9.294285595075669e-09,
1327
- "logits/chosen": 0.06378497928380966,
1328
- "logits/rejected": 0.5464959144592285,
1329
- "logps/chosen": -430.5462951660156,
1330
- "logps/rejected": -562.2453002929688,
1331
- "loss": 0.4052,
1332
- "rewards/accuracies": 0.7875000238418579,
1333
- "rewards/chosen": -1.9655787944793701,
1334
- "rewards/margins": 1.7598493099212646,
1335
- "rewards/rejected": -3.7254281044006348,
1336
  "step": 800
1337
  },
1338
  {
1339
  "epoch": 0.92,
1340
- "eval_logits/chosen": -0.8024855852127075,
1341
- "eval_logits/rejected": -0.4436371624469757,
1342
- "eval_logps/chosen": -524.2042236328125,
1343
- "eval_logps/rejected": -612.2493286132812,
1344
- "eval_loss": 0.42916327714920044,
1345
- "eval_rewards/accuracies": 0.82421875,
1346
- "eval_rewards/chosen": -1.8869271278381348,
1347
- "eval_rewards/margins": 0.9045072793960571,
1348
- "eval_rewards/rejected": -2.7914342880249023,
1349
- "eval_runtime": 98.1154,
1350
- "eval_samples_per_second": 20.384,
1351
- "eval_steps_per_second": 0.326,
1352
  "step": 800
1353
  },
1354
  {
1355
  "epoch": 0.93,
1356
- "grad_norm": 48.45659164140374,
1357
  "learning_rate": 6.773183262446914e-09,
1358
- "logits/chosen": 0.2793930172920227,
1359
- "logits/rejected": 0.8751212954521179,
1360
- "logps/chosen": -400.6767883300781,
1361
- "logps/rejected": -544.5294799804688,
1362
- "loss": 0.3882,
1363
- "rewards/accuracies": 0.78125,
1364
- "rewards/chosen": -1.9480419158935547,
1365
- "rewards/margins": 1.711806297302246,
1366
- "rewards/rejected": -3.65984845161438,
1367
  "step": 810
1368
  },
1369
  {
1370
  "epoch": 0.94,
1371
- "grad_norm": 48.21463789648397,
1372
  "learning_rate": 4.645586217799452e-09,
1373
- "logits/chosen": 0.24326184391975403,
1374
- "logits/rejected": 0.6566700339317322,
1375
- "logps/chosen": -410.050537109375,
1376
- "logps/rejected": -576.2342529296875,
1377
- "loss": 0.4036,
1378
- "rewards/accuracies": 0.8187500238418579,
1379
- "rewards/chosen": -1.893699049949646,
1380
- "rewards/margins": 1.8373210430145264,
1381
- "rewards/rejected": -3.731020450592041,
1382
  "step": 820
1383
  },
1384
  {
1385
  "epoch": 0.96,
1386
- "grad_norm": 52.40196558130504,
1387
  "learning_rate": 2.9149366008568987e-09,
1388
- "logits/chosen": 0.2516610622406006,
1389
- "logits/rejected": 0.6028949022293091,
1390
- "logps/chosen": -397.42755126953125,
1391
- "logps/rejected": -558.4515380859375,
1392
- "loss": 0.3856,
1393
- "rewards/accuracies": 0.8062499761581421,
1394
- "rewards/chosen": -1.790833830833435,
1395
- "rewards/margins": 1.8916391134262085,
1396
- "rewards/rejected": -3.6824734210968018,
1397
  "step": 830
1398
  },
1399
  {
1400
  "epoch": 0.97,
1401
- "grad_norm": 45.18885600860689,
1402
  "learning_rate": 1.5840343486700215e-09,
1403
- "logits/chosen": 0.011555513367056847,
1404
- "logits/rejected": 0.5860650539398193,
1405
- "logps/chosen": -406.7879638671875,
1406
- "logps/rejected": -555.0967407226562,
1407
- "loss": 0.3728,
1408
- "rewards/accuracies": 0.8062499761581421,
1409
- "rewards/chosen": -1.6412866115570068,
1410
- "rewards/margins": 1.9584299325942993,
1411
- "rewards/rejected": -3.5997166633605957,
1412
  "step": 840
1413
  },
1414
  {
1415
  "epoch": 0.98,
1416
- "grad_norm": 45.90265978309936,
1417
  "learning_rate": 6.550326657293881e-10,
1418
- "logits/chosen": 0.08577422052621841,
1419
- "logits/rejected": 0.549113929271698,
1420
- "logps/chosen": -403.1221618652344,
1421
- "logps/rejected": -571.7515869140625,
1422
- "loss": 0.3525,
1423
- "rewards/accuracies": 0.8500000238418579,
1424
- "rewards/chosen": -1.8727197647094727,
1425
- "rewards/margins": 2.0556139945983887,
1426
- "rewards/rejected": -3.9283337593078613,
1427
  "step": 850
1428
  },
1429
  {
1430
  "epoch": 0.99,
1431
- "grad_norm": 48.04876217861222,
1432
  "learning_rate": 1.2943454039654467e-10,
1433
- "logits/chosen": 0.5522348284721375,
1434
- "logits/rejected": 0.82818204164505,
1435
- "logps/chosen": -399.8492126464844,
1436
- "logps/rejected": -529.6903076171875,
1437
- "loss": 0.3623,
1438
- "rewards/accuracies": 0.768750011920929,
1439
- "rewards/chosen": -2.0089919567108154,
1440
- "rewards/margins": 1.499420404434204,
1441
- "rewards/rejected": -3.5084125995635986,
1442
  "step": 860
1443
  },
1444
  {
1445
  "epoch": 1.0,
1446
  "step": 868,
1447
  "total_flos": 0.0,
1448
- "train_loss": 0.42912535238925215,
1449
- "train_runtime": 13911.1927,
1450
- "train_samples_per_second": 7.989,
1451
- "train_steps_per_second": 0.062
1452
  }
1453
  ],
1454
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 146.8957421194674,
14
  "learning_rate": 5.747126436781609e-09,
15
+ "logits/chosen": -1.8686045408248901,
16
+ "logits/rejected": -1.7644572257995605,
17
+ "logps/chosen": -235.48362731933594,
18
+ "logps/rejected": -183.77415466308594,
19
+ "loss": 0.6922,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.01,
28
+ "grad_norm": 237.04909600464902,
29
  "learning_rate": 5.747126436781609e-08,
30
+ "logits/chosen": -1.9218311309814453,
31
+ "logits/rejected": -1.8686226606369019,
32
+ "logps/chosen": -240.50628662109375,
33
+ "logps/rejected": -216.8230438232422,
34
+ "loss": 0.6941,
35
+ "rewards/accuracies": 0.4444444477558136,
36
+ "rewards/chosen": -0.002868607407435775,
37
+ "rewards/margins": -0.0005126786418259144,
38
+ "rewards/rejected": -0.002355928998440504,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.02,
43
+ "grad_norm": 211.82620683349913,
44
  "learning_rate": 1.1494252873563217e-07,
45
+ "logits/chosen": -2.010253429412842,
46
+ "logits/rejected": -1.8642921447753906,
47
+ "logps/chosen": -283.1783752441406,
48
+ "logps/rejected": -215.68887329101562,
49
+ "loss": 0.6864,
50
+ "rewards/accuracies": 0.612500011920929,
51
+ "rewards/chosen": 0.008166970685124397,
52
+ "rewards/margins": 0.013039084151387215,
53
+ "rewards/rejected": -0.004872114397585392,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.03,
58
+ "grad_norm": 112.97267570781544,
59
  "learning_rate": 1.7241379310344828e-07,
60
+ "logits/chosen": -1.9509646892547607,
61
+ "logits/rejected": -1.8835735321044922,
62
+ "logps/chosen": -240.29074096679688,
63
+ "logps/rejected": -221.15274047851562,
64
+ "loss": 0.666,
65
+ "rewards/accuracies": 0.668749988079071,
66
+ "rewards/chosen": 0.06300461292266846,
67
+ "rewards/margins": 0.06308884918689728,
68
+ "rewards/rejected": -8.423496183240786e-05,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.05,
73
+ "grad_norm": 95.74241597637995,
74
  "learning_rate": 2.2988505747126435e-07,
75
+ "logits/chosen": -1.9107061624526978,
76
+ "logits/rejected": -1.8901317119598389,
77
+ "logps/chosen": -237.59036254882812,
78
+ "logps/rejected": -216.1823272705078,
79
+ "loss": 0.6254,
80
+ "rewards/accuracies": 0.7124999761581421,
81
+ "rewards/chosen": 0.1618741899728775,
82
+ "rewards/margins": 0.1895591914653778,
83
+ "rewards/rejected": -0.027684981003403664,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.06,
88
+ "grad_norm": 99.41131976874209,
89
  "learning_rate": 2.873563218390804e-07,
90
+ "logits/chosen": -1.9803855419158936,
91
+ "logits/rejected": -1.9117343425750732,
92
+ "logps/chosen": -222.8354949951172,
93
+ "logps/rejected": -207.8356170654297,
94
+ "loss": 0.587,
95
+ "rewards/accuracies": 0.768750011920929,
96
+ "rewards/chosen": 0.23140163719654083,
97
+ "rewards/margins": 0.3971676230430603,
98
+ "rewards/rejected": -0.16576598584651947,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.07,
103
+ "grad_norm": 85.607755330573,
104
  "learning_rate": 3.4482758620689656e-07,
105
+ "logits/chosen": -2.0026185512542725,
106
+ "logits/rejected": -1.9393552541732788,
107
+ "logps/chosen": -283.61199951171875,
108
+ "logps/rejected": -245.7741241455078,
109
+ "loss": 0.5351,
110
+ "rewards/accuracies": 0.793749988079071,
111
+ "rewards/chosen": 0.2226170301437378,
112
+ "rewards/margins": 0.6314491629600525,
113
+ "rewards/rejected": -0.4088321626186371,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.08,
118
+ "grad_norm": 105.09596276450131,
119
  "learning_rate": 4.0229885057471266e-07,
120
+ "logits/chosen": -1.9401233196258545,
121
+ "logits/rejected": -1.9097837209701538,
122
+ "logps/chosen": -223.41983032226562,
123
+ "logps/rejected": -222.15283203125,
124
+ "loss": 0.5181,
125
+ "rewards/accuracies": 0.7749999761581421,
126
+ "rewards/chosen": -0.10973484814167023,
127
+ "rewards/margins": 0.7763268351554871,
128
+ "rewards/rejected": -0.8860616683959961,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.09,
133
+ "grad_norm": 103.35591967882027,
134
  "learning_rate": 4.597701149425287e-07,
135
+ "logits/chosen": -1.8168106079101562,
136
+ "logits/rejected": -1.7877649068832397,
137
+ "logps/chosen": -227.1862030029297,
138
+ "logps/rejected": -217.2149200439453,
139
+ "loss": 0.487,
140
+ "rewards/accuracies": 0.824999988079071,
141
+ "rewards/chosen": -0.1610104888677597,
142
+ "rewards/margins": 0.8937110900878906,
143
+ "rewards/rejected": -1.0547215938568115,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.1,
148
+ "grad_norm": 96.75574881093117,
149
  "learning_rate": 4.999817969178237e-07,
150
+ "logits/chosen": -1.8949458599090576,
151
+ "logits/rejected": -1.8469880819320679,
152
+ "logps/chosen": -249.9053192138672,
153
+ "logps/rejected": -233.94631958007812,
154
+ "loss": 0.4415,
155
+ "rewards/accuracies": 0.8062499761581421,
156
+ "rewards/chosen": -0.24459132552146912,
157
+ "rewards/margins": 1.046515703201294,
158
+ "rewards/rejected": -1.291106939315796,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.12,
163
+ "grad_norm": 92.71145741103783,
164
  "learning_rate": 4.996582603056428e-07,
165
+ "logits/chosen": -1.8467813730239868,
166
+ "logits/rejected": -1.7593021392822266,
167
+ "logps/chosen": -251.5709686279297,
168
+ "logps/rejected": -255.50814819335938,
169
+ "loss": 0.4898,
170
+ "rewards/accuracies": 0.7749999761581421,
171
+ "rewards/chosen": -0.3932832181453705,
172
+ "rewards/margins": 1.1354423761367798,
173
+ "rewards/rejected": -1.5287256240844727,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.12,
178
+ "eval_logits/chosen": -1.8469043970108032,
179
+ "eval_logits/rejected": -1.7659016847610474,
180
+ "eval_logps/chosen": -339.4445495605469,
181
+ "eval_logps/rejected": -353.2087707519531,
182
+ "eval_loss": 0.5505225658416748,
183
+ "eval_rewards/accuracies": 0.6875,
184
+ "eval_rewards/chosen": -0.19665074348449707,
185
+ "eval_rewards/margins": 0.8084924817085266,
186
+ "eval_rewards/rejected": -1.005143165588379,
187
+ "eval_runtime": 98.3147,
188
+ "eval_samples_per_second": 20.343,
189
+ "eval_steps_per_second": 0.325,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.13,
194
+ "grad_norm": 108.17707201404545,
195
  "learning_rate": 4.989308132738126e-07,
196
+ "logits/chosen": -1.8153671026229858,
197
+ "logits/rejected": -1.6921818256378174,
198
+ "logps/chosen": -244.46237182617188,
199
+ "logps/rejected": -222.83145141601562,
200
+ "loss": 0.4368,
201
+ "rewards/accuracies": 0.7875000238418579,
202
+ "rewards/chosen": -0.10259479284286499,
203
+ "rewards/margins": 1.188932180404663,
204
+ "rewards/rejected": -1.2915267944335938,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.14,
209
+ "grad_norm": 100.40422750465044,
210
  "learning_rate": 4.978006327248536e-07,
211
+ "logits/chosen": -1.7503843307495117,
212
+ "logits/rejected": -1.6768817901611328,
213
+ "logps/chosen": -247.3853759765625,
214
+ "logps/rejected": -240.05496215820312,
215
+ "loss": 0.4237,
216
+ "rewards/accuracies": 0.8125,
217
+ "rewards/chosen": -0.34009289741516113,
218
+ "rewards/margins": 1.1859456300735474,
219
+ "rewards/rejected": -1.526038408279419,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.15,
224
+ "grad_norm": 91.1070676572417,
225
  "learning_rate": 4.962695471250032e-07,
226
+ "logits/chosen": -1.654911756515503,
227
+ "logits/rejected": -1.62875235080719,
228
+ "logps/chosen": -254.4684600830078,
229
+ "logps/rejected": -245.8209686279297,
230
+ "loss": 0.4249,
231
+ "rewards/accuracies": 0.8125,
232
+ "rewards/chosen": -0.5070122480392456,
233
+ "rewards/margins": 1.3276703357696533,
234
+ "rewards/rejected": -1.8346824645996094,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.16,
239
+ "grad_norm": 97.14051346245,
240
  "learning_rate": 4.94340033546025e-07,
241
+ "logits/chosen": -1.6183685064315796,
242
+ "logits/rejected": -1.6242666244506836,
243
+ "logps/chosen": -220.20852661132812,
244
+ "logps/rejected": -242.1243133544922,
245
+ "loss": 0.4218,
246
+ "rewards/accuracies": 0.8062499761581421,
247
+ "rewards/chosen": -0.6856725811958313,
248
+ "rewards/margins": 1.2277655601501465,
249
+ "rewards/rejected": -1.913438081741333,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.17,
254
+ "grad_norm": 94.8408241800107,
255
  "learning_rate": 4.920152136576705e-07,
256
+ "logits/chosen": -1.4978981018066406,
257
+ "logits/rejected": -1.5163871049880981,
258
+ "logps/chosen": -251.6667938232422,
259
+ "logps/rejected": -254.0281219482422,
260
+ "loss": 0.4261,
261
+ "rewards/accuracies": 0.824999988079071,
262
+ "rewards/chosen": -0.6180266737937927,
263
+ "rewards/margins": 1.381075143814087,
264
+ "rewards/rejected": -1.9991016387939453,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.18,
269
+ "grad_norm": 108.14894780962342,
270
  "learning_rate": 4.892988486772756e-07,
271
+ "logits/chosen": -1.579886794090271,
272
+ "logits/rejected": -1.5718797445297241,
273
+ "logps/chosen": -250.04495239257812,
274
+ "logps/rejected": -262.3863830566406,
275
+ "loss": 0.4246,
276
+ "rewards/accuracies": 0.824999988079071,
277
+ "rewards/chosen": -0.6856819987297058,
278
+ "rewards/margins": 1.5999362468719482,
279
+ "rewards/rejected": -2.285618305206299,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.2,
284
+ "grad_norm": 70.9507948612333,
285
  "learning_rate": 4.861953332846629e-07,
286
+ "logits/chosen": -1.7066303491592407,
287
+ "logits/rejected": -1.6715686321258545,
288
+ "logps/chosen": -270.17132568359375,
289
+ "logps/rejected": -260.03350830078125,
290
+ "loss": 0.4316,
291
  "rewards/accuracies": 0.7875000238418579,
292
+ "rewards/chosen": -0.8634172677993774,
293
+ "rewards/margins": 1.3165854215621948,
294
+ "rewards/rejected": -2.1800026893615723,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.21,
299
+ "grad_norm": 114.85174445864124,
300
  "learning_rate": 4.827096885121953e-07,
301
+ "logits/chosen": -1.817678689956665,
302
+ "logits/rejected": -1.7374699115753174,
303
+ "logps/chosen": -286.0107116699219,
304
+ "logps/rejected": -279.7050476074219,
305
+ "loss": 0.4309,
306
+ "rewards/accuracies": 0.75,
307
+ "rewards/chosen": -1.0613857507705688,
308
+ "rewards/margins": 1.1405597925186157,
309
+ "rewards/rejected": -2.2019453048706055,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.22,
314
+ "grad_norm": 90.50728576001431,
315
  "learning_rate": 4.788475536214821e-07,
316
+ "logits/chosen": -1.7829034328460693,
317
+ "logits/rejected": -1.7400600910186768,
318
+ "logps/chosen": -236.38650512695312,
319
+ "logps/rejected": -245.64291381835938,
320
+ "loss": 0.3873,
321
+ "rewards/accuracies": 0.84375,
322
+ "rewards/chosen": -0.8258365392684937,
323
+ "rewards/margins": 1.4504865407943726,
324
+ "rewards/rejected": -2.276322841644287,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.23,
329
+ "grad_norm": 100.41476084088063,
330
  "learning_rate": 4.746151769798818e-07,
331
+ "logits/chosen": -1.7743650674819946,
332
+ "logits/rejected": -1.7203725576400757,
333
+ "logps/chosen": -279.8271789550781,
334
+ "logps/rejected": -269.1463317871094,
335
+ "loss": 0.4277,
336
+ "rewards/accuracies": 0.831250011920929,
337
+ "rewards/chosen": -0.8149574398994446,
338
+ "rewards/margins": 1.5317128896713257,
339
+ "rewards/rejected": -2.346670389175415,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.23,
344
+ "eval_logits/chosen": -1.8008995056152344,
345
+ "eval_logits/rejected": -1.7248116731643677,
346
+ "eval_logps/chosen": -345.1794738769531,
347
+ "eval_logps/rejected": -370.77880859375,
348
+ "eval_loss": 0.46549829840660095,
349
+ "eval_rewards/accuracies": 0.73828125,
350
+ "eval_rewards/chosen": -0.4833980202674866,
351
+ "eval_rewards/margins": 1.4002480506896973,
352
+ "eval_rewards/rejected": -1.8836462497711182,
353
+ "eval_runtime": 98.0211,
354
+ "eval_samples_per_second": 20.404,
355
  "eval_steps_per_second": 0.326,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.24,
360
+ "grad_norm": 99.20354803224151,
361
  "learning_rate": 4.7001940595156055e-07,
362
+ "logits/chosen": -1.7606436014175415,
363
+ "logits/rejected": -1.6915203332901,
364
+ "logps/chosen": -232.93832397460938,
365
+ "logps/rejected": -246.24072265625,
366
+ "loss": 0.441,
367
  "rewards/accuracies": 0.8062499761581421,
368
+ "rewards/chosen": -1.0429778099060059,
369
+ "rewards/margins": 1.6135696172714233,
370
+ "rewards/rejected": -2.6565470695495605,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.25,
375
+ "grad_norm": 80.49166768353011,
376
  "learning_rate": 4.650676758194623e-07,
377
+ "logits/chosen": -1.820059061050415,
378
+ "logits/rejected": -1.741408348083496,
379
+ "logps/chosen": -265.69769287109375,
380
+ "logps/rejected": -259.860595703125,
381
+ "loss": 0.4003,
382
+ "rewards/accuracies": 0.793749988079071,
383
+ "rewards/chosen": -0.915117621421814,
384
+ "rewards/margins": 1.692671537399292,
385
+ "rewards/rejected": -2.6077892780303955,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.26,
390
+ "grad_norm": 103.96889607439884,
391
  "learning_rate": 4.5976799775611215e-07,
392
+ "logits/chosen": -1.84983229637146,
393
+ "logits/rejected": -1.7714653015136719,
394
+ "logps/chosen": -260.39678955078125,
395
+ "logps/rejected": -254.94235229492188,
396
+ "loss": 0.4118,
397
+ "rewards/accuracies": 0.8812500238418579,
398
+ "rewards/chosen": -1.0704491138458252,
399
+ "rewards/margins": 1.982627511024475,
400
+ "rewards/rejected": -3.05307674407959,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.28,
405
+ "grad_norm": 87.59607388393054,
406
  "learning_rate": 4.5412894586271543e-07,
407
+ "logits/chosen": -1.8541374206542969,
408
+ "logits/rejected": -1.7867300510406494,
409
+ "logps/chosen": -259.6327819824219,
410
+ "logps/rejected": -233.4642333984375,
411
+ "loss": 0.3961,
412
+ "rewards/accuracies": 0.831250011920929,
413
+ "rewards/chosen": -0.8504000902175903,
414
+ "rewards/margins": 1.6259987354278564,
415
+ "rewards/rejected": -2.4763987064361572,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.29,
420
+ "grad_norm": 87.76098428515671,
421
  "learning_rate": 4.481596432975201e-07,
422
+ "logits/chosen": -1.8068411350250244,
423
+ "logits/rejected": -1.7720015048980713,
424
+ "logps/chosen": -218.69546508789062,
425
+ "logps/rejected": -231.09292602539062,
426
+ "loss": 0.4236,
427
+ "rewards/accuracies": 0.762499988079071,
428
+ "rewards/chosen": -0.8816181421279907,
429
+ "rewards/margins": 1.4680168628692627,
430
+ "rewards/rejected": -2.349634885787964,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.3,
435
+ "grad_norm": 99.04429789846779,
436
  "learning_rate": 4.41869747515886e-07,
437
+ "logits/chosen": -1.7596534490585327,
438
+ "logits/rejected": -1.7420539855957031,
439
+ "logps/chosen": -272.49273681640625,
440
+ "logps/rejected": -301.8647766113281,
441
+ "loss": 0.3824,
442
+ "rewards/accuracies": 0.8125,
443
+ "rewards/chosen": -1.0995886325836182,
444
+ "rewards/margins": 1.7163059711456299,
445
+ "rewards/rejected": -2.815894603729248,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.31,
450
+ "grad_norm": 91.17380836071179,
451
  "learning_rate": 4.352694346459396e-07,
452
+ "logits/chosen": -1.7119308710098267,
453
+ "logits/rejected": -1.713330626487732,
454
+ "logps/chosen": -245.8417205810547,
455
+ "logps/rejected": -257.9246826171875,
456
+ "loss": 0.3928,
457
+ "rewards/accuracies": 0.8062499761581421,
458
+ "rewards/chosen": -0.6780133843421936,
459
+ "rewards/margins": 1.5433403253555298,
460
+ "rewards/rejected": -2.221353769302368,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.32,
465
+ "grad_norm": 85.89753055245001,
466
  "learning_rate": 4.2836938302509256e-07,
467
+ "logits/chosen": -1.7531852722167969,
468
+ "logits/rejected": -1.6354246139526367,
469
+ "logps/chosen": -243.12454223632812,
470
+ "logps/rejected": -249.15576171875,
471
+ "loss": 0.4123,
472
+ "rewards/accuracies": 0.8125,
473
+ "rewards/chosen": -0.8161875605583191,
474
+ "rewards/margins": 1.92093825340271,
475
+ "rewards/rejected": -2.737125873565674,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.33,
480
+ "grad_norm": 93.29457537220665,
481
  "learning_rate": 4.2118075592405874e-07,
482
+ "logits/chosen": -1.6761655807495117,
483
+ "logits/rejected": -1.668341040611267,
484
+ "logps/chosen": -263.28924560546875,
485
+ "logps/rejected": -276.89190673828125,
486
+ "loss": 0.3899,
487
+ "rewards/accuracies": 0.8062499761581421,
488
+ "rewards/chosen": -0.7706686854362488,
489
+ "rewards/margins": 1.6867666244506836,
490
+ "rewards/rejected": -2.457435131072998,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.35,
495
+ "grad_norm": 104.22663391151875,
496
  "learning_rate": 4.137151834863213e-07,
497
+ "logits/chosen": -1.5810635089874268,
498
+ "logits/rejected": -1.56969153881073,
499
+ "logps/chosen": -215.5782012939453,
500
+ "logps/rejected": -242.8097381591797,
501
+ "loss": 0.4188,
502
+ "rewards/accuracies": 0.7562500238418579,
503
+ "rewards/chosen": -1.0694911479949951,
504
+ "rewards/margins": 1.6282224655151367,
505
+ "rewards/rejected": -2.6977133750915527,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.35,
510
+ "eval_logits/chosen": -1.6898695230484009,
511
+ "eval_logits/rejected": -1.6142553091049194,
512
+ "eval_logps/chosen": -336.9513244628906,
513
+ "eval_logps/rejected": -373.6328430175781,
514
+ "eval_loss": 0.39220622181892395,
515
+ "eval_rewards/accuracies": 0.796875,
516
+ "eval_rewards/chosen": -0.07198946177959442,
517
+ "eval_rewards/margins": 1.9543578624725342,
518
+ "eval_rewards/rejected": -2.0263473987579346,
519
+ "eval_runtime": 98.0591,
520
+ "eval_samples_per_second": 20.396,
521
  "eval_steps_per_second": 0.326,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.36,
526
+ "grad_norm": 83.12181184953562,
527
  "learning_rate": 4.059847439122671e-07,
528
+ "logits/chosen": -1.7229493856430054,
529
+ "logits/rejected": -1.590850591659546,
530
+ "logps/chosen": -260.96539306640625,
531
+ "logps/rejected": -262.43865966796875,
532
+ "loss": 0.4106,
533
+ "rewards/accuracies": 0.800000011920929,
534
+ "rewards/chosen": -1.0447856187820435,
535
+ "rewards/margins": 1.545778512954712,
536
+ "rewards/rejected": -2.590564250946045,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.37,
541
+ "grad_norm": 101.57242075352875,
542
  "learning_rate": 3.98001943918432e-07,
543
+ "logits/chosen": -1.6267013549804688,
544
+ "logits/rejected": -1.5495989322662354,
545
+ "logps/chosen": -246.096923828125,
546
+ "logps/rejected": -268.87274169921875,
547
+ "loss": 0.3937,
548
+ "rewards/accuracies": 0.824999988079071,
549
+ "rewards/chosen": -1.1101688146591187,
550
+ "rewards/margins": 1.6341445446014404,
551
+ "rewards/rejected": -2.7443130016326904,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.38,
556
+ "grad_norm": 95.29775137930592,
557
  "learning_rate": 3.8977969850346866e-07,
558
+ "logits/chosen": -1.5687649250030518,
559
+ "logits/rejected": -1.5476423501968384,
560
+ "logps/chosen": -262.67156982421875,
561
+ "logps/rejected": -260.74725341796875,
562
+ "loss": 0.3887,
563
+ "rewards/accuracies": 0.768750011920929,
564
+ "rewards/chosen": -0.9716947674751282,
565
+ "rewards/margins": 1.6578747034072876,
566
+ "rewards/rejected": -2.6295692920684814,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.39,
571
+ "grad_norm": 105.61150036189296,
572
  "learning_rate": 3.8133131005357465e-07,
573
+ "logits/chosen": -1.662001371383667,
574
+ "logits/rejected": -1.5881233215332031,
575
+ "logps/chosen": -249.8246612548828,
576
+ "logps/rejected": -272.1719970703125,
577
+ "loss": 0.38,
578
+ "rewards/accuracies": 0.862500011920929,
579
+ "rewards/chosen": -0.9675191044807434,
580
+ "rewards/margins": 2.143432140350342,
581
+ "rewards/rejected": -3.1109511852264404,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.4,
586
+ "grad_norm": 90.41304599394526,
587
  "learning_rate": 3.7267044682118435e-07,
588
+ "logits/chosen": -1.6518446207046509,
589
+ "logits/rejected": -1.6180702447891235,
590
+ "logps/chosen": -237.4104461669922,
591
+ "logps/rejected": -247.13473510742188,
592
+ "loss": 0.3749,
593
+ "rewards/accuracies": 0.824999988079071,
594
+ "rewards/chosen": -1.1348434686660767,
595
+ "rewards/margins": 1.6873624324798584,
596
+ "rewards/rejected": -2.8222060203552246,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.41,
601
+ "grad_norm": 95.16076600849088,
602
  "learning_rate": 3.638111208117425e-07,
603
+ "logits/chosen": -1.7075703144073486,
604
+ "logits/rejected": -1.6682260036468506,
605
+ "logps/chosen": -247.7568817138672,
606
+ "logps/rejected": -263.2861633300781,
607
+ "loss": 0.3914,
608
+ "rewards/accuracies": 0.7250000238418579,
609
+ "rewards/chosen": -1.0359086990356445,
610
+ "rewards/margins": 1.2954949140548706,
611
+ "rewards/rejected": -2.3314034938812256,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.43,
616
+ "grad_norm": 96.37188736705814,
617
  "learning_rate": 3.5476766511433605e-07,
618
+ "logits/chosen": -1.7550392150878906,
619
+ "logits/rejected": -1.6907631158828735,
620
+ "logps/chosen": -281.13836669921875,
621
+ "logps/rejected": -266.0453796386719,
622
+ "loss": 0.4097,
623
+ "rewards/accuracies": 0.7875000238418579,
624
+ "rewards/chosen": -1.0189838409423828,
625
+ "rewards/margins": 1.6438014507293701,
626
+ "rewards/rejected": -2.662785291671753,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.44,
631
+ "grad_norm": 101.71059089383448,
632
  "learning_rate": 3.455547107128602e-07,
633
+ "logits/chosen": -1.7938178777694702,
634
+ "logits/rejected": -1.787418007850647,
635
+ "logps/chosen": -298.5751647949219,
636
+ "logps/rejected": -281.03118896484375,
637
+ "loss": 0.3611,
638
+ "rewards/accuracies": 0.831250011920929,
639
+ "rewards/chosen": -1.0122178792953491,
640
+ "rewards/margins": 2.0528714656829834,
641
+ "rewards/rejected": -3.065088987350464,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.45,
646
+ "grad_norm": 109.30188938122785,
647
  "learning_rate": 3.361871628152338e-07,
648
+ "logits/chosen": -1.8471622467041016,
649
+ "logits/rejected": -1.820481300354004,
650
+ "logps/chosen": -251.8716278076172,
651
+ "logps/rejected": -283.156982421875,
652
+ "loss": 0.4006,
653
+ "rewards/accuracies": 0.800000011920929,
654
+ "rewards/chosen": -1.0065103769302368,
655
+ "rewards/margins": 1.774526834487915,
656
+ "rewards/rejected": -2.7810370922088623,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.46,
661
+ "grad_norm": 89.15033820155391,
662
  "learning_rate": 3.2668017673896077e-07,
663
+ "logits/chosen": -1.8275989294052124,
664
+ "logits/rejected": -1.8090099096298218,
665
+ "logps/chosen": -244.0500946044922,
666
+ "logps/rejected": -245.302490234375,
667
+ "loss": 0.3506,
668
+ "rewards/accuracies": 0.800000011920929,
669
+ "rewards/chosen": -0.62747722864151,
670
+ "rewards/margins": 2.005857467651367,
671
+ "rewards/rejected": -2.6333346366882324,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.46,
676
+ "eval_logits/chosen": -2.029554843902588,
677
+ "eval_logits/rejected": -1.9793704748153687,
678
+ "eval_logps/chosen": -331.169189453125,
679
+ "eval_logps/rejected": -374.0495300292969,
680
+ "eval_loss": 0.3456653654575348,
681
+ "eval_rewards/accuracies": 0.8203125,
682
+ "eval_rewards/chosen": 0.217118039727211,
683
+ "eval_rewards/margins": 2.2643015384674072,
684
+ "eval_rewards/rejected": -2.0471832752227783,
685
+ "eval_runtime": 97.9966,
686
+ "eval_samples_per_second": 20.409,
687
+ "eval_steps_per_second": 0.327,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.47,
692
+ "grad_norm": 95.54284470696203,
693
  "learning_rate": 3.1704913339205103e-07,
694
+ "logits/chosen": -1.9682222604751587,
695
+ "logits/rejected": -1.9323310852050781,
696
+ "logps/chosen": -251.9891357421875,
697
+ "logps/rejected": -276.3076477050781,
698
+ "loss": 0.3841,
699
+ "rewards/accuracies": 0.831250011920929,
700
+ "rewards/chosen": -0.7251254320144653,
701
+ "rewards/margins": 1.8693323135375977,
702
+ "rewards/rejected": -2.5944573879241943,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.48,
707
+ "grad_norm": 84.89194807368709,
708
  "learning_rate": 3.0730961438896885e-07,
709
+ "logits/chosen": -1.973587989807129,
710
+ "logits/rejected": -1.9758260250091553,
711
+ "logps/chosen": -326.0084533691406,
712
+ "logps/rejected": -316.46380615234375,
713
+ "loss": 0.3676,
714
+ "rewards/accuracies": 0.8187500238418579,
715
+ "rewards/chosen": -1.2638908624649048,
716
+ "rewards/margins": 1.681646704673767,
717
+ "rewards/rejected": -2.945537805557251,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.5,
722
+ "grad_norm": 79.45330669787697,
723
  "learning_rate": 2.9747737684186795e-07,
724
+ "logits/chosen": -1.9824804067611694,
725
+ "logits/rejected": -2.0262296199798584,
726
+ "logps/chosen": -253.5684814453125,
727
+ "logps/rejected": -258.55352783203125,
728
+ "loss": 0.3861,
729
  "rewards/accuracies": 0.8125,
730
+ "rewards/chosen": -0.8729881048202515,
731
+ "rewards/margins": 1.7805559635162354,
732
+ "rewards/rejected": -2.6535439491271973,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.51,
737
+ "grad_norm": 92.0142521630137,
738
  "learning_rate": 2.8756832786789663e-07,
739
+ "logits/chosen": -2.008882761001587,
740
+ "logits/rejected": -1.9781932830810547,
741
+ "logps/chosen": -269.37042236328125,
742
+ "logps/rejected": -264.4306335449219,
743
+ "loss": 0.382,
744
  "rewards/accuracies": 0.8187500238418579,
745
+ "rewards/chosen": -0.5206736326217651,
746
+ "rewards/margins": 1.916638970375061,
747
+ "rewards/rejected": -2.437312364578247,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.52,
752
+ "grad_norm": 81.53843237959488,
753
  "learning_rate": 2.7759849885381747e-07,
754
+ "logits/chosen": -1.951061487197876,
755
+ "logits/rejected": -1.9269872903823853,
756
+ "logps/chosen": -282.7742614746094,
757
+ "logps/rejected": -261.77130126953125,
758
+ "loss": 0.3613,
759
+ "rewards/accuracies": 0.824999988079071,
760
+ "rewards/chosen": -0.7152455449104309,
761
+ "rewards/margins": 2.0386595726013184,
762
+ "rewards/rejected": -2.7539050579071045,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.53,
767
+ "grad_norm": 81.29719600574327,
768
  "learning_rate": 2.675840195195762e-07,
769
+ "logits/chosen": -1.935346007347107,
770
+ "logits/rejected": -1.8713289499282837,
771
+ "logps/chosen": -237.9385528564453,
772
+ "logps/rejected": -262.90496826171875,
773
+ "loss": 0.3768,
774
+ "rewards/accuracies": 0.84375,
775
+ "rewards/chosen": -0.7526956796646118,
776
+ "rewards/margins": 2.036323070526123,
777
+ "rewards/rejected": -2.7890188694000244,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.54,
782
+ "grad_norm": 79.41069847067702,
783
  "learning_rate": 2.575410918227829e-07,
784
+ "logits/chosen": -1.9017337560653687,
785
+ "logits/rejected": -1.8612467050552368,
786
+ "logps/chosen": -286.1165771484375,
787
+ "logps/rejected": -286.47576904296875,
788
+ "loss": 0.3801,
789
+ "rewards/accuracies": 0.7875000238418579,
790
+ "rewards/chosen": -0.7328876256942749,
791
+ "rewards/margins": 1.7589473724365234,
792
+ "rewards/rejected": -2.491835117340088,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 0.55,
797
+ "grad_norm": 84.31202005756423,
798
  "learning_rate": 2.474859637463226e-07,
799
+ "logits/chosen": -1.7930266857147217,
800
+ "logits/rejected": -1.6920406818389893,
801
+ "logps/chosen": -261.0568542480469,
802
+ "logps/rejected": -249.1828155517578,
803
+ "loss": 0.373,
804
+ "rewards/accuracies": 0.8125,
805
+ "rewards/chosen": -0.6825451254844666,
806
+ "rewards/margins": 2.1065242290496826,
807
+ "rewards/rejected": -2.789069414138794,
808
  "step": 480
809
  },
810
  {
811
  "epoch": 0.56,
812
+ "grad_norm": 85.77456711320595,
813
  "learning_rate": 2.3743490301150355e-07,
814
+ "logits/chosen": -1.8167390823364258,
815
+ "logits/rejected": -1.6751165390014648,
816
+ "logps/chosen": -254.97879028320312,
817
+ "logps/rejected": -258.82794189453125,
818
+ "loss": 0.3847,
819
+ "rewards/accuracies": 0.8374999761581421,
820
+ "rewards/chosen": -0.421345055103302,
821
+ "rewards/margins": 2.164630174636841,
822
+ "rewards/rejected": -2.585975408554077,
823
  "step": 490
824
  },
825
  {
826
  "epoch": 0.58,
827
+ "grad_norm": 80.63419673971578,
828
  "learning_rate": 2.274041707592724e-07,
829
+ "logits/chosen": -1.7511818408966064,
830
+ "logits/rejected": -1.744807481765747,
831
+ "logps/chosen": -238.7528839111328,
832
+ "logps/rejected": -277.8020935058594,
833
+ "loss": 0.3611,
834
+ "rewards/accuracies": 0.831250011920929,
835
+ "rewards/chosen": -1.161329984664917,
836
+ "rewards/margins": 2.127523422241211,
837
+ "rewards/rejected": -3.288853406906128,
838
  "step": 500
839
  },
840
  {
841
  "epoch": 0.58,
842
+ "eval_logits/chosen": -1.8592296838760376,
843
+ "eval_logits/rejected": -1.818342685699463,
844
+ "eval_logps/chosen": -330.5163879394531,
845
+ "eval_logps/rejected": -381.7996520996094,
846
+ "eval_loss": 0.29585352540016174,
847
+ "eval_rewards/accuracies": 0.8515625,
848
+ "eval_rewards/chosen": 0.24975742399692535,
849
+ "eval_rewards/margins": 2.68444561958313,
850
+ "eval_rewards/rejected": -2.4346883296966553,
851
+ "eval_runtime": 97.917,
852
+ "eval_samples_per_second": 20.425,
853
+ "eval_steps_per_second": 0.327,
854
  "step": 500
855
  },
856
  {
857
  "epoch": 0.59,
858
+ "grad_norm": 93.97823403064545,
859
  "learning_rate": 2.17409995242075e-07,
860
+ "logits/chosen": -1.8619616031646729,
861
+ "logits/rejected": -1.8197988271713257,
862
+ "logps/chosen": -246.95120239257812,
863
+ "logps/rejected": -251.1341552734375,
864
+ "loss": 0.3787,
865
+ "rewards/accuracies": 0.831250011920929,
866
+ "rewards/chosen": -0.9560788869857788,
867
+ "rewards/margins": 2.1973986625671387,
868
+ "rewards/rejected": -3.153477668762207,
869
  "step": 510
870
  },
871
  {
872
  "epoch": 0.6,
873
+ "grad_norm": 81.29931903371055,
874
  "learning_rate": 2.0746854556892544e-07,
875
+ "logits/chosen": -1.8456933498382568,
876
+ "logits/rejected": -1.8626302480697632,
877
+ "logps/chosen": -228.947265625,
878
+ "logps/rejected": -255.99887084960938,
879
+ "loss": 0.383,
880
+ "rewards/accuracies": 0.7562500238418579,
881
+ "rewards/chosen": -0.850503146648407,
882
+ "rewards/margins": 1.7254650592803955,
883
+ "rewards/rejected": -2.5759682655334473,
884
  "step": 520
885
  },
886
  {
887
  "epoch": 0.61,
888
+ "grad_norm": 82.34962963997694,
889
  "learning_rate": 1.9759590554616173e-07,
890
+ "logits/chosen": -1.9397910833358765,
891
+ "logits/rejected": -1.9602453708648682,
892
+ "logps/chosen": -259.236328125,
893
+ "logps/rejected": -268.33233642578125,
894
+ "loss": 0.3812,
895
+ "rewards/accuracies": 0.84375,
896
+ "rewards/chosen": -0.5990003347396851,
897
+ "rewards/margins": 1.7735519409179688,
898
+ "rewards/rejected": -2.3725523948669434,
899
  "step": 530
900
  },
901
  {
902
  "epoch": 0.62,
903
+ "grad_norm": 106.09742448074628,
904
  "learning_rate": 1.8780804765620746e-07,
905
+ "logits/chosen": -1.8515198230743408,
906
+ "logits/rejected": -1.8524954319000244,
907
+ "logps/chosen": -268.37164306640625,
908
+ "logps/rejected": -304.6097106933594,
909
+ "loss": 0.3726,
910
+ "rewards/accuracies": 0.78125,
911
+ "rewards/chosen": -0.32395535707473755,
912
+ "rewards/margins": 1.7947556972503662,
913
+ "rewards/rejected": -2.118710994720459,
914
  "step": 540
915
  },
916
  {
917
  "epoch": 0.63,
918
+ "grad_norm": 89.06639468347461,
919
  "learning_rate": 1.7812080721643973e-07,
920
+ "logits/chosen": -1.8575937747955322,
921
+ "logits/rejected": -1.761614441871643,
922
+ "logps/chosen": -263.5863037109375,
923
+ "logps/rejected": -246.20455932617188,
924
+ "loss": 0.3972,
925
+ "rewards/accuracies": 0.8500000238418579,
926
+ "rewards/chosen": -0.7359617352485657,
927
+ "rewards/margins": 1.8330894708633423,
928
+ "rewards/rejected": -2.5690512657165527,
929
  "step": 550
930
  },
931
  {
932
  "epoch": 0.64,
933
+ "grad_norm": 91.51640548835455,
934
  "learning_rate": 1.6854985675997063e-07,
935
+ "logits/chosen": -1.8521220684051514,
936
+ "logits/rejected": -1.8151109218597412,
937
+ "logps/chosen": -261.0481872558594,
938
+ "logps/rejected": -272.9272155761719,
939
+ "loss": 0.3641,
940
+ "rewards/accuracies": 0.78125,
941
+ "rewards/chosen": -0.9162181615829468,
942
+ "rewards/margins": 1.6873579025268555,
943
+ "rewards/rejected": -2.603576183319092,
944
  "step": 560
945
  },
946
  {
947
  "epoch": 0.66,
948
+ "grad_norm": 83.89007997657542,
949
  "learning_rate": 1.5911068067978818e-07,
950
+ "logits/chosen": -1.7626062631607056,
951
+ "logits/rejected": -1.7946131229400635,
952
+ "logps/chosen": -240.85189819335938,
953
+ "logps/rejected": -273.8741149902344,
954
+ "loss": 0.3514,
955
+ "rewards/accuracies": 0.768750011920929,
956
+ "rewards/chosen": -0.9277374148368835,
957
+ "rewards/margins": 1.9961919784545898,
958
+ "rewards/rejected": -2.923929214477539,
959
  "step": 570
960
  },
961
  {
962
  "epoch": 0.67,
963
+ "grad_norm": 103.64429212789244,
964
  "learning_rate": 1.4981855017728197e-07,
965
+ "logits/chosen": -1.8487884998321533,
966
+ "logits/rejected": -1.8991634845733643,
967
+ "logps/chosen": -267.7337341308594,
968
+ "logps/rejected": -304.248291015625,
969
+ "loss": 0.37,
970
+ "rewards/accuracies": 0.8125,
971
+ "rewards/chosen": -1.1558853387832642,
972
+ "rewards/margins": 1.9654242992401123,
973
+ "rewards/rejected": -3.121309757232666,
974
  "step": 580
975
  },
976
  {
977
  "epoch": 0.68,
978
+ "grad_norm": 87.97513457883551,
979
  "learning_rate": 1.406884985556804e-07,
980
+ "logits/chosen": -1.8694698810577393,
981
+ "logits/rejected": -1.844276785850525,
982
+ "logps/chosen": -257.92413330078125,
983
+ "logps/rejected": -268.24462890625,
984
+ "loss": 0.3586,
985
+ "rewards/accuracies": 0.8125,
986
+ "rewards/chosen": -1.2031538486480713,
987
+ "rewards/margins": 2.1714892387390137,
988
+ "rewards/rejected": -3.374642848968506,
989
  "step": 590
990
  },
991
  {
992
  "epoch": 0.69,
993
+ "grad_norm": 85.79715679551722,
994
  "learning_rate": 1.3173529689837354e-07,
995
+ "logits/chosen": -1.8827781677246094,
996
+ "logits/rejected": -1.8290605545043945,
997
+ "logps/chosen": -256.6871032714844,
998
+ "logps/rejected": -256.0238952636719,
999
+ "loss": 0.3562,
1000
  "rewards/accuracies": 0.800000011920929,
1001
+ "rewards/chosen": -1.1103084087371826,
1002
+ "rewards/margins": 1.9665857553482056,
1003
+ "rewards/rejected": -3.0768942832946777,
1004
  "step": 600
1005
  },
1006
  {
1007
  "epoch": 0.69,
1008
+ "eval_logits/chosen": -1.9736415147781372,
1009
+ "eval_logits/rejected": -1.9216868877410889,
1010
+ "eval_logps/chosen": -327.7752685546875,
1011
+ "eval_logps/rejected": -382.5696105957031,
1012
+ "eval_loss": 0.2513369023799896,
1013
+ "eval_rewards/accuracies": 0.87109375,
1014
+ "eval_rewards/chosen": 0.386812299489975,
1015
+ "eval_rewards/margins": 2.8599982261657715,
1016
+ "eval_rewards/rejected": -2.4731857776641846,
1017
+ "eval_runtime": 97.9427,
1018
+ "eval_samples_per_second": 20.42,
1019
+ "eval_steps_per_second": 0.327,
1020
  "step": 600
1021
  },
1022
  {
1023
  "epoch": 0.7,
1024
+ "grad_norm": 91.70310566548932,
1025
  "learning_rate": 1.2297343017146726e-07,
1026
+ "logits/chosen": -1.8489186763763428,
1027
+ "logits/rejected": -1.8288800716400146,
1028
+ "logps/chosen": -259.6686096191406,
1029
+ "logps/rejected": -267.1070861816406,
1030
+ "loss": 0.3693,
1031
+ "rewards/accuracies": 0.8500000238418579,
1032
+ "rewards/chosen": -1.0914928913116455,
1033
+ "rewards/margins": 1.885284662246704,
1034
+ "rewards/rejected": -2.9767773151397705,
1035
  "step": 610
1036
  },
1037
  {
1038
  "epoch": 0.71,
1039
+ "grad_norm": 83.02991328479807,
1040
  "learning_rate": 1.1441707378923474e-07,
1041
+ "logits/chosen": -1.950595498085022,
1042
+ "logits/rejected": -1.8974205255508423,
1043
+ "logps/chosen": -237.6828155517578,
1044
+ "logps/rejected": -257.06005859375,
1045
+ "loss": 0.3625,
1046
+ "rewards/accuracies": 0.862500011920929,
1047
+ "rewards/chosen": -0.4481216371059418,
1048
+ "rewards/margins": 2.152703046798706,
1049
+ "rewards/rejected": -2.600825071334839,
1050
  "step": 620
1051
  },
1052
  {
1053
  "epoch": 0.73,
1054
+ "grad_norm": 97.69736464926112,
1055
  "learning_rate": 1.06080070680377e-07,
1056
+ "logits/chosen": -1.874447226524353,
1057
+ "logits/rejected": -1.8658256530761719,
1058
+ "logps/chosen": -269.42034912109375,
1059
+ "logps/rejected": -275.75946044921875,
1060
+ "loss": 0.3679,
1061
+ "rewards/accuracies": 0.862500011920929,
1062
+ "rewards/chosen": -0.6408880949020386,
1063
+ "rewards/margins": 2.110172986984253,
1064
+ "rewards/rejected": -2.751060962677002,
1065
  "step": 630
1066
  },
1067
  {
1068
  "epoch": 0.74,
1069
+ "grad_norm": 89.55100518815718,
1070
  "learning_rate": 9.797590889219587e-08,
1071
+ "logits/chosen": -1.927781343460083,
1072
+ "logits/rejected": -1.915203332901001,
1073
+ "logps/chosen": -262.8731384277344,
1074
+ "logps/rejected": -265.70404052734375,
1075
+ "loss": 0.3725,
1076
  "rewards/accuracies": 0.8374999761581421,
1077
+ "rewards/chosen": -0.5387133359909058,
1078
+ "rewards/margins": 2.1157500743865967,
1079
+ "rewards/rejected": -2.654463529586792,
1080
  "step": 640
1081
  },
1082
  {
1083
  "epoch": 0.75,
1084
+ "grad_norm": 88.47283263403602,
1085
  "learning_rate": 9.011769976891367e-08,
1086
+ "logits/chosen": -1.9129142761230469,
1087
+ "logits/rejected": -1.9037758111953735,
1088
+ "logps/chosen": -253.7337188720703,
1089
+ "logps/rejected": -273.1281433105469,
1090
+ "loss": 0.3654,
1091
+ "rewards/accuracies": 0.8187500238418579,
1092
+ "rewards/chosen": -0.8051580190658569,
1093
+ "rewards/margins": 1.8512630462646484,
1094
+ "rewards/rejected": -2.656421184539795,
1095
  "step": 650
1096
  },
1097
  {
1098
  "epoch": 0.76,
1099
+ "grad_norm": 101.63563998101947,
1100
  "learning_rate": 8.251815673944218e-08,
1101
+ "logits/chosen": -1.8455785512924194,
1102
+ "logits/rejected": -1.862540602684021,
1103
+ "logps/chosen": -269.90655517578125,
1104
+ "logps/rejected": -266.5269470214844,
1105
+ "loss": 0.3636,
1106
+ "rewards/accuracies": 0.831250011920929,
1107
+ "rewards/chosen": -1.0431994199752808,
1108
+ "rewards/margins": 1.969745397567749,
1109
+ "rewards/rejected": -3.0129449367523193,
1110
  "step": 660
1111
  },
1112
  {
1113
  "epoch": 0.77,
1114
+ "grad_norm": 101.36570058059344,
1115
  "learning_rate": 7.518957474892148e-08,
1116
+ "logits/chosen": -1.8844772577285767,
1117
+ "logits/rejected": -1.8293778896331787,
1118
+ "logps/chosen": -262.8396301269531,
1119
+ "logps/rejected": -270.7688293457031,
1120
+ "loss": 0.3566,
1121
+ "rewards/accuracies": 0.84375,
1122
+ "rewards/chosen": -1.0009379386901855,
1123
+ "rewards/margins": 2.288020133972168,
1124
+ "rewards/rejected": -3.2889580726623535,
1125
  "step": 670
1126
  },
1127
  {
1128
  "epoch": 0.78,
1129
+ "grad_norm": 108.29800746794625,
1130
  "learning_rate": 6.814381036730274e-08,
1131
+ "logits/chosen": -1.9113209247589111,
1132
+ "logits/rejected": -1.8669430017471313,
1133
+ "logps/chosen": -248.9401092529297,
1134
+ "logps/rejected": -263.89459228515625,
1135
+ "loss": 0.3789,
1136
+ "rewards/accuracies": 0.8062499761581421,
1137
+ "rewards/chosen": -0.7384175062179565,
1138
+ "rewards/margins": 2.038583993911743,
1139
+ "rewards/rejected": -2.77700138092041,
1140
  "step": 680
1141
  },
1142
  {
1143
  "epoch": 0.79,
1144
+ "grad_norm": 81.30832233061167,
1145
  "learning_rate": 6.139226260715872e-08,
1146
+ "logits/chosen": -1.936092734336853,
1147
+ "logits/rejected": -1.950042724609375,
1148
+ "logps/chosen": -261.84283447265625,
1149
+ "logps/rejected": -281.1523132324219,
1150
+ "loss": 0.3622,
1151
+ "rewards/accuracies": 0.8187500238418579,
1152
+ "rewards/chosen": -0.8801490068435669,
1153
+ "rewards/margins": 2.052794933319092,
1154
+ "rewards/rejected": -2.932943820953369,
1155
  "step": 690
1156
  },
1157
  {
1158
  "epoch": 0.81,
1159
+ "grad_norm": 81.87543968348014,
1160
  "learning_rate": 5.4945854481754734e-08,
1161
+ "logits/chosen": -1.8324077129364014,
1162
+ "logits/rejected": -1.7888282537460327,
1163
+ "logps/chosen": -245.5817413330078,
1164
+ "logps/rejected": -259.8585510253906,
1165
+ "loss": 0.3624,
1166
+ "rewards/accuracies": 0.78125,
1167
+ "rewards/chosen": -0.9253555536270142,
1168
+ "rewards/margins": 2.031710624694824,
1169
+ "rewards/rejected": -2.957066059112549,
1170
  "step": 700
1171
  },
1172
  {
1173
  "epoch": 0.81,
1174
+ "eval_logits/chosen": -1.9716989994049072,
1175
+ "eval_logits/rejected": -1.930106282234192,
1176
+ "eval_logps/chosen": -322.60308837890625,
1177
+ "eval_logps/rejected": -380.21783447265625,
1178
+ "eval_loss": 0.21937939524650574,
1179
+ "eval_rewards/accuracies": 0.90625,
1180
+ "eval_rewards/chosen": 0.6454216241836548,
1181
+ "eval_rewards/margins": 3.001021385192871,
1182
+ "eval_rewards/rejected": -2.3555996417999268,
1183
+ "eval_runtime": 98.0586,
1184
+ "eval_samples_per_second": 20.396,
1185
  "eval_steps_per_second": 0.326,
1186
  "step": 700
1187
  },
1188
  {
1189
  "epoch": 0.82,
1190
+ "grad_norm": 104.29775989655523,
1191
  "learning_rate": 4.881501533321605e-08,
1192
+ "logits/chosen": -1.7864516973495483,
1193
+ "logits/rejected": -1.7958072423934937,
1194
+ "logps/chosen": -228.2520294189453,
1195
+ "logps/rejected": -255.38577270507812,
1196
+ "loss": 0.3399,
1197
+ "rewards/accuracies": 0.8125,
1198
+ "rewards/chosen": -0.952102780342102,
1199
+ "rewards/margins": 2.169015884399414,
1200
+ "rewards/rejected": -3.1211180686950684,
1201
  "step": 710
1202
  },
1203
  {
1204
  "epoch": 0.83,
1205
+ "grad_norm": 87.51056493700892,
1206
  "learning_rate": 4.300966395938377e-08,
1207
+ "logits/chosen": -1.91278076171875,
1208
+ "logits/rejected": -1.887738823890686,
1209
+ "logps/chosen": -269.040283203125,
1210
+ "logps/rejected": -277.3271484375,
1211
+ "loss": 0.3737,
1212
+ "rewards/accuracies": 0.768750011920929,
1213
+ "rewards/chosen": -0.9400026202201843,
1214
+ "rewards/margins": 2.1387667655944824,
1215
+ "rewards/rejected": -3.0787696838378906,
1216
  "step": 720
1217
  },
1218
  {
1219
  "epoch": 0.84,
1220
+ "grad_norm": 82.85641235575972,
1221
  "learning_rate": 3.7539192566655246e-08,
1222
+ "logits/chosen": -1.8706934452056885,
1223
+ "logits/rejected": -1.8632476329803467,
1224
+ "logps/chosen": -258.32257080078125,
1225
+ "logps/rejected": -262.11151123046875,
1226
+ "loss": 0.3615,
1227
+ "rewards/accuracies": 0.862500011920929,
1228
+ "rewards/chosen": -0.5498217344284058,
1229
+ "rewards/margins": 1.930101752281189,
1230
+ "rewards/rejected": -2.479923725128174,
1231
  "step": 730
1232
  },
1233
  {
1234
  "epoch": 0.85,
1235
+ "grad_norm": 86.13162915966026,
1236
  "learning_rate": 3.24124515747731e-08,
1237
+ "logits/chosen": -1.827125906944275,
1238
+ "logits/rejected": -1.8333660364151,
1239
+ "logps/chosen": -245.1622314453125,
1240
+ "logps/rejected": -269.59857177734375,
1241
+ "loss": 0.376,
1242
+ "rewards/accuracies": 0.800000011920929,
1243
+ "rewards/chosen": -1.0357568264007568,
1244
+ "rewards/margins": 1.9602575302124023,
1245
+ "rewards/rejected": -2.996014356613159,
1246
  "step": 740
1247
  },
1248
  {
1249
  "epoch": 0.86,
1250
+ "grad_norm": 86.92913330390785,
1251
  "learning_rate": 2.763773529814506e-08,
1252
+ "logits/chosen": -1.940159559249878,
1253
+ "logits/rejected": -1.9158337116241455,
1254
+ "logps/chosen": -282.51397705078125,
1255
+ "logps/rejected": -276.21331787109375,
1256
+ "loss": 0.372,
1257
+ "rewards/accuracies": 0.800000011920929,
1258
+ "rewards/chosen": -0.7603832483291626,
1259
+ "rewards/margins": 2.119266986846924,
1260
+ "rewards/rejected": -2.879650354385376,
1261
  "step": 750
1262
  },
1263
  {
1264
  "epoch": 0.88,
1265
+ "grad_norm": 95.56287400260709,
1266
  "learning_rate": 2.3222768526860698e-08,
1267
+ "logits/chosen": -1.863567590713501,
1268
+ "logits/rejected": -1.788558006286621,
1269
+ "logps/chosen": -249.87838745117188,
1270
+ "logps/rejected": -264.7607727050781,
1271
+ "loss": 0.3676,
1272
+ "rewards/accuracies": 0.824999988079071,
1273
+ "rewards/chosen": -0.6970678567886353,
1274
+ "rewards/margins": 2.2210423946380615,
1275
+ "rewards/rejected": -2.9181103706359863,
1276
  "step": 760
1277
  },
1278
  {
1279
  "epoch": 0.89,
1280
+ "grad_norm": 82.6081133840389,
1281
  "learning_rate": 1.9174694029115146e-08,
1282
+ "logits/chosen": -1.9369175434112549,
1283
+ "logits/rejected": -1.9106756448745728,
1284
+ "logps/chosen": -287.55975341796875,
1285
+ "logps/rejected": -268.37493896484375,
1286
+ "loss": 0.3621,
1287
+ "rewards/accuracies": 0.800000011920929,
1288
+ "rewards/chosen": -0.7068324685096741,
1289
+ "rewards/margins": 1.9774510860443115,
1290
+ "rewards/rejected": -2.684283494949341,
1291
  "step": 770
1292
  },
1293
  {
1294
  "epoch": 0.9,
1295
+ "grad_norm": 84.83905388303022,
1296
  "learning_rate": 1.5500060995258134e-08,
1297
+ "logits/chosen": -1.8579909801483154,
1298
+ "logits/rejected": -1.8068830966949463,
1299
+ "logps/chosen": -258.0039367675781,
1300
+ "logps/rejected": -259.26202392578125,
1301
+ "loss": 0.3456,
1302
+ "rewards/accuracies": 0.875,
1303
+ "rewards/chosen": -0.8524407148361206,
1304
+ "rewards/margins": 2.122145414352417,
1305
+ "rewards/rejected": -2.974586248397827,
1306
  "step": 780
1307
  },
1308
  {
1309
  "epoch": 0.91,
1310
+ "grad_norm": 94.72717106858794,
1311
  "learning_rate": 1.2204814442165812e-08,
1312
+ "logits/chosen": -1.8840267658233643,
1313
+ "logits/rejected": -1.8291581869125366,
1314
+ "logps/chosen": -252.4437713623047,
1315
+ "logps/rejected": -252.0703582763672,
1316
+ "loss": 0.3741,
1317
+ "rewards/accuracies": 0.824999988079071,
1318
+ "rewards/chosen": -0.8425480127334595,
1319
+ "rewards/margins": 2.2477831840515137,
1320
+ "rewards/rejected": -3.0903308391571045,
1321
  "step": 790
1322
  },
1323
  {
1324
  "epoch": 0.92,
1325
+ "grad_norm": 102.52909854244074,
1326
  "learning_rate": 9.294285595075669e-09,
1327
+ "logits/chosen": -1.9619266986846924,
1328
+ "logits/rejected": -1.904088020324707,
1329
+ "logps/chosen": -277.6653747558594,
1330
+ "logps/rejected": -272.3161315917969,
1331
+ "loss": 0.4069,
1332
+ "rewards/accuracies": 0.78125,
1333
+ "rewards/chosen": -0.8278266787528992,
1334
+ "rewards/margins": 2.0403804779052734,
1335
+ "rewards/rejected": -2.868206739425659,
1336
  "step": 800
1337
  },
1338
  {
1339
  "epoch": 0.92,
1340
+ "eval_logits/chosen": -1.9591288566589355,
1341
+ "eval_logits/rejected": -1.9204463958740234,
1342
+ "eval_logps/chosen": -322.0539245605469,
1343
+ "eval_logps/rejected": -380.2658386230469,
1344
+ "eval_loss": 0.20271854102611542,
1345
+ "eval_rewards/accuracies": 0.9140625,
1346
+ "eval_rewards/chosen": 0.6728801131248474,
1347
+ "eval_rewards/margins": 3.0308780670166016,
1348
+ "eval_rewards/rejected": -2.3579981327056885,
1349
+ "eval_runtime": 97.802,
1350
+ "eval_samples_per_second": 20.449,
1351
+ "eval_steps_per_second": 0.327,
1352
  "step": 800
1353
  },
1354
  {
1355
  "epoch": 0.93,
1356
+ "grad_norm": 85.0418535767566,
1357
  "learning_rate": 6.773183262446914e-09,
1358
+ "logits/chosen": -1.8587850332260132,
1359
+ "logits/rejected": -1.7795337438583374,
1360
+ "logps/chosen": -248.3004913330078,
1361
+ "logps/rejected": -261.2222595214844,
1362
+ "loss": 0.3824,
1363
+ "rewards/accuracies": 0.8125,
1364
+ "rewards/chosen": -0.7576299905776978,
1365
+ "rewards/margins": 2.073674440383911,
1366
+ "rewards/rejected": -2.8313040733337402,
1367
  "step": 810
1368
  },
1369
  {
1370
  "epoch": 0.94,
1371
+ "grad_norm": 88.3275931540908,
1372
  "learning_rate": 4.645586217799452e-09,
1373
+ "logits/chosen": -1.8911056518554688,
1374
+ "logits/rejected": -1.940203309059143,
1375
+ "logps/chosen": -265.9781799316406,
1376
+ "logps/rejected": -290.08770751953125,
1377
+ "loss": 0.3902,
1378
+ "rewards/accuracies": 0.8125,
1379
+ "rewards/chosen": -0.839026927947998,
1380
+ "rewards/margins": 2.1166014671325684,
1381
+ "rewards/rejected": -2.9556286334991455,
1382
  "step": 820
1383
  },
1384
  {
1385
  "epoch": 0.96,
1386
+ "grad_norm": 108.33767167817186,
1387
  "learning_rate": 2.9149366008568987e-09,
1388
+ "logits/chosen": -1.8930469751358032,
1389
+ "logits/rejected": -1.9079952239990234,
1390
+ "logps/chosen": -263.65264892578125,
1391
+ "logps/rejected": -278.3803405761719,
1392
+ "loss": 0.3825,
1393
+ "rewards/accuracies": 0.793749988079071,
1394
+ "rewards/chosen": -0.6759049296379089,
1395
+ "rewards/margins": 2.146210193634033,
1396
+ "rewards/rejected": -2.822114944458008,
1397
  "step": 830
1398
  },
1399
  {
1400
  "epoch": 0.97,
1401
+ "grad_norm": 86.51713108380295,
1402
  "learning_rate": 1.5840343486700215e-09,
1403
+ "logits/chosen": -1.9469448328018188,
1404
+ "logits/rejected": -1.939854383468628,
1405
+ "logps/chosen": -281.2010498046875,
1406
+ "logps/rejected": -275.5614318847656,
1407
+ "loss": 0.367,
1408
+ "rewards/accuracies": 0.8374999761581421,
1409
+ "rewards/chosen": -0.5235612392425537,
1410
+ "rewards/margins": 2.1837284564971924,
1411
+ "rewards/rejected": -2.707289457321167,
1412
  "step": 840
1413
  },
1414
  {
1415
  "epoch": 0.98,
1416
+ "grad_norm": 82.94960696998083,
1417
  "learning_rate": 6.550326657293881e-10,
1418
+ "logits/chosen": -1.9341312646865845,
1419
+ "logits/rejected": -1.901523232460022,
1420
+ "logps/chosen": -257.5312805175781,
1421
+ "logps/rejected": -268.84619140625,
1422
+ "loss": 0.346,
1423
+ "rewards/accuracies": 0.8812500238418579,
1424
+ "rewards/chosen": -0.7536182999610901,
1425
+ "rewards/margins": 2.499413251876831,
1426
+ "rewards/rejected": -3.2530312538146973,
1427
  "step": 850
1428
  },
1429
  {
1430
  "epoch": 0.99,
1431
+ "grad_norm": 100.59142277641878,
1432
  "learning_rate": 1.2943454039654467e-10,
1433
+ "logits/chosen": -1.821735143661499,
1434
+ "logits/rejected": -1.835100531578064,
1435
+ "logps/chosen": -244.0499267578125,
1436
+ "logps/rejected": -261.69805908203125,
1437
+ "loss": 0.3624,
1438
+ "rewards/accuracies": 0.800000011920929,
1439
+ "rewards/chosen": -1.0147624015808105,
1440
+ "rewards/margins": 1.7580705881118774,
1441
+ "rewards/rejected": -2.7728328704833984,
1442
  "step": 860
1443
  },
1444
  {
1445
  "epoch": 1.0,
1446
  "step": 868,
1447
  "total_flos": 0.0,
1448
+ "train_loss": 0.40559157427005504,
1449
+ "train_runtime": 13777.3263,
1450
+ "train_samples_per_second": 8.066,
1451
+ "train_steps_per_second": 0.063
1452
  }
1453
  ],
1454
  "logging_steps": 10,