wzhouad committed on
Commit
0673efe
1 Parent(s): b6e71a5

Model save

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0417
19
- - Rewards/chosen: -1.1011
20
- - Rewards/rejected: -1.9879
21
- - Rewards/accuracies: 0.7266
22
- - Rewards/margins: 0.8868
23
- - Logps/rejected: -528.9260
24
- - Logps/chosen: -433.9420
25
- - Logits/rejected: 0.8669
26
- - Logits/chosen: 0.7479
27
 
28
  ## Model description
29
 
@@ -45,7 +45,7 @@ The following hyperparameters were used during training:
45
  - learning_rate: 3e-06
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
- - seed: 4
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
@@ -60,10 +60,10 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.0521 | 0.21 | 100 | 0.0521 | -0.8681 | -1.4945 | 0.7109 | 0.6264 | -479.5833 | -410.6368 | 0.9218 | 0.7855 |
64
- | 0.0475 | 0.42 | 200 | 0.0601 | -0.7216 | -1.4421 | 0.7383 | 0.7205 | -474.3407 | -395.9902 | 0.5575 | 0.4386 |
65
- | 0.0476 | 0.63 | 300 | 0.0584 | -0.8286 | -1.5875 | 0.7227 | 0.7589 | -488.8869 | -406.6899 | 0.5327 | 0.4048 |
66
- | 0.0392 | 0.84 | 400 | 0.0417 | -1.1011 | -1.9879 | 0.7266 | 0.8868 | -528.9260 | -433.9420 | 0.8669 | 0.7479 |
67
 
68
 
69
  ### Framework versions
 
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0415
19
+ - Rewards/chosen: -1.1176
20
+ - Rewards/rejected: -2.0114
21
+ - Rewards/accuracies: 0.7070
22
+ - Rewards/margins: 0.8938
23
+ - Logps/rejected: -531.2747
24
+ - Logps/chosen: -435.5875
25
+ - Logits/rejected: 0.8196
26
+ - Logits/chosen: 0.7291
27
 
28
  ## Model description
29
 
 
45
  - learning_rate: 3e-06
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
+ - seed: 5
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
 
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.066 | 0.21 | 100 | 0.0702 | -0.4714 | -1.0800 | 0.7266 | 0.6086 | -438.1371 | -370.9747 | 0.7687 | 0.6183 |
64
+ | 0.0477 | 0.42 | 200 | 0.0505 | -1.0382 | -1.8566 | 0.7461 | 0.8184 | -515.7967 | -427.6501 | 0.5198 | 0.4181 |
65
+ | 0.0313 | 0.63 | 300 | 0.0344 | -1.3029 | -2.2224 | 0.7227 | 0.9195 | -552.3698 | -454.1193 | 1.0434 | 0.9401 |
66
+ | 0.0359 | 0.84 | 400 | 0.0415 | -1.1176 | -2.0114 | 0.7070 | 0.8938 | -531.2747 | -435.5875 | 0.8196 | 0.7291 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.05423464074699634,
4
- "train_runtime": 4545.6697,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 13.449,
7
- "train_steps_per_second": 0.105
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.055112330793584664,
4
+ "train_runtime": 4571.3444,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.373,
7
+ "train_steps_per_second": 0.104
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bba28a181d97753031ca76691f7e70603aeddef1cd3970f974728b30188d52fe
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77c78f44ae927b8c5f876cba766716862c391ff327d777f630df2273dc608ad2
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3634a8c506846675a8a7101c40445f227bb7af96b151c0ebfe45b1497d3a7ac
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52890ec8e3b01c2a425c75a5fe8026fad3760550ffe4ecc542adabcb6547e556
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97c9de524013be6975b5843fe7fd0bde5a216581cbe0bb10c068b4dc17cffc0b
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c4e28b526b64115f67f1a7d9ceb1156546b14ddfbf6c799c751ac2c949af93b
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c12a0bb4b9560bc67876bae213267e33acdf73e22432c92dd70443ce0038244e
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:331daeef21c9b60a293872df524529661446efaf2f056cc336b124cce438e3cb
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.05423464074699634,
4
- "train_runtime": 4545.6697,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 13.449,
7
- "train_steps_per_second": 0.105
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.055112330793584664,
4
+ "train_runtime": 4571.3444,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.373,
7
+ "train_steps_per_second": 0.104
8
  }
trainer_state.json CHANGED
@@ -11,11 +11,11 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 6.25e-08,
14
- "logits/chosen": 0.09203790873289108,
15
- "logits/rejected": 0.2914758026599884,
16
- "logps/chosen": -315.45611572265625,
17
- "logps/rejected": -241.00250244140625,
18
- "loss": 0.1409,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -25,733 +25,733 @@
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 6.25e-07,
28
- "logits/chosen": 0.2606273889541626,
29
- "logits/rejected": 0.33430540561676025,
30
- "logps/chosen": -323.0765686035156,
31
- "logps/rejected": -307.2400817871094,
32
- "loss": 0.1441,
33
- "rewards/accuracies": 0.4236111044883728,
34
- "rewards/chosen": -0.0016678691608831286,
35
- "rewards/margins": -7.814847049303353e-05,
36
- "rewards/rejected": -0.0015897207194939256,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 1.25e-06,
42
- "logits/chosen": 0.3121325671672821,
43
- "logits/rejected": 0.33961328864097595,
44
- "logps/chosen": -304.62060546875,
45
- "logps/rejected": -280.9560546875,
46
- "loss": 0.1425,
47
- "rewards/accuracies": 0.6312500238418579,
48
- "rewards/chosen": 0.0012181587517261505,
49
- "rewards/margins": 0.008560886606574059,
50
- "rewards/rejected": -0.007342727389186621,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 1.875e-06,
56
- "logits/chosen": 0.21220548450946808,
57
- "logits/rejected": 0.3127239942550659,
58
- "logps/chosen": -373.3235778808594,
59
- "logps/rejected": -329.6069030761719,
60
- "loss": 0.1491,
61
- "rewards/accuracies": 0.675000011920929,
62
- "rewards/chosen": 0.031112518161535263,
63
- "rewards/margins": 0.08273597061634064,
64
- "rewards/rejected": -0.05162344500422478,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 2.5e-06,
70
- "logits/chosen": 0.3977668881416321,
71
- "logits/rejected": 0.4907824397087097,
72
- "logps/chosen": -347.14422607421875,
73
- "logps/rejected": -337.64599609375,
74
- "loss": 0.1246,
75
- "rewards/accuracies": 0.6875,
76
- "rewards/chosen": -0.08021022379398346,
77
- "rewards/margins": 0.17033424973487854,
78
- "rewards/rejected": -0.2505444586277008,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 2.999839121261416e-06,
84
- "logits/chosen": 0.7560127973556519,
85
- "logits/rejected": 0.8282445073127747,
86
- "logps/chosen": -400.455322265625,
87
- "logps/rejected": -433.1502380371094,
88
- "loss": 0.0829,
89
- "rewards/accuracies": 0.7124999761581421,
90
- "rewards/chosen": -0.5089501142501831,
91
- "rewards/margins": 0.3683263659477234,
92
- "rewards/rejected": -0.8772764205932617,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 2.994211988057582e-06,
98
- "logits/chosen": 0.7812812328338623,
99
- "logits/rejected": 1.0137856006622314,
100
- "logps/chosen": -386.59674072265625,
101
- "logps/rejected": -442.959716796875,
102
- "loss": 0.0611,
103
- "rewards/accuracies": 0.643750011920929,
104
- "rewards/chosen": -0.805428683757782,
105
- "rewards/margins": 0.5730851292610168,
106
- "rewards/rejected": -1.3785139322280884,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 2.9805753939568693e-06,
112
- "logits/chosen": 0.810570240020752,
113
- "logits/rejected": 0.9147614240646362,
114
- "logps/chosen": -360.248291015625,
115
- "logps/rejected": -423.7164611816406,
116
- "loss": 0.0539,
117
- "rewards/accuracies": 0.6499999761581421,
118
- "rewards/chosen": -0.7509704828262329,
119
- "rewards/margins": 0.4445374608039856,
120
- "rewards/rejected": -1.1955080032348633,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 2.959002435526626e-06,
126
- "logits/chosen": 0.46960344910621643,
127
- "logits/rejected": 0.6186197996139526,
128
- "logps/chosen": -387.56683349609375,
129
- "logps/rejected": -440.8048400878906,
130
- "loss": 0.058,
131
- "rewards/accuracies": 0.71875,
132
- "rewards/chosen": -0.7455588579177856,
133
- "rewards/margins": 0.6929988265037537,
134
- "rewards/rejected": -1.438557744026184,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 2.929608750821129e-06,
140
- "logits/chosen": 0.49714404344558716,
141
- "logits/rejected": 0.7643166780471802,
142
- "logps/chosen": -387.7370300292969,
143
- "logps/rejected": -411.63250732421875,
144
- "loss": 0.051,
145
- "rewards/accuracies": 0.737500011920929,
146
- "rewards/chosen": -0.8395982980728149,
147
- "rewards/margins": 0.5891604423522949,
148
- "rewards/rejected": -1.4287587404251099,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 2.892551899524109e-06,
154
- "logits/chosen": 0.6043367981910706,
155
- "logits/rejected": 0.8246415853500366,
156
- "logps/chosen": -417.3805236816406,
157
- "logps/rejected": -413.0718688964844,
158
- "loss": 0.0521,
159
- "rewards/accuracies": 0.6499999761581421,
160
- "rewards/chosen": -0.8979790806770325,
161
- "rewards/margins": 0.3839009404182434,
162
- "rewards/rejected": -1.2818800210952759,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
- "eval_logits/chosen": 0.7855331897735596,
168
- "eval_logits/rejected": 0.9217743873596191,
169
- "eval_logps/chosen": -410.63677978515625,
170
- "eval_logps/rejected": -479.5832824707031,
171
- "eval_loss": 0.05205187946557999,
172
- "eval_rewards/accuracies": 0.7109375,
173
- "eval_rewards/chosen": -0.8680679798126221,
174
- "eval_rewards/margins": 0.6264181137084961,
175
- "eval_rewards/rejected": -1.4944860935211182,
176
- "eval_runtime": 73.9092,
177
- "eval_samples_per_second": 27.06,
178
- "eval_steps_per_second": 0.433,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 2.848030518377739e-06,
184
- "logits/chosen": 0.4907767176628113,
185
- "logits/rejected": 0.6656876802444458,
186
- "logps/chosen": -397.6432189941406,
187
- "logps/rejected": -422.59735107421875,
188
- "loss": 0.0651,
189
- "rewards/accuracies": 0.6937500238418579,
190
- "rewards/chosen": -0.6896931529045105,
191
- "rewards/margins": 0.4766135811805725,
192
- "rewards/rejected": -1.166306734085083,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 2.7962832564252724e-06,
198
- "logits/chosen": 0.39288032054901123,
199
- "logits/rejected": 0.555514931678772,
200
- "logps/chosen": -435.5804138183594,
201
- "logps/rejected": -464.86651611328125,
202
- "loss": 0.0483,
203
- "rewards/accuracies": 0.675000011920929,
204
- "rewards/chosen": -0.8776789903640747,
205
- "rewards/margins": 0.542883038520813,
206
- "rewards/rejected": -1.4205620288848877,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 2.7375874957747644e-06,
212
- "logits/chosen": 0.7341902256011963,
213
- "logits/rejected": 0.9476861953735352,
214
- "logps/chosen": -487.042236328125,
215
- "logps/rejected": -509.67510986328125,
216
- "loss": 0.0303,
217
- "rewards/accuracies": 0.6499999761581421,
218
- "rewards/chosen": -1.6147487163543701,
219
- "rewards/margins": 0.5281225442886353,
220
- "rewards/rejected": -2.142871379852295,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 2.672257864741005e-06,
226
- "logits/chosen": 0.7316943407058716,
227
- "logits/rejected": 0.9424102902412415,
228
- "logps/chosen": -451.3330078125,
229
- "logps/rejected": -464.85601806640625,
230
- "loss": 0.0309,
231
- "rewards/accuracies": 0.6625000238418579,
232
- "rewards/chosen": -1.3325563669204712,
233
- "rewards/margins": 0.5245406031608582,
234
- "rewards/rejected": -1.8570966720581055,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 2.600644551335706e-06,
240
- "logits/chosen": 0.6526888608932495,
241
- "logits/rejected": 0.8180145025253296,
242
- "logps/chosen": -443.1895446777344,
243
- "logps/rejected": -462.8304748535156,
244
- "loss": 0.0465,
245
- "rewards/accuracies": 0.668749988079071,
246
- "rewards/chosen": -0.9286714792251587,
247
- "rewards/margins": 0.5423834919929504,
248
- "rewards/rejected": -1.471055030822754,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 2.5231314261461732e-06,
254
- "logits/chosen": 0.5461139678955078,
255
- "logits/rejected": 0.7980540990829468,
256
- "logps/chosen": -441.15704345703125,
257
- "logps/rejected": -453.66033935546875,
258
- "loss": 0.0528,
259
- "rewards/accuracies": 0.675000011920929,
260
- "rewards/chosen": -0.9410704374313354,
261
- "rewards/margins": 0.6425323486328125,
262
- "rewards/rejected": -1.5836029052734375,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 2.440133984664454e-06,
268
- "logits/chosen": 0.315818727016449,
269
- "logits/rejected": 0.576252281665802,
270
- "logps/chosen": -468.5994567871094,
271
- "logps/rejected": -481.32818603515625,
272
- "loss": 0.0531,
273
- "rewards/accuracies": 0.6875,
274
- "rewards/chosen": -0.8597052693367004,
275
- "rewards/margins": 0.47268643975257874,
276
- "rewards/rejected": -1.3323917388916016,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 2.3520971200967337e-06,
282
- "logits/chosen": 0.44164711236953735,
283
- "logits/rejected": 0.6109380125999451,
284
- "logps/chosen": -452.79473876953125,
285
- "logps/rejected": -522.9495849609375,
286
- "loss": 0.04,
287
- "rewards/accuracies": 0.668749988079071,
288
- "rewards/chosen": -1.0779365301132202,
289
- "rewards/margins": 0.8940714597702026,
290
- "rewards/rejected": -1.9720081090927124,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 2.2594927385914546e-06,
296
- "logits/chosen": 0.341614305973053,
297
- "logits/rejected": 0.5234003067016602,
298
- "logps/chosen": -462.4978942871094,
299
- "logps/rejected": -528.377685546875,
300
- "loss": 0.0362,
301
- "rewards/accuracies": 0.706250011920929,
302
- "rewards/chosen": -1.309615135192871,
303
- "rewards/margins": 0.7517553567886353,
304
- "rewards/rejected": -2.061370372772217,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 2.1628172296692954e-06,
310
- "logits/chosen": 0.40094342827796936,
311
- "logits/rejected": 0.5750107169151306,
312
- "logps/chosen": -412.12445068359375,
313
- "logps/rejected": -439.3114318847656,
314
- "loss": 0.0475,
315
- "rewards/accuracies": 0.65625,
316
- "rewards/chosen": -0.915791392326355,
317
- "rewards/margins": 0.6095176935195923,
318
- "rewards/rejected": -1.5253090858459473,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
- "eval_logits/chosen": 0.4385632872581482,
324
- "eval_logits/rejected": 0.5574513077735901,
325
- "eval_logps/chosen": -395.9902038574219,
326
- "eval_logps/rejected": -474.3407287597656,
327
- "eval_loss": 0.06009303405880928,
328
- "eval_rewards/accuracies": 0.73828125,
329
- "eval_rewards/chosen": -0.7216026782989502,
330
- "eval_rewards/margins": 0.7204576134681702,
331
- "eval_rewards/rejected": -1.442060112953186,
332
- "eval_runtime": 73.4554,
333
- "eval_samples_per_second": 27.227,
334
- "eval_steps_per_second": 0.436,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 2.062588805414343e-06,
340
- "logits/chosen": 0.45482128858566284,
341
- "logits/rejected": 0.6150248050689697,
342
- "logps/chosen": -403.6705322265625,
343
- "logps/rejected": -426.13140869140625,
344
- "loss": 0.0621,
345
- "rewards/accuracies": 0.6812499761581421,
346
- "rewards/chosen": -0.8358365297317505,
347
- "rewards/margins": 0.547526478767395,
348
- "rewards/rejected": -1.3833630084991455,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 1.9593447226892386e-06,
354
- "logits/chosen": 0.604540228843689,
355
- "logits/rejected": 0.8555408716201782,
356
- "logps/chosen": -436.47479248046875,
357
- "logps/rejected": -434.32855224609375,
358
- "loss": 0.0508,
359
- "rewards/accuracies": 0.668749988079071,
360
- "rewards/chosen": -0.9561892747879028,
361
- "rewards/margins": 0.5492093563079834,
362
- "rewards/rejected": -1.5053986310958862,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 1.853638403264141e-06,
368
- "logits/chosen": 0.6232072114944458,
369
- "logits/rejected": 0.8155434727668762,
370
- "logps/chosen": -421.7066955566406,
371
- "logps/rejected": -488.530517578125,
372
- "loss": 0.0455,
373
  "rewards/accuracies": 0.7250000238418579,
374
- "rewards/chosen": -0.9787136912345886,
375
- "rewards/margins": 0.787739098072052,
376
- "rewards/rejected": -1.7664527893066406,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 1.7460364672965328e-06,
382
- "logits/chosen": 0.45760011672973633,
383
- "logits/rejected": 0.7767106890678406,
384
- "logps/chosen": -487.28802490234375,
385
- "logps/rejected": -456.274658203125,
386
- "loss": 0.0492,
387
- "rewards/accuracies": 0.6937500238418579,
388
- "rewards/chosen": -0.8502359390258789,
389
- "rewards/margins": 0.6182385683059692,
390
- "rewards/rejected": -1.4684743881225586,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 1.637115696063402e-06,
396
- "logits/chosen": 0.4637879431247711,
397
- "logits/rejected": 0.684526801109314,
398
- "logps/chosen": -422.9723205566406,
399
- "logps/rejected": -474.6338806152344,
400
- "loss": 0.0418,
401
- "rewards/accuracies": 0.7437499761581421,
402
- "rewards/chosen": -0.8930182456970215,
403
- "rewards/margins": 0.8561725616455078,
404
- "rewards/rejected": -1.7491906881332397,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 1.5274599402265162e-06,
410
- "logits/chosen": 0.455331027507782,
411
- "logits/rejected": 0.6168816089630127,
412
- "logps/chosen": -440.8057556152344,
413
- "logps/rejected": -504.82940673828125,
414
- "loss": 0.0426,
415
- "rewards/accuracies": 0.7250000238418579,
416
- "rewards/chosen": -1.1558371782302856,
417
- "rewards/margins": 0.8892456889152527,
418
- "rewards/rejected": -2.0450828075408936,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.57,
423
  "learning_rate": 1.4176569902035088e-06,
424
- "logits/chosen": 0.4234936833381653,
425
- "logits/rejected": 0.5202213525772095,
426
- "logps/chosen": -428.471923828125,
427
- "logps/rejected": -493.4828186035156,
428
- "loss": 0.0394,
429
- "rewards/accuracies": 0.731249988079071,
430
- "rewards/chosen": -1.2287781238555908,
431
- "rewards/margins": 0.7696730494499207,
432
- "rewards/rejected": -1.9984509944915771,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 1.308295425420593e-06,
438
- "logits/chosen": 0.3574947118759155,
439
- "logits/rejected": 0.5266100168228149,
440
- "logps/chosen": -484.1373596191406,
441
- "logps/rejected": -514.14794921875,
442
- "loss": 0.0387,
443
- "rewards/accuracies": 0.706250011920929,
444
- "rewards/chosen": -1.3580681085586548,
445
- "rewards/margins": 0.7606993317604065,
446
- "rewards/rejected": -2.118767261505127,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 1.1999614593359337e-06,
452
- "logits/chosen": 0.3510825037956238,
453
- "logits/rejected": 0.566794753074646,
454
- "logps/chosen": -445.3075256347656,
455
- "logps/rejected": -521.5965576171875,
456
- "loss": 0.0467,
457
- "rewards/accuracies": 0.7124999761581421,
458
- "rewards/chosen": -0.9900503158569336,
459
- "rewards/margins": 0.8154880404472351,
460
- "rewards/rejected": -1.8055381774902344,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.0932357971453745e-06,
466
- "logits/chosen": 0.32849544286727905,
467
- "logits/rejected": 0.4735318720340729,
468
- "logps/chosen": -399.39764404296875,
469
- "logps/rejected": -470.449951171875,
470
- "loss": 0.0476,
471
- "rewards/accuracies": 0.737500011920929,
472
- "rewards/chosen": -0.8132196664810181,
473
- "rewards/margins": 0.7410646080970764,
474
- "rewards/rejected": -1.5542842149734497,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
- "eval_logits/chosen": 0.4048309922218323,
480
- "eval_logits/rejected": 0.5326845049858093,
481
- "eval_logps/chosen": -406.68994140625,
482
- "eval_logps/rejected": -488.8869323730469,
483
- "eval_loss": 0.05840897932648659,
484
  "eval_rewards/accuracies": 0.72265625,
485
- "eval_rewards/chosen": -0.8285996913909912,
486
- "eval_rewards/margins": 0.7589226961135864,
487
- "eval_rewards/rejected": -1.5875223875045776,
488
- "eval_runtime": 74.4017,
489
- "eval_samples_per_second": 26.881,
490
- "eval_steps_per_second": 0.43,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 9.886905230142433e-07,
496
- "logits/chosen": 0.36471518874168396,
497
- "logits/rejected": 0.5999516844749451,
498
- "logps/chosen": -382.37371826171875,
499
- "logps/rejected": -438.2478942871094,
500
- "loss": 0.0526,
501
- "rewards/accuracies": 0.731249988079071,
502
- "rewards/chosen": -0.8080012202262878,
503
- "rewards/margins": 0.687478244304657,
504
- "rewards/rejected": -1.4954793453216553,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 8.868860335206678e-07,
510
- "logits/chosen": 0.4309239387512207,
511
- "logits/rejected": 0.6108436584472656,
512
- "logps/chosen": -416.4072265625,
513
- "logps/rejected": -505.565185546875,
514
- "loss": 0.0425,
515
- "rewards/accuracies": 0.800000011920929,
516
- "rewards/chosen": -1.0292515754699707,
517
- "rewards/margins": 1.0127887725830078,
518
- "rewards/rejected": -2.0420401096343994,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 7.883680337481599e-07,
524
- "logits/chosen": 0.5084182620048523,
525
- "logits/rejected": 0.5882959961891174,
526
- "logps/chosen": -483.5926818847656,
527
- "logps/rejected": -550.4625854492188,
528
- "loss": 0.0389,
529
- "rewards/accuracies": 0.706250011920929,
530
- "rewards/chosen": -1.4037799835205078,
531
- "rewards/margins": 0.7988616228103638,
532
- "rewards/rejected": -2.202641725540161,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 6.936646121293654e-07,
538
- "logits/chosen": 0.5944998860359192,
539
- "logits/rejected": 0.7185046076774597,
540
- "logps/chosen": -417.82891845703125,
541
- "logps/rejected": -470.05816650390625,
542
- "loss": 0.0373,
543
  "rewards/accuracies": 0.706250011920929,
544
- "rewards/chosen": -1.1171300411224365,
545
- "rewards/margins": 0.692410945892334,
546
- "rewards/rejected": -1.80954110622406,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 6.032834097207889e-07,
552
- "logits/chosen": 0.5707942247390747,
553
- "logits/rejected": 0.6327140927314758,
554
- "logps/chosen": -440.57830810546875,
555
- "logps/rejected": -524.21337890625,
556
- "loss": 0.0426,
557
- "rewards/accuracies": 0.6625000238418579,
558
- "rewards/chosen": -1.2543996572494507,
559
- "rewards/margins": 0.7487698793411255,
560
- "rewards/rejected": -2.003169536590576,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 5.177088990820725e-07,
566
- "logits/chosen": 0.5283172130584717,
567
- "logits/rejected": 0.70453280210495,
568
- "logps/chosen": -436.8048400878906,
569
- "logps/rejected": -522.0552978515625,
570
- "loss": 0.0438,
571
- "rewards/accuracies": 0.71875,
572
- "rewards/chosen": -1.0285941362380981,
573
- "rewards/margins": 0.8370414972305298,
574
- "rewards/rejected": -1.865635633468628,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 4.3739978734594494e-07,
580
- "logits/chosen": 0.5157877206802368,
581
- "logits/rejected": 0.7344454526901245,
582
- "logps/chosen": -460.63690185546875,
583
- "logps/rejected": -486.64312744140625,
584
- "loss": 0.0446,
585
- "rewards/accuracies": 0.71875,
586
- "rewards/chosen": -1.1092673540115356,
587
- "rewards/margins": 0.6609092950820923,
588
- "rewards/rejected": -1.7701762914657593,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.8,
593
  "learning_rate": 3.627865573992087e-07,
594
- "logits/chosen": 0.5912496447563171,
595
- "logits/rejected": 0.6915227770805359,
596
- "logps/chosen": -406.13079833984375,
597
- "logps/rejected": -485.49761962890625,
598
- "loss": 0.0426,
599
- "rewards/accuracies": 0.6875,
600
- "rewards/chosen": -1.1371818780899048,
601
- "rewards/margins": 0.7529508471488953,
602
- "rewards/rejected": -1.8901325464248657,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 2.9426916035484166e-07,
608
- "logits/chosen": 0.6277132034301758,
609
- "logits/rejected": 0.8218109011650085,
610
- "logps/chosen": -414.68621826171875,
611
- "logps/rejected": -475.742919921875,
612
- "loss": 0.0455,
613
- "rewards/accuracies": 0.6625000238418579,
614
- "rewards/chosen": -1.0436302423477173,
615
- "rewards/margins": 0.6562752723693848,
616
- "rewards/rejected": -1.6999053955078125,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 2.322148716843081e-07,
622
- "logits/chosen": 0.7061805725097656,
623
- "logits/rejected": 0.7324530482292175,
624
- "logps/chosen": -435.91534423828125,
625
- "logps/rejected": -503.77325439453125,
626
- "loss": 0.0392,
627
- "rewards/accuracies": 0.7124999761581421,
628
- "rewards/chosen": -1.2680333852767944,
629
- "rewards/margins": 0.7200425267219543,
630
- "rewards/rejected": -1.9880759716033936,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
- "eval_logits/chosen": 0.7478917241096497,
636
- "eval_logits/rejected": 0.8668873310089111,
637
- "eval_logps/chosen": -433.94195556640625,
638
- "eval_logps/rejected": -528.926025390625,
639
- "eval_loss": 0.04172799736261368,
640
- "eval_rewards/accuracies": 0.7265625,
641
- "eval_rewards/chosen": -1.1011202335357666,
642
- "eval_rewards/margins": 0.8867928981781006,
643
- "eval_rewards/rejected": -1.9879131317138672,
644
- "eval_runtime": 72.7813,
645
- "eval_samples_per_second": 27.48,
646
- "eval_steps_per_second": 0.44,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 1.7695632250191002e-07,
652
- "logits/chosen": 0.5754364728927612,
653
- "logits/rejected": 0.834900975227356,
654
- "logps/chosen": -481.41033935546875,
655
- "logps/rejected": -517.2847290039062,
656
- "loss": 0.0392,
657
  "rewards/accuracies": 0.6875,
658
- "rewards/chosen": -1.2066195011138916,
659
- "rewards/margins": 0.7020702958106995,
660
- "rewards/rejected": -1.9086897373199463,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 1.2878971655412515e-07,
666
- "logits/chosen": 0.5507728457450867,
667
- "logits/rejected": 0.6630374193191528,
668
- "logps/chosen": -465.51812744140625,
669
- "logps/rejected": -536.48974609375,
670
- "loss": 0.0414,
671
- "rewards/accuracies": 0.7124999761581421,
672
- "rewards/chosen": -1.2821763753890991,
673
- "rewards/margins": 0.6950958967208862,
674
- "rewards/rejected": -1.977272391319275,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 8.797324247145411e-08,
680
- "logits/chosen": 0.5740979313850403,
681
- "logits/rejected": 0.8506999015808105,
682
- "logps/chosen": -438.9664611816406,
683
- "logps/rejected": -447.83563232421875,
684
- "loss": 0.0393,
685
- "rewards/accuracies": 0.706250011920929,
686
- "rewards/chosen": -1.1305776834487915,
687
- "rewards/margins": 0.7222962975502014,
688
- "rewards/rejected": -1.8528740406036377,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 5.472568979361853e-08,
694
- "logits/chosen": 0.45752525329589844,
695
- "logits/rejected": 0.7472774386405945,
696
- "logps/chosen": -464.6128845214844,
697
- "logps/rejected": -493.8684997558594,
698
- "loss": 0.0451,
699
- "rewards/accuracies": 0.706250011920929,
700
- "rewards/chosen": -1.2262476682662964,
701
- "rewards/margins": 0.7051091194152832,
702
- "rewards/rejected": -1.9313567876815796,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 2.922527618666465e-08,
708
- "logits/chosen": 0.5190210938453674,
709
- "logits/rejected": 0.6143854856491089,
710
- "logps/chosen": -426.7154235839844,
711
- "logps/rejected": -538.7984619140625,
712
- "loss": 0.0438,
713
- "rewards/accuracies": 0.731249988079071,
714
- "rewards/chosen": -1.0999343395233154,
715
- "rewards/margins": 0.9706279039382935,
716
- "rewards/rejected": -2.0705626010894775,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 1.1608692138469379e-08,
722
- "logits/chosen": 0.548923134803772,
723
- "logits/rejected": 0.786422610282898,
724
- "logps/chosen": -473.91448974609375,
725
- "logps/rejected": -503.9942321777344,
726
- "loss": 0.0436,
727
- "rewards/accuracies": 0.75,
728
- "rewards/chosen": -1.1051746606826782,
729
- "rewards/margins": 0.7683829069137573,
730
- "rewards/rejected": -1.873557686805725,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 1.970368253390198e-09,
736
- "logits/chosen": 0.5904209017753601,
737
- "logits/rejected": 0.7317419052124023,
738
- "logps/chosen": -472.50244140625,
739
- "logps/rejected": -537.5592041015625,
740
- "loss": 0.046,
741
- "rewards/accuracies": 0.762499988079071,
742
- "rewards/chosen": -1.2209278345108032,
743
- "rewards/margins": 0.8498128056526184,
744
- "rewards/rejected": -2.0707404613494873,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 477,
750
  "total_flos": 0.0,
751
- "train_loss": 0.05423464074699634,
752
- "train_runtime": 4545.6697,
753
- "train_samples_per_second": 13.449,
754
- "train_steps_per_second": 0.105
755
  }
756
  ],
757
  "logging_steps": 10,
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 6.25e-08,
14
+ "logits/chosen": 0.10802720487117767,
15
+ "logits/rejected": 0.30745893716812134,
16
+ "logps/chosen": -475.5745544433594,
17
+ "logps/rejected": -317.21234130859375,
18
+ "loss": 0.1378,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 6.25e-07,
28
+ "logits/chosen": 0.21480141580104828,
29
+ "logits/rejected": 0.3137889802455902,
30
+ "logps/chosen": -308.09619140625,
31
+ "logps/rejected": -308.85736083984375,
32
+ "loss": 0.1432,
33
+ "rewards/accuracies": 0.4791666567325592,
34
+ "rewards/chosen": -0.0008134886738844216,
35
+ "rewards/margins": 0.0006454013055190444,
36
+ "rewards/rejected": -0.001458889921195805,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 1.25e-06,
42
+ "logits/chosen": 0.249754399061203,
43
+ "logits/rejected": 0.2825905978679657,
44
+ "logps/chosen": -304.75286865234375,
45
+ "logps/rejected": -317.61688232421875,
46
+ "loss": 0.1418,
47
+ "rewards/accuracies": 0.6625000238418579,
48
+ "rewards/chosen": 0.0010095896432176232,
49
+ "rewards/margins": 0.010475357994437218,
50
+ "rewards/rejected": -0.009465768001973629,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 1.875e-06,
56
+ "logits/chosen": 0.24968624114990234,
57
+ "logits/rejected": 0.2685222029685974,
58
+ "logps/chosen": -366.27813720703125,
59
+ "logps/rejected": -365.3521728515625,
60
+ "loss": 0.1431,
61
+ "rewards/accuracies": 0.6312500238418579,
62
+ "rewards/chosen": 0.014242827892303467,
63
+ "rewards/margins": 0.06069143861532211,
64
+ "rewards/rejected": -0.046448610723018646,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 2.5e-06,
70
+ "logits/chosen": 0.5138859748840332,
71
+ "logits/rejected": 0.6031057238578796,
72
+ "logps/chosen": -333.85650634765625,
73
+ "logps/rejected": -331.0009765625,
74
+ "loss": 0.1181,
75
+ "rewards/accuracies": 0.6187499761581421,
76
+ "rewards/chosen": -0.17486190795898438,
77
+ "rewards/margins": 0.1082921177148819,
78
+ "rewards/rejected": -0.2831540107727051,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 2.999839121261416e-06,
84
+ "logits/chosen": 0.7348484992980957,
85
+ "logits/rejected": 0.8855365514755249,
86
+ "logps/chosen": -370.4933776855469,
87
+ "logps/rejected": -411.83404541015625,
88
+ "loss": 0.0741,
89
+ "rewards/accuracies": 0.6875,
90
+ "rewards/chosen": -0.4408305287361145,
91
+ "rewards/margins": 0.4697234034538269,
92
+ "rewards/rejected": -0.9105539321899414,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 2.994211988057582e-06,
98
+ "logits/chosen": 0.7168207764625549,
99
+ "logits/rejected": 0.8200086355209351,
100
+ "logps/chosen": -341.53277587890625,
101
+ "logps/rejected": -380.68243408203125,
102
+ "loss": 0.0819,
103
+ "rewards/accuracies": 0.6937500238418579,
104
+ "rewards/chosen": -0.4272558093070984,
105
+ "rewards/margins": 0.4549214839935303,
106
+ "rewards/rejected": -0.8821773529052734,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 2.9805753939568693e-06,
112
+ "logits/chosen": 0.5615164041519165,
113
+ "logits/rejected": 0.7741672396659851,
114
+ "logps/chosen": -347.7218017578125,
115
+ "logps/rejected": -330.172607421875,
116
+ "loss": 0.0929,
117
+ "rewards/accuracies": 0.6625000238418579,
118
+ "rewards/chosen": -0.3559855818748474,
119
+ "rewards/margins": 0.2854944169521332,
120
+ "rewards/rejected": -0.6414799690246582,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 2.959002435526626e-06,
126
+ "logits/chosen": 0.5198915004730225,
127
+ "logits/rejected": 0.725387454032898,
128
+ "logps/chosen": -389.0698547363281,
129
+ "logps/rejected": -371.3795471191406,
130
+ "loss": 0.0736,
131
+ "rewards/accuracies": 0.6937500238418579,
132
+ "rewards/chosen": -0.4846402108669281,
133
+ "rewards/margins": 0.395100474357605,
134
+ "rewards/rejected": -0.8797407150268555,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 2.929608750821129e-06,
140
+ "logits/chosen": 0.3736918568611145,
141
+ "logits/rejected": 0.5658319592475891,
142
+ "logps/chosen": -444.59234619140625,
143
+ "logps/rejected": -464.6935119628906,
144
+ "loss": 0.0491,
145
+ "rewards/accuracies": 0.699999988079071,
146
+ "rewards/chosen": -0.8689848184585571,
147
+ "rewards/margins": 0.6035453081130981,
148
+ "rewards/rejected": -1.4725301265716553,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 2.892551899524109e-06,
154
+ "logits/chosen": 0.3380030393600464,
155
+ "logits/rejected": 0.443446546792984,
156
+ "logps/chosen": -408.71551513671875,
157
+ "logps/rejected": -431.513671875,
158
+ "loss": 0.066,
159
+ "rewards/accuracies": 0.643750011920929,
160
+ "rewards/chosen": -0.8214343786239624,
161
+ "rewards/margins": 0.3987075388431549,
162
+ "rewards/rejected": -1.2201420068740845,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
+ "eval_logits/chosen": 0.6183323860168457,
168
+ "eval_logits/rejected": 0.7686768174171448,
169
+ "eval_logps/chosen": -370.9747009277344,
170
+ "eval_logps/rejected": -438.13714599609375,
171
+ "eval_loss": 0.07016688585281372,
172
+ "eval_rewards/accuracies": 0.7265625,
173
+ "eval_rewards/chosen": -0.47144782543182373,
174
+ "eval_rewards/margins": 0.6085766553878784,
175
+ "eval_rewards/rejected": -1.0800243616104126,
176
+ "eval_runtime": 74.3034,
177
+ "eval_samples_per_second": 26.917,
178
+ "eval_steps_per_second": 0.431,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 2.848030518377739e-06,
184
+ "logits/chosen": 0.48754867911338806,
185
+ "logits/rejected": 0.6056569814682007,
186
+ "logps/chosen": -394.04449462890625,
187
+ "logps/rejected": -424.449951171875,
188
+ "loss": 0.06,
189
+ "rewards/accuracies": 0.6499999761581421,
190
+ "rewards/chosen": -0.6658821105957031,
191
+ "rewards/margins": 0.43674975633621216,
192
+ "rewards/rejected": -1.1026318073272705,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 2.7962832564252724e-06,
198
+ "logits/chosen": 0.5436107516288757,
199
+ "logits/rejected": 0.6737319231033325,
200
+ "logps/chosen": -429.415283203125,
201
+ "logps/rejected": -469.0088806152344,
202
+ "loss": 0.0627,
203
+ "rewards/accuracies": 0.6499999761581421,
204
+ "rewards/chosen": -0.7700729370117188,
205
+ "rewards/margins": 0.48356789350509644,
206
+ "rewards/rejected": -1.2536407709121704,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 2.7375874957747644e-06,
212
+ "logits/chosen": 0.5728715062141418,
213
+ "logits/rejected": 0.7463508248329163,
214
+ "logps/chosen": -441.0868225097656,
215
+ "logps/rejected": -454.98748779296875,
216
+ "loss": 0.0621,
217
+ "rewards/accuracies": 0.7124999761581421,
218
+ "rewards/chosen": -0.8495699763298035,
219
+ "rewards/margins": 0.5289269685745239,
220
+ "rewards/rejected": -1.3784968852996826,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 2.672257864741005e-06,
226
+ "logits/chosen": 0.6253047585487366,
227
+ "logits/rejected": 0.786455512046814,
228
+ "logps/chosen": -433.4244079589844,
229
+ "logps/rejected": -461.5254821777344,
230
+ "loss": 0.0435,
231
+ "rewards/accuracies": 0.706250011920929,
232
+ "rewards/chosen": -1.083187460899353,
233
+ "rewards/margins": 0.5303990840911865,
234
+ "rewards/rejected": -1.61358642578125,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 2.600644551335706e-06,
240
+ "logits/chosen": 0.7765518426895142,
241
+ "logits/rejected": 0.984174907207489,
242
+ "logps/chosen": -419.31109619140625,
243
+ "logps/rejected": -431.96795654296875,
244
+ "loss": 0.0444,
245
+ "rewards/accuracies": 0.637499988079071,
246
+ "rewards/chosen": -1.0778591632843018,
247
+ "rewards/margins": 0.4318017363548279,
248
+ "rewards/rejected": -1.5096609592437744,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 2.5231314261461732e-06,
254
+ "logits/chosen": 0.513221025466919,
255
+ "logits/rejected": 0.7459092140197754,
256
+ "logps/chosen": -418.07421875,
257
+ "logps/rejected": -463.25408935546875,
258
+ "loss": 0.0586,
259
+ "rewards/accuracies": 0.706250011920929,
260
+ "rewards/chosen": -0.6770002245903015,
261
+ "rewards/margins": 0.5035561323165894,
262
+ "rewards/rejected": -1.1805565357208252,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 2.440133984664454e-06,
268
+ "logits/chosen": 0.5670315027236938,
269
+ "logits/rejected": 0.8073333501815796,
270
+ "logps/chosen": -390.5821228027344,
271
+ "logps/rejected": -419.92626953125,
272
+ "loss": 0.0562,
273
+ "rewards/accuracies": 0.706250011920929,
274
+ "rewards/chosen": -0.8130921125411987,
275
+ "rewards/margins": 0.4765067994594574,
276
+ "rewards/rejected": -1.289598822593689,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 2.3520971200967337e-06,
282
+ "logits/chosen": 0.39020082354545593,
283
+ "logits/rejected": 0.4927116334438324,
284
+ "logps/chosen": -379.1041259765625,
285
+ "logps/rejected": -440.0082092285156,
286
+ "loss": 0.0533,
287
+ "rewards/accuracies": 0.6187499761581421,
288
+ "rewards/chosen": -0.8111687898635864,
289
+ "rewards/margins": 0.5016359090805054,
290
+ "rewards/rejected": -1.3128045797348022,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 2.2594927385914546e-06,
296
+ "logits/chosen": 0.32924190163612366,
297
+ "logits/rejected": 0.46087831258773804,
298
+ "logps/chosen": -382.1633605957031,
299
+ "logps/rejected": -444.0999450683594,
300
+ "loss": 0.0495,
301
+ "rewards/accuracies": 0.7124999761581421,
302
+ "rewards/chosen": -0.7652384042739868,
303
+ "rewards/margins": 0.6649683117866516,
304
+ "rewards/rejected": -1.4302066564559937,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 2.1628172296692954e-06,
310
+ "logits/chosen": 0.21413707733154297,
311
+ "logits/rejected": 0.302509069442749,
312
+ "logps/chosen": -465.3833923339844,
313
+ "logps/rejected": -511.8447265625,
314
+ "loss": 0.0477,
315
+ "rewards/accuracies": 0.7124999761581421,
316
+ "rewards/chosen": -1.172499179840088,
317
+ "rewards/margins": 0.573866069316864,
318
+ "rewards/rejected": -1.7463653087615967,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
+ "eval_logits/chosen": 0.41806796193122864,
324
+ "eval_logits/rejected": 0.5197638273239136,
325
+ "eval_logps/chosen": -427.650146484375,
326
+ "eval_logps/rejected": -515.7966918945312,
327
+ "eval_loss": 0.050458863377571106,
328
+ "eval_rewards/accuracies": 0.74609375,
329
+ "eval_rewards/chosen": -1.038202166557312,
330
+ "eval_rewards/margins": 0.8184179663658142,
331
+ "eval_rewards/rejected": -1.856619954109192,
332
+ "eval_runtime": 75.1858,
333
+ "eval_samples_per_second": 26.601,
334
+ "eval_steps_per_second": 0.426,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 2.062588805414343e-06,
340
+ "logits/chosen": 0.29592061042785645,
341
+ "logits/rejected": 0.39124542474746704,
342
+ "logps/chosen": -458.99554443359375,
343
+ "logps/rejected": -476.7998046875,
344
+ "loss": 0.0543,
345
+ "rewards/accuracies": 0.706250011920929,
346
+ "rewards/chosen": -1.0953991413116455,
347
+ "rewards/margins": 0.6356866955757141,
348
+ "rewards/rejected": -1.731086015701294,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 1.9593447226892386e-06,
354
+ "logits/chosen": 0.23310557007789612,
355
+ "logits/rejected": 0.4742186963558197,
356
+ "logps/chosen": -441.21649169921875,
357
+ "logps/rejected": -468.25286865234375,
358
+ "loss": 0.0599,
359
+ "rewards/accuracies": 0.71875,
360
+ "rewards/chosen": -0.9769255518913269,
361
+ "rewards/margins": 0.7468104362487793,
362
+ "rewards/rejected": -1.723736047744751,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 1.853638403264141e-06,
368
+ "logits/chosen": 0.4100280702114105,
369
+ "logits/rejected": 0.5993035435676575,
370
+ "logps/chosen": -494.64324951171875,
371
+ "logps/rejected": -490.0165100097656,
372
+ "loss": 0.0578,
373
  "rewards/accuracies": 0.7250000238418579,
374
+ "rewards/chosen": -1.2230786085128784,
375
+ "rewards/margins": 0.6530172824859619,
376
+ "rewards/rejected": -1.8760957717895508,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 1.7460364672965328e-06,
382
+ "logits/chosen": 0.6504024267196655,
383
+ "logits/rejected": 0.7802666425704956,
384
+ "logps/chosen": -466.16973876953125,
385
+ "logps/rejected": -511.08502197265625,
386
+ "loss": 0.0549,
387
+ "rewards/accuracies": 0.643750011920929,
388
+ "rewards/chosen": -1.191239595413208,
389
+ "rewards/margins": 0.7851654291152954,
390
+ "rewards/rejected": -1.976405143737793,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 1.637115696063402e-06,
396
+ "logits/chosen": 0.7357971668243408,
397
+ "logits/rejected": 0.8341084718704224,
398
+ "logps/chosen": -462.93048095703125,
399
+ "logps/rejected": -550.9013671875,
400
+ "loss": 0.0342,
401
+ "rewards/accuracies": 0.675000011920929,
402
+ "rewards/chosen": -1.577097773551941,
403
+ "rewards/margins": 0.7956889271736145,
404
+ "rewards/rejected": -2.3727867603302,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 1.5274599402265162e-06,
410
+ "logits/chosen": 0.7676488757133484,
411
+ "logits/rejected": 0.9279497861862183,
412
+ "logps/chosen": -490.0227966308594,
413
+ "logps/rejected": -543.2033081054688,
414
+ "loss": 0.0336,
415
+ "rewards/accuracies": 0.6625000238418579,
416
+ "rewards/chosen": -1.5475876331329346,
417
+ "rewards/margins": 0.6384353041648865,
418
+ "rewards/rejected": -2.186022996902466,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.57,
423
  "learning_rate": 1.4176569902035088e-06,
424
+ "logits/chosen": 0.7670334577560425,
425
+ "logits/rejected": 0.927658200263977,
426
+ "logps/chosen": -455.6305236816406,
427
+ "logps/rejected": -507.54913330078125,
428
+ "loss": 0.0334,
429
+ "rewards/accuracies": 0.675000011920929,
430
+ "rewards/chosen": -1.38298761844635,
431
+ "rewards/margins": 0.6534308195114136,
432
+ "rewards/rejected": -2.0364184379577637,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 1.308295425420593e-06,
438
+ "logits/chosen": 0.7235329151153564,
439
+ "logits/rejected": 0.8158149719238281,
440
+ "logps/chosen": -491.1328125,
441
+ "logps/rejected": -560.6801147460938,
442
+ "loss": 0.0301,
443
+ "rewards/accuracies": 0.699999988079071,
444
+ "rewards/chosen": -1.4047319889068604,
445
+ "rewards/margins": 0.7390089631080627,
446
+ "rewards/rejected": -2.1437408924102783,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 1.1999614593359337e-06,
452
+ "logits/chosen": 0.7884746789932251,
453
+ "logits/rejected": 1.0120609998703003,
454
+ "logps/chosen": -492.41693115234375,
455
+ "logps/rejected": -518.9060668945312,
456
+ "loss": 0.03,
457
+ "rewards/accuracies": 0.6625000238418579,
458
+ "rewards/chosen": -1.4595239162445068,
459
+ "rewards/margins": 0.7071082592010498,
460
+ "rewards/rejected": -2.1666321754455566,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.0932357971453745e-06,
466
+ "logits/chosen": 0.8025213479995728,
467
+ "logits/rejected": 0.9630680084228516,
468
+ "logps/chosen": -472.7798767089844,
469
+ "logps/rejected": -523.0516967773438,
470
+ "loss": 0.0313,
471
+ "rewards/accuracies": 0.6875,
472
+ "rewards/chosen": -1.4041074514389038,
473
+ "rewards/margins": 0.6285351514816284,
474
+ "rewards/rejected": -2.0326426029205322,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
+ "eval_logits/chosen": 0.9400739669799805,
480
+ "eval_logits/rejected": 1.0433921813964844,
481
+ "eval_logps/chosen": -454.1192932128906,
482
+ "eval_logps/rejected": -552.3697509765625,
483
+ "eval_loss": 0.03436482325196266,
484
  "eval_rewards/accuracies": 0.72265625,
485
+ "eval_rewards/chosen": -1.3028936386108398,
486
+ "eval_rewards/margins": 0.9194571375846863,
487
+ "eval_rewards/rejected": -2.222350835800171,
488
+ "eval_runtime": 75.6069,
489
+ "eval_samples_per_second": 26.453,
490
+ "eval_steps_per_second": 0.423,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 9.886905230142433e-07,
496
+ "logits/chosen": 0.7544746398925781,
497
+ "logits/rejected": 0.9142723083496094,
498
+ "logps/chosen": -462.0435485839844,
499
+ "logps/rejected": -525.331298828125,
500
+ "loss": 0.0346,
501
+ "rewards/accuracies": 0.6625000238418579,
502
+ "rewards/chosen": -1.3456170558929443,
503
+ "rewards/margins": 0.749636709690094,
504
+ "rewards/rejected": -2.0952537059783936,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 8.868860335206678e-07,
510
+ "logits/chosen": 0.9283370971679688,
511
+ "logits/rejected": 1.136993169784546,
512
+ "logps/chosen": -478.44976806640625,
513
+ "logps/rejected": -530.1534423828125,
514
+ "loss": 0.0338,
515
+ "rewards/accuracies": 0.7124999761581421,
516
+ "rewards/chosen": -1.244257926940918,
517
+ "rewards/margins": 0.6402724385261536,
518
+ "rewards/rejected": -1.8845303058624268,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 7.883680337481599e-07,
524
+ "logits/chosen": 0.7307278513908386,
525
+ "logits/rejected": 0.8725861310958862,
526
+ "logps/chosen": -448.43280029296875,
527
+ "logps/rejected": -533.6476440429688,
528
+ "loss": 0.0375,
529
+ "rewards/accuracies": 0.6812499761581421,
530
+ "rewards/chosen": -1.2363145351409912,
531
+ "rewards/margins": 0.7372487187385559,
532
+ "rewards/rejected": -1.9735629558563232,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 6.936646121293654e-07,
538
+ "logits/chosen": 0.5649510622024536,
539
+ "logits/rejected": 0.7639907598495483,
540
+ "logps/chosen": -466.2808532714844,
541
+ "logps/rejected": -526.1297607421875,
542
+ "loss": 0.0435,
543
  "rewards/accuracies": 0.706250011920929,
544
+ "rewards/chosen": -1.0562084913253784,
545
+ "rewards/margins": 0.7370297312736511,
546
+ "rewards/rejected": -1.7932384014129639,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 6.032834097207889e-07,
552
+ "logits/chosen": 0.7209309935569763,
553
+ "logits/rejected": 0.7828409671783447,
554
+ "logps/chosen": -401.2094421386719,
555
+ "logps/rejected": -480.31671142578125,
556
+ "loss": 0.0403,
557
+ "rewards/accuracies": 0.699999988079071,
558
+ "rewards/chosen": -1.098332405090332,
559
+ "rewards/margins": 0.6962517499923706,
560
+ "rewards/rejected": -1.7945845127105713,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 5.177088990820725e-07,
566
+ "logits/chosen": 0.6787894368171692,
567
+ "logits/rejected": 0.8372275233268738,
568
+ "logps/chosen": -446.56317138671875,
569
+ "logps/rejected": -465.1809997558594,
570
+ "loss": 0.0453,
571
+ "rewards/accuracies": 0.6625000238418579,
572
+ "rewards/chosen": -1.136115550994873,
573
+ "rewards/margins": 0.6250497698783875,
574
+ "rewards/rejected": -1.7611652612686157,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 4.3739978734594494e-07,
580
+ "logits/chosen": 0.6346519589424133,
581
+ "logits/rejected": 0.867949366569519,
582
+ "logps/chosen": -439.4676208496094,
583
+ "logps/rejected": -468.6329040527344,
584
+ "loss": 0.0364,
585
+ "rewards/accuracies": 0.75,
586
+ "rewards/chosen": -1.016570806503296,
587
+ "rewards/margins": 0.8048780560493469,
588
+ "rewards/rejected": -1.8214489221572876,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.8,
593
  "learning_rate": 3.627865573992087e-07,
594
+ "logits/chosen": 0.6531890630722046,
595
+ "logits/rejected": 0.6925245523452759,
596
+ "logps/chosen": -437.359375,
597
+ "logps/rejected": -492.814453125,
598
+ "loss": 0.0425,
599
+ "rewards/accuracies": 0.6812499761581421,
600
+ "rewards/chosen": -1.09610116481781,
601
+ "rewards/margins": 0.6472191214561462,
602
+ "rewards/rejected": -1.7433204650878906,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 2.9426916035484166e-07,
608
+ "logits/chosen": 0.4887206554412842,
609
+ "logits/rejected": 0.7168077230453491,
610
+ "logps/chosen": -490.0777893066406,
611
+ "logps/rejected": -530.9293212890625,
612
+ "loss": 0.038,
613
+ "rewards/accuracies": 0.7875000238418579,
614
+ "rewards/chosen": -1.2254283428192139,
615
+ "rewards/margins": 0.8675802946090698,
616
+ "rewards/rejected": -2.0930087566375732,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 2.322148716843081e-07,
622
+ "logits/chosen": 0.6055541038513184,
623
+ "logits/rejected": 0.687682032585144,
624
+ "logps/chosen": -429.68603515625,
625
+ "logps/rejected": -461.8595275878906,
626
+ "loss": 0.0359,
627
+ "rewards/accuracies": 0.675000011920929,
628
+ "rewards/chosen": -1.1895955801010132,
629
+ "rewards/margins": 0.5694113373756409,
630
+ "rewards/rejected": -1.7590070962905884,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
+ "eval_logits/chosen": 0.7290832996368408,
636
+ "eval_logits/rejected": 0.8196390867233276,
637
+ "eval_logps/chosen": -435.5875244140625,
638
+ "eval_logps/rejected": -531.2747192382812,
639
+ "eval_loss": 0.04154704138636589,
640
+ "eval_rewards/accuracies": 0.70703125,
641
+ "eval_rewards/chosen": -1.1175758838653564,
642
+ "eval_rewards/margins": 0.8938245177268982,
643
+ "eval_rewards/rejected": -2.0114002227783203,
644
+ "eval_runtime": 75.1852,
645
+ "eval_samples_per_second": 26.601,
646
+ "eval_steps_per_second": 0.426,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 1.7695632250191002e-07,
652
+ "logits/chosen": 0.5428584814071655,
653
+ "logits/rejected": 0.6822582483291626,
654
+ "logps/chosen": -435.78680419921875,
655
+ "logps/rejected": -452.6622009277344,
656
+ "loss": 0.0367,
657
  "rewards/accuracies": 0.6875,
658
+ "rewards/chosen": -1.1751288175582886,
659
+ "rewards/margins": 0.5176131129264832,
660
+ "rewards/rejected": -1.6927419900894165,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 1.2878971655412515e-07,
666
+ "logits/chosen": 0.5744162797927856,
667
+ "logits/rejected": 0.6994149088859558,
668
+ "logps/chosen": -474.30908203125,
669
+ "logps/rejected": -495.92852783203125,
670
+ "loss": 0.0394,
671
+ "rewards/accuracies": 0.637499988079071,
672
+ "rewards/chosen": -1.3165512084960938,
673
+ "rewards/margins": 0.6040414571762085,
674
+ "rewards/rejected": -1.9205926656723022,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 8.797324247145411e-08,
680
+ "logits/chosen": 0.6493648290634155,
681
+ "logits/rejected": 0.6758213043212891,
682
+ "logps/chosen": -426.60223388671875,
683
+ "logps/rejected": -521.1129150390625,
684
+ "loss": 0.0365,
685
+ "rewards/accuracies": 0.75,
686
+ "rewards/chosen": -1.1816965341567993,
687
+ "rewards/margins": 0.7749902009963989,
688
+ "rewards/rejected": -1.9566866159439087,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 5.472568979361853e-08,
694
+ "logits/chosen": 0.7012882232666016,
695
+ "logits/rejected": 0.7845873832702637,
696
+ "logps/chosen": -459.6414489746094,
697
+ "logps/rejected": -518.3292846679688,
698
+ "loss": 0.0412,
699
+ "rewards/accuracies": 0.643750011920929,
700
+ "rewards/chosen": -1.2379354238510132,
701
+ "rewards/margins": 0.7565950155258179,
702
+ "rewards/rejected": -1.994530439376831,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 2.922527618666465e-08,
708
+ "logits/chosen": 0.6378465294837952,
709
+ "logits/rejected": 0.8079195022583008,
710
+ "logps/chosen": -484.46197509765625,
711
+ "logps/rejected": -520.6287841796875,
712
+ "loss": 0.0404,
713
+ "rewards/accuracies": 0.6937500238418579,
714
+ "rewards/chosen": -1.2168313264846802,
715
+ "rewards/margins": 0.676922082901001,
716
+ "rewards/rejected": -1.8937534093856812,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 1.1608692138469379e-08,
722
+ "logits/chosen": 0.7224764227867126,
723
+ "logits/rejected": 0.8670576214790344,
724
+ "logps/chosen": -398.640869140625,
725
+ "logps/rejected": -444.4422912597656,
726
+ "loss": 0.039,
727
+ "rewards/accuracies": 0.675000011920929,
728
+ "rewards/chosen": -1.110528826713562,
729
+ "rewards/margins": 0.6203423738479614,
730
+ "rewards/rejected": -1.7308712005615234,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 1.970368253390198e-09,
736
+ "logits/chosen": 0.6133291125297546,
737
+ "logits/rejected": 0.744029700756073,
738
+ "logps/chosen": -413.31732177734375,
739
+ "logps/rejected": -497.829345703125,
740
+ "loss": 0.0407,
741
+ "rewards/accuracies": 0.699999988079071,
742
+ "rewards/chosen": -1.0763300657272339,
743
+ "rewards/margins": 0.8329319953918457,
744
+ "rewards/rejected": -1.9092620611190796,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 477,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.055112330793584664,
752
+ "train_runtime": 4571.3444,
753
+ "train_samples_per_second": 13.373,
754
+ "train_steps_per_second": 0.104
755
  }
756
  ],
757
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:849bf876d39c1ecc4d413b77ba74c1ef1c656105fcb277b563b6359ad5dfa298
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200542098b43881df0df6dc0ff3056ca0236db5763f486bb392f305292932d2f
3
  size 5944