wzhouad commited on
Commit
374f588
1 Parent(s): 64a8545

Model save

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.6704
19
- - Rewards/chosen: -0.2623
20
- - Rewards/rejected: -0.4157
21
- - Rewards/accuracies: 0.6172
22
- - Rewards/margins: 0.1534
23
- - Logps/rejected: -407.2814
24
- - Logps/chosen: -385.6814
25
- - Logits/rejected: 0.7859
26
- - Logits/chosen: 0.6443
27
 
28
  ## Model description
29
 
@@ -45,7 +45,7 @@ The following hyperparameters were used during training:
45
  - learning_rate: 5e-07
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
- - seed: 1
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
@@ -60,9 +60,10 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.6155 | 0.28 | 100 | 0.6849 | 0.0942 | 0.0473 | 0.5234 | 0.0469 | -360.9828 | -350.0315 | 0.5391 | 0.4541 |
64
- | 0.5661 | 0.56 | 200 | 0.6719 | -0.1694 | -0.2891 | 0.6055 | 0.1196 | -394.6170 | -376.3940 | 0.8087 | 0.6693 |
65
- | 0.5681 | 0.84 | 300 | 0.6704 | -0.2623 | -0.4157 | 0.6172 | 0.1534 | -407.2814 | -385.6814 | 0.7859 | 0.6443 |
 
66
 
67
 
68
  ### Framework versions
 
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0495
19
+ - Rewards/chosen: -0.5743
20
+ - Rewards/rejected: -1.1134
21
+ - Rewards/accuracies: 0.7344
22
+ - Rewards/margins: 0.5391
23
+ - Logps/rejected: -477.0538
24
+ - Logps/chosen: -416.8812
25
+ - Logits/rejected: 0.8329
26
+ - Logits/chosen: 0.7145
27
 
28
  ## Model description
29
 
 
45
  - learning_rate: 5e-07
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
+ - seed: 5
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
 
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.0975 | 0.21 | 100 | 0.0975 | -0.0605 | -0.2369 | 0.6914 | 0.1765 | -389.4015 | -365.4964 | 0.5340 | 0.4693 |
64
+ | 0.0589 | 0.42 | 200 | 0.0582 | -0.4455 | -0.8736 | 0.7148 | 0.4281 | -453.0718 | -404.0002 | 0.7808 | 0.6615 |
65
+ | 0.0465 | 0.63 | 300 | 0.0494 | -0.6054 | -1.1172 | 0.7031 | 0.5117 | -477.4249 | -419.9954 | 0.8961 | 0.7931 |
66
+ | 0.0419 | 0.84 | 400 | 0.0495 | -0.5743 | -1.1134 | 0.7344 | 0.5391 | -477.0538 | -416.8812 | 0.8329 | 0.7145 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.5989744803878698,
4
- "train_runtime": 3249.9516,
5
- "train_samples": 45548,
6
- "train_samples_per_second": 14.015,
7
- "train_steps_per_second": 0.11
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.06584663976538356,
4
+ "train_runtime": 4434.0315,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 13.787,
7
+ "train_steps_per_second": 0.108
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:863dc54b67f5e81f1f9d6bd0780fce9a033593530d5e8a615a12530d9e01f9d1
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:742a3a39155dfe2982b2079fe8048378854d72be66f8bb03992eab95c8d8613f
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e443f3c5d5cb1fc73cdd58c50675f30a54cc8fba5b6b5800b9cf3ed189c65f7
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9663e310a114c8e5dfe67123c8fa2e0b3f06238bc3727bd48b2fbf862d129e4
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f117388c044429b966b07170241feb74d6fce6b4a96e19453046378eb573f9ef
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be6bbc86492f411d627c4ebb8e6aeaff116a8962892dec9b9af59b92427b849f
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e94d35aef64c4dfe770fc21db27284361279aa63ef85154fa4b1e24ebc0c3ab
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841fb520274242fc5c5655fa5d9e40cd6d96f0bb2ae1af50364d0590d1160c1f
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.5989744803878698,
4
- "train_runtime": 3249.9516,
5
- "train_samples": 45548,
6
- "train_samples_per_second": 14.015,
7
- "train_steps_per_second": 0.11
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.06584663976538356,
4
+ "train_runtime": 4434.0315,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 13.787,
7
+ "train_steps_per_second": 0.108
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 356,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.3888888888888887e-08,
14
- "logits/chosen": -0.07916320115327835,
15
- "logits/rejected": 0.09423620253801346,
16
- "logps/chosen": -527.0689697265625,
17
- "logps/rejected": -183.19036865234375,
18
- "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,555 +23,739 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.03,
27
- "learning_rate": 1.3888888888888888e-07,
28
- "logits/chosen": 0.00488958740606904,
29
- "logits/rejected": 0.11317457258701324,
30
- "logps/chosen": -361.6508483886719,
31
- "logps/rejected": -210.14126586914062,
32
- "loss": 0.6933,
33
  "rewards/accuracies": 0.4583333432674408,
34
- "rewards/chosen": 1.533585600554943e-05,
35
- "rewards/margins": 0.000240087800193578,
36
- "rewards/rejected": -0.00022475191508419812,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.06,
41
- "learning_rate": 2.7777777777777776e-07,
42
- "logits/chosen": 0.026235083118081093,
43
- "logits/rejected": 0.12080521881580353,
44
- "logps/chosen": -340.08831787109375,
45
- "logps/rejected": -205.6613311767578,
46
- "loss": 0.6912,
47
- "rewards/accuracies": 0.5687500238418579,
48
- "rewards/chosen": 0.001607197686098516,
49
- "rewards/margins": 0.0044591957703232765,
50
- "rewards/rejected": -0.0028519982006400824,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.08,
55
- "learning_rate": 4.1666666666666667e-07,
56
- "logits/chosen": 0.08616660535335541,
57
- "logits/rejected": 0.18304046988487244,
58
- "logps/chosen": -364.1296691894531,
59
- "logps/rejected": -224.0422821044922,
60
- "loss": 0.6816,
61
- "rewards/accuracies": 0.606249988079071,
62
- "rewards/chosen": 0.012839061208069324,
63
- "rewards/margins": 0.027920549735426903,
64
- "rewards/rejected": -0.015081489458680153,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.11,
69
- "learning_rate": 4.998072590601808e-07,
70
- "logits/chosen": 0.026512805372476578,
71
- "logits/rejected": 0.10784071683883667,
72
- "logps/chosen": -326.8155822753906,
73
- "logps/rejected": -203.4437713623047,
74
- "loss": 0.668,
75
- "rewards/accuracies": 0.6312500238418579,
76
- "rewards/chosen": 0.006709781475365162,
77
- "rewards/margins": 0.05660278722643852,
78
- "rewards/rejected": -0.04989300295710564,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.14,
83
- "learning_rate": 4.976423351108942e-07,
84
- "logits/chosen": 0.013665281236171722,
85
- "logits/rejected": 0.13254693150520325,
86
- "logps/chosen": -336.5925598144531,
87
- "logps/rejected": -230.033203125,
88
- "loss": 0.6523,
89
- "rewards/accuracies": 0.59375,
90
- "rewards/chosen": -0.012301790527999401,
91
- "rewards/margins": 0.07358390092849731,
92
- "rewards/rejected": -0.08588568866252899,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.17,
97
- "learning_rate": 4.930924800994191e-07,
98
- "logits/chosen": -0.075148805975914,
99
- "logits/rejected": 0.06099820137023926,
100
- "logps/chosen": -382.0185852050781,
101
- "logps/rejected": -227.37222290039062,
102
- "loss": 0.6225,
103
- "rewards/accuracies": 0.6625000238418579,
104
- "rewards/chosen": -0.002184201730415225,
105
- "rewards/margins": 0.19139915704727173,
106
- "rewards/rejected": -0.19358336925506592,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.2,
111
- "learning_rate": 4.862015116167195e-07,
112
- "logits/chosen": -0.07402805984020233,
113
- "logits/rejected": 0.04597530514001846,
114
- "logps/chosen": -395.2985534667969,
115
- "logps/rejected": -225.6622772216797,
116
- "loss": 0.6239,
117
- "rewards/accuracies": 0.6499999761581421,
118
- "rewards/chosen": 0.0852896124124527,
119
- "rewards/margins": 0.2375856637954712,
120
- "rewards/rejected": -0.1522960662841797,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.22,
125
- "learning_rate": 4.770357934562704e-07,
126
- "logits/chosen": -0.08815683424472809,
127
- "logits/rejected": 0.0715162605047226,
128
- "logps/chosen": -335.470703125,
129
- "logps/rejected": -209.92050170898438,
130
- "loss": 0.6197,
131
- "rewards/accuracies": 0.6625000238418579,
132
- "rewards/chosen": 0.06459876894950867,
133
- "rewards/margins": 0.25264090299606323,
134
- "rewards/rejected": -0.18804213404655457,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.25,
139
- "learning_rate": 4.6568359649444796e-07,
140
- "logits/chosen": -0.06088203191757202,
141
- "logits/rejected": 0.012268425896763802,
142
- "logps/chosen": -375.87322998046875,
143
- "logps/rejected": -250.84396362304688,
144
- "loss": 0.6153,
145
- "rewards/accuracies": 0.6312500238418579,
146
- "rewards/chosen": 0.05378664657473564,
147
- "rewards/margins": 0.28628265857696533,
148
- "rewards/rejected": -0.23249602317810059,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.28,
153
- "learning_rate": 4.5225424859373684e-07,
154
- "logits/chosen": 0.02037966251373291,
155
- "logits/rejected": 0.23395180702209473,
156
- "logps/chosen": -363.3604736328125,
157
- "logps/rejected": -233.18899536132812,
158
- "loss": 0.6155,
159
- "rewards/accuracies": 0.7124999761581421,
160
- "rewards/chosen": 0.049579061567783356,
161
- "rewards/margins": 0.3324902653694153,
162
- "rewards/rejected": -0.2829112410545349,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.28,
167
- "eval_logits/chosen": 0.4540720582008362,
168
- "eval_logits/rejected": 0.5391180515289307,
169
- "eval_logps/chosen": -350.0314636230469,
170
- "eval_logps/rejected": -360.9827575683594,
171
- "eval_loss": 0.6848979592323303,
172
- "eval_rewards/accuracies": 0.5234375,
173
- "eval_rewards/chosen": 0.09419750422239304,
174
- "eval_rewards/margins": 0.04693090170621872,
175
- "eval_rewards/rejected": 0.04726658761501312,
176
- "eval_runtime": 65.4973,
177
- "eval_samples_per_second": 30.536,
178
- "eval_steps_per_second": 0.489,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.31,
183
- "learning_rate": 4.3687708171564917e-07,
184
- "logits/chosen": -0.010720082558691502,
185
- "logits/rejected": 0.19943444430828094,
186
- "logps/chosen": -320.583984375,
187
- "logps/rejected": -260.4593505859375,
188
- "loss": 0.618,
189
- "rewards/accuracies": 0.6812499761581421,
190
- "rewards/chosen": -0.04500458389520645,
191
- "rewards/margins": 0.1880386769771576,
192
- "rewards/rejected": -0.23304326832294464,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.34,
197
- "learning_rate": 4.1970018638323547e-07,
198
- "logits/chosen": 0.22007820010185242,
199
- "logits/rejected": 0.3844499886035919,
200
- "logps/chosen": -318.66680908203125,
201
- "logps/rejected": -220.34548950195312,
202
- "loss": 0.6024,
203
- "rewards/accuracies": 0.65625,
204
- "rewards/chosen": -0.026034215465188026,
205
- "rewards/margins": 0.22070157527923584,
206
- "rewards/rejected": -0.2467358112335205,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.37,
211
- "learning_rate": 4.0088898548839285e-07,
212
- "logits/chosen": 0.15705306828022003,
213
- "logits/rejected": 0.3788728713989258,
214
- "logps/chosen": -404.3878479003906,
215
- "logps/rejected": -239.31332397460938,
216
- "loss": 0.5953,
217
- "rewards/accuracies": 0.75,
218
- "rewards/chosen": 0.12396061420440674,
219
- "rewards/margins": 0.4470479488372803,
220
- "rewards/rejected": -0.3230873644351959,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.39,
225
- "learning_rate": 3.806246411789872e-07,
226
- "logits/chosen": 0.18306098878383636,
227
- "logits/rejected": 0.36407768726348877,
228
- "logps/chosen": -325.9332580566406,
229
- "logps/rejected": -234.54403686523438,
230
- "loss": 0.5921,
231
- "rewards/accuracies": 0.7124999761581421,
232
- "rewards/chosen": -0.0044938018545508385,
233
- "rewards/margins": 0.27833661437034607,
234
- "rewards/rejected": -0.2828304171562195,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.42,
239
- "learning_rate": 3.5910231016833546e-07,
240
- "logits/chosen": 0.23124487698078156,
241
- "logits/rejected": 0.4330722391605377,
242
- "logps/chosen": -350.52386474609375,
243
- "logps/rejected": -267.3382568359375,
244
- "loss": 0.5908,
245
- "rewards/accuracies": 0.675000011920929,
246
- "rewards/chosen": -0.06408815085887909,
247
- "rewards/margins": 0.2850314974784851,
248
- "rewards/rejected": -0.349119633436203,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.45,
253
- "learning_rate": 3.3652926426937325e-07,
254
- "logits/chosen": 0.2642674148082733,
255
- "logits/rejected": 0.5020841360092163,
256
- "logps/chosen": -367.66485595703125,
257
- "logps/rejected": -243.3483123779297,
258
- "loss": 0.5932,
259
- "rewards/accuracies": 0.7437499761581421,
260
- "rewards/chosen": 0.005097188055515289,
261
- "rewards/margins": 0.35703176259994507,
262
- "rewards/rejected": -0.351934552192688,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.48,
267
- "learning_rate": 3.1312289425378944e-07,
268
- "logits/chosen": 0.285301148891449,
269
- "logits/rejected": 0.4867871403694153,
270
- "logps/chosen": -334.489013671875,
271
- "logps/rejected": -243.4333953857422,
272
- "loss": 0.5806,
273
- "rewards/accuracies": 0.706250011920929,
274
- "rewards/chosen": -0.058128129690885544,
275
- "rewards/margins": 0.36435943841934204,
276
- "rewards/rejected": -0.4224874973297119,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.51,
281
- "learning_rate": 2.8910861626005773e-07,
282
- "logits/chosen": 0.16094639897346497,
283
- "logits/rejected": 0.48152345418930054,
284
- "logps/chosen": -354.7293395996094,
285
- "logps/rejected": -256.0998840332031,
286
- "loss": 0.5832,
287
- "rewards/accuracies": 0.706250011920929,
288
- "rewards/chosen": -0.12431593984365463,
289
- "rewards/margins": 0.33745259046554565,
290
- "rewards/rejected": -0.4617684781551361,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.53,
295
- "learning_rate": 2.647177009127972e-07,
296
- "logits/chosen": 0.24900703132152557,
297
- "logits/rejected": 0.4860251843929291,
298
- "logps/chosen": -368.28717041015625,
299
- "logps/rejected": -249.85092163085938,
300
- "loss": 0.5838,
301
- "rewards/accuracies": 0.6812499761581421,
302
- "rewards/chosen": -0.11349456012248993,
303
- "rewards/margins": 0.38067418336868286,
304
- "rewards/rejected": -0.494168758392334,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.56,
309
- "learning_rate": 2.401850460602329e-07,
310
- "logits/chosen": 0.16625070571899414,
311
- "logits/rejected": 0.4386712610721588,
312
- "logps/chosen": -375.9776916503906,
313
- "logps/rejected": -249.65512084960938,
314
- "loss": 0.5661,
315
- "rewards/accuracies": 0.737500011920929,
316
- "rewards/chosen": -0.04777635633945465,
317
- "rewards/margins": 0.4425368309020996,
318
- "rewards/rejected": -0.4903131425380707,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.56,
323
- "eval_logits/chosen": 0.6692676544189453,
324
- "eval_logits/rejected": 0.8087128400802612,
325
- "eval_logps/chosen": -376.39398193359375,
326
- "eval_logps/rejected": -394.6169738769531,
327
- "eval_loss": 0.6718646287918091,
328
- "eval_rewards/accuracies": 0.60546875,
329
- "eval_rewards/chosen": -0.1694278120994568,
330
- "eval_rewards/margins": 0.1196480467915535,
331
- "eval_rewards/rejected": -0.2890758514404297,
332
- "eval_runtime": 65.6783,
333
- "eval_samples_per_second": 30.451,
334
- "eval_steps_per_second": 0.487,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.59,
339
- "learning_rate": 2.1574691457950803e-07,
340
- "logits/chosen": 0.18152353167533875,
341
- "logits/rejected": 0.4297953248023987,
342
- "logps/chosen": -432.59490966796875,
343
- "logps/rejected": -252.2208251953125,
344
- "loss": 0.5731,
345
- "rewards/accuracies": 0.6812499761581421,
346
- "rewards/chosen": -0.0022571056615561247,
347
- "rewards/margins": 0.511215090751648,
348
- "rewards/rejected": -0.5134721994400024,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.62,
353
- "learning_rate": 1.9163865903602372e-07,
354
- "logits/chosen": 0.2716488242149353,
355
- "logits/rejected": 0.5078220963478088,
356
- "logps/chosen": -389.2140197753906,
357
- "logps/rejected": -263.9394226074219,
358
- "loss": 0.5783,
359
- "rewards/accuracies": 0.737500011920929,
360
- "rewards/chosen": -0.07382676750421524,
361
- "rewards/margins": 0.4518999457359314,
362
- "rewards/rejected": -0.5257267355918884,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.65,
367
- "learning_rate": 1.6809245510957666e-07,
368
- "logits/chosen": 0.16021332144737244,
369
- "logits/rejected": 0.4130098223686218,
370
- "logps/chosen": -357.9176940917969,
371
- "logps/rejected": -252.9857940673828,
372
- "loss": 0.5683,
373
- "rewards/accuracies": 0.668749988079071,
374
- "rewards/chosen": -0.07785089313983917,
375
- "rewards/margins": 0.3897276818752289,
376
- "rewards/rejected": -0.467578649520874,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.67,
381
- "learning_rate": 1.4533506561564305e-07,
382
- "logits/chosen": 0.06929950416088104,
383
- "logits/rejected": 0.36037522554397583,
384
- "logps/chosen": -385.23687744140625,
385
- "logps/rejected": -265.5404357910156,
386
- "loss": 0.5724,
387
- "rewards/accuracies": 0.731249988079071,
388
- "rewards/chosen": -0.07969608157873154,
389
- "rewards/margins": 0.4063330292701721,
390
- "rewards/rejected": -0.4860290586948395,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.7,
395
- "learning_rate": 1.2358565665550387e-07,
396
- "logits/chosen": 0.17380349338054657,
397
- "logits/rejected": 0.45210400223731995,
398
- "logps/chosen": -323.52716064453125,
399
- "logps/rejected": -249.1572723388672,
400
- "loss": 0.5722,
401
- "rewards/accuracies": 0.71875,
402
- "rewards/chosen": -0.15534546971321106,
403
- "rewards/margins": 0.4003133773803711,
404
- "rewards/rejected": -0.5556589365005493,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.73,
409
- "learning_rate": 1.0305368692688174e-07,
410
- "logits/chosen": 0.023118749260902405,
411
- "logits/rejected": 0.29933175444602966,
412
- "logps/chosen": -404.72479248046875,
413
- "logps/rejected": -276.384033203125,
414
- "loss": 0.5714,
415
- "rewards/accuracies": 0.7437499761581421,
416
- "rewards/chosen": -0.07409064471721649,
417
- "rewards/margins": 0.5126373767852783,
418
- "rewards/rejected": -0.5867279767990112,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.76,
423
- "learning_rate": 8.393689052217964e-08,
424
- "logits/chosen": 0.10952025651931763,
425
- "logits/rejected": 0.2917477488517761,
426
- "logps/chosen": -342.96630859375,
427
- "logps/rejected": -270.0740966796875,
428
- "loss": 0.5788,
429
  "rewards/accuracies": 0.65625,
430
- "rewards/chosen": -0.1741449385881424,
431
- "rewards/margins": 0.3325015604496002,
432
- "rewards/rejected": -0.5066465139389038,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.79,
437
- "learning_rate": 6.641937264107867e-08,
438
- "logits/chosen": 0.1401262879371643,
439
- "logits/rejected": 0.4076583981513977,
440
- "logps/chosen": -389.6405944824219,
441
- "logps/rejected": -279.58172607421875,
442
- "loss": 0.5697,
443
- "rewards/accuracies": 0.7250000238418579,
444
- "rewards/chosen": -0.1388498991727829,
445
- "rewards/margins": 0.4816059172153473,
446
- "rewards/rejected": -0.6204557418823242,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.81,
451
- "learning_rate": 5.066983655682325e-08,
452
- "logits/chosen": 0.17886988818645477,
453
- "logits/rejected": 0.4664178788661957,
454
- "logps/chosen": -402.42510986328125,
455
- "logps/rejected": -274.7767639160156,
456
- "loss": 0.5709,
457
- "rewards/accuracies": 0.699999988079071,
458
- "rewards/chosen": -0.0810159370303154,
459
- "rewards/margins": 0.44364994764328003,
460
- "rewards/rejected": -0.5246659517288208,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.84,
465
- "learning_rate": 3.683995891147695e-08,
466
- "logits/chosen": 0.12415604293346405,
467
- "logits/rejected": 0.3294488489627838,
468
- "logps/chosen": -402.2934875488281,
469
- "logps/rejected": -269.29876708984375,
470
- "loss": 0.5681,
471
- "rewards/accuracies": 0.737500011920929,
472
- "rewards/chosen": -0.05147252231836319,
473
- "rewards/margins": 0.512820839881897,
474
- "rewards/rejected": -0.564293384552002,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.84,
479
- "eval_logits/chosen": 0.6442692279815674,
480
- "eval_logits/rejected": 0.7858577370643616,
481
- "eval_logps/chosen": -385.6814270019531,
482
- "eval_logps/rejected": -407.28143310546875,
483
- "eval_loss": 0.6703880429267883,
484
- "eval_rewards/accuracies": 0.6171875,
485
- "eval_rewards/chosen": -0.2623019516468048,
486
- "eval_rewards/margins": 0.1534184217453003,
487
- "eval_rewards/rejected": -0.4157203435897827,
488
- "eval_runtime": 65.8277,
489
- "eval_samples_per_second": 30.382,
490
- "eval_steps_per_second": 0.486,
491
  "step": 300
492
  },
493
  {
494
- "epoch": 0.87,
495
- "learning_rate": 2.5062928986944676e-08,
496
- "logits/chosen": 0.1317346841096878,
497
- "logits/rejected": 0.36644047498703003,
498
- "logps/chosen": -340.4246520996094,
499
- "logps/rejected": -262.36859130859375,
500
- "loss": 0.5722,
501
- "rewards/accuracies": 0.7124999761581421,
502
- "rewards/chosen": -0.09537344425916672,
503
- "rewards/margins": 0.40499648451805115,
504
- "rewards/rejected": -0.5003699064254761,
505
  "step": 310
506
  },
507
  {
508
- "epoch": 0.9,
509
- "learning_rate": 1.5452166019378987e-08,
510
- "logits/chosen": 0.17706182599067688,
511
- "logits/rejected": 0.40400177240371704,
512
- "logps/chosen": -361.12261962890625,
513
- "logps/rejected": -260.80511474609375,
514
- "loss": 0.5623,
515
- "rewards/accuracies": 0.7250000238418579,
516
- "rewards/chosen": -0.15819527208805084,
517
- "rewards/margins": 0.40806493163108826,
518
- "rewards/rejected": -0.5662601590156555,
519
  "step": 320
520
  },
521
  {
522
- "epoch": 0.93,
523
- "learning_rate": 8.100226909935059e-09,
524
- "logits/chosen": 0.1275455504655838,
525
- "logits/rejected": 0.3778701424598694,
526
- "logps/chosen": -376.87091064453125,
527
- "logps/rejected": -271.15924072265625,
528
- "loss": 0.5582,
529
- "rewards/accuracies": 0.675000011920929,
530
- "rewards/chosen": -0.18845218420028687,
531
- "rewards/margins": 0.3703801929950714,
532
- "rewards/rejected": -0.5588323473930359,
533
  "step": 330
534
  },
535
  {
536
- "epoch": 0.96,
537
- "learning_rate": 3.077914851215585e-09,
538
- "logits/chosen": 0.17431296408176422,
539
- "logits/rejected": 0.4034757614135742,
540
- "logps/chosen": -351.8921813964844,
541
- "logps/rejected": -258.91180419921875,
542
- "loss": 0.5746,
543
- "rewards/accuracies": 0.7124999761581421,
544
- "rewards/chosen": -0.21883301436901093,
545
- "rewards/margins": 0.366277277469635,
546
- "rewards/rejected": -0.5851103067398071,
547
  "step": 340
548
  },
549
  {
550
- "epoch": 0.98,
551
- "learning_rate": 4.3359745382104405e-10,
552
- "logits/chosen": 0.1437760889530182,
553
- "logits/rejected": 0.3430730104446411,
554
- "logps/chosen": -391.3290710449219,
555
- "logps/rejected": -273.56195068359375,
556
- "loss": 0.5851,
557
- "rewards/accuracies": 0.731249988079071,
558
- "rewards/chosen": -0.08795999735593796,
559
- "rewards/margins": 0.46402493119239807,
560
- "rewards/rejected": -0.551984965801239,
561
  "step": 350
562
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  {
564
  "epoch": 1.0,
565
- "step": 356,
566
  "total_flos": 0.0,
567
- "train_loss": 0.5989744803878698,
568
- "train_runtime": 3249.9516,
569
- "train_samples_per_second": 14.015,
570
- "train_steps_per_second": 0.11
571
  }
572
  ],
573
  "logging_steps": 10,
574
- "max_steps": 356,
575
  "num_train_epochs": 1,
576
  "save_steps": 1000,
577
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9984301412872841,
5
  "eval_steps": 100,
6
+ "global_step": 477,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.0416666666666666e-08,
14
+ "logits/chosen": 0.12788674235343933,
15
+ "logits/rejected": 0.34812721610069275,
16
+ "logps/chosen": -504.64813232421875,
17
+ "logps/rejected": -353.6391906738281,
18
+ "loss": 0.1069,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.02,
27
+ "learning_rate": 1.0416666666666667e-07,
28
+ "logits/chosen": 0.22303083539009094,
29
+ "logits/rejected": 0.3398795425891876,
30
+ "logps/chosen": -343.9149475097656,
31
+ "logps/rejected": -345.42095947265625,
32
+ "loss": 0.1091,
33
  "rewards/accuracies": 0.4583333432674408,
34
+ "rewards/chosen": 0.0002915965160354972,
35
+ "rewards/margins": 0.0005722532514482737,
36
+ "rewards/rejected": -0.0002806567645166069,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.04,
41
+ "learning_rate": 2.0833333333333333e-07,
42
+ "logits/chosen": 0.2437092810869217,
43
+ "logits/rejected": 0.2768189311027527,
44
+ "logps/chosen": -342.15460205078125,
45
+ "logps/rejected": -352.68170166015625,
46
+ "loss": 0.1087,
47
+ "rewards/accuracies": 0.550000011920929,
48
+ "rewards/chosen": -0.0004921076470054686,
49
+ "rewards/margins": 0.00030653522117063403,
50
+ "rewards/rejected": -0.0007986428099684417,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": 0.2255886346101761,
57
+ "logits/rejected": 0.22949561476707458,
58
+ "logps/chosen": -403.088134765625,
59
+ "logps/rejected": -395.09552001953125,
60
+ "loss": 0.1112,
61
+ "rewards/accuracies": 0.612500011920929,
62
+ "rewards/chosen": -0.0018422408029437065,
63
+ "rewards/margins": 0.002462574513629079,
64
+ "rewards/rejected": -0.004304815083742142,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.08,
69
+ "learning_rate": 4.1666666666666667e-07,
70
+ "logits/chosen": 0.2738032341003418,
71
+ "logits/rejected": 0.32951346039772034,
72
+ "logps/chosen": -352.05938720703125,
73
+ "logps/rejected": -338.80743408203125,
74
+ "loss": 0.1093,
75
+ "rewards/accuracies": 0.581250011920929,
76
+ "rewards/chosen": -0.006794331129640341,
77
+ "rewards/margins": 0.003717987332493067,
78
+ "rewards/rejected": -0.010512317530810833,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.1,
83
+ "learning_rate": 4.999731868769026e-07,
84
+ "logits/chosen": 0.22654812037944794,
85
+ "logits/rejected": 0.31083282828330994,
86
+ "logps/chosen": -363.4710998535156,
87
+ "logps/rejected": -358.54168701171875,
88
+ "loss": 0.1041,
89
+ "rewards/accuracies": 0.65625,
90
+ "rewards/chosen": -0.005752457305788994,
91
+ "rewards/margins": 0.019924405962228775,
92
+ "rewards/rejected": -0.02567686140537262,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.13,
97
+ "learning_rate": 4.990353313429303e-07,
98
+ "logits/chosen": 0.3616481125354767,
99
+ "logits/rejected": 0.386046439409256,
100
+ "logps/chosen": -336.10211181640625,
101
+ "logps/rejected": -334.69024658203125,
102
+ "loss": 0.106,
103
+ "rewards/accuracies": 0.675000011920929,
104
+ "rewards/chosen": -0.01512543298304081,
105
+ "rewards/margins": 0.033848248422145844,
106
+ "rewards/rejected": -0.048973675817251205,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.15,
111
+ "learning_rate": 4.967625656594781e-07,
112
+ "logits/chosen": 0.23518328368663788,
113
+ "logits/rejected": 0.3344312310218811,
114
+ "logps/chosen": -350.3984680175781,
115
+ "logps/rejected": -307.37957763671875,
116
+ "loss": 0.1049,
117
+ "rewards/accuracies": 0.6625000238418579,
118
+ "rewards/chosen": -0.003296907991170883,
119
+ "rewards/margins": 0.052046000957489014,
120
+ "rewards/rejected": -0.0553429052233696,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.17,
125
+ "learning_rate": 4.93167072587771e-07,
126
+ "logits/chosen": 0.32164302468299866,
127
+ "logits/rejected": 0.3959673047065735,
128
+ "logps/chosen": -379.69647216796875,
129
+ "logps/rejected": -327.6635437011719,
130
+ "loss": 0.1132,
131
+ "rewards/accuracies": 0.6937500238418579,
132
+ "rewards/chosen": -0.004512617830187082,
133
+ "rewards/margins": 0.07668532431125641,
134
+ "rewards/rejected": -0.08119793236255646,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.19,
139
+ "learning_rate": 4.882681251368548e-07,
140
+ "logits/chosen": 0.31702089309692383,
141
+ "logits/rejected": 0.4289167821407318,
142
+ "logps/chosen": -394.7347717285156,
143
+ "logps/rejected": -366.826171875,
144
+ "loss": 0.1025,
145
+ "rewards/accuracies": 0.71875,
146
+ "rewards/chosen": -0.009242130443453789,
147
+ "rewards/margins": 0.12692956626415253,
148
+ "rewards/rejected": -0.13617169857025146,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.21,
153
+ "learning_rate": 4.820919832540181e-07,
154
+ "logits/chosen": 0.3820047080516815,
155
+ "logits/rejected": 0.4675898551940918,
156
+ "logps/chosen": -372.18115234375,
157
+ "logps/rejected": -365.79522705078125,
158
+ "loss": 0.0975,
159
+ "rewards/accuracies": 0.5874999761581421,
160
+ "rewards/chosen": -0.08189485222101212,
161
+ "rewards/margins": 0.11343145370483398,
162
+ "rewards/rejected": -0.1953262984752655,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.21,
167
+ "eval_logits/chosen": 0.4692724049091339,
168
+ "eval_logits/rejected": 0.533983588218689,
169
+ "eval_logps/chosen": -365.49639892578125,
170
+ "eval_logps/rejected": -389.4014587402344,
171
+ "eval_loss": 0.09751056134700775,
172
+ "eval_rewards/accuracies": 0.69140625,
173
+ "eval_rewards/chosen": -0.060451939702034,
174
+ "eval_rewards/margins": 0.17646832764148712,
175
+ "eval_rewards/rejected": -0.23692026734352112,
176
+ "eval_runtime": 76.9794,
177
+ "eval_samples_per_second": 25.981,
178
+ "eval_steps_per_second": 0.416,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.23,
183
+ "learning_rate": 4.7467175306295647e-07,
184
+ "logits/chosen": 0.37000179290771484,
185
+ "logits/rejected": 0.43369150161743164,
186
+ "logps/chosen": -378.1351318359375,
187
+ "logps/rejected": -378.1277770996094,
188
+ "loss": 0.0933,
189
+ "rewards/accuracies": 0.6187499761581421,
190
+ "rewards/chosen": -0.14543434977531433,
191
+ "rewards/margins": 0.1312834918498993,
192
+ "rewards/rejected": -0.2767178416252136,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.25,
197
+ "learning_rate": 4.6604720940421207e-07,
198
+ "logits/chosen": 0.4519842565059662,
199
+ "logits/rejected": 0.5497914552688599,
200
+ "logps/chosen": -408.4247131347656,
201
+ "logps/rejected": -414.9881286621094,
202
+ "loss": 0.0929,
203
+ "rewards/accuracies": 0.6625000238418579,
204
+ "rewards/chosen": -0.19376961886882782,
205
+ "rewards/margins": 0.1563883125782013,
206
+ "rewards/rejected": -0.3501579165458679,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.27,
211
+ "learning_rate": 4.5626458262912735e-07,
212
+ "logits/chosen": 0.5827921628952026,
213
+ "logits/rejected": 0.6809111833572388,
214
+ "logps/chosen": -420.0984802246094,
215
+ "logps/rejected": -399.5935363769531,
216
+ "loss": 0.0789,
217
+ "rewards/accuracies": 0.6000000238418579,
218
+ "rewards/chosen": -0.28970545530319214,
219
+ "rewards/margins": 0.15940071642398834,
220
+ "rewards/rejected": -0.4491061270236969,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.29,
225
+ "learning_rate": 4.453763107901675e-07,
226
+ "logits/chosen": 0.6244224309921265,
227
+ "logits/rejected": 0.746228814125061,
228
+ "logps/chosen": -396.53076171875,
229
+ "logps/rejected": -390.9623718261719,
230
+ "loss": 0.0717,
231
+ "rewards/accuracies": 0.6625000238418579,
232
+ "rewards/chosen": -0.3686402440071106,
233
+ "rewards/margins": 0.18962158262729645,
234
+ "rewards/rejected": -0.5582617521286011,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.31,
239
+ "learning_rate": 4.3344075855595097e-07,
240
+ "logits/chosen": 0.6669297218322754,
241
+ "logits/rejected": 0.8208922147750854,
242
+ "logps/chosen": -387.5301818847656,
243
+ "logps/rejected": -378.3419189453125,
244
+ "loss": 0.0647,
245
+ "rewards/accuracies": 0.6000000238418579,
246
+ "rewards/chosen": -0.41620713472366333,
247
+ "rewards/margins": 0.1934729665517807,
248
+ "rewards/rejected": -0.6096801161766052,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.33,
253
+ "learning_rate": 4.2052190435769554e-07,
254
+ "logits/chosen": 0.6333284974098206,
255
+ "logits/rejected": 0.7795067429542542,
256
+ "logps/chosen": -428.93841552734375,
257
+ "logps/rejected": -450.5494079589844,
258
+ "loss": 0.0619,
259
+ "rewards/accuracies": 0.6875,
260
+ "rewards/chosen": -0.4103819727897644,
261
+ "rewards/margins": 0.2781962454319,
262
+ "rewards/rejected": -0.6885782480239868,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.36,
267
+ "learning_rate": 4.0668899744407567e-07,
268
+ "logits/chosen": 0.6851844787597656,
269
+ "logits/rejected": 0.8698636889457703,
270
+ "logps/chosen": -394.453369140625,
271
+ "logps/rejected": -400.83892822265625,
272
+ "loss": 0.0613,
273
+ "rewards/accuracies": 0.612500011920929,
274
+ "rewards/chosen": -0.49455365538597107,
275
+ "rewards/margins": 0.24642686545848846,
276
+ "rewards/rejected": -0.7409806251525879,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.38,
281
+ "learning_rate": 3.920161866827889e-07,
282
+ "logits/chosen": 0.579459011554718,
283
+ "logits/rejected": 0.6854727864265442,
284
+ "logps/chosen": -381.6180419921875,
285
+ "logps/rejected": -419.34869384765625,
286
+ "loss": 0.0616,
287
+ "rewards/accuracies": 0.625,
288
+ "rewards/chosen": -0.476001501083374,
289
+ "rewards/margins": 0.2683504521846771,
290
+ "rewards/rejected": -0.7443519830703735,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.4,
295
+ "learning_rate": 3.765821230985757e-07,
296
+ "logits/chosen": 0.5569711923599243,
297
+ "logits/rejected": 0.6708570718765259,
298
+ "logps/chosen": -383.0780334472656,
299
+ "logps/rejected": -407.76837158203125,
300
+ "loss": 0.0592,
301
+ "rewards/accuracies": 0.637499988079071,
302
+ "rewards/chosen": -0.4089416563510895,
303
+ "rewards/margins": 0.28474992513656616,
304
+ "rewards/rejected": -0.693691611289978,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.42,
309
+ "learning_rate": 3.604695382782159e-07,
310
+ "logits/chosen": 0.49640387296676636,
311
+ "logits/rejected": 0.604566216468811,
312
+ "logps/chosen": -433.7373046875,
313
+ "logps/rejected": -452.308837890625,
314
+ "loss": 0.0589,
315
+ "rewards/accuracies": 0.668749988079071,
316
+ "rewards/chosen": -0.47900503873825073,
317
+ "rewards/margins": 0.30649885535240173,
318
+ "rewards/rejected": -0.7855038046836853,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.42,
323
+ "eval_logits/chosen": 0.6615116596221924,
324
+ "eval_logits/rejected": 0.7807996273040771,
325
+ "eval_logps/chosen": -404.0002136230469,
326
+ "eval_logps/rejected": -453.07177734375,
327
+ "eval_loss": 0.05819432809948921,
328
+ "eval_rewards/accuracies": 0.71484375,
329
+ "eval_rewards/chosen": -0.4454895853996277,
330
+ "eval_rewards/margins": 0.42813408374786377,
331
+ "eval_rewards/rejected": -0.8736236691474915,
332
+ "eval_runtime": 75.0575,
333
+ "eval_samples_per_second": 26.646,
334
+ "eval_steps_per_second": 0.426,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.44,
339
+ "learning_rate": 3.4376480090239047e-07,
340
+ "logits/chosen": 0.5758289098739624,
341
+ "logits/rejected": 0.6775172352790833,
342
+ "logps/chosen": -441.56683349609375,
343
+ "logps/rejected": -425.92437744140625,
344
+ "loss": 0.0567,
345
+ "rewards/accuracies": 0.668749988079071,
346
+ "rewards/chosen": -0.562717080116272,
347
+ "rewards/margins": 0.29301005601882935,
348
+ "rewards/rejected": -0.8557270765304565,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.46,
353
+ "learning_rate": 3.265574537815398e-07,
354
+ "logits/chosen": 0.423481285572052,
355
+ "logits/rejected": 0.6732310056686401,
356
+ "logps/chosen": -423.9397888183594,
357
+ "logps/rejected": -425.78045654296875,
358
+ "loss": 0.0577,
359
+ "rewards/accuracies": 0.7437499761581421,
360
+ "rewards/chosen": -0.43129101395606995,
361
+ "rewards/margins": 0.49137812852859497,
362
+ "rewards/rejected": -0.9226692318916321,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.48,
367
+ "learning_rate": 3.0893973387735683e-07,
368
+ "logits/chosen": 0.46089068055152893,
369
+ "logits/rejected": 0.6886599659919739,
370
+ "logps/chosen": -458.5089416503906,
371
+ "logps/rejected": -429.6102600097656,
372
+ "loss": 0.058,
373
+ "rewards/accuracies": 0.706250011920929,
374
+ "rewards/chosen": -0.49578744173049927,
375
+ "rewards/margins": 0.4050619602203369,
376
+ "rewards/rejected": -0.900849461555481,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.5,
381
+ "learning_rate": 2.910060778827554e-07,
382
+ "logits/chosen": 0.581864595413208,
383
+ "logits/rejected": 0.7646275758743286,
384
+ "logps/chosen": -428.42803955078125,
385
+ "logps/rejected": -440.18597412109375,
386
+ "loss": 0.0611,
387
+ "rewards/accuracies": 0.6812499761581421,
388
+ "rewards/chosen": -0.45797547698020935,
389
+ "rewards/margins": 0.4529312551021576,
390
+ "rewards/rejected": -0.9109067916870117,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.52,
395
+ "learning_rate": 2.7285261601056697e-07,
396
+ "logits/chosen": 0.5814759135246277,
397
+ "logits/rejected": 0.7270434498786926,
398
+ "logps/chosen": -398.45135498046875,
399
+ "logps/rejected": -447.3760681152344,
400
+ "loss": 0.0551,
401
+ "rewards/accuracies": 0.699999988079071,
402
+ "rewards/chosen": -0.5682977437973022,
403
+ "rewards/margins": 0.40714630484580994,
404
+ "rewards/rejected": -0.9754441380500793,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.54,
409
+ "learning_rate": 2.5457665670441937e-07,
410
+ "logits/chosen": 0.540181040763855,
411
+ "logits/rejected": 0.705514669418335,
412
+ "logps/chosen": -430.0947265625,
413
+ "logps/rejected": -455.96466064453125,
414
+ "loss": 0.0543,
415
+ "rewards/accuracies": 0.6937500238418579,
416
+ "rewards/chosen": -0.5835620164871216,
417
+ "rewards/margins": 0.3612635135650635,
418
+ "rewards/rejected": -0.9448255300521851,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.57,
423
+ "learning_rate": 2.3627616503391812e-07,
424
+ "logits/chosen": 0.531669020652771,
425
+ "logits/rejected": 0.6921663880348206,
426
+ "logps/chosen": -411.39947509765625,
427
+ "logps/rejected": -438.072265625,
428
+ "loss": 0.0529,
429
  "rewards/accuracies": 0.65625,
430
+ "rewards/chosen": -0.581498384475708,
431
+ "rewards/margins": 0.41039901971817017,
432
+ "rewards/rejected": -0.9918974041938782,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.59,
437
+ "learning_rate": 2.1804923757009882e-07,
438
+ "logits/chosen": 0.5589742064476013,
439
+ "logits/rejected": 0.6747141480445862,
440
+ "logps/chosen": -441.4170837402344,
441
+ "logps/rejected": -477.62310791015625,
442
+ "loss": 0.0506,
443
+ "rewards/accuracies": 0.668749988079071,
444
+ "rewards/chosen": -0.57341468334198,
445
+ "rewards/margins": 0.4166173040866852,
446
+ "rewards/rejected": -0.990031898021698,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.61,
451
+ "learning_rate": 1.9999357655598891e-07,
452
+ "logits/chosen": 0.6409920454025269,
453
+ "logits/rejected": 0.8697878122329712,
454
+ "logps/chosen": -446.1312561035156,
455
+ "logps/rejected": -445.7093811035156,
456
+ "loss": 0.048,
457
+ "rewards/accuracies": 0.625,
458
+ "rewards/chosen": -0.6841451525688171,
459
+ "rewards/margins": 0.47665899991989136,
460
+ "rewards/rejected": -1.1608041524887085,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.63,
465
+ "learning_rate": 1.8220596619089573e-07,
466
+ "logits/chosen": 0.67746901512146,
467
+ "logits/rejected": 0.8398680686950684,
468
+ "logps/chosen": -440.12237548828125,
469
+ "logps/rejected": -460.32086181640625,
470
+ "loss": 0.0465,
471
+ "rewards/accuracies": 0.637499988079071,
472
+ "rewards/chosen": -0.7187305688858032,
473
+ "rewards/margins": 0.3237985372543335,
474
+ "rewards/rejected": -1.0425291061401367,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.63,
479
+ "eval_logits/chosen": 0.7931328415870667,
480
+ "eval_logits/rejected": 0.8960775136947632,
481
+ "eval_logps/chosen": -419.99542236328125,
482
+ "eval_logps/rejected": -477.4249267578125,
483
+ "eval_loss": 0.04939539358019829,
484
+ "eval_rewards/accuracies": 0.703125,
485
+ "eval_rewards/chosen": -0.6054419279098511,
486
+ "eval_rewards/margins": 0.511713445186615,
487
+ "eval_rewards/rejected": -1.1171554327011108,
488
+ "eval_runtime": 75.2617,
489
+ "eval_samples_per_second": 26.574,
490
+ "eval_steps_per_second": 0.425,
491
  "step": 300
492
  },
493
  {
494
+ "epoch": 0.65,
495
+ "learning_rate": 1.647817538357072e-07,
496
+ "logits/chosen": 0.6320704817771912,
497
+ "logits/rejected": 0.8103192448616028,
498
+ "logps/chosen": -424.61865234375,
499
+ "logps/rejected": -452.2117614746094,
500
+ "loss": 0.0484,
501
+ "rewards/accuracies": 0.606249988079071,
502
+ "rewards/chosen": -0.608985185623169,
503
+ "rewards/margins": 0.3958033323287964,
504
+ "rewards/rejected": -1.0047886371612549,
505
  "step": 310
506
  },
507
  {
508
+ "epoch": 0.67,
509
+ "learning_rate": 1.478143389201113e-07,
510
+ "logits/chosen": 0.7435864806175232,
511
+ "logits/rejected": 0.9429095983505249,
512
+ "logps/chosen": -452.36004638671875,
513
+ "logps/rejected": -481.8624572753906,
514
+ "loss": 0.0448,
515
+ "rewards/accuracies": 0.731249988079071,
516
+ "rewards/chosen": -0.628174901008606,
517
+ "rewards/margins": 0.41646808385849,
518
+ "rewards/rejected": -1.0446430444717407,
519
  "step": 320
520
  },
521
  {
522
+ "epoch": 0.69,
523
+ "learning_rate": 1.3139467229135998e-07,
524
+ "logits/chosen": 0.6155081987380981,
525
+ "logits/rejected": 0.7582153081893921,
526
+ "logps/chosen": -426.2732849121094,
527
+ "logps/rejected": -476.5437927246094,
528
+ "loss": 0.0473,
529
+ "rewards/accuracies": 0.668749988079071,
530
+ "rewards/chosen": -0.6611535549163818,
531
+ "rewards/margins": 0.38004034757614136,
532
+ "rewards/rejected": -1.041193962097168,
533
  "step": 330
534
  },
535
  {
536
+ "epoch": 0.71,
537
+ "learning_rate": 1.1561076868822755e-07,
538
+ "logits/chosen": 0.5263934135437012,
539
+ "logits/rejected": 0.7371311783790588,
540
+ "logps/chosen": -459.9794006347656,
541
+ "logps/rejected": -492.32977294921875,
542
+ "loss": 0.0484,
543
+ "rewards/accuracies": 0.6875,
544
+ "rewards/chosen": -0.6238055229187012,
545
+ "rewards/margins": 0.46394386887550354,
546
+ "rewards/rejected": -1.0877494812011719,
547
  "step": 340
548
  },
549
  {
550
+ "epoch": 0.73,
551
+ "learning_rate": 1.0054723495346482e-07,
552
+ "logits/chosen": 0.6952361464500427,
553
+ "logits/rejected": 0.7730409502983093,
554
+ "logps/chosen": -392.53411865234375,
555
+ "logps/rejected": -445.5184631347656,
556
+ "loss": 0.0467,
557
+ "rewards/accuracies": 0.6812499761581421,
558
+ "rewards/chosen": -0.6540313959121704,
559
+ "rewards/margins": 0.43615293502807617,
560
+ "rewards/rejected": -1.0901843309402466,
561
  "step": 350
562
  },
563
+ {
564
+ "epoch": 0.75,
565
+ "learning_rate": 8.628481651367875e-08,
566
+ "logits/chosen": 0.645788311958313,
567
+ "logits/rejected": 0.8300139307975769,
568
+ "logps/chosen": -437.50830078125,
569
+ "logps/rejected": -429.37890625,
570
+ "loss": 0.0524,
571
+ "rewards/accuracies": 0.6499999761581421,
572
+ "rewards/chosen": -0.6759519577026367,
573
+ "rewards/margins": 0.35926973819732666,
574
+ "rewards/rejected": -1.0352216958999634,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.77,
579
+ "learning_rate": 7.289996455765748e-08,
580
+ "logits/chosen": 0.6347015500068665,
581
+ "logits/rejected": 0.8841344714164734,
582
+ "logps/chosen": -434.65313720703125,
583
+ "logps/rejected": -425.39825439453125,
584
+ "loss": 0.0428,
585
+ "rewards/accuracies": 0.7124999761581421,
586
+ "rewards/chosen": -0.5998077988624573,
587
+ "rewards/margins": 0.45034995675086975,
588
+ "rewards/rejected": -1.0501576662063599,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.8,
593
+ "learning_rate": 6.046442623320145e-08,
594
+ "logits/chosen": 0.653687596321106,
595
+ "logits/rejected": 0.7189717292785645,
596
+ "logps/chosen": -426.47674560546875,
597
+ "logps/rejected": -455.5611267089844,
598
+ "loss": 0.0501,
599
+ "rewards/accuracies": 0.6187499761581421,
600
+ "rewards/chosen": -0.6170892119407654,
601
+ "rewards/margins": 0.3933621644973755,
602
+ "rewards/rejected": -1.010451316833496,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.82,
607
+ "learning_rate": 4.904486005914027e-08,
608
+ "logits/chosen": 0.5192676186561584,
609
+ "logits/rejected": 0.7548397183418274,
610
+ "logps/chosen": -476.45904541015625,
611
+ "logps/rejected": -474.6182556152344,
612
+ "loss": 0.0451,
613
+ "rewards/accuracies": 0.737500011920929,
614
+ "rewards/chosen": -0.718641459941864,
615
+ "rewards/margins": 0.44875186681747437,
616
+ "rewards/rejected": -1.167393445968628,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.84,
621
+ "learning_rate": 3.8702478614051345e-08,
622
+ "logits/chosen": 0.6225503087043762,
623
+ "logits/rejected": 0.731469988822937,
624
+ "logps/chosen": -407.16912841796875,
625
+ "logps/rejected": -418.2110290527344,
626
+ "loss": 0.0419,
627
+ "rewards/accuracies": 0.6499999761581421,
628
+ "rewards/chosen": -0.6035235524177551,
629
+ "rewards/margins": 0.3522457182407379,
630
+ "rewards/rejected": -0.9557692408561707,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.84,
635
+ "eval_logits/chosen": 0.7144887447357178,
636
+ "eval_logits/rejected": 0.832917332649231,
637
+ "eval_logps/chosen": -416.8811950683594,
638
+ "eval_logps/rejected": -477.0538330078125,
639
+ "eval_loss": 0.049533091485500336,
640
+ "eval_rewards/accuracies": 0.734375,
641
+ "eval_rewards/chosen": -0.5742998123168945,
642
+ "eval_rewards/margins": 0.5391446352005005,
643
+ "eval_rewards/rejected": -1.1134445667266846,
644
+ "eval_runtime": 76.9908,
645
+ "eval_samples_per_second": 25.977,
646
+ "eval_steps_per_second": 0.416,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.86,
651
+ "learning_rate": 2.9492720416985e-08,
652
+ "logits/chosen": 0.5682260990142822,
653
+ "logits/rejected": 0.7126413583755493,
654
+ "logps/chosen": -419.2569885253906,
655
+ "logps/rejected": -410.59014892578125,
656
+ "loss": 0.0473,
657
+ "rewards/accuracies": 0.637499988079071,
658
+ "rewards/chosen": -0.652999222278595,
659
+ "rewards/margins": 0.26512840390205383,
660
+ "rewards/rejected": -0.9181275367736816,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.88,
665
+ "learning_rate": 2.1464952759020856e-08,
666
+ "logits/chosen": 0.6080732345581055,
667
+ "logits/rejected": 0.7386394739151001,
668
+ "logps/chosen": -452.77789306640625,
669
+ "logps/rejected": -437.8445739746094,
670
+ "loss": 0.0469,
671
+ "rewards/accuracies": 0.65625,
672
+ "rewards/chosen": -0.7366248369216919,
673
+ "rewards/margins": 0.2541760802268982,
674
+ "rewards/rejected": -0.9908009767532349,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.9,
679
+ "learning_rate": 1.4662207078575684e-08,
680
+ "logits/chosen": 0.6554642915725708,
681
+ "logits/rejected": 0.7158025503158569,
682
+ "logps/chosen": -407.20953369140625,
683
+ "logps/rejected": -471.7041015625,
684
+ "loss": 0.0453,
685
+ "rewards/accuracies": 0.762499988079071,
686
+ "rewards/chosen": -0.6344213485717773,
687
+ "rewards/margins": 0.481538861989975,
688
+ "rewards/rejected": -1.1159603595733643,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.92,
693
+ "learning_rate": 9.12094829893642e-09,
694
+ "logits/chosen": 0.7153126001358032,
695
+ "logits/rejected": 0.7965753078460693,
696
+ "logps/chosen": -441.6089782714844,
697
+ "logps/rejected": -466.2574768066406,
698
+ "loss": 0.0487,
699
+ "rewards/accuracies": 0.612500011920929,
700
+ "rewards/chosen": -0.6991580724716187,
701
+ "rewards/margins": 0.4110774099826813,
702
+ "rewards/rejected": -1.1102354526519775,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.94,
707
+ "learning_rate": 4.8708793644441086e-09,
708
+ "logits/chosen": 0.6587673425674438,
709
+ "logits/rejected": 0.830274760723114,
710
+ "logps/chosen": -465.8287658691406,
711
+ "logps/rejected": -467.7762145996094,
712
+ "loss": 0.0461,
713
+ "rewards/accuracies": 0.6937500238418579,
714
+ "rewards/chosen": -0.6734641194343567,
715
+ "rewards/margins": 0.3525208532810211,
716
+ "rewards/rejected": -1.0259850025177002,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.96,
721
+ "learning_rate": 1.9347820230782295e-09,
722
+ "logits/chosen": 0.7173280715942383,
723
+ "logits/rejected": 0.8633974194526672,
724
+ "logps/chosen": -380.0779724121094,
725
+ "logps/rejected": -403.060302734375,
726
+ "loss": 0.0456,
727
+ "rewards/accuracies": 0.6875,
728
+ "rewards/chosen": -0.5644342303276062,
729
+ "rewards/margins": 0.3927594721317291,
730
+ "rewards/rejected": -0.9571938514709473,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.98,
735
+ "learning_rate": 3.2839470889836627e-10,
736
+ "logits/chosen": 0.6316866278648376,
737
+ "logits/rejected": 0.7777234315872192,
738
+ "logps/chosen": -403.5115661621094,
739
+ "logps/rejected": -452.41864013671875,
740
+ "loss": 0.0474,
741
+ "rewards/accuracies": 0.675000011920929,
742
+ "rewards/chosen": -0.6106697916984558,
743
+ "rewards/margins": 0.4684695303440094,
744
+ "rewards/rejected": -1.0791394710540771,
745
+ "step": 470
746
+ },
747
  {
748
  "epoch": 1.0,
749
+ "step": 477,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.06584663976538356,
752
+ "train_runtime": 4434.0315,
753
+ "train_samples_per_second": 13.787,
754
+ "train_steps_per_second": 0.108
755
  }
756
  ],
757
  "logging_steps": 10,
758
+ "max_steps": 477,
759
  "num_train_epochs": 1,
760
  "save_steps": 1000,
761
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:051c9e8ac9d43571a852867a53d4bf06c0d5fdcb8099e85e6dc4457824c35c1f
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:244710b622fa4597e251d9d5432f6e641819c004ec5cdd6bd2c0a68718e30f4c
3
  size 5944