wzhouad commited on
Commit
0d93309
1 Parent(s): 374f588

Model save

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0495
19
- - Rewards/chosen: -0.5743
20
- - Rewards/rejected: -1.1134
21
- - Rewards/accuracies: 0.7344
22
- - Rewards/margins: 0.5391
23
- - Logps/rejected: -477.0538
24
- - Logps/chosen: -416.8812
25
- - Logits/rejected: 0.8329
26
- - Logits/chosen: 0.7145
27
 
28
  ## Model description
29
 
@@ -45,7 +45,7 @@ The following hyperparameters were used during training:
45
  - learning_rate: 5e-07
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
- - seed: 5
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
@@ -60,10 +60,9 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.0975 | 0.21 | 100 | 0.0975 | -0.0605 | -0.2369 | 0.6914 | 0.1765 | -389.4015 | -365.4964 | 0.5340 | 0.4693 |
64
- | 0.0589 | 0.42 | 200 | 0.0582 | -0.4455 | -0.8736 | 0.7148 | 0.4281 | -453.0718 | -404.0002 | 0.7808 | 0.6615 |
65
- | 0.0465 | 0.63 | 300 | 0.0494 | -0.6054 | -1.1172 | 0.7031 | 0.5117 | -477.4249 | -419.9954 | 0.8961 | 0.7931 |
66
- | 0.0419 | 0.84 | 400 | 0.0495 | -0.5743 | -1.1134 | 0.7344 | 0.5391 | -477.0538 | -416.8812 | 0.8329 | 0.7145 |
67
 
68
 
69
  ### Framework versions
 
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.1249
19
+ - Rewards/chosen: -0.2075
20
+ - Rewards/rejected: -0.2728
21
+ - Rewards/accuracies: 0.5391
22
+ - Rewards/margins: 0.0653
23
+ - Logps/rejected: -392.9848
24
+ - Logps/chosen: -380.2012
25
+ - Logits/rejected: 0.7227
26
+ - Logits/chosen: 0.6386
27
 
28
  ## Model description
29
 
 
45
  - learning_rate: 5e-07
46
  - train_batch_size: 4
47
  - eval_batch_size: 8
48
+ - seed: 1
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
  - gradient_accumulation_steps: 4
 
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.2868 | 0.28 | 100 | 0.1333 | 0.1523 | 0.1165 | 0.5430 | 0.0358 | -354.0580 | -344.2188 | 0.3183 | 0.2622 |
64
+ | 0.2525 | 0.56 | 200 | 0.1256 | -0.0890 | -0.1477 | 0.5508 | 0.0587 | -380.4802 | -368.3549 | 0.5930 | 0.4955 |
65
+ | 0.2378 | 0.84 | 300 | 0.1249 | -0.2075 | -0.2728 | 0.5391 | 0.0653 | -392.9848 | -380.2012 | 0.7227 | 0.6386 |
 
66
 
67
 
68
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.06584663976538356,
4
- "train_runtime": 4434.0315,
5
- "train_samples": 61134,
6
- "train_samples_per_second": 13.787,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.25543520273117537,
4
+ "train_runtime": 3431.8585,
5
+ "train_samples": 45548,
6
+ "train_samples_per_second": 13.272,
7
+ "train_steps_per_second": 0.104
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:742a3a39155dfe2982b2079fe8048378854d72be66f8bb03992eab95c8d8613f
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c50447b1fa9a3fb8972cc1f2048273e615014e72e1ffdfd61fc351a7b891873
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9663e310a114c8e5dfe67123c8fa2e0b3f06238bc3727bd48b2fbf862d129e4
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4498d8c3e8a696ffa861b1ee0f26ff5a10be34c2c3e49976a7dfbbbf51cabee7
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be6bbc86492f411d627c4ebb8e6aeaff116a8962892dec9b9af59b92427b849f
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a84321fc73683b3e3b8bac682baa0c983237b5d9f38c6384ae3c52399f993f9f
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:841fb520274242fc5c5655fa5d9e40cd6d96f0bb2ae1af50364d0590d1160c1f
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0d7e1ac6e8ed44294cee21a798e86385296ebe7182834397340cb054a7c4516
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.06584663976538356,
4
- "train_runtime": 4434.0315,
5
- "train_samples": 61134,
6
- "train_samples_per_second": 13.787,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.25543520273117537,
4
+ "train_runtime": 3431.8585,
5
+ "train_samples": 45548,
6
+ "train_samples_per_second": 13.272,
7
+ "train_steps_per_second": 0.104
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9984301412872841,
5
  "eval_steps": 100,
6
- "global_step": 477,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.0416666666666666e-08,
14
- "logits/chosen": 0.12788674235343933,
15
- "logits/rejected": 0.34812721610069275,
16
- "logps/chosen": -504.64813232421875,
17
- "logps/rejected": -353.6391906738281,
18
- "loss": 0.1069,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,739 +23,555 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.02,
27
- "learning_rate": 1.0416666666666667e-07,
28
- "logits/chosen": 0.22303083539009094,
29
- "logits/rejected": 0.3398795425891876,
30
- "logps/chosen": -343.9149475097656,
31
- "logps/rejected": -345.42095947265625,
32
- "loss": 0.1091,
33
- "rewards/accuracies": 0.4583333432674408,
34
- "rewards/chosen": 0.0002915965160354972,
35
- "rewards/margins": 0.0005722532514482737,
36
- "rewards/rejected": -0.0002806567645166069,
37
  "step": 10
38
  },
39
- {
40
- "epoch": 0.04,
41
- "learning_rate": 2.0833333333333333e-07,
42
- "logits/chosen": 0.2437092810869217,
43
- "logits/rejected": 0.2768189311027527,
44
- "logps/chosen": -342.15460205078125,
45
- "logps/rejected": -352.68170166015625,
46
- "loss": 0.1087,
47
- "rewards/accuracies": 0.550000011920929,
48
- "rewards/chosen": -0.0004921076470054686,
49
- "rewards/margins": 0.00030653522117063403,
50
- "rewards/rejected": -0.0007986428099684417,
51
- "step": 20
52
- },
53
  {
54
  "epoch": 0.06,
55
- "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": 0.2255886346101761,
57
- "logits/rejected": 0.22949561476707458,
58
- "logps/chosen": -403.088134765625,
59
- "logps/rejected": -395.09552001953125,
60
- "loss": 0.1112,
61
- "rewards/accuracies": 0.612500011920929,
62
- "rewards/chosen": -0.0018422408029437065,
63
- "rewards/margins": 0.002462574513629079,
64
- "rewards/rejected": -0.004304815083742142,
65
- "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
- "logits/chosen": 0.2738032341003418,
71
- "logits/rejected": 0.32951346039772034,
72
- "logps/chosen": -352.05938720703125,
73
- "logps/rejected": -338.80743408203125,
74
- "loss": 0.1093,
75
  "rewards/accuracies": 0.581250011920929,
76
- "rewards/chosen": -0.006794331129640341,
77
- "rewards/margins": 0.003717987332493067,
78
- "rewards/rejected": -0.010512317530810833,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.1,
83
- "learning_rate": 4.999731868769026e-07,
84
- "logits/chosen": 0.22654812037944794,
85
- "logits/rejected": 0.31083282828330994,
86
- "logps/chosen": -363.4710998535156,
87
- "logps/rejected": -358.54168701171875,
88
- "loss": 0.1041,
89
- "rewards/accuracies": 0.65625,
90
- "rewards/chosen": -0.005752457305788994,
91
- "rewards/margins": 0.019924405962228775,
92
- "rewards/rejected": -0.02567686140537262,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.13,
97
- "learning_rate": 4.990353313429303e-07,
98
- "logits/chosen": 0.3616481125354767,
99
- "logits/rejected": 0.386046439409256,
100
- "logps/chosen": -336.10211181640625,
101
- "logps/rejected": -334.69024658203125,
102
- "loss": 0.106,
103
- "rewards/accuracies": 0.675000011920929,
104
- "rewards/chosen": -0.01512543298304081,
105
- "rewards/margins": 0.033848248422145844,
106
- "rewards/rejected": -0.048973675817251205,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.15,
111
- "learning_rate": 4.967625656594781e-07,
112
- "logits/chosen": 0.23518328368663788,
113
- "logits/rejected": 0.3344312310218811,
114
- "logps/chosen": -350.3984680175781,
115
- "logps/rejected": -307.37957763671875,
116
- "loss": 0.1049,
117
- "rewards/accuracies": 0.6625000238418579,
118
- "rewards/chosen": -0.003296907991170883,
119
- "rewards/margins": 0.052046000957489014,
120
- "rewards/rejected": -0.0553429052233696,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.17,
125
- "learning_rate": 4.93167072587771e-07,
126
- "logits/chosen": 0.32164302468299866,
127
- "logits/rejected": 0.3959673047065735,
128
- "logps/chosen": -379.69647216796875,
129
- "logps/rejected": -327.6635437011719,
130
- "loss": 0.1132,
131
- "rewards/accuracies": 0.6937500238418579,
132
- "rewards/chosen": -0.004512617830187082,
133
- "rewards/margins": 0.07668532431125641,
134
- "rewards/rejected": -0.08119793236255646,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.19,
139
- "learning_rate": 4.882681251368548e-07,
140
- "logits/chosen": 0.31702089309692383,
141
- "logits/rejected": 0.4289167821407318,
142
- "logps/chosen": -394.7347717285156,
143
- "logps/rejected": -366.826171875,
144
- "loss": 0.1025,
145
- "rewards/accuracies": 0.71875,
146
- "rewards/chosen": -0.009242130443453789,
147
- "rewards/margins": 0.12692956626415253,
148
- "rewards/rejected": -0.13617169857025146,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.21,
153
- "learning_rate": 4.820919832540181e-07,
154
- "logits/chosen": 0.3820047080516815,
155
- "logits/rejected": 0.4675898551940918,
156
- "logps/chosen": -372.18115234375,
157
- "logps/rejected": -365.79522705078125,
158
- "loss": 0.0975,
159
- "rewards/accuracies": 0.5874999761581421,
160
- "rewards/chosen": -0.08189485222101212,
161
- "rewards/margins": 0.11343145370483398,
162
- "rewards/rejected": -0.1953262984752655,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.21,
167
- "eval_logits/chosen": 0.4692724049091339,
168
- "eval_logits/rejected": 0.533983588218689,
169
- "eval_logps/chosen": -365.49639892578125,
170
- "eval_logps/rejected": -389.4014587402344,
171
- "eval_loss": 0.09751056134700775,
172
- "eval_rewards/accuracies": 0.69140625,
173
- "eval_rewards/chosen": -0.060451939702034,
174
- "eval_rewards/margins": 0.17646832764148712,
175
- "eval_rewards/rejected": -0.23692026734352112,
176
- "eval_runtime": 76.9794,
177
- "eval_samples_per_second": 25.981,
178
- "eval_steps_per_second": 0.416,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.23,
183
- "learning_rate": 4.7467175306295647e-07,
184
- "logits/chosen": 0.37000179290771484,
185
- "logits/rejected": 0.43369150161743164,
186
- "logps/chosen": -378.1351318359375,
187
- "logps/rejected": -378.1277770996094,
188
- "loss": 0.0933,
189
- "rewards/accuracies": 0.6187499761581421,
190
- "rewards/chosen": -0.14543434977531433,
191
- "rewards/margins": 0.1312834918498993,
192
- "rewards/rejected": -0.2767178416252136,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.25,
197
- "learning_rate": 4.6604720940421207e-07,
198
- "logits/chosen": 0.4519842565059662,
199
- "logits/rejected": 0.5497914552688599,
200
- "logps/chosen": -408.4247131347656,
201
- "logps/rejected": -414.9881286621094,
202
- "loss": 0.0929,
203
- "rewards/accuracies": 0.6625000238418579,
204
- "rewards/chosen": -0.19376961886882782,
205
- "rewards/margins": 0.1563883125782013,
206
- "rewards/rejected": -0.3501579165458679,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.27,
211
- "learning_rate": 4.5626458262912735e-07,
212
- "logits/chosen": 0.5827921628952026,
213
- "logits/rejected": 0.6809111833572388,
214
- "logps/chosen": -420.0984802246094,
215
- "logps/rejected": -399.5935363769531,
216
- "loss": 0.0789,
217
- "rewards/accuracies": 0.6000000238418579,
218
- "rewards/chosen": -0.28970545530319214,
219
- "rewards/margins": 0.15940071642398834,
220
- "rewards/rejected": -0.4491061270236969,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.29,
225
- "learning_rate": 4.453763107901675e-07,
226
- "logits/chosen": 0.6244224309921265,
227
- "logits/rejected": 0.746228814125061,
228
- "logps/chosen": -396.53076171875,
229
- "logps/rejected": -390.9623718261719,
230
- "loss": 0.0717,
231
- "rewards/accuracies": 0.6625000238418579,
232
- "rewards/chosen": -0.3686402440071106,
233
- "rewards/margins": 0.18962158262729645,
234
- "rewards/rejected": -0.5582617521286011,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.31,
239
- "learning_rate": 4.3344075855595097e-07,
240
- "logits/chosen": 0.6669297218322754,
241
- "logits/rejected": 0.8208922147750854,
242
- "logps/chosen": -387.5301818847656,
243
- "logps/rejected": -378.3419189453125,
244
- "loss": 0.0647,
245
- "rewards/accuracies": 0.6000000238418579,
246
- "rewards/chosen": -0.41620713472366333,
247
- "rewards/margins": 0.1934729665517807,
248
- "rewards/rejected": -0.6096801161766052,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.33,
253
- "learning_rate": 4.2052190435769554e-07,
254
- "logits/chosen": 0.6333284974098206,
255
- "logits/rejected": 0.7795067429542542,
256
- "logps/chosen": -428.93841552734375,
257
- "logps/rejected": -450.5494079589844,
258
- "loss": 0.0619,
259
- "rewards/accuracies": 0.6875,
260
- "rewards/chosen": -0.4103819727897644,
261
- "rewards/margins": 0.2781962454319,
262
- "rewards/rejected": -0.6885782480239868,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.36,
267
- "learning_rate": 4.0668899744407567e-07,
268
- "logits/chosen": 0.6851844787597656,
269
- "logits/rejected": 0.8698636889457703,
270
- "logps/chosen": -394.453369140625,
271
- "logps/rejected": -400.83892822265625,
272
- "loss": 0.0613,
273
- "rewards/accuracies": 0.612500011920929,
274
- "rewards/chosen": -0.49455365538597107,
275
- "rewards/margins": 0.24642686545848846,
276
- "rewards/rejected": -0.7409806251525879,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.38,
281
- "learning_rate": 3.920161866827889e-07,
282
- "logits/chosen": 0.579459011554718,
283
- "logits/rejected": 0.6854727864265442,
284
- "logps/chosen": -381.6180419921875,
285
- "logps/rejected": -419.34869384765625,
286
- "loss": 0.0616,
287
- "rewards/accuracies": 0.625,
288
- "rewards/chosen": -0.476001501083374,
289
- "rewards/margins": 0.2683504521846771,
290
- "rewards/rejected": -0.7443519830703735,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.4,
295
- "learning_rate": 3.765821230985757e-07,
296
- "logits/chosen": 0.5569711923599243,
297
- "logits/rejected": 0.6708570718765259,
298
- "logps/chosen": -383.0780334472656,
299
- "logps/rejected": -407.76837158203125,
300
- "loss": 0.0592,
301
- "rewards/accuracies": 0.637499988079071,
302
- "rewards/chosen": -0.4089416563510895,
303
- "rewards/margins": 0.28474992513656616,
304
- "rewards/rejected": -0.693691611289978,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.42,
309
- "learning_rate": 3.604695382782159e-07,
310
- "logits/chosen": 0.49640387296676636,
311
- "logits/rejected": 0.604566216468811,
312
- "logps/chosen": -433.7373046875,
313
- "logps/rejected": -452.308837890625,
314
- "loss": 0.0589,
315
- "rewards/accuracies": 0.668749988079071,
316
- "rewards/chosen": -0.47900503873825073,
317
- "rewards/margins": 0.30649885535240173,
318
- "rewards/rejected": -0.7855038046836853,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.42,
323
- "eval_logits/chosen": 0.6615116596221924,
324
- "eval_logits/rejected": 0.7807996273040771,
325
- "eval_logps/chosen": -404.0002136230469,
326
- "eval_logps/rejected": -453.07177734375,
327
- "eval_loss": 0.05819432809948921,
328
- "eval_rewards/accuracies": 0.71484375,
329
- "eval_rewards/chosen": -0.4454895853996277,
330
- "eval_rewards/margins": 0.42813408374786377,
331
- "eval_rewards/rejected": -0.8736236691474915,
332
- "eval_runtime": 75.0575,
333
- "eval_samples_per_second": 26.646,
334
- "eval_steps_per_second": 0.426,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.44,
339
- "learning_rate": 3.4376480090239047e-07,
340
- "logits/chosen": 0.5758289098739624,
341
- "logits/rejected": 0.6775172352790833,
342
- "logps/chosen": -441.56683349609375,
343
- "logps/rejected": -425.92437744140625,
344
- "loss": 0.0567,
345
- "rewards/accuracies": 0.668749988079071,
346
- "rewards/chosen": -0.562717080116272,
347
- "rewards/margins": 0.29301005601882935,
348
- "rewards/rejected": -0.8557270765304565,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.46,
353
- "learning_rate": 3.265574537815398e-07,
354
- "logits/chosen": 0.423481285572052,
355
- "logits/rejected": 0.6732310056686401,
356
- "logps/chosen": -423.9397888183594,
357
- "logps/rejected": -425.78045654296875,
358
- "loss": 0.0577,
359
- "rewards/accuracies": 0.7437499761581421,
360
- "rewards/chosen": -0.43129101395606995,
361
- "rewards/margins": 0.49137812852859497,
362
- "rewards/rejected": -0.9226692318916321,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.48,
367
- "learning_rate": 3.0893973387735683e-07,
368
- "logits/chosen": 0.46089068055152893,
369
- "logits/rejected": 0.6886599659919739,
370
- "logps/chosen": -458.5089416503906,
371
- "logps/rejected": -429.6102600097656,
372
- "loss": 0.058,
373
- "rewards/accuracies": 0.706250011920929,
374
- "rewards/chosen": -0.49578744173049927,
375
- "rewards/margins": 0.4050619602203369,
376
- "rewards/rejected": -0.900849461555481,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.5,
381
- "learning_rate": 2.910060778827554e-07,
382
- "logits/chosen": 0.581864595413208,
383
- "logits/rejected": 0.7646275758743286,
384
- "logps/chosen": -428.42803955078125,
385
- "logps/rejected": -440.18597412109375,
386
- "loss": 0.0611,
387
- "rewards/accuracies": 0.6812499761581421,
388
- "rewards/chosen": -0.45797547698020935,
389
- "rewards/margins": 0.4529312551021576,
390
- "rewards/rejected": -0.9109067916870117,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.52,
395
- "learning_rate": 2.7285261601056697e-07,
396
- "logits/chosen": 0.5814759135246277,
397
- "logits/rejected": 0.7270434498786926,
398
- "logps/chosen": -398.45135498046875,
399
- "logps/rejected": -447.3760681152344,
400
- "loss": 0.0551,
401
- "rewards/accuracies": 0.699999988079071,
402
- "rewards/chosen": -0.5682977437973022,
403
- "rewards/margins": 0.40714630484580994,
404
- "rewards/rejected": -0.9754441380500793,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.54,
409
- "learning_rate": 2.5457665670441937e-07,
410
- "logits/chosen": 0.540181040763855,
411
- "logits/rejected": 0.705514669418335,
412
- "logps/chosen": -430.0947265625,
413
- "logps/rejected": -455.96466064453125,
414
- "loss": 0.0543,
415
- "rewards/accuracies": 0.6937500238418579,
416
- "rewards/chosen": -0.5835620164871216,
417
- "rewards/margins": 0.3612635135650635,
418
- "rewards/rejected": -0.9448255300521851,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.57,
423
- "learning_rate": 2.3627616503391812e-07,
424
- "logits/chosen": 0.531669020652771,
425
- "logits/rejected": 0.6921663880348206,
426
- "logps/chosen": -411.39947509765625,
427
- "logps/rejected": -438.072265625,
428
- "loss": 0.0529,
429
- "rewards/accuracies": 0.65625,
430
- "rewards/chosen": -0.581498384475708,
431
- "rewards/margins": 0.41039901971817017,
432
- "rewards/rejected": -0.9918974041938782,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.59,
437
- "learning_rate": 2.1804923757009882e-07,
438
- "logits/chosen": 0.5589742064476013,
439
- "logits/rejected": 0.6747141480445862,
440
- "logps/chosen": -441.4170837402344,
441
- "logps/rejected": -477.62310791015625,
442
- "loss": 0.0506,
443
- "rewards/accuracies": 0.668749988079071,
444
- "rewards/chosen": -0.57341468334198,
445
- "rewards/margins": 0.4166173040866852,
446
- "rewards/rejected": -0.990031898021698,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.61,
451
- "learning_rate": 1.9999357655598891e-07,
452
- "logits/chosen": 0.6409920454025269,
453
- "logits/rejected": 0.8697878122329712,
454
- "logps/chosen": -446.1312561035156,
455
- "logps/rejected": -445.7093811035156,
456
- "loss": 0.048,
457
- "rewards/accuracies": 0.625,
458
- "rewards/chosen": -0.6841451525688171,
459
- "rewards/margins": 0.47665899991989136,
460
- "rewards/rejected": -1.1608041524887085,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.63,
465
- "learning_rate": 1.8220596619089573e-07,
466
- "logits/chosen": 0.67746901512146,
467
- "logits/rejected": 0.8398680686950684,
468
- "logps/chosen": -440.12237548828125,
469
- "logps/rejected": -460.32086181640625,
470
- "loss": 0.0465,
471
- "rewards/accuracies": 0.637499988079071,
472
- "rewards/chosen": -0.7187305688858032,
473
- "rewards/margins": 0.3237985372543335,
474
- "rewards/rejected": -1.0425291061401367,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.63,
479
- "eval_logits/chosen": 0.7931328415870667,
480
- "eval_logits/rejected": 0.8960775136947632,
481
- "eval_logps/chosen": -419.99542236328125,
482
- "eval_logps/rejected": -477.4249267578125,
483
- "eval_loss": 0.04939539358019829,
484
- "eval_rewards/accuracies": 0.703125,
485
- "eval_rewards/chosen": -0.6054419279098511,
486
- "eval_rewards/margins": 0.511713445186615,
487
- "eval_rewards/rejected": -1.1171554327011108,
488
- "eval_runtime": 75.2617,
489
- "eval_samples_per_second": 26.574,
490
- "eval_steps_per_second": 0.425,
491
  "step": 300
492
  },
493
  {
494
- "epoch": 0.65,
495
- "learning_rate": 1.647817538357072e-07,
496
- "logits/chosen": 0.6320704817771912,
497
- "logits/rejected": 0.8103192448616028,
498
- "logps/chosen": -424.61865234375,
499
- "logps/rejected": -452.2117614746094,
500
- "loss": 0.0484,
501
- "rewards/accuracies": 0.606249988079071,
502
- "rewards/chosen": -0.608985185623169,
503
- "rewards/margins": 0.3958033323287964,
504
- "rewards/rejected": -1.0047886371612549,
505
  "step": 310
506
  },
507
  {
508
- "epoch": 0.67,
509
- "learning_rate": 1.478143389201113e-07,
510
- "logits/chosen": 0.7435864806175232,
511
- "logits/rejected": 0.9429095983505249,
512
- "logps/chosen": -452.36004638671875,
513
- "logps/rejected": -481.8624572753906,
514
- "loss": 0.0448,
515
- "rewards/accuracies": 0.731249988079071,
516
- "rewards/chosen": -0.628174901008606,
517
- "rewards/margins": 0.41646808385849,
518
- "rewards/rejected": -1.0446430444717407,
519
  "step": 320
520
  },
521
  {
522
- "epoch": 0.69,
523
- "learning_rate": 1.3139467229135998e-07,
524
- "logits/chosen": 0.6155081987380981,
525
- "logits/rejected": 0.7582153081893921,
526
- "logps/chosen": -426.2732849121094,
527
- "logps/rejected": -476.5437927246094,
528
- "loss": 0.0473,
529
- "rewards/accuracies": 0.668749988079071,
530
- "rewards/chosen": -0.6611535549163818,
531
- "rewards/margins": 0.38004034757614136,
532
- "rewards/rejected": -1.041193962097168,
533
  "step": 330
534
  },
535
- {
536
- "epoch": 0.71,
537
- "learning_rate": 1.1561076868822755e-07,
538
- "logits/chosen": 0.5263934135437012,
539
- "logits/rejected": 0.7371311783790588,
540
- "logps/chosen": -459.9794006347656,
541
- "logps/rejected": -492.32977294921875,
542
- "loss": 0.0484,
543
- "rewards/accuracies": 0.6875,
544
- "rewards/chosen": -0.6238055229187012,
545
- "rewards/margins": 0.46394386887550354,
546
- "rewards/rejected": -1.0877494812011719,
547
- "step": 340
548
- },
549
- {
550
- "epoch": 0.73,
551
- "learning_rate": 1.0054723495346482e-07,
552
- "logits/chosen": 0.6952361464500427,
553
- "logits/rejected": 0.7730409502983093,
554
- "logps/chosen": -392.53411865234375,
555
- "logps/rejected": -445.5184631347656,
556
- "loss": 0.0467,
557
- "rewards/accuracies": 0.6812499761581421,
558
- "rewards/chosen": -0.6540313959121704,
559
- "rewards/margins": 0.43615293502807617,
560
- "rewards/rejected": -1.0901843309402466,
561
- "step": 350
562
- },
563
- {
564
- "epoch": 0.75,
565
- "learning_rate": 8.628481651367875e-08,
566
- "logits/chosen": 0.645788311958313,
567
- "logits/rejected": 0.8300139307975769,
568
- "logps/chosen": -437.50830078125,
569
- "logps/rejected": -429.37890625,
570
- "loss": 0.0524,
571
- "rewards/accuracies": 0.6499999761581421,
572
- "rewards/chosen": -0.6759519577026367,
573
- "rewards/margins": 0.35926973819732666,
574
- "rewards/rejected": -1.0352216958999634,
575
- "step": 360
576
- },
577
- {
578
- "epoch": 0.77,
579
- "learning_rate": 7.289996455765748e-08,
580
- "logits/chosen": 0.6347015500068665,
581
- "logits/rejected": 0.8841344714164734,
582
- "logps/chosen": -434.65313720703125,
583
- "logps/rejected": -425.39825439453125,
584
- "loss": 0.0428,
585
- "rewards/accuracies": 0.7124999761581421,
586
- "rewards/chosen": -0.5998077988624573,
587
- "rewards/margins": 0.45034995675086975,
588
- "rewards/rejected": -1.0501576662063599,
589
- "step": 370
590
- },
591
- {
592
- "epoch": 0.8,
593
- "learning_rate": 6.046442623320145e-08,
594
- "logits/chosen": 0.653687596321106,
595
- "logits/rejected": 0.7189717292785645,
596
- "logps/chosen": -426.47674560546875,
597
- "logps/rejected": -455.5611267089844,
598
- "loss": 0.0501,
599
- "rewards/accuracies": 0.6187499761581421,
600
- "rewards/chosen": -0.6170892119407654,
601
- "rewards/margins": 0.3933621644973755,
602
- "rewards/rejected": -1.010451316833496,
603
- "step": 380
604
- },
605
- {
606
- "epoch": 0.82,
607
- "learning_rate": 4.904486005914027e-08,
608
- "logits/chosen": 0.5192676186561584,
609
- "logits/rejected": 0.7548397183418274,
610
- "logps/chosen": -476.45904541015625,
611
- "logps/rejected": -474.6182556152344,
612
- "loss": 0.0451,
613
- "rewards/accuracies": 0.737500011920929,
614
- "rewards/chosen": -0.718641459941864,
615
- "rewards/margins": 0.44875186681747437,
616
- "rewards/rejected": -1.167393445968628,
617
- "step": 390
618
- },
619
- {
620
- "epoch": 0.84,
621
- "learning_rate": 3.8702478614051345e-08,
622
- "logits/chosen": 0.6225503087043762,
623
- "logits/rejected": 0.731469988822937,
624
- "logps/chosen": -407.16912841796875,
625
- "logps/rejected": -418.2110290527344,
626
- "loss": 0.0419,
627
- "rewards/accuracies": 0.6499999761581421,
628
- "rewards/chosen": -0.6035235524177551,
629
- "rewards/margins": 0.3522457182407379,
630
- "rewards/rejected": -0.9557692408561707,
631
- "step": 400
632
- },
633
- {
634
- "epoch": 0.84,
635
- "eval_logits/chosen": 0.7144887447357178,
636
- "eval_logits/rejected": 0.832917332649231,
637
- "eval_logps/chosen": -416.8811950683594,
638
- "eval_logps/rejected": -477.0538330078125,
639
- "eval_loss": 0.049533091485500336,
640
- "eval_rewards/accuracies": 0.734375,
641
- "eval_rewards/chosen": -0.5742998123168945,
642
- "eval_rewards/margins": 0.5391446352005005,
643
- "eval_rewards/rejected": -1.1134445667266846,
644
- "eval_runtime": 76.9908,
645
- "eval_samples_per_second": 25.977,
646
- "eval_steps_per_second": 0.416,
647
- "step": 400
648
- },
649
- {
650
- "epoch": 0.86,
651
- "learning_rate": 2.9492720416985e-08,
652
- "logits/chosen": 0.5682260990142822,
653
- "logits/rejected": 0.7126413583755493,
654
- "logps/chosen": -419.2569885253906,
655
- "logps/rejected": -410.59014892578125,
656
- "loss": 0.0473,
657
- "rewards/accuracies": 0.637499988079071,
658
- "rewards/chosen": -0.652999222278595,
659
- "rewards/margins": 0.26512840390205383,
660
- "rewards/rejected": -0.9181275367736816,
661
- "step": 410
662
- },
663
- {
664
- "epoch": 0.88,
665
- "learning_rate": 2.1464952759020856e-08,
666
- "logits/chosen": 0.6080732345581055,
667
- "logits/rejected": 0.7386394739151001,
668
- "logps/chosen": -452.77789306640625,
669
- "logps/rejected": -437.8445739746094,
670
- "loss": 0.0469,
671
- "rewards/accuracies": 0.65625,
672
- "rewards/chosen": -0.7366248369216919,
673
- "rewards/margins": 0.2541760802268982,
674
- "rewards/rejected": -0.9908009767532349,
675
- "step": 420
676
- },
677
- {
678
- "epoch": 0.9,
679
- "learning_rate": 1.4662207078575684e-08,
680
- "logits/chosen": 0.6554642915725708,
681
- "logits/rejected": 0.7158025503158569,
682
- "logps/chosen": -407.20953369140625,
683
- "logps/rejected": -471.7041015625,
684
- "loss": 0.0453,
685
- "rewards/accuracies": 0.762499988079071,
686
- "rewards/chosen": -0.6344213485717773,
687
- "rewards/margins": 0.481538861989975,
688
- "rewards/rejected": -1.1159603595733643,
689
- "step": 430
690
- },
691
- {
692
- "epoch": 0.92,
693
- "learning_rate": 9.12094829893642e-09,
694
- "logits/chosen": 0.7153126001358032,
695
- "logits/rejected": 0.7965753078460693,
696
- "logps/chosen": -441.6089782714844,
697
- "logps/rejected": -466.2574768066406,
698
- "loss": 0.0487,
699
- "rewards/accuracies": 0.612500011920929,
700
- "rewards/chosen": -0.6991580724716187,
701
- "rewards/margins": 0.4110774099826813,
702
- "rewards/rejected": -1.1102354526519775,
703
- "step": 440
704
- },
705
- {
706
- "epoch": 0.94,
707
- "learning_rate": 4.8708793644441086e-09,
708
- "logits/chosen": 0.6587673425674438,
709
- "logits/rejected": 0.830274760723114,
710
- "logps/chosen": -465.8287658691406,
711
- "logps/rejected": -467.7762145996094,
712
- "loss": 0.0461,
713
- "rewards/accuracies": 0.6937500238418579,
714
- "rewards/chosen": -0.6734641194343567,
715
- "rewards/margins": 0.3525208532810211,
716
- "rewards/rejected": -1.0259850025177002,
717
- "step": 450
718
- },
719
  {
720
  "epoch": 0.96,
721
- "learning_rate": 1.9347820230782295e-09,
722
- "logits/chosen": 0.7173280715942383,
723
- "logits/rejected": 0.8633974194526672,
724
- "logps/chosen": -380.0779724121094,
725
- "logps/rejected": -403.060302734375,
726
- "loss": 0.0456,
727
- "rewards/accuracies": 0.6875,
728
- "rewards/chosen": -0.5644342303276062,
729
- "rewards/margins": 0.3927594721317291,
730
- "rewards/rejected": -0.9571938514709473,
731
- "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
- "learning_rate": 3.2839470889836627e-10,
736
- "logits/chosen": 0.6316866278648376,
737
- "logits/rejected": 0.7777234315872192,
738
- "logps/chosen": -403.5115661621094,
739
- "logps/rejected": -452.41864013671875,
740
- "loss": 0.0474,
741
- "rewards/accuracies": 0.675000011920929,
742
- "rewards/chosen": -0.6106697916984558,
743
- "rewards/margins": 0.4684695303440094,
744
- "rewards/rejected": -1.0791394710540771,
745
- "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
- "step": 477,
750
  "total_flos": 0.0,
751
- "train_loss": 0.06584663976538356,
752
- "train_runtime": 4434.0315,
753
- "train_samples_per_second": 13.787,
754
- "train_steps_per_second": 0.108
755
  }
756
  ],
757
  "logging_steps": 10,
758
- "max_steps": 477,
759
  "num_train_epochs": 1,
760
  "save_steps": 1000,
761
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 356,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.3888888888888887e-08,
14
+ "logits/chosen": -0.07916320115327835,
15
+ "logits/rejected": 0.09423620253801346,
16
+ "logps/chosen": -527.0689697265625,
17
+ "logps/rejected": -183.19036865234375,
18
+ "loss": 0.2697,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.03,
27
+ "learning_rate": 1.3888888888888888e-07,
28
+ "logits/chosen": 0.004179990850389004,
29
+ "logits/rejected": 0.11239409446716309,
30
+ "logps/chosen": -361.7335205078125,
31
+ "logps/rejected": -210.11724853515625,
32
+ "loss": 0.2694,
33
+ "rewards/accuracies": 0.4375,
34
+ "rewards/chosen": -0.0008115082746371627,
35
+ "rewards/margins": -0.0008267887169495225,
36
+ "rewards/rejected": 1.528057146060746e-05,
37
  "step": 10
38
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  {
40
  "epoch": 0.06,
41
+ "learning_rate": 2.7777777777777776e-07,
42
+ "logits/chosen": 0.026069095358252525,
43
+ "logits/rejected": 0.120852991938591,
44
+ "logps/chosen": -340.1752624511719,
45
+ "logps/rejected": -205.56005859375,
46
+ "loss": 0.2674,
47
+ "rewards/accuracies": 0.5375000238418579,
48
+ "rewards/chosen": 0.0007374463602900505,
49
+ "rewards/margins": 0.002576880855485797,
50
+ "rewards/rejected": -0.00183943472802639,
51
+ "step": 20
52
  },
53
  {
54
  "epoch": 0.08,
55
  "learning_rate": 4.1666666666666667e-07,
56
+ "logits/chosen": 0.0880424827337265,
57
+ "logits/rejected": 0.18464604020118713,
58
+ "logps/chosen": -364.30645751953125,
59
+ "logps/rejected": -223.30856323242188,
60
+ "loss": 0.2594,
61
  "rewards/accuracies": 0.581250011920929,
62
+ "rewards/chosen": 0.011071065440773964,
63
+ "rewards/margins": 0.01881546340882778,
64
+ "rewards/rejected": -0.007744398899376392,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.11,
69
+ "learning_rate": 4.998072590601808e-07,
70
+ "logits/chosen": 0.03276750445365906,
71
+ "logits/rejected": 0.11399135738611221,
72
+ "logps/chosen": -326.2428894042969,
73
+ "logps/rejected": -201.30242919921875,
74
+ "loss": 0.2715,
75
+ "rewards/accuracies": 0.6312500238418579,
76
+ "rewards/chosen": 0.012436876073479652,
77
+ "rewards/margins": 0.04091664031147957,
78
+ "rewards/rejected": -0.028479766100645065,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.14,
83
+ "learning_rate": 4.976423351108942e-07,
84
+ "logits/chosen": 0.027437573298811913,
85
+ "logits/rejected": 0.14870569109916687,
86
+ "logps/chosen": -334.8109436035156,
87
+ "logps/rejected": -226.3872528076172,
88
+ "loss": 0.2705,
89
+ "rewards/accuracies": 0.5375000238418579,
90
+ "rewards/chosen": 0.005514688324183226,
91
+ "rewards/margins": 0.05494096130132675,
92
+ "rewards/rejected": -0.04942627623677254,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.17,
97
+ "learning_rate": 4.930924800994191e-07,
98
+ "logits/chosen": -0.05219824239611626,
99
+ "logits/rejected": 0.0783001258969307,
100
+ "logps/chosen": -381.02630615234375,
101
+ "logps/rejected": -222.3650665283203,
102
+ "loss": 0.279,
103
+ "rewards/accuracies": 0.5687500238418579,
104
+ "rewards/chosen": 0.007738674990832806,
105
+ "rewards/margins": 0.15125080943107605,
106
+ "rewards/rejected": -0.14351214468479156,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.2,
111
+ "learning_rate": 4.862015116167195e-07,
112
+ "logits/chosen": -0.03661634773015976,
113
+ "logits/rejected": 0.07865114510059357,
114
+ "logps/chosen": -399.9691467285156,
115
+ "logps/rejected": -226.57077026367188,
116
+ "loss": 0.2772,
117
+ "rewards/accuracies": 0.574999988079071,
118
+ "rewards/chosen": 0.03858373686671257,
119
+ "rewards/margins": 0.1999650001525879,
120
+ "rewards/rejected": -0.16138127446174622,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.22,
125
+ "learning_rate": 4.770357934562704e-07,
126
+ "logits/chosen": -0.09548693895339966,
127
+ "logits/rejected": 0.029117891564965248,
128
+ "logps/chosen": -338.6766662597656,
129
+ "logps/rejected": -209.7130889892578,
130
+ "loss": 0.2781,
131
+ "rewards/accuracies": 0.643750011920929,
132
+ "rewards/chosen": 0.03253953158855438,
133
+ "rewards/margins": 0.21850749850273132,
134
+ "rewards/rejected": -0.18596798181533813,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.25,
139
+ "learning_rate": 4.6568359649444796e-07,
140
+ "logits/chosen": -0.04077336937189102,
141
+ "logits/rejected": 0.011481313034892082,
142
+ "logps/chosen": -378.87640380859375,
143
+ "logps/rejected": -251.39126586914062,
144
+ "loss": 0.2914,
145
+ "rewards/accuracies": 0.625,
146
+ "rewards/chosen": 0.0237547867000103,
147
+ "rewards/margins": 0.26172345876693726,
148
+ "rewards/rejected": -0.23796863853931427,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.28,
153
+ "learning_rate": 4.5225424859373684e-07,
154
+ "logits/chosen": -0.031900886446237564,
155
+ "logits/rejected": 0.12457527965307236,
156
+ "logps/chosen": -362.72149658203125,
157
+ "logps/rejected": -228.3905029296875,
158
+ "loss": 0.2868,
159
+ "rewards/accuracies": 0.675000011920929,
160
+ "rewards/chosen": 0.0559692457318306,
161
+ "rewards/margins": 0.2908957004547119,
162
+ "rewards/rejected": -0.2349264919757843,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.28,
167
+ "eval_logits/chosen": 0.2621645927429199,
168
+ "eval_logits/rejected": 0.3182756006717682,
169
+ "eval_logps/chosen": -344.21881103515625,
170
+ "eval_logps/rejected": -354.05804443359375,
171
+ "eval_loss": 0.1333150714635849,
172
+ "eval_rewards/accuracies": 0.54296875,
173
+ "eval_rewards/chosen": 0.15232382714748383,
174
+ "eval_rewards/margins": 0.035810258239507675,
175
+ "eval_rewards/rejected": 0.11651356518268585,
176
+ "eval_runtime": 73.7435,
177
+ "eval_samples_per_second": 27.121,
178
+ "eval_steps_per_second": 0.434,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.31,
183
+ "learning_rate": 4.3687708171564917e-07,
184
+ "logits/chosen": -0.055425770580768585,
185
+ "logits/rejected": 0.09765736013650894,
186
+ "logps/chosen": -321.40301513671875,
187
+ "logps/rejected": -256.71246337890625,
188
+ "loss": 0.2724,
189
+ "rewards/accuracies": 0.612500011920929,
190
+ "rewards/chosen": -0.053195059299468994,
191
+ "rewards/margins": 0.14237919449806213,
192
+ "rewards/rejected": -0.19557425379753113,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.34,
197
+ "learning_rate": 4.1970018638323547e-07,
198
+ "logits/chosen": 0.15357539057731628,
199
+ "logits/rejected": 0.26186805963516235,
200
+ "logps/chosen": -320.7575378417969,
201
+ "logps/rejected": -218.4688262939453,
202
+ "loss": 0.2627,
203
+ "rewards/accuracies": 0.637499988079071,
204
+ "rewards/chosen": -0.046941496431827545,
205
+ "rewards/margins": 0.1810278296470642,
206
+ "rewards/rejected": -0.22796931862831116,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.37,
211
+ "learning_rate": 4.0088898548839285e-07,
212
+ "logits/chosen": 0.08365978300571442,
213
+ "logits/rejected": 0.24095574021339417,
214
+ "logps/chosen": -407.60296630859375,
215
+ "logps/rejected": -238.1927947998047,
216
+ "loss": 0.2686,
217
+ "rewards/accuracies": 0.706250011920929,
218
+ "rewards/chosen": 0.09180920571088791,
219
+ "rewards/margins": 0.40369096398353577,
220
+ "rewards/rejected": -0.31188178062438965,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.39,
225
+ "learning_rate": 3.806246411789872e-07,
226
+ "logits/chosen": 0.022735467180609703,
227
+ "logits/rejected": 0.15050409734249115,
228
+ "logps/chosen": -329.76239013671875,
229
+ "logps/rejected": -232.79110717773438,
230
+ "loss": 0.2578,
231
+ "rewards/accuracies": 0.625,
232
+ "rewards/chosen": -0.042784951627254486,
233
+ "rewards/margins": 0.22251620888710022,
234
+ "rewards/rejected": -0.2653011679649353,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.42,
239
+ "learning_rate": 3.5910231016833546e-07,
240
+ "logits/chosen": 0.03553224354982376,
241
+ "logits/rejected": 0.17889562249183655,
242
+ "logps/chosen": -353.3590087890625,
243
+ "logps/rejected": -262.5151062011719,
244
+ "loss": 0.2602,
245
+ "rewards/accuracies": 0.5625,
246
+ "rewards/chosen": -0.09243994206190109,
247
+ "rewards/margins": 0.20844857394695282,
248
+ "rewards/rejected": -0.3008885383605957,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.45,
253
+ "learning_rate": 3.3652926426937325e-07,
254
+ "logits/chosen": 0.1453518569469452,
255
+ "logits/rejected": 0.3264540731906891,
256
+ "logps/chosen": -371.01690673828125,
257
+ "logps/rejected": -241.61587524414062,
258
+ "loss": 0.2524,
259
+ "rewards/accuracies": 0.6937500238418579,
260
+ "rewards/chosen": -0.028423363342881203,
261
+ "rewards/margins": 0.3061867952346802,
262
+ "rewards/rejected": -0.3346101641654968,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.48,
267
+ "learning_rate": 3.1312289425378944e-07,
268
+ "logits/chosen": 0.1509348452091217,
269
+ "logits/rejected": 0.2768942713737488,
270
+ "logps/chosen": -338.87677001953125,
271
+ "logps/rejected": -241.0658721923828,
272
+ "loss": 0.2506,
273
+ "rewards/accuracies": 0.6312500238418579,
274
+ "rewards/chosen": -0.1020059734582901,
275
+ "rewards/margins": 0.2968062162399292,
276
+ "rewards/rejected": -0.3988121747970581,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.51,
281
+ "learning_rate": 2.8910861626005773e-07,
282
+ "logits/chosen": 0.040277767926454544,
283
+ "logits/rejected": 0.27446210384368896,
284
+ "logps/chosen": -357.395751953125,
285
+ "logps/rejected": -251.81118774414062,
286
+ "loss": 0.2477,
287
+ "rewards/accuracies": 0.606249988079071,
288
+ "rewards/chosen": -0.1509798765182495,
289
+ "rewards/margins": 0.2679017186164856,
290
+ "rewards/rejected": -0.4188815653324127,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.53,
295
+ "learning_rate": 2.647177009127972e-07,
296
+ "logits/chosen": 0.013982865028083324,
297
+ "logits/rejected": 0.2294052094221115,
298
+ "logps/chosen": -367.9175720214844,
299
+ "logps/rejected": -243.03515625,
300
+ "loss": 0.2495,
301
+ "rewards/accuracies": 0.643750011920929,
302
+ "rewards/chosen": -0.10979857295751572,
303
+ "rewards/margins": 0.31621265411376953,
304
+ "rewards/rejected": -0.42601123452186584,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.56,
309
+ "learning_rate": 2.401850460602329e-07,
310
+ "logits/chosen": -0.03239673003554344,
311
+ "logits/rejected": 0.16571494936943054,
312
+ "logps/chosen": -379.0220642089844,
313
+ "logps/rejected": -245.20059204101562,
314
+ "loss": 0.2525,
315
+ "rewards/accuracies": 0.6625000238418579,
316
+ "rewards/chosen": -0.0782201737165451,
317
+ "rewards/margins": 0.3675476312637329,
318
+ "rewards/rejected": -0.4457678198814392,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.56,
323
+ "eval_logits/chosen": 0.49550750851631165,
324
+ "eval_logits/rejected": 0.5930490493774414,
325
+ "eval_logps/chosen": -368.35491943359375,
326
+ "eval_logps/rejected": -380.480224609375,
327
+ "eval_loss": 0.1256043165922165,
328
+ "eval_rewards/accuracies": 0.55078125,
329
+ "eval_rewards/chosen": -0.08903706073760986,
330
+ "eval_rewards/margins": 0.058670774102211,
331
+ "eval_rewards/rejected": -0.14770783483982086,
332
+ "eval_runtime": 75.4328,
333
+ "eval_samples_per_second": 26.514,
334
+ "eval_steps_per_second": 0.424,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.59,
339
+ "learning_rate": 2.1574691457950803e-07,
340
+ "logits/chosen": 0.12455999851226807,
341
+ "logits/rejected": 0.324402391910553,
342
+ "logps/chosen": -438.97503662109375,
343
+ "logps/rejected": -254.6726837158203,
344
+ "loss": 0.2425,
345
+ "rewards/accuracies": 0.643750011920929,
346
+ "rewards/chosen": -0.06605833023786545,
347
+ "rewards/margins": 0.47193247079849243,
348
+ "rewards/rejected": -0.5379907488822937,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.62,
353
+ "learning_rate": 1.9163865903602372e-07,
354
+ "logits/chosen": 0.1709764003753662,
355
+ "logits/rejected": 0.3544366955757141,
356
+ "logps/chosen": -394.0027160644531,
357
+ "logps/rejected": -261.89007568359375,
358
+ "loss": 0.2455,
359
+ "rewards/accuracies": 0.675000011920929,
360
+ "rewards/chosen": -0.12171328067779541,
361
+ "rewards/margins": 0.3835197985172272,
362
+ "rewards/rejected": -0.5052330493927002,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.65,
367
+ "learning_rate": 1.6809245510957666e-07,
368
+ "logits/chosen": 0.09081762284040451,
369
+ "logits/rejected": 0.2811218202114105,
370
+ "logps/chosen": -364.36785888671875,
371
+ "logps/rejected": -250.978271484375,
372
+ "loss": 0.2368,
373
+ "rewards/accuracies": 0.59375,
374
+ "rewards/chosen": -0.14235250651836395,
375
+ "rewards/margins": 0.30515140295028687,
376
+ "rewards/rejected": -0.44750386476516724,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.67,
381
+ "learning_rate": 1.4533506561564305e-07,
382
+ "logits/chosen": 0.03653250262141228,
383
+ "logits/rejected": 0.2626824378967285,
384
+ "logps/chosen": -391.4312438964844,
385
+ "logps/rejected": -266.96502685546875,
386
+ "loss": 0.2292,
387
+ "rewards/accuracies": 0.6625000238418579,
388
+ "rewards/chosen": -0.14164027571678162,
389
+ "rewards/margins": 0.358634352684021,
390
+ "rewards/rejected": -0.500274658203125,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.7,
395
+ "learning_rate": 1.2358565665550387e-07,
396
+ "logits/chosen": 0.17212027311325073,
397
+ "logits/rejected": 0.40048956871032715,
398
+ "logps/chosen": -332.1188049316406,
399
+ "logps/rejected": -250.65560913085938,
400
+ "loss": 0.2391,
401
+ "rewards/accuracies": 0.6625000238418579,
402
+ "rewards/chosen": -0.24126167595386505,
403
+ "rewards/margins": 0.3293803334236145,
404
+ "rewards/rejected": -0.5706420540809631,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.73,
409
+ "learning_rate": 1.0305368692688174e-07,
410
+ "logits/chosen": -0.08241891115903854,
411
+ "logits/rejected": 0.16553013026714325,
412
+ "logps/chosen": -412.36114501953125,
413
+ "logps/rejected": -274.79400634765625,
414
+ "loss": 0.2439,
415
+ "rewards/accuracies": 0.637499988079071,
416
+ "rewards/chosen": -0.1504540741443634,
417
+ "rewards/margins": 0.4203736186027527,
418
+ "rewards/rejected": -0.5708277225494385,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.76,
423
+ "learning_rate": 8.393689052217964e-08,
424
+ "logits/chosen": -0.020510882139205933,
425
+ "logits/rejected": 0.12404396384954453,
426
+ "logps/chosen": -349.6496276855469,
427
+ "logps/rejected": -272.2840576171875,
428
+ "loss": 0.2399,
429
+ "rewards/accuracies": 0.6187499761581421,
430
+ "rewards/chosen": -0.24097779393196106,
431
+ "rewards/margins": 0.28776854276657104,
432
+ "rewards/rejected": -0.5287463068962097,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.79,
437
+ "learning_rate": 6.641937264107867e-08,
438
+ "logits/chosen": 0.010314036160707474,
439
+ "logits/rejected": 0.23556776344776154,
440
+ "logps/chosen": -394.3717956542969,
441
+ "logps/rejected": -275.3983459472656,
442
+ "loss": 0.2461,
443
+ "rewards/accuracies": 0.6875,
444
+ "rewards/chosen": -0.18616211414337158,
445
+ "rewards/margins": 0.3924596905708313,
446
+ "rewards/rejected": -0.5786218047142029,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.81,
451
+ "learning_rate": 5.066983655682325e-08,
452
+ "logits/chosen": 0.11865706741809845,
453
+ "logits/rejected": 0.37171706557273865,
454
+ "logps/chosen": -410.75238037109375,
455
+ "logps/rejected": -276.05096435546875,
456
+ "loss": 0.2417,
457
+ "rewards/accuracies": 0.643750011920929,
458
+ "rewards/chosen": -0.16428914666175842,
459
+ "rewards/margins": 0.37311890721321106,
460
+ "rewards/rejected": -0.5374081134796143,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.84,
465
+ "learning_rate": 3.683995891147695e-08,
466
+ "logits/chosen": 0.17865325510501862,
467
+ "logits/rejected": 0.34302350878715515,
468
+ "logps/chosen": -415.6402282714844,
469
+ "logps/rejected": -274.9131774902344,
470
+ "loss": 0.2378,
471
+ "rewards/accuracies": 0.668749988079071,
472
+ "rewards/chosen": -0.18494005501270294,
473
+ "rewards/margins": 0.4354974627494812,
474
+ "rewards/rejected": -0.620437502861023,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.84,
479
+ "eval_logits/chosen": 0.6386339664459229,
480
+ "eval_logits/rejected": 0.7226977348327637,
481
+ "eval_logps/chosen": -380.201171875,
482
+ "eval_logps/rejected": -392.9847717285156,
483
+ "eval_loss": 0.12494668364524841,
484
+ "eval_rewards/accuracies": 0.5390625,
485
+ "eval_rewards/chosen": -0.20749951899051666,
486
+ "eval_rewards/margins": 0.0652545690536499,
487
+ "eval_rewards/rejected": -0.27275407314300537,
488
+ "eval_runtime": 72.8503,
489
+ "eval_samples_per_second": 27.454,
490
+ "eval_steps_per_second": 0.439,
491
  "step": 300
492
  },
493
  {
494
+ "epoch": 0.87,
495
+ "learning_rate": 2.5062928986944676e-08,
496
+ "logits/chosen": 0.16634011268615723,
497
+ "logits/rejected": 0.354255348443985,
498
+ "logps/chosen": -353.1306457519531,
499
+ "logps/rejected": -266.2892761230469,
500
+ "loss": 0.237,
501
+ "rewards/accuracies": 0.637499988079071,
502
+ "rewards/chosen": -0.22243313491344452,
503
+ "rewards/margins": 0.31714385747909546,
504
+ "rewards/rejected": -0.5395770072937012,
505
  "step": 310
506
  },
507
  {
508
+ "epoch": 0.9,
509
+ "learning_rate": 1.5452166019378987e-08,
510
+ "logits/chosen": 0.2329043447971344,
511
+ "logits/rejected": 0.4254538416862488,
512
+ "logps/chosen": -371.19134521484375,
513
+ "logps/rejected": -265.32318115234375,
514
+ "loss": 0.2319,
515
+ "rewards/accuracies": 0.668749988079071,
516
+ "rewards/chosen": -0.2588824927806854,
517
+ "rewards/margins": 0.35255807638168335,
518
+ "rewards/rejected": -0.6114405989646912,
519
  "step": 320
520
  },
521
  {
522
+ "epoch": 0.93,
523
+ "learning_rate": 8.100226909935059e-09,
524
+ "logits/chosen": 0.1528688371181488,
525
+ "logits/rejected": 0.36090391874313354,
526
+ "logps/chosen": -388.44879150390625,
527
+ "logps/rejected": -276.200439453125,
528
+ "loss": 0.2326,
529
+ "rewards/accuracies": 0.5625,
530
+ "rewards/chosen": -0.30423063039779663,
531
+ "rewards/margins": 0.30501413345336914,
532
+ "rewards/rejected": -0.6092447638511658,
533
  "step": 330
534
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  {
536
  "epoch": 0.96,
537
+ "learning_rate": 3.077914851215585e-09,
538
+ "logits/chosen": 0.19943758845329285,
539
+ "logits/rejected": 0.3966359496116638,
540
+ "logps/chosen": -362.4263916015625,
541
+ "logps/rejected": -263.9919738769531,
542
+ "loss": 0.2411,
543
+ "rewards/accuracies": 0.643750011920929,
544
+ "rewards/chosen": -0.3241748511791229,
545
+ "rewards/margins": 0.311737060546875,
546
+ "rewards/rejected": -0.6359119415283203,
547
+ "step": 340
548
  },
549
  {
550
  "epoch": 0.98,
551
+ "learning_rate": 4.3359745382104405e-10,
552
+ "logits/chosen": 0.1615952104330063,
553
+ "logits/rejected": 0.29963111877441406,
554
+ "logps/chosen": -399.9383544921875,
555
+ "logps/rejected": -277.37005615234375,
556
+ "loss": 0.2471,
557
+ "rewards/accuracies": 0.6875,
558
+ "rewards/chosen": -0.17405284941196442,
559
+ "rewards/margins": 0.4160131514072418,
560
+ "rewards/rejected": -0.5900660753250122,
561
+ "step": 350
562
  },
563
  {
564
  "epoch": 1.0,
565
+ "step": 356,
566
  "total_flos": 0.0,
567
+ "train_loss": 0.25543520273117537,
568
+ "train_runtime": 3431.8585,
569
+ "train_samples_per_second": 13.272,
570
+ "train_steps_per_second": 0.104
571
  }
572
  ],
573
  "logging_steps": 10,
574
+ "max_steps": 356,
575
  "num_train_epochs": 1,
576
  "save_steps": 1000,
577
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:244710b622fa4597e251d9d5432f6e641819c004ec5cdd6bd2c0a68718e30f4c
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f15dbfe379cf2e86bb1ff71cd78246124e71c6fe8e2b96ac4b71bb7fc947e1
3
  size 5944