wzhouad committed on
Commit
a773a48
1 Parent(s): e8d5098

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0875
21
- - Rewards/chosen: -0.7669
22
- - Rewards/rejected: -0.9675
23
- - Rewards/accuracies: 0.6289
24
- - Rewards/margins: 0.2006
25
- - Logps/rejected: -354.1068
26
- - Logps/chosen: -333.7293
27
- - Logits/rejected: -2.4179
28
- - Logits/chosen: -2.4373
29
 
30
  ## Model description
31
 
@@ -47,7 +47,7 @@ The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
- - seed: 42
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
@@ -62,9 +62,10 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.3429 | 0.32 | 100 | 0.1483 | -0.1942 | -0.2917 | 0.6289 | 0.0975 | -286.5215 | -276.4608 | -2.6441 | -2.6638 |
66
- | 0.2067 | 0.65 | 200 | 0.0895 | -0.7273 | -0.9051 | 0.6133 | 0.1778 | -347.8626 | -329.7697 | -2.4372 | -2.4553 |
67
- | 0.1851 | 0.97 | 300 | 0.0875 | -0.7669 | -0.9675 | 0.6289 | 0.2006 | -354.1068 | -333.7293 | -2.4179 | -2.4373 |
 
68
 
69
 
70
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.5034
21
+ - Rewards/chosen: -1.3101
22
+ - Rewards/rejected: -2.2670
23
+ - Rewards/accuracies: 0.7695
24
+ - Rewards/margins: 0.9569
25
+ - Logps/rejected: -484.0533
26
+ - Logps/chosen: -388.0500
27
+ - Logits/rejected: -1.9827
28
+ - Logits/chosen: -2.0268
29
 
30
  ## Model description
31
 
 
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
+ - seed: 1
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.5691 | 0.21 | 100 | 0.5829 | -0.6557 | -1.1886 | 0.7422 | 0.5328 | -376.2088 | -322.6110 | -2.7021 | -2.7191 |
66
+ | 0.5446 | 0.42 | 200 | 0.5301 | -0.8102 | -1.6275 | 0.7812 | 0.8173 | -420.1078 | -338.0599 | -2.2434 | -2.2738 |
67
+ | 0.5094 | 0.63 | 300 | 0.5146 | -1.3749 | -2.3136 | 0.7656 | 0.9387 | -488.7169 | -394.5290 | -1.9920 | -2.0372 |
68
+ | 0.5086 | 0.84 | 400 | 0.5034 | -1.3101 | -2.2670 | 0.7695 | 0.9569 | -484.0533 | -388.0500 | -1.9827 | -2.0268 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.2655397044030594,
4
- "train_runtime": 2864.9962,
5
- "train_samples": 39494,
6
- "train_samples_per_second": 13.785,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.5420855548092511,
4
+ "train_runtime": 4282.9885,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 14.274,
7
+ "train_steps_per_second": 0.112
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcd753fc1b3cf29c04f7a674c2ec092a91716b4c22e452c5f027fb245c79f5ce
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397d100ccddd358af9a8d16ed6fc52f33294fea16ee999e84b74fda803d3787a
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcea66ccd2e98117624f0f98598295d13220f2760e1fd132f969f01d1edf7717
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6776d245e34850a05a76e77d2a0d275e9cb2bd8d5aaefe0d413e646d1c7e6e3
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3c1fa11d3e0106f2208b93d18dcf869985d3364dc0850b84911c6f8508db12f
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06c461e2e9d9c709bb1ed2d99059d8eaf0b3e868ccbb4e05dd6661287f916a45
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.2655397044030594,
4
- "train_runtime": 2864.9962,
5
- "train_samples": 39494,
6
- "train_samples_per_second": 13.785,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.5420855548092511,
4
+ "train_runtime": 4282.9885,
5
+ "train_samples": 61134,
6
+ "train_samples_per_second": 14.274,
7
+ "train_steps_per_second": 0.112
8
  }
trainer_state.json CHANGED
@@ -3,19 +3,19 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 309,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.6129032258064514e-08,
14
- "logits/chosen": -2.8823509216308594,
15
- "logits/rejected": -2.8926596641540527,
16
- "logps/chosen": -112.50870513916016,
17
- "logps/rejected": -106.02118682861328,
18
- "loss": 0.4154,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,485 +23,739 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.03,
27
- "learning_rate": 1.6129032258064515e-07,
28
- "logits/chosen": -2.8375186920166016,
29
- "logits/rejected": -2.835359573364258,
30
- "logps/chosen": -103.35088348388672,
31
- "logps/rejected": -103.50007629394531,
32
- "loss": 0.426,
33
- "rewards/accuracies": 0.4861111044883728,
34
- "rewards/chosen": 6.0574482631636783e-05,
35
- "rewards/margins": 0.00032535579521209,
36
- "rewards/rejected": -0.000264781410805881,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.06,
41
- "learning_rate": 3.225806451612903e-07,
42
- "logits/chosen": -2.8416688442230225,
43
- "logits/rejected": -2.8429081439971924,
44
- "logps/chosen": -106.20645904541016,
45
- "logps/rejected": -104.94671630859375,
46
- "loss": 0.4285,
47
- "rewards/accuracies": 0.518750011920929,
48
- "rewards/chosen": -2.488676727807615e-05,
49
- "rewards/margins": 0.00035443849628791213,
50
- "rewards/rejected": -0.00037932529812678695,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.1,
55
- "learning_rate": 4.838709677419355e-07,
56
- "logits/chosen": -2.8320679664611816,
57
- "logits/rejected": -2.811432361602783,
58
- "logps/chosen": -103.60546875,
59
- "logps/rejected": -101.54490661621094,
60
- "loss": 0.4219,
61
- "rewards/accuracies": 0.5562499761581421,
62
- "rewards/chosen": 0.00409271102398634,
63
- "rewards/margins": 0.001978711923584342,
64
- "rewards/rejected": 0.0021139997988939285,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.13,
69
- "learning_rate": 4.987080943856886e-07,
70
- "logits/chosen": -2.8202016353607178,
71
- "logits/rejected": -2.820786952972412,
72
- "logps/chosen": -100.2019271850586,
73
- "logps/rejected": -97.82835388183594,
74
- "loss": 0.4326,
75
- "rewards/accuracies": 0.637499988079071,
76
- "rewards/chosen": 0.009574097581207752,
77
- "rewards/margins": 0.002027861075475812,
78
- "rewards/rejected": 0.007546235807240009,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.16,
83
- "learning_rate": 4.942593872763566e-07,
84
- "logits/chosen": -2.753264904022217,
85
- "logits/rejected": -2.731250047683716,
86
- "logps/chosen": -98.43879699707031,
87
- "logps/rejected": -94.35160827636719,
88
- "loss": 0.4454,
89
- "rewards/accuracies": 0.637499988079071,
90
- "rewards/chosen": 0.04011265188455582,
91
- "rewards/margins": 0.011035969480872154,
92
- "rewards/rejected": 0.02907668612897396,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.19,
97
- "learning_rate": 4.866946677079314e-07,
98
- "logits/chosen": -2.7994751930236816,
99
- "logits/rejected": -2.798597812652588,
100
- "logps/chosen": -92.7613525390625,
101
- "logps/rejected": -94.10163116455078,
102
- "loss": 0.4371,
103
- "rewards/accuracies": 0.5562499761581421,
104
- "rewards/chosen": 0.0348241850733757,
105
- "rewards/margins": 0.01460187416523695,
106
- "rewards/rejected": 0.020222308114171028,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.23,
111
- "learning_rate": 4.7611043866720737e-07,
112
- "logits/chosen": -2.7760140895843506,
113
- "logits/rejected": -2.764796733856201,
114
- "logps/chosen": -112.47688293457031,
115
- "logps/rejected": -109.36808013916016,
116
- "loss": 0.3733,
117
- "rewards/accuracies": 0.59375,
118
- "rewards/chosen": -0.03835242614150047,
119
- "rewards/margins": 0.042351335287094116,
120
- "rewards/rejected": -0.08070375770330429,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.26,
125
- "learning_rate": 4.6264172296714e-07,
126
- "logits/chosen": -2.7709617614746094,
127
- "logits/rejected": -2.7712032794952393,
128
- "logps/chosen": -97.8966293334961,
129
- "logps/rejected": -105.83245849609375,
130
- "loss": 0.3447,
131
- "rewards/accuracies": 0.6312500238418579,
132
- "rewards/chosen": -0.09152723848819733,
133
- "rewards/margins": 0.052637260407209396,
134
- "rewards/rejected": -0.14416451752185822,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.29,
139
- "learning_rate": 4.4646034076333254e-07,
140
- "logits/chosen": -2.7489993572235107,
141
- "logits/rejected": -2.7465405464172363,
142
- "logps/chosen": -113.80158996582031,
143
- "logps/rejected": -123.46051025390625,
144
- "loss": 0.2815,
145
- "rewards/accuracies": 0.643750011920929,
146
- "rewards/chosen": -0.24712736904621124,
147
- "rewards/margins": 0.05174848437309265,
148
- "rewards/rejected": -0.2988758385181427,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.32,
153
- "learning_rate": 4.27772717647508e-07,
154
- "logits/chosen": -2.678942918777466,
155
- "logits/rejected": -2.6756670475006104,
156
- "logps/chosen": -118.59769439697266,
157
- "logps/rejected": -126.5873794555664,
158
- "loss": 0.3429,
159
- "rewards/accuracies": 0.606249988079071,
160
- "rewards/chosen": -0.12435013055801392,
161
- "rewards/margins": 0.07720569521188736,
162
- "rewards/rejected": -0.20155580341815948,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.32,
167
- "eval_logits/chosen": -2.6638121604919434,
168
- "eval_logits/rejected": -2.644099712371826,
169
- "eval_logps/chosen": -276.46075439453125,
170
- "eval_logps/rejected": -286.5215148925781,
171
- "eval_loss": 0.14825774729251862,
172
- "eval_rewards/accuracies": 0.62890625,
173
- "eval_rewards/chosen": -0.19421111047267914,
174
- "eval_rewards/margins": 0.09747137129306793,
175
- "eval_rewards/rejected": -0.29168248176574707,
176
- "eval_runtime": 53.6891,
177
- "eval_samples_per_second": 37.252,
178
- "eval_steps_per_second": 0.596,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.36,
183
- "learning_rate": 4.068172512800759e-07,
184
- "logits/chosen": -2.6089630126953125,
185
- "logits/rejected": -2.6102888584136963,
186
- "logps/chosen": -125.50984954833984,
187
- "logps/rejected": -136.15036010742188,
188
- "loss": 0.2964,
189
- "rewards/accuracies": 0.6312500238418579,
190
- "rewards/chosen": -0.19683870673179626,
191
- "rewards/margins": 0.09532684832811356,
192
- "rewards/rejected": -0.2921655774116516,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.39,
197
- "learning_rate": 3.8386127015561377e-07,
198
- "logits/chosen": -2.5708675384521484,
199
- "logits/rejected": -2.5645718574523926,
200
- "logps/chosen": -143.8579864501953,
201
- "logps/rejected": -158.998779296875,
202
- "loss": 0.2323,
203
- "rewards/accuracies": 0.668749988079071,
204
- "rewards/chosen": -0.45791369676589966,
205
- "rewards/margins": 0.12861952185630798,
206
- "rewards/rejected": -0.5865331888198853,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.42,
211
- "learning_rate": 3.591976232982355e-07,
212
- "logits/chosen": -2.615297317504883,
213
- "logits/rejected": -2.602008581161499,
214
- "logps/chosen": -133.473876953125,
215
- "logps/rejected": -142.97463989257812,
216
- "loss": 0.2649,
217
- "rewards/accuracies": 0.6312500238418579,
218
- "rewards/chosen": -0.3440694510936737,
219
- "rewards/margins": 0.09472953528165817,
220
- "rewards/rejected": -0.43879905343055725,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.45,
225
- "learning_rate": 3.33140944392039e-07,
226
- "logits/chosen": -2.612290859222412,
227
- "logits/rejected": -2.5939009189605713,
228
- "logps/chosen": -148.60714721679688,
229
- "logps/rejected": -159.57359313964844,
230
- "loss": 0.2725,
231
- "rewards/accuracies": 0.6812499761581421,
232
- "rewards/chosen": -0.37939244508743286,
233
- "rewards/margins": 0.13712210953235626,
234
- "rewards/rejected": -0.5165144801139832,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.49,
239
- "learning_rate": 3.060236380050519e-07,
240
- "logits/chosen": -2.568190813064575,
241
- "logits/rejected": -2.5750067234039307,
242
- "logps/chosen": -156.5549774169922,
243
- "logps/rejected": -169.692626953125,
244
- "loss": 0.2278,
245
- "rewards/accuracies": 0.6312500238418579,
246
- "rewards/chosen": -0.4971524178981781,
247
- "rewards/margins": 0.09818680584430695,
248
- "rewards/rejected": -0.5953391790390015,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.52,
253
- "learning_rate": 2.781916391103417e-07,
254
- "logits/chosen": -2.485995292663574,
255
- "logits/rejected": -2.49863862991333,
256
- "logps/chosen": -169.91744995117188,
257
- "logps/rejected": -183.74435424804688,
258
- "loss": 0.1946,
259
- "rewards/accuracies": 0.6187499761581421,
260
- "rewards/chosen": -0.657067060470581,
261
- "rewards/margins": 0.120273157954216,
262
- "rewards/rejected": -0.7773402333259583,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.55,
267
- "learning_rate": 2.5e-07,
268
- "logits/chosen": -2.510629415512085,
269
- "logits/rejected": -2.494657039642334,
270
- "logps/chosen": -173.4995574951172,
271
- "logps/rejected": -180.40773010253906,
272
- "loss": 0.1858,
273
- "rewards/accuracies": 0.5687500238418579,
274
- "rewards/chosen": -0.738297164440155,
275
- "rewards/margins": 0.08405095338821411,
276
- "rewards/rejected": -0.8223482370376587,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.58,
281
- "learning_rate": 2.218083608896583e-07,
282
- "logits/chosen": -2.4216017723083496,
283
- "logits/rejected": -2.4322915077209473,
284
- "logps/chosen": -173.25381469726562,
285
- "logps/rejected": -190.62799072265625,
286
- "loss": 0.1881,
287
- "rewards/accuracies": 0.6000000238418579,
288
- "rewards/chosen": -0.7596505880355835,
289
- "rewards/margins": 0.11408589035272598,
290
- "rewards/rejected": -0.8737365007400513,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.61,
295
- "learning_rate": 1.9397636199494806e-07,
296
- "logits/chosen": -2.4973485469818115,
297
- "logits/rejected": -2.486934185028076,
298
- "logps/chosen": -163.3054656982422,
299
- "logps/rejected": -177.75137329101562,
300
- "loss": 0.202,
301
- "rewards/accuracies": 0.6312500238418579,
302
- "rewards/chosen": -0.6502314805984497,
303
- "rewards/margins": 0.140285924077034,
304
- "rewards/rejected": -0.7905173301696777,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.65,
309
- "learning_rate": 1.6685905560796098e-07,
310
- "logits/chosen": -2.4677557945251465,
311
- "logits/rejected": -2.445578098297119,
312
- "logps/chosen": -172.42599487304688,
313
- "logps/rejected": -180.99696350097656,
314
- "loss": 0.2067,
315
- "rewards/accuracies": 0.6499999761581421,
316
- "rewards/chosen": -0.6207637786865234,
317
- "rewards/margins": 0.15290972590446472,
318
- "rewards/rejected": -0.7736736536026001,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.65,
323
- "eval_logits/chosen": -2.45526123046875,
324
- "eval_logits/rejected": -2.4372146129608154,
325
- "eval_logps/chosen": -329.7696533203125,
326
- "eval_logps/rejected": -347.8625793457031,
327
- "eval_loss": 0.08947259187698364,
328
- "eval_rewards/accuracies": 0.61328125,
329
- "eval_rewards/chosen": -0.7273001074790955,
330
- "eval_rewards/margins": 0.17779292166233063,
331
- "eval_rewards/rejected": -0.9050930142402649,
332
- "eval_runtime": 53.134,
333
- "eval_samples_per_second": 37.641,
334
- "eval_steps_per_second": 0.602,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.68,
339
- "learning_rate": 1.4080237670176453e-07,
340
- "logits/chosen": -2.4492480754852295,
341
- "logits/rejected": -2.460223436355591,
342
- "logps/chosen": -186.71517944335938,
343
- "logps/rejected": -200.7089080810547,
344
- "loss": 0.1854,
345
- "rewards/accuracies": 0.606249988079071,
346
- "rewards/chosen": -0.7622874975204468,
347
- "rewards/margins": 0.10540244728326797,
348
- "rewards/rejected": -0.8676899075508118,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.71,
353
- "learning_rate": 1.1613872984438628e-07,
354
- "logits/chosen": -2.4568495750427246,
355
- "logits/rejected": -2.4479267597198486,
356
- "logps/chosen": -181.4917449951172,
357
- "logps/rejected": -190.90139770507812,
358
- "loss": 0.1739,
359
- "rewards/accuracies": 0.6187499761581421,
360
- "rewards/chosen": -0.7812420129776001,
361
- "rewards/margins": 0.09995730221271515,
362
- "rewards/rejected": -0.8811992406845093,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.74,
367
- "learning_rate": 9.318274871992407e-08,
368
- "logits/chosen": -2.472447156906128,
369
- "logits/rejected": -2.4776101112365723,
370
- "logps/chosen": -181.24545288085938,
371
- "logps/rejected": -197.37155151367188,
372
- "loss": 0.1752,
373
- "rewards/accuracies": 0.6187499761581421,
374
- "rewards/chosen": -0.8337165117263794,
375
- "rewards/margins": 0.14165589213371277,
376
- "rewards/rejected": -0.975372314453125,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.78,
381
- "learning_rate": 7.222728235249195e-08,
382
- "logits/chosen": -2.4492805004119873,
383
- "logits/rejected": -2.4301629066467285,
384
- "logps/chosen": -186.33277893066406,
385
- "logps/rejected": -204.1112823486328,
386
- "loss": 0.176,
387
- "rewards/accuracies": 0.6187499761581421,
388
- "rewards/chosen": -0.7405187487602234,
389
- "rewards/margins": 0.15510497987270355,
390
- "rewards/rejected": -0.8956238031387329,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.81,
395
- "learning_rate": 5.353965923666742e-08,
396
- "logits/chosen": -2.477843761444092,
397
- "logits/rejected": -2.4541637897491455,
398
- "logps/chosen": -188.32887268066406,
399
- "logps/rejected": -197.6189422607422,
400
- "loss": 0.1868,
401
- "rewards/accuracies": 0.581250011920929,
402
- "rewards/chosen": -0.735795795917511,
403
- "rewards/margins": 0.1740962564945221,
404
- "rewards/rejected": -0.9098919630050659,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.84,
409
- "learning_rate": 3.7358277032860016e-08,
410
- "logits/chosen": -2.4209158420562744,
411
- "logits/rejected": -2.417498826980591,
412
- "logps/chosen": -164.65744018554688,
413
- "logps/rejected": -177.77316284179688,
414
- "loss": 0.1832,
415
- "rewards/accuracies": 0.5687500238418579,
416
- "rewards/chosen": -0.7374037504196167,
417
- "rewards/margins": 0.09586720168590546,
418
- "rewards/rejected": -0.8332709074020386,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.87,
423
- "learning_rate": 2.3889561332792657e-08,
424
- "logits/chosen": -2.454702854156494,
425
- "logits/rejected": -2.43430757522583,
426
- "logps/chosen": -160.90245056152344,
427
- "logps/rejected": -190.77847290039062,
428
- "loss": 0.1892,
429
- "rewards/accuracies": 0.65625,
430
- "rewards/chosen": -0.7071911692619324,
431
- "rewards/margins": 0.18826141953468323,
432
- "rewards/rejected": -0.895452618598938,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.91,
437
- "learning_rate": 1.3305332292068705e-08,
438
- "logits/chosen": -2.4471964836120605,
439
- "logits/rejected": -2.4374232292175293,
440
- "logps/chosen": -172.14927673339844,
441
- "logps/rejected": -189.0311279296875,
442
- "loss": 0.1906,
443
- "rewards/accuracies": 0.637499988079071,
444
- "rewards/chosen": -0.7235010862350464,
445
- "rewards/margins": 0.162649005651474,
446
- "rewards/rejected": -0.8861500024795532,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.94,
451
- "learning_rate": 5.740612723643401e-09,
452
- "logits/chosen": -2.4600260257720947,
453
- "logits/rejected": -2.450124979019165,
454
- "logps/chosen": -174.05865478515625,
455
- "logps/rejected": -192.48147583007812,
456
- "loss": 0.1919,
457
- "rewards/accuracies": 0.637499988079071,
458
- "rewards/chosen": -0.7400654554367065,
459
- "rewards/margins": 0.17650790512561798,
460
- "rewards/rejected": -0.9165734052658081,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.97,
465
- "learning_rate": 1.2919056143113061e-09,
466
- "logits/chosen": -2.455430030822754,
467
- "logits/rejected": -2.4445955753326416,
468
- "logps/chosen": -182.17767333984375,
469
- "logps/rejected": -200.54336547851562,
470
- "loss": 0.1851,
471
- "rewards/accuracies": 0.625,
472
- "rewards/chosen": -0.7133737802505493,
473
- "rewards/margins": 0.18328514695167542,
474
- "rewards/rejected": -0.8966588973999023,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.97,
479
- "eval_logits/chosen": -2.4372708797454834,
480
- "eval_logits/rejected": -2.4179446697235107,
481
- "eval_logps/chosen": -333.72930908203125,
482
- "eval_logps/rejected": -354.1068420410156,
483
- "eval_loss": 0.08753985911607742,
484
- "eval_rewards/accuracies": 0.62890625,
485
- "eval_rewards/chosen": -0.7668967247009277,
486
- "eval_rewards/margins": 0.20063942670822144,
487
- "eval_rewards/rejected": -0.9675361514091492,
488
- "eval_runtime": 53.1055,
489
- "eval_samples_per_second": 37.661,
490
  "eval_steps_per_second": 0.603,
491
  "step": 300
492
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  {
494
  "epoch": 1.0,
495
- "step": 309,
496
  "total_flos": 0.0,
497
- "train_loss": 0.2655397044030594,
498
- "train_runtime": 2864.9962,
499
- "train_samples_per_second": 13.785,
500
- "train_steps_per_second": 0.108
501
  }
502
  ],
503
  "logging_steps": 10,
504
- "max_steps": 309,
505
  "num_train_epochs": 1,
506
  "save_steps": 100,
507
  "total_flos": 0.0,
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 478,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.0416666666666666e-08,
14
+ "logits/chosen": -2.847970962524414,
15
+ "logits/rejected": -2.79160213470459,
16
+ "logps/chosen": -284.9612731933594,
17
+ "logps/rejected": -276.45928955078125,
18
+ "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.02,
27
+ "learning_rate": 1.0416666666666667e-07,
28
+ "logits/chosen": -2.754901647567749,
29
+ "logits/rejected": -2.7529661655426025,
30
+ "logps/chosen": -249.956298828125,
31
+ "logps/rejected": -223.05245971679688,
32
+ "loss": 0.6931,
33
+ "rewards/accuracies": 0.3958333432674408,
34
+ "rewards/chosen": -8.542059367755428e-05,
35
+ "rewards/margins": -4.0294162317877635e-05,
36
+ "rewards/rejected": -4.512643499765545e-05,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.04,
41
+ "learning_rate": 2.0833333333333333e-07,
42
+ "logits/chosen": -2.7449066638946533,
43
+ "logits/rejected": -2.745481014251709,
44
+ "logps/chosen": -257.4268493652344,
45
+ "logps/rejected": -247.520751953125,
46
+ "loss": 0.6925,
47
+ "rewards/accuracies": 0.4937500059604645,
48
+ "rewards/chosen": 0.00028673160704784095,
49
+ "rewards/margins": 0.0011877163778990507,
50
+ "rewards/rejected": -0.0009009848581627011,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": -2.8009085655212402,
57
+ "logits/rejected": -2.7534918785095215,
58
+ "logps/chosen": -300.4103088378906,
59
+ "logps/rejected": -261.89532470703125,
60
+ "loss": 0.6882,
61
+ "rewards/accuracies": 0.71875,
62
+ "rewards/chosen": 0.0016673363279551268,
63
+ "rewards/margins": 0.009702490642666817,
64
+ "rewards/rejected": -0.008035155013203621,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.08,
69
+ "learning_rate": 4.1666666666666667e-07,
70
+ "logits/chosen": -2.7635364532470703,
71
+ "logits/rejected": -2.751422882080078,
72
+ "logps/chosen": -256.6298522949219,
73
+ "logps/rejected": -274.86297607421875,
74
+ "loss": 0.6805,
75
+ "rewards/accuracies": 0.6875,
76
+ "rewards/chosen": -0.0019601243548095226,
77
+ "rewards/margins": 0.025836413726210594,
78
+ "rewards/rejected": -0.027796542271971703,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.1,
83
+ "learning_rate": 4.999733114418725e-07,
84
+ "logits/chosen": -2.7672626972198486,
85
+ "logits/rejected": -2.7396867275238037,
86
+ "logps/chosen": -284.4268798828125,
87
+ "logps/rejected": -256.52667236328125,
88
+ "loss": 0.6675,
89
+ "rewards/accuracies": 0.675000011920929,
90
+ "rewards/chosen": -0.023474793881177902,
91
+ "rewards/margins": 0.06475953012704849,
92
+ "rewards/rejected": -0.0882343202829361,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.13,
97
+ "learning_rate": 4.990398100856366e-07,
98
+ "logits/chosen": -2.7358150482177734,
99
+ "logits/rejected": -2.724313259124756,
100
+ "logps/chosen": -281.9308166503906,
101
+ "logps/rejected": -256.6224670410156,
102
+ "loss": 0.6443,
103
+ "rewards/accuracies": 0.6875,
104
+ "rewards/chosen": -0.060463108122348785,
105
+ "rewards/margins": 0.1052827388048172,
106
+ "rewards/rejected": -0.1657458394765854,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.15,
111
+ "learning_rate": 4.967775735898179e-07,
112
+ "logits/chosen": -2.781935453414917,
113
+ "logits/rejected": -2.739537000656128,
114
+ "logps/chosen": -291.1555480957031,
115
+ "logps/rejected": -273.9505920410156,
116
+ "loss": 0.6246,
117
+ "rewards/accuracies": 0.6812499761581421,
118
+ "rewards/chosen": -0.24020154774188995,
119
+ "rewards/margins": 0.17989788949489594,
120
+ "rewards/rejected": -0.4200994074344635,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.17,
125
+ "learning_rate": 4.931986719649298e-07,
126
+ "logits/chosen": -2.782163143157959,
127
+ "logits/rejected": -2.7544727325439453,
128
+ "logps/chosen": -290.7063903808594,
129
+ "logps/rejected": -333.33160400390625,
130
+ "loss": 0.5953,
131
+ "rewards/accuracies": 0.6875,
132
+ "rewards/chosen": -0.30353400111198425,
133
+ "rewards/margins": 0.3068069517612457,
134
+ "rewards/rejected": -0.61034095287323,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.19,
139
+ "learning_rate": 4.883222001996351e-07,
140
+ "logits/chosen": -2.8103935718536377,
141
+ "logits/rejected": -2.7860381603240967,
142
+ "logps/chosen": -309.4369201660156,
143
+ "logps/rejected": -328.04937744140625,
144
+ "loss": 0.5871,
145
+ "rewards/accuracies": 0.762499988079071,
146
+ "rewards/chosen": -0.34070074558258057,
147
+ "rewards/margins": 0.4278062880039215,
148
+ "rewards/rejected": -0.7685070037841797,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.21,
153
+ "learning_rate": 4.821741763807186e-07,
154
+ "logits/chosen": -2.775650978088379,
155
+ "logits/rejected": -2.742344379425049,
156
+ "logps/chosen": -354.2271423339844,
157
+ "logps/rejected": -372.828369140625,
158
+ "loss": 0.5691,
159
+ "rewards/accuracies": 0.7250000238418579,
160
+ "rewards/chosen": -0.6526215672492981,
161
+ "rewards/margins": 0.4535134732723236,
162
+ "rewards/rejected": -1.1061351299285889,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.21,
167
+ "eval_logits/chosen": -2.7190756797790527,
168
+ "eval_logits/rejected": -2.702101707458496,
169
+ "eval_logps/chosen": -322.6109924316406,
170
+ "eval_logps/rejected": -376.20880126953125,
171
+ "eval_loss": 0.5829024910926819,
172
+ "eval_rewards/accuracies": 0.7421875,
173
+ "eval_rewards/chosen": -0.6557134985923767,
174
+ "eval_rewards/margins": 0.5328419208526611,
175
+ "eval_rewards/rejected": -1.188555359840393,
176
+ "eval_runtime": 53.0851,
177
+ "eval_samples_per_second": 37.675,
178
+ "eval_steps_per_second": 0.603,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.23,
183
+ "learning_rate": 4.747874028753375e-07,
184
+ "logits/chosen": -2.667227268218994,
185
+ "logits/rejected": -2.6603758335113525,
186
+ "logps/chosen": -321.42108154296875,
187
+ "logps/rejected": -396.7526550292969,
188
+ "loss": 0.5384,
189
+ "rewards/accuracies": 0.78125,
190
+ "rewards/chosen": -0.5715780258178711,
191
+ "rewards/margins": 0.6688358187675476,
192
+ "rewards/rejected": -1.2404139041900635,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.25,
197
+ "learning_rate": 4.662012913161997e-07,
198
+ "logits/chosen": -2.622821807861328,
199
+ "logits/rejected": -2.583700656890869,
200
+ "logps/chosen": -340.69219970703125,
201
+ "logps/rejected": -375.4017333984375,
202
+ "loss": 0.5579,
203
+ "rewards/accuracies": 0.7437499761581421,
204
+ "rewards/chosen": -0.7315243482589722,
205
+ "rewards/margins": 0.5486994981765747,
206
+ "rewards/rejected": -1.2802238464355469,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.27,
211
+ "learning_rate": 4.5646165232345103e-07,
212
+ "logits/chosen": -2.5822339057922363,
213
+ "logits/rejected": -2.547309398651123,
214
+ "logps/chosen": -359.7410583496094,
215
+ "logps/rejected": -351.17999267578125,
216
+ "loss": 0.5523,
217
+ "rewards/accuracies": 0.706250011920929,
218
+ "rewards/chosen": -0.6760958433151245,
219
+ "rewards/margins": 0.4332718849182129,
220
+ "rewards/rejected": -1.1093676090240479,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.29,
225
+ "learning_rate": 4.456204510851956e-07,
226
+ "logits/chosen": -2.458064079284668,
227
+ "logits/rejected": -2.434985637664795,
228
+ "logps/chosen": -344.94622802734375,
229
+ "logps/rejected": -373.15277099609375,
230
+ "loss": 0.5431,
231
+ "rewards/accuracies": 0.65625,
232
+ "rewards/chosen": -0.9658713340759277,
233
+ "rewards/margins": 0.568038821220398,
234
+ "rewards/rejected": -1.5339101552963257,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.31,
239
+ "learning_rate": 4.337355301007335e-07,
240
+ "logits/chosen": -2.430382490158081,
241
+ "logits/rejected": -2.411181926727295,
242
+ "logps/chosen": -362.24664306640625,
243
+ "logps/rejected": -394.7173767089844,
244
+ "loss": 0.541,
245
+ "rewards/accuracies": 0.737500011920929,
246
+ "rewards/chosen": -0.970133900642395,
247
+ "rewards/margins": 0.5773912668228149,
248
+ "rewards/rejected": -1.5475252866744995,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.33,
253
+ "learning_rate": 4.2087030056579986e-07,
254
+ "logits/chosen": -2.3705012798309326,
255
+ "logits/rejected": -2.3451476097106934,
256
+ "logps/chosen": -340.9483947753906,
257
+ "logps/rejected": -381.2392883300781,
258
+ "loss": 0.5488,
259
+ "rewards/accuracies": 0.731249988079071,
260
+ "rewards/chosen": -0.7996856570243835,
261
+ "rewards/margins": 0.6973718404769897,
262
+ "rewards/rejected": -1.497057557106018,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.36,
267
+ "learning_rate": 4.070934040463998e-07,
268
+ "logits/chosen": -2.298063278198242,
269
+ "logits/rejected": -2.2643802165985107,
270
+ "logps/chosen": -356.18292236328125,
271
+ "logps/rejected": -401.3460998535156,
272
+ "loss": 0.5395,
273
+ "rewards/accuracies": 0.75,
274
+ "rewards/chosen": -0.8752641677856445,
275
+ "rewards/margins": 0.6319175958633423,
276
+ "rewards/rejected": -1.5071817636489868,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.38,
281
+ "learning_rate": 3.9247834624635404e-07,
282
+ "logits/chosen": -2.3489673137664795,
283
+ "logits/rejected": -2.294405937194824,
284
+ "logps/chosen": -366.259765625,
285
+ "logps/rejected": -413.059326171875,
286
+ "loss": 0.5228,
287
+ "rewards/accuracies": 0.75,
288
+ "rewards/chosen": -0.8981040716171265,
289
+ "rewards/margins": 0.7530413866043091,
290
+ "rewards/rejected": -1.651145339012146,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.4,
295
+ "learning_rate": 3.7710310482256523e-07,
296
+ "logits/chosen": -2.22472882270813,
297
+ "logits/rejected": -2.1942319869995117,
298
+ "logps/chosen": -390.96893310546875,
299
+ "logps/rejected": -435.68634033203125,
300
+ "loss": 0.5221,
301
+ "rewards/accuracies": 0.7250000238418579,
302
+ "rewards/chosen": -1.3375661373138428,
303
+ "rewards/margins": 0.6510864496231079,
304
+ "rewards/rejected": -1.9886524677276611,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.42,
309
+ "learning_rate": 3.610497133404795e-07,
310
+ "logits/chosen": -2.330658197402954,
311
+ "logits/rejected": -2.253397226333618,
312
+ "logps/chosen": -424.68511962890625,
313
+ "logps/rejected": -460.4125061035156,
314
+ "loss": 0.5446,
315
+ "rewards/accuracies": 0.762499988079071,
316
+ "rewards/chosen": -0.9010859727859497,
317
+ "rewards/margins": 0.9040181040763855,
318
+ "rewards/rejected": -1.8051040172576904,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.42,
323
+ "eval_logits/chosen": -2.273806571960449,
324
+ "eval_logits/rejected": -2.2433524131774902,
325
+ "eval_logps/chosen": -338.0599365234375,
326
+ "eval_logps/rejected": -420.1078186035156,
327
+ "eval_loss": 0.5300609469413757,
328
+ "eval_rewards/accuracies": 0.78125,
329
+ "eval_rewards/chosen": -0.810202956199646,
330
+ "eval_rewards/margins": 0.8173429369926453,
331
+ "eval_rewards/rejected": -1.6275460720062256,
332
+ "eval_runtime": 53.0552,
333
+ "eval_samples_per_second": 37.697,
334
+ "eval_steps_per_second": 0.603,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.44,
339
+ "learning_rate": 3.4440382358952115e-07,
340
+ "logits/chosen": -2.26928448677063,
341
+ "logits/rejected": -2.201911449432373,
342
+ "logps/chosen": -353.4331970214844,
343
+ "logps/rejected": -383.96044921875,
344
+ "loss": 0.5455,
345
+ "rewards/accuracies": 0.75,
346
+ "rewards/chosen": -0.8622655868530273,
347
+ "rewards/margins": 0.5730525255203247,
348
+ "rewards/rejected": -1.4353179931640625,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.46,
353
+ "learning_rate": 3.272542485937368e-07,
354
+ "logits/chosen": -2.2439053058624268,
355
+ "logits/rejected": -2.206618070602417,
356
+ "logps/chosen": -370.7458190917969,
357
+ "logps/rejected": -391.848388671875,
358
+ "loss": 0.5253,
359
+ "rewards/accuracies": 0.7562500238418579,
360
+ "rewards/chosen": -0.7618538737297058,
361
+ "rewards/margins": 0.7462855577468872,
362
+ "rewards/rejected": -1.5081393718719482,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.48,
367
+ "learning_rate": 3.096924887558854e-07,
368
+ "logits/chosen": -2.1762518882751465,
369
+ "logits/rejected": -2.1476693153381348,
370
+ "logps/chosen": -382.38946533203125,
371
+ "logps/rejected": -465.69561767578125,
372
+ "loss": 0.5132,
373
+ "rewards/accuracies": 0.71875,
374
+ "rewards/chosen": -1.005793571472168,
375
+ "rewards/margins": 0.7425030469894409,
376
+ "rewards/rejected": -1.7482967376708984,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.5,
381
+ "learning_rate": 2.9181224366319943e-07,
382
+ "logits/chosen": -2.1192374229431152,
383
+ "logits/rejected": -2.0674259662628174,
384
+ "logps/chosen": -391.3011474609375,
385
+ "logps/rejected": -484.4254455566406,
386
+ "loss": 0.5263,
387
+ "rewards/accuracies": 0.7250000238418579,
388
+ "rewards/chosen": -1.375982642173767,
389
+ "rewards/margins": 0.8829982876777649,
390
+ "rewards/rejected": -2.2589809894561768,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.52,
395
+ "learning_rate": 2.7370891215954565e-07,
396
+ "logits/chosen": -2.1064059734344482,
397
+ "logits/rejected": -2.0222904682159424,
398
+ "logps/chosen": -397.3945007324219,
399
+ "logps/rejected": -454.42340087890625,
400
+ "loss": 0.5111,
401
+ "rewards/accuracies": 0.762499988079071,
402
+ "rewards/chosen": -1.3846924304962158,
403
+ "rewards/margins": 0.8052938580513,
404
+ "rewards/rejected": -2.18998646736145,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.54,
409
+ "learning_rate": 2.55479083351317e-07,
410
+ "logits/chosen": -2.0734519958496094,
411
+ "logits/rejected": -2.041645050048828,
412
+ "logps/chosen": -403.8518371582031,
413
+ "logps/rejected": -443.9764099121094,
414
+ "loss": 0.5362,
415
+ "rewards/accuracies": 0.6812499761581421,
416
+ "rewards/chosen": -1.352430820465088,
417
+ "rewards/margins": 0.6026407480239868,
418
+ "rewards/rejected": -1.9550716876983643,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.56,
423
+ "learning_rate": 2.3722002126275822e-07,
424
+ "logits/chosen": -2.0378193855285645,
425
+ "logits/rejected": -2.006934881210327,
426
+ "logps/chosen": -402.4918518066406,
427
+ "logps/rejected": -457.62811279296875,
428
+ "loss": 0.5152,
429
+ "rewards/accuracies": 0.7437499761581421,
430
+ "rewards/chosen": -1.2763839960098267,
431
+ "rewards/margins": 0.744287371635437,
432
+ "rewards/rejected": -2.0206713676452637,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.59,
437
+ "learning_rate": 2.19029145890313e-07,
438
+ "logits/chosen": -2.033855438232422,
439
+ "logits/rejected": -1.9725334644317627,
440
+ "logps/chosen": -366.2498474121094,
441
+ "logps/rejected": -433.2369079589844,
442
+ "loss": 0.5284,
443
+ "rewards/accuracies": 0.7749999761581421,
444
+ "rewards/chosen": -1.2484426498413086,
445
+ "rewards/margins": 0.7818558812141418,
446
+ "rewards/rejected": -2.0302984714508057,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.61,
451
+ "learning_rate": 2.0100351342479216e-07,
452
+ "logits/chosen": -2.018475294113159,
453
+ "logits/rejected": -1.949302077293396,
454
+ "logps/chosen": -367.6812438964844,
455
+ "logps/rejected": -429.4832458496094,
456
+ "loss": 0.5041,
457
+ "rewards/accuracies": 0.793749988079071,
458
+ "rewards/chosen": -1.210700273513794,
459
+ "rewards/margins": 0.7417057752609253,
460
+ "rewards/rejected": -1.9524061679840088,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.63,
465
+ "learning_rate": 1.8323929841460178e-07,
466
+ "logits/chosen": -2.01090145111084,
467
+ "logits/rejected": -1.9497419595718384,
468
+ "logps/chosen": -396.8717956542969,
469
+ "logps/rejected": -473.7056579589844,
470
+ "loss": 0.5094,
471
+ "rewards/accuracies": 0.737500011920929,
472
+ "rewards/chosen": -1.406031847000122,
473
+ "rewards/margins": 0.7575126886367798,
474
+ "rewards/rejected": -2.1635446548461914,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.63,
479
+ "eval_logits/chosen": -2.0371742248535156,
480
+ "eval_logits/rejected": -1.9920138120651245,
481
+ "eval_logps/chosen": -394.5289611816406,
482
+ "eval_logps/rejected": -488.7168884277344,
483
+ "eval_loss": 0.514569878578186,
484
+ "eval_rewards/accuracies": 0.765625,
485
+ "eval_rewards/chosen": -1.374893307685852,
486
+ "eval_rewards/margins": 0.9387427568435669,
487
+ "eval_rewards/rejected": -2.313636064529419,
488
+ "eval_runtime": 53.0256,
489
+ "eval_samples_per_second": 37.718,
490
  "eval_steps_per_second": 0.603,
491
  "step": 300
492
  },
493
+ {
494
+ "epoch": 0.65,
495
+ "learning_rate": 1.6583128063291573e-07,
496
+ "logits/chosen": -1.9743964672088623,
497
+ "logits/rejected": -1.8795156478881836,
498
+ "logps/chosen": -401.95098876953125,
499
+ "logps/rejected": -473.07586669921875,
500
+ "loss": 0.4934,
501
+ "rewards/accuracies": 0.793749988079071,
502
+ "rewards/chosen": -1.2877211570739746,
503
+ "rewards/margins": 0.9713341593742371,
504
+ "rewards/rejected": -2.2590553760528564,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.67,
509
+ "learning_rate": 1.488723393865766e-07,
510
+ "logits/chosen": -2.009753704071045,
511
+ "logits/rejected": -1.9591827392578125,
512
+ "logps/chosen": -424.99468994140625,
513
+ "logps/rejected": -447.1941833496094,
514
+ "loss": 0.5096,
515
+ "rewards/accuracies": 0.768750011920929,
516
+ "rewards/chosen": -1.286123275756836,
517
+ "rewards/margins": 0.8317530751228333,
518
+ "rewards/rejected": -2.1178765296936035,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.69,
523
+ "learning_rate": 1.3245295796480788e-07,
524
+ "logits/chosen": -2.021080493927002,
525
+ "logits/rejected": -1.9558074474334717,
526
+ "logps/chosen": -386.18670654296875,
527
+ "logps/rejected": -441.7825622558594,
528
+ "loss": 0.5108,
529
+ "rewards/accuracies": 0.75,
530
+ "rewards/chosen": -1.3397352695465088,
531
+ "rewards/margins": 0.7417815923690796,
532
+ "rewards/rejected": -2.081516742706299,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.71,
537
+ "learning_rate": 1.1666074087171627e-07,
538
+ "logits/chosen": -1.9722802639007568,
539
+ "logits/rejected": -1.9194387197494507,
540
+ "logps/chosen": -390.5426330566406,
541
+ "logps/rejected": -470.82958984375,
542
+ "loss": 0.5234,
543
+ "rewards/accuracies": 0.737500011920929,
544
+ "rewards/chosen": -1.3929929733276367,
545
+ "rewards/margins": 0.8960745930671692,
546
+ "rewards/rejected": -2.2890677452087402,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.73,
551
+ "learning_rate": 1.0157994641835734e-07,
552
+ "logits/chosen": -1.9723567962646484,
553
+ "logits/rejected": -1.9255586862564087,
554
+ "logps/chosen": -353.8846740722656,
555
+ "logps/rejected": -450.743408203125,
556
+ "loss": 0.4932,
557
+ "rewards/accuracies": 0.6812499761581421,
558
+ "rewards/chosen": -1.279344916343689,
559
+ "rewards/margins": 0.8211178779602051,
560
+ "rewards/rejected": -2.1004626750946045,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.75,
565
+ "learning_rate": 8.729103716819111e-08,
566
+ "logits/chosen": -1.9666541814804077,
567
+ "logits/rejected": -1.8845767974853516,
568
+ "logps/chosen": -398.8426818847656,
569
+ "logps/rejected": -476.284912109375,
570
+ "loss": 0.4746,
571
+ "rewards/accuracies": 0.7437499761581421,
572
+ "rewards/chosen": -1.3157447576522827,
573
+ "rewards/margins": 1.0537182092666626,
574
+ "rewards/rejected": -2.3694632053375244,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.77,
579
+ "learning_rate": 7.387025063449081e-08,
580
+ "logits/chosen": -1.922550916671753,
581
+ "logits/rejected": -1.8920552730560303,
582
+ "logps/chosen": -385.36676025390625,
583
+ "logps/rejected": -481.94219970703125,
584
+ "loss": 0.4884,
585
+ "rewards/accuracies": 0.75,
586
+ "rewards/chosen": -1.3011926412582397,
587
+ "rewards/margins": 0.9755498766899109,
588
+ "rewards/rejected": -2.2767422199249268,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.79,
593
+ "learning_rate": 6.138919252022435e-08,
594
+ "logits/chosen": -1.964270830154419,
595
+ "logits/rejected": -1.9201478958129883,
596
+ "logps/chosen": -422.5608825683594,
597
+ "logps/rejected": -470.6983337402344,
598
+ "loss": 0.4982,
599
+ "rewards/accuracies": 0.7749999761581421,
600
+ "rewards/chosen": -1.5673155784606934,
601
+ "rewards/margins": 0.7714017629623413,
602
+ "rewards/rejected": -2.338717460632324,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.82,
607
+ "learning_rate": 4.991445467064689e-08,
608
+ "logits/chosen": -1.9478000402450562,
609
+ "logits/rejected": -1.9133468866348267,
610
+ "logps/chosen": -396.41827392578125,
611
+ "logps/rejected": -474.91168212890625,
612
+ "loss": 0.4906,
613
+ "rewards/accuracies": 0.7562500238418579,
614
+ "rewards/chosen": -1.392188549041748,
615
+ "rewards/margins": 0.9372695684432983,
616
+ "rewards/rejected": -2.329457998275757,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.84,
621
+ "learning_rate": 3.9507259776993954e-08,
622
+ "logits/chosen": -2.014727830886841,
623
+ "logits/rejected": -1.9725955724716187,
624
+ "logps/chosen": -433.93402099609375,
625
+ "logps/rejected": -496.03948974609375,
626
+ "loss": 0.5086,
627
+ "rewards/accuracies": 0.7437499761581421,
628
+ "rewards/chosen": -1.337740182876587,
629
+ "rewards/margins": 0.89354008436203,
630
+ "rewards/rejected": -2.231280565261841,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.84,
635
+ "eval_logits/chosen": -2.0268406867980957,
636
+ "eval_logits/rejected": -1.9826929569244385,
637
+ "eval_logps/chosen": -388.0500183105469,
638
+ "eval_logps/rejected": -484.0532531738281,
639
+ "eval_loss": 0.5034094452857971,
640
+ "eval_rewards/accuracies": 0.76953125,
641
+ "eval_rewards/chosen": -1.3101037740707397,
642
+ "eval_rewards/margins": 0.9568960070610046,
643
+ "eval_rewards/rejected": -2.2669999599456787,
644
+ "eval_runtime": 53.0612,
645
+ "eval_samples_per_second": 37.692,
646
+ "eval_steps_per_second": 0.603,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.86,
651
+ "learning_rate": 3.022313472693447e-08,
652
+ "logits/chosen": -1.9932317733764648,
653
+ "logits/rejected": -1.9669653177261353,
654
+ "logps/chosen": -391.12274169921875,
655
+ "logps/rejected": -434.02191162109375,
656
+ "loss": 0.5097,
657
+ "rewards/accuracies": 0.699999988079071,
658
+ "rewards/chosen": -1.3643336296081543,
659
+ "rewards/margins": 0.6512311100959778,
660
+ "rewards/rejected": -2.0155646800994873,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.88,
665
+ "learning_rate": 2.2111614344599684e-08,
666
+ "logits/chosen": -2.064518928527832,
667
+ "logits/rejected": -1.9801286458969116,
668
+ "logps/chosen": -398.71868896484375,
669
+ "logps/rejected": -479.0596618652344,
670
+ "loss": 0.4848,
671
+ "rewards/accuracies": 0.7875000238418579,
672
+ "rewards/chosen": -1.2102010250091553,
673
+ "rewards/margins": 1.0837668180465698,
674
+ "rewards/rejected": -2.2939677238464355,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.9,
679
+ "learning_rate": 1.521597710086439e-08,
680
+ "logits/chosen": -2.049975633621216,
681
+ "logits/rejected": -1.996206521987915,
682
+ "logps/chosen": -411.322509765625,
683
+ "logps/rejected": -459.893798828125,
684
+ "loss": 0.492,
685
+ "rewards/accuracies": 0.699999988079071,
686
+ "rewards/chosen": -1.363693356513977,
687
+ "rewards/margins": 0.7787196636199951,
688
+ "rewards/rejected": -2.1424131393432617,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.92,
693
+ "learning_rate": 9.57301420397924e-09,
694
+ "logits/chosen": -1.9763036966323853,
695
+ "logits/rejected": -1.950627326965332,
696
+ "logps/chosen": -419.8603515625,
697
+ "logps/rejected": -458.17822265625,
698
+ "loss": 0.4956,
699
+ "rewards/accuracies": 0.7437499761581421,
700
+ "rewards/chosen": -1.4159471988677979,
701
+ "rewards/margins": 0.7630717158317566,
702
+ "rewards/rejected": -2.179018974304199,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.94,
707
+ "learning_rate": 5.212833302556258e-09,
708
+ "logits/chosen": -2.0032382011413574,
709
+ "logits/rejected": -1.9466326236724854,
710
+ "logps/chosen": -413.5555114746094,
711
+ "logps/rejected": -492.5790100097656,
712
+ "loss": 0.4873,
713
+ "rewards/accuracies": 0.731249988079071,
714
+ "rewards/chosen": -1.3693794012069702,
715
+ "rewards/margins": 0.907731831073761,
716
+ "rewards/rejected": -2.277111291885376,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.96,
721
+ "learning_rate": 2.158697848236607e-09,
722
+ "logits/chosen": -1.964643120765686,
723
+ "logits/rejected": -1.9253301620483398,
724
+ "logps/chosen": -396.19683837890625,
725
+ "logps/rejected": -466.6449279785156,
726
+ "loss": 0.4853,
727
+ "rewards/accuracies": 0.7562500238418579,
728
+ "rewards/chosen": -1.3089487552642822,
729
+ "rewards/margins": 0.8880389332771301,
730
+ "rewards/rejected": -2.1969876289367676,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.98,
735
+ "learning_rate": 4.269029751107489e-10,
736
+ "logits/chosen": -2.0099263191223145,
737
+ "logits/rejected": -1.9355924129486084,
738
+ "logps/chosen": -420.68408203125,
739
+ "logps/rejected": -471.353515625,
740
+ "loss": 0.4977,
741
+ "rewards/accuracies": 0.7562500238418579,
742
+ "rewards/chosen": -1.3482139110565186,
743
+ "rewards/margins": 1.0080922842025757,
744
+ "rewards/rejected": -2.356306314468384,
745
+ "step": 470
746
+ },
747
  {
748
  "epoch": 1.0,
749
+ "step": 478,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.5420855548092511,
752
+ "train_runtime": 4282.9885,
753
+ "train_samples_per_second": 14.274,
754
+ "train_steps_per_second": 0.112
755
  }
756
  ],
757
  "logging_steps": 10,
758
+ "max_steps": 478,
759
  "num_train_epochs": 1,
760
  "save_steps": 100,
761
  "total_flos": 0.0,