wzhouad commited on
Commit
b30172b
1 Parent(s): 2f5d778

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0675
21
- - Rewards/chosen: -2.4788
22
- - Rewards/rejected: -2.9505
23
- - Rewards/accuracies: 0.6406
24
- - Rewards/margins: 0.4717
25
- - Logps/rejected: -552.4012
26
- - Logps/chosen: -504.9170
27
- - Logits/rejected: -2.1295
28
- - Logits/chosen: -2.1638
29
 
30
  ## Model description
31
 
@@ -47,7 +47,7 @@ The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
- - seed: 5
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
@@ -62,10 +62,14 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.067 | 0.25 | 100 | 0.1174 | -1.4873 | -1.7314 | 0.6133 | 0.2442 | -430.4969 | -405.7653 | -2.3244 | -2.3408 |
66
- | 0.0435 | 0.49 | 200 | 0.0799 | -2.1802 | -2.5492 | 0.6211 | 0.3690 | -512.2731 | -475.0585 | -2.1421 | -2.1734 |
67
- | 0.0288 | 0.74 | 300 | 0.0710 | -2.4383 | -2.9105 | 0.6172 | 0.4722 | -548.4017 | -500.8697 | -2.1339 | -2.1675 |
68
- | 0.032 | 0.99 | 400 | 0.0675 | -2.4788 | -2.9505 | 0.6406 | 0.4717 | -552.4012 | -504.9170 | -2.1295 | -2.1638 |
 
 
 
 
69
 
70
 
71
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.4965
21
+ - Rewards/chosen: -2.9708
22
+ - Rewards/rejected: -4.3017
23
+ - Rewards/accuracies: 0.7695
24
+ - Rewards/margins: 1.3309
25
+ - Logps/rejected: -687.5271
26
+ - Logps/chosen: -554.1226
27
+ - Logits/rejected: -0.1928
28
+ - Logits/chosen: -0.6531
29
 
30
  ## Model description
31
 
 
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
+ - seed: 3
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.5326 | 0.11 | 100 | 0.6180 | -0.4024 | -0.6993 | 0.6797 | 0.2969 | -327.2873 | -297.2842 | -2.5800 | -2.5958 |
66
+ | 0.4709 | 0.23 | 200 | 0.5608 | -1.1383 | -1.7616 | 0.7109 | 0.6233 | -433.5121 | -370.8716 | -2.1515 | -2.1720 |
67
+ | 0.4289 | 0.34 | 300 | 0.5293 | -1.5404 | -2.3958 | 0.7539 | 0.8554 | -496.9380 | -411.0811 | -2.0882 | -2.1204 |
68
+ | 0.4195 | 0.45 | 400 | 0.5096 | -1.7916 | -2.8995 | 0.7812 | 1.1079 | -547.3041 | -436.1970 | -1.0571 | -1.2976 |
69
+ | 0.3891 | 0.57 | 500 | 0.5086 | -2.6047 | -3.9255 | 0.7812 | 1.3208 | -649.9016 | -517.5072 | -0.8608 | -1.1314 |
70
+ | 0.4182 | 0.68 | 600 | 0.4976 | -2.4968 | -3.7962 | 0.7695 | 1.2994 | -636.9742 | -506.7195 | -0.4354 | -0.8384 |
71
+ | 0.3845 | 0.79 | 700 | 0.4967 | -2.6976 | -4.0084 | 0.7695 | 1.3108 | -658.1885 | -526.7999 | -0.2826 | -0.7200 |
72
+ | 0.3896 | 0.91 | 800 | 0.4965 | -2.9708 | -4.3017 | 0.7695 | 1.3309 | -687.5271 | -554.1226 | -0.1928 | -0.6531 |
73
 
74
 
75
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.07294640629379838,
4
- "train_runtime": 3765.6331,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 13.781,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.43856339019935237,
4
+ "train_runtime": 7937.4578,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 14.24,
7
+ "train_steps_per_second": 0.111
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c975a67a6eda1c94e6b5d34eee82ff0a365b06c38ca419207b5a3cbb4d49613b
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e86dbc79da5b6221b132cbc04faab8b97a554c9ca39df05f8010da50192c1d5
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff29be45fbf4dc2f3ea1a73522b7ec72c85a2b7374393b7e454e465b85543885
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48e30d3b8da1f10b4ad9f2abada379c9f4a92e61d6a7079ac884d33bf9e5d6d5
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf3dea0f30a8c199a5dc983dd433e00255a8a010974837b08cc4d7f1b0d14b89
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:331cb75e2dc5f4ae29337f4eff1c5fad0704c34b95053780a0f84d881b68c2a4
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.07294640629379838,
4
- "train_runtime": 3765.6331,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 13.781,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.43856339019935237,
4
+ "train_runtime": 7937.4578,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 14.24,
7
+ "train_steps_per_second": 0.111
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.998766954377312,
5
  "eval_steps": 100,
6
- "global_step": 405,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.2195121951219512e-08,
14
- "logits/chosen": -2.8695335388183594,
15
- "logits/rejected": -2.8522377014160156,
16
- "logps/chosen": -537.80126953125,
17
- "logps/rejected": -108.91968536376953,
18
- "loss": 0.3287,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,641 +23,1377 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.02,
27
- "learning_rate": 1.219512195121951e-07,
28
- "logits/chosen": -2.8006999492645264,
29
- "logits/rejected": -2.7513413429260254,
30
- "logps/chosen": -339.1315612792969,
31
- "logps/rejected": -113.41014862060547,
32
- "loss": 0.3429,
33
- "rewards/accuracies": 0.5625,
34
- "rewards/chosen": 0.0010660986881703138,
35
- "rewards/margins": 0.0017727299127727747,
36
- "rewards/rejected": -0.0007066310499794781,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.05,
41
- "learning_rate": 2.439024390243902e-07,
42
- "logits/chosen": -2.8162312507629395,
43
- "logits/rejected": -2.8079066276550293,
44
- "logps/chosen": -435.261962890625,
45
- "logps/rejected": -116.0378189086914,
46
- "loss": 0.3332,
47
- "rewards/accuracies": 0.7875000238418579,
48
- "rewards/chosen": 0.021241962909698486,
49
- "rewards/margins": 0.038096584379673004,
50
- "rewards/rejected": -0.016854625195264816,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.07,
55
- "learning_rate": 3.6585365853658536e-07,
56
- "logits/chosen": -2.7254586219787598,
57
- "logits/rejected": -2.688169002532959,
58
- "logps/chosen": -437.81072998046875,
59
- "logps/rejected": -141.1324920654297,
60
- "loss": 0.309,
61
- "rewards/accuracies": 0.8500000238418579,
62
- "rewards/chosen": 0.08655615150928497,
63
- "rewards/margins": 0.20261511206626892,
64
- "rewards/rejected": -0.11605894565582275,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.1,
69
- "learning_rate": 4.878048780487804e-07,
70
- "logits/chosen": -2.5956244468688965,
71
- "logits/rejected": -2.5760390758514404,
72
- "logps/chosen": -414.177490234375,
73
- "logps/rejected": -168.2574005126953,
74
- "loss": 0.2577,
75
- "rewards/accuracies": 0.7749999761581421,
76
- "rewards/chosen": 0.02355712652206421,
77
- "rewards/margins": 0.46499890089035034,
78
- "rewards/rejected": -0.4414418339729309,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.12,
83
- "learning_rate": 4.992461696250783e-07,
84
- "logits/chosen": -2.474365711212158,
85
- "logits/rejected": -2.461667537689209,
86
- "logps/chosen": -420.9219665527344,
87
- "logps/rejected": -210.11111450195312,
88
- "loss": 0.1873,
89
- "rewards/accuracies": 0.7250000238418579,
90
- "rewards/chosen": -0.11808023601770401,
91
- "rewards/margins": 0.785495400428772,
92
- "rewards/rejected": -0.903575599193573,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.15,
97
- "learning_rate": 4.966461721767899e-07,
98
- "logits/chosen": -2.4058423042297363,
99
- "logits/rejected": -2.3743529319763184,
100
- "logps/chosen": -377.65484619140625,
101
- "logps/rejected": -207.171142578125,
102
- "loss": 0.1202,
103
- "rewards/accuracies": 0.762499988079071,
104
- "rewards/chosen": -0.32305708527565,
105
- "rewards/margins": 0.8489507436752319,
106
- "rewards/rejected": -1.1720077991485596,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.17,
111
- "learning_rate": 4.922100518015975e-07,
112
- "logits/chosen": -2.4014129638671875,
113
- "logits/rejected": -2.365219831466675,
114
- "logps/chosen": -448.73516845703125,
115
- "logps/rejected": -269.4298400878906,
116
- "loss": 0.0894,
117
- "rewards/accuracies": 0.75,
118
- "rewards/chosen": -0.4889157712459564,
119
- "rewards/margins": 1.191009759902954,
120
- "rewards/rejected": -1.679925560951233,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.2,
125
- "learning_rate": 4.859708325770919e-07,
126
- "logits/chosen": -2.3665881156921387,
127
- "logits/rejected": -2.313610792160034,
128
- "logps/chosen": -442.3289489746094,
129
- "logps/rejected": -284.679931640625,
130
- "loss": 0.0863,
131
- "rewards/accuracies": 0.8374999761581421,
132
- "rewards/chosen": -0.31383419036865234,
133
- "rewards/margins": 1.4407079219818115,
134
- "rewards/rejected": -1.7545421123504639,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.22,
139
- "learning_rate": 4.779749614980225e-07,
140
- "logits/chosen": -2.3862123489379883,
141
- "logits/rejected": -2.336027145385742,
142
- "logps/chosen": -456.930419921875,
143
- "logps/rejected": -332.8990478515625,
144
- "loss": 0.0628,
145
- "rewards/accuracies": 0.8062499761581421,
146
- "rewards/chosen": -0.7161350846290588,
147
- "rewards/margins": 1.5251940488815308,
148
- "rewards/rejected": -2.2413289546966553,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.25,
153
- "learning_rate": 4.682819627081427e-07,
154
- "logits/chosen": -2.3527557849884033,
155
- "logits/rejected": -2.2716236114501953,
156
- "logps/chosen": -491.0079650878906,
157
- "logps/rejected": -360.1409912109375,
158
- "loss": 0.067,
159
- "rewards/accuracies": 0.7437499761581421,
160
- "rewards/chosen": -0.676110029220581,
161
- "rewards/margins": 1.8081252574920654,
162
- "rewards/rejected": -2.4842352867126465,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.25,
167
- "eval_logits/chosen": -2.340771198272705,
168
- "eval_logits/rejected": -2.3243579864501953,
169
- "eval_logps/chosen": -405.76531982421875,
170
- "eval_logps/rejected": -430.49688720703125,
171
- "eval_loss": 0.11736096441745758,
172
- "eval_rewards/accuracies": 0.61328125,
173
- "eval_rewards/chosen": -1.4872568845748901,
174
- "eval_rewards/margins": 0.2441793829202652,
175
- "eval_rewards/rejected": -1.7314363718032837,
176
- "eval_runtime": 53.3203,
177
- "eval_samples_per_second": 37.509,
178
- "eval_steps_per_second": 0.6,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.27,
183
- "learning_rate": 4.569639943810477e-07,
184
- "logits/chosen": -2.361056089401245,
185
- "logits/rejected": -2.282515287399292,
186
- "logps/chosen": -496.8377380371094,
187
- "logps/rejected": -344.2028503417969,
188
- "loss": 0.0656,
189
- "rewards/accuracies": 0.8062499761581421,
190
- "rewards/chosen": -0.5422588586807251,
191
- "rewards/margins": 1.8025261163711548,
192
- "rewards/rejected": -2.344785213470459,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.3,
197
- "learning_rate": 4.4410531154874543e-07,
198
- "logits/chosen": -2.31123685836792,
199
- "logits/rejected": -2.2176012992858887,
200
- "logps/chosen": -541.8471069335938,
201
- "logps/rejected": -430.7569885253906,
202
- "loss": 0.0355,
203
- "rewards/accuracies": 0.8125,
204
- "rewards/chosen": -1.0784400701522827,
205
- "rewards/margins": 2.053567409515381,
206
- "rewards/rejected": -3.132007360458374,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.32,
211
- "learning_rate": 4.298016388768561e-07,
212
- "logits/chosen": -2.288428783416748,
213
- "logits/rejected": -2.175412178039551,
214
- "logps/chosen": -530.9747314453125,
215
- "logps/rejected": -429.48785400390625,
216
- "loss": 0.0441,
217
- "rewards/accuracies": 0.8125,
218
- "rewards/chosen": -1.0411689281463623,
219
- "rewards/margins": 1.9912636280059814,
220
- "rewards/rejected": -3.0324320793151855,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.35,
225
- "learning_rate": 4.1415945805573005e-07,
226
- "logits/chosen": -2.3122925758361816,
227
- "logits/rejected": -2.227431058883667,
228
- "logps/chosen": -509.9908142089844,
229
- "logps/rejected": -396.0080871582031,
230
- "loss": 0.0515,
231
- "rewards/accuracies": 0.856249988079071,
232
- "rewards/chosen": -0.6500633358955383,
233
- "rewards/margins": 2.154536724090576,
234
- "rewards/rejected": -2.804600238800049,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.37,
239
- "learning_rate": 3.972952151123984e-07,
240
- "logits/chosen": -2.2672953605651855,
241
- "logits/rejected": -2.170297384262085,
242
- "logps/chosen": -522.5911254882812,
243
- "logps/rejected": -445.1206970214844,
244
- "loss": 0.0433,
245
- "rewards/accuracies": 0.84375,
246
- "rewards/chosen": -0.9818037748336792,
247
- "rewards/margins": 2.1947696208953857,
248
- "rewards/rejected": -3.1765732765197754,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.39,
253
- "learning_rate": 3.793344535444142e-07,
254
- "logits/chosen": -2.3276212215423584,
255
- "logits/rejected": -2.2313549518585205,
256
- "logps/chosen": -537.3373413085938,
257
- "logps/rejected": -405.67559814453125,
258
- "loss": 0.0465,
259
- "rewards/accuracies": 0.8687499761581421,
260
- "rewards/chosen": -0.7096673250198364,
261
- "rewards/margins": 2.1882476806640625,
262
- "rewards/rejected": -2.8979151248931885,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.42,
267
- "learning_rate": 3.604108797288461e-07,
268
- "logits/chosen": -2.2612414360046387,
269
- "logits/rejected": -2.178300142288208,
270
- "logps/chosen": -512.8599853515625,
271
- "logps/rejected": -435.66693115234375,
272
- "loss": 0.0375,
273
- "rewards/accuracies": 0.831250011920929,
274
- "rewards/chosen": -1.0879271030426025,
275
- "rewards/margins": 2.0758705139160156,
276
- "rewards/rejected": -3.163797616958618,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.44,
281
- "learning_rate": 3.40665367563858e-07,
282
- "logits/chosen": -2.242908477783203,
283
- "logits/rejected": -2.1256449222564697,
284
- "logps/chosen": -503.54547119140625,
285
- "logps/rejected": -403.26593017578125,
286
- "loss": 0.037,
287
- "rewards/accuracies": 0.762499988079071,
288
- "rewards/chosen": -1.299680471420288,
289
- "rewards/margins": 1.7938661575317383,
290
- "rewards/rejected": -3.0935468673706055,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.47,
295
- "learning_rate": 3.202449097526798e-07,
296
- "logits/chosen": -2.2713847160339355,
297
- "logits/rejected": -2.163483142852783,
298
- "logps/chosen": -501.21185302734375,
299
- "logps/rejected": -417.178955078125,
300
- "loss": 0.0478,
301
- "rewards/accuracies": 0.800000011920929,
302
- "rewards/chosen": -1.1433241367340088,
303
- "rewards/margins": 1.9010562896728516,
304
- "rewards/rejected": -3.0443806648254395,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.49,
309
- "learning_rate": 2.993015235369905e-07,
310
- "logits/chosen": -2.2553932666778564,
311
- "logits/rejected": -2.1447877883911133,
312
- "logps/chosen": -530.4608154296875,
313
- "logps/rejected": -431.3677673339844,
314
- "loss": 0.0435,
315
- "rewards/accuracies": 0.862500011920929,
316
- "rewards/chosen": -0.9609147906303406,
317
- "rewards/margins": 2.1664257049560547,
318
- "rewards/rejected": -3.12734055519104,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.49,
323
- "eval_logits/chosen": -2.1734278202056885,
324
- "eval_logits/rejected": -2.142083168029785,
325
- "eval_logps/chosen": -475.05853271484375,
326
- "eval_logps/rejected": -512.2731323242188,
327
- "eval_loss": 0.07987947016954422,
328
- "eval_rewards/accuracies": 0.62109375,
329
- "eval_rewards/chosen": -2.1801888942718506,
330
- "eval_rewards/margins": 0.36900976300239563,
331
- "eval_rewards/rejected": -2.549198627471924,
332
- "eval_runtime": 53.3455,
333
- "eval_samples_per_second": 37.491,
334
- "eval_steps_per_second": 0.6,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.52,
339
- "learning_rate": 2.7799111902582693e-07,
340
- "logits/chosen": -2.1741156578063965,
341
- "logits/rejected": -2.0635221004486084,
342
- "logps/chosen": -524.964111328125,
343
- "logps/rejected": -450.088623046875,
344
- "loss": 0.0342,
345
- "rewards/accuracies": 0.793749988079071,
346
- "rewards/chosen": -1.4079291820526123,
347
- "rewards/margins": 1.9792110919952393,
348
- "rewards/rejected": -3.3871402740478516,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.54,
353
- "learning_rate": 2.564723385445869e-07,
354
- "logits/chosen": -2.1615562438964844,
355
- "logits/rejected": -2.0492231845855713,
356
- "logps/chosen": -561.4410400390625,
357
- "logps/rejected": -483.4110412597656,
358
- "loss": 0.0356,
359
- "rewards/accuracies": 0.824999988079071,
360
- "rewards/chosen": -1.2221920490264893,
361
- "rewards/margins": 2.281812906265259,
362
- "rewards/rejected": -3.504004955291748,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.57,
367
- "learning_rate": 2.3490537564442845e-07,
368
- "logits/chosen": -2.1855053901672363,
369
- "logits/rejected": -2.075396776199341,
370
- "logps/chosen": -530.4036254882812,
371
- "logps/rejected": -457.735107421875,
372
- "loss": 0.0301,
373
- "rewards/accuracies": 0.8125,
374
- "rewards/chosen": -1.2399654388427734,
375
- "rewards/margins": 2.154582977294922,
376
- "rewards/rejected": -3.3945488929748535,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.59,
381
- "learning_rate": 2.1345078256378801e-07,
382
- "logits/chosen": -2.187288284301758,
383
- "logits/rejected": -2.0610218048095703,
384
- "logps/chosen": -533.1801147460938,
385
- "logps/rejected": -487.4967346191406,
386
- "loss": 0.0244,
387
- "rewards/accuracies": 0.8374999761581421,
388
- "rewards/chosen": -1.2845171689987183,
389
- "rewards/margins": 2.3898768424987793,
390
- "rewards/rejected": -3.674394130706787,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.62,
395
- "learning_rate": 1.9226827501969865e-07,
396
- "logits/chosen": -2.166536331176758,
397
- "logits/rejected": -2.0207314491271973,
398
- "logps/chosen": -566.1360473632812,
399
- "logps/rejected": -520.3924560546875,
400
- "loss": 0.0354,
401
- "rewards/accuracies": 0.862500011920929,
402
- "rewards/chosen": -1.3790395259857178,
403
- "rewards/margins": 2.647700786590576,
404
- "rewards/rejected": -4.026740550994873,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.64,
409
- "learning_rate": 1.715155432264775e-07,
410
- "logits/chosen": -2.160069704055786,
411
- "logits/rejected": -2.0387001037597656,
412
- "logps/chosen": -520.0031127929688,
413
- "logps/rejected": -447.298828125,
414
- "loss": 0.0421,
415
- "rewards/accuracies": 0.84375,
416
- "rewards/chosen": -1.3448396921157837,
417
- "rewards/margins": 2.034497022628784,
418
- "rewards/rejected": -3.3793368339538574,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.67,
423
- "learning_rate": 1.51347077992983e-07,
424
- "logits/chosen": -2.201711416244507,
425
- "logits/rejected": -2.0760598182678223,
426
- "logps/chosen": -518.4276733398438,
427
- "logps/rejected": -448.0538024902344,
428
- "loss": 0.0352,
429
- "rewards/accuracies": 0.84375,
430
- "rewards/chosen": -1.2280199527740479,
431
- "rewards/margins": 2.027623176574707,
432
- "rewards/rejected": -3.255643129348755,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.69,
437
- "learning_rate": 1.3191302063739906e-07,
438
- "logits/chosen": -2.2061142921447754,
439
- "logits/rejected": -2.08331298828125,
440
- "logps/chosen": -536.6764526367188,
441
- "logps/rejected": -460.00250244140625,
442
- "loss": 0.0335,
443
- "rewards/accuracies": 0.8187500238418579,
444
- "rewards/chosen": -1.3899211883544922,
445
- "rewards/margins": 2.058790445327759,
446
- "rewards/rejected": -3.44871187210083,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.72,
451
- "learning_rate": 1.1335804528119475e-07,
452
- "logits/chosen": -2.194516658782959,
453
- "logits/rejected": -2.0347750186920166,
454
- "logps/chosen": -583.8428955078125,
455
- "logps/rejected": -485.03057861328125,
456
- "loss": 0.0266,
457
- "rewards/accuracies": 0.8500000238418579,
458
- "rewards/chosen": -1.3776942491531372,
459
- "rewards/margins": 2.384459972381592,
460
- "rewards/rejected": -3.7621541023254395,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.74,
465
- "learning_rate": 9.582028184286423e-08,
466
- "logits/chosen": -2.1787655353546143,
467
- "logits/rejected": -2.075291156768799,
468
- "logps/chosen": -514.74072265625,
469
- "logps/rejected": -461.1167907714844,
470
- "loss": 0.0288,
471
- "rewards/accuracies": 0.768750011920929,
472
- "rewards/chosen": -1.5570707321166992,
473
- "rewards/margins": 1.984126091003418,
474
- "rewards/rejected": -3.541196823120117,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.74,
479
- "eval_logits/chosen": -2.1675052642822266,
480
- "eval_logits/rejected": -2.133922576904297,
481
- "eval_logps/chosen": -500.8697204589844,
482
- "eval_logps/rejected": -548.4016723632812,
483
- "eval_loss": 0.07103094458580017,
484
- "eval_rewards/accuracies": 0.6171875,
485
- "eval_rewards/chosen": -2.438300848007202,
486
- "eval_rewards/margins": 0.4721827805042267,
487
- "eval_rewards/rejected": -2.9104835987091064,
488
- "eval_runtime": 53.3323,
489
- "eval_samples_per_second": 37.501,
490
- "eval_steps_per_second": 0.6,
491
  "step": 300
492
  },
493
  {
494
- "epoch": 0.76,
495
- "learning_rate": 7.943028774907065e-08,
496
- "logits/chosen": -2.1723272800445557,
497
- "logits/rejected": -2.0614748001098633,
498
- "logps/chosen": -535.1619873046875,
499
- "logps/rejected": -455.0377502441406,
500
- "loss": 0.0292,
501
- "rewards/accuracies": 0.793749988079071,
502
- "rewards/chosen": -1.294924020767212,
503
- "rewards/margins": 2.182978630065918,
504
- "rewards/rejected": -3.47790265083313,
505
  "step": 310
506
  },
507
  {
508
- "epoch": 0.79,
509
- "learning_rate": 6.431007601814637e-08,
510
- "logits/chosen": -2.2771997451782227,
511
- "logits/rejected": -2.146629810333252,
512
- "logps/chosen": -576.0748291015625,
513
- "logps/rejected": -492.65167236328125,
514
- "loss": 0.031,
515
- "rewards/accuracies": 0.78125,
516
- "rewards/chosen": -1.481297254562378,
517
- "rewards/margins": 2.240931510925293,
518
- "rewards/rejected": -3.722228527069092,
519
  "step": 320
520
  },
521
  {
522
- "epoch": 0.81,
523
- "learning_rate": 5.0572206951246e-08,
524
- "logits/chosen": -2.2158615589141846,
525
- "logits/rejected": -2.0619277954101562,
526
- "logps/chosen": -581.7293701171875,
527
- "logps/rejected": -495.04638671875,
528
- "loss": 0.0317,
529
- "rewards/accuracies": 0.8812500238418579,
530
- "rewards/chosen": -1.2919968366622925,
531
- "rewards/margins": 2.5048012733459473,
532
- "rewards/rejected": -3.7967982292175293,
533
  "step": 330
534
  },
535
  {
536
- "epoch": 0.84,
537
- "learning_rate": 3.831895019292897e-08,
538
- "logits/chosen": -2.2480924129486084,
539
- "logits/rejected": -2.110482692718506,
540
- "logps/chosen": -597.642333984375,
541
- "logps/rejected": -521.8727416992188,
542
- "loss": 0.0281,
543
- "rewards/accuracies": 0.8500000238418579,
544
- "rewards/chosen": -1.3266541957855225,
545
- "rewards/margins": 2.570845127105713,
546
- "rewards/rejected": -3.8974990844726562,
547
  "step": 340
548
  },
549
  {
550
- "epoch": 0.86,
551
- "learning_rate": 2.764152339909756e-08,
552
- "logits/chosen": -2.1906208992004395,
553
- "logits/rejected": -2.0595510005950928,
554
- "logps/chosen": -539.5335083007812,
555
- "logps/rejected": -462.76025390625,
556
- "loss": 0.034,
557
- "rewards/accuracies": 0.7875000238418579,
558
- "rewards/chosen": -1.460292935371399,
559
- "rewards/margins": 2.1251232624053955,
560
- "rewards/rejected": -3.585416078567505,
561
  "step": 350
562
  },
563
  {
564
- "epoch": 0.89,
565
- "learning_rate": 1.861941317991664e-08,
566
- "logits/chosen": -2.1756224632263184,
567
- "logits/rejected": -2.0511136054992676,
568
- "logps/chosen": -544.6201171875,
569
- "logps/rejected": -468.48760986328125,
570
- "loss": 0.0303,
571
- "rewards/accuracies": 0.762499988079071,
572
- "rewards/chosen": -1.5998585224151611,
573
- "rewards/margins": 1.8565304279327393,
574
- "rewards/rejected": -3.4563891887664795,
575
  "step": 360
576
  },
577
  {
578
- "epoch": 0.91,
579
- "learning_rate": 1.13197833728636e-08,
580
- "logits/chosen": -2.1858394145965576,
581
- "logits/rejected": -2.0600364208221436,
582
- "logps/chosen": -554.5750732421875,
583
- "logps/rejected": -469.27484130859375,
584
- "loss": 0.0296,
585
- "rewards/accuracies": 0.793749988079071,
586
- "rewards/chosen": -1.3729755878448486,
587
- "rewards/margins": 2.1774544715881348,
588
- "rewards/rejected": -3.5504302978515625,
589
  "step": 370
590
  },
591
  {
592
- "epoch": 0.94,
593
- "learning_rate": 5.79697505093521e-09,
594
- "logits/chosen": -2.161980628967285,
595
- "logits/rejected": -2.0485546588897705,
596
- "logps/chosen": -514.3355712890625,
597
- "logps/rejected": -432.40704345703125,
598
- "loss": 0.0282,
599
- "rewards/accuracies": 0.8062499761581421,
600
- "rewards/chosen": -1.297263741493225,
601
- "rewards/margins": 1.9683955907821655,
602
- "rewards/rejected": -3.2656593322753906,
603
  "step": 380
604
  },
605
  {
606
- "epoch": 0.96,
607
- "learning_rate": 2.092101988131256e-09,
608
- "logits/chosen": -2.2410452365875244,
609
- "logits/rejected": -2.109318256378174,
610
- "logps/chosen": -566.7930297851562,
611
- "logps/rejected": -473.2035217285156,
612
- "loss": 0.0274,
613
- "rewards/accuracies": 0.762499988079071,
614
- "rewards/chosen": -1.3028382062911987,
615
- "rewards/margins": 2.2111973762512207,
616
- "rewards/rejected": -3.514035701751709,
617
  "step": 390
618
  },
619
  {
620
- "epoch": 0.99,
621
- "learning_rate": 2.327445937151673e-10,
622
- "logits/chosen": -2.168402910232544,
623
- "logits/rejected": -2.057384967803955,
624
- "logps/chosen": -555.5765380859375,
625
- "logps/rejected": -468.2821350097656,
626
- "loss": 0.032,
627
- "rewards/accuracies": 0.831250011920929,
628
- "rewards/chosen": -1.3704006671905518,
629
- "rewards/margins": 2.0777838230133057,
630
- "rewards/rejected": -3.4481842517852783,
631
  "step": 400
632
  },
633
  {
634
- "epoch": 0.99,
635
- "eval_logits/chosen": -2.163764476776123,
636
- "eval_logits/rejected": -2.129502534866333,
637
- "eval_logps/chosen": -504.9170227050781,
638
- "eval_logps/rejected": -552.4011840820312,
639
- "eval_loss": 0.06752217561006546,
640
- "eval_rewards/accuracies": 0.640625,
641
- "eval_rewards/chosen": -2.478773355484009,
642
- "eval_rewards/margins": 0.47170597314834595,
643
- "eval_rewards/rejected": -2.950479507446289,
644
- "eval_runtime": 53.3856,
645
- "eval_samples_per_second": 37.463,
646
- "eval_steps_per_second": 0.599,
647
  "step": 400
648
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
  {
650
  "epoch": 1.0,
651
- "step": 405,
652
  "total_flos": 0.0,
653
- "train_loss": 0.07294640629379838,
654
- "train_runtime": 3765.6331,
655
- "train_samples_per_second": 13.781,
656
- "train_steps_per_second": 0.108
657
  }
658
  ],
659
  "logging_steps": 10,
660
- "max_steps": 405,
661
  "num_train_epochs": 1,
662
  "save_steps": 100,
663
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9994340690435767,
5
  "eval_steps": 100,
6
+ "global_step": 883,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 5.617977528089887e-09,
14
+ "logits/chosen": -2.604583740234375,
15
+ "logits/rejected": -2.6370604038238525,
16
+ "logps/chosen": -392.0871887207031,
17
+ "logps/rejected": -333.6990966796875,
18
+ "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.01,
27
+ "learning_rate": 5.617977528089887e-08,
28
+ "logits/chosen": -2.798471212387085,
29
+ "logits/rejected": -2.7608420848846436,
30
+ "logps/chosen": -291.8177490234375,
31
+ "logps/rejected": -192.53457641601562,
32
+ "loss": 0.6931,
33
+ "rewards/accuracies": 0.4583333432674408,
34
+ "rewards/chosen": -0.00019937421893700957,
35
+ "rewards/margins": -0.00015055020048748702,
36
+ "rewards/rejected": -4.882401117356494e-05,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.02,
41
+ "learning_rate": 1.1235955056179774e-07,
42
+ "logits/chosen": -2.8122811317443848,
43
+ "logits/rejected": -2.7911267280578613,
44
+ "logps/chosen": -334.42919921875,
45
+ "logps/rejected": -200.9188995361328,
46
+ "loss": 0.6917,
47
+ "rewards/accuracies": 0.606249988079071,
48
+ "rewards/chosen": 0.001965261297300458,
49
+ "rewards/margins": 0.0033602300100028515,
50
+ "rewards/rejected": -0.001394969061948359,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.03,
55
+ "learning_rate": 1.6853932584269663e-07,
56
+ "logits/chosen": -2.775857448577881,
57
+ "logits/rejected": -2.73417329788208,
58
+ "logps/chosen": -372.3719482421875,
59
+ "logps/rejected": -164.69215393066406,
60
+ "loss": 0.6843,
61
+ "rewards/accuracies": 0.6875,
62
+ "rewards/chosen": 0.014180044643580914,
63
+ "rewards/margins": 0.02330140210688114,
64
+ "rewards/rejected": -0.009121356531977654,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.05,
69
+ "learning_rate": 2.2471910112359549e-07,
70
+ "logits/chosen": -2.775045871734619,
71
+ "logits/rejected": -2.7430758476257324,
72
+ "logps/chosen": -326.87652587890625,
73
+ "logps/rejected": -221.24282836914062,
74
+ "loss": 0.6638,
75
+ "rewards/accuracies": 0.65625,
76
+ "rewards/chosen": 0.01945655047893524,
77
+ "rewards/margins": 0.05223391577601433,
78
+ "rewards/rejected": -0.03277735784649849,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.06,
83
+ "learning_rate": 2.8089887640449437e-07,
84
+ "logits/chosen": -2.652109146118164,
85
+ "logits/rejected": -2.6346287727355957,
86
+ "logps/chosen": -323.96392822265625,
87
+ "logps/rejected": -199.55160522460938,
88
+ "loss": 0.6411,
89
+ "rewards/accuracies": 0.71875,
90
+ "rewards/chosen": 0.019086791202425957,
91
+ "rewards/margins": 0.12331440299749374,
92
+ "rewards/rejected": -0.10422760248184204,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.07,
97
+ "learning_rate": 3.3707865168539325e-07,
98
+ "logits/chosen": -2.6026113033294678,
99
+ "logits/rejected": -2.593480110168457,
100
+ "logps/chosen": -313.3958740234375,
101
+ "logps/rejected": -222.98287963867188,
102
+ "loss": 0.6195,
103
+ "rewards/accuracies": 0.65625,
104
+ "rewards/chosen": -0.029142415151000023,
105
+ "rewards/margins": 0.17074736952781677,
106
+ "rewards/rejected": -0.19988977909088135,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.08,
111
+ "learning_rate": 3.9325842696629214e-07,
112
+ "logits/chosen": -2.5947117805480957,
113
+ "logits/rejected": -2.569990634918213,
114
+ "logps/chosen": -395.15338134765625,
115
+ "logps/rejected": -245.3045196533203,
116
+ "loss": 0.5814,
117
+ "rewards/accuracies": 0.737500011920929,
118
+ "rewards/chosen": -0.006087436340749264,
119
+ "rewards/margins": 0.3716045022010803,
120
+ "rewards/rejected": -0.37769192457199097,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.09,
125
+ "learning_rate": 4.4943820224719097e-07,
126
+ "logits/chosen": -2.5502541065216064,
127
+ "logits/rejected": -2.5388641357421875,
128
+ "logps/chosen": -367.24169921875,
129
+ "logps/rejected": -243.64401245117188,
130
+ "loss": 0.5683,
131
+ "rewards/accuracies": 0.731249988079071,
132
+ "rewards/chosen": -0.05494023486971855,
133
+ "rewards/margins": 0.48627549409866333,
134
+ "rewards/rejected": -0.5412156581878662,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.1,
139
+ "learning_rate": 4.999980431020109e-07,
140
+ "logits/chosen": -2.5524516105651855,
141
+ "logits/rejected": -2.554391860961914,
142
+ "logps/chosen": -351.7366943359375,
143
+ "logps/rejected": -259.1504821777344,
144
+ "loss": 0.5401,
145
+ "rewards/accuracies": 0.65625,
146
+ "rewards/chosen": -0.16875556111335754,
147
+ "rewards/margins": 0.5268481373786926,
148
+ "rewards/rejected": -0.6956037282943726,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.11,
153
+ "learning_rate": 4.997632524101301e-07,
154
+ "logits/chosen": -2.595895767211914,
155
+ "logits/rejected": -2.5705056190490723,
156
+ "logps/chosen": -351.92919921875,
157
+ "logps/rejected": -244.37753295898438,
158
+ "loss": 0.5326,
159
+ "rewards/accuracies": 0.7124999761581421,
160
+ "rewards/chosen": -0.17635223269462585,
161
+ "rewards/margins": 0.589404821395874,
162
+ "rewards/rejected": -0.7657570838928223,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.11,
167
+ "eval_logits/chosen": -2.595787525177002,
168
+ "eval_logits/rejected": -2.5800230503082275,
169
+ "eval_logps/chosen": -297.28424072265625,
170
+ "eval_logps/rejected": -327.2872619628906,
171
+ "eval_loss": 0.6179993748664856,
172
+ "eval_rewards/accuracies": 0.6796875,
173
+ "eval_rewards/chosen": -0.40244585275650024,
174
+ "eval_rewards/margins": 0.2968939244747162,
175
+ "eval_rewards/rejected": -0.699339747428894,
176
+ "eval_runtime": 53.2358,
177
+ "eval_samples_per_second": 37.569,
178
+ "eval_steps_per_second": 0.601,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.12,
183
+ "learning_rate": 4.991375032514749e-07,
184
+ "logits/chosen": -2.571720600128174,
185
+ "logits/rejected": -2.509852886199951,
186
+ "logps/chosen": -345.0278015136719,
187
+ "logps/rejected": -304.7156677246094,
188
+ "loss": 0.4987,
189
+ "rewards/accuracies": 0.75,
190
+ "rewards/chosen": -0.33357003331184387,
191
+ "rewards/margins": 0.8169819712638855,
192
+ "rewards/rejected": -1.1505521535873413,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.14,
197
+ "learning_rate": 4.98121775121344e-07,
198
+ "logits/chosen": -2.5141258239746094,
199
+ "logits/rejected": -2.4955408573150635,
200
+ "logps/chosen": -368.97735595703125,
201
+ "logps/rejected": -351.9383239746094,
202
+ "loss": 0.4716,
203
+ "rewards/accuracies": 0.762499988079071,
204
+ "rewards/chosen": -0.5964471101760864,
205
+ "rewards/margins": 0.8667058944702148,
206
+ "rewards/rejected": -1.4631531238555908,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.15,
211
+ "learning_rate": 4.96717657955441e-07,
212
+ "logits/chosen": -2.4679911136627197,
213
+ "logits/rejected": -2.4308536052703857,
214
+ "logps/chosen": -375.72186279296875,
215
+ "logps/rejected": -334.9486999511719,
216
+ "loss": 0.4703,
217
+ "rewards/accuracies": 0.768750011920929,
218
+ "rewards/chosen": -0.7533982992172241,
219
+ "rewards/margins": 0.7337401509284973,
220
+ "rewards/rejected": -1.4871385097503662,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.16,
225
+ "learning_rate": 4.949273496411216e-07,
226
+ "logits/chosen": -2.432471513748169,
227
+ "logits/rejected": -2.3869972229003906,
228
+ "logps/chosen": -416.4320373535156,
229
+ "logps/rejected": -364.8056640625,
230
+ "loss": 0.4724,
231
+ "rewards/accuracies": 0.8374999761581421,
232
+ "rewards/chosen": -0.5794404745101929,
233
+ "rewards/margins": 1.0570305585861206,
234
+ "rewards/rejected": -1.6364710330963135,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.17,
239
+ "learning_rate": 4.927536525770046e-07,
240
+ "logits/chosen": -2.351346969604492,
241
+ "logits/rejected": -2.2865800857543945,
242
+ "logps/chosen": -441.26666259765625,
243
+ "logps/rejected": -346.3591613769531,
244
+ "loss": 0.4494,
245
+ "rewards/accuracies": 0.762499988079071,
246
+ "rewards/chosen": -0.8117995262145996,
247
+ "rewards/margins": 0.9228641390800476,
248
+ "rewards/rejected": -1.734663724899292,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.18,
253
+ "learning_rate": 4.901999692863326e-07,
254
+ "logits/chosen": -2.2583577632904053,
255
+ "logits/rejected": -2.2541985511779785,
256
+ "logps/chosen": -421.876708984375,
257
+ "logps/rejected": -400.15728759765625,
258
+ "loss": 0.4724,
259
+ "rewards/accuracies": 0.699999988079071,
260
+ "rewards/chosen": -1.0339548587799072,
261
+ "rewards/margins": 0.9418285489082336,
262
+ "rewards/rejected": -1.9757835865020752,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.19,
267
+ "learning_rate": 4.872702970909464e-07,
268
+ "logits/chosen": -2.2731730937957764,
269
+ "logits/rejected": -2.218116283416748,
270
+ "logps/chosen": -405.0645446777344,
271
+ "logps/rejected": -364.96832275390625,
272
+ "loss": 0.4362,
273
+ "rewards/accuracies": 0.800000011920929,
274
+ "rewards/chosen": -0.658772349357605,
275
+ "rewards/margins": 1.0431811809539795,
276
+ "rewards/rejected": -1.7019535303115845,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.2,
281
+ "learning_rate": 4.839692218542131e-07,
282
+ "logits/chosen": -2.172924757003784,
283
+ "logits/rejected": -2.1244688034057617,
284
+ "logps/chosen": -403.5210266113281,
285
+ "logps/rejected": -400.0758361816406,
286
+ "loss": 0.4438,
287
+ "rewards/accuracies": 0.793749988079071,
288
+ "rewards/chosen": -0.9747382998466492,
289
+ "rewards/margins": 1.138419508934021,
290
+ "rewards/rejected": -2.1131579875946045,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.22,
295
+ "learning_rate": 4.803019108026997e-07,
296
+ "logits/chosen": -2.0333516597747803,
297
+ "logits/rejected": -2.007887840270996,
298
+ "logps/chosen": -449.2581481933594,
299
+ "logps/rejected": -425.9677734375,
300
+ "loss": 0.4446,
301
+ "rewards/accuracies": 0.793749988079071,
302
+ "rewards/chosen": -1.2398478984832764,
303
+ "rewards/margins": 1.2218430042266846,
304
+ "rewards/rejected": -2.46169114112854,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.23,
309
+ "learning_rate": 4.7627410443782887e-07,
310
+ "logits/chosen": -2.0739877223968506,
311
+ "logits/rejected": -2.035784959793091,
312
+ "logps/chosen": -468.93829345703125,
313
+ "logps/rejected": -442.99249267578125,
314
+ "loss": 0.4709,
315
+ "rewards/accuracies": 0.762499988079071,
316
+ "rewards/chosen": -1.4709949493408203,
317
+ "rewards/margins": 0.9461126327514648,
318
+ "rewards/rejected": -2.417107582092285,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.23,
323
+ "eval_logits/chosen": -2.172028064727783,
324
+ "eval_logits/rejected": -2.151461601257324,
325
+ "eval_logps/chosen": -370.87158203125,
326
+ "eval_logps/rejected": -433.51214599609375,
327
+ "eval_loss": 0.560804545879364,
328
+ "eval_rewards/accuracies": 0.7109375,
329
+ "eval_rewards/chosen": -1.138319492340088,
330
+ "eval_rewards/margins": 0.6232693195343018,
331
+ "eval_rewards/rejected": -1.7615886926651,
332
+ "eval_runtime": 52.9918,
333
+ "eval_samples_per_second": 37.742,
334
+ "eval_steps_per_second": 0.604,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.24,
339
+ "learning_rate": 4.7189210755018034e-07,
340
+ "logits/chosen": -2.178704261779785,
341
+ "logits/rejected": -2.1404006481170654,
342
+ "logps/chosen": -413.65814208984375,
343
+ "logps/rejected": -374.3295593261719,
344
+ "loss": 0.4607,
345
+ "rewards/accuracies": 0.78125,
346
+ "rewards/chosen": -0.8382428884506226,
347
+ "rewards/margins": 1.006833791732788,
348
+ "rewards/rejected": -1.8450767993927002,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.25,
353
+ "learning_rate": 4.671627793504988e-07,
354
+ "logits/chosen": -2.1911635398864746,
355
+ "logits/rejected": -2.098259210586548,
356
+ "logps/chosen": -445.24298095703125,
357
+ "logps/rejected": -407.4277038574219,
358
+ "loss": 0.4574,
359
+ "rewards/accuracies": 0.793749988079071,
360
+ "rewards/chosen": -1.1178652048110962,
361
+ "rewards/margins": 1.1009081602096558,
362
+ "rewards/rejected": -2.218773365020752,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.26,
367
+ "learning_rate": 4.6209352273286095e-07,
368
+ "logits/chosen": -2.0867063999176025,
369
+ "logits/rejected": -2.0185961723327637,
370
+ "logps/chosen": -440.40093994140625,
371
+ "logps/rejected": -437.75811767578125,
372
+ "loss": 0.4422,
373
+ "rewards/accuracies": 0.8187500238418579,
374
+ "rewards/chosen": -1.0934244394302368,
375
+ "rewards/margins": 1.364727258682251,
376
+ "rewards/rejected": -2.4581520557403564,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.27,
381
+ "learning_rate": 4.56692272686805e-07,
382
+ "logits/chosen": -2.0416159629821777,
383
+ "logits/rejected": -1.9667298793792725,
384
+ "logps/chosen": -497.56610107421875,
385
+ "logps/rejected": -434.28863525390625,
386
+ "loss": 0.4247,
387
+ "rewards/accuracies": 0.800000011920929,
388
+ "rewards/chosen": -1.3035621643066406,
389
+ "rewards/margins": 1.2145717144012451,
390
+ "rewards/rejected": -2.5181336402893066,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.28,
395
+ "learning_rate": 4.5096748387656326e-07,
396
+ "logits/chosen": -2.1263952255249023,
397
+ "logits/rejected": -2.0554358959198,
398
+ "logps/chosen": -489.86602783203125,
399
+ "logps/rejected": -454.3324279785156,
400
+ "loss": 0.4466,
401
+ "rewards/accuracies": 0.824999988079071,
402
+ "rewards/chosen": -1.5867921113967896,
403
+ "rewards/margins": 1.2441270351409912,
404
+ "rewards/rejected": -2.830918788909912,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.29,
409
+ "learning_rate": 4.4492811740683877e-07,
410
+ "logits/chosen": -1.9983489513397217,
411
+ "logits/rejected": -1.9614261388778687,
412
+ "logps/chosen": -488.27056884765625,
413
+ "logps/rejected": -473.0369567871094,
414
+ "loss": 0.4341,
415
+ "rewards/accuracies": 0.7562500238418579,
416
+ "rewards/chosen": -1.4624744653701782,
417
+ "rewards/margins": 1.2226989269256592,
418
+ "rewards/rejected": -2.685173511505127,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.31,
423
+ "learning_rate": 4.3858362679584354e-07,
424
+ "logits/chosen": -2.133930206298828,
425
+ "logits/rejected": -2.0397889614105225,
426
+ "logps/chosen": -483.59979248046875,
427
+ "logps/rejected": -459.24053955078125,
428
+ "loss": 0.4415,
429
+ "rewards/accuracies": 0.737500011920929,
430
+ "rewards/chosen": -1.6541706323623657,
431
+ "rewards/margins": 1.0623505115509033,
432
+ "rewards/rejected": -2.7165210247039795,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.32,
437
+ "learning_rate": 4.3194394317755245e-07,
438
+ "logits/chosen": -2.099963426589966,
439
+ "logits/rejected": -2.0109364986419678,
440
+ "logps/chosen": -519.8792724609375,
441
+ "logps/rejected": -501.48187255859375,
442
+ "loss": 0.4303,
443
+ "rewards/accuracies": 0.7875000238418579,
444
+ "rewards/chosen": -1.325263261795044,
445
+ "rewards/margins": 1.4841763973236084,
446
+ "rewards/rejected": -2.8094398975372314,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.33,
451
+ "learning_rate": 4.2501945975633914e-07,
452
+ "logits/chosen": -2.1019272804260254,
453
+ "logits/rejected": -1.9885908365249634,
454
+ "logps/chosen": -519.796630859375,
455
+ "logps/rejected": -514.5078125,
456
+ "loss": 0.4384,
457
+ "rewards/accuracies": 0.75,
458
+ "rewards/chosen": -1.6320844888687134,
459
+ "rewards/margins": 1.3468807935714722,
460
+ "rewards/rejected": -2.9789652824401855,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.34,
465
+ "learning_rate": 4.1782101553832405e-07,
466
+ "logits/chosen": -2.0356638431549072,
467
+ "logits/rejected": -1.985108733177185,
468
+ "logps/chosen": -473.4766540527344,
469
+ "logps/rejected": -430.5616149902344,
470
+ "loss": 0.4289,
471
+ "rewards/accuracies": 0.8125,
472
+ "rewards/chosen": -1.208434820175171,
473
+ "rewards/margins": 1.417704701423645,
474
+ "rewards/rejected": -2.6261394023895264,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.34,
479
+ "eval_logits/chosen": -2.1203949451446533,
480
+ "eval_logits/rejected": -2.0882043838500977,
481
+ "eval_logps/chosen": -411.08111572265625,
482
+ "eval_logps/rejected": -496.93798828125,
483
+ "eval_loss": 0.5292558670043945,
484
+ "eval_rewards/accuracies": 0.75390625,
485
+ "eval_rewards/chosen": -1.5404143333435059,
486
+ "eval_rewards/margins": 0.8554330468177795,
487
+ "eval_rewards/rejected": -2.3958473205566406,
488
+ "eval_runtime": 52.9556,
489
+ "eval_samples_per_second": 37.767,
490
+ "eval_steps_per_second": 0.604,
491
  "step": 300
492
  },
493
  {
494
+ "epoch": 0.35,
495
+ "learning_rate": 4.103598783649029e-07,
496
+ "logits/chosen": -2.0809109210968018,
497
+ "logits/rejected": -2.0090012550354004,
498
+ "logps/chosen": -435.18414306640625,
499
+ "logps/rejected": -414.2064514160156,
500
+ "loss": 0.4236,
501
+ "rewards/accuracies": 0.8062499761581421,
502
+ "rewards/chosen": -1.0976500511169434,
503
+ "rewards/margins": 1.1829910278320312,
504
+ "rewards/rejected": -2.2806410789489746,
505
  "step": 310
506
  },
507
  {
508
+ "epoch": 0.36,
509
+ "learning_rate": 4.026477272750119e-07,
510
+ "logits/chosen": -2.0164308547973633,
511
+ "logits/rejected": -1.9634422063827515,
512
+ "logps/chosen": -502.15911865234375,
513
+ "logps/rejected": -492.40350341796875,
514
+ "loss": 0.454,
515
+ "rewards/accuracies": 0.768750011920929,
516
+ "rewards/chosen": -1.72931706905365,
517
+ "rewards/margins": 1.1704776287078857,
518
+ "rewards/rejected": -2.8997950553894043,
519
  "step": 320
520
  },
521
  {
522
+ "epoch": 0.37,
523
+ "learning_rate": 3.9469663422373864e-07,
524
+ "logits/chosen": -2.0206305980682373,
525
+ "logits/rejected": -1.9137179851531982,
526
+ "logps/chosen": -533.25537109375,
527
+ "logps/rejected": -506.52899169921875,
528
+ "loss": 0.4423,
529
+ "rewards/accuracies": 0.800000011920929,
530
+ "rewards/chosen": -1.628199815750122,
531
+ "rewards/margins": 1.3168383836746216,
532
+ "rewards/rejected": -2.945038318634033,
533
  "step": 330
534
  },
535
  {
536
+ "epoch": 0.38,
537
+ "learning_rate": 3.865190451858954e-07,
538
+ "logits/chosen": -1.9749759435653687,
539
+ "logits/rejected": -1.9049227237701416,
540
+ "logps/chosen": -466.3131408691406,
541
+ "logps/rejected": -443.3275451660156,
542
+ "loss": 0.4345,
543
+ "rewards/accuracies": 0.7562500238418579,
544
+ "rewards/chosen": -1.4465491771697998,
545
+ "rewards/margins": 1.1582015752792358,
546
+ "rewards/rejected": -2.604750394821167,
547
  "step": 340
548
  },
549
  {
550
+ "epoch": 0.4,
551
+ "learning_rate": 3.781277606741327e-07,
552
+ "logits/chosen": -1.8896055221557617,
553
+ "logits/rejected": -1.6728490591049194,
554
+ "logps/chosen": -479.135986328125,
555
+ "logps/rejected": -470.610107421875,
556
+ "loss": 0.4149,
557
+ "rewards/accuracies": 0.856249988079071,
558
+ "rewards/chosen": -1.2967784404754639,
559
+ "rewards/margins": 1.580676794052124,
560
+ "rewards/rejected": -2.877455234527588,
561
  "step": 350
562
  },
563
  {
564
+ "epoch": 0.41,
565
+ "learning_rate": 3.6953591570208996e-07,
566
+ "logits/chosen": -1.6949100494384766,
567
+ "logits/rejected": -1.4142816066741943,
568
+ "logps/chosen": -497.4435119628906,
569
+ "logps/rejected": -502.25799560546875,
570
+ "loss": 0.439,
571
+ "rewards/accuracies": 0.7124999761581421,
572
+ "rewards/chosen": -1.900599479675293,
573
+ "rewards/margins": 1.1688846349716187,
574
+ "rewards/rejected": -3.069484233856201,
575
  "step": 360
576
  },
577
  {
578
+ "epoch": 0.42,
579
+ "learning_rate": 3.607569592239452e-07,
580
+ "logits/chosen": -1.7967230081558228,
581
+ "logits/rejected": -1.6560561656951904,
582
+ "logps/chosen": -447.05877685546875,
583
+ "logps/rejected": -430.55865478515625,
584
+ "loss": 0.4419,
585
+ "rewards/accuracies": 0.75,
586
+ "rewards/chosen": -1.2963721752166748,
587
+ "rewards/margins": 1.0919477939605713,
588
+ "rewards/rejected": -2.388319730758667,
589
  "step": 370
590
  },
591
  {
592
+ "epoch": 0.43,
593
+ "learning_rate": 3.518046330825494e-07,
594
+ "logits/chosen": -1.5726587772369385,
595
+ "logits/rejected": -1.3357789516448975,
596
+ "logps/chosen": -474.6183166503906,
597
+ "logps/rejected": -467.42144775390625,
598
+ "loss": 0.4082,
599
+ "rewards/accuracies": 0.78125,
600
+ "rewards/chosen": -1.395471453666687,
601
+ "rewards/margins": 1.4275726079940796,
602
+ "rewards/rejected": -2.8230443000793457,
603
  "step": 380
604
  },
605
  {
606
+ "epoch": 0.44,
607
+ "learning_rate": 3.4269295049909713e-07,
608
+ "logits/chosen": -1.300771951675415,
609
+ "logits/rejected": -1.0319937467575073,
610
+ "logps/chosen": -538.4091796875,
611
+ "logps/rejected": -550.3985595703125,
612
+ "loss": 0.4245,
613
+ "rewards/accuracies": 0.8062499761581421,
614
+ "rewards/chosen": -2.087444305419922,
615
+ "rewards/margins": 1.5177741050720215,
616
+ "rewards/rejected": -3.6052188873291016,
617
  "step": 390
618
  },
619
  {
620
+ "epoch": 0.45,
621
+ "learning_rate": 3.3343617413800453e-07,
622
+ "logits/chosen": -1.3637840747833252,
623
+ "logits/rejected": -1.0050981044769287,
624
+ "logps/chosen": -517.8087768554688,
625
+ "logps/rejected": -541.1546020507812,
626
+ "loss": 0.4195,
627
+ "rewards/accuracies": 0.800000011920929,
628
+ "rewards/chosen": -1.8266891241073608,
629
+ "rewards/margins": 1.5495359897613525,
630
+ "rewards/rejected": -3.3762245178222656,
631
  "step": 400
632
  },
633
  {
634
+ "epoch": 0.45,
635
+ "eval_logits/chosen": -1.297582983970642,
636
+ "eval_logits/rejected": -1.0570608377456665,
637
+ "eval_logps/chosen": -436.197021484375,
638
+ "eval_logps/rejected": -547.3040771484375,
639
+ "eval_loss": 0.509623646736145,
640
+ "eval_rewards/accuracies": 0.78125,
641
+ "eval_rewards/chosen": -1.7915737628936768,
642
+ "eval_rewards/margins": 1.1079347133636475,
643
+ "eval_rewards/rejected": -2.899508476257324,
644
+ "eval_runtime": 52.9183,
645
+ "eval_samples_per_second": 37.794,
646
+ "eval_steps_per_second": 0.605,
647
  "step": 400
648
  },
649
+ {
650
+ "epoch": 0.46,
651
+ "learning_rate": 3.2404879378132893e-07,
652
+ "logits/chosen": -1.4247735738754272,
653
+ "logits/rejected": -1.0035231113433838,
654
+ "logps/chosen": -466.69964599609375,
655
+ "logps/rejected": -445.894287109375,
656
+ "loss": 0.415,
657
+ "rewards/accuracies": 0.8374999761581421,
658
+ "rewards/chosen": -1.3474479913711548,
659
+ "rewards/margins": 1.4210073947906494,
660
+ "rewards/rejected": -2.768455743789673,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.48,
665
+ "learning_rate": 3.1454550364767894e-07,
666
+ "logits/chosen": -1.5311955213546753,
667
+ "logits/rejected": -1.2638423442840576,
668
+ "logps/chosen": -522.9796142578125,
669
+ "logps/rejected": -500.9322814941406,
670
+ "loss": 0.4131,
671
+ "rewards/accuracies": 0.8125,
672
+ "rewards/chosen": -1.5469942092895508,
673
+ "rewards/margins": 1.44197416305542,
674
+ "rewards/rejected": -2.9889683723449707,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.49,
679
+ "learning_rate": 3.049411793911154e-07,
680
+ "logits/chosen": -1.5343776941299438,
681
+ "logits/rejected": -1.0935866832733154,
682
+ "logps/chosen": -515.9620971679688,
683
+ "logps/rejected": -521.0631103515625,
684
+ "loss": 0.4101,
685
+ "rewards/accuracies": 0.7749999761581421,
686
+ "rewards/chosen": -1.6215946674346924,
687
+ "rewards/margins": 1.5825344324111938,
688
+ "rewards/rejected": -3.204129457473755,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.5,
693
+ "learning_rate": 2.9525085481604914e-07,
694
+ "logits/chosen": -1.2731831073760986,
695
+ "logits/rejected": -0.8194789886474609,
696
+ "logps/chosen": -521.5642700195312,
697
+ "logps/rejected": -497.09075927734375,
698
+ "loss": 0.4035,
699
+ "rewards/accuracies": 0.8687499761581421,
700
+ "rewards/chosen": -1.4858245849609375,
701
+ "rewards/margins": 1.8156074285507202,
702
+ "rewards/rejected": -3.301431655883789,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.51,
707
+ "learning_rate": 2.854896983445833e-07,
708
+ "logits/chosen": -1.4735174179077148,
709
+ "logits/rejected": -1.1519749164581299,
710
+ "logps/chosen": -523.7742919921875,
711
+ "logps/rejected": -497.34075927734375,
712
+ "loss": 0.4077,
713
+ "rewards/accuracies": 0.8062499761581421,
714
+ "rewards/chosen": -1.5442702770233154,
715
+ "rewards/margins": 1.5848388671875,
716
+ "rewards/rejected": -3.1291093826293945,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.52,
721
+ "learning_rate": 2.7567298927313654e-07,
722
+ "logits/chosen": -1.3687413930892944,
723
+ "logits/rejected": -1.104038953781128,
724
+ "logps/chosen": -545.3075561523438,
725
+ "logps/rejected": -563.0989990234375,
726
+ "loss": 0.4223,
727
+ "rewards/accuracies": 0.7875000238418579,
728
+ "rewards/chosen": -1.9977827072143555,
729
+ "rewards/margins": 1.3200435638427734,
730
+ "rewards/rejected": -3.317826509475708,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.53,
735
+ "learning_rate": 2.658160938555123e-07,
736
+ "logits/chosen": -1.1723263263702393,
737
+ "logits/rejected": -0.8053513765335083,
738
+ "logps/chosen": -516.1744995117188,
739
+ "logps/rejected": -529.2353515625,
740
+ "loss": 0.4353,
741
+ "rewards/accuracies": 0.7562500238418579,
742
+ "rewards/chosen": -2.1037230491638184,
743
+ "rewards/margins": 1.373581886291504,
744
+ "rewards/rejected": -3.4773049354553223,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.54,
749
+ "learning_rate": 2.559344412498532e-07,
750
+ "logits/chosen": -1.3750767707824707,
751
+ "logits/rejected": -1.1043331623077393,
752
+ "logps/chosen": -494.09619140625,
753
+ "logps/rejected": -510.038818359375,
754
+ "loss": 0.4033,
755
+ "rewards/accuracies": 0.7875000238418579,
756
+ "rewards/chosen": -1.6110206842422485,
757
+ "rewards/margins": 1.3316563367843628,
758
+ "rewards/rejected": -2.9426772594451904,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.55,
763
+ "learning_rate": 2.460434993671294e-07,
764
+ "logits/chosen": -1.3292906284332275,
765
+ "logits/rejected": -0.969641387462616,
766
+ "logps/chosen": -499.32977294921875,
767
+ "logps/rejected": -508.908447265625,
768
+ "loss": 0.4074,
769
+ "rewards/accuracies": 0.800000011920929,
770
+ "rewards/chosen": -1.8106292486190796,
771
+ "rewards/margins": 1.3505693674087524,
772
+ "rewards/rejected": -3.161198616027832,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.57,
777
+ "learning_rate": 2.361587506589672e-07,
778
+ "logits/chosen": -1.271968126296997,
779
+ "logits/rejected": -0.9058429002761841,
780
+ "logps/chosen": -505.27294921875,
781
+ "logps/rejected": -511.2696228027344,
782
+ "loss": 0.3891,
783
+ "rewards/accuracies": 0.793749988079071,
784
+ "rewards/chosen": -1.695939064025879,
785
+ "rewards/margins": 1.5928833484649658,
786
+ "rewards/rejected": -3.288822650909424,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.57,
791
+ "eval_logits/chosen": -1.1313989162445068,
792
+ "eval_logits/rejected": -0.8608421087265015,
793
+ "eval_logps/chosen": -517.5072021484375,
794
+ "eval_logps/rejected": -649.901611328125,
795
+ "eval_loss": 0.5085692405700684,
796
+ "eval_rewards/accuracies": 0.78125,
797
+ "eval_rewards/chosen": -2.60467529296875,
798
+ "eval_rewards/margins": 1.3208080530166626,
799
+ "eval_rewards/rejected": -3.925483226776123,
800
+ "eval_runtime": 52.952,
801
+ "eval_samples_per_second": 37.77,
802
+ "eval_steps_per_second": 0.604,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.58,
807
+ "learning_rate": 2.2629566788271613e-07,
808
+ "logits/chosen": -1.1410466432571411,
809
+ "logits/rejected": -0.7384732365608215,
810
+ "logps/chosen": -585.402099609375,
811
+ "logps/rejected": -608.8575439453125,
812
+ "loss": 0.402,
813
+ "rewards/accuracies": 0.8125,
814
+ "rewards/chosen": -2.134608268737793,
815
+ "rewards/margins": 2.0132174491882324,
816
+ "rewards/rejected": -4.147825717926025,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.59,
821
+ "learning_rate": 2.1646968988169135e-07,
822
+ "logits/chosen": -1.249940276145935,
823
+ "logits/rejected": -0.7865885496139526,
824
+ "logps/chosen": -539.4969482421875,
825
+ "logps/rejected": -514.3495483398438,
826
+ "loss": 0.3732,
827
+ "rewards/accuracies": 0.8374999761581421,
828
+ "rewards/chosen": -1.8132226467132568,
829
+ "rewards/margins": 1.6672155857086182,
830
+ "rewards/rejected": -3.480438232421875,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.6,
835
+ "learning_rate": 2.0669619741850232e-07,
836
+ "logits/chosen": -1.2208908796310425,
837
+ "logits/rejected": -0.6407105922698975,
838
+ "logps/chosen": -574.2348022460938,
839
+ "logps/rejected": -547.8206787109375,
840
+ "loss": 0.3974,
841
+ "rewards/accuracies": 0.824999988079071,
842
+ "rewards/chosen": -1.831973671913147,
843
+ "rewards/margins": 1.7614845037460327,
844
+ "rewards/rejected": -3.5934581756591797,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.61,
849
+ "learning_rate": 1.9699048909929518e-07,
850
+ "logits/chosen": -1.1419695615768433,
851
+ "logits/rejected": -0.7198764681816101,
852
+ "logps/chosen": -544.6316528320312,
853
+ "logps/rejected": -555.2297973632812,
854
+ "loss": 0.4004,
855
+ "rewards/accuracies": 0.84375,
856
+ "rewards/chosen": -1.7934131622314453,
857
+ "rewards/margins": 1.6694806814193726,
858
+ "rewards/rejected": -3.4628939628601074,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.62,
863
+ "learning_rate": 1.8736775742659732e-07,
864
+ "logits/chosen": -1.162408471107483,
865
+ "logits/rejected": -0.6814004182815552,
866
+ "logps/chosen": -485.46234130859375,
867
+ "logps/rejected": -522.3754272460938,
868
+ "loss": 0.3946,
869
+ "rewards/accuracies": 0.824999988079071,
870
+ "rewards/chosen": -1.6819603443145752,
871
+ "rewards/margins": 1.7173573970794678,
872
+ "rewards/rejected": -3.399317979812622,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.63,
877
+ "learning_rate": 1.7784306501824616e-07,
878
+ "logits/chosen": -1.2207626104354858,
879
+ "logits/rejected": -0.8465560674667358,
880
+ "logps/chosen": -520.7455444335938,
881
+ "logps/rejected": -532.8163452148438,
882
+ "loss": 0.3927,
883
+ "rewards/accuracies": 0.78125,
884
+ "rewards/chosen": -1.7277084589004517,
885
+ "rewards/margins": 1.5865801572799683,
886
+ "rewards/rejected": -3.31428861618042,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.65,
891
+ "learning_rate": 1.6843132102963025e-07,
892
+ "logits/chosen": -1.1111127138137817,
893
+ "logits/rejected": -0.8249386548995972,
894
+ "logps/chosen": -499.12030029296875,
895
+ "logps/rejected": -508.18719482421875,
896
+ "loss": 0.4308,
897
+ "rewards/accuracies": 0.762499988079071,
898
+ "rewards/chosen": -1.8714163303375244,
899
+ "rewards/margins": 1.3305481672286987,
900
+ "rewards/rejected": -3.2019646167755127,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.66,
905
+ "learning_rate": 1.591472578161458e-07,
906
+ "logits/chosen": -1.1796176433563232,
907
+ "logits/rejected": -0.8019029498100281,
908
+ "logps/chosen": -501.35992431640625,
909
+ "logps/rejected": -508.736572265625,
910
+ "loss": 0.3972,
911
+ "rewards/accuracies": 0.8125,
912
+ "rewards/chosen": -1.7980735301971436,
913
+ "rewards/margins": 1.468266248703003,
914
+ "rewards/rejected": -3.2663397789001465,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.67,
919
+ "learning_rate": 1.5000540787240274e-07,
920
+ "logits/chosen": -0.9903499484062195,
921
+ "logits/rejected": -0.6644443273544312,
922
+ "logps/chosen": -504.76409912109375,
923
+ "logps/rejected": -518.8839721679688,
924
+ "loss": 0.3749,
925
+ "rewards/accuracies": 0.800000011920929,
926
+ "rewards/chosen": -1.8273483514785767,
927
+ "rewards/margins": 1.4342434406280518,
928
+ "rewards/rejected": -3.261591672897339,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.68,
933
+ "learning_rate": 1.410200810842749e-07,
934
+ "logits/chosen": -0.9152101278305054,
935
+ "logits/rejected": -0.43462666869163513,
936
+ "logps/chosen": -494.7369689941406,
937
+ "logps/rejected": -552.8912353515625,
938
+ "loss": 0.4182,
939
+ "rewards/accuracies": 0.800000011920929,
940
+ "rewards/chosen": -1.983428955078125,
941
+ "rewards/margins": 1.5005247592926025,
942
+ "rewards/rejected": -3.4839534759521484,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.68,
947
+ "eval_logits/chosen": -0.838397741317749,
948
+ "eval_logits/rejected": -0.4354328513145447,
949
+ "eval_logps/chosen": -506.7194519042969,
950
+ "eval_logps/rejected": -636.9742431640625,
951
+ "eval_loss": 0.49758800864219666,
952
+ "eval_rewards/accuracies": 0.76953125,
953
+ "eval_rewards/chosen": -2.496797561645508,
954
+ "eval_rewards/margins": 1.2994122505187988,
955
+ "eval_rewards/rejected": -3.7962098121643066,
956
+ "eval_runtime": 52.9477,
957
+ "eval_samples_per_second": 37.773,
958
+ "eval_steps_per_second": 0.604,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.69,
963
+ "learning_rate": 1.322053423294041e-07,
964
+ "logits/chosen": -0.7785909175872803,
965
+ "logits/rejected": -0.1633441299200058,
966
+ "logps/chosen": -553.189697265625,
967
+ "logps/rejected": -588.4357299804688,
968
+ "loss": 0.386,
969
+ "rewards/accuracies": 0.8187500238418579,
970
+ "rewards/chosen": -2.27974009513855,
971
+ "rewards/margins": 1.8430849313735962,
972
+ "rewards/rejected": -4.122824668884277,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.7,
977
+ "learning_rate": 1.2357498946121905e-07,
978
+ "logits/chosen": -0.7135148644447327,
979
+ "logits/rejected": -0.3086184859275818,
980
+ "logps/chosen": -552.1744384765625,
981
+ "logps/rejected": -593.6995849609375,
982
+ "loss": 0.3885,
983
+ "rewards/accuracies": 0.800000011920929,
984
+ "rewards/chosen": -2.253349781036377,
985
+ "rewards/margins": 1.7079604864120483,
986
+ "rewards/rejected": -3.9613101482391357,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.71,
991
+ "learning_rate": 1.1514253171093161e-07,
992
+ "logits/chosen": -0.8449075818061829,
993
+ "logits/rejected": -0.3063100278377533,
994
+ "logps/chosen": -538.8817749023438,
995
+ "logps/rejected": -539.5289306640625,
996
+ "loss": 0.4102,
997
+ "rewards/accuracies": 0.800000011920929,
998
+ "rewards/chosen": -2.042537212371826,
999
+ "rewards/margins": 1.56886887550354,
1000
+ "rewards/rejected": -3.611405611038208,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.72,
1005
+ "learning_rate": 1.0692116854131883e-07,
1006
+ "logits/chosen": -0.7637182474136353,
1007
+ "logits/rejected": -0.20512516796588898,
1008
+ "logps/chosen": -576.058349609375,
1009
+ "logps/rejected": -590.4248046875,
1010
+ "loss": 0.4046,
1011
+ "rewards/accuracies": 0.8374999761581421,
1012
+ "rewards/chosen": -2.343686580657959,
1013
+ "rewards/margins": 1.6815674304962158,
1014
+ "rewards/rejected": -4.025254249572754,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.74,
1019
+ "learning_rate": 9.89237689853889e-08,
1020
+ "logits/chosen": -0.8669427633285522,
1021
+ "logits/rejected": -0.28752464056015015,
1022
+ "logps/chosen": -585.344482421875,
1023
+ "logps/rejected": -613.337646484375,
1024
+ "loss": 0.3747,
1025
+ "rewards/accuracies": 0.8062499761581421,
1026
+ "rewards/chosen": -2.232487440109253,
1027
+ "rewards/margins": 1.73616623878479,
1028
+ "rewards/rejected": -3.968653917312622,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.75,
1033
+ "learning_rate": 9.11628515022765e-08,
1034
+ "logits/chosen": -0.8596148490905762,
1035
+ "logits/rejected": -0.15762929618358612,
1036
+ "logps/chosen": -537.1088256835938,
1037
+ "logps/rejected": -582.6017456054688,
1038
+ "loss": 0.3997,
1039
+ "rewards/accuracies": 0.8062499761581421,
1040
+ "rewards/chosen": -2.121387481689453,
1041
+ "rewards/margins": 1.9997097253799438,
1042
+ "rewards/rejected": -4.121097087860107,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.76,
1047
+ "learning_rate": 8.365056438189486e-08,
1048
+ "logits/chosen": -0.9106462597846985,
1049
+ "logits/rejected": -0.3486366868019104,
1050
+ "logps/chosen": -603.9283447265625,
1051
+ "logps/rejected": -620.8770141601562,
1052
+ "loss": 0.394,
1053
+ "rewards/accuracies": 0.8187500238418579,
1054
+ "rewards/chosen": -2.358280897140503,
1055
+ "rewards/margins": 1.6525799036026,
1056
+ "rewards/rejected": -4.010860919952393,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.77,
1061
+ "learning_rate": 7.639866672902101e-08,
1062
+ "logits/chosen": -0.8860856294631958,
1063
+ "logits/rejected": -0.19501088559627533,
1064
+ "logps/chosen": -558.152587890625,
1065
+ "logps/rejected": -582.1339721679688,
1066
+ "loss": 0.3877,
1067
+ "rewards/accuracies": 0.84375,
1068
+ "rewards/chosen": -2.326817750930786,
1069
+ "rewards/margins": 1.7357206344604492,
1070
+ "rewards/rejected": -4.0625386238098145,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.78,
1075
+ "learning_rate": 6.941851005657851e-08,
1076
+ "logits/chosen": -0.7948504686355591,
1077
+ "logits/rejected": -0.22474519908428192,
1078
+ "logps/chosen": -547.021484375,
1079
+ "logps/rejected": -597.5274047851562,
1080
+ "loss": 0.3979,
1081
+ "rewards/accuracies": 0.8062499761581421,
1082
+ "rewards/chosen": -2.1950807571411133,
1083
+ "rewards/margins": 1.8335994482040405,
1084
+ "rewards/rejected": -4.028680324554443,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.79,
1089
+ "learning_rate": 6.272102051693051e-08,
1090
+ "logits/chosen": -0.7106924057006836,
1091
+ "logits/rejected": -0.11677990108728409,
1092
+ "logps/chosen": -548.4378662109375,
1093
+ "logps/rejected": -583.5083618164062,
1094
+ "loss": 0.3845,
1095
+ "rewards/accuracies": 0.824999988079071,
1096
+ "rewards/chosen": -2.1655876636505127,
1097
+ "rewards/margins": 1.7359482049942017,
1098
+ "rewards/rejected": -3.901536226272583,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.79,
1103
+ "eval_logits/chosen": -0.7200266718864441,
1104
+ "eval_logits/rejected": -0.2825911045074463,
1105
+ "eval_logps/chosen": -526.7998657226562,
1106
+ "eval_logps/rejected": -658.1885375976562,
1107
+ "eval_loss": 0.4966849386692047,
1108
+ "eval_rewards/accuracies": 0.76953125,
1109
+ "eval_rewards/chosen": -2.6976022720336914,
1110
+ "eval_rewards/margins": 1.3107508420944214,
1111
+ "eval_rewards/rejected": -4.008352756500244,
1112
+ "eval_runtime": 52.9036,
1113
+ "eval_samples_per_second": 37.805,
1114
+ "eval_steps_per_second": 0.605,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.8,
1119
+ "learning_rate": 5.6316681798995844e-08,
1120
+ "logits/chosen": -0.7543665170669556,
1121
+ "logits/rejected": -0.3065817952156067,
1122
+ "logps/chosen": -570.3900146484375,
1123
+ "logps/rejected": -601.1921997070312,
1124
+ "loss": 0.3962,
1125
+ "rewards/accuracies": 0.793749988079071,
1126
+ "rewards/chosen": -2.327669858932495,
1127
+ "rewards/margins": 1.7061312198638916,
1128
+ "rewards/rejected": -4.033801555633545,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.81,
1133
+ "learning_rate": 5.0215518717961256e-08,
1134
+ "logits/chosen": -0.8484760522842407,
1135
+ "logits/rejected": -0.2536430358886719,
1136
+ "logps/chosen": -571.7559204101562,
1137
+ "logps/rejected": -592.8436889648438,
1138
+ "loss": 0.3922,
1139
+ "rewards/accuracies": 0.793749988079071,
1140
+ "rewards/chosen": -2.2247302532196045,
1141
+ "rewards/margins": 1.9966415166854858,
1142
+ "rewards/rejected": -4.221371650695801,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.83,
1147
+ "learning_rate": 4.4427081523275925e-08,
1148
+ "logits/chosen": -0.8367404937744141,
1149
+ "logits/rejected": -0.18715055286884308,
1150
+ "logps/chosen": -587.681884765625,
1151
+ "logps/rejected": -586.436279296875,
1152
+ "loss": 0.3943,
1153
+ "rewards/accuracies": 0.762499988079071,
1154
+ "rewards/chosen": -2.551619291305542,
1155
+ "rewards/margins": 1.5289732217788696,
1156
+ "rewards/rejected": -4.080592155456543,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 0.84,
1161
+ "learning_rate": 3.896043094949061e-08,
1162
+ "logits/chosen": -0.7256150245666504,
1163
+ "logits/rejected": -0.2999003529548645,
1164
+ "logps/chosen": -577.1644897460938,
1165
+ "logps/rejected": -619.8590698242188,
1166
+ "loss": 0.3992,
1167
+ "rewards/accuracies": 0.824999988079071,
1168
+ "rewards/chosen": -2.4170877933502197,
1169
+ "rewards/margins": 1.7640550136566162,
1170
+ "rewards/rejected": -4.181142330169678,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.85,
1175
+ "learning_rate": 3.3824124033343557e-08,
1176
+ "logits/chosen": -0.7496400475502014,
1177
+ "logits/rejected": -0.18662114441394806,
1178
+ "logps/chosen": -579.8140869140625,
1179
+ "logps/rejected": -607.1034545898438,
1180
+ "loss": 0.4029,
1181
+ "rewards/accuracies": 0.856249988079071,
1182
+ "rewards/chosen": -2.3119702339172363,
1183
+ "rewards/margins": 1.9123681783676147,
1184
+ "rewards/rejected": -4.224338531494141,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 0.86,
1189
+ "learning_rate": 2.9026200719291904e-08,
1190
+ "logits/chosen": -0.8657780885696411,
1191
+ "logits/rejected": -0.27606701850891113,
1192
+ "logps/chosen": -584.1732177734375,
1193
+ "logps/rejected": -619.30859375,
1194
+ "loss": 0.3828,
1195
+ "rewards/accuracies": 0.824999988079071,
1196
+ "rewards/chosen": -2.2997066974639893,
1197
+ "rewards/margins": 1.865792989730835,
1198
+ "rewards/rejected": -4.165499687194824,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.87,
1203
+ "learning_rate": 2.4574171274456433e-08,
1204
+ "logits/chosen": -0.7996637225151062,
1205
+ "logits/rejected": -0.18977174162864685,
1206
+ "logps/chosen": -581.4425659179688,
1207
+ "logps/rejected": -592.1394653320312,
1208
+ "loss": 0.3729,
1209
+ "rewards/accuracies": 0.8187500238418579,
1210
+ "rewards/chosen": -2.2692534923553467,
1211
+ "rewards/margins": 1.7808440923690796,
1212
+ "rewards/rejected": -4.050097942352295,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.88,
1217
+ "learning_rate": 2.047500453267881e-08,
1218
+ "logits/chosen": -0.7119861245155334,
1219
+ "logits/rejected": -0.11405928432941437,
1220
+ "logps/chosen": -592.5960083007812,
1221
+ "logps/rejected": -605.414306640625,
1222
+ "loss": 0.399,
1223
+ "rewards/accuracies": 0.8374999761581421,
1224
+ "rewards/chosen": -2.3485360145568848,
1225
+ "rewards/margins": 1.844259262084961,
1226
+ "rewards/rejected": -4.192794322967529,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 0.89,
1231
+ "learning_rate": 1.673511698609292e-08,
1232
+ "logits/chosen": -0.7867909669876099,
1233
+ "logits/rejected": -0.11047197878360748,
1234
+ "logps/chosen": -590.2078857421875,
1235
+ "logps/rejected": -598.898681640625,
1236
+ "loss": 0.41,
1237
+ "rewards/accuracies": 0.8500000238418579,
1238
+ "rewards/chosen": -2.408952236175537,
1239
+ "rewards/margins": 1.6835685968399048,
1240
+ "rewards/rejected": -4.092520713806152,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 0.91,
1245
+ "learning_rate": 1.3360362741285769e-08,
1246
+ "logits/chosen": -0.7789919972419739,
1247
+ "logits/rejected": -0.2865374684333801,
1248
+ "logps/chosen": -587.2545166015625,
1249
+ "logps/rejected": -613.9022216796875,
1250
+ "loss": 0.3896,
1251
+ "rewards/accuracies": 0.8187500238418579,
1252
+ "rewards/chosen": -2.680772304534912,
1253
+ "rewards/margins": 1.56209397315979,
1254
+ "rewards/rejected": -4.242866516113281,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.91,
1259
+ "eval_logits/chosen": -0.6531276106834412,
1260
+ "eval_logits/rejected": -0.1928076446056366,
1261
+ "eval_logps/chosen": -554.1226196289062,
1262
+ "eval_logps/rejected": -687.527099609375,
1263
+ "eval_loss": 0.4964603781700134,
1264
+ "eval_rewards/accuracies": 0.76953125,
1265
+ "eval_rewards/chosen": -2.970829725265503,
1266
+ "eval_rewards/margins": 1.3309086561203003,
1267
+ "eval_rewards/rejected": -4.301738739013672,
1268
+ "eval_runtime": 52.9306,
1269
+ "eval_samples_per_second": 37.785,
1270
+ "eval_steps_per_second": 0.605,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 0.92,
1275
+ "learning_rate": 1.0356024355769433e-08,
1276
+ "logits/chosen": -0.7060940861701965,
1277
+ "logits/rejected": -0.22113999724388123,
1278
+ "logps/chosen": -593.3995361328125,
1279
+ "logps/rejected": -653.0950317382812,
1280
+ "loss": 0.3726,
1281
+ "rewards/accuracies": 0.856249988079071,
1282
+ "rewards/chosen": -2.5047855377197266,
1283
+ "rewards/margins": 1.8576644659042358,
1284
+ "rewards/rejected": -4.36245059967041,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 0.93,
1289
+ "learning_rate": 7.726804569108597e-09,
1290
+ "logits/chosen": -0.7532386779785156,
1291
+ "logits/rejected": -0.2760005593299866,
1292
+ "logps/chosen": -578.0054931640625,
1293
+ "logps/rejected": -620.122314453125,
1294
+ "loss": 0.4064,
1295
+ "rewards/accuracies": 0.7875000238418579,
1296
+ "rewards/chosen": -2.5314385890960693,
1297
+ "rewards/margins": 1.613104224205017,
1298
+ "rewards/rejected": -4.144542694091797,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 0.94,
1303
+ "learning_rate": 5.476818941645561e-09,
1304
+ "logits/chosen": -0.822808563709259,
1305
+ "logits/rejected": -0.3094715476036072,
1306
+ "logps/chosen": -599.0675659179688,
1307
+ "logps/rejected": -619.3136596679688,
1308
+ "loss": 0.3862,
1309
+ "rewards/accuracies": 0.78125,
1310
+ "rewards/chosen": -2.7111198902130127,
1311
+ "rewards/margins": 1.4173959493637085,
1312
+ "rewards/rejected": -4.12851619720459,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 0.95,
1317
+ "learning_rate": 3.609589412347347e-09,
1318
+ "logits/chosen": -0.6289047002792358,
1319
+ "logits/rejected": -0.11015300452709198,
1320
+ "logps/chosen": -534.1783447265625,
1321
+ "logps/rejected": -600.945556640625,
1322
+ "loss": 0.3787,
1323
+ "rewards/accuracies": 0.8687499761581421,
1324
+ "rewards/chosen": -2.5174965858459473,
1325
+ "rewards/margins": 1.8095569610595703,
1326
+ "rewards/rejected": -4.327054023742676,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 0.96,
1331
+ "learning_rate": 2.1280387858572667e-09,
1332
+ "logits/chosen": -0.6604621410369873,
1333
+ "logits/rejected": -0.242179274559021,
1334
+ "logps/chosen": -580.7008056640625,
1335
+ "logps/rejected": -598.622314453125,
1336
+ "loss": 0.4136,
1337
+ "rewards/accuracies": 0.731249988079071,
1338
+ "rewards/chosen": -2.638731002807617,
1339
+ "rewards/margins": 1.3258345127105713,
1340
+ "rewards/rejected": -3.9645659923553467,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 0.97,
1345
+ "learning_rate": 1.03448615738172e-09,
1346
+ "logits/chosen": -0.602516770362854,
1347
+ "logits/rejected": -0.018761873245239258,
1348
+ "logps/chosen": -582.8955688476562,
1349
+ "logps/rejected": -606.2899169921875,
1350
+ "loss": 0.4235,
1351
+ "rewards/accuracies": 0.8125,
1352
+ "rewards/chosen": -2.5163681507110596,
1353
+ "rewards/margins": 1.8120934963226318,
1354
+ "rewards/rejected": -4.328461647033691,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 0.98,
1359
+ "learning_rate": 3.3064328257259575e-10,
1360
+ "logits/chosen": -0.8884924054145813,
1361
+ "logits/rejected": -0.2350286990404129,
1362
+ "logps/chosen": -635.0806884765625,
1363
+ "logps/rejected": -663.0999145507812,
1364
+ "loss": 0.3724,
1365
+ "rewards/accuracies": 0.8187500238418579,
1366
+ "rewards/chosen": -2.5359692573547363,
1367
+ "rewards/margins": 1.9048057794570923,
1368
+ "rewards/rejected": -4.440774917602539,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 1.0,
1373
+ "learning_rate": 1.7611898088715216e-11,
1374
+ "logits/chosen": -0.7480968832969666,
1375
+ "logits/rejected": -0.27411407232284546,
1376
+ "logps/chosen": -547.1424560546875,
1377
+ "logps/rejected": -604.2337646484375,
1378
+ "loss": 0.361,
1379
+ "rewards/accuracies": 0.831250011920929,
1380
+ "rewards/chosen": -2.3420045375823975,
1381
+ "rewards/margins": 1.844956636428833,
1382
+ "rewards/rejected": -4.1869611740112305,
1383
+ "step": 880
1384
+ },
1385
  {
1386
  "epoch": 1.0,
1387
+ "step": 883,
1388
  "total_flos": 0.0,
1389
+ "train_loss": 0.43856339019935237,
1390
+ "train_runtime": 7937.4578,
1391
+ "train_samples_per_second": 14.24,
1392
+ "train_steps_per_second": 0.111
1393
  }
1394
  ],
1395
  "logging_steps": 10,
1396
+ "max_steps": 883,
1397
  "num_train_epochs": 1,
1398
  "save_steps": 100,
1399
  "total_flos": 0.0,