wzhouad committed on
Commit
f20d900
1 Parent(s): b2792b0

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset (the dataset name was not recorded in the training configuration).
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0660
21
- - Rewards/chosen: -2.5606
22
- - Rewards/rejected: -2.9549
23
- - Rewards/accuracies: 0.625
24
- - Rewards/margins: 0.3944
25
- - Logps/rejected: -552.8470
26
- - Logps/chosen: -513.0960
27
- - Logits/rejected: -2.2459
28
- - Logits/chosen: -2.2708
29
 
30
  ## Model description
31
 
@@ -47,7 +47,7 @@ The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
- - seed: 4
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
@@ -62,10 +62,14 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.0437 | 0.25 | 100 | 0.0824 | -2.2538 | -2.4741 | 0.5859 | 0.2203 | -504.7590 | -482.4154 | -2.3143 | -2.3260 |
66
- | 0.0258 | 0.49 | 200 | 0.0581 | -2.8677 | -3.2192 | 0.5977 | 0.3515 | -579.2755 | -543.8072 | -2.1155 | -2.1394 |
67
- | 0.0402 | 0.74 | 300 | 0.0837 | -2.0997 | -2.5006 | 0.6289 | 0.4009 | -507.4115 | -467.0057 | -2.2751 | -2.2980 |
68
- | 0.0288 | 0.99 | 400 | 0.0660 | -2.5606 | -2.9549 | 0.625 | 0.3944 | -552.8470 | -513.0960 | -2.2459 | -2.2708 |
 
 
 
 
69
 
70
 
71
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset (the dataset name was not recorded in the training configuration).
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.4920
21
+ - Rewards/chosen: -2.3074
22
+ - Rewards/rejected: -3.5196
23
+ - Rewards/accuracies: 0.7734
24
+ - Rewards/margins: 1.2122
25
+ - Logps/rejected: -609.3139
26
+ - Logps/chosen: -487.7755
27
+ - Logits/rejected: -0.7242
28
+ - Logits/chosen: -0.9597
29
 
30
  ## Model description
31
 
 
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
+ - seed: 2
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.5392 | 0.11 | 100 | 0.6286 | -0.6554 | -0.9418 | 0.6523 | 0.2865 | -351.5352 | -322.5750 | -2.5756 | -2.5908 |
66
+ | 0.4524 | 0.23 | 200 | 0.5475 | -1.4831 | -2.1698 | 0.7227 | 0.6867 | -474.3327 | -405.3454 | -1.9678 | -1.9878 |
67
+ | 0.3976 | 0.34 | 300 | 0.5194 | -1.8541 | -2.8790 | 0.7617 | 1.0249 | -545.2501 | -442.4474 | -0.9783 | -1.1841 |
68
+ | 0.3892 | 0.45 | 400 | 0.5160 | -2.0795 | -3.1766 | 0.7773 | 1.0971 | -575.0087 | -464.9888 | -0.6002 | -0.8579 |
69
+ | 0.3964 | 0.57 | 500 | 0.4992 | -2.1896 | -3.3081 | 0.7656 | 1.1185 | -588.1666 | -476.0038 | -0.8012 | -1.0189 |
70
+ | 0.4149 | 0.68 | 600 | 0.4948 | -2.2061 | -3.3241 | 0.7461 | 1.1179 | -589.7601 | -477.6525 | -1.0527 | -1.2398 |
71
+ | 0.4004 | 0.79 | 700 | 0.4905 | -2.1723 | -3.3652 | 0.7695 | 1.1929 | -593.8731 | -474.2662 | -0.8519 | -1.0643 |
72
+ | 0.3887 | 0.91 | 800 | 0.4920 | -2.3074 | -3.5196 | 0.7734 | 1.2122 | -609.3139 | -487.7755 | -0.7242 | -0.9597 |
73
 
74
 
75
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.07151281171374851,
4
- "train_runtime": 3738.25,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 13.882,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.43981607611020046,
4
+ "train_runtime": 8273.4147,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 13.662,
7
+ "train_steps_per_second": 0.107
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb2741318d25fa010663fe61ed02f4f293fa8ad301934c24bbabaf6e60633fb3
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3706298fa431e1f5810589ffba965a8702ccfbe931becf39115a25cf32b0500d
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f559ba79771ca5e5cdbae085a55b0de304927c43a4793b3f8234d1f33152354
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce0bba006f75edd300be523b9ddf4224846bca628f24428e1354ecbf84bda2f1
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:901f2833dc2e0f3adb4f4bd18d3a372877da1018c193c779ed31f78d98f1f0a4
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bdc684b29544e54449b1407ec61244d68a0f37ef83f91adbb77ec167e63be6c
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.07151281171374851,
4
- "train_runtime": 3738.25,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 13.882,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.43981607611020046,
4
+ "train_runtime": 8273.4147,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 13.662,
7
+ "train_steps_per_second": 0.107
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.998766954377312,
5
  "eval_steps": 100,
6
- "global_step": 405,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.2195121951219512e-08,
14
- "logits/chosen": -2.8681135177612305,
15
- "logits/rejected": -2.8858838081359863,
16
- "logps/chosen": -518.1907958984375,
17
- "logps/rejected": -109.31971740722656,
18
- "loss": 0.3475,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,641 +23,1377 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.02,
27
- "learning_rate": 1.219512195121951e-07,
28
- "logits/chosen": -2.7986178398132324,
29
- "logits/rejected": -2.752176284790039,
30
- "logps/chosen": -434.208251953125,
31
- "logps/rejected": -114.19618225097656,
32
- "loss": 0.3394,
33
- "rewards/accuracies": 0.4930555522441864,
34
- "rewards/chosen": 0.0002649651141837239,
35
- "rewards/margins": 0.0009347840095870197,
36
- "rewards/rejected": -0.0006698188371956348,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.05,
41
- "learning_rate": 2.439024390243902e-07,
42
- "logits/chosen": -2.8215415477752686,
43
- "logits/rejected": -2.7983882427215576,
44
- "logps/chosen": -417.2633361816406,
45
- "logps/rejected": -118.0062026977539,
46
- "loss": 0.3373,
47
- "rewards/accuracies": 0.7250000238418579,
48
- "rewards/chosen": 0.019945567473769188,
49
- "rewards/margins": 0.03575458750128746,
50
- "rewards/rejected": -0.015809018164873123,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.07,
55
- "learning_rate": 3.6585365853658536e-07,
56
- "logits/chosen": -2.6574862003326416,
57
- "logits/rejected": -2.6451315879821777,
58
- "logps/chosen": -398.87353515625,
59
- "logps/rejected": -125.69970703125,
60
- "loss": 0.3045,
61
- "rewards/accuracies": 0.7562500238418579,
62
- "rewards/chosen": 0.07569055259227753,
63
- "rewards/margins": 0.19884702563285828,
64
- "rewards/rejected": -0.12315647304058075,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.1,
69
- "learning_rate": 4.878048780487804e-07,
70
- "logits/chosen": -2.54256272315979,
71
- "logits/rejected": -2.5281729698181152,
72
- "logps/chosen": -384.5321044921875,
73
- "logps/rejected": -168.55758666992188,
74
- "loss": 0.2564,
75
- "rewards/accuracies": 0.737500011920929,
76
- "rewards/chosen": 0.024631643667817116,
77
- "rewards/margins": 0.41851943731307983,
78
- "rewards/rejected": -0.39388787746429443,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.12,
83
- "learning_rate": 4.992461696250783e-07,
84
- "logits/chosen": -2.4257261753082275,
85
- "logits/rejected": -2.3928446769714355,
86
- "logps/chosen": -436.45330810546875,
87
- "logps/rejected": -219.0617218017578,
88
- "loss": 0.1809,
89
- "rewards/accuracies": 0.7749999761581421,
90
- "rewards/chosen": -0.1671580970287323,
91
- "rewards/margins": 0.7879143953323364,
92
- "rewards/rejected": -0.9550724029541016,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.15,
97
- "learning_rate": 4.966461721767899e-07,
98
- "logits/chosen": -2.3805835247039795,
99
- "logits/rejected": -2.3364853858947754,
100
- "logps/chosen": -437.4466857910156,
101
- "logps/rejected": -240.6685791015625,
102
- "loss": 0.1377,
103
- "rewards/accuracies": 0.768750011920929,
104
- "rewards/chosen": -0.32454290986061096,
105
- "rewards/margins": 0.9316140413284302,
106
- "rewards/rejected": -1.2561569213867188,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.17,
111
- "learning_rate": 4.922100518015975e-07,
112
- "logits/chosen": -2.3752458095550537,
113
- "logits/rejected": -2.3281030654907227,
114
- "logps/chosen": -419.6747131347656,
115
- "logps/rejected": -264.75787353515625,
116
- "loss": 0.103,
117
- "rewards/accuracies": 0.7562500238418579,
118
- "rewards/chosen": -0.3937300443649292,
119
- "rewards/margins": 1.1917842626571655,
120
- "rewards/rejected": -1.5855143070220947,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.2,
125
- "learning_rate": 4.859708325770919e-07,
126
- "logits/chosen": -2.4320530891418457,
127
- "logits/rejected": -2.3738579750061035,
128
- "logps/chosen": -472.10479736328125,
129
- "logps/rejected": -330.32403564453125,
130
- "loss": 0.0674,
131
- "rewards/accuracies": 0.8187500238418579,
132
- "rewards/chosen": -0.5765678286552429,
133
- "rewards/margins": 1.5421369075775146,
134
- "rewards/rejected": -2.1187047958374023,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.22,
139
- "learning_rate": 4.779749614980225e-07,
140
- "logits/chosen": -2.3991949558258057,
141
- "logits/rejected": -2.357053279876709,
142
- "logps/chosen": -487.83074951171875,
143
- "logps/rejected": -349.1925354003906,
144
- "loss": 0.0553,
145
- "rewards/accuracies": 0.8062499761581421,
146
- "rewards/chosen": -0.666537880897522,
147
- "rewards/margins": 1.7182201147079468,
148
- "rewards/rejected": -2.3847577571868896,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.25,
153
- "learning_rate": 4.682819627081427e-07,
154
- "logits/chosen": -2.3752927780151367,
155
- "logits/rejected": -2.326216220855713,
156
- "logps/chosen": -515.1549682617188,
157
- "logps/rejected": -378.8877868652344,
158
- "loss": 0.0437,
159
- "rewards/accuracies": 0.862500011920929,
160
- "rewards/chosen": -0.6667075157165527,
161
- "rewards/margins": 2.000246524810791,
162
- "rewards/rejected": -2.666954278945923,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.25,
167
- "eval_logits/chosen": -2.3259778022766113,
168
- "eval_logits/rejected": -2.314302682876587,
169
- "eval_logps/chosen": -482.4153747558594,
170
- "eval_logps/rejected": -504.759033203125,
171
- "eval_loss": 0.08243285864591599,
172
- "eval_rewards/accuracies": 0.5859375,
173
- "eval_rewards/chosen": -2.2537574768066406,
174
- "eval_rewards/margins": 0.22029951214790344,
175
- "eval_rewards/rejected": -2.4740567207336426,
176
- "eval_runtime": 53.3582,
177
- "eval_samples_per_second": 37.483,
178
- "eval_steps_per_second": 0.6,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.27,
183
- "learning_rate": 4.569639943810477e-07,
184
- "logits/chosen": -2.3144371509552,
185
- "logits/rejected": -2.2340025901794434,
186
- "logps/chosen": -490.12921142578125,
187
- "logps/rejected": -419.07867431640625,
188
- "loss": 0.0359,
189
- "rewards/accuracies": 0.737500011920929,
190
- "rewards/chosen": -1.3260681629180908,
191
- "rewards/margins": 1.7610738277435303,
192
- "rewards/rejected": -3.0871422290802,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.3,
197
- "learning_rate": 4.4410531154874543e-07,
198
- "logits/chosen": -2.3716444969177246,
199
- "logits/rejected": -2.3235533237457275,
200
- "logps/chosen": -466.01702880859375,
201
- "logps/rejected": -356.8735046386719,
202
- "loss": 0.0575,
203
- "rewards/accuracies": 0.7875000238418579,
204
- "rewards/chosen": -0.7180399298667908,
205
- "rewards/margins": 1.6505486965179443,
206
- "rewards/rejected": -2.368588924407959,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.32,
211
- "learning_rate": 4.298016388768561e-07,
212
- "logits/chosen": -2.3074584007263184,
213
- "logits/rejected": -2.257930040359497,
214
- "logps/chosen": -472.1845703125,
215
- "logps/rejected": -373.66522216796875,
216
- "loss": 0.0498,
217
- "rewards/accuracies": 0.768750011920929,
218
- "rewards/chosen": -1.0088322162628174,
219
- "rewards/margins": 1.6445964574813843,
220
- "rewards/rejected": -2.653428792953491,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.35,
225
- "learning_rate": 4.1415945805573005e-07,
226
- "logits/chosen": -2.225804328918457,
227
- "logits/rejected": -2.15400767326355,
228
- "logps/chosen": -534.1700439453125,
229
- "logps/rejected": -430.3104553222656,
230
- "loss": 0.0361,
231
- "rewards/accuracies": 0.8500000238418579,
232
- "rewards/chosen": -1.042690634727478,
233
- "rewards/margins": 2.092653751373291,
234
- "rewards/rejected": -3.1353445053100586,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.37,
239
- "learning_rate": 3.972952151123984e-07,
240
- "logits/chosen": -2.2562787532806396,
241
- "logits/rejected": -2.164506673812866,
242
- "logps/chosen": -522.7659912109375,
243
- "logps/rejected": -425.18109130859375,
244
- "loss": 0.0344,
245
- "rewards/accuracies": 0.8500000238418579,
246
- "rewards/chosen": -1.1226718425750732,
247
- "rewards/margins": 2.0036892890930176,
248
- "rewards/rejected": -3.12636137008667,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.39,
253
- "learning_rate": 3.793344535444142e-07,
254
- "logits/chosen": -2.267565965652466,
255
- "logits/rejected": -2.1969974040985107,
256
- "logps/chosen": -530.3189086914062,
257
- "logps/rejected": -426.72332763671875,
258
- "loss": 0.0393,
259
- "rewards/accuracies": 0.768750011920929,
260
- "rewards/chosen": -1.2129985094070435,
261
- "rewards/margins": 1.8231168985366821,
262
- "rewards/rejected": -3.0361156463623047,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.42,
267
- "learning_rate": 3.604108797288461e-07,
268
- "logits/chosen": -2.237342119216919,
269
- "logits/rejected": -2.1961898803710938,
270
- "logps/chosen": -448.13812255859375,
271
- "logps/rejected": -372.9068298339844,
272
- "loss": 0.0465,
273
- "rewards/accuracies": 0.768750011920929,
274
- "rewards/chosen": -1.092370629310608,
275
- "rewards/margins": 1.5985119342803955,
276
- "rewards/rejected": -2.690882444381714,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.44,
281
- "learning_rate": 3.40665367563858e-07,
282
- "logits/chosen": -2.2571911811828613,
283
- "logits/rejected": -2.140353202819824,
284
- "logps/chosen": -548.1529541015625,
285
- "logps/rejected": -449.4532165527344,
286
- "loss": 0.035,
287
- "rewards/accuracies": 0.8187500238418579,
288
- "rewards/chosen": -0.8807584643363953,
289
- "rewards/margins": 2.355053663253784,
290
- "rewards/rejected": -3.2358124256134033,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.47,
295
- "learning_rate": 3.202449097526798e-07,
296
- "logits/chosen": -2.1954236030578613,
297
- "logits/rejected": -2.113832950592041,
298
- "logps/chosen": -545.7277221679688,
299
- "logps/rejected": -466.76580810546875,
300
- "loss": 0.029,
301
  "rewards/accuracies": 0.768750011920929,
302
- "rewards/chosen": -1.4056795835494995,
303
- "rewards/margins": 2.1022555828094482,
304
- "rewards/rejected": -3.5079357624053955,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.49,
309
- "learning_rate": 2.993015235369905e-07,
310
- "logits/chosen": -2.1386027336120605,
311
- "logits/rejected": -2.0572166442871094,
312
- "logps/chosen": -560.2534790039062,
313
- "logps/rejected": -491.8816833496094,
314
- "loss": 0.0258,
315
- "rewards/accuracies": 0.7437499761581421,
316
- "rewards/chosen": -1.810624361038208,
317
- "rewards/margins": 1.9691530466079712,
318
- "rewards/rejected": -3.7797775268554688,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.49,
323
- "eval_logits/chosen": -2.1394448280334473,
324
- "eval_logits/rejected": -2.1155476570129395,
325
- "eval_logps/chosen": -543.8071899414062,
326
- "eval_logps/rejected": -579.2755126953125,
327
- "eval_loss": 0.0581156425178051,
328
- "eval_rewards/accuracies": 0.59765625,
329
- "eval_rewards/chosen": -2.86767578125,
330
- "eval_rewards/margins": 0.35154610872268677,
331
- "eval_rewards/rejected": -3.219222068786621,
332
- "eval_runtime": 53.2701,
333
- "eval_samples_per_second": 37.545,
334
- "eval_steps_per_second": 0.601,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.52,
339
- "learning_rate": 2.7799111902582693e-07,
340
- "logits/chosen": -2.1782305240631104,
341
- "logits/rejected": -2.044674873352051,
342
- "logps/chosen": -579.908935546875,
343
- "logps/rejected": -500.6641540527344,
344
- "loss": 0.0219,
345
- "rewards/accuracies": 0.8187500238418579,
346
- "rewards/chosen": -1.408044695854187,
347
- "rewards/margins": 2.4992563724517822,
348
- "rewards/rejected": -3.9073009490966797,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.54,
353
- "learning_rate": 2.564723385445869e-07,
354
- "logits/chosen": -2.2589755058288574,
355
- "logits/rejected": -2.156228542327881,
356
- "logps/chosen": -563.1976318359375,
357
- "logps/rejected": -475.75030517578125,
358
- "loss": 0.038,
359
- "rewards/accuracies": 0.762499988079071,
360
- "rewards/chosen": -1.3078866004943848,
361
- "rewards/margins": 2.1681323051452637,
362
- "rewards/rejected": -3.4760184288024902,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.57,
367
- "learning_rate": 2.3490537564442845e-07,
368
- "logits/chosen": -2.2288191318511963,
369
- "logits/rejected": -2.136579751968384,
370
- "logps/chosen": -507.54632568359375,
371
- "logps/rejected": -419.88470458984375,
372
- "loss": 0.0432,
373
- "rewards/accuracies": 0.737500011920929,
374
- "rewards/chosen": -1.2000774145126343,
375
- "rewards/margins": 1.7510545253753662,
376
- "rewards/rejected": -2.951131820678711,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.59,
381
- "learning_rate": 2.1345078256378801e-07,
382
- "logits/chosen": -2.282217264175415,
383
- "logits/rejected": -2.1927459239959717,
384
- "logps/chosen": -539.92822265625,
385
- "logps/rejected": -433.8241271972656,
386
- "loss": 0.0373,
387
- "rewards/accuracies": 0.7749999761581421,
388
- "rewards/chosen": -1.2784963846206665,
389
- "rewards/margins": 1.8950881958007812,
390
- "rewards/rejected": -3.1735846996307373,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.62,
395
- "learning_rate": 1.9226827501969865e-07,
396
- "logits/chosen": -2.2803092002868652,
397
- "logits/rejected": -2.1990160942077637,
398
- "logps/chosen": -537.9136962890625,
399
- "logps/rejected": -442.28350830078125,
400
- "loss": 0.04,
401
- "rewards/accuracies": 0.800000011920929,
402
- "rewards/chosen": -1.2320274114608765,
403
- "rewards/margins": 2.1027939319610596,
404
- "rewards/rejected": -3.3348212242126465,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.64,
409
- "learning_rate": 1.715155432264775e-07,
410
- "logits/chosen": -2.2646355628967285,
411
- "logits/rejected": -2.14613676071167,
412
- "logps/chosen": -502.49664306640625,
413
- "logps/rejected": -420.11004638671875,
414
- "loss": 0.0396,
415
  "rewards/accuracies": 0.800000011920929,
416
- "rewards/chosen": -1.1264328956604004,
417
- "rewards/margins": 2.006878614425659,
418
- "rewards/rejected": -3.1333117485046387,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.67,
423
- "learning_rate": 1.51347077992983e-07,
424
- "logits/chosen": -2.3088698387145996,
425
- "logits/rejected": -2.2018628120422363,
426
- "logps/chosen": -554.0256958007812,
427
- "logps/rejected": -421.2101135253906,
428
- "loss": 0.0375,
429
- "rewards/accuracies": 0.831250011920929,
430
- "rewards/chosen": -1.10258948802948,
431
- "rewards/margins": 1.9626919031143188,
432
- "rewards/rejected": -3.065281391143799,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.69,
437
- "learning_rate": 1.3191302063739906e-07,
438
- "logits/chosen": -2.310133457183838,
439
- "logits/rejected": -2.216827392578125,
440
- "logps/chosen": -522.3606567382812,
441
- "logps/rejected": -438.058349609375,
442
- "loss": 0.043,
443
- "rewards/accuracies": 0.800000011920929,
444
- "rewards/chosen": -1.1809624433517456,
445
- "rewards/margins": 1.9291051626205444,
446
- "rewards/rejected": -3.110067844390869,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.72,
451
- "learning_rate": 1.1335804528119475e-07,
452
- "logits/chosen": -2.3108785152435303,
453
- "logits/rejected": -2.2141172885894775,
454
- "logps/chosen": -544.7510986328125,
455
- "logps/rejected": -427.60150146484375,
456
- "loss": 0.044,
457
- "rewards/accuracies": 0.831250011920929,
458
- "rewards/chosen": -1.00501549243927,
459
- "rewards/margins": 2.146829605102539,
460
- "rewards/rejected": -3.1518452167510986,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.74,
465
- "learning_rate": 9.582028184286423e-08,
466
- "logits/chosen": -2.350487470626831,
467
- "logits/rejected": -2.307096481323242,
468
- "logps/chosen": -554.42529296875,
469
- "logps/rejected": -470.14434814453125,
470
- "loss": 0.0402,
471
- "rewards/accuracies": 0.8062499761581421,
472
- "rewards/chosen": -1.1662867069244385,
473
- "rewards/margins": 2.156501531600952,
474
- "rewards/rejected": -3.3227882385253906,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.74,
479
- "eval_logits/chosen": -2.2979543209075928,
480
- "eval_logits/rejected": -2.2751243114471436,
481
- "eval_logps/chosen": -467.0057067871094,
482
- "eval_logps/rejected": -507.4114685058594,
483
- "eval_loss": 0.08367828279733658,
484
- "eval_rewards/accuracies": 0.62890625,
485
- "eval_rewards/chosen": -2.099660634994507,
486
- "eval_rewards/margins": 0.4009218215942383,
487
- "eval_rewards/rejected": -2.500582218170166,
488
- "eval_runtime": 53.3734,
489
- "eval_samples_per_second": 37.472,
490
- "eval_steps_per_second": 0.6,
491
  "step": 300
492
  },
493
  {
494
- "epoch": 0.76,
495
- "learning_rate": 7.943028774907065e-08,
496
- "logits/chosen": -2.316253185272217,
497
- "logits/rejected": -2.209606170654297,
498
- "logps/chosen": -524.6145629882812,
499
- "logps/rejected": -420.94671630859375,
500
- "loss": 0.0324,
501
- "rewards/accuracies": 0.768750011920929,
502
- "rewards/chosen": -1.0568145513534546,
503
- "rewards/margins": 2.0644707679748535,
504
- "rewards/rejected": -3.1212852001190186,
505
  "step": 310
506
  },
507
  {
508
- "epoch": 0.79,
509
- "learning_rate": 6.431007601814637e-08,
510
- "logits/chosen": -2.2733869552612305,
511
- "logits/rejected": -2.169506549835205,
512
- "logps/chosen": -532.5906982421875,
513
- "logps/rejected": -450.932373046875,
514
- "loss": 0.0316,
515
- "rewards/accuracies": 0.762499988079071,
516
- "rewards/chosen": -1.5221502780914307,
517
- "rewards/margins": 1.9199508428573608,
518
- "rewards/rejected": -3.442101001739502,
519
  "step": 320
520
  },
521
  {
522
- "epoch": 0.81,
523
- "learning_rate": 5.0572206951246e-08,
524
- "logits/chosen": -2.290539503097534,
525
- "logits/rejected": -2.193920850753784,
526
- "logps/chosen": -562.043701171875,
527
- "logps/rejected": -479.5208435058594,
528
- "loss": 0.0278,
529
- "rewards/accuracies": 0.793749988079071,
530
- "rewards/chosen": -1.3767458200454712,
531
- "rewards/margins": 2.1372973918914795,
532
- "rewards/rejected": -3.514043092727661,
533
  "step": 330
534
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  {
536
  "epoch": 0.84,
537
- "learning_rate": 3.831895019292897e-08,
538
- "logits/chosen": -2.3263535499572754,
539
- "logits/rejected": -2.207899570465088,
540
- "logps/chosen": -619.2625122070312,
541
- "logps/rejected": -520.6148071289062,
542
- "loss": 0.0305,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  "rewards/accuracies": 0.7562500238418579,
544
- "rewards/chosen": -1.5666420459747314,
545
- "rewards/margins": 2.300938844680786,
546
- "rewards/rejected": -3.8675804138183594,
547
- "step": 340
548
  },
549
  {
550
  "epoch": 0.86,
551
- "learning_rate": 2.764152339909756e-08,
552
- "logits/chosen": -2.305875539779663,
553
- "logits/rejected": -2.1924188137054443,
554
- "logps/chosen": -568.1319580078125,
555
- "logps/rejected": -475.6539611816406,
556
- "loss": 0.0245,
557
  "rewards/accuracies": 0.8062499761581421,
558
- "rewards/chosen": -1.341552972793579,
559
- "rewards/margins": 2.3198726177215576,
560
- "rewards/rejected": -3.661425828933716,
561
- "step": 350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  },
563
  {
564
  "epoch": 0.89,
565
- "learning_rate": 1.861941317991664e-08,
566
- "logits/chosen": -2.31453800201416,
567
- "logits/rejected": -2.209552049636841,
568
- "logps/chosen": -574.0198974609375,
569
- "logps/rejected": -498.809326171875,
570
- "loss": 0.0246,
571
- "rewards/accuracies": 0.8187500238418579,
572
- "rewards/chosen": -1.3858083486557007,
573
- "rewards/margins": 2.422987699508667,
574
- "rewards/rejected": -3.80879545211792,
575
- "step": 360
576
  },
577
  {
578
  "epoch": 0.91,
579
- "learning_rate": 1.13197833728636e-08,
580
- "logits/chosen": -2.2876980304718018,
581
- "logits/rejected": -2.1881823539733887,
582
- "logps/chosen": -583.4609985351562,
583
- "logps/rejected": -515.4216918945312,
584
- "loss": 0.0274,
585
  "rewards/accuracies": 0.8125,
586
- "rewards/chosen": -1.2713569402694702,
587
- "rewards/margins": 2.548645496368408,
588
- "rewards/rejected": -3.820002317428589,
589
- "step": 370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  },
591
  {
592
  "epoch": 0.94,
593
- "learning_rate": 5.79697505093521e-09,
594
- "logits/chosen": -2.2938995361328125,
595
- "logits/rejected": -2.161371946334839,
596
- "logps/chosen": -567.2229614257812,
597
- "logps/rejected": -493.6429138183594,
598
- "loss": 0.0339,
599
- "rewards/accuracies": 0.8187500238418579,
600
- "rewards/chosen": -1.495012879371643,
601
- "rewards/margins": 2.173337697982788,
602
- "rewards/rejected": -3.6683506965637207,
603
- "step": 380
604
  },
605
  {
606
- "epoch": 0.96,
607
- "learning_rate": 2.092101988131256e-09,
608
- "logits/chosen": -2.3137059211730957,
609
- "logits/rejected": -2.1986048221588135,
610
- "logps/chosen": -600.1227416992188,
611
- "logps/rejected": -496.6559143066406,
612
- "loss": 0.0258,
613
- "rewards/accuracies": 0.7875000238418579,
614
- "rewards/chosen": -1.368238925933838,
615
- "rewards/margins": 2.388805389404297,
616
- "rewards/rejected": -3.7570443153381348,
617
- "step": 390
618
  },
619
  {
620
- "epoch": 0.99,
621
- "learning_rate": 2.327445937151673e-10,
622
- "logits/chosen": -2.316849708557129,
623
- "logits/rejected": -2.1959729194641113,
624
- "logps/chosen": -559.0263061523438,
625
- "logps/rejected": -482.11773681640625,
626
- "loss": 0.0288,
627
  "rewards/accuracies": 0.78125,
628
- "rewards/chosen": -1.659148931503296,
629
- "rewards/margins": 2.0086750984191895,
630
- "rewards/rejected": -3.6678237915039062,
631
- "step": 400
632
  },
633
  {
634
- "epoch": 0.99,
635
- "eval_logits/chosen": -2.2708253860473633,
636
- "eval_logits/rejected": -2.245922565460205,
637
- "eval_logps/chosen": -513.0960083007812,
638
- "eval_logps/rejected": -552.8470458984375,
639
- "eval_loss": 0.06599809229373932,
640
- "eval_rewards/accuracies": 0.625,
641
- "eval_rewards/chosen": -2.560563564300537,
642
- "eval_rewards/margins": 0.3943747282028198,
643
- "eval_rewards/rejected": -2.9549384117126465,
644
- "eval_runtime": 53.3482,
645
- "eval_samples_per_second": 37.49,
646
- "eval_steps_per_second": 0.6,
647
- "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  },
649
  {
650
  "epoch": 1.0,
651
- "step": 405,
652
  "total_flos": 0.0,
653
- "train_loss": 0.07151281171374851,
654
- "train_runtime": 3738.25,
655
- "train_samples_per_second": 13.882,
656
- "train_steps_per_second": 0.108
657
  }
658
  ],
659
  "logging_steps": 10,
660
- "max_steps": 405,
661
  "num_train_epochs": 1,
662
  "save_steps": 100,
663
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9994340690435767,
5
  "eval_steps": 100,
6
+ "global_step": 883,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 5.617977528089887e-09,
14
+ "logits/chosen": -2.7943434715270996,
15
+ "logits/rejected": -2.817823886871338,
16
+ "logps/chosen": -334.107666015625,
17
+ "logps/rejected": -197.05621337890625,
18
+ "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.01,
27
+ "learning_rate": 5.617977528089887e-08,
28
+ "logits/chosen": -2.833451271057129,
29
+ "logits/rejected": -2.7827768325805664,
30
+ "logps/chosen": -323.80584716796875,
31
+ "logps/rejected": -189.39964294433594,
32
+ "loss": 0.6931,
33
+ "rewards/accuracies": 0.4722222089767456,
34
+ "rewards/chosen": 0.0005755923339165747,
35
+ "rewards/margins": 0.0003566421801224351,
36
+ "rewards/rejected": 0.00021895011013839394,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.02,
41
+ "learning_rate": 1.1235955056179774e-07,
42
+ "logits/chosen": -2.778655767440796,
43
+ "logits/rejected": -2.7627151012420654,
44
+ "logps/chosen": -323.3365783691406,
45
+ "logps/rejected": -168.40744018554688,
46
+ "loss": 0.6917,
47
+ "rewards/accuracies": 0.581250011920929,
48
+ "rewards/chosen": 0.0010369193041697145,
49
+ "rewards/margins": 0.0018870027270168066,
50
+ "rewards/rejected": -0.0008500836556777358,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.03,
55
+ "learning_rate": 1.6853932584269663e-07,
56
+ "logits/chosen": -2.7871737480163574,
57
+ "logits/rejected": -2.7326064109802246,
58
+ "logps/chosen": -305.997314453125,
59
+ "logps/rejected": -180.06800842285156,
60
+ "loss": 0.683,
61
+ "rewards/accuracies": 0.6875,
62
+ "rewards/chosen": 0.009164368733763695,
63
+ "rewards/margins": 0.015919247642159462,
64
+ "rewards/rejected": -0.006754877977073193,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.05,
69
+ "learning_rate": 2.2471910112359549e-07,
70
+ "logits/chosen": -2.7199320793151855,
71
+ "logits/rejected": -2.711822032928467,
72
+ "logps/chosen": -314.8984680175781,
73
+ "logps/rejected": -178.45077514648438,
74
+ "loss": 0.6653,
75
+ "rewards/accuracies": 0.6625000238418579,
76
+ "rewards/chosen": 0.034292496740818024,
77
+ "rewards/margins": 0.06667112559080124,
78
+ "rewards/rejected": -0.03237862139940262,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.06,
83
+ "learning_rate": 2.8089887640449437e-07,
84
+ "logits/chosen": -2.6660404205322266,
85
+ "logits/rejected": -2.6608872413635254,
86
+ "logps/chosen": -340.89056396484375,
87
+ "logps/rejected": -192.22543334960938,
88
+ "loss": 0.6387,
89
+ "rewards/accuracies": 0.65625,
90
+ "rewards/chosen": 0.0421828031539917,
91
+ "rewards/margins": 0.14488723874092102,
92
+ "rewards/rejected": -0.10270445048809052,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.07,
97
+ "learning_rate": 3.3707865168539325e-07,
98
+ "logits/chosen": -2.6621761322021484,
99
+ "logits/rejected": -2.6332790851593018,
100
+ "logps/chosen": -290.0724182128906,
101
+ "logps/rejected": -199.76377868652344,
102
+ "loss": 0.6192,
103
+ "rewards/accuracies": 0.606249988079071,
104
+ "rewards/chosen": -0.03800051286816597,
105
+ "rewards/margins": 0.1342642456293106,
106
+ "rewards/rejected": -0.17226476967334747,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.08,
111
+ "learning_rate": 3.9325842696629214e-07,
112
+ "logits/chosen": -2.5926709175109863,
113
+ "logits/rejected": -2.5758044719696045,
114
+ "logps/chosen": -318.26446533203125,
115
+ "logps/rejected": -217.8231964111328,
116
+ "loss": 0.5872,
117
+ "rewards/accuracies": 0.6937500238418579,
118
+ "rewards/chosen": -0.061497531831264496,
119
+ "rewards/margins": 0.23234911262989044,
120
+ "rewards/rejected": -0.29384663701057434,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.09,
125
+ "learning_rate": 4.4943820224719097e-07,
126
+ "logits/chosen": -2.561908483505249,
127
+ "logits/rejected": -2.5379791259765625,
128
+ "logps/chosen": -396.86993408203125,
129
+ "logps/rejected": -253.50143432617188,
130
+ "loss": 0.5579,
131
+ "rewards/accuracies": 0.731249988079071,
132
+ "rewards/chosen": -0.01939094439148903,
133
+ "rewards/margins": 0.49067315459251404,
134
+ "rewards/rejected": -0.5100641250610352,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.1,
139
+ "learning_rate": 4.999980431020109e-07,
140
+ "logits/chosen": -2.5810797214508057,
141
+ "logits/rejected": -2.5567336082458496,
142
+ "logps/chosen": -380.4464416503906,
143
+ "logps/rejected": -262.82904052734375,
144
+ "loss": 0.5455,
145
+ "rewards/accuracies": 0.706250011920929,
146
+ "rewards/chosen": -0.24769897758960724,
147
+ "rewards/margins": 0.5812320709228516,
148
+ "rewards/rejected": -0.8289310336112976,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.11,
153
+ "learning_rate": 4.997632524101301e-07,
154
+ "logits/chosen": -2.6055984497070312,
155
+ "logits/rejected": -2.5864219665527344,
156
+ "logps/chosen": -367.29071044921875,
157
+ "logps/rejected": -280.4869079589844,
158
+ "loss": 0.5392,
159
+ "rewards/accuracies": 0.65625,
160
+ "rewards/chosen": -0.2197243869304657,
161
+ "rewards/margins": 0.5160216689109802,
162
+ "rewards/rejected": -0.7357459664344788,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.11,
167
+ "eval_logits/chosen": -2.590785264968872,
168
+ "eval_logits/rejected": -2.5756187438964844,
169
+ "eval_logps/chosen": -322.57501220703125,
170
+ "eval_logps/rejected": -351.5352478027344,
171
+ "eval_loss": 0.6285832524299622,
172
+ "eval_rewards/accuracies": 0.65234375,
173
+ "eval_rewards/chosen": -0.6553537845611572,
174
+ "eval_rewards/margins": 0.2864663004875183,
175
+ "eval_rewards/rejected": -0.9418200850486755,
176
+ "eval_runtime": 53.1932,
177
+ "eval_samples_per_second": 37.599,
178
+ "eval_steps_per_second": 0.602,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.12,
183
+ "learning_rate": 4.991375032514749e-07,
184
+ "logits/chosen": -2.5533313751220703,
185
+ "logits/rejected": -2.5264110565185547,
186
+ "logps/chosen": -363.4510498046875,
187
+ "logps/rejected": -284.7992248535156,
188
+ "loss": 0.5232,
189
+ "rewards/accuracies": 0.7749999761581421,
190
+ "rewards/chosen": -0.3205306828022003,
191
+ "rewards/margins": 0.6818863749504089,
192
+ "rewards/rejected": -1.002416968345642,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.14,
197
+ "learning_rate": 4.98121775121344e-07,
198
+ "logits/chosen": -2.6315197944641113,
199
+ "logits/rejected": -2.5978212356567383,
200
+ "logps/chosen": -410.644775390625,
201
+ "logps/rejected": -323.01190185546875,
202
+ "loss": 0.4994,
203
+ "rewards/accuracies": 0.7124999761581421,
204
+ "rewards/chosen": -0.27390944957733154,
205
+ "rewards/margins": 0.8327864408493042,
206
+ "rewards/rejected": -1.1066958904266357,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.15,
211
+ "learning_rate": 4.96717657955441e-07,
212
+ "logits/chosen": -2.59904408454895,
213
+ "logits/rejected": -2.5410983562469482,
214
+ "logps/chosen": -416.3720703125,
215
+ "logps/rejected": -325.9648132324219,
216
+ "loss": 0.5013,
217
+ "rewards/accuracies": 0.793749988079071,
218
+ "rewards/chosen": -0.40370965003967285,
219
+ "rewards/margins": 0.9006286859512329,
220
+ "rewards/rejected": -1.3043382167816162,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.16,
225
+ "learning_rate": 4.949273496411216e-07,
226
+ "logits/chosen": -2.545508861541748,
227
+ "logits/rejected": -2.5205612182617188,
228
+ "logps/chosen": -379.17767333984375,
229
+ "logps/rejected": -337.29962158203125,
230
+ "loss": 0.4954,
231
+ "rewards/accuracies": 0.731249988079071,
232
+ "rewards/chosen": -0.451716810464859,
233
+ "rewards/margins": 0.8486088514328003,
234
+ "rewards/rejected": -1.300325632095337,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.17,
239
+ "learning_rate": 4.927536525770046e-07,
240
+ "logits/chosen": -2.5130438804626465,
241
+ "logits/rejected": -2.487233877182007,
242
+ "logps/chosen": -423.2710876464844,
243
+ "logps/rejected": -352.5829772949219,
244
+ "loss": 0.4976,
245
+ "rewards/accuracies": 0.8125,
246
+ "rewards/chosen": -0.5199152231216431,
247
+ "rewards/margins": 1.0131314992904663,
248
+ "rewards/rejected": -1.5330466032028198,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.18,
253
+ "learning_rate": 4.901999692863326e-07,
254
+ "logits/chosen": -2.520357847213745,
255
+ "logits/rejected": -2.4684462547302246,
256
+ "logps/chosen": -498.07098388671875,
257
+ "logps/rejected": -388.2645263671875,
258
+ "loss": 0.463,
259
+ "rewards/accuracies": 0.8125,
260
+ "rewards/chosen": -0.5807570219039917,
261
+ "rewards/margins": 1.1767116785049438,
262
+ "rewards/rejected": -1.757468581199646,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.19,
267
+ "learning_rate": 4.872702970909464e-07,
268
+ "logits/chosen": -2.345059633255005,
269
+ "logits/rejected": -2.281158924102783,
270
+ "logps/chosen": -455.2555236816406,
271
+ "logps/rejected": -373.2399597167969,
272
+ "loss": 0.4471,
273
+ "rewards/accuracies": 0.8187500238418579,
274
+ "rewards/chosen": -0.8019993901252747,
275
+ "rewards/margins": 1.065147042274475,
276
+ "rewards/rejected": -1.8671462535858154,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.2,
281
+ "learning_rate": 4.839692218542131e-07,
282
+ "logits/chosen": -2.167600631713867,
283
+ "logits/rejected": -2.1524620056152344,
284
+ "logps/chosen": -445.18963623046875,
285
+ "logps/rejected": -420.07354736328125,
286
+ "loss": 0.4607,
287
+ "rewards/accuracies": 0.6937500238418579,
288
+ "rewards/chosen": -1.5789515972137451,
289
+ "rewards/margins": 0.9403783082962036,
290
+ "rewards/rejected": -2.5193300247192383,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.22,
295
+ "learning_rate": 4.803019108026997e-07,
296
+ "logits/chosen": -2.0659067630767822,
297
+ "logits/rejected": -2.0179924964904785,
298
+ "logps/chosen": -446.5098571777344,
299
+ "logps/rejected": -408.96685791015625,
300
+ "loss": 0.4605,
301
  "rewards/accuracies": 0.768750011920929,
302
+ "rewards/chosen": -1.1547313928604126,
303
+ "rewards/margins": 1.099097490310669,
304
+ "rewards/rejected": -2.253828525543213,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.23,
309
+ "learning_rate": 4.7627410443782887e-07,
310
+ "logits/chosen": -1.9613704681396484,
311
+ "logits/rejected": -1.9336235523223877,
312
+ "logps/chosen": -434.38311767578125,
313
+ "logps/rejected": -421.72308349609375,
314
+ "loss": 0.4524,
315
+ "rewards/accuracies": 0.78125,
316
+ "rewards/chosen": -1.2233312129974365,
317
+ "rewards/margins": 1.0944594144821167,
318
+ "rewards/rejected": -2.3177905082702637,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.23,
323
+ "eval_logits/chosen": -1.9878398180007935,
324
+ "eval_logits/rejected": -1.9677612781524658,
325
+ "eval_logps/chosen": -405.3453674316406,
326
+ "eval_logps/rejected": -474.3326721191406,
327
+ "eval_loss": 0.5474696755409241,
328
+ "eval_rewards/accuracies": 0.72265625,
329
+ "eval_rewards/chosen": -1.4830571413040161,
330
+ "eval_rewards/margins": 0.6867368221282959,
331
+ "eval_rewards/rejected": -2.1697940826416016,
332
+ "eval_runtime": 53.0465,
333
+ "eval_samples_per_second": 37.703,
334
+ "eval_steps_per_second": 0.603,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.24,
339
+ "learning_rate": 4.7189210755018034e-07,
340
+ "logits/chosen": -1.916168212890625,
341
+ "logits/rejected": -1.849001169204712,
342
+ "logps/chosen": -497.56134033203125,
343
+ "logps/rejected": -451.7841796875,
344
+ "loss": 0.4423,
345
+ "rewards/accuracies": 0.800000011920929,
346
+ "rewards/chosen": -1.2633593082427979,
347
+ "rewards/margins": 1.236897587776184,
348
+ "rewards/rejected": -2.5002567768096924,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.25,
353
+ "learning_rate": 4.671627793504988e-07,
354
+ "logits/chosen": -1.965778112411499,
355
+ "logits/rejected": -1.8829681873321533,
356
+ "logps/chosen": -516.19921875,
357
+ "logps/rejected": -489.0526428222656,
358
+ "loss": 0.4306,
359
+ "rewards/accuracies": 0.7749999761581421,
360
+ "rewards/chosen": -1.368606686592102,
361
+ "rewards/margins": 1.4575475454330444,
362
+ "rewards/rejected": -2.8261542320251465,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.26,
367
+ "learning_rate": 4.6209352273286095e-07,
368
+ "logits/chosen": -1.8527837991714478,
369
+ "logits/rejected": -1.7781047821044922,
370
+ "logps/chosen": -492.2167053222656,
371
+ "logps/rejected": -515.4146728515625,
372
+ "loss": 0.4315,
373
+ "rewards/accuracies": 0.800000011920929,
374
+ "rewards/chosen": -1.5558216571807861,
375
+ "rewards/margins": 1.2494769096374512,
376
+ "rewards/rejected": -2.8052985668182373,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.27,
381
+ "learning_rate": 4.56692272686805e-07,
382
+ "logits/chosen": -1.8593418598175049,
383
+ "logits/rejected": -1.7763780355453491,
384
+ "logps/chosen": -473.20245361328125,
385
+ "logps/rejected": -463.26849365234375,
386
+ "loss": 0.4462,
387
+ "rewards/accuracies": 0.793749988079071,
388
+ "rewards/chosen": -1.5299947261810303,
389
+ "rewards/margins": 1.291903018951416,
390
+ "rewards/rejected": -2.8218979835510254,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.28,
395
+ "learning_rate": 4.5096748387656326e-07,
396
+ "logits/chosen": -1.6604913473129272,
397
+ "logits/rejected": -1.5300872325897217,
398
+ "logps/chosen": -527.0318603515625,
399
+ "logps/rejected": -502.64129638671875,
400
+ "loss": 0.4618,
401
+ "rewards/accuracies": 0.75,
402
+ "rewards/chosen": -2.062129497528076,
403
+ "rewards/margins": 1.1275193691253662,
404
+ "rewards/rejected": -3.1896486282348633,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.29,
409
+ "learning_rate": 4.4492811740683877e-07,
410
+ "logits/chosen": -1.5592234134674072,
411
+ "logits/rejected": -1.3744081258773804,
412
+ "logps/chosen": -491.737548828125,
413
+ "logps/rejected": -486.6441345214844,
414
+ "loss": 0.4473,
415
  "rewards/accuracies": 0.800000011920929,
416
+ "rewards/chosen": -1.907570481300354,
417
+ "rewards/margins": 1.1632691621780396,
418
+ "rewards/rejected": -3.0708391666412354,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.31,
423
+ "learning_rate": 4.3858362679584354e-07,
424
+ "logits/chosen": -1.5746996402740479,
425
+ "logits/rejected": -1.2380870580673218,
426
+ "logps/chosen": -457.90753173828125,
427
+ "logps/rejected": -446.56683349609375,
428
+ "loss": 0.4103,
429
+ "rewards/accuracies": 0.887499988079071,
430
+ "rewards/chosen": -1.1260545253753662,
431
+ "rewards/margins": 1.6088136434555054,
432
+ "rewards/rejected": -2.734868288040161,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.32,
437
+ "learning_rate": 4.3194394317755245e-07,
438
+ "logits/chosen": -1.3573920726776123,
439
+ "logits/rejected": -1.0481122732162476,
440
+ "logps/chosen": -512.2153930664062,
441
+ "logps/rejected": -469.4527893066406,
442
+ "loss": 0.4381,
443
+ "rewards/accuracies": 0.768750011920929,
444
+ "rewards/chosen": -1.6660184860229492,
445
+ "rewards/margins": 1.3182036876678467,
446
+ "rewards/rejected": -2.984222173690796,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.33,
451
+ "learning_rate": 4.2501945975633914e-07,
452
+ "logits/chosen": -1.5231261253356934,
453
+ "logits/rejected": -1.2471725940704346,
454
+ "logps/chosen": -508.29248046875,
455
+ "logps/rejected": -447.50689697265625,
456
+ "loss": 0.4364,
457
+ "rewards/accuracies": 0.768750011920929,
458
+ "rewards/chosen": -1.4383156299591064,
459
+ "rewards/margins": 1.2735927104949951,
460
+ "rewards/rejected": -2.7119078636169434,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.34,
465
+ "learning_rate": 4.1782101553832405e-07,
466
+ "logits/chosen": -1.4166069030761719,
467
+ "logits/rejected": -1.1425375938415527,
468
+ "logps/chosen": -467.41717529296875,
469
+ "logps/rejected": -439.3959045410156,
470
+ "loss": 0.3976,
471
+ "rewards/accuracies": 0.78125,
472
+ "rewards/chosen": -1.5179073810577393,
473
+ "rewards/margins": 1.1478455066680908,
474
+ "rewards/rejected": -2.665753126144409,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.34,
479
+ "eval_logits/chosen": -1.18406081199646,
480
+ "eval_logits/rejected": -0.9782991409301758,
481
+ "eval_logps/chosen": -442.4473571777344,
482
+ "eval_logps/rejected": -545.2501220703125,
483
+ "eval_loss": 0.5194380879402161,
484
+ "eval_rewards/accuracies": 0.76171875,
485
+ "eval_rewards/chosen": -1.8540773391723633,
486
+ "eval_rewards/margins": 1.0248912572860718,
487
+ "eval_rewards/rejected": -2.8789682388305664,
488
+ "eval_runtime": 53.0005,
489
+ "eval_samples_per_second": 37.736,
490
+ "eval_steps_per_second": 0.604,
491
  "step": 300
492
  },
493
  {
494
+ "epoch": 0.35,
495
+ "learning_rate": 4.103598783649029e-07,
496
+ "logits/chosen": -1.0781385898590088,
497
+ "logits/rejected": -0.6068095564842224,
498
+ "logps/chosen": -542.6256713867188,
499
+ "logps/rejected": -505.87078857421875,
500
+ "loss": 0.4248,
501
+ "rewards/accuracies": 0.78125,
502
+ "rewards/chosen": -1.779624342918396,
503
+ "rewards/margins": 1.5406283140182495,
504
+ "rewards/rejected": -3.3202528953552246,
505
  "step": 310
506
  },
507
  {
508
+ "epoch": 0.36,
509
+ "learning_rate": 4.026477272750119e-07,
510
+ "logits/chosen": -0.7725287079811096,
511
+ "logits/rejected": -0.2756701111793518,
512
+ "logps/chosen": -545.5137329101562,
513
+ "logps/rejected": -528.4269409179688,
514
+ "loss": 0.4226,
515
+ "rewards/accuracies": 0.7749999761581421,
516
+ "rewards/chosen": -2.142789602279663,
517
+ "rewards/margins": 1.3576524257659912,
518
+ "rewards/rejected": -3.500441789627075,
519
  "step": 320
520
  },
521
  {
522
+ "epoch": 0.37,
523
+ "learning_rate": 3.9469663422373864e-07,
524
+ "logits/chosen": -0.9761560559272766,
525
+ "logits/rejected": -0.6311030983924866,
526
+ "logps/chosen": -517.2960205078125,
527
+ "logps/rejected": -506.86328125,
528
+ "loss": 0.4432,
529
+ "rewards/accuracies": 0.7749999761581421,
530
+ "rewards/chosen": -1.8108386993408203,
531
+ "rewards/margins": 1.37090265750885,
532
+ "rewards/rejected": -3.181741237640381,
533
  "step": 330
534
  },
535
+ {
536
+ "epoch": 0.38,
537
+ "learning_rate": 3.865190451858954e-07,
538
+ "logits/chosen": -0.865078330039978,
539
+ "logits/rejected": -0.3488244414329529,
540
+ "logps/chosen": -540.340087890625,
541
+ "logps/rejected": -525.5319213867188,
542
+ "loss": 0.43,
543
+ "rewards/accuracies": 0.793749988079071,
544
+ "rewards/chosen": -1.597611904144287,
545
+ "rewards/margins": 1.6154896020889282,
546
+ "rewards/rejected": -3.213101625442505,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.4,
551
+ "learning_rate": 3.781277606741327e-07,
552
+ "logits/chosen": -1.0114878416061401,
553
+ "logits/rejected": -0.7175018191337585,
554
+ "logps/chosen": -450.4183654785156,
555
+ "logps/rejected": -459.7533264160156,
556
+ "loss": 0.4271,
557
+ "rewards/accuracies": 0.737500011920929,
558
+ "rewards/chosen": -1.427380919456482,
559
+ "rewards/margins": 1.2692419290542603,
560
+ "rewards/rejected": -2.696622610092163,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.41,
565
+ "learning_rate": 3.6953591570208996e-07,
566
+ "logits/chosen": -1.2963850498199463,
567
+ "logits/rejected": -0.8218928575515747,
568
+ "logps/chosen": -540.1664428710938,
569
+ "logps/rejected": -555.89306640625,
570
+ "loss": 0.4147,
571
+ "rewards/accuracies": 0.8500000238418579,
572
+ "rewards/chosen": -1.701040506362915,
573
+ "rewards/margins": 1.8265488147735596,
574
+ "rewards/rejected": -3.5275893211364746,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.42,
579
+ "learning_rate": 3.607569592239452e-07,
580
+ "logits/chosen": -1.0880775451660156,
581
+ "logits/rejected": -0.6546664237976074,
582
+ "logps/chosen": -559.7450561523438,
583
+ "logps/rejected": -538.1546630859375,
584
+ "loss": 0.4192,
585
+ "rewards/accuracies": 0.793749988079071,
586
+ "rewards/chosen": -1.7286078929901123,
587
+ "rewards/margins": 1.7532542943954468,
588
+ "rewards/rejected": -3.4818618297576904,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.43,
593
+ "learning_rate": 3.518046330825494e-07,
594
+ "logits/chosen": -1.1186842918395996,
595
+ "logits/rejected": -0.6067591905593872,
596
+ "logps/chosen": -560.1696166992188,
597
+ "logps/rejected": -522.5840454101562,
598
+ "loss": 0.4349,
599
+ "rewards/accuracies": 0.831250011920929,
600
+ "rewards/chosen": -1.8213374614715576,
601
+ "rewards/margins": 1.5133308172225952,
602
+ "rewards/rejected": -3.3346683979034424,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.44,
607
+ "learning_rate": 3.4269295049909713e-07,
608
+ "logits/chosen": -1.1209189891815186,
609
+ "logits/rejected": -0.7713836431503296,
610
+ "logps/chosen": -473.28759765625,
611
+ "logps/rejected": -484.11065673828125,
612
+ "loss": 0.3979,
613
+ "rewards/accuracies": 0.8187500238418579,
614
+ "rewards/chosen": -1.7088820934295654,
615
+ "rewards/margins": 1.425378441810608,
616
+ "rewards/rejected": -3.1342601776123047,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.45,
621
+ "learning_rate": 3.3343617413800453e-07,
622
+ "logits/chosen": -1.1869983673095703,
623
+ "logits/rejected": -0.6728812456130981,
624
+ "logps/chosen": -529.2347412109375,
625
+ "logps/rejected": -498.1748962402344,
626
+ "loss": 0.3892,
627
+ "rewards/accuracies": 0.8374999761581421,
628
+ "rewards/chosen": -1.649171233177185,
629
+ "rewards/margins": 1.6641887426376343,
630
+ "rewards/rejected": -3.3133597373962402,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.45,
635
+ "eval_logits/chosen": -0.8579260110855103,
636
+ "eval_logits/rejected": -0.6001935601234436,
637
+ "eval_logps/chosen": -464.98876953125,
638
+ "eval_logps/rejected": -575.0087280273438,
639
+ "eval_loss": 0.5159767866134644,
640
+ "eval_rewards/accuracies": 0.77734375,
641
+ "eval_rewards/chosen": -2.079491376876831,
642
+ "eval_rewards/margins": 1.0970630645751953,
643
+ "eval_rewards/rejected": -3.1765542030334473,
644
+ "eval_runtime": 53.0852,
645
+ "eval_samples_per_second": 37.675,
646
+ "eval_steps_per_second": 0.603,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.46,
651
+ "learning_rate": 3.2404879378132893e-07,
652
+ "logits/chosen": -0.8699030876159668,
653
+ "logits/rejected": -0.48875007033348083,
654
+ "logps/chosen": -468.9755859375,
655
+ "logps/rejected": -495.357421875,
656
+ "loss": 0.4084,
657
+ "rewards/accuracies": 0.8187500238418579,
658
+ "rewards/chosen": -1.6207454204559326,
659
+ "rewards/margins": 1.6607239246368408,
660
+ "rewards/rejected": -3.2814698219299316,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.48,
665
+ "learning_rate": 3.1454550364767894e-07,
666
+ "logits/chosen": -1.098257303237915,
667
+ "logits/rejected": -0.709359347820282,
668
+ "logps/chosen": -512.3826904296875,
669
+ "logps/rejected": -523.95458984375,
670
+ "loss": 0.4354,
671
+ "rewards/accuracies": 0.7562500238418579,
672
+ "rewards/chosen": -1.7999498844146729,
673
+ "rewards/margins": 1.4373884201049805,
674
+ "rewards/rejected": -3.2373383045196533,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.49,
679
+ "learning_rate": 3.049411793911154e-07,
680
+ "logits/chosen": -0.9810858964920044,
681
+ "logits/rejected": -0.6282259821891785,
682
+ "logps/chosen": -509.84368896484375,
683
+ "logps/rejected": -517.60107421875,
684
+ "loss": 0.3974,
685
+ "rewards/accuracies": 0.8062499761581421,
686
+ "rewards/chosen": -1.8548433780670166,
687
+ "rewards/margins": 1.386967420578003,
688
+ "rewards/rejected": -3.2418110370635986,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.5,
693
+ "learning_rate": 2.9525085481604914e-07,
694
+ "logits/chosen": -0.6511877775192261,
695
+ "logits/rejected": -0.07081355899572372,
696
+ "logps/chosen": -509.661376953125,
697
+ "logps/rejected": -524.3201904296875,
698
+ "loss": 0.4151,
699
+ "rewards/accuracies": 0.793749988079071,
700
+ "rewards/chosen": -1.8540523052215576,
701
+ "rewards/margins": 1.5340583324432373,
702
+ "rewards/rejected": -3.388110399246216,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.51,
707
+ "learning_rate": 2.854896983445833e-07,
708
+ "logits/chosen": -0.5572197437286377,
709
+ "logits/rejected": 0.0708194151520729,
710
+ "logps/chosen": -562.8184814453125,
711
+ "logps/rejected": -528.6909790039062,
712
+ "loss": 0.4329,
713
+ "rewards/accuracies": 0.7749999761581421,
714
+ "rewards/chosen": -1.886749267578125,
715
+ "rewards/margins": 1.5286136865615845,
716
+ "rewards/rejected": -3.415362596511841,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.52,
721
+ "learning_rate": 2.7567298927313654e-07,
722
+ "logits/chosen": -0.8817178606987,
723
+ "logits/rejected": -0.6781443357467651,
724
+ "logps/chosen": -470.3621520996094,
725
+ "logps/rejected": -495.33111572265625,
726
+ "loss": 0.4137,
727
+ "rewards/accuracies": 0.7749999761581421,
728
+ "rewards/chosen": -1.4939197301864624,
729
+ "rewards/margins": 1.3965364694595337,
730
+ "rewards/rejected": -2.890456199645996,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.53,
735
+ "learning_rate": 2.658160938555123e-07,
736
+ "logits/chosen": -0.900059700012207,
737
+ "logits/rejected": -0.38037875294685364,
738
+ "logps/chosen": -530.0759887695312,
739
+ "logps/rejected": -549.6453857421875,
740
+ "loss": 0.3727,
741
+ "rewards/accuracies": 0.8187500238418579,
742
+ "rewards/chosen": -1.6465427875518799,
743
+ "rewards/margins": 1.722328543663025,
744
+ "rewards/rejected": -3.3688716888427734,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.54,
749
+ "learning_rate": 2.559344412498532e-07,
750
+ "logits/chosen": -0.5834644436836243,
751
+ "logits/rejected": -0.024540895596146584,
752
+ "logps/chosen": -526.4287719726562,
753
+ "logps/rejected": -525.1407470703125,
754
+ "loss": 0.4301,
755
+ "rewards/accuracies": 0.800000011920929,
756
+ "rewards/chosen": -1.8217204809188843,
757
+ "rewards/margins": 1.5024107694625854,
758
+ "rewards/rejected": -3.3241310119628906,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.55,
763
+ "learning_rate": 2.460434993671294e-07,
764
+ "logits/chosen": -0.999637246131897,
765
+ "logits/rejected": -0.7088354825973511,
766
+ "logps/chosen": -467.12353515625,
767
+ "logps/rejected": -472.5782775878906,
768
+ "loss": 0.3968,
769
+ "rewards/accuracies": 0.7875000238418579,
770
+ "rewards/chosen": -1.5620094537734985,
771
+ "rewards/margins": 1.4622641801834106,
772
+ "rewards/rejected": -3.02427339553833,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.57,
777
+ "learning_rate": 2.361587506589672e-07,
778
+ "logits/chosen": -1.169862151145935,
779
+ "logits/rejected": -0.6735583543777466,
780
+ "logps/chosen": -547.8793334960938,
781
+ "logps/rejected": -530.5782470703125,
782
+ "loss": 0.3964,
783
+ "rewards/accuracies": 0.824999988079071,
784
+ "rewards/chosen": -1.728243112564087,
785
+ "rewards/margins": 1.6642783880233765,
786
+ "rewards/rejected": -3.392521381378174,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.57,
791
+ "eval_logits/chosen": -1.018913984298706,
792
+ "eval_logits/rejected": -0.8011811375617981,
793
+ "eval_logps/chosen": -476.00384521484375,
794
+ "eval_logps/rejected": -588.1665649414062,
795
+ "eval_loss": 0.49919986724853516,
796
+ "eval_rewards/accuracies": 0.765625,
797
+ "eval_rewards/chosen": -2.1896419525146484,
798
+ "eval_rewards/margins": 1.11849045753479,
799
+ "eval_rewards/rejected": -3.3081324100494385,
800
+ "eval_runtime": 73.4341,
801
+ "eval_samples_per_second": 27.235,
802
+ "eval_steps_per_second": 0.436,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.58,
807
+ "learning_rate": 2.2629566788271613e-07,
808
+ "logits/chosen": -1.1643812656402588,
809
+ "logits/rejected": -0.6770884394645691,
810
+ "logps/chosen": -498.718994140625,
811
+ "logps/rejected": -513.8646240234375,
812
+ "loss": 0.4072,
813
+ "rewards/accuracies": 0.8374999761581421,
814
+ "rewards/chosen": -1.7343899011611938,
815
+ "rewards/margins": 1.711033582687378,
816
+ "rewards/rejected": -3.4454236030578613,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.59,
821
+ "learning_rate": 2.1646968988169135e-07,
822
+ "logits/chosen": -1.2519400119781494,
823
+ "logits/rejected": -0.7656970620155334,
824
+ "logps/chosen": -552.4429931640625,
825
+ "logps/rejected": -580.3065185546875,
826
+ "loss": 0.3859,
827
+ "rewards/accuracies": 0.8187500238418579,
828
+ "rewards/chosen": -1.9031795263290405,
829
+ "rewards/margins": 1.720510721206665,
830
+ "rewards/rejected": -3.623690366744995,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.6,
835
+ "learning_rate": 2.0669619741850232e-07,
836
+ "logits/chosen": -1.166473388671875,
837
+ "logits/rejected": -0.5304248929023743,
838
+ "logps/chosen": -543.8204345703125,
839
+ "logps/rejected": -517.4561767578125,
840
+ "loss": 0.4265,
841
+ "rewards/accuracies": 0.737500011920929,
842
+ "rewards/chosen": -1.9481357336044312,
843
+ "rewards/margins": 1.5012562274932861,
844
+ "rewards/rejected": -3.4493918418884277,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.61,
849
+ "learning_rate": 1.9699048909929518e-07,
850
+ "logits/chosen": -1.3502863645553589,
851
+ "logits/rejected": -0.972245991230011,
852
+ "logps/chosen": -513.7689819335938,
853
+ "logps/rejected": -506.95953369140625,
854
+ "loss": 0.3917,
855
+ "rewards/accuracies": 0.856249988079071,
856
+ "rewards/chosen": -1.8054568767547607,
857
+ "rewards/margins": 1.3885786533355713,
858
+ "rewards/rejected": -3.194035291671753,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.62,
863
+ "learning_rate": 1.8736775742659732e-07,
864
+ "logits/chosen": -1.1914501190185547,
865
+ "logits/rejected": -0.8519012331962585,
866
+ "logps/chosen": -489.40234375,
867
+ "logps/rejected": -517.2066040039062,
868
+ "loss": 0.3892,
869
+ "rewards/accuracies": 0.862500011920929,
870
+ "rewards/chosen": -1.6697397232055664,
871
+ "rewards/margins": 1.5511436462402344,
872
+ "rewards/rejected": -3.2208831310272217,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.63,
877
+ "learning_rate": 1.7784306501824616e-07,
878
+ "logits/chosen": -1.1693607568740845,
879
+ "logits/rejected": -0.501569926738739,
880
+ "logps/chosen": -549.26220703125,
881
+ "logps/rejected": -523.9554443359375,
882
+ "loss": 0.4249,
883
+ "rewards/accuracies": 0.7875000238418579,
884
+ "rewards/chosen": -1.8511940240859985,
885
+ "rewards/margins": 1.4909955263137817,
886
+ "rewards/rejected": -3.342189311981201,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.65,
891
+ "learning_rate": 1.6843132102963025e-07,
892
+ "logits/chosen": -1.1927831172943115,
893
+ "logits/rejected": -0.8532694578170776,
894
+ "logps/chosen": -539.3836669921875,
895
+ "logps/rejected": -510.72637939453125,
896
+ "loss": 0.3897,
897
+ "rewards/accuracies": 0.8125,
898
+ "rewards/chosen": -1.6676031351089478,
899
+ "rewards/margins": 1.5155996084213257,
900
+ "rewards/rejected": -3.1832027435302734,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.66,
905
+ "learning_rate": 1.591472578161458e-07,
906
+ "logits/chosen": -1.3109443187713623,
907
+ "logits/rejected": -0.9485646486282349,
908
+ "logps/chosen": -494.5043029785156,
909
+ "logps/rejected": -489.75537109375,
910
+ "loss": 0.4009,
911
+ "rewards/accuracies": 0.7875000238418579,
912
+ "rewards/chosen": -1.5513614416122437,
913
+ "rewards/margins": 1.510770559310913,
914
+ "rewards/rejected": -3.062131881713867,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.67,
919
+ "learning_rate": 1.5000540787240274e-07,
920
+ "logits/chosen": -1.2452589273452759,
921
+ "logits/rejected": -0.857632040977478,
922
+ "logps/chosen": -504.5924377441406,
923
+ "logps/rejected": -519.4934692382812,
924
+ "loss": 0.3993,
925
+ "rewards/accuracies": 0.8062499761581421,
926
+ "rewards/chosen": -1.7807199954986572,
927
+ "rewards/margins": 1.571396827697754,
928
+ "rewards/rejected": -3.352116823196411,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.68,
933
+ "learning_rate": 1.410200810842749e-07,
934
+ "logits/chosen": -1.2575485706329346,
935
+ "logits/rejected": -0.8479830622673035,
936
+ "logps/chosen": -503.79388427734375,
937
+ "logps/rejected": -516.9588012695312,
938
+ "loss": 0.4149,
939
+ "rewards/accuracies": 0.824999988079071,
940
+ "rewards/chosen": -1.7127326726913452,
941
+ "rewards/margins": 1.6169878244400024,
942
+ "rewards/rejected": -3.3297202587127686,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.68,
947
+ "eval_logits/chosen": -1.2397898435592651,
948
+ "eval_logits/rejected": -1.0526514053344727,
949
+ "eval_logps/chosen": -477.6524658203125,
950
+ "eval_logps/rejected": -589.7600708007812,
951
+ "eval_loss": 0.4948367774486542,
952
+ "eval_rewards/accuracies": 0.74609375,
953
+ "eval_rewards/chosen": -2.2061285972595215,
954
+ "eval_rewards/margins": 1.1179393529891968,
955
+ "eval_rewards/rejected": -3.3240678310394287,
956
+ "eval_runtime": 53.1159,
957
+ "eval_samples_per_second": 37.654,
958
+ "eval_steps_per_second": 0.602,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.69,
963
+ "learning_rate": 1.322053423294041e-07,
964
+ "logits/chosen": -1.256247639656067,
965
+ "logits/rejected": -0.9272082448005676,
966
+ "logps/chosen": -501.60675048828125,
967
+ "logps/rejected": -531.3302001953125,
968
+ "loss": 0.4028,
969
+ "rewards/accuracies": 0.831250011920929,
970
+ "rewards/chosen": -1.7432218790054321,
971
+ "rewards/margins": 1.7136541604995728,
972
+ "rewards/rejected": -3.456876039505005,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.7,
977
+ "learning_rate": 1.2357498946121905e-07,
978
+ "logits/chosen": -1.3026126623153687,
979
+ "logits/rejected": -0.9675828218460083,
980
+ "logps/chosen": -534.3182373046875,
981
+ "logps/rejected": -527.3935546875,
982
+ "loss": 0.4187,
983
+ "rewards/accuracies": 0.824999988079071,
984
+ "rewards/chosen": -1.8490867614746094,
985
+ "rewards/margins": 1.5810914039611816,
986
+ "rewards/rejected": -3.430178165435791,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.71,
991
+ "learning_rate": 1.1514253171093161e-07,
992
+ "logits/chosen": -1.2203739881515503,
993
+ "logits/rejected": -0.7822047472000122,
994
+ "logps/chosen": -493.72821044921875,
995
+ "logps/rejected": -506.88690185546875,
996
+ "loss": 0.4051,
997
+ "rewards/accuracies": 0.8062499761581421,
998
+ "rewards/chosen": -1.6152998208999634,
999
+ "rewards/margins": 1.5858867168426514,
1000
+ "rewards/rejected": -3.201186418533325,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.72,
1005
+ "learning_rate": 1.0692116854131883e-07,
1006
+ "logits/chosen": -1.000585913658142,
1007
+ "logits/rejected": -0.7414053678512573,
1008
+ "logps/chosen": -493.9928283691406,
1009
+ "logps/rejected": -534.2704467773438,
1010
+ "loss": 0.3866,
1011
+ "rewards/accuracies": 0.824999988079071,
1012
+ "rewards/chosen": -1.7848412990570068,
1013
+ "rewards/margins": 1.534330129623413,
1014
+ "rewards/rejected": -3.319171905517578,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.74,
1019
+ "learning_rate": 9.89237689853889e-08,
1020
+ "logits/chosen": -0.9636529684066772,
1021
+ "logits/rejected": -0.6193439364433289,
1022
+ "logps/chosen": -499.71234130859375,
1023
+ "logps/rejected": -517.5823974609375,
1024
+ "loss": 0.394,
1025
+ "rewards/accuracies": 0.831250011920929,
1026
+ "rewards/chosen": -1.829134225845337,
1027
+ "rewards/margins": 1.6988353729248047,
1028
+ "rewards/rejected": -3.5279693603515625,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.75,
1033
+ "learning_rate": 9.11628515022765e-08,
1034
+ "logits/chosen": -1.0789777040481567,
1035
+ "logits/rejected": -0.6399408578872681,
1036
+ "logps/chosen": -513.3380737304688,
1037
+ "logps/rejected": -544.7978515625,
1038
+ "loss": 0.3623,
1039
+ "rewards/accuracies": 0.875,
1040
+ "rewards/chosen": -1.7524398565292358,
1041
+ "rewards/margins": 1.8130983114242554,
1042
+ "rewards/rejected": -3.565537929534912,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.76,
1047
+ "learning_rate": 8.365056438189486e-08,
1048
+ "logits/chosen": -1.0069674253463745,
1049
+ "logits/rejected": -0.5994616746902466,
1050
+ "logps/chosen": -542.05712890625,
1051
+ "logps/rejected": -564.6227416992188,
1052
+ "loss": 0.4122,
1053
+ "rewards/accuracies": 0.800000011920929,
1054
+ "rewards/chosen": -1.9871015548706055,
1055
+ "rewards/margins": 1.6281112432479858,
1056
+ "rewards/rejected": -3.6152126789093018,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.77,
1061
+ "learning_rate": 7.639866672902101e-08,
1062
+ "logits/chosen": -1.0949068069458008,
1063
+ "logits/rejected": -0.7090824246406555,
1064
+ "logps/chosen": -549.8911743164062,
1065
+ "logps/rejected": -559.15771484375,
1066
+ "loss": 0.4132,
1067
+ "rewards/accuracies": 0.8374999761581421,
1068
+ "rewards/chosen": -1.8535239696502686,
1069
+ "rewards/margins": 1.733758568763733,
1070
+ "rewards/rejected": -3.587282657623291,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.78,
1075
+ "learning_rate": 6.941851005657851e-08,
1076
+ "logits/chosen": -1.1339385509490967,
1077
+ "logits/rejected": -0.738599419593811,
1078
+ "logps/chosen": -494.1913146972656,
1079
+ "logps/rejected": -504.7791442871094,
1080
+ "loss": 0.3813,
1081
+ "rewards/accuracies": 0.84375,
1082
+ "rewards/chosen": -1.752722978591919,
1083
+ "rewards/margins": 1.4443397521972656,
1084
+ "rewards/rejected": -3.1970624923706055,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.79,
1089
+ "learning_rate": 6.272102051693051e-08,
1090
+ "logits/chosen": -1.2199567556381226,
1091
+ "logits/rejected": -0.9412355422973633,
1092
+ "logps/chosen": -552.1275024414062,
1093
+ "logps/rejected": -515.4296264648438,
1094
+ "loss": 0.4004,
1095
+ "rewards/accuracies": 0.793749988079071,
1096
+ "rewards/chosen": -1.7129371166229248,
1097
+ "rewards/margins": 1.4609147310256958,
1098
+ "rewards/rejected": -3.1738522052764893,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.79,
1103
+ "eval_logits/chosen": -1.0643391609191895,
1104
+ "eval_logits/rejected": -0.8519161343574524,
1105
+ "eval_logps/chosen": -474.26617431640625,
1106
+ "eval_logps/rejected": -593.8731079101562,
1107
+ "eval_loss": 0.49052032828330994,
1108
+ "eval_rewards/accuracies": 0.76953125,
1109
+ "eval_rewards/chosen": -2.1722652912139893,
1110
+ "eval_rewards/margins": 1.192933201789856,
1111
+ "eval_rewards/rejected": -3.3651983737945557,
1112
+ "eval_runtime": 53.0717,
1113
+ "eval_samples_per_second": 37.685,
1114
+ "eval_steps_per_second": 0.603,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.8,
1119
+ "learning_rate": 5.6316681798995844e-08,
1120
+ "logits/chosen": -1.0180628299713135,
1121
+ "logits/rejected": -0.7236673831939697,
1122
+ "logps/chosen": -491.35565185546875,
1123
+ "logps/rejected": -525.1397705078125,
1124
+ "loss": 0.3851,
1125
+ "rewards/accuracies": 0.824999988079071,
1126
+ "rewards/chosen": -1.7499040365219116,
1127
+ "rewards/margins": 1.7332220077514648,
1128
+ "rewards/rejected": -3.483126163482666,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.81,
1133
+ "learning_rate": 5.0215518717961256e-08,
1134
+ "logits/chosen": -1.0655405521392822,
1135
+ "logits/rejected": -0.6608148813247681,
1136
+ "logps/chosen": -525.560302734375,
1137
+ "logps/rejected": -529.1053466796875,
1138
+ "loss": 0.3984,
1139
+ "rewards/accuracies": 0.7875000238418579,
1140
+ "rewards/chosen": -1.8151214122772217,
1141
+ "rewards/margins": 1.7203428745269775,
1142
+ "rewards/rejected": -3.53546404838562,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.83,
1147
+ "learning_rate": 4.4427081523275925e-08,
1148
+ "logits/chosen": -1.0117073059082031,
1149
+ "logits/rejected": -0.715721607208252,
1150
+ "logps/chosen": -504.0994567871094,
1151
+ "logps/rejected": -539.0814208984375,
1152
+ "loss": 0.3756,
1153
+ "rewards/accuracies": 0.800000011920929,
1154
+ "rewards/chosen": -1.8760732412338257,
1155
+ "rewards/margins": 1.5654491186141968,
1156
+ "rewards/rejected": -3.4415221214294434,
1157
+ "step": 730
1158
+ },
1159
  {
1160
  "epoch": 0.84,
1161
+ "learning_rate": 3.896043094949061e-08,
1162
+ "logits/chosen": -1.1520367860794067,
1163
+ "logits/rejected": -0.5986729860305786,
1164
+ "logps/chosen": -532.8388061523438,
1165
+ "logps/rejected": -558.7303466796875,
1166
+ "loss": 0.4003,
1167
+ "rewards/accuracies": 0.831250011920929,
1168
+ "rewards/chosen": -1.8816320896148682,
1169
+ "rewards/margins": 1.7765041589736938,
1170
+ "rewards/rejected": -3.6581363677978516,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.85,
1175
+ "learning_rate": 3.3824124033343557e-08,
1176
+ "logits/chosen": -0.8991321325302124,
1177
+ "logits/rejected": -0.6385317444801331,
1178
+ "logps/chosen": -567.1549072265625,
1179
+ "logps/rejected": -579.142578125,
1180
+ "loss": 0.3994,
1181
  "rewards/accuracies": 0.7562500238418579,
1182
+ "rewards/chosen": -2.1505260467529297,
1183
+ "rewards/margins": 1.6301181316375732,
1184
+ "rewards/rejected": -3.780644178390503,
1185
+ "step": 750
1186
  },
1187
  {
1188
  "epoch": 0.86,
1189
+ "learning_rate": 2.9026200719291904e-08,
1190
+ "logits/chosen": -1.001379370689392,
1191
+ "logits/rejected": -0.6102081537246704,
1192
+ "logps/chosen": -508.70147705078125,
1193
+ "logps/rejected": -536.8555297851562,
1194
+ "loss": 0.4286,
1195
  "rewards/accuracies": 0.8062499761581421,
1196
+ "rewards/chosen": -1.9450082778930664,
1197
+ "rewards/margins": 1.5907418727874756,
1198
+ "rewards/rejected": -3.535750150680542,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.87,
1203
+ "learning_rate": 2.4574171274456433e-08,
1204
+ "logits/chosen": -1.0912601947784424,
1205
+ "logits/rejected": -0.6700750589370728,
1206
+ "logps/chosen": -519.89208984375,
1207
+ "logps/rejected": -525.0479736328125,
1208
+ "loss": 0.3678,
1209
+ "rewards/accuracies": 0.8125,
1210
+ "rewards/chosen": -1.884690523147583,
1211
+ "rewards/margins": 1.6182489395141602,
1212
+ "rewards/rejected": -3.502938747406006,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.88,
1217
+ "learning_rate": 2.047500453267881e-08,
1218
+ "logits/chosen": -1.0198689699172974,
1219
+ "logits/rejected": -0.5865429043769836,
1220
+ "logps/chosen": -526.2073974609375,
1221
+ "logps/rejected": -554.8323974609375,
1222
+ "loss": 0.3887,
1223
+ "rewards/accuracies": 0.8125,
1224
+ "rewards/chosen": -1.9122520685195923,
1225
+ "rewards/margins": 1.7002776861190796,
1226
+ "rewards/rejected": -3.612529754638672,
1227
+ "step": 780
1228
  },
1229
  {
1230
  "epoch": 0.89,
1231
+ "learning_rate": 1.673511698609292e-08,
1232
+ "logits/chosen": -0.9797528982162476,
1233
+ "logits/rejected": -0.5832753777503967,
1234
+ "logps/chosen": -553.0879516601562,
1235
+ "logps/rejected": -561.693603515625,
1236
+ "loss": 0.3901,
1237
+ "rewards/accuracies": 0.768750011920929,
1238
+ "rewards/chosen": -1.9004099369049072,
1239
+ "rewards/margins": 1.7125294208526611,
1240
+ "rewards/rejected": -3.6129393577575684,
1241
+ "step": 790
1242
  },
1243
  {
1244
  "epoch": 0.91,
1245
+ "learning_rate": 1.3360362741285769e-08,
1246
+ "logits/chosen": -1.0027343034744263,
1247
+ "logits/rejected": -0.7030217051506042,
1248
+ "logps/chosen": -501.4288024902344,
1249
+ "logps/rejected": -526.0103759765625,
1250
+ "loss": 0.3887,
1251
  "rewards/accuracies": 0.8125,
1252
+ "rewards/chosen": -1.869350790977478,
1253
+ "rewards/margins": 1.5730822086334229,
1254
+ "rewards/rejected": -3.4424331188201904,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.91,
1259
+ "eval_logits/chosen": -0.9597108364105225,
1260
+ "eval_logits/rejected": -0.7242004871368408,
1261
+ "eval_logps/chosen": -487.77545166015625,
1262
+ "eval_logps/rejected": -609.3139038085938,
1263
+ "eval_loss": 0.4919503927230835,
1264
+ "eval_rewards/accuracies": 0.7734375,
1265
+ "eval_rewards/chosen": -2.3073582649230957,
1266
+ "eval_rewards/margins": 1.21224844455719,
1267
+ "eval_rewards/rejected": -3.519606590270996,
1268
+ "eval_runtime": 53.0285,
1269
+ "eval_samples_per_second": 37.716,
1270
+ "eval_steps_per_second": 0.603,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 0.92,
1275
+ "learning_rate": 1.0356024355769433e-08,
1276
+ "logits/chosen": -1.0092878341674805,
1277
+ "logits/rejected": -0.7896069884300232,
1278
+ "logps/chosen": -532.9703369140625,
1279
+ "logps/rejected": -526.88330078125,
1280
+ "loss": 0.3767,
1281
+ "rewards/accuracies": 0.78125,
1282
+ "rewards/chosen": -1.830877661705017,
1283
+ "rewards/margins": 1.5391663312911987,
1284
+ "rewards/rejected": -3.370044231414795,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 0.93,
1289
+ "learning_rate": 7.726804569108597e-09,
1290
+ "logits/chosen": -1.1117920875549316,
1291
+ "logits/rejected": -0.6487603187561035,
1292
+ "logps/chosen": -553.5621337890625,
1293
+ "logps/rejected": -571.2651977539062,
1294
+ "loss": 0.4191,
1295
+ "rewards/accuracies": 0.762499988079071,
1296
+ "rewards/chosen": -1.9834985733032227,
1297
+ "rewards/margins": 1.6336091756820679,
1298
+ "rewards/rejected": -3.61710786819458,
1299
+ "step": 820
1300
  },
1301
  {
1302
  "epoch": 0.94,
1303
+ "learning_rate": 5.476818941645561e-09,
1304
+ "logits/chosen": -1.1343705654144287,
1305
+ "logits/rejected": -0.5430102348327637,
1306
+ "logps/chosen": -569.5715942382812,
1307
+ "logps/rejected": -541.0032958984375,
1308
+ "loss": 0.3755,
1309
+ "rewards/accuracies": 0.8125,
1310
+ "rewards/chosen": -1.8141847848892212,
1311
+ "rewards/margins": 1.6208727359771729,
1312
+ "rewards/rejected": -3.4350574016571045,
1313
+ "step": 830
1314
  },
1315
  {
1316
+ "epoch": 0.95,
1317
+ "learning_rate": 3.609589412347347e-09,
1318
+ "logits/chosen": -1.0426546335220337,
1319
+ "logits/rejected": -0.6130795478820801,
1320
+ "logps/chosen": -517.9793701171875,
1321
+ "logps/rejected": -554.4876098632812,
1322
+ "loss": 0.3741,
1323
+ "rewards/accuracies": 0.8500000238418579,
1324
+ "rewards/chosen": -1.7485253810882568,
1325
+ "rewards/margins": 1.8532413244247437,
1326
+ "rewards/rejected": -3.601766586303711,
1327
+ "step": 840
1328
  },
1329
  {
1330
+ "epoch": 0.96,
1331
+ "learning_rate": 2.1280387858572667e-09,
1332
+ "logits/chosen": -0.9971386194229126,
1333
+ "logits/rejected": -0.6877419352531433,
1334
+ "logps/chosen": -496.14208984375,
1335
+ "logps/rejected": -510.2310485839844,
1336
+ "loss": 0.3879,
1337
  "rewards/accuracies": 0.78125,
1338
+ "rewards/chosen": -1.8203623294830322,
1339
+ "rewards/margins": 1.555535078048706,
1340
+ "rewards/rejected": -3.3758976459503174,
1341
+ "step": 850
1342
  },
1343
  {
1344
+ "epoch": 0.97,
1345
+ "learning_rate": 1.03448615738172e-09,
1346
+ "logits/chosen": -1.004620909690857,
1347
+ "logits/rejected": -0.6449930667877197,
1348
+ "logps/chosen": -515.4451904296875,
1349
+ "logps/rejected": -543.5704956054688,
1350
+ "loss": 0.3946,
1351
+ "rewards/accuracies": 0.824999988079071,
1352
+ "rewards/chosen": -1.7996273040771484,
1353
+ "rewards/margins": 1.7972816228866577,
1354
+ "rewards/rejected": -3.5969085693359375,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 0.98,
1359
+ "learning_rate": 3.3064328257259575e-10,
1360
+ "logits/chosen": -1.0538240671157837,
1361
+ "logits/rejected": -0.67207270860672,
1362
+ "logps/chosen": -510.56414794921875,
1363
+ "logps/rejected": -538.0349731445312,
1364
+ "loss": 0.3821,
1365
+ "rewards/accuracies": 0.800000011920929,
1366
+ "rewards/chosen": -1.884305715560913,
1367
+ "rewards/margins": 1.7009254693984985,
1368
+ "rewards/rejected": -3.585231065750122,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 1.0,
1373
+ "learning_rate": 1.7611898088715216e-11,
1374
+ "logits/chosen": -1.063408613204956,
1375
+ "logits/rejected": -0.8093023300170898,
1376
+ "logps/chosen": -536.09716796875,
1377
+ "logps/rejected": -549.3472290039062,
1378
+ "loss": 0.4077,
1379
+ "rewards/accuracies": 0.824999988079071,
1380
+ "rewards/chosen": -1.7760584354400635,
1381
+ "rewards/margins": 1.5979098081588745,
1382
+ "rewards/rejected": -3.3739686012268066,
1383
+ "step": 880
1384
  },
1385
  {
1386
  "epoch": 1.0,
1387
+ "step": 883,
1388
  "total_flos": 0.0,
1389
+ "train_loss": 0.43981607611020046,
1390
+ "train_runtime": 8273.4147,
1391
+ "train_samples_per_second": 13.662,
1392
+ "train_steps_per_second": 0.107
1393
  }
1394
  ],
1395
  "logging_steps": 10,
1396
+ "max_steps": 883,
1397
  "num_train_epochs": 1,
1398
  "save_steps": 100,
1399
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c44f53c9792f03eceacb06a2800acd74827f6b7f87a069e588eb192870cb597
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b78f99d6c1a4671754549b6b5e97ff7d0343ff96a70b1d1ddf812b2a5bc88bf
3
  size 5944