wzhouad commited on
Commit
1edbb80
1 Parent(s): 0514a66

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0914
21
- - Rewards/chosen: -1.4891
22
- - Rewards/rejected: -2.3865
23
- - Rewards/accuracies: 0.7617
24
- - Rewards/margins: 0.8974
25
- - Logps/rejected: -496.0016
26
- - Logps/chosen: -405.9468
27
- - Logits/rejected: -2.1902
28
- - Logits/chosen: -2.2209
29
 
30
  ## Model description
31
 
@@ -47,7 +47,7 @@ The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
- - seed: 4
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
@@ -62,10 +62,10 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.1471 | 0.21 | 100 | 0.1496 | -0.4703 | -0.8675 | 0.7227 | 0.3971 | -344.0998 | -304.0730 | -2.7365 | -2.7549 |
66
- | 0.0982 | 0.42 | 200 | 0.1094 | -0.9801 | -1.6826 | 0.7617 | 0.7025 | -425.6152 | -355.0506 | -2.3640 | -2.3857 |
67
- | 0.0947 | 0.63 | 300 | 0.1038 | -1.2570 | -2.0884 | 0.7656 | 0.8313 | -466.1884 | -382.7410 | -2.2552 | -2.2847 |
68
- | 0.083 | 0.84 | 400 | 0.0914 | -1.4891 | -2.3865 | 0.7617 | 0.8974 | -496.0016 | -405.9468 | -2.1902 | -2.2209 |
69
 
70
 
71
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.0945
21
+ - Rewards/chosen: -1.3600
22
+ - Rewards/rejected: -2.1836
23
+ - Rewards/accuracies: 0.7656
24
+ - Rewards/margins: 0.8237
25
+ - Logps/rejected: -475.7151
26
+ - Logps/chosen: -393.0347
27
+ - Logits/rejected: -2.3019
28
+ - Logits/chosen: -2.3254
29
 
30
  ## Model description
31
 
 
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
+ - seed: 5
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.1643 | 0.21 | 100 | 0.1558 | -0.4076 | -0.7972 | 0.7461 | 0.3896 | -337.0709 | -297.7996 | -2.7691 | -2.7902 |
66
+ | 0.1003 | 0.42 | 200 | 0.0997 | -1.2712 | -1.9340 | 0.7031 | 0.6629 | -450.7552 | -384.1553 | -2.5137 | -2.5340 |
67
+ | 0.0953 | 0.63 | 300 | 0.1024 | -1.2036 | -1.9243 | 0.7539 | 0.7207 | -449.7823 | -377.3981 | -2.3837 | -2.4030 |
68
+ | 0.0811 | 0.84 | 400 | 0.0945 | -1.3600 | -2.1836 | 0.7656 | 0.8237 | -475.7151 | -393.0347 | -2.3019 | -2.3254 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.12798248798777367,
4
- "train_runtime": 3957.5373,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 15.447,
7
  "train_steps_per_second": 0.121
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.13007899894375183,
4
+ "train_runtime": 3956.3918,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 15.452,
7
  "train_steps_per_second": 0.121
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:532f4dcf1f93515a44cf1702d52c84f1e0b9b053fb14ee953199bf86495ce259
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e47517cfb73405f7bf5d01294c3a110798173231e018029481b0de05019a0d5
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bed16ba1c001248f0dcc7895f91cec89c17b901f2b4344d01b470f7ee10fc6d
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d629d1ed6bc30b2268feb5f0fd326e9d78f1f4b1e67ba64737866501c3c6231
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adc159708aba146b36498c4f08620af52db9ed4e0b40a94946232930ccfb890b
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f17dfe6c68b6934688f9caf72473729842e30bb46aab3751edb49af3ac7626
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.12798248798777367,
4
- "train_runtime": 3957.5373,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 15.447,
7
  "train_steps_per_second": 0.121
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.13007899894375183,
4
+ "train_runtime": 3956.3918,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 15.452,
7
  "train_steps_per_second": 0.121
8
  }
trainer_state.json CHANGED
@@ -11,11 +11,11 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.0416666666666666e-08,
14
- "logits/chosen": -2.7386245727539062,
15
- "logits/rejected": -2.7273669242858887,
16
- "logps/chosen": -262.8376159667969,
17
- "logps/rejected": -255.88758850097656,
18
- "loss": 0.2831,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -25,732 +25,732 @@
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-07,
28
- "logits/chosen": -2.741614580154419,
29
- "logits/rejected": -2.735690116882324,
30
- "logps/chosen": -305.9348449707031,
31
- "logps/rejected": -270.5089111328125,
32
- "loss": 0.287,
33
- "rewards/accuracies": 0.4375,
34
- "rewards/chosen": 0.00032112703775055707,
35
- "rewards/margins": 4.9469334044260904e-05,
36
- "rewards/rejected": 0.0002716576855164021,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333333e-07,
42
- "logits/chosen": -2.7983665466308594,
43
- "logits/rejected": -2.778775215148926,
44
- "logps/chosen": -296.0061950683594,
45
- "logps/rejected": -258.1866760253906,
46
  "loss": 0.2856,
47
- "rewards/accuracies": 0.5562499761581421,
48
- "rewards/chosen": 0.0003985298681072891,
49
- "rewards/margins": 0.0015834126388654113,
50
- "rewards/rejected": -0.0011848828289657831,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": -2.8277342319488525,
57
- "logits/rejected": -2.7996597290039062,
58
- "logps/chosen": -300.72467041015625,
59
- "logps/rejected": -259.6136169433594,
60
- "loss": 0.2877,
61
- "rewards/accuracies": 0.675000011920929,
62
- "rewards/chosen": 0.00015763216651976109,
63
- "rewards/margins": 0.007802808191627264,
64
- "rewards/rejected": -0.007645177189260721,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
- "logits/chosen": -2.776419162750244,
71
- "logits/rejected": -2.750056743621826,
72
- "logps/chosen": -257.345458984375,
73
- "logps/rejected": -249.3175048828125,
74
- "loss": 0.2785,
75
- "rewards/accuracies": 0.6625000238418579,
76
- "rewards/chosen": -0.0039919293485581875,
77
- "rewards/margins": 0.02145785465836525,
78
- "rewards/rejected": -0.025449782609939575,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999733114418725e-07,
84
- "logits/chosen": -2.762019395828247,
85
- "logits/rejected": -2.747554063796997,
86
- "logps/chosen": -256.4012451171875,
87
- "logps/rejected": -249.68222045898438,
88
- "loss": 0.2709,
89
- "rewards/accuracies": 0.6812499761581421,
90
- "rewards/chosen": -0.031310662627220154,
91
- "rewards/margins": 0.06291759759187698,
92
- "rewards/rejected": -0.09422825276851654,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.990398100856366e-07,
98
- "logits/chosen": -2.7042574882507324,
99
- "logits/rejected": -2.6715445518493652,
100
- "logps/chosen": -268.1338806152344,
101
- "logps/rejected": -258.9618835449219,
102
- "loss": 0.2565,
103
- "rewards/accuracies": 0.6625000238418579,
104
- "rewards/chosen": -0.11207356303930283,
105
- "rewards/margins": 0.09182411432266235,
106
- "rewards/rejected": -0.20389768481254578,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967775735898179e-07,
112
- "logits/chosen": -2.7274832725524902,
113
- "logits/rejected": -2.7061522006988525,
114
- "logps/chosen": -293.3223876953125,
115
- "logps/rejected": -294.04046630859375,
116
- "loss": 0.2152,
117
- "rewards/accuracies": 0.6875,
118
- "rewards/chosen": -0.19794370234012604,
119
- "rewards/margins": 0.1808309704065323,
120
- "rewards/rejected": -0.37877464294433594,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.931986719649298e-07,
126
- "logits/chosen": -2.758890151977539,
127
- "logits/rejected": -2.7324841022491455,
128
- "logps/chosen": -307.97601318359375,
129
- "logps/rejected": -309.65765380859375,
130
- "loss": 0.1946,
131
- "rewards/accuracies": 0.6187499761581421,
132
- "rewards/chosen": -0.3057263493537903,
133
- "rewards/margins": 0.19875012338161469,
134
- "rewards/rejected": -0.5044764280319214,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.883222001996351e-07,
140
- "logits/chosen": -2.75748348236084,
141
- "logits/rejected": -2.7254340648651123,
142
- "logps/chosen": -290.4283142089844,
143
- "logps/rejected": -291.56689453125,
144
- "loss": 0.1545,
145
- "rewards/accuracies": 0.6625000238418579,
146
- "rewards/chosen": -0.449845552444458,
147
- "rewards/margins": 0.2664111256599426,
148
- "rewards/rejected": -0.7162567377090454,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.821741763807186e-07,
154
- "logits/chosen": -2.703679323196411,
155
- "logits/rejected": -2.7015912532806396,
156
- "logps/chosen": -333.2982482910156,
157
- "logps/rejected": -333.4163818359375,
158
- "loss": 0.1471,
159
- "rewards/accuracies": 0.706250011920929,
160
- "rewards/chosen": -0.5292958617210388,
161
- "rewards/margins": 0.2932383418083191,
162
- "rewards/rejected": -0.8225342631340027,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
- "eval_logits/chosen": -2.754857301712036,
168
- "eval_logits/rejected": -2.736537218093872,
169
- "eval_logps/chosen": -304.0729675292969,
170
- "eval_logps/rejected": -344.09979248046875,
171
- "eval_loss": 0.1496078222990036,
172
- "eval_rewards/accuracies": 0.72265625,
173
- "eval_rewards/chosen": -0.470333069562912,
174
- "eval_rewards/margins": 0.3971319794654846,
175
- "eval_rewards/rejected": -0.867465078830719,
176
- "eval_runtime": 53.5518,
177
- "eval_samples_per_second": 37.347,
178
  "eval_steps_per_second": 0.598,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.747874028753375e-07,
184
- "logits/chosen": -2.740710735321045,
185
- "logits/rejected": -2.704590082168579,
186
- "logps/chosen": -348.2076721191406,
187
- "logps/rejected": -351.06005859375,
188
- "loss": 0.1505,
189
- "rewards/accuracies": 0.71875,
190
- "rewards/chosen": -0.5250994563102722,
191
- "rewards/margins": 0.40582275390625,
192
- "rewards/rejected": -0.9309221506118774,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.662012913161997e-07,
198
- "logits/chosen": -2.645005226135254,
199
- "logits/rejected": -2.6513335704803467,
200
- "logps/chosen": -298.77880859375,
201
- "logps/rejected": -345.088134765625,
202
- "loss": 0.1247,
203
- "rewards/accuracies": 0.6625000238418579,
204
- "rewards/chosen": -0.6757990717887878,
205
- "rewards/margins": 0.4480660557746887,
206
- "rewards/rejected": -1.123865008354187,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5646165232345103e-07,
212
- "logits/chosen": -2.637960910797119,
213
- "logits/rejected": -2.6174166202545166,
214
- "logps/chosen": -308.6398010253906,
215
- "logps/rejected": -348.3229675292969,
216
- "loss": 0.1292,
217
- "rewards/accuracies": 0.6812499761581421,
218
- "rewards/chosen": -0.7194039225578308,
219
- "rewards/margins": 0.3896581530570984,
220
- "rewards/rejected": -1.1090620756149292,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.456204510851956e-07,
226
- "logits/chosen": -2.642385959625244,
227
- "logits/rejected": -2.6195216178894043,
228
- "logps/chosen": -329.4587097167969,
229
- "logps/rejected": -343.2635192871094,
230
- "loss": 0.1224,
231
- "rewards/accuracies": 0.699999988079071,
232
- "rewards/chosen": -0.6560414433479309,
233
- "rewards/margins": 0.4665776789188385,
234
- "rewards/rejected": -1.1226190328598022,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.337355301007335e-07,
240
- "logits/chosen": -2.5479798316955566,
241
- "logits/rejected": -2.5365424156188965,
242
- "logps/chosen": -346.84490966796875,
243
- "logps/rejected": -362.29901123046875,
244
- "loss": 0.1223,
245
- "rewards/accuracies": 0.6812499761581421,
246
- "rewards/chosen": -0.8243889808654785,
247
- "rewards/margins": 0.49307242035865784,
248
- "rewards/rejected": -1.317461371421814,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.2087030056579986e-07,
254
- "logits/chosen": -2.4889659881591797,
255
- "logits/rejected": -2.477649211883545,
256
- "logps/chosen": -362.55755615234375,
257
- "logps/rejected": -399.641845703125,
258
- "loss": 0.1207,
259
- "rewards/accuracies": 0.731249988079071,
260
- "rewards/chosen": -0.8799713253974915,
261
- "rewards/margins": 0.6216905117034912,
262
- "rewards/rejected": -1.5016618967056274,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.070934040463998e-07,
268
- "logits/chosen": -2.559722423553467,
269
- "logits/rejected": -2.5232198238372803,
270
- "logps/chosen": -401.0594787597656,
271
- "logps/rejected": -391.15142822265625,
272
- "loss": 0.117,
273
- "rewards/accuracies": 0.737500011920929,
274
- "rewards/chosen": -0.7758759260177612,
275
- "rewards/margins": 0.5728363394737244,
276
- "rewards/rejected": -1.3487123250961304,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.9247834624635404e-07,
282
- "logits/chosen": -2.5075507164001465,
283
- "logits/rejected": -2.4996590614318848,
284
- "logps/chosen": -351.1426696777344,
285
- "logps/rejected": -411.367431640625,
286
- "loss": 0.1056,
287
- "rewards/accuracies": 0.699999988079071,
288
- "rewards/chosen": -0.9261913299560547,
289
- "rewards/margins": 0.5514571070671082,
290
- "rewards/rejected": -1.4776484966278076,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7710310482256523e-07,
296
- "logits/chosen": -2.3475754261016846,
297
- "logits/rejected": -2.2911109924316406,
298
- "logps/chosen": -398.92950439453125,
299
- "logps/rejected": -435.80615234375,
300
- "loss": 0.0986,
301
  "rewards/accuracies": 0.6812499761581421,
302
- "rewards/chosen": -1.256566047668457,
303
- "rewards/margins": 0.5512439608573914,
304
- "rewards/rejected": -1.8078101873397827,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.610497133404795e-07,
310
- "logits/chosen": -2.3151419162750244,
311
- "logits/rejected": -2.3180220127105713,
312
- "logps/chosen": -354.53363037109375,
313
- "logps/rejected": -410.30535888671875,
314
- "loss": 0.0982,
315
- "rewards/accuracies": 0.7124999761581421,
316
- "rewards/chosen": -1.2037290334701538,
317
- "rewards/margins": 0.5386644601821899,
318
- "rewards/rejected": -1.7423932552337646,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
- "eval_logits/chosen": -2.385664463043213,
324
- "eval_logits/rejected": -2.3639919757843018,
325
- "eval_logps/chosen": -355.0506286621094,
326
- "eval_logps/rejected": -425.61517333984375,
327
- "eval_loss": 0.10938204079866409,
328
- "eval_rewards/accuracies": 0.76171875,
329
- "eval_rewards/chosen": -0.9801100492477417,
330
- "eval_rewards/margins": 0.7025091648101807,
331
- "eval_rewards/rejected": -1.6826190948486328,
332
- "eval_runtime": 53.519,
333
- "eval_samples_per_second": 37.37,
334
  "eval_steps_per_second": 0.598,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.4440382358952115e-07,
340
- "logits/chosen": -2.3312506675720215,
341
- "logits/rejected": -2.2346677780151367,
342
- "logps/chosen": -438.0108337402344,
343
- "logps/rejected": -433.43170166015625,
344
- "loss": 0.104,
345
- "rewards/accuracies": 0.6875,
346
- "rewards/chosen": -1.4116973876953125,
347
- "rewards/margins": 0.5293289422988892,
348
- "rewards/rejected": -1.9410263299942017,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.272542485937368e-07,
354
- "logits/chosen": -2.3366806507110596,
355
- "logits/rejected": -2.2605834007263184,
356
- "logps/chosen": -423.37646484375,
357
- "logps/rejected": -451.29840087890625,
358
- "loss": 0.0958,
359
- "rewards/accuracies": 0.7124999761581421,
360
- "rewards/chosen": -1.4365549087524414,
361
- "rewards/margins": 0.6946216225624084,
362
- "rewards/rejected": -2.131176471710205,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.096924887558854e-07,
368
- "logits/chosen": -2.3550989627838135,
369
- "logits/rejected": -2.310964584350586,
370
- "logps/chosen": -395.25634765625,
371
- "logps/rejected": -446.5535583496094,
372
- "loss": 0.1025,
373
- "rewards/accuracies": 0.7437499761581421,
374
- "rewards/chosen": -1.0768907070159912,
375
- "rewards/margins": 0.760982871055603,
376
- "rewards/rejected": -1.8378736972808838,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9181224366319943e-07,
382
- "logits/chosen": -2.309509515762329,
383
- "logits/rejected": -2.287087917327881,
384
- "logps/chosen": -428.4371032714844,
385
- "logps/rejected": -442.5956115722656,
386
- "loss": 0.0994,
387
- "rewards/accuracies": 0.800000011920929,
388
- "rewards/chosen": -1.2399303913116455,
389
- "rewards/margins": 0.6991497278213501,
390
- "rewards/rejected": -1.9390798807144165,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.7370891215954565e-07,
396
- "logits/chosen": -2.246298313140869,
397
- "logits/rejected": -2.210582971572876,
398
- "logps/chosen": -379.5111389160156,
399
- "logps/rejected": -443.7802734375,
400
- "loss": 0.0921,
401
  "rewards/accuracies": 0.731249988079071,
402
- "rewards/chosen": -1.2617695331573486,
403
- "rewards/margins": 0.7775195240974426,
404
- "rewards/rejected": -2.0392889976501465,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.55479083351317e-07,
410
- "logits/chosen": -2.2891409397125244,
411
- "logits/rejected": -2.2563395500183105,
412
- "logps/chosen": -419.97528076171875,
413
- "logps/rejected": -466.9248962402344,
414
- "loss": 0.1054,
415
- "rewards/accuracies": 0.7124999761581421,
416
- "rewards/chosen": -1.2249999046325684,
417
- "rewards/margins": 0.6663479208946228,
418
- "rewards/rejected": -1.891347885131836,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.56,
423
  "learning_rate": 2.3722002126275822e-07,
424
- "logits/chosen": -2.322089672088623,
425
- "logits/rejected": -2.290865182876587,
426
- "logps/chosen": -396.2856140136719,
427
- "logps/rejected": -445.915771484375,
428
- "loss": 0.0964,
429
- "rewards/accuracies": 0.6625000238418579,
430
- "rewards/chosen": -1.426606297492981,
431
- "rewards/margins": 0.43242138624191284,
432
- "rewards/rejected": -1.859027624130249,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.19029145890313e-07,
438
- "logits/chosen": -2.2537693977355957,
439
- "logits/rejected": -2.228264331817627,
440
- "logps/chosen": -402.9222106933594,
441
- "logps/rejected": -462.06768798828125,
442
- "loss": 0.0888,
443
- "rewards/accuracies": 0.7437499761581421,
444
- "rewards/chosen": -1.4026682376861572,
445
- "rewards/margins": 0.7444050908088684,
446
- "rewards/rejected": -2.147073268890381,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 2.0100351342479216e-07,
452
- "logits/chosen": -2.315237522125244,
453
- "logits/rejected": -2.2631287574768066,
454
- "logps/chosen": -430.11029052734375,
455
- "logps/rejected": -481.6625061035156,
456
- "loss": 0.1009,
457
- "rewards/accuracies": 0.699999988079071,
458
- "rewards/chosen": -1.295500636100769,
459
- "rewards/margins": 0.6579625606536865,
460
- "rewards/rejected": -1.9534631967544556,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8323929841460178e-07,
466
- "logits/chosen": -2.23759126663208,
467
- "logits/rejected": -2.2217366695404053,
468
- "logps/chosen": -411.9175720214844,
469
- "logps/rejected": -444.6880798339844,
470
- "loss": 0.0947,
471
- "rewards/accuracies": 0.6499999761581421,
472
- "rewards/chosen": -1.3923585414886475,
473
- "rewards/margins": 0.5795921683311462,
474
- "rewards/rejected": -1.971950888633728,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
- "eval_logits/chosen": -2.2846696376800537,
480
- "eval_logits/rejected": -2.2552103996276855,
481
- "eval_logps/chosen": -382.740966796875,
482
- "eval_logps/rejected": -466.18841552734375,
483
- "eval_loss": 0.1038329154253006,
484
- "eval_rewards/accuracies": 0.765625,
485
- "eval_rewards/chosen": -1.2570133209228516,
486
- "eval_rewards/margins": 0.8313380479812622,
487
- "eval_rewards/rejected": -2.0883514881134033,
488
- "eval_runtime": 53.5285,
489
- "eval_samples_per_second": 37.363,
490
- "eval_steps_per_second": 0.598,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.6583128063291573e-07,
496
- "logits/chosen": -2.2596614360809326,
497
- "logits/rejected": -2.2274169921875,
498
- "logps/chosen": -420.64349365234375,
499
- "logps/rejected": -468.998291015625,
500
- "loss": 0.0939,
501
- "rewards/accuracies": 0.7749999761581421,
502
- "rewards/chosen": -1.2388842105865479,
503
- "rewards/margins": 0.8670794367790222,
504
- "rewards/rejected": -2.1059632301330566,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.488723393865766e-07,
510
- "logits/chosen": -2.2100915908813477,
511
- "logits/rejected": -2.170304775238037,
512
- "logps/chosen": -413.4375915527344,
513
- "logps/rejected": -470.0476989746094,
514
- "loss": 0.0909,
515
- "rewards/accuracies": 0.675000011920929,
516
- "rewards/chosen": -1.7292497158050537,
517
- "rewards/margins": 0.5965095162391663,
518
- "rewards/rejected": -2.3257594108581543,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3245295796480788e-07,
524
- "logits/chosen": -2.176182746887207,
525
- "logits/rejected": -2.163472890853882,
526
- "logps/chosen": -457.1171875,
527
- "logps/rejected": -519.6802368164062,
528
- "loss": 0.085,
529
- "rewards/accuracies": 0.71875,
530
- "rewards/chosen": -1.7242799997329712,
531
- "rewards/margins": 0.5835026502609253,
532
- "rewards/rejected": -2.3077826499938965,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1666074087171627e-07,
538
- "logits/chosen": -2.231558322906494,
539
- "logits/rejected": -2.195356607437134,
540
- "logps/chosen": -441.5918884277344,
541
- "logps/rejected": -469.4264221191406,
542
- "loss": 0.0875,
543
- "rewards/accuracies": 0.731249988079071,
544
- "rewards/chosen": -1.5626678466796875,
545
- "rewards/margins": 0.6448178291320801,
546
- "rewards/rejected": -2.2074856758117676,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0157994641835734e-07,
552
- "logits/chosen": -2.242619752883911,
553
- "logits/rejected": -2.178529977798462,
554
- "logps/chosen": -410.87420654296875,
555
- "logps/rejected": -505.684814453125,
556
- "loss": 0.0897,
557
- "rewards/accuracies": 0.7437499761581421,
558
- "rewards/chosen": -1.548431634902954,
559
- "rewards/margins": 0.896828293800354,
560
- "rewards/rejected": -2.4452598094940186,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.729103716819111e-08,
566
- "logits/chosen": -2.1977174282073975,
567
- "logits/rejected": -2.136091470718384,
568
- "logps/chosen": -430.148193359375,
569
- "logps/rejected": -491.0294494628906,
570
- "loss": 0.0862,
571
- "rewards/accuracies": 0.7562500238418579,
572
- "rewards/chosen": -1.44559907913208,
573
- "rewards/margins": 0.836572527885437,
574
- "rewards/rejected": -2.2821714878082275,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.387025063449081e-08,
580
- "logits/chosen": -2.2159745693206787,
581
- "logits/rejected": -2.1735687255859375,
582
- "logps/chosen": -385.21685791015625,
583
- "logps/rejected": -474.96600341796875,
584
- "loss": 0.0899,
585
- "rewards/accuracies": 0.762499988079071,
586
- "rewards/chosen": -1.4045782089233398,
587
- "rewards/margins": 0.8680235743522644,
588
- "rewards/rejected": -2.27260160446167,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.79,
593
  "learning_rate": 6.138919252022435e-08,
594
- "logits/chosen": -2.2161214351654053,
595
- "logits/rejected": -2.1563704013824463,
596
- "logps/chosen": -408.2288513183594,
597
- "logps/rejected": -469.1304626464844,
598
- "loss": 0.0839,
599
- "rewards/accuracies": 0.7124999761581421,
600
- "rewards/chosen": -1.4448230266571045,
601
- "rewards/margins": 0.8660017251968384,
602
- "rewards/rejected": -2.3108248710632324,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.991445467064689e-08,
608
- "logits/chosen": -2.1576311588287354,
609
- "logits/rejected": -2.1036527156829834,
610
- "logps/chosen": -437.1475524902344,
611
- "logps/rejected": -490.7696228027344,
612
- "loss": 0.0935,
613
- "rewards/accuracies": 0.6875,
614
- "rewards/chosen": -1.4927804470062256,
615
- "rewards/margins": 0.7215965986251831,
616
- "rewards/rejected": -2.214376926422119,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.9507259776993954e-08,
622
- "logits/chosen": -2.210411548614502,
623
- "logits/rejected": -2.1940970420837402,
624
- "logps/chosen": -429.2776794433594,
625
- "logps/rejected": -510.29852294921875,
626
- "loss": 0.083,
627
- "rewards/accuracies": 0.706250011920929,
628
- "rewards/chosen": -1.6240886449813843,
629
- "rewards/margins": 0.8352212905883789,
630
- "rewards/rejected": -2.4593098163604736,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
- "eval_logits/chosen": -2.220853805541992,
636
- "eval_logits/rejected": -2.1902339458465576,
637
- "eval_logps/chosen": -405.94677734375,
638
- "eval_logps/rejected": -496.0015869140625,
639
- "eval_loss": 0.09142392128705978,
640
- "eval_rewards/accuracies": 0.76171875,
641
- "eval_rewards/chosen": -1.4890713691711426,
642
- "eval_rewards/margins": 0.8974115252494812,
643
- "eval_rewards/rejected": -2.3864829540252686,
644
- "eval_runtime": 53.5164,
645
- "eval_samples_per_second": 37.372,
646
- "eval_steps_per_second": 0.598,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 3.022313472693447e-08,
652
- "logits/chosen": -2.189878463745117,
653
- "logits/rejected": -2.160414218902588,
654
- "logps/chosen": -443.77197265625,
655
- "logps/rejected": -464.19622802734375,
656
- "loss": 0.0836,
657
- "rewards/accuracies": 0.675000011920929,
658
- "rewards/chosen": -1.610609769821167,
659
- "rewards/margins": 0.6741793751716614,
660
- "rewards/rejected": -2.284789562225342,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.2111614344599684e-08,
666
- "logits/chosen": -2.2698795795440674,
667
- "logits/rejected": -2.2317731380462646,
668
- "logps/chosen": -449.2178649902344,
669
- "logps/rejected": -477.58184814453125,
670
- "loss": 0.091,
671
- "rewards/accuracies": 0.7124999761581421,
672
- "rewards/chosen": -1.5456479787826538,
673
- "rewards/margins": 0.6279827356338501,
674
- "rewards/rejected": -2.173630714416504,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.521597710086439e-08,
680
- "logits/chosen": -2.2749388217926025,
681
- "logits/rejected": -2.2024426460266113,
682
- "logps/chosen": -438.5753479003906,
683
- "logps/rejected": -455.7244567871094,
684
- "loss": 0.0824,
685
- "rewards/accuracies": 0.7250000238418579,
686
- "rewards/chosen": -1.5082874298095703,
687
- "rewards/margins": 0.7672165632247925,
688
- "rewards/rejected": -2.2755041122436523,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.57301420397924e-09,
694
- "logits/chosen": -2.241633176803589,
695
- "logits/rejected": -2.1836299896240234,
696
- "logps/chosen": -457.36724853515625,
697
- "logps/rejected": -469.69012451171875,
698
- "loss": 0.0919,
699
- "rewards/accuracies": 0.75,
700
- "rewards/chosen": -1.5391572713851929,
701
- "rewards/margins": 0.6639242172241211,
702
- "rewards/rejected": -2.2030816078186035,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 5.212833302556258e-09,
708
- "logits/chosen": -2.2500534057617188,
709
- "logits/rejected": -2.21012282371521,
710
- "logps/chosen": -421.68475341796875,
711
- "logps/rejected": -549.7516479492188,
712
- "loss": 0.0907,
713
- "rewards/accuracies": 0.793749988079071,
714
- "rewards/chosen": -1.4475462436676025,
715
- "rewards/margins": 1.129213571548462,
716
- "rewards/rejected": -2.5767598152160645,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 2.158697848236607e-09,
722
- "logits/chosen": -2.197016477584839,
723
- "logits/rejected": -2.139648914337158,
724
- "logps/chosen": -462.3150939941406,
725
- "logps/rejected": -487.34381103515625,
726
- "loss": 0.0875,
727
- "rewards/accuracies": 0.75,
728
- "rewards/chosen": -1.5197908878326416,
729
- "rewards/margins": 0.749596118927002,
730
- "rewards/rejected": -2.2693867683410645,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 4.269029751107489e-10,
736
- "logits/chosen": -2.229257583618164,
737
- "logits/rejected": -2.1693339347839355,
738
- "logps/chosen": -460.55706787109375,
739
- "logps/rejected": -498.900634765625,
740
- "loss": 0.0933,
741
- "rewards/accuracies": 0.7875000238418579,
742
- "rewards/chosen": -1.5752160549163818,
743
- "rewards/margins": 0.7894729375839233,
744
- "rewards/rejected": -2.3646891117095947,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 478,
750
  "total_flos": 0.0,
751
- "train_loss": 0.12798248798777367,
752
- "train_runtime": 3957.5373,
753
- "train_samples_per_second": 15.447,
754
  "train_steps_per_second": 0.121
755
  }
756
  ],
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.0416666666666666e-08,
14
+ "logits/chosen": -2.8386030197143555,
15
+ "logits/rejected": -2.823939323425293,
16
+ "logps/chosen": -324.3727722167969,
17
+ "logps/rejected": -231.64634704589844,
18
+ "loss": 0.2826,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-07,
28
+ "logits/chosen": -2.8247194290161133,
29
+ "logits/rejected": -2.750765800476074,
30
+ "logps/chosen": -275.7482604980469,
31
+ "logps/rejected": -253.39404296875,
32
+ "loss": 0.2847,
33
+ "rewards/accuracies": 0.4513888955116272,
34
+ "rewards/chosen": 0.00012852638610638678,
35
+ "rewards/margins": -0.0004244056181050837,
36
+ "rewards/rejected": 0.0005529320333153009,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333333e-07,
42
+ "logits/chosen": -2.7973198890686035,
43
+ "logits/rejected": -2.779845714569092,
44
+ "logps/chosen": -261.89483642578125,
45
+ "logps/rejected": -257.04736328125,
46
  "loss": 0.2856,
47
+ "rewards/accuracies": 0.5874999761581421,
48
+ "rewards/chosen": 0.0005934558575972915,
49
+ "rewards/margins": 0.0017298649763688445,
50
+ "rewards/rejected": -0.001136409118771553,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": -2.783583164215088,
57
+ "logits/rejected": -2.777108907699585,
58
+ "logps/chosen": -294.8003234863281,
59
+ "logps/rejected": -259.10296630859375,
60
+ "loss": 0.2889,
61
+ "rewards/accuracies": 0.65625,
62
+ "rewards/chosen": 0.0027175676077604294,
63
+ "rewards/margins": 0.011478239670395851,
64
+ "rewards/rejected": -0.008760671131312847,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
+ "logits/chosen": -2.802429676055908,
71
+ "logits/rejected": -2.7715487480163574,
72
+ "logps/chosen": -284.63958740234375,
73
+ "logps/rejected": -264.9128112792969,
74
+ "loss": 0.2823,
75
+ "rewards/accuracies": 0.6937500238418579,
76
+ "rewards/chosen": -0.007285858038812876,
77
+ "rewards/margins": 0.022248882800340652,
78
+ "rewards/rejected": -0.029534736648201942,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999733114418725e-07,
84
+ "logits/chosen": -2.781130790710449,
85
+ "logits/rejected": -2.718773126602173,
86
+ "logps/chosen": -284.725341796875,
87
+ "logps/rejected": -255.60073852539062,
88
+ "loss": 0.2671,
89
+ "rewards/accuracies": 0.731249988079071,
90
+ "rewards/chosen": -0.023446276783943176,
91
+ "rewards/margins": 0.06585647165775299,
92
+ "rewards/rejected": -0.08930274099111557,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.990398100856366e-07,
98
+ "logits/chosen": -2.8104348182678223,
99
+ "logits/rejected": -2.788311243057251,
100
+ "logps/chosen": -297.0313720703125,
101
+ "logps/rejected": -266.0052795410156,
102
+ "loss": 0.2428,
103
+ "rewards/accuracies": 0.606249988079071,
104
+ "rewards/chosen": -0.10381942987442017,
105
+ "rewards/margins": 0.084610715508461,
106
+ "rewards/rejected": -0.18843016028404236,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967775735898179e-07,
112
+ "logits/chosen": -2.704342842102051,
113
+ "logits/rejected": -2.6683297157287598,
114
+ "logps/chosen": -276.36395263671875,
115
+ "logps/rejected": -271.9848327636719,
116
+ "loss": 0.2192,
117
+ "rewards/accuracies": 0.737500011920929,
118
+ "rewards/chosen": -0.16314834356307983,
119
+ "rewards/margins": 0.17039458453655243,
120
+ "rewards/rejected": -0.33354294300079346,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.931986719649298e-07,
126
+ "logits/chosen": -2.7222819328308105,
127
+ "logits/rejected": -2.7045040130615234,
128
+ "logps/chosen": -298.33831787109375,
129
+ "logps/rejected": -293.718017578125,
130
+ "loss": 0.1999,
131
+ "rewards/accuracies": 0.6499999761581421,
132
+ "rewards/chosen": -0.30510228872299194,
133
+ "rewards/margins": 0.1686253696680069,
134
+ "rewards/rejected": -0.47372761368751526,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.883222001996351e-07,
140
+ "logits/chosen": -2.759632110595703,
141
+ "logits/rejected": -2.734144449234009,
142
+ "logps/chosen": -331.0855712890625,
143
+ "logps/rejected": -346.59991455078125,
144
+ "loss": 0.1682,
145
+ "rewards/accuracies": 0.6812499761581421,
146
+ "rewards/chosen": -0.45922285318374634,
147
+ "rewards/margins": 0.3295659124851227,
148
+ "rewards/rejected": -0.7887887954711914,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.821741763807186e-07,
154
+ "logits/chosen": -2.6711103916168213,
155
+ "logits/rejected": -2.664060115814209,
156
+ "logps/chosen": -336.68927001953125,
157
+ "logps/rejected": -331.12799072265625,
158
+ "loss": 0.1643,
159
+ "rewards/accuracies": 0.643750011920929,
160
+ "rewards/chosen": -0.4235810339450836,
161
+ "rewards/margins": 0.24597103893756866,
162
+ "rewards/rejected": -0.6695520281791687,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
+ "eval_logits/chosen": -2.790248394012451,
168
+ "eval_logits/rejected": -2.7691245079040527,
169
+ "eval_logps/chosen": -297.79962158203125,
170
+ "eval_logps/rejected": -337.0708923339844,
171
+ "eval_loss": 0.15584461390972137,
172
+ "eval_rewards/accuracies": 0.74609375,
173
+ "eval_rewards/chosen": -0.4075998365879059,
174
+ "eval_rewards/margins": 0.38957637548446655,
175
+ "eval_rewards/rejected": -0.79717618227005,
176
+ "eval_runtime": 53.5413,
177
+ "eval_samples_per_second": 37.354,
178
  "eval_steps_per_second": 0.598,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.747874028753375e-07,
184
+ "logits/chosen": -2.7711846828460693,
185
+ "logits/rejected": -2.7162532806396484,
186
+ "logps/chosen": -322.896484375,
187
+ "logps/rejected": -321.31158447265625,
188
+ "loss": 0.1423,
189
+ "rewards/accuracies": 0.699999988079071,
190
+ "rewards/chosen": -0.4842161536216736,
191
+ "rewards/margins": 0.4429897367954254,
192
+ "rewards/rejected": -0.9272058606147766,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.662012913161997e-07,
198
+ "logits/chosen": -2.6857857704162598,
199
+ "logits/rejected": -2.664361000061035,
200
+ "logps/chosen": -340.3297119140625,
201
+ "logps/rejected": -381.2372741699219,
202
+ "loss": 0.1325,
203
+ "rewards/accuracies": 0.699999988079071,
204
+ "rewards/chosen": -0.7687980532646179,
205
+ "rewards/margins": 0.4345701336860657,
206
+ "rewards/rejected": -1.203368067741394,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5646165232345103e-07,
212
+ "logits/chosen": -2.679908037185669,
213
+ "logits/rejected": -2.661154270172119,
214
+ "logps/chosen": -350.47247314453125,
215
+ "logps/rejected": -386.91656494140625,
216
+ "loss": 0.1191,
217
+ "rewards/accuracies": 0.7124999761581421,
218
+ "rewards/chosen": -0.8761329650878906,
219
+ "rewards/margins": 0.5328775644302368,
220
+ "rewards/rejected": -1.4090105295181274,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.456204510851956e-07,
226
+ "logits/chosen": -2.622180461883545,
227
+ "logits/rejected": -2.604306697845459,
228
+ "logps/chosen": -338.3455505371094,
229
+ "logps/rejected": -356.08990478515625,
230
+ "loss": 0.1244,
231
+ "rewards/accuracies": 0.706250011920929,
232
+ "rewards/chosen": -0.8404749035835266,
233
+ "rewards/margins": 0.39392346143722534,
234
+ "rewards/rejected": -1.2343984842300415,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.337355301007335e-07,
240
+ "logits/chosen": -2.541025400161743,
241
+ "logits/rejected": -2.5166730880737305,
242
+ "logps/chosen": -345.60760498046875,
243
+ "logps/rejected": -372.7431335449219,
244
+ "loss": 0.1258,
245
+ "rewards/accuracies": 0.6875,
246
+ "rewards/chosen": -0.7515507936477661,
247
+ "rewards/margins": 0.42334675788879395,
248
+ "rewards/rejected": -1.17489755153656,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.2087030056579986e-07,
254
+ "logits/chosen": -2.5379557609558105,
255
+ "logits/rejected": -2.528388261795044,
256
+ "logps/chosen": -345.44384765625,
257
+ "logps/rejected": -388.0000915527344,
258
+ "loss": 0.1209,
259
+ "rewards/accuracies": 0.65625,
260
+ "rewards/chosen": -0.8115363121032715,
261
+ "rewards/margins": 0.39176443219184875,
262
+ "rewards/rejected": -1.2033007144927979,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.070934040463998e-07,
268
+ "logits/chosen": -2.508551836013794,
269
+ "logits/rejected": -2.4616193771362305,
270
+ "logps/chosen": -371.34246826171875,
271
+ "logps/rejected": -380.660888671875,
272
+ "loss": 0.1105,
273
+ "rewards/accuracies": 0.675000011920929,
274
+ "rewards/chosen": -0.9994179606437683,
275
+ "rewards/margins": 0.4626193940639496,
276
+ "rewards/rejected": -1.4620373249053955,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.9247834624635404e-07,
282
+ "logits/chosen": -2.51965594291687,
283
+ "logits/rejected": -2.5132761001586914,
284
+ "logps/chosen": -332.5484924316406,
285
+ "logps/rejected": -384.0250549316406,
286
+ "loss": 0.1124,
287
+ "rewards/accuracies": 0.6812499761581421,
288
+ "rewards/chosen": -0.9845203161239624,
289
+ "rewards/margins": 0.4795452654361725,
290
+ "rewards/rejected": -1.4640657901763916,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7710310482256523e-07,
296
+ "logits/chosen": -2.55594539642334,
297
+ "logits/rejected": -2.5516602993011475,
298
+ "logps/chosen": -353.2313537597656,
299
+ "logps/rejected": -384.13861083984375,
300
+ "loss": 0.1058,
301
  "rewards/accuracies": 0.6812499761581421,
302
+ "rewards/chosen": -0.9792869687080383,
303
+ "rewards/margins": 0.40680208802223206,
304
+ "rewards/rejected": -1.3860890865325928,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.610497133404795e-07,
310
+ "logits/chosen": -2.5069775581359863,
311
+ "logits/rejected": -2.5189363956451416,
312
+ "logps/chosen": -398.85382080078125,
313
+ "logps/rejected": -431.91455078125,
314
+ "loss": 0.1003,
315
+ "rewards/accuracies": 0.65625,
316
+ "rewards/chosen": -1.2246520519256592,
317
+ "rewards/margins": 0.3960326015949249,
318
+ "rewards/rejected": -1.6206846237182617,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
+ "eval_logits/chosen": -2.5340371131896973,
324
+ "eval_logits/rejected": -2.513735294342041,
325
+ "eval_logps/chosen": -384.15533447265625,
326
+ "eval_logps/rejected": -450.7552185058594,
327
+ "eval_loss": 0.0996941402554512,
328
+ "eval_rewards/accuracies": 0.703125,
329
+ "eval_rewards/chosen": -1.2711572647094727,
330
+ "eval_rewards/margins": 0.6628624200820923,
331
+ "eval_rewards/rejected": -1.934019684791565,
332
+ "eval_runtime": 53.511,
333
+ "eval_samples_per_second": 37.375,
334
  "eval_steps_per_second": 0.598,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.4440382358952115e-07,
340
+ "logits/chosen": -2.455578565597534,
341
+ "logits/rejected": -2.446720838546753,
342
+ "logps/chosen": -391.07830810546875,
343
+ "logps/rejected": -428.397705078125,
344
+ "loss": 0.1038,
345
+ "rewards/accuracies": 0.675000011920929,
346
+ "rewards/chosen": -1.3349438905715942,
347
+ "rewards/margins": 0.5562185645103455,
348
+ "rewards/rejected": -1.8911622762680054,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.272542485937368e-07,
354
+ "logits/chosen": -2.551090955734253,
355
+ "logits/rejected": -2.529384136199951,
356
+ "logps/chosen": -385.6699523925781,
357
+ "logps/rejected": -405.87615966796875,
358
+ "loss": 0.1138,
359
+ "rewards/accuracies": 0.637499988079071,
360
+ "rewards/chosen": -1.1878398656845093,
361
+ "rewards/margins": 0.44445449113845825,
362
+ "rewards/rejected": -1.6322942972183228,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.096924887558854e-07,
368
+ "logits/chosen": -2.5678157806396484,
369
+ "logits/rejected": -2.5255255699157715,
370
+ "logps/chosen": -411.07745361328125,
371
+ "logps/rejected": -404.2816467285156,
372
+ "loss": 0.1149,
373
+ "rewards/accuracies": 0.731249988079071,
374
+ "rewards/chosen": -0.9748584628105164,
375
+ "rewards/margins": 0.47213855385780334,
376
+ "rewards/rejected": -1.446997046470642,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9181224366319943e-07,
382
+ "logits/chosen": -2.4429595470428467,
383
+ "logits/rejected": -2.4049136638641357,
384
+ "logps/chosen": -386.62530517578125,
385
+ "logps/rejected": -397.7767028808594,
386
+ "loss": 0.1092,
387
+ "rewards/accuracies": 0.6875,
388
+ "rewards/chosen": -1.1572192907333374,
389
+ "rewards/margins": 0.4687051773071289,
390
+ "rewards/rejected": -1.6259244680404663,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.7370891215954565e-07,
396
+ "logits/chosen": -2.400578260421753,
397
+ "logits/rejected": -2.3846592903137207,
398
+ "logps/chosen": -413.29266357421875,
399
+ "logps/rejected": -441.35748291015625,
400
+ "loss": 0.0928,
401
  "rewards/accuracies": 0.731249988079071,
402
+ "rewards/chosen": -1.3619310855865479,
403
+ "rewards/margins": 0.6331827044487,
404
+ "rewards/rejected": -1.9951136112213135,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.55479083351317e-07,
410
+ "logits/chosen": -2.435859203338623,
411
+ "logits/rejected": -2.4128081798553467,
412
+ "logps/chosen": -418.8388671875,
413
+ "logps/rejected": -462.96282958984375,
414
+ "loss": 0.097,
415
+ "rewards/accuracies": 0.768750011920929,
416
+ "rewards/chosen": -1.2928632497787476,
417
+ "rewards/margins": 0.7572471499443054,
418
+ "rewards/rejected": -2.050110340118408,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.56,
423
  "learning_rate": 2.3722002126275822e-07,
424
+ "logits/chosen": -2.3607535362243652,
425
+ "logits/rejected": -2.3512327671051025,
426
+ "logps/chosen": -393.47845458984375,
427
+ "logps/rejected": -424.65692138671875,
428
+ "loss": 0.0942,
429
+ "rewards/accuracies": 0.706250011920929,
430
+ "rewards/chosen": -1.2448090314865112,
431
+ "rewards/margins": 0.5817195177078247,
432
+ "rewards/rejected": -1.8265281915664673,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.19029145890313e-07,
438
+ "logits/chosen": -2.384596586227417,
439
+ "logits/rejected": -2.357322931289673,
440
+ "logps/chosen": -401.50152587890625,
441
+ "logps/rejected": -447.069580078125,
442
+ "loss": 0.0894,
443
+ "rewards/accuracies": 0.7250000238418579,
444
+ "rewards/chosen": -1.463189721107483,
445
+ "rewards/margins": 0.627885103225708,
446
+ "rewards/rejected": -2.0910747051239014,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 2.0100351342479216e-07,
452
+ "logits/chosen": -2.3855137825012207,
453
+ "logits/rejected": -2.334260940551758,
454
+ "logps/chosen": -441.15118408203125,
455
+ "logps/rejected": -456.8433532714844,
456
+ "loss": 0.0895,
457
+ "rewards/accuracies": 0.6875,
458
+ "rewards/chosen": -1.5582000017166138,
459
+ "rewards/margins": 0.6062092185020447,
460
+ "rewards/rejected": -2.1644091606140137,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8323929841460178e-07,
466
+ "logits/chosen": -2.4264094829559326,
467
+ "logits/rejected": -2.403550624847412,
468
+ "logps/chosen": -412.9310607910156,
469
+ "logps/rejected": -471.4112854003906,
470
+ "loss": 0.0953,
471
+ "rewards/accuracies": 0.71875,
472
+ "rewards/chosen": -1.2313965559005737,
473
+ "rewards/margins": 0.6434706449508667,
474
+ "rewards/rejected": -1.8748672008514404,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
+ "eval_logits/chosen": -2.4030282497406006,
480
+ "eval_logits/rejected": -2.3836517333984375,
481
+ "eval_logps/chosen": -377.3980712890625,
482
+ "eval_logps/rejected": -449.78228759765625,
483
+ "eval_loss": 0.10235561430454254,
484
+ "eval_rewards/accuracies": 0.75390625,
485
+ "eval_rewards/chosen": -1.2035841941833496,
486
+ "eval_rewards/margins": 0.7207058072090149,
487
+ "eval_rewards/rejected": -1.9242901802062988,
488
+ "eval_runtime": 53.5723,
489
+ "eval_samples_per_second": 37.333,
490
+ "eval_steps_per_second": 0.597,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.6583128063291573e-07,
496
+ "logits/chosen": -2.3959908485412598,
497
+ "logits/rejected": -2.366027593612671,
498
+ "logps/chosen": -389.87841796875,
499
+ "logps/rejected": -428.79150390625,
500
+ "loss": 0.0967,
501
+ "rewards/accuracies": 0.699999988079071,
502
+ "rewards/chosen": -1.2791574001312256,
503
+ "rewards/margins": 0.5353385806083679,
504
+ "rewards/rejected": -1.8144958019256592,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.488723393865766e-07,
510
+ "logits/chosen": -2.315176010131836,
511
+ "logits/rejected": -2.303180694580078,
512
+ "logps/chosen": -419.81304931640625,
513
+ "logps/rejected": -451.9205627441406,
514
+ "loss": 0.0913,
515
+ "rewards/accuracies": 0.706250011920929,
516
+ "rewards/chosen": -1.3704838752746582,
517
+ "rewards/margins": 0.4932515621185303,
518
+ "rewards/rejected": -1.8637354373931885,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3245295796480788e-07,
524
+ "logits/chosen": -2.3155629634857178,
525
+ "logits/rejected": -2.306206226348877,
526
+ "logps/chosen": -373.34173583984375,
527
+ "logps/rejected": -451.43304443359375,
528
+ "loss": 0.094,
529
+ "rewards/accuracies": 0.7437499761581421,
530
+ "rewards/chosen": -1.2377197742462158,
531
+ "rewards/margins": 0.7202552556991577,
532
+ "rewards/rejected": -1.957975149154663,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1666074087171627e-07,
538
+ "logits/chosen": -2.3178515434265137,
539
+ "logits/rejected": -2.317112684249878,
540
+ "logps/chosen": -421.288330078125,
541
+ "logps/rejected": -464.2798767089844,
542
+ "loss": 0.1012,
543
+ "rewards/accuracies": 0.6937500238418579,
544
+ "rewards/chosen": -1.3072739839553833,
545
+ "rewards/margins": 0.6341418027877808,
546
+ "rewards/rejected": -1.941415786743164,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0157994641835734e-07,
552
+ "logits/chosen": -2.352154016494751,
553
+ "logits/rejected": -2.310459852218628,
554
+ "logps/chosen": -371.04180908203125,
555
+ "logps/rejected": -418.411376953125,
556
+ "loss": 0.0964,
557
+ "rewards/accuracies": 0.7250000238418579,
558
+ "rewards/chosen": -1.2016589641571045,
559
+ "rewards/margins": 0.6332089900970459,
560
+ "rewards/rejected": -1.8348678350448608,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.729103716819111e-08,
566
+ "logits/chosen": -2.3340022563934326,
567
+ "logits/rejected": -2.2888753414154053,
568
+ "logps/chosen": -399.73870849609375,
569
+ "logps/rejected": -433.62939453125,
570
+ "loss": 0.103,
571
+ "rewards/accuracies": 0.706250011920929,
572
+ "rewards/chosen": -1.3542587757110596,
573
+ "rewards/margins": 0.6527735590934753,
574
+ "rewards/rejected": -2.0070323944091797,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.387025063449081e-08,
580
+ "logits/chosen": -2.305725574493408,
581
+ "logits/rejected": -2.2590928077697754,
582
+ "logps/chosen": -424.70269775390625,
583
+ "logps/rejected": -478.83160400390625,
584
+ "loss": 0.0832,
585
+ "rewards/accuracies": 0.71875,
586
+ "rewards/chosen": -1.3823884725570679,
587
+ "rewards/margins": 0.7607783079147339,
588
+ "rewards/rejected": -2.143167018890381,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.79,
593
  "learning_rate": 6.138919252022435e-08,
594
+ "logits/chosen": -2.3276476860046387,
595
+ "logits/rejected": -2.3130292892456055,
596
+ "logps/chosen": -431.13568115234375,
597
+ "logps/rejected": -477.88824462890625,
598
+ "loss": 0.0903,
599
+ "rewards/accuracies": 0.7437499761581421,
600
+ "rewards/chosen": -1.4548090696334839,
601
+ "rewards/margins": 0.671941876411438,
602
+ "rewards/rejected": -2.126750946044922,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.991445467064689e-08,
608
+ "logits/chosen": -2.2263472080230713,
609
+ "logits/rejected": -2.1942696571350098,
610
+ "logps/chosen": -418.37335205078125,
611
+ "logps/rejected": -485.0545349121094,
612
+ "loss": 0.0883,
613
+ "rewards/accuracies": 0.75,
614
+ "rewards/chosen": -1.488586187362671,
615
+ "rewards/margins": 0.7860161662101746,
616
+ "rewards/rejected": -2.2746024131774902,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.9507259776993954e-08,
622
+ "logits/chosen": -2.319228410720825,
623
+ "logits/rejected": -2.2877087593078613,
624
+ "logps/chosen": -417.96875,
625
+ "logps/rejected": -461.0101623535156,
626
+ "loss": 0.0811,
627
+ "rewards/accuracies": 0.699999988079071,
628
+ "rewards/chosen": -1.4231641292572021,
629
+ "rewards/margins": 0.647831916809082,
630
+ "rewards/rejected": -2.0709962844848633,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
+ "eval_logits/chosen": -2.3254384994506836,
636
+ "eval_logits/rejected": -2.301893472671509,
637
+ "eval_logps/chosen": -393.03472900390625,
638
+ "eval_logps/rejected": -475.715087890625,
639
+ "eval_loss": 0.09447792172431946,
640
+ "eval_rewards/accuracies": 0.765625,
641
+ "eval_rewards/chosen": -1.3599507808685303,
642
+ "eval_rewards/margins": 0.8236675262451172,
643
+ "eval_rewards/rejected": -2.1836180686950684,
644
+ "eval_runtime": 53.5742,
645
+ "eval_samples_per_second": 37.331,
646
+ "eval_steps_per_second": 0.597,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 3.022313472693447e-08,
652
+ "logits/chosen": -2.3134891986846924,
653
+ "logits/rejected": -2.2576441764831543,
654
+ "logps/chosen": -405.07867431640625,
655
+ "logps/rejected": -426.08770751953125,
656
+ "loss": 0.088,
657
+ "rewards/accuracies": 0.762499988079071,
658
+ "rewards/chosen": -1.304164171218872,
659
+ "rewards/margins": 0.7416768074035645,
660
+ "rewards/rejected": -2.0458409786224365,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.2111614344599684e-08,
666
+ "logits/chosen": -2.3239588737487793,
667
+ "logits/rejected": -2.2752654552459717,
668
+ "logps/chosen": -434.28118896484375,
669
+ "logps/rejected": -482.84234619140625,
670
+ "loss": 0.0896,
671
+ "rewards/accuracies": 0.7250000238418579,
672
+ "rewards/chosen": -1.4340513944625854,
673
+ "rewards/margins": 0.8941879272460938,
674
+ "rewards/rejected": -2.3282394409179688,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.521597710086439e-08,
680
+ "logits/chosen": -2.278296947479248,
681
+ "logits/rejected": -2.2763679027557373,
682
+ "logps/chosen": -423.744384765625,
683
+ "logps/rejected": -485.7794494628906,
684
+ "loss": 0.0868,
685
+ "rewards/accuracies": 0.6875,
686
+ "rewards/chosen": -1.4879920482635498,
687
+ "rewards/margins": 0.6670708656311035,
688
+ "rewards/rejected": -2.1550629138946533,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.57301420397924e-09,
694
+ "logits/chosen": -2.26120924949646,
695
+ "logits/rejected": -2.2485973834991455,
696
+ "logps/chosen": -404.76959228515625,
697
+ "logps/rejected": -461.03448486328125,
698
+ "loss": 0.0892,
699
+ "rewards/accuracies": 0.668749988079071,
700
+ "rewards/chosen": -1.4389055967330933,
701
+ "rewards/margins": 0.6930050253868103,
702
+ "rewards/rejected": -2.131910800933838,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 5.212833302556258e-09,
708
+ "logits/chosen": -2.2681469917297363,
709
+ "logits/rejected": -2.275200366973877,
710
+ "logps/chosen": -404.1940612792969,
711
+ "logps/rejected": -463.80401611328125,
712
+ "loss": 0.0902,
713
+ "rewards/accuracies": 0.7437499761581421,
714
+ "rewards/chosen": -1.5075231790542603,
715
+ "rewards/margins": 0.6551094055175781,
716
+ "rewards/rejected": -2.162632703781128,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 2.158697848236607e-09,
722
+ "logits/chosen": -2.2567198276519775,
723
+ "logits/rejected": -2.215657949447632,
724
+ "logps/chosen": -404.21527099609375,
725
+ "logps/rejected": -441.24945068359375,
726
+ "loss": 0.0867,
727
+ "rewards/accuracies": 0.6937500238418579,
728
+ "rewards/chosen": -1.5105773210525513,
729
+ "rewards/margins": 0.5308315753936768,
730
+ "rewards/rejected": -2.0414090156555176,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 4.269029751107489e-10,
736
+ "logits/chosen": -2.2435102462768555,
737
+ "logits/rejected": -2.2021100521087646,
738
+ "logps/chosen": -399.60418701171875,
739
+ "logps/rejected": -474.943359375,
740
+ "loss": 0.0902,
741
+ "rewards/accuracies": 0.762499988079071,
742
+ "rewards/chosen": -1.4095227718353271,
743
+ "rewards/margins": 0.8036805391311646,
744
+ "rewards/rejected": -2.213203191757202,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 478,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.13007899894375183,
752
+ "train_runtime": 3956.3918,
753
+ "train_samples_per_second": 15.452,
754
  "train_steps_per_second": 0.121
755
  }
756
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11d31916749d9d511e3b1889a2afe059d4a8035e5dcdd7b6a1e214f3f2a94603
3
- size 6008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9f4850dd2c180ba89e106647ab83af688a7502f2777cfe5fbc8857151119497
3
+ size 5944