jikaixuan commited on
Commit
be9d7d4
1 Parent(s): ae2a467

Model save

Browse files
Files changed (5) hide show
  1. README.md +21 -24
  2. adapter_model.safetensors +1 -1
  3. all_results.json +3 -18
  4. train_results.json +3 -3
  5. trainer_state.json +670 -670
README.md CHANGED
@@ -2,13 +2,10 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - dpo
8
  - generated_from_trainer
9
  base_model: mistralai/Mistral-7B-v0.1
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  model-index:
13
  - name: zephyr-7b
14
  results: []
@@ -19,19 +16,19 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # zephyr-7b
21
 
22
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-qlora](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora) on the HuggingFaceH4/ultrafeedback_binarized dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.6918
25
- - Rewards/chosen: -0.0862
26
- - Rewards/rejected: -0.1980
27
- - Rewards/accuracies: 0.3591
28
- - Rewards/margins: 0.1117
29
- - Logps/rejected: -95.1937
30
- - Logps/chosen: -77.5232
31
- - Logits/rejected: -1.9123
32
- - Logits/chosen: -1.9402
33
- - Use Label: 15333.4131
34
- - Pred Label: 4738.5874
35
 
36
  ## Model description
37
 
@@ -68,15 +65,15 @@ The following hyperparameters were used during training:
68
 
69
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Use Label | Pred Label |
70
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:----------:|:----------:|
71
- | 0.6876 | 0.1 | 100 | 0.6896 | -0.0555 | -0.0989 | 0.3353 | 0.0434 | -85.2883 | -74.4495 | -2.0761 | -2.1076 | 1766.8572 | 89.1429 |
72
- | 0.6892 | 0.21 | 200 | 0.6894 | -0.0049 | -0.0560 | 0.3492 | 0.0511 | -80.9954 | -69.3876 | -2.0287 | -2.0520 | 3500.8889 | 459.1111 |
73
- | 0.6904 | 0.31 | 300 | 0.6909 | -0.0625 | -0.1410 | 0.3532 | 0.0785 | -89.5016 | -75.1524 | -1.9943 | -2.0164 | 5140.6826 | 923.3174 |
74
- | 0.6906 | 0.42 | 400 | 0.6921 | -0.0637 | -0.1541 | 0.3512 | 0.0904 | -90.8064 | -75.2687 | -2.0248 | -2.0481 | 6695.4287 | 1472.5714 |
75
- | 0.6903 | 0.52 | 500 | 0.6914 | -0.0747 | -0.1726 | 0.3492 | 0.0979 | -92.6561 | -76.3697 | -1.9801 | -2.0071 | 8246.2061 | 2025.7937 |
76
- | 0.6903 | 0.63 | 600 | 0.6917 | -0.1005 | -0.2047 | 0.3552 | 0.1042 | -95.8670 | -78.9543 | -1.9601 | -1.9870 | 9772.0635 | 2603.9365 |
77
- | 0.6917 | 0.73 | 700 | 0.6917 | -0.1117 | -0.2224 | 0.3512 | 0.1108 | -97.6411 | -80.0681 | -1.9401 | -1.9659 | 11284.7773 | 3195.2222 |
78
- | 0.6912 | 0.84 | 800 | 0.6917 | -0.0869 | -0.1981 | 0.3631 | 0.1112 | -95.2089 | -77.5874 | -1.9144 | -1.9422 | 12826.8252 | 3757.1746 |
79
- | 0.6914 | 0.94 | 900 | 0.6918 | -0.0863 | -0.1983 | 0.3571 | 0.1120 | -95.2291 | -77.5275 | -1.9113 | -1.9391 | 14335.7139 | 4352.2856 |
80
 
81
 
82
  ### Framework versions
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
8
  base_model: mistralai/Mistral-7B-v0.1
 
 
9
  model-index:
10
  - name: zephyr-7b
11
  results: []
 
16
 
17
  # zephyr-7b
18
 
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.6928
22
+ - Rewards/chosen: -0.0288
23
+ - Rewards/rejected: -0.1012
24
+ - Rewards/accuracies: 0.3492
25
+ - Rewards/margins: 0.0723
26
+ - Logps/rejected: -85.5160
27
+ - Logps/chosen: -71.7842
28
+ - Logits/rejected: -2.1139
29
+ - Logits/chosen: -2.1428
30
+ - Use Label: 13461.3809
31
+ - Pred Label: 5226.6191
32
 
33
  ## Model description
34
 
 
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Use Label | Pred Label |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:----------:|:----------:|
68
+ | 0.6911 | 0.1 | 100 | 0.6919 | -0.0053 | -0.0356 | 0.3393 | 0.0303 | -78.9541 | -69.4262 | -2.0935 | -2.1210 | 1705.8572 | 150.1429 |
69
+ | 0.692 | 0.21 | 200 | 0.6927 | -0.0264 | -0.0695 | 0.3433 | 0.0431 | -82.3504 | -71.5409 | -2.1057 | -2.1268 | 3337.0476 | 622.9524 |
70
+ | 0.6924 | 0.31 | 300 | 0.6929 | -0.0369 | -0.0896 | 0.3393 | 0.0527 | -84.3537 | -72.5877 | -2.1933 | -2.2169 | 4863.7300 | 1200.2699 |
71
+ | 0.6927 | 0.42 | 400 | 0.6925 | -0.0211 | -0.0804 | 0.3413 | 0.0593 | -83.4364 | -71.0104 | -2.0934 | -2.1190 | 6324.0796 | 1843.9207 |
72
+ | 0.6924 | 0.52 | 500 | 0.6929 | -0.0206 | -0.0831 | 0.3433 | 0.0625 | -83.7112 | -70.9618 | -2.1518 | -2.1762 | 7772.7778 | 2499.2222 |
73
+ | 0.6929 | 0.63 | 600 | 0.6927 | -0.0452 | -0.1160 | 0.3512 | 0.0708 | -86.9945 | -73.4171 | -2.1125 | -2.1408 | 9198.8574 | 3177.1428 |
74
+ | 0.6928 | 0.73 | 700 | 0.6930 | -0.0507 | -0.1231 | 0.3512 | 0.0724 | -87.7077 | -73.9657 | -2.1086 | -2.1372 | 10627.2695 | 3852.7302 |
75
+ | 0.6927 | 0.84 | 800 | 0.6928 | -0.0272 | -0.0999 | 0.3552 | 0.0726 | -85.3832 | -71.6247 | -2.1141 | -2.1431 | 12045.5234 | 4538.4761 |
76
+ | 0.6929 | 0.94 | 900 | 0.6928 | -0.0288 | -0.1012 | 0.3492 | 0.0723 | -85.5160 | -71.7842 | -2.1139 | -2.1428 | 13461.3809 | 5226.6191 |
77
 
78
 
79
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1d8fa522ecbd41d7ed29a7426d9923a51393e6fb2d160dd942b03bce23414f6
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7756f5d0cc022294ccf38c261284a9de8e425fd482035784aa767bab75061bc0
3
  size 671150064
all_results.json CHANGED
@@ -1,23 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": -1.9401931762695312,
4
- "eval_logits/rejected": -1.9123154878616333,
5
- "eval_logps/chosen": -77.5232162475586,
6
- "eval_logps/rejected": -95.19373321533203,
7
- "eval_loss": 0.6917868852615356,
8
- "eval_pred_label": 4738.58740234375,
9
- "eval_rewards/accuracies": 0.3591269850730896,
10
- "eval_rewards/chosen": -0.0862266793847084,
11
- "eval_rewards/margins": 0.11172995716333389,
12
- "eval_rewards/rejected": -0.19795666635036469,
13
- "eval_runtime": 247.3331,
14
- "eval_samples": 2000,
15
- "eval_samples_per_second": 8.086,
16
- "eval_steps_per_second": 0.255,
17
- "eval_use_label": 15333.4130859375,
18
- "train_loss": 0.6906769273168754,
19
- "train_runtime": 20027.4031,
20
  "train_samples": 61135,
21
- "train_samples_per_second": 3.053,
22
  "train_steps_per_second": 0.048
23
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.692275420283772,
4
+ "train_runtime": 20019.5915,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 3.054,
7
  "train_steps_per_second": 0.048
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.6906769273168754,
4
- "train_runtime": 20027.4031,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 3.053,
7
  "train_steps_per_second": 0.048
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.692275420283772,
4
+ "train_runtime": 20019.5915,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 3.054,
7
  "train_steps_per_second": 0.048
8
  }
trainer_state.json CHANGED
@@ -29,970 +29,970 @@
29
  "epoch": 0.02,
30
  "grad_norm": 0.6796875,
31
  "learning_rate": 1.0416666666666667e-06,
32
- "logits/chosen": -2.2281553745269775,
33
- "logits/rejected": -2.276446580886841,
34
- "logps/chosen": -57.036190032958984,
35
- "logps/rejected": -66.88007354736328,
36
  "loss": 0.6927,
37
  "pred_label": 0.0,
38
- "rewards/accuracies": 0.24013157188892365,
39
- "rewards/chosen": 0.003924594726413488,
40
- "rewards/margins": 0.0009102027979679406,
41
- "rewards/rejected": 0.0030143915209919214,
42
  "step": 20,
43
  "use_label": 170.0
44
  },
45
  {
46
  "epoch": 0.04,
47
- "grad_norm": 0.6328125,
48
  "learning_rate": 2.0833333333333334e-06,
49
- "logits/chosen": -2.2738099098205566,
50
- "logits/rejected": -2.2623789310455322,
51
- "logps/chosen": -54.78137969970703,
52
- "logps/rejected": -67.2437515258789,
53
- "loss": 0.6914,
54
  "pred_label": 0.0,
55
  "rewards/accuracies": 0.24687500298023224,
56
- "rewards/chosen": 0.01747792772948742,
57
- "rewards/margins": 0.001674558618105948,
58
- "rewards/rejected": 0.015803368762135506,
59
  "step": 40,
60
  "use_label": 482.0
61
  },
62
  {
63
  "epoch": 0.06,
64
- "grad_norm": 0.71875,
65
  "learning_rate": 3.125e-06,
66
- "logits/chosen": -2.3237431049346924,
67
- "logits/rejected": -2.321906089782715,
68
- "logps/chosen": -75.5770034790039,
69
- "logps/rejected": -87.68544006347656,
70
- "loss": 0.6885,
71
- "pred_label": 0.0,
72
- "rewards/accuracies": 0.3125,
73
- "rewards/chosen": 0.031676117330789566,
74
- "rewards/margins": 0.009719676338136196,
75
- "rewards/rejected": 0.021956440061330795,
76
  "step": 60,
77
- "use_label": 802.0
78
  },
79
  {
80
  "epoch": 0.08,
81
- "grad_norm": 0.73828125,
82
  "learning_rate": 4.166666666666667e-06,
83
- "logits/chosen": -2.2948005199432373,
84
- "logits/rejected": -2.2623462677001953,
85
- "logps/chosen": -79.29240417480469,
86
- "logps/rejected": -83.04844665527344,
87
- "loss": 0.6876,
88
- "pred_label": 5.800000190734863,
89
  "rewards/accuracies": 0.3343749940395355,
90
- "rewards/chosen": 0.016009245067834854,
91
- "rewards/margins": 0.018887853249907494,
92
- "rewards/rejected": -0.0028786074835807085,
93
  "step": 80,
94
- "use_label": 1116.199951171875
95
  },
96
  {
97
  "epoch": 0.1,
98
- "grad_norm": 0.6953125,
99
  "learning_rate": 4.9997324926814375e-06,
100
- "logits/chosen": -2.2056884765625,
101
- "logits/rejected": -2.210036039352417,
102
- "logps/chosen": -68.87937927246094,
103
- "logps/rejected": -77.87590026855469,
104
- "loss": 0.6876,
105
- "pred_label": 27.537500381469727,
106
- "rewards/accuracies": 0.34062498807907104,
107
- "rewards/chosen": -0.010471501387655735,
108
- "rewards/margins": 0.03584115579724312,
109
- "rewards/rejected": -0.04631265625357628,
110
  "step": 100,
111
- "use_label": 1414.4625244140625
112
  },
113
  {
114
  "epoch": 0.1,
115
- "eval_logits/chosen": -2.1076083183288574,
116
- "eval_logits/rejected": -2.0761499404907227,
117
- "eval_logps/chosen": -74.44951629638672,
118
- "eval_logps/rejected": -85.2883071899414,
119
- "eval_loss": 0.6895647048950195,
120
- "eval_pred_label": 89.14286041259766,
121
- "eval_rewards/accuracies": 0.335317462682724,
122
- "eval_rewards/chosen": -0.05548960343003273,
123
- "eval_rewards/margins": 0.04341282695531845,
124
- "eval_rewards/rejected": -0.09890241920948029,
125
- "eval_runtime": 247.5952,
126
- "eval_samples_per_second": 8.078,
127
  "eval_steps_per_second": 0.254,
128
- "eval_use_label": 1766.857177734375,
129
  "step": 100
130
  },
131
  {
132
  "epoch": 0.13,
133
- "grad_norm": 0.7578125,
134
  "learning_rate": 4.9903757462135984e-06,
135
- "logits/chosen": -2.2542896270751953,
136
- "logits/rejected": -2.1902401447296143,
137
- "logps/chosen": -70.2941665649414,
138
- "logps/rejected": -84.7874755859375,
139
- "loss": 0.6884,
140
- "pred_label": 155.6374969482422,
141
- "rewards/accuracies": 0.3187499940395355,
142
- "rewards/chosen": -0.023759985342621803,
143
- "rewards/margins": 0.051492441445589066,
144
- "rewards/rejected": -0.07525241374969482,
145
  "step": 120,
146
- "use_label": 2110.362548828125
147
  },
148
  {
149
  "epoch": 0.15,
150
- "grad_norm": 0.55859375,
151
  "learning_rate": 4.967700826904229e-06,
152
- "logits/chosen": -2.1823272705078125,
153
- "logits/rejected": -2.210157632827759,
154
- "logps/chosen": -61.80498504638672,
155
- "logps/rejected": -76.43424224853516,
156
- "loss": 0.6907,
157
- "pred_label": 204.22500610351562,
158
- "rewards/accuracies": 0.26875001192092896,
159
- "rewards/chosen": -0.029314354062080383,
160
- "rewards/margins": 0.036702848970890045,
161
- "rewards/rejected": -0.06601719558238983,
162
  "step": 140,
163
- "use_label": 2381.77490234375
164
  },
165
  {
166
  "epoch": 0.17,
167
- "grad_norm": 0.70703125,
168
  "learning_rate": 4.931828996974498e-06,
169
- "logits/chosen": -2.251568555831909,
170
- "logits/rejected": -2.220432996749878,
171
- "logps/chosen": -66.60148620605469,
172
- "logps/rejected": -71.53702545166016,
173
- "loss": 0.69,
174
- "pred_label": 257.2124938964844,
175
- "rewards/accuracies": 0.3343749940395355,
176
- "rewards/chosen": -0.020524730905890465,
177
- "rewards/margins": 0.05932433158159256,
178
- "rewards/rejected": -0.07984906435012817,
179
  "step": 160,
180
- "use_label": 2648.78759765625
181
  },
182
  {
183
  "epoch": 0.19,
184
- "grad_norm": 0.6796875,
185
  "learning_rate": 4.882952093833628e-06,
186
- "logits/chosen": -2.114015817642212,
187
- "logits/rejected": -2.126950740814209,
188
- "logps/chosen": -66.40071868896484,
189
- "logps/rejected": -78.54503631591797,
190
- "loss": 0.6901,
191
- "pred_label": 319.9624938964844,
192
- "rewards/accuracies": 0.328125,
193
- "rewards/chosen": -0.03171534463763237,
194
- "rewards/margins": 0.0544399619102478,
195
- "rewards/rejected": -0.08615531027317047,
196
  "step": 180,
197
- "use_label": 2906.03759765625
198
  },
199
  {
200
  "epoch": 0.21,
201
- "grad_norm": 0.9140625,
202
  "learning_rate": 4.821331504159906e-06,
203
- "logits/chosen": -2.138213872909546,
204
- "logits/rejected": -2.108750343322754,
205
- "logps/chosen": -77.92289733886719,
206
- "logps/rejected": -78.32075500488281,
207
- "loss": 0.6892,
208
- "pred_label": 383.5249938964844,
209
- "rewards/accuracies": 0.37812501192092896,
210
- "rewards/chosen": -0.009543296881020069,
211
- "rewards/margins": 0.06037301942706108,
212
- "rewards/rejected": -0.06991632282733917,
213
  "step": 200,
214
- "use_label": 3162.47509765625
215
  },
216
  {
217
  "epoch": 0.21,
218
- "eval_logits/chosen": -2.051973581314087,
219
- "eval_logits/rejected": -2.028658390045166,
220
- "eval_logps/chosen": -69.3875503540039,
221
- "eval_logps/rejected": -80.99542999267578,
222
- "eval_loss": 0.6893584132194519,
223
- "eval_pred_label": 459.1111145019531,
224
- "eval_rewards/accuracies": 0.3492063581943512,
225
- "eval_rewards/chosen": -0.0048699695616960526,
226
- "eval_rewards/margins": 0.05110359564423561,
227
- "eval_rewards/rejected": -0.05597356706857681,
228
- "eval_runtime": 247.8689,
229
- "eval_samples_per_second": 8.069,
230
  "eval_steps_per_second": 0.254,
231
- "eval_use_label": 3500.888916015625,
232
  "step": 200
233
  },
234
  {
235
  "epoch": 0.23,
236
- "grad_norm": 0.765625,
237
  "learning_rate": 4.747296766042161e-06,
238
- "logits/chosen": -2.172316074371338,
239
- "logits/rejected": -2.1599390506744385,
240
- "logps/chosen": -73.75865173339844,
241
- "logps/rejected": -76.45826721191406,
242
- "loss": 0.6906,
243
- "pred_label": 537.4000244140625,
244
- "rewards/accuracies": 0.34375,
245
- "rewards/chosen": -0.017265746369957924,
246
- "rewards/margins": 0.061459798365831375,
247
- "rewards/rejected": -0.07872554659843445,
248
  "step": 220,
249
- "use_label": 3832.60009765625
250
  },
251
  {
252
  "epoch": 0.25,
253
- "grad_norm": 0.671875,
254
  "learning_rate": 4.661243806657256e-06,
255
- "logits/chosen": -2.1377243995666504,
256
- "logits/rejected": -2.114131450653076,
257
- "logps/chosen": -78.08522033691406,
258
- "logps/rejected": -88.16291809082031,
259
- "loss": 0.6906,
260
- "pred_label": 610.8624877929688,
261
- "rewards/accuracies": 0.3375000059604645,
262
- "rewards/chosen": -0.06858871877193451,
263
- "rewards/margins": 0.07855252921581268,
264
- "rewards/rejected": -0.1471412628889084,
265
  "step": 240,
266
- "use_label": 4079.137451171875
267
  },
268
  {
269
  "epoch": 0.27,
270
- "grad_norm": 0.70703125,
271
  "learning_rate": 4.563632824908252e-06,
272
- "logits/chosen": -2.1762757301330566,
273
- "logits/rejected": -2.173243999481201,
274
- "logps/chosen": -69.33678436279297,
275
- "logps/rejected": -82.98787689208984,
276
- "loss": 0.6907,
277
- "pred_label": 682.2750244140625,
278
- "rewards/accuracies": 0.33125001192092896,
279
- "rewards/chosen": -0.06302420794963837,
280
- "rewards/margins": 0.0732887014746666,
281
- "rewards/rejected": -0.13631291687488556,
282
  "step": 260,
283
- "use_label": 4327.72509765625
284
  },
285
  {
286
  "epoch": 0.29,
287
- "grad_norm": 0.625,
288
  "learning_rate": 4.454985830346574e-06,
289
- "logits/chosen": -2.16465425491333,
290
- "logits/rejected": -2.1788923740386963,
291
- "logps/chosen": -74.41441345214844,
292
- "logps/rejected": -78.55416870117188,
293
- "loss": 0.6892,
294
- "pred_label": 749.125,
295
- "rewards/accuracies": 0.3062500059604645,
296
- "rewards/chosen": -0.06083650514483452,
297
- "rewards/margins": 0.04520425945520401,
298
- "rewards/rejected": -0.10604077577590942,
299
  "step": 280,
300
- "use_label": 4580.875
301
  },
302
  {
303
  "epoch": 0.31,
304
- "grad_norm": 0.65234375,
305
  "learning_rate": 4.335883851539693e-06,
306
- "logits/chosen": -2.0553781986236572,
307
- "logits/rejected": -2.0573229789733887,
308
- "logps/chosen": -69.96788024902344,
309
- "logps/rejected": -80.52223205566406,
310
- "loss": 0.6904,
311
- "pred_label": 824.5499877929688,
312
- "rewards/accuracies": 0.359375,
313
- "rewards/chosen": -0.04866168648004532,
314
- "rewards/margins": 0.09801270812749863,
315
- "rewards/rejected": -0.14667439460754395,
316
  "step": 300,
317
- "use_label": 4825.4501953125
318
  },
319
  {
320
  "epoch": 0.31,
321
- "eval_logits/chosen": -2.0163989067077637,
322
- "eval_logits/rejected": -1.9942671060562134,
323
- "eval_logps/chosen": -75.15243530273438,
324
- "eval_logps/rejected": -89.50163269042969,
325
- "eval_loss": 0.6908969879150391,
326
- "eval_pred_label": 923.3174438476562,
327
- "eval_rewards/accuracies": 0.3531745970249176,
328
- "eval_rewards/chosen": -0.06251893937587738,
329
- "eval_rewards/margins": 0.07851671427488327,
330
- "eval_rewards/rejected": -0.14103564620018005,
331
- "eval_runtime": 247.8241,
332
- "eval_samples_per_second": 8.07,
333
  "eval_steps_per_second": 0.254,
334
- "eval_use_label": 5140.6826171875,
335
  "step": 300
336
  },
337
  {
338
  "epoch": 0.33,
339
- "grad_norm": 0.9140625,
340
  "learning_rate": 4.206963828813555e-06,
341
- "logits/chosen": -2.065279483795166,
342
- "logits/rejected": -2.0684821605682373,
343
- "logps/chosen": -72.58639526367188,
344
- "logps/rejected": -89.45655822753906,
345
- "loss": 0.6899,
346
- "pred_label": 1033.7874755859375,
347
- "rewards/accuracies": 0.3125,
348
- "rewards/chosen": -0.11120834201574326,
349
- "rewards/margins": 0.0645986869931221,
350
- "rewards/rejected": -0.17580702900886536,
351
  "step": 320,
352
- "use_label": 5440.21240234375
353
  },
354
  {
355
  "epoch": 0.36,
356
- "grad_norm": 0.56640625,
357
  "learning_rate": 4.068915207986931e-06,
358
- "logits/chosen": -2.033398151397705,
359
- "logits/rejected": -1.991502046585083,
360
- "logps/chosen": -71.1894760131836,
361
- "logps/rejected": -84.0774154663086,
362
- "loss": 0.6917,
363
- "pred_label": 1122.112548828125,
364
- "rewards/accuracies": 0.3375000059604645,
365
- "rewards/chosen": -0.07950185984373093,
366
- "rewards/margins": 0.08617939054965973,
367
- "rewards/rejected": -0.16568127274513245,
368
  "step": 340,
369
- "use_label": 5671.8876953125
370
  },
371
  {
372
  "epoch": 0.38,
373
- "grad_norm": 0.84765625,
374
  "learning_rate": 3.922476253313921e-06,
375
- "logits/chosen": -2.0358688831329346,
376
- "logits/rejected": -2.0224781036376953,
377
- "logps/chosen": -76.57051849365234,
378
- "logps/rejected": -84.2589340209961,
379
- "loss": 0.6914,
380
- "pred_label": 1204.4124755859375,
381
- "rewards/accuracies": 0.31562501192092896,
382
- "rewards/chosen": -0.11715561151504517,
383
- "rewards/margins": 0.07723374664783478,
384
- "rewards/rejected": -0.19438934326171875,
385
  "step": 360,
386
- "use_label": 5909.58740234375
387
  },
388
  {
389
  "epoch": 0.4,
390
- "grad_norm": 0.55078125,
391
  "learning_rate": 3.768430099352445e-06,
392
- "logits/chosen": -2.12782621383667,
393
- "logits/rejected": -2.086026430130005,
394
- "logps/chosen": -74.41622161865234,
395
- "logps/rejected": -85.17180633544922,
396
- "loss": 0.6918,
397
- "pred_label": 1289.9375,
398
- "rewards/accuracies": 0.3656249940395355,
399
- "rewards/chosen": -0.07592298835515976,
400
- "rewards/margins": 0.08457346260547638,
401
- "rewards/rejected": -0.16049645841121674,
402
  "step": 380,
403
- "use_label": 6144.0625
404
  },
405
  {
406
  "epoch": 0.42,
407
- "grad_norm": 0.73046875,
408
  "learning_rate": 3.607600562872785e-06,
409
- "logits/chosen": -2.126784086227417,
410
- "logits/rejected": -2.1261298656463623,
411
- "logps/chosen": -83.82131958007812,
412
- "logps/rejected": -86.00455474853516,
413
- "loss": 0.6906,
414
- "pred_label": 1373.137451171875,
415
- "rewards/accuracies": 0.3375000059604645,
416
- "rewards/chosen": -0.05874443054199219,
417
- "rewards/margins": 0.06775099784135818,
418
- "rewards/rejected": -0.12649545073509216,
419
  "step": 400,
420
- "use_label": 6380.8623046875
421
  },
422
  {
423
  "epoch": 0.42,
424
- "eval_logits/chosen": -2.0480618476867676,
425
- "eval_logits/rejected": -2.0248324871063232,
426
- "eval_logps/chosen": -75.26866149902344,
427
- "eval_logps/rejected": -90.80635070800781,
428
- "eval_loss": 0.6920759081840515,
429
- "eval_pred_label": 1472.5714111328125,
430
- "eval_rewards/accuracies": 0.3511904776096344,
431
- "eval_rewards/chosen": -0.06368114054203033,
432
- "eval_rewards/margins": 0.09040173143148422,
433
- "eval_rewards/rejected": -0.15408287942409515,
434
- "eval_runtime": 248.0088,
435
  "eval_samples_per_second": 8.064,
436
  "eval_steps_per_second": 0.254,
437
- "eval_use_label": 6695.4287109375,
438
  "step": 400
439
  },
440
  {
441
  "epoch": 0.44,
442
- "grad_norm": 0.78515625,
443
  "learning_rate": 3.4408477372034743e-06,
444
- "logits/chosen": -2.055358409881592,
445
- "logits/rejected": -2.068175792694092,
446
- "logps/chosen": -70.47552490234375,
447
- "logps/rejected": -79.02010345458984,
448
- "loss": 0.6903,
449
- "pred_label": 1589.0374755859375,
450
- "rewards/accuracies": 0.3656249940395355,
451
- "rewards/chosen": -0.06399895995855331,
452
- "rewards/margins": 0.0963120311498642,
453
- "rewards/rejected": -0.16031098365783691,
454
  "step": 420,
455
- "use_label": 6988.96240234375
456
  },
457
  {
458
  "epoch": 0.46,
459
- "grad_norm": 0.95703125,
460
  "learning_rate": 3.269063392575352e-06,
461
- "logits/chosen": -2.0893940925598145,
462
- "logits/rejected": -2.09212589263916,
463
- "logps/chosen": -85.68560028076172,
464
- "logps/rejected": -87.41291809082031,
465
- "loss": 0.6912,
466
- "pred_label": 1667.6875,
467
- "rewards/accuracies": 0.33125001192092896,
468
- "rewards/chosen": -0.13728377223014832,
469
- "rewards/margins": 0.07875251770019531,
470
- "rewards/rejected": -0.21603628993034363,
471
  "step": 440,
472
- "use_label": 7230.3125
473
  },
474
  {
475
  "epoch": 0.48,
476
- "grad_norm": 0.53515625,
477
  "learning_rate": 3.09316620706208e-06,
478
- "logits/chosen": -2.079465389251709,
479
- "logits/rejected": -2.091001033782959,
480
- "logps/chosen": -73.67254638671875,
481
- "logps/rejected": -81.05415344238281,
482
- "loss": 0.6916,
483
- "pred_label": 1751.75,
484
- "rewards/accuracies": 0.30000001192092896,
485
- "rewards/chosen": -0.0876312330365181,
486
- "rewards/margins": 0.08376732468605042,
487
- "rewards/rejected": -0.17139855027198792,
488
  "step": 460,
489
- "use_label": 7466.25
490
  },
491
  {
492
  "epoch": 0.5,
493
- "grad_norm": 0.69921875,
494
  "learning_rate": 2.91409685362137e-06,
495
- "logits/chosen": -2.0379364490509033,
496
- "logits/rejected": -2.0492634773254395,
497
- "logps/chosen": -77.06828308105469,
498
- "logps/rejected": -89.38865661621094,
499
- "loss": 0.6912,
500
- "pred_label": 1832.6500244140625,
501
- "rewards/accuracies": 0.36250001192092896,
502
- "rewards/chosen": -0.06041146069765091,
503
- "rewards/margins": 0.10216375440359116,
504
- "rewards/rejected": -0.16257521510124207,
505
  "step": 480,
506
- "use_label": 7705.35009765625
507
  },
508
  {
509
  "epoch": 0.52,
510
- "grad_norm": 0.86328125,
511
  "learning_rate": 2.7328129695107205e-06,
512
- "logits/chosen": -2.031346082687378,
513
- "logits/rejected": -2.0272762775421143,
514
- "logps/chosen": -79.55888366699219,
515
- "logps/rejected": -84.47586822509766,
516
- "loss": 0.6903,
517
- "pred_label": 1919.5374755859375,
518
- "rewards/accuracies": 0.36250001192092896,
519
- "rewards/chosen": -0.08177755773067474,
520
- "rewards/margins": 0.08017835766077042,
521
- "rewards/rejected": -0.16195592284202576,
522
  "step": 500,
523
- "use_label": 7938.46240234375
524
  },
525
  {
526
  "epoch": 0.52,
527
- "eval_logits/chosen": -2.0070507526397705,
528
- "eval_logits/rejected": -1.9800992012023926,
529
- "eval_logps/chosen": -76.36968231201172,
530
- "eval_logps/rejected": -92.65614318847656,
531
- "eval_loss": 0.6914148926734924,
532
- "eval_pred_label": 2025.793701171875,
533
- "eval_rewards/accuracies": 0.3492063581943512,
534
- "eval_rewards/chosen": -0.07469133287668228,
535
- "eval_rewards/margins": 0.09788943827152252,
536
- "eval_rewards/rejected": -0.1725807636976242,
537
- "eval_runtime": 247.8554,
538
- "eval_samples_per_second": 8.069,
539
  "eval_steps_per_second": 0.254,
540
- "eval_use_label": 8246.2060546875,
541
  "step": 500
542
  },
543
  {
544
  "epoch": 0.54,
545
- "grad_norm": 0.78125,
546
  "learning_rate": 2.5502840349805074e-06,
547
- "logits/chosen": -2.026449203491211,
548
- "logits/rejected": -2.0701510906219482,
549
- "logps/chosen": -75.1209487915039,
550
- "logps/rejected": -88.01356506347656,
551
- "loss": 0.6913,
552
- "pred_label": 2148.887451171875,
553
- "rewards/accuracies": 0.3531250059604645,
554
- "rewards/chosen": -0.06801941990852356,
555
- "rewards/margins": 0.09691040217876434,
556
- "rewards/rejected": -0.1649298369884491,
557
  "step": 520,
558
- "use_label": 8533.1123046875
559
  },
560
  {
561
  "epoch": 0.57,
562
- "grad_norm": 1.09375,
563
  "learning_rate": 2.367486188632446e-06,
564
- "logits/chosen": -2.0245327949523926,
565
- "logits/rejected": -2.0479135513305664,
566
- "logps/chosen": -84.60169219970703,
567
- "logps/rejected": -90.6330795288086,
568
- "loss": 0.692,
569
- "pred_label": 2235.550048828125,
570
- "rewards/accuracies": 0.359375,
571
- "rewards/chosen": -0.09091995656490326,
572
- "rewards/margins": 0.11123095452785492,
573
- "rewards/rejected": -0.20215091109275818,
574
  "step": 540,
575
- "use_label": 8766.4501953125
576
  },
577
  {
578
  "epoch": 0.59,
579
- "grad_norm": 0.75390625,
580
  "learning_rate": 2.1853970071701415e-06,
581
- "logits/chosen": -2.0177600383758545,
582
- "logits/rejected": -2.016798257827759,
583
- "logps/chosen": -78.94650268554688,
584
- "logps/rejected": -80.36412811279297,
585
- "loss": 0.6917,
586
- "pred_label": 2319.53759765625,
587
  "rewards/accuracies": 0.2874999940395355,
588
- "rewards/chosen": -0.10138510167598724,
589
- "rewards/margins": 0.06911652535200119,
590
- "rewards/rejected": -0.17050163447856903,
591
  "step": 560,
592
- "use_label": 9002.462890625
593
  },
594
  {
595
  "epoch": 0.61,
596
- "grad_norm": 0.71875,
597
  "learning_rate": 2.00499027745888e-06,
598
- "logits/chosen": -2.054065704345703,
599
- "logits/rejected": -2.0555384159088135,
600
- "logps/chosen": -80.3529281616211,
601
- "logps/rejected": -95.12947082519531,
602
- "loss": 0.6919,
603
- "pred_label": 2401.675048828125,
604
- "rewards/accuracies": 0.359375,
605
- "rewards/chosen": -0.09597108513116837,
606
- "rewards/margins": 0.09131233394145966,
607
- "rewards/rejected": -0.18728342652320862,
608
  "step": 580,
609
- "use_label": 9240.3251953125
610
  },
611
  {
612
  "epoch": 0.63,
613
- "grad_norm": 0.76171875,
614
  "learning_rate": 1.8272307888529276e-06,
615
- "logits/chosen": -2.059126377105713,
616
- "logits/rejected": -2.099806547164917,
617
- "logps/chosen": -89.58797454833984,
618
- "logps/rejected": -108.6166000366211,
619
- "loss": 0.6903,
620
- "pred_label": 2492.9375,
621
- "rewards/accuracies": 0.41874998807907104,
622
- "rewards/chosen": -0.12580521404743195,
623
- "rewards/margins": 0.10241512209177017,
624
- "rewards/rejected": -0.22822031378746033,
625
  "step": 600,
626
- "use_label": 9469.0625
627
  },
628
  {
629
  "epoch": 0.63,
630
- "eval_logits/chosen": -1.9870026111602783,
631
- "eval_logits/rejected": -1.960112452507019,
632
- "eval_logps/chosen": -78.95431518554688,
633
- "eval_logps/rejected": -95.86695861816406,
634
- "eval_loss": 0.6917396187782288,
635
- "eval_pred_label": 2603.9365234375,
636
- "eval_rewards/accuracies": 0.3551587164402008,
637
- "eval_rewards/chosen": -0.1005377396941185,
638
- "eval_rewards/margins": 0.104151152074337,
639
- "eval_rewards/rejected": -0.2046888917684555,
640
- "eval_runtime": 247.9642,
641
- "eval_samples_per_second": 8.066,
642
  "eval_steps_per_second": 0.254,
643
- "eval_use_label": 9772.0634765625,
644
  "step": 600
645
  },
646
  {
647
  "epoch": 0.65,
648
- "grad_norm": 0.5859375,
649
  "learning_rate": 1.6530691736402317e-06,
650
- "logits/chosen": -1.9752880334854126,
651
- "logits/rejected": -2.011981964111328,
652
- "logps/chosen": -69.71615600585938,
653
- "logps/rejected": -95.88337707519531,
654
- "loss": 0.6918,
655
- "pred_label": 2726.324951171875,
656
- "rewards/accuracies": 0.34687501192092896,
657
- "rewards/chosen": -0.09408678859472275,
658
- "rewards/margins": 0.09362435340881348,
659
- "rewards/rejected": -0.18771114945411682,
660
  "step": 620,
661
- "use_label": 10059.6748046875
662
  },
663
  {
664
  "epoch": 0.67,
665
- "grad_norm": 0.73046875,
666
  "learning_rate": 1.4834368231970922e-06,
667
- "logits/chosen": -2.0288071632385254,
668
- "logits/rejected": -2.0409998893737793,
669
- "logps/chosen": -82.56907653808594,
670
- "logps/rejected": -90.75765228271484,
671
- "loss": 0.6894,
672
- "pred_label": 2805.512451171875,
673
- "rewards/accuracies": 0.36250001192092896,
674
- "rewards/chosen": -0.10210500657558441,
675
- "rewards/margins": 0.10695278644561768,
676
- "rewards/rejected": -0.2090577781200409,
677
  "step": 640,
678
- "use_label": 10300.4873046875
679
  },
680
  {
681
  "epoch": 0.69,
682
- "grad_norm": 0.5625,
683
  "learning_rate": 1.3192409070404582e-06,
684
- "logits/chosen": -2.055405855178833,
685
- "logits/rejected": -2.0071816444396973,
686
- "logps/chosen": -77.25361633300781,
687
- "logps/rejected": -88.34065246582031,
688
- "loss": 0.6915,
689
- "pred_label": 2899.9375,
690
- "rewards/accuracies": 0.34687501192092896,
691
- "rewards/chosen": -0.11595650017261505,
692
- "rewards/margins": 0.0952102541923523,
693
- "rewards/rejected": -0.21116676926612854,
694
  "step": 660,
695
- "use_label": 10526.0625
696
  },
697
  {
698
  "epoch": 0.71,
699
- "grad_norm": 0.67578125,
700
  "learning_rate": 1.1613595214152713e-06,
701
- "logits/chosen": -2.056795597076416,
702
- "logits/rejected": -2.071035861968994,
703
- "logps/chosen": -88.15283203125,
704
- "logps/rejected": -96.39839172363281,
705
- "loss": 0.6918,
706
- "pred_label": 2978.0625,
707
- "rewards/accuracies": 0.3499999940395355,
708
- "rewards/chosen": -0.12273094803094864,
709
- "rewards/margins": 0.09404005855321884,
710
- "rewards/rejected": -0.2167709767818451,
711
  "step": 680,
712
- "use_label": 10767.9375
713
  },
714
  {
715
  "epoch": 0.73,
716
- "grad_norm": 0.74609375,
717
  "learning_rate": 1.0106369933615043e-06,
718
- "logits/chosen": -2.0782313346862793,
719
- "logits/rejected": -2.0467371940612793,
720
- "logps/chosen": -97.93621826171875,
721
- "logps/rejected": -106.91497802734375,
722
- "loss": 0.6917,
723
- "pred_label": 3075.71240234375,
724
- "rewards/accuracies": 0.3687500059604645,
725
- "rewards/chosen": -0.1391007900238037,
726
- "rewards/margins": 0.10766571760177612,
727
- "rewards/rejected": -0.24676652252674103,
728
  "step": 700,
729
- "use_label": 10990.287109375
730
  },
731
  {
732
  "epoch": 0.73,
733
- "eval_logits/chosen": -1.9658821821212769,
734
- "eval_logits/rejected": -1.9401167631149292,
735
- "eval_logps/chosen": -80.06806182861328,
736
- "eval_logps/rejected": -97.64107513427734,
737
- "eval_loss": 0.6917343735694885,
738
- "eval_pred_label": 3195.22216796875,
739
  "eval_rewards/accuracies": 0.3511904776096344,
740
- "eval_rewards/chosen": -0.11167524009943008,
741
- "eval_rewards/margins": 0.1107548326253891,
742
- "eval_rewards/rejected": -0.2224300652742386,
743
- "eval_runtime": 247.943,
744
- "eval_samples_per_second": 8.066,
745
  "eval_steps_per_second": 0.254,
746
- "eval_use_label": 11284.77734375,
747
  "step": 700
748
  },
749
  {
750
  "epoch": 0.75,
751
- "grad_norm": 0.72265625,
752
  "learning_rate": 8.678793653740633e-07,
753
- "logits/chosen": -2.015249729156494,
754
- "logits/rejected": -2.0358498096466064,
755
- "logps/chosen": -70.9017562866211,
756
- "logps/rejected": -86.4397201538086,
757
- "loss": 0.6908,
758
- "pred_label": 3306.39990234375,
759
- "rewards/accuracies": 0.3187499940395355,
760
- "rewards/chosen": -0.10931293666362762,
761
- "rewards/margins": 0.0925455391407013,
762
- "rewards/rejected": -0.20185847580432892,
763
  "step": 720,
764
- "use_label": 11583.599609375
765
  },
766
  {
767
  "epoch": 0.77,
768
- "grad_norm": 0.83203125,
769
  "learning_rate": 7.338500848029603e-07,
770
- "logits/chosen": -2.01334810256958,
771
- "logits/rejected": -2.0296788215637207,
772
- "logps/chosen": -74.19635772705078,
773
- "logps/rejected": -83.99024200439453,
774
- "loss": 0.6911,
775
- "pred_label": 3386.16259765625,
776
- "rewards/accuracies": 0.32499998807907104,
777
- "rewards/chosen": -0.08706559240818024,
778
- "rewards/margins": 0.11473299562931061,
779
- "rewards/rejected": -0.20179858803749084,
780
  "step": 740,
781
- "use_label": 11823.837890625
782
  },
783
  {
784
  "epoch": 0.8,
785
- "grad_norm": 0.66015625,
786
  "learning_rate": 6.092659210462232e-07,
787
- "logits/chosen": -2.052433967590332,
788
- "logits/rejected": -2.060997724533081,
789
- "logps/chosen": -76.93110656738281,
790
- "logps/rejected": -97.30107879638672,
791
- "loss": 0.6904,
792
- "pred_label": 3466.5,
793
  "rewards/accuracies": 0.33125001192092896,
794
- "rewards/chosen": -0.11182014644145966,
795
- "rewards/margins": 0.07981495559215546,
796
- "rewards/rejected": -0.1916351020336151,
797
  "step": 760,
798
- "use_label": 12063.5
799
  },
800
  {
801
  "epoch": 0.82,
802
- "grad_norm": 0.859375,
803
  "learning_rate": 4.947931323697983e-07,
804
- "logits/chosen": -2.032320737838745,
805
- "logits/rejected": -2.047227144241333,
806
- "logps/chosen": -89.46810913085938,
807
- "logps/rejected": -95.58660125732422,
808
- "loss": 0.6913,
809
- "pred_label": 3558.875,
810
  "rewards/accuracies": 0.375,
811
- "rewards/chosen": -0.11294672638177872,
812
- "rewards/margins": 0.11753211170434952,
813
- "rewards/rejected": -0.23047883808612823,
814
  "step": 780,
815
- "use_label": 12291.125
816
  },
817
  {
818
  "epoch": 0.84,
819
- "grad_norm": 0.74609375,
820
  "learning_rate": 3.910439028537638e-07,
821
- "logits/chosen": -2.010045289993286,
822
- "logits/rejected": -1.989505410194397,
823
- "logps/chosen": -70.47514343261719,
824
- "logps/rejected": -75.11082458496094,
825
- "loss": 0.6912,
826
- "pred_label": 3649.22509765625,
827
- "rewards/accuracies": 0.3656249940395355,
828
- "rewards/chosen": -0.08034199476242065,
829
- "rewards/margins": 0.0995674580335617,
830
- "rewards/rejected": -0.17990948259830475,
831
  "step": 800,
832
- "use_label": 12520.775390625
833
  },
834
  {
835
  "epoch": 0.84,
836
- "eval_logits/chosen": -1.9421576261520386,
837
- "eval_logits/rejected": -1.9144233465194702,
838
- "eval_logps/chosen": -77.5874252319336,
839
- "eval_logps/rejected": -95.20885467529297,
840
- "eval_loss": 0.6917100548744202,
841
- "eval_pred_label": 3757.174560546875,
842
- "eval_rewards/accuracies": 0.363095223903656,
843
- "eval_rewards/chosen": -0.08686873316764832,
844
- "eval_rewards/margins": 0.11123905330896378,
845
- "eval_rewards/rejected": -0.19810780882835388,
846
- "eval_runtime": 247.8932,
847
- "eval_samples_per_second": 8.068,
848
  "eval_steps_per_second": 0.254,
849
- "eval_use_label": 12826.8251953125,
850
  "step": 800
851
  },
852
  {
853
  "epoch": 0.86,
854
- "grad_norm": 0.828125,
855
  "learning_rate": 2.98573068519539e-07,
856
- "logits/chosen": -2.035728931427002,
857
- "logits/rejected": -2.029679775238037,
858
- "logps/chosen": -74.97032165527344,
859
- "logps/rejected": -84.2763900756836,
860
- "loss": 0.6908,
861
- "pred_label": 3872.199951171875,
862
- "rewards/accuracies": 0.3343749940395355,
863
- "rewards/chosen": -0.1004786491394043,
864
- "rewards/margins": 0.08142165094614029,
865
- "rewards/rejected": -0.181900292634964,
866
  "step": 820,
867
- "use_label": 13121.7998046875
868
  },
869
  {
870
  "epoch": 0.88,
871
- "grad_norm": 0.6953125,
872
  "learning_rate": 2.178751501463036e-07,
873
- "logits/chosen": -2.0276803970336914,
874
- "logits/rejected": -2.0149848461151123,
875
- "logps/chosen": -66.70552062988281,
876
- "logps/rejected": -70.63726806640625,
877
- "loss": 0.6915,
878
- "pred_label": 3954.60009765625,
879
- "rewards/accuracies": 0.28437501192092896,
880
- "rewards/chosen": -0.08035041391849518,
881
- "rewards/margins": 0.07462439686059952,
882
- "rewards/rejected": -0.1549748182296753,
883
  "step": 840,
884
- "use_label": 13359.400390625
885
  },
886
  {
887
  "epoch": 0.9,
888
- "grad_norm": 0.7578125,
889
  "learning_rate": 1.4938170864468636e-07,
890
- "logits/chosen": -2.048083543777466,
891
- "logits/rejected": -2.0321922302246094,
892
- "logps/chosen": -90.8042221069336,
893
- "logps/rejected": -100.8233413696289,
894
- "loss": 0.69,
895
- "pred_label": 4041.72509765625,
896
- "rewards/accuracies": 0.40625,
897
- "rewards/chosen": -0.0809466689825058,
898
- "rewards/margins": 0.1332779824733734,
899
- "rewards/rejected": -0.2142246663570404,
900
  "step": 860,
901
- "use_label": 13592.275390625
902
  },
903
  {
904
  "epoch": 0.92,
905
- "grad_norm": 0.5546875,
906
  "learning_rate": 9.345903713082305e-08,
907
- "logits/chosen": -2.047487735748291,
908
- "logits/rejected": -2.034466505050659,
909
- "logps/chosen": -81.69231414794922,
910
- "logps/rejected": -101.5263442993164,
911
- "loss": 0.6915,
912
- "pred_label": 4142.625,
913
- "rewards/accuracies": 0.38749998807907104,
914
- "rewards/chosen": -0.09660721570253372,
915
- "rewards/margins": 0.13364934921264648,
916
- "rewards/rejected": -0.23025652766227722,
917
  "step": 880,
918
- "use_label": 13811.375
919
  },
920
  {
921
  "epoch": 0.94,
922
- "grad_norm": 0.7578125,
923
  "learning_rate": 5.0406202043228604e-08,
924
- "logits/chosen": -1.9304163455963135,
925
- "logits/rejected": -1.9657026529312134,
926
- "logps/chosen": -75.30284118652344,
927
- "logps/rejected": -99.71704864501953,
928
- "loss": 0.6914,
929
- "pred_label": 4235.9248046875,
930
  "rewards/accuracies": 0.3375000059604645,
931
- "rewards/chosen": -0.08683101832866669,
932
- "rewards/margins": 0.10066400468349457,
933
- "rewards/rejected": -0.18749502301216125,
934
  "step": 900,
935
- "use_label": 14038.0751953125
936
  },
937
  {
938
  "epoch": 0.94,
939
- "eval_logits/chosen": -1.939072847366333,
940
- "eval_logits/rejected": -1.9112603664398193,
941
- "eval_logps/chosen": -77.5274658203125,
942
- "eval_logps/rejected": -95.22908020019531,
943
- "eval_loss": 0.6917905211448669,
944
- "eval_pred_label": 4352.28564453125,
945
- "eval_rewards/accuracies": 0.3571428656578064,
946
- "eval_rewards/chosen": -0.08626923710107803,
947
- "eval_rewards/margins": 0.1120409369468689,
948
- "eval_rewards/rejected": -0.19831016659736633,
949
- "eval_runtime": 247.7794,
950
- "eval_samples_per_second": 8.072,
951
- "eval_steps_per_second": 0.254,
952
- "eval_use_label": 14335.7138671875,
953
  "step": 900
954
  },
955
  {
956
  "epoch": 0.96,
957
- "grad_norm": 0.80078125,
958
  "learning_rate": 2.0453443778310766e-08,
959
- "logits/chosen": -1.9801095724105835,
960
- "logits/rejected": -1.9714418649673462,
961
- "logps/chosen": -63.8930778503418,
962
- "logps/rejected": -85.15528869628906,
963
- "loss": 0.6906,
964
- "pred_label": 4473.8125,
965
- "rewards/accuracies": 0.31562501192092896,
966
- "rewards/chosen": -0.06585933268070221,
967
- "rewards/margins": 0.11039040982723236,
968
- "rewards/rejected": -0.17624975740909576,
969
  "step": 920,
970
- "use_label": 14624.1875
971
  },
972
  {
973
  "epoch": 0.98,
974
- "grad_norm": 0.8359375,
975
  "learning_rate": 3.760945397705828e-09,
976
- "logits/chosen": -1.9589160680770874,
977
- "logits/rejected": -1.9971154928207397,
978
- "logps/chosen": -74.0462646484375,
979
- "logps/rejected": -91.64708709716797,
980
- "loss": 0.6913,
981
- "pred_label": 4558.71240234375,
982
  "rewards/accuracies": 0.32499998807907104,
983
- "rewards/chosen": -0.0799408107995987,
984
- "rewards/margins": 0.10116855055093765,
985
- "rewards/rejected": -0.18110935389995575,
986
  "step": 940,
987
- "use_label": 14859.287109375
988
  },
989
  {
990
  "epoch": 1.0,
991
  "step": 955,
992
  "total_flos": 0.0,
993
- "train_loss": 0.6906769273168754,
994
- "train_runtime": 20027.4031,
995
- "train_samples_per_second": 3.053,
996
  "train_steps_per_second": 0.048
997
  }
998
  ],
 
29
  "epoch": 0.02,
30
  "grad_norm": 0.6796875,
31
  "learning_rate": 1.0416666666666667e-06,
32
+ "logits/chosen": -2.227864980697632,
33
+ "logits/rejected": -2.276106834411621,
34
+ "logps/chosen": -57.02927780151367,
35
+ "logps/rejected": -66.8729019165039,
36
  "loss": 0.6927,
37
  "pred_label": 0.0,
38
+ "rewards/accuracies": 0.24671052396297455,
39
+ "rewards/chosen": 0.003993770573288202,
40
+ "rewards/margins": 0.0009077258291654289,
41
+ "rewards/rejected": 0.003086044918745756,
42
  "step": 20,
43
  "use_label": 170.0
44
  },
45
  {
46
  "epoch": 0.04,
47
+ "grad_norm": 0.546875,
48
  "learning_rate": 2.0833333333333334e-06,
49
+ "logits/chosen": -2.2728817462921143,
50
+ "logits/rejected": -2.261592388153076,
51
+ "logps/chosen": -54.7827033996582,
52
+ "logps/rejected": -67.2376708984375,
53
+ "loss": 0.6915,
54
  "pred_label": 0.0,
55
  "rewards/accuracies": 0.24687500298023224,
56
+ "rewards/chosen": 0.017464743927121162,
57
+ "rewards/margins": 0.0016005486249923706,
58
+ "rewards/rejected": 0.015864195302128792,
59
  "step": 40,
60
  "use_label": 482.0
61
  },
62
  {
63
  "epoch": 0.06,
64
+ "grad_norm": 0.625,
65
  "learning_rate": 3.125e-06,
66
+ "logits/chosen": -2.321017026901245,
67
+ "logits/rejected": -2.318946123123169,
68
+ "logps/chosen": -75.58020782470703,
69
+ "logps/rejected": -87.66261291503906,
70
+ "loss": 0.6905,
71
+ "pred_label": 4.987500190734863,
72
+ "rewards/accuracies": 0.3187499940395355,
73
+ "rewards/chosen": 0.031644098460674286,
74
+ "rewards/margins": 0.009459299966692924,
75
+ "rewards/rejected": 0.02218480221927166,
76
  "step": 60,
77
+ "use_label": 797.0125122070312
78
  },
79
  {
80
  "epoch": 0.08,
81
+ "grad_norm": 0.57421875,
82
  "learning_rate": 4.166666666666667e-06,
83
+ "logits/chosen": -2.2973294258117676,
84
+ "logits/rejected": -2.2655692100524902,
85
+ "logps/chosen": -77.97566223144531,
86
+ "logps/rejected": -81.31121826171875,
87
+ "loss": 0.6909,
88
+ "pred_label": 29.850000381469727,
89
  "rewards/accuracies": 0.3343749940395355,
90
+ "rewards/chosen": 0.02917659282684326,
91
+ "rewards/margins": 0.014682939276099205,
92
+ "rewards/rejected": 0.014493651688098907,
93
  "step": 80,
94
+ "use_label": 1092.1500244140625
95
  },
96
  {
97
  "epoch": 0.1,
98
+ "grad_norm": 0.625,
99
  "learning_rate": 4.9997324926814375e-06,
100
+ "logits/chosen": -2.210732936859131,
101
+ "logits/rejected": -2.2176434993743896,
102
+ "logps/chosen": -66.4733657836914,
103
+ "logps/rejected": -74.55338287353516,
104
+ "loss": 0.6911,
105
+ "pred_label": 68.07499694824219,
106
+ "rewards/accuracies": 0.33125001192092896,
107
+ "rewards/chosen": 0.013588580302894115,
108
+ "rewards/margins": 0.026675995439291,
109
+ "rewards/rejected": -0.01308741606771946,
110
  "step": 100,
111
+ "use_label": 1373.925048828125
112
  },
113
  {
114
  "epoch": 0.1,
115
+ "eval_logits/chosen": -2.120985984802246,
116
+ "eval_logits/rejected": -2.093513250350952,
117
+ "eval_logps/chosen": -69.42622375488281,
118
+ "eval_logps/rejected": -78.9540786743164,
119
+ "eval_loss": 0.691917359828949,
120
+ "eval_pred_label": 150.14285278320312,
121
+ "eval_rewards/accuracies": 0.3392857015132904,
122
+ "eval_rewards/chosen": -0.005256766453385353,
123
+ "eval_rewards/margins": 0.030303288251161575,
124
+ "eval_rewards/rejected": -0.03556005656719208,
125
+ "eval_runtime": 247.9513,
126
+ "eval_samples_per_second": 8.066,
127
  "eval_steps_per_second": 0.254,
128
+ "eval_use_label": 1705.857177734375,
129
  "step": 100
130
  },
131
  {
132
  "epoch": 0.13,
133
+ "grad_norm": 0.53125,
134
  "learning_rate": 4.9903757462135984e-06,
135
+ "logits/chosen": -2.24790620803833,
136
+ "logits/rejected": -2.1782658100128174,
137
+ "logps/chosen": -67.23531341552734,
138
+ "logps/rejected": -80.04717254638672,
139
+ "loss": 0.6914,
140
+ "pred_label": 243.0,
141
+ "rewards/accuracies": 0.3125,
142
+ "rewards/chosen": 0.00682856747880578,
143
+ "rewards/margins": 0.03467796370387077,
144
+ "rewards/rejected": -0.02784939482808113,
145
  "step": 120,
146
+ "use_label": 2023.0
147
  },
148
  {
149
  "epoch": 0.15,
150
+ "grad_norm": 0.341796875,
151
  "learning_rate": 4.967700826904229e-06,
152
+ "logits/chosen": -2.1205825805664062,
153
+ "logits/rejected": -2.150360584259033,
154
+ "logps/chosen": -58.376564025878906,
155
+ "logps/rejected": -71.84730529785156,
156
+ "loss": 0.6917,
157
+ "pred_label": 304.9125061035156,
158
+ "rewards/accuracies": 0.2874999940395355,
159
+ "rewards/chosen": 0.0049698068760335445,
160
+ "rewards/margins": 0.025117725133895874,
161
+ "rewards/rejected": -0.02014791965484619,
162
  "step": 140,
163
+ "use_label": 2281.08740234375
164
  },
165
  {
166
  "epoch": 0.17,
167
+ "grad_norm": 0.515625,
168
  "learning_rate": 4.931828996974498e-06,
169
+ "logits/chosen": -2.1872148513793945,
170
+ "logits/rejected": -2.1555256843566895,
171
+ "logps/chosen": -66.3367919921875,
172
+ "logps/rejected": -69.24983215332031,
173
+ "loss": 0.6918,
174
+ "pred_label": 371.5375061035156,
175
+ "rewards/accuracies": 0.3218750059604645,
176
+ "rewards/chosen": -0.017877796664834023,
177
+ "rewards/margins": 0.03909943252801895,
178
+ "rewards/rejected": -0.05697723478078842,
179
  "step": 160,
180
+ "use_label": 2534.46240234375
181
  },
182
  {
183
  "epoch": 0.19,
184
+ "grad_norm": 0.4921875,
185
  "learning_rate": 4.882952093833628e-06,
186
+ "logits/chosen": -2.1010584831237793,
187
+ "logits/rejected": -2.112929582595825,
188
+ "logps/chosen": -67.18075561523438,
189
+ "logps/rejected": -77.23786163330078,
190
+ "loss": 0.6925,
191
+ "pred_label": 444.2124938964844,
192
+ "rewards/accuracies": 0.3031249940395355,
193
+ "rewards/chosen": -0.03951570764183998,
194
+ "rewards/margins": 0.03356783464550972,
195
+ "rewards/rejected": -0.0730835422873497,
196
  "step": 180,
197
+ "use_label": 2781.78759765625
198
  },
199
  {
200
  "epoch": 0.21,
201
+ "grad_norm": 0.546875,
202
  "learning_rate": 4.821331504159906e-06,
203
+ "logits/chosen": -2.181281805038452,
204
+ "logits/rejected": -2.155298948287964,
205
+ "logps/chosen": -78.88096618652344,
206
+ "logps/rejected": -77.27136993408203,
207
+ "loss": 0.692,
208
+ "pred_label": 513.2125244140625,
209
+ "rewards/accuracies": 0.3812499940395355,
210
+ "rewards/chosen": -0.019123973324894905,
211
+ "rewards/margins": 0.040298379957675934,
212
+ "rewards/rejected": -0.05942235141992569,
213
  "step": 200,
214
+ "use_label": 3032.78759765625
215
  },
216
  {
217
  "epoch": 0.21,
218
+ "eval_logits/chosen": -2.1267549991607666,
219
+ "eval_logits/rejected": -2.1057066917419434,
220
+ "eval_logps/chosen": -71.54093170166016,
221
+ "eval_logps/rejected": -82.35039520263672,
222
+ "eval_loss": 0.6926834583282471,
223
+ "eval_pred_label": 622.952392578125,
224
+ "eval_rewards/accuracies": 0.3432539701461792,
225
+ "eval_rewards/chosen": -0.026403911411762238,
226
+ "eval_rewards/margins": 0.043119337409734726,
227
+ "eval_rewards/rejected": -0.06952324509620667,
228
+ "eval_runtime": 248.2687,
229
+ "eval_samples_per_second": 8.056,
230
  "eval_steps_per_second": 0.254,
231
+ "eval_use_label": 3337.047607421875,
232
  "step": 200
233
  },
234
  {
235
  "epoch": 0.23,
236
+ "grad_norm": 0.609375,
237
  "learning_rate": 4.747296766042161e-06,
238
+ "logits/chosen": -2.2548727989196777,
239
+ "logits/rejected": -2.2427258491516113,
240
+ "logps/chosen": -74.4991683959961,
241
+ "logps/rejected": -75.8321762084961,
242
+ "loss": 0.6924,
243
+ "pred_label": 738.5,
244
+ "rewards/accuracies": 0.3531250059604645,
245
+ "rewards/chosen": -0.024670986458659172,
246
+ "rewards/margins": 0.04779377579689026,
247
+ "rewards/rejected": -0.07246476411819458,
248
  "step": 220,
249
+ "use_label": 3631.5
250
  },
251
  {
252
  "epoch": 0.25,
253
+ "grad_norm": 0.46875,
254
  "learning_rate": 4.661243806657256e-06,
255
+ "logits/chosen": -2.2358717918395996,
256
+ "logits/rejected": -2.216477870941162,
257
+ "logps/chosen": -72.57451629638672,
258
+ "logps/rejected": -79.20014953613281,
259
+ "loss": 0.6921,
260
+ "pred_label": 830.7750244140625,
261
+ "rewards/accuracies": 0.3499999940395355,
262
+ "rewards/chosen": -0.013481785543262959,
263
+ "rewards/margins": 0.0440317802131176,
264
+ "rewards/rejected": -0.05751357227563858,
265
  "step": 240,
266
+ "use_label": 3859.22509765625
267
  },
268
  {
269
  "epoch": 0.27,
270
+ "grad_norm": 0.5390625,
271
  "learning_rate": 4.563632824908252e-06,
272
+ "logits/chosen": -2.204738140106201,
273
+ "logits/rejected": -2.2045350074768066,
274
+ "logps/chosen": -64.52825164794922,
275
+ "logps/rejected": -74.71345520019531,
276
+ "loss": 0.6919,
277
+ "pred_label": 912.1624755859375,
278
+ "rewards/accuracies": 0.3125,
279
+ "rewards/chosen": -0.01493888907134533,
280
+ "rewards/margins": 0.038629818707704544,
281
+ "rewards/rejected": -0.05356870964169502,
282
  "step": 260,
283
+ "use_label": 4097.83740234375
284
  },
285
  {
286
  "epoch": 0.29,
287
+ "grad_norm": 0.431640625,
288
  "learning_rate": 4.454985830346574e-06,
289
+ "logits/chosen": -2.224844455718994,
290
+ "logits/rejected": -2.247999668121338,
291
+ "logps/chosen": -72.3452377319336,
292
+ "logps/rejected": -75.01800537109375,
293
+ "loss": 0.6916,
294
+ "pred_label": 993.7874755859375,
295
+ "rewards/accuracies": 0.3125,
296
+ "rewards/chosen": -0.04014473780989647,
297
+ "rewards/margins": 0.030534306541085243,
298
+ "rewards/rejected": -0.07067903876304626,
299
  "step": 280,
300
+ "use_label": 4336.21240234375
301
  },
302
  {
303
  "epoch": 0.31,
304
+ "grad_norm": 0.423828125,
305
  "learning_rate": 4.335883851539693e-06,
306
+ "logits/chosen": -2.2155380249023438,
307
+ "logits/rejected": -2.2151846885681152,
308
+ "logps/chosen": -67.15587615966797,
309
+ "logps/rejected": -74.2086181640625,
310
+ "loss": 0.6924,
311
+ "pred_label": 1083.4625244140625,
312
+ "rewards/accuracies": 0.34687501192092896,
313
+ "rewards/chosen": -0.020541679114103317,
314
+ "rewards/margins": 0.06299655884504318,
315
+ "rewards/rejected": -0.0835382491350174,
316
  "step": 300,
317
+ "use_label": 4566.53759765625
318
  },
319
  {
320
  "epoch": 0.31,
321
+ "eval_logits/chosen": -2.2169294357299805,
322
+ "eval_logits/rejected": -2.1932876110076904,
323
+ "eval_logps/chosen": -72.5876693725586,
324
+ "eval_logps/rejected": -84.35366821289062,
325
+ "eval_loss": 0.6928625702857971,
326
+ "eval_pred_label": 1200.2698974609375,
327
+ "eval_rewards/accuracies": 0.3392857015132904,
328
+ "eval_rewards/chosen": -0.03687124699354172,
329
+ "eval_rewards/margins": 0.0526847243309021,
330
+ "eval_rewards/rejected": -0.08955597132444382,
331
+ "eval_runtime": 247.9119,
332
+ "eval_samples_per_second": 8.067,
333
  "eval_steps_per_second": 0.254,
334
+ "eval_use_label": 4863.72998046875,
335
  "step": 300
336
  },
337
  {
338
  "epoch": 0.33,
339
+ "grad_norm": 0.61328125,
340
  "learning_rate": 4.206963828813555e-06,
341
+ "logits/chosen": -2.291391134262085,
342
+ "logits/rejected": -2.3002986907958984,
343
+ "logps/chosen": -68.5405502319336,
344
+ "logps/rejected": -83.0180435180664,
345
+ "loss": 0.6927,
346
+ "pred_label": 1323.074951171875,
347
+ "rewards/accuracies": 0.30000001192092896,
348
+ "rewards/chosen": -0.07074997574090958,
349
+ "rewards/margins": 0.04067195579409599,
350
+ "rewards/rejected": -0.11142192780971527,
351
  "step": 320,
352
+ "use_label": 5150.9248046875
353
  },
354
  {
355
  "epoch": 0.36,
356
+ "grad_norm": 0.455078125,
357
  "learning_rate": 4.068915207986931e-06,
358
+ "logits/chosen": -2.2867865562438965,
359
+ "logits/rejected": -2.2617173194885254,
360
+ "logps/chosen": -64.90373229980469,
361
+ "logps/rejected": -74.42888641357422,
362
+ "loss": 0.692,
363
+ "pred_label": 1427.7750244140625,
364
+ "rewards/accuracies": 0.3531250059604645,
365
+ "rewards/chosen": -0.016644436866044998,
366
+ "rewards/margins": 0.052551619708538055,
367
+ "rewards/rejected": -0.06919606029987335,
368
  "step": 340,
369
+ "use_label": 5366.22509765625
370
  },
371
  {
372
  "epoch": 0.38,
373
+ "grad_norm": 0.458984375,
374
  "learning_rate": 3.922476253313921e-06,
375
+ "logits/chosen": -2.249298572540283,
376
+ "logits/rejected": -2.253566265106201,
377
+ "logps/chosen": -68.57295989990234,
378
+ "logps/rejected": -73.1113510131836,
379
+ "loss": 0.693,
380
+ "pred_label": 1522.0999755859375,
381
+ "rewards/accuracies": 0.328125,
382
+ "rewards/chosen": -0.037180084735155106,
383
+ "rewards/margins": 0.045733559876680374,
384
+ "rewards/rejected": -0.08291363716125488,
385
  "step": 360,
386
+ "use_label": 5591.89990234375
387
  },
388
  {
389
  "epoch": 0.4,
390
+ "grad_norm": 0.4453125,
391
  "learning_rate": 3.768430099352445e-06,
392
+ "logits/chosen": -2.2458603382110596,
393
+ "logits/rejected": -2.2051453590393066,
394
+ "logps/chosen": -70.38607788085938,
395
+ "logps/rejected": -78.15666198730469,
396
+ "loss": 0.6923,
397
+ "pred_label": 1625.5374755859375,
398
+ "rewards/accuracies": 0.3531250059604645,
399
+ "rewards/chosen": -0.03562153875827789,
400
+ "rewards/margins": 0.054723359644412994,
401
+ "rewards/rejected": -0.09034489840269089,
402
  "step": 380,
403
+ "use_label": 5808.46240234375
404
  },
405
  {
406
  "epoch": 0.42,
407
+ "grad_norm": 0.59765625,
408
  "learning_rate": 3.607600562872785e-06,
409
+ "logits/chosen": -2.196977138519287,
410
+ "logits/rejected": -2.197218656539917,
411
+ "logps/chosen": -81.0395736694336,
412
+ "logps/rejected": -81.44091033935547,
413
+ "loss": 0.6927,
414
+ "pred_label": 1725.362548828125,
415
+ "rewards/accuracies": 0.32499998807907104,
416
+ "rewards/chosen": -0.03092697635293007,
417
+ "rewards/margins": 0.049932099878787994,
418
+ "rewards/rejected": -0.08085907250642776,
419
  "step": 400,
420
+ "use_label": 6028.6376953125
421
  },
422
  {
423
  "epoch": 0.42,
424
+ "eval_logits/chosen": -2.118962526321411,
425
+ "eval_logits/rejected": -2.093430995941162,
426
+ "eval_logps/chosen": -71.01036071777344,
427
+ "eval_logps/rejected": -83.43638610839844,
428
+ "eval_loss": 0.6925376653671265,
429
+ "eval_pred_label": 1843.920654296875,
430
+ "eval_rewards/accuracies": 0.341269850730896,
431
+ "eval_rewards/chosen": -0.021098149940371513,
432
+ "eval_rewards/margins": 0.05928494408726692,
433
+ "eval_rewards/rejected": -0.08038310706615448,
434
+ "eval_runtime": 248.0095,
435
  "eval_samples_per_second": 8.064,
436
  "eval_steps_per_second": 0.254,
437
+ "eval_use_label": 6324.07958984375,
438
  "step": 400
439
  },
440
  {
441
  "epoch": 0.44,
442
+ "grad_norm": 0.30078125,
443
  "learning_rate": 3.4408477372034743e-06,
444
+ "logits/chosen": -2.146075487136841,
445
+ "logits/rejected": -2.152238607406616,
446
+ "logps/chosen": -65.8438720703125,
447
+ "logps/rejected": -70.74162292480469,
448
+ "loss": 0.692,
449
+ "pred_label": 1975.637451171875,
450
+ "rewards/accuracies": 0.3531250059604645,
451
+ "rewards/chosen": -0.017682421952486038,
452
+ "rewards/margins": 0.05984373763203621,
453
+ "rewards/rejected": -0.07752615213394165,
454
  "step": 420,
455
+ "use_label": 6602.3623046875
456
  },
457
  {
458
  "epoch": 0.46,
459
+ "grad_norm": 0.9296875,
460
  "learning_rate": 3.269063392575352e-06,
461
+ "logits/chosen": -2.2523856163024902,
462
+ "logits/rejected": -2.2490224838256836,
463
+ "logps/chosen": -74.74308013916016,
464
+ "logps/rejected": -74.57176208496094,
465
+ "loss": 0.6927,
466
+ "pred_label": 2072.27490234375,
467
+ "rewards/accuracies": 0.3218750059604645,
468
+ "rewards/chosen": -0.027858540415763855,
469
+ "rewards/margins": 0.05976608395576477,
470
+ "rewards/rejected": -0.08762462437152863,
471
  "step": 440,
472
+ "use_label": 6825.72509765625
473
  },
474
  {
475
  "epoch": 0.48,
476
+ "grad_norm": 0.34375,
477
  "learning_rate": 3.09316620706208e-06,
478
+ "logits/chosen": -2.2484962940216064,
479
+ "logits/rejected": -2.253873109817505,
480
+ "logps/chosen": -68.02134704589844,
481
+ "logps/rejected": -73.40286254882812,
482
+ "loss": 0.6929,
483
+ "pred_label": 2175.53759765625,
484
+ "rewards/accuracies": 0.328125,
485
+ "rewards/chosen": -0.03111925721168518,
486
+ "rewards/margins": 0.06376632302999496,
487
+ "rewards/rejected": -0.09488557279109955,
488
  "step": 460,
489
+ "use_label": 7042.46240234375
490
  },
491
  {
492
  "epoch": 0.5,
493
+ "grad_norm": 0.3984375,
494
  "learning_rate": 2.91409685362137e-06,
495
+ "logits/chosen": -2.2359812259674072,
496
+ "logits/rejected": -2.2330563068389893,
497
+ "logps/chosen": -75.03883361816406,
498
+ "logps/rejected": -84.55928039550781,
499
+ "loss": 0.6922,
500
+ "pred_label": 2276.949951171875,
501
+ "rewards/accuracies": 0.3687500059604645,
502
+ "rewards/chosen": -0.040116917341947556,
503
+ "rewards/margins": 0.0741645023226738,
504
+ "rewards/rejected": -0.11428143084049225,
505
  "step": 480,
506
+ "use_label": 7261.0498046875
507
  },
508
  {
509
  "epoch": 0.52,
510
+ "grad_norm": 0.5703125,
511
  "learning_rate": 2.7328129695107205e-06,
512
+ "logits/chosen": -2.2053210735321045,
513
+ "logits/rejected": -2.2094616889953613,
514
+ "logps/chosen": -75.30181121826172,
515
+ "logps/rejected": -77.61902618408203,
516
+ "loss": 0.6924,
517
+ "pred_label": 2379.137451171875,
518
+ "rewards/accuracies": 0.3656249940395355,
519
+ "rewards/chosen": -0.039206866174936295,
520
+ "rewards/margins": 0.05418051406741142,
521
+ "rewards/rejected": -0.09338738024234772,
522
  "step": 500,
523
+ "use_label": 7478.8623046875
524
  },
525
  {
526
  "epoch": 0.52,
527
+ "eval_logits/chosen": -2.176236152648926,
528
+ "eval_logits/rejected": -2.151799201965332,
529
+ "eval_logps/chosen": -70.96183776855469,
530
+ "eval_logps/rejected": -83.7112045288086,
531
+ "eval_loss": 0.6929337382316589,
532
+ "eval_pred_label": 2499.22216796875,
533
+ "eval_rewards/accuracies": 0.3432539701461792,
534
+ "eval_rewards/chosen": -0.02061287872493267,
535
+ "eval_rewards/margins": 0.06251849234104156,
536
+ "eval_rewards/rejected": -0.08313137292861938,
537
+ "eval_runtime": 248.0888,
538
+ "eval_samples_per_second": 8.062,
539
  "eval_steps_per_second": 0.254,
540
+ "eval_use_label": 7772.77783203125,
541
  "step": 500
542
  },
543
  {
544
  "epoch": 0.54,
545
+ "grad_norm": 0.61328125,
546
  "learning_rate": 2.5502840349805074e-06,
547
+ "logits/chosen": -2.195094347000122,
548
+ "logits/rejected": -2.237112045288086,
549
+ "logps/chosen": -70.13484954833984,
550
+ "logps/rejected": -79.53434753417969,
551
+ "loss": 0.692,
552
+ "pred_label": 2632.125,
553
+ "rewards/accuracies": 0.3656249940395355,
554
+ "rewards/chosen": -0.018158430233597755,
555
+ "rewards/margins": 0.061979226768016815,
556
+ "rewards/rejected": -0.08013766258955002,
557
  "step": 520,
558
+ "use_label": 8049.875
559
  },
560
  {
561
  "epoch": 0.57,
562
+ "grad_norm": 0.55078125,
563
  "learning_rate": 2.367486188632446e-06,
564
+ "logits/chosen": -2.1844329833984375,
565
+ "logits/rejected": -2.1980721950531006,
566
+ "logps/chosen": -78.40437316894531,
567
+ "logps/rejected": -80.49110412597656,
568
+ "loss": 0.6925,
569
+ "pred_label": 2729.66259765625,
570
+ "rewards/accuracies": 0.3656249940395355,
571
+ "rewards/chosen": -0.028946753591299057,
572
+ "rewards/margins": 0.0717843621969223,
573
+ "rewards/rejected": -0.10073111951351166,
574
  "step": 540,
575
+ "use_label": 8272.337890625
576
  },
577
  {
578
  "epoch": 0.59,
579
+ "grad_norm": 0.48828125,
580
  "learning_rate": 2.1853970071701415e-06,
581
+ "logits/chosen": -2.19417667388916,
582
+ "logits/rejected": -2.1900599002838135,
583
+ "logps/chosen": -73.69783020019531,
584
+ "logps/rejected": -72.62937927246094,
585
+ "loss": 0.6926,
586
+ "pred_label": 2827.875,
587
  "rewards/accuracies": 0.2874999940395355,
588
+ "rewards/chosen": -0.04889845848083496,
589
+ "rewards/margins": 0.04425561800599098,
590
+ "rewards/rejected": -0.09315408021211624,
591
  "step": 560,
592
+ "use_label": 8494.125
593
  },
594
  {
595
  "epoch": 0.61,
596
+ "grad_norm": 0.328125,
597
  "learning_rate": 2.00499027745888e-06,
598
+ "logits/chosen": -2.224670171737671,
599
+ "logits/rejected": -2.230435371398926,
600
+ "logps/chosen": -76.27436065673828,
601
+ "logps/rejected": -87.6956787109375,
602
+ "loss": 0.6922,
603
+ "pred_label": 2926.862548828125,
604
+ "rewards/accuracies": 0.3687500059604645,
605
+ "rewards/chosen": -0.055185507982969284,
606
+ "rewards/margins": 0.05776001885533333,
607
+ "rewards/rejected": -0.11294553428888321,
608
  "step": 580,
609
+ "use_label": 8715.1376953125
610
  },
611
  {
612
  "epoch": 0.63,
613
+ "grad_norm": 0.5546875,
614
  "learning_rate": 1.8272307888529276e-06,
615
+ "logits/chosen": -2.231316089630127,
616
+ "logits/rejected": -2.258852481842041,
617
+ "logps/chosen": -84.25640106201172,
618
+ "logps/rejected": -99.73040771484375,
619
+ "loss": 0.6929,
620
+ "pred_label": 3042.83740234375,
621
+ "rewards/accuracies": 0.40312498807907104,
622
+ "rewards/chosen": -0.07248945534229279,
623
+ "rewards/margins": 0.06686891615390778,
624
+ "rewards/rejected": -0.13935837149620056,
625
  "step": 600,
626
+ "use_label": 8919.162109375
627
  },
628
  {
629
  "epoch": 0.63,
630
+ "eval_logits/chosen": -2.1407980918884277,
631
+ "eval_logits/rejected": -2.1125032901763916,
632
+ "eval_logps/chosen": -73.41705322265625,
633
+ "eval_logps/rejected": -86.9944839477539,
634
+ "eval_loss": 0.6927017569541931,
635
+ "eval_pred_label": 3177.142822265625,
636
+ "eval_rewards/accuracies": 0.3511904776096344,
637
+ "eval_rewards/chosen": -0.04516514018177986,
638
+ "eval_rewards/margins": 0.07079902291297913,
639
+ "eval_rewards/rejected": -0.11596415936946869,
640
+ "eval_runtime": 248.1359,
641
+ "eval_samples_per_second": 8.06,
642
  "eval_steps_per_second": 0.254,
643
+ "eval_use_label": 9198.857421875,
644
  "step": 600
645
  },
646
  {
647
  "epoch": 0.65,
648
+ "grad_norm": 0.404296875,
649
  "learning_rate": 1.6530691736402317e-06,
650
+ "logits/chosen": -2.1386609077453613,
651
+ "logits/rejected": -2.1743404865264893,
652
+ "logps/chosen": -65.55394744873047,
653
+ "logps/rejected": -88.32081604003906,
654
+ "loss": 0.6924,
655
+ "pred_label": 3318.58740234375,
656
+ "rewards/accuracies": 0.34375,
657
+ "rewards/chosen": -0.05246468633413315,
658
+ "rewards/margins": 0.059620797634124756,
659
+ "rewards/rejected": -0.1120854839682579,
660
  "step": 620,
661
+ "use_label": 9467.412109375
662
  },
663
  {
664
  "epoch": 0.67,
665
+ "grad_norm": 0.51171875,
666
  "learning_rate": 1.4834368231970922e-06,
667
+ "logits/chosen": -2.1956310272216797,
668
+ "logits/rejected": -2.2024998664855957,
669
+ "logps/chosen": -77.41986846923828,
670
+ "logps/rejected": -82.58815002441406,
671
+ "loss": 0.692,
672
+ "pred_label": 3414.199951171875,
673
+ "rewards/accuracies": 0.3656249940395355,
674
+ "rewards/chosen": -0.05061299726366997,
675
+ "rewards/margins": 0.07674984633922577,
676
+ "rewards/rejected": -0.12736284732818604,
677
  "step": 640,
678
+ "use_label": 9691.7998046875
679
  },
680
  {
681
  "epoch": 0.69,
682
+ "grad_norm": 0.2890625,
683
  "learning_rate": 1.3192409070404582e-06,
684
+ "logits/chosen": -2.1827545166015625,
685
+ "logits/rejected": -2.1392319202423096,
686
+ "logps/chosen": -71.07948303222656,
687
+ "logps/rejected": -78.78751373291016,
688
+ "loss": 0.6924,
689
+ "pred_label": 3519.35009765625,
690
+ "rewards/accuracies": 0.3375000059604645,
691
+ "rewards/chosen": -0.0542152114212513,
692
+ "rewards/margins": 0.06142013147473335,
693
+ "rewards/rejected": -0.11563535034656525,
694
  "step": 660,
695
+ "use_label": 9906.650390625
696
  },
697
  {
698
  "epoch": 0.71,
699
+ "grad_norm": 0.435546875,
700
  "learning_rate": 1.1613595214152713e-06,
701
+ "logits/chosen": -2.2185826301574707,
702
+ "logits/rejected": -2.2344555854797363,
703
+ "logps/chosen": -81.96281433105469,
704
+ "logps/rejected": -87.13890838623047,
705
+ "loss": 0.6923,
706
+ "pred_label": 3610.012451171875,
707
+ "rewards/accuracies": 0.34375,
708
+ "rewards/chosen": -0.06083091348409653,
709
+ "rewards/margins": 0.0633452981710434,
710
+ "rewards/rejected": -0.12417621910572052,
711
  "step": 680,
712
+ "use_label": 10135.9873046875
713
  },
714
  {
715
  "epoch": 0.73,
716
+ "grad_norm": 0.5078125,
717
  "learning_rate": 1.0106369933615043e-06,
718
+ "logits/chosen": -2.2393274307250977,
719
+ "logits/rejected": -2.2085208892822266,
720
+ "logps/chosen": -90.31179809570312,
721
+ "logps/rejected": -96.00973510742188,
722
+ "loss": 0.6928,
723
+ "pred_label": 3716.97509765625,
724
+ "rewards/accuracies": 0.3843750059604645,
725
+ "rewards/chosen": -0.06285654008388519,
726
+ "rewards/margins": 0.07485760748386383,
727
+ "rewards/rejected": -0.13771414756774902,
728
  "step": 700,
729
+ "use_label": 10349.025390625
730
  },
731
  {
732
  "epoch": 0.73,
733
+ "eval_logits/chosen": -2.1372170448303223,
734
+ "eval_logits/rejected": -2.1086459159851074,
735
+ "eval_logps/chosen": -73.96572875976562,
736
+ "eval_logps/rejected": -87.70773315429688,
737
+ "eval_loss": 0.6929500102996826,
738
+ "eval_pred_label": 3852.730224609375,
739
  "eval_rewards/accuracies": 0.3511904776096344,
740
+ "eval_rewards/chosen": -0.05065184459090233,
741
+ "eval_rewards/margins": 0.07244490087032318,
742
+ "eval_rewards/rejected": -0.12309674173593521,
743
+ "eval_runtime": 248.0038,
744
+ "eval_samples_per_second": 8.064,
745
  "eval_steps_per_second": 0.254,
746
+ "eval_use_label": 10627.26953125,
747
  "step": 700
748
  },
749
  {
750
  "epoch": 0.75,
751
+ "grad_norm": 0.55078125,
752
  "learning_rate": 8.678793653740633e-07,
753
+ "logits/chosen": -2.1876041889190674,
754
+ "logits/rejected": -2.1966712474823,
755
+ "logps/chosen": -64.94602966308594,
756
+ "logps/rejected": -77.46949005126953,
757
+ "loss": 0.6927,
758
+ "pred_label": 3992.16259765625,
759
+ "rewards/accuracies": 0.31562501192092896,
760
+ "rewards/chosen": -0.04975567013025284,
761
+ "rewards/margins": 0.06240048259496689,
762
+ "rewards/rejected": -0.11215615272521973,
763
  "step": 720,
764
+ "use_label": 10897.837890625
765
  },
766
  {
767
  "epoch": 0.77,
768
+ "grad_norm": 0.416015625,
769
  "learning_rate": 7.338500848029603e-07,
770
+ "logits/chosen": -2.194794178009033,
771
+ "logits/rejected": -2.2083091735839844,
772
+ "logps/chosen": -69.16300201416016,
773
+ "logps/rejected": -74.87442779541016,
774
+ "loss": 0.6927,
775
+ "pred_label": 4088.0625,
776
+ "rewards/accuracies": 0.3187499940395355,
777
+ "rewards/chosen": -0.03673207014799118,
778
+ "rewards/margins": 0.07390830665826797,
779
+ "rewards/rejected": -0.11064038425683975,
780
  "step": 740,
781
+ "use_label": 11121.9375
782
  },
783
  {
784
  "epoch": 0.8,
785
+ "grad_norm": 0.47265625,
786
  "learning_rate": 6.092659210462232e-07,
787
+ "logits/chosen": -2.2297511100769043,
788
+ "logits/rejected": -2.232818841934204,
789
+ "logps/chosen": -70.27059173583984,
790
+ "logps/rejected": -88.61542510986328,
791
+ "loss": 0.6927,
792
+ "pred_label": 4190.375,
793
  "rewards/accuracies": 0.33125001192092896,
794
+ "rewards/chosen": -0.04521505907177925,
795
+ "rewards/margins": 0.05956338718533516,
796
+ "rewards/rejected": -0.10477845370769501,
797
  "step": 760,
798
+ "use_label": 11339.625
799
  },
800
  {
801
  "epoch": 0.82,
802
+ "grad_norm": 0.515625,
803
  "learning_rate": 4.947931323697983e-07,
804
+ "logits/chosen": -2.224112033843994,
805
+ "logits/rejected": -2.241053581237793,
806
+ "logps/chosen": -82.8070068359375,
807
+ "logps/rejected": -85.62196350097656,
808
+ "loss": 0.6927,
809
+ "pred_label": 4299.97509765625,
810
  "rewards/accuracies": 0.375,
811
+ "rewards/chosen": -0.04633576422929764,
812
+ "rewards/margins": 0.0844966396689415,
813
+ "rewards/rejected": -0.13083240389823914,
814
  "step": 780,
815
+ "use_label": 11550.025390625
816
  },
817
  {
818
  "epoch": 0.84,
819
+ "grad_norm": 0.498046875,
820
  "learning_rate": 3.910439028537638e-07,
821
+ "logits/chosen": -2.201280117034912,
822
+ "logits/rejected": -2.177452325820923,
823
+ "logps/chosen": -65.0578842163086,
824
+ "logps/rejected": -66.19197082519531,
825
+ "loss": 0.6927,
826
+ "pred_label": 4407.78759765625,
827
+ "rewards/accuracies": 0.3499999940395355,
828
+ "rewards/chosen": -0.026169428601861,
829
+ "rewards/margins": 0.06455135345458984,
830
+ "rewards/rejected": -0.0907207801938057,
831
  "step": 800,
832
+ "use_label": 11762.212890625
833
  },
834
  {
835
  "epoch": 0.84,
836
+ "eval_logits/chosen": -2.1430623531341553,
837
+ "eval_logits/rejected": -2.1141114234924316,
838
+ "eval_logps/chosen": -71.62469482421875,
839
+ "eval_logps/rejected": -85.3831787109375,
840
+ "eval_loss": 0.6928467750549316,
841
+ "eval_pred_label": 4538.47607421875,
842
+ "eval_rewards/accuracies": 0.3551587164402008,
843
+ "eval_rewards/chosen": -0.027241550385951996,
844
+ "eval_rewards/margins": 0.072609543800354,
845
+ "eval_rewards/rejected": -0.099851094186306,
846
+ "eval_runtime": 247.951,
847
+ "eval_samples_per_second": 8.066,
848
  "eval_steps_per_second": 0.254,
849
+ "eval_use_label": 12045.5234375,
850
  "step": 800
851
  },
852
  {
853
  "epoch": 0.86,
854
+ "grad_norm": 0.48046875,
855
  "learning_rate": 2.98573068519539e-07,
856
+ "logits/chosen": -2.228102684020996,
857
+ "logits/rejected": -2.2112691402435303,
858
+ "logps/chosen": -68.63658142089844,
859
+ "logps/rejected": -75.33064270019531,
860
+ "loss": 0.6923,
861
+ "pred_label": 4678.53759765625,
862
+ "rewards/accuracies": 0.3218750059604645,
863
+ "rewards/chosen": -0.03714119642972946,
864
+ "rewards/margins": 0.05530167371034622,
865
+ "rewards/rejected": -0.09244287014007568,
866
  "step": 820,
867
+ "use_label": 12315.462890625
868
  },
869
  {
870
  "epoch": 0.88,
871
+ "grad_norm": 0.56640625,
872
  "learning_rate": 2.178751501463036e-07,
873
+ "logits/chosen": -2.204557418823242,
874
+ "logits/rejected": -2.2018847465515137,
875
+ "logps/chosen": -61.4800910949707,
876
+ "logps/rejected": -63.1760139465332,
877
+ "loss": 0.6929,
878
+ "pred_label": 4777.375,
879
+ "rewards/accuracies": 0.29374998807907104,
880
+ "rewards/chosen": -0.02809613011777401,
881
+ "rewards/margins": 0.05226613208651543,
882
+ "rewards/rejected": -0.08036227524280548,
883
  "step": 840,
884
+ "use_label": 12536.625
885
  },
886
  {
887
  "epoch": 0.9,
888
+ "grad_norm": 0.8515625,
889
  "learning_rate": 1.4938170864468636e-07,
890
+ "logits/chosen": -2.252244234085083,
891
+ "logits/rejected": -2.242299795150757,
892
+ "logps/chosen": -84.9459228515625,
893
+ "logps/rejected": -90.69441223144531,
894
+ "loss": 0.6922,
895
+ "pred_label": 4874.3251953125,
896
+ "rewards/accuracies": 0.4124999940395355,
897
+ "rewards/chosen": -0.022363774478435516,
898
+ "rewards/margins": 0.09057153016328812,
899
+ "rewards/rejected": -0.11293530464172363,
900
  "step": 860,
901
+ "use_label": 12759.6748046875
902
  },
903
  {
904
  "epoch": 0.92,
905
+ "grad_norm": 0.4296875,
906
  "learning_rate": 9.345903713082305e-08,
907
+ "logits/chosen": -2.2364704608917236,
908
+ "logits/rejected": -2.224773406982422,
909
+ "logps/chosen": -75.7426528930664,
910
+ "logps/rejected": -91.20499420166016,
911
+ "loss": 0.6925,
912
+ "pred_label": 4988.4873046875,
913
+ "rewards/accuracies": 0.3843750059604645,
914
+ "rewards/chosen": -0.03711060434579849,
915
+ "rewards/margins": 0.08993253856897354,
916
+ "rewards/rejected": -0.12704312801361084,
917
  "step": 880,
918
+ "use_label": 12965.5126953125
919
  },
920
  {
921
  "epoch": 0.94,
922
+ "grad_norm": 0.6015625,
923
  "learning_rate": 5.0406202043228604e-08,
924
+ "logits/chosen": -2.121796131134033,
925
+ "logits/rejected": -2.15610671043396,
926
+ "logps/chosen": -69.87088775634766,
927
+ "logps/rejected": -90.85367584228516,
928
+ "loss": 0.6929,
929
+ "pred_label": 5089.85009765625,
930
  "rewards/accuracies": 0.3375000059604645,
931
+ "rewards/chosen": -0.032511431723833084,
932
+ "rewards/margins": 0.06634987145662308,
933
+ "rewards/rejected": -0.09886129945516586,
934
  "step": 900,
935
+ "use_label": 13184.150390625
936
  },
937
  {
938
  "epoch": 0.94,
939
+ "eval_logits/chosen": -2.1427581310272217,
940
+ "eval_logits/rejected": -2.113929510116577,
941
+ "eval_logps/chosen": -71.7841567993164,
942
+ "eval_logps/rejected": -85.5160140991211,
943
+ "eval_loss": 0.6928035020828247,
944
+ "eval_pred_label": 5226.619140625,
945
+ "eval_rewards/accuracies": 0.3492063581943512,
946
+ "eval_rewards/chosen": -0.02883605659008026,
947
+ "eval_rewards/margins": 0.0723433569073677,
948
+ "eval_rewards/rejected": -0.10117942094802856,
949
+ "eval_runtime": 246.4796,
950
+ "eval_samples_per_second": 8.114,
951
+ "eval_steps_per_second": 0.256,
952
+ "eval_use_label": 13461.380859375,
953
  "step": 900
954
  },
955
  {
956
  "epoch": 0.96,
957
+ "grad_norm": 0.52734375,
958
  "learning_rate": 2.0453443778310766e-08,
959
+ "logits/chosen": -2.1679275035858154,
960
+ "logits/rejected": -2.1737468242645264,
961
+ "logps/chosen": -59.419395446777344,
962
+ "logps/rejected": -76.71382141113281,
963
+ "loss": 0.6925,
964
+ "pred_label": 5365.3876953125,
965
+ "rewards/accuracies": 0.3187499940395355,
966
+ "rewards/chosen": -0.021122563630342484,
967
+ "rewards/margins": 0.0707126036286354,
968
+ "rewards/rejected": -0.09183517098426819,
969
  "step": 920,
970
+ "use_label": 13732.6123046875
971
  },
972
  {
973
  "epoch": 0.98,
974
+ "grad_norm": 0.64453125,
975
  "learning_rate": 3.760945397705828e-09,
976
+ "logits/chosen": -2.1522116661071777,
977
+ "logits/rejected": -2.1893556118011475,
978
+ "logps/chosen": -68.75323486328125,
979
+ "logps/rejected": -82.70423889160156,
980
+ "loss": 0.6926,
981
+ "pred_label": 5459.0751953125,
982
  "rewards/accuracies": 0.32499998807907104,
983
+ "rewards/chosen": -0.02701050415635109,
984
+ "rewards/margins": 0.06467042118310928,
985
+ "rewards/rejected": -0.09168092906475067,
986
  "step": 940,
987
+ "use_label": 13958.9248046875
988
  },
989
  {
990
  "epoch": 1.0,
991
  "step": 955,
992
  "total_flos": 0.0,
993
+ "train_loss": 0.692275420283772,
994
+ "train_runtime": 20019.5915,
995
+ "train_samples_per_second": 3.054,
996
  "train_steps_per_second": 0.048
997
  }
998
  ],