RikkiXu committed
Commit 93e30fd
1 Parent(s): 4550791

Model save

README.md CHANGED
@@ -2,15 +2,9 @@
 license: apache-2.0
 base_model: alignment-handbook/zephyr-7b-sft-full
 tags:
-- alignment-handbook
 - trl
 - dpo
 - generated_from_trainer
-- trl
-- dpo
-- generated_from_trainer
-datasets:
-- HuggingFaceH4/ultrafeedback_binarized
 model-index:
 - name: zephyr-7b-dpo-full
   results: []
@@ -21,17 +15,17 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-dpo-full
 
-This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
+This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2130
-- Rewards/chosen: -2.1752
-- Rewards/rejected: -6.3268
-- Rewards/accuracies: 0.8984
-- Rewards/margins: 4.1516
-- Logps/rejected: -812.6320
-- Logps/chosen: -474.9558
-- Logits/rejected: 1.0727
-- Logits/chosen: -0.3077
+- Loss: 0.2103
+- Rewards/chosen: 4.5125
+- Rewards/rejected: -5.2808
+- Rewards/accuracies: 0.9258
+- Rewards/margins: 9.7933
+- Logps/rejected: -190.5157
+- Logps/chosen: -248.4084
+- Logits/rejected: -2.4965
+- Logits/chosen: -2.5123
 
 ## Model description
 
@@ -68,10 +62,10 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
-| 0.3617 | 0.21 | 100 | 0.3445 | -1.1381 | -2.9522 | 0.8633 | 1.8140 | -475.1705 | -371.2483 | -1.3192 | -1.5665 |
-| 0.2941 | 0.42 | 200 | 0.2595 | -1.5303 | -4.5965 | 0.8711 | 3.0662 | -639.6045 | -410.4631 | -0.2909 | -1.0255 |
-| 0.259 | 0.63 | 300 | 0.2187 | -2.2257 | -6.1115 | 0.8945 | 3.8858 | -791.1059 | -480.0016 | 1.2573 | -0.0803 |
-| 0.2268 | 0.84 | 400 | 0.2144 | -2.2032 | -6.3258 | 0.8984 | 4.1226 | -812.5331 | -477.7561 | 1.2277 | -0.1965 |
+| 0.3315 | 0.21 | 100 | 0.2975 | 3.4373 | -3.8279 | 0.9023 | 7.2653 | -187.6101 | -250.5587 | -2.5158 | -2.5301 |
+| 0.2909 | 0.42 | 200 | 0.2754 | 4.8618 | -4.1200 | 0.9180 | 8.9818 | -188.1942 | -247.7097 | -2.5310 | -2.5459 |
+| 0.6445 | 0.63 | 300 | 0.2245 | 4.2165 | -5.3713 | 0.9102 | 9.5878 | -190.6968 | -249.0004 | -2.4915 | -2.5059 |
+| 0.2653 | 0.84 | 400 | 0.2103 | 4.5125 | -5.2808 | 0.9258 | 9.7933 | -190.5157 | -248.4084 | -2.4965 | -2.5123 |
 
 
 ### Framework versions
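
The card above describes a DPO fine-tune of the Zephyr SFT base. A minimal loading sketch with `transformers` is given below; the repo id `RikkiXu/zephyr-7b-dpo-full` is assumed from the committer and the model-index name (it is not stated in the diff), and the chat template is assumed to be inherited from the SFT base.

```python
# Minimal loading sketch; the repo id is an assumption, not taken from the diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "RikkiXu/zephyr-7b-dpo-full"  # assumed from committer + model-index name
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Assumes the tokenizer ships a chat template inherited from zephyr-7b-sft-full.
messages = [{"role": "user", "content": "Summarize direct preference optimization in two sentences."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```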
all_results.json CHANGED
@@ -1,21 +1,8 @@
 {
     "epoch": 1.0,
-    "eval_logits/chosen": -0.30772945284843445,
-    "eval_logits/rejected": 1.072676658630371,
-    "eval_logps/chosen": -474.955810546875,
-    "eval_logps/rejected": -812.6320190429688,
-    "eval_loss": 0.21300281584262848,
-    "eval_rewards/accuracies": 0.8984375,
-    "eval_rewards/chosen": -2.175225019454956,
-    "eval_rewards/margins": 4.151553153991699,
-    "eval_rewards/rejected": -6.326778411865234,
-    "eval_runtime": 97.1627,
-    "eval_samples": 2000,
-    "eval_samples_per_second": 20.584,
-    "eval_steps_per_second": 0.329,
-    "train_loss": 0.3183088973476298,
-    "train_runtime": 7617.3811,
+    "train_loss": 0.4018711235732713,
+    "train_runtime": 7633.33,
     "train_samples": 61135,
-    "train_samples_per_second": 8.026,
+    "train_samples_per_second": 8.009,
     "train_steps_per_second": 0.063
 }
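
As a quick consistency check on the fields kept in this file, the reported throughput matches the sample count divided by the runtime (assuming `train_samples_per_second` is computed as `train_samples / train_runtime`; both the removed and the added values are consistent with that):

```python
# Back-of-the-envelope check of the throughput values in all_results.json.
old = 61135 / 7617.3811   # ≈ 8.026, the value removed by this commit
new = 61135 / 7633.33     # ≈ 8.009, the value added by this commit
print(round(old, 3), round(new, 3))
```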
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:53ac6df0d03218239512bb011a349cc1a6665ac951834fb76560fada358e9db4
+oid sha256:dfb8492185343c9d55349b5a527c87db359b2d402335a698c56b9e370a925913
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db33c501b889745f07777b00b2c0fd1fd8665c2ff7a19cee38258a0e09a03e03
+oid sha256:ba393e9a2d11b34d07ac9d99ef5e04d768742661ce3b2481b1f60cc95ca2f01b
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b223406133e79f5c8c003c669f1770e612e12d740a68558422c1308ec1945f1
+oid sha256:5d94e5203e43f9e579be385f67b02371d9511906fc38a8db6ea9093ff97e8bee
 size 4540516344
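
The three `.safetensors` entries above are Git LFS pointer files: only the `sha256` oid changes while each shard's byte size stays the same, consistent with the weights being overwritten in the same three-shard layout. If the shards are downloaded locally, a pointer's oid can be checked against the actual file; a small sketch (the local path is hypothetical):

```python
# Sketch: hash a downloaded shard and compare against the "oid sha256:..." line above.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Hypothetical local path to the first shard of this commit.
print(sha256_of("model-00001-of-00003.safetensors"))
```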
runs/May14_20-23-03_n136-100-194/events.out.tfevents.1715689406.n136-100-194.2851926.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d08383880dffba823bfba035bab3b5b5d47c0ae4af47983d874a57d41ad405c0
-size 35906
+oid sha256:5b21f361a2f1b9226a3d464b4ca00300094d3406bd4598a1725bc34ccf713d40
+size 41076
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.3183088973476298,
-    "train_runtime": 7617.3811,
+    "train_loss": 0.4018711235732713,
+    "train_runtime": 7633.33,
     "train_samples": 61135,
-    "train_samples_per_second": 8.026,
+    "train_samples_per_second": 8.009,
     "train_steps_per_second": 0.063
 }
trainer_state.json CHANGED
@@ -10,13 +10,13 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 11.649026188613895,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.408252239227295,
16
  "logits/rejected": -2.408294677734375,
17
  "logps/chosen": -208.4792022705078,
18
  "logps/rejected": -178.0951690673828,
19
- "loss": 0.693,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
@@ -25,780 +25,780 @@
25
  },
26
  {
27
  "epoch": 0.02,
28
- "grad_norm": 10.900219473552317,
29
  "learning_rate": 1.0416666666666667e-07,
30
- "logits/chosen": -2.5447659492492676,
31
- "logits/rejected": -2.538891315460205,
32
- "logps/chosen": -261.517333984375,
33
- "logps/rejected": -166.39056396484375,
34
- "loss": 0.693,
35
- "rewards/accuracies": 0.4027777910232544,
36
- "rewards/chosen": 0.00011150226055178791,
37
- "rewards/margins": 5.223603511694819e-05,
38
- "rewards/rejected": 5.9266225434839725e-05,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
- "grad_norm": 11.586256170716794,
44
  "learning_rate": 2.0833333333333333e-07,
45
- "logits/chosen": -2.5193417072296143,
46
- "logits/rejected": -2.526468276977539,
47
- "logps/chosen": -252.56442260742188,
48
- "logps/rejected": -178.0738525390625,
49
- "loss": 0.69,
50
- "rewards/accuracies": 0.7749999761581421,
51
- "rewards/chosen": 0.0030069700442254543,
52
- "rewards/margins": 0.006722055375576019,
53
- "rewards/rejected": -0.003715085331350565,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
- "grad_norm": 37.65056149520632,
59
  "learning_rate": 3.1249999999999997e-07,
60
- "logits/chosen": -2.4743459224700928,
61
- "logits/rejected": -2.4606471061706543,
62
- "logps/chosen": -240.6038818359375,
63
- "logps/rejected": -181.88919067382812,
64
- "loss": 0.6755,
65
- "rewards/accuracies": 0.84375,
66
- "rewards/chosen": 0.008357289247214794,
67
- "rewards/margins": 0.03687674552202225,
68
- "rewards/rejected": -0.028519460931420326,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
- "grad_norm": 11.450052464046935,
74
  "learning_rate": 4.1666666666666667e-07,
75
- "logits/chosen": -2.3846631050109863,
76
- "logits/rejected": -2.3543992042541504,
77
- "logps/chosen": -280.9198913574219,
78
- "logps/rejected": -211.95913696289062,
79
- "loss": 0.6346,
80
- "rewards/accuracies": 0.862500011920929,
81
- "rewards/chosen": 0.06864231824874878,
82
- "rewards/margins": 0.12311089038848877,
83
- "rewards/rejected": -0.05446857959032059,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
- "grad_norm": 13.049787778502237,
89
  "learning_rate": 4.999733114418725e-07,
90
- "logits/chosen": -2.256699323654175,
91
- "logits/rejected": -2.269990921020508,
92
- "logps/chosen": -250.5127716064453,
93
- "logps/rejected": -210.73342895507812,
94
- "loss": 0.5816,
95
- "rewards/accuracies": 0.78125,
96
- "rewards/chosen": 0.005404283292591572,
97
- "rewards/margins": 0.28600800037384033,
98
- "rewards/rejected": -0.2806037366390228,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
- "grad_norm": 23.40921657971245,
104
  "learning_rate": 4.990398100856366e-07,
105
- "logits/chosen": -2.1533896923065186,
106
- "logits/rejected": -2.1007308959960938,
107
- "logps/chosen": -282.63739013671875,
108
- "logps/rejected": -243.5396270751953,
109
- "loss": 0.5257,
110
- "rewards/accuracies": 0.831250011920929,
111
- "rewards/chosen": -0.08040512353181839,
112
- "rewards/margins": 0.459780216217041,
113
- "rewards/rejected": -0.540185272693634,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
- "grad_norm": 18.316826712401,
119
  "learning_rate": 4.967775735898179e-07,
120
- "logits/chosen": -2.008669137954712,
121
- "logits/rejected": -2.016932249069214,
122
- "logps/chosen": -298.5509338378906,
123
- "logps/rejected": -281.0208435058594,
124
- "loss": 0.4813,
125
- "rewards/accuracies": 0.8187500238418579,
126
- "rewards/chosen": -0.18154878914356232,
127
- "rewards/margins": 0.7048229575157166,
128
- "rewards/rejected": -0.8863717317581177,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
- "grad_norm": 17.640568045589177,
134
  "learning_rate": 4.931986719649298e-07,
135
- "logits/chosen": -1.7213737964630127,
136
- "logits/rejected": -1.639634370803833,
137
- "logps/chosen": -321.4610900878906,
138
- "logps/rejected": -327.48675537109375,
139
- "loss": 0.4345,
140
- "rewards/accuracies": 0.875,
141
- "rewards/chosen": -0.5062848925590515,
142
- "rewards/margins": 0.9567643404006958,
143
- "rewards/rejected": -1.4630491733551025,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
- "grad_norm": 29.13663512751633,
149
  "learning_rate": 4.883222001996351e-07,
150
- "logits/chosen": -1.2982016801834106,
151
- "logits/rejected": -1.1025583744049072,
152
- "logps/chosen": -372.08209228515625,
153
- "logps/rejected": -424.2044372558594,
154
- "loss": 0.4001,
155
- "rewards/accuracies": 0.8062499761581421,
156
- "rewards/chosen": -0.9675124287605286,
157
- "rewards/margins": 1.4019938707351685,
158
- "rewards/rejected": -2.369506359100342,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
- "grad_norm": 20.753196262748457,
164
  "learning_rate": 4.821741763807186e-07,
165
- "logits/chosen": -1.386314034461975,
166
- "logits/rejected": -1.0822856426239014,
167
- "logps/chosen": -367.71942138671875,
168
- "logps/rejected": -453.9052734375,
169
- "loss": 0.3617,
170
- "rewards/accuracies": 0.8374999761581421,
171
- "rewards/chosen": -1.3193985223770142,
172
- "rewards/margins": 1.4559285640716553,
173
- "rewards/rejected": -2.77532696723938,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
- "eval_logits/chosen": -1.5665340423583984,
179
- "eval_logits/rejected": -1.3192166090011597,
180
- "eval_logps/chosen": -371.2482604980469,
181
- "eval_logps/rejected": -475.1705017089844,
182
- "eval_loss": 0.34448927640914917,
183
- "eval_rewards/accuracies": 0.86328125,
184
- "eval_rewards/chosen": -1.138149380683899,
185
- "eval_rewards/margins": 1.8140134811401367,
186
- "eval_rewards/rejected": -2.952162981033325,
187
- "eval_runtime": 97.3413,
188
- "eval_samples_per_second": 20.546,
189
- "eval_steps_per_second": 0.329,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
- "grad_norm": 18.97759709132058,
195
  "learning_rate": 4.747874028753375e-07,
196
- "logits/chosen": -1.1811072826385498,
197
- "logits/rejected": -0.979759693145752,
198
- "logps/chosen": -376.93353271484375,
199
- "logps/rejected": -487.2027282714844,
200
- "loss": 0.3614,
201
- "rewards/accuracies": 0.8500000238418579,
202
- "rewards/chosen": -1.3289892673492432,
203
- "rewards/margins": 1.6757100820541382,
204
- "rewards/rejected": -3.004699230194092,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
- "grad_norm": 15.72680069184703,
210
  "learning_rate": 4.662012913161997e-07,
211
- "logits/chosen": -0.8997787237167358,
212
- "logits/rejected": -0.6056855916976929,
213
- "logps/chosen": -377.9947509765625,
214
- "logps/rejected": -490.7288513183594,
215
- "loss": 0.3357,
216
- "rewards/accuracies": 0.831250011920929,
217
- "rewards/chosen": -1.3285906314849854,
218
- "rewards/margins": 1.7274820804595947,
219
- "rewards/rejected": -3.056072473526001,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
- "grad_norm": 22.07740307319267,
225
  "learning_rate": 4.5646165232345103e-07,
226
- "logits/chosen": -0.7402850389480591,
227
- "logits/rejected": -0.2074509561061859,
228
- "logps/chosen": -404.84442138671875,
229
- "logps/rejected": -538.0674438476562,
230
- "loss": 0.3035,
231
- "rewards/accuracies": 0.8999999761581421,
232
- "rewards/chosen": -1.3628482818603516,
233
- "rewards/margins": 2.222651720046997,
234
- "rewards/rejected": -3.5854995250701904,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
- "grad_norm": 23.838607690547235,
240
  "learning_rate": 4.456204510851956e-07,
241
- "logits/chosen": -0.677836537361145,
242
- "logits/rejected": -0.11348800361156464,
243
- "logps/chosen": -422.9839782714844,
244
- "logps/rejected": -556.4301147460938,
245
- "loss": 0.3117,
246
- "rewards/accuracies": 0.8500000238418579,
247
- "rewards/chosen": -1.539533257484436,
248
- "rewards/margins": 2.1120970249176025,
249
- "rewards/rejected": -3.651630401611328,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
- "grad_norm": 27.05325199994427,
255
  "learning_rate": 4.337355301007335e-07,
256
- "logits/chosen": -0.3853974938392639,
257
- "logits/rejected": 0.20319394767284393,
258
- "logps/chosen": -417.6180725097656,
259
- "logps/rejected": -582.4807739257812,
260
- "loss": 0.3038,
261
- "rewards/accuracies": 0.84375,
262
- "rewards/chosen": -1.7072023153305054,
263
- "rewards/margins": 2.395305871963501,
264
- "rewards/rejected": -4.102508068084717,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
- "grad_norm": 17.832180625180946,
270
  "learning_rate": 4.2087030056579986e-07,
271
- "logits/chosen": -0.3905831575393677,
272
- "logits/rejected": 0.32560938596725464,
273
- "logps/chosen": -401.91839599609375,
274
- "logps/rejected": -541.798583984375,
275
- "loss": 0.291,
276
- "rewards/accuracies": 0.8687499761581421,
277
- "rewards/chosen": -1.3554723262786865,
278
- "rewards/margins": 2.1516823768615723,
279
- "rewards/rejected": -3.507154941558838,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
- "grad_norm": 33.740386423043056,
285
  "learning_rate": 4.070934040463998e-07,
286
- "logits/chosen": -0.7972911596298218,
287
- "logits/rejected": 0.013139176182448864,
288
- "logps/chosen": -431.9474182128906,
289
- "logps/rejected": -614.335205078125,
290
- "loss": 0.2818,
291
- "rewards/accuracies": 0.8687499761581421,
292
- "rewards/chosen": -1.7542915344238281,
293
- "rewards/margins": 2.622352361679077,
294
- "rewards/rejected": -4.376644134521484,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
- "grad_norm": 36.32706683293822,
300
  "learning_rate": 3.9247834624635404e-07,
301
- "logits/chosen": -0.6585810780525208,
302
- "logits/rejected": 0.35586631298065186,
303
- "logps/chosen": -425.5533752441406,
304
- "logps/rejected": -627.4122924804688,
305
- "loss": 0.2821,
306
- "rewards/accuracies": 0.887499988079071,
307
- "rewards/chosen": -1.5755112171173096,
308
- "rewards/margins": 2.9501824378967285,
309
- "rewards/rejected": -4.525693416595459,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
- "grad_norm": 17.303305712606004,
315
  "learning_rate": 3.7710310482256523e-07,
316
- "logits/chosen": 0.05398033186793327,
317
- "logits/rejected": 0.8622593879699707,
318
- "logps/chosen": -430.6288146972656,
319
- "logps/rejected": -650.1162109375,
320
- "loss": 0.2857,
321
- "rewards/accuracies": 0.84375,
322
- "rewards/chosen": -1.835845947265625,
323
- "rewards/margins": 2.724783420562744,
324
- "rewards/rejected": -4.560629844665527,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
- "grad_norm": 25.750253047861463,
330
  "learning_rate": 3.610497133404795e-07,
331
- "logits/chosen": -0.71821129322052,
332
- "logits/rejected": 0.2892279028892517,
333
- "logps/chosen": -424.61773681640625,
334
- "logps/rejected": -588.4847412109375,
335
- "loss": 0.2941,
336
- "rewards/accuracies": 0.8812500238418579,
337
- "rewards/chosen": -1.6481313705444336,
338
- "rewards/margins": 2.587430477142334,
339
- "rewards/rejected": -4.235561847686768,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
- "eval_logits/chosen": -1.0254615545272827,
345
- "eval_logits/rejected": -0.29090192914009094,
346
- "eval_logps/chosen": -410.46307373046875,
347
- "eval_logps/rejected": -639.6044921875,
348
- "eval_loss": 0.2594895660877228,
349
- "eval_rewards/accuracies": 0.87109375,
350
- "eval_rewards/chosen": -1.5302979946136475,
351
- "eval_rewards/margins": 3.0662055015563965,
352
- "eval_rewards/rejected": -4.596503734588623,
353
- "eval_runtime": 97.4272,
354
- "eval_samples_per_second": 20.528,
355
- "eval_steps_per_second": 0.328,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
- "grad_norm": 22.833636886609728,
361
  "learning_rate": 3.4440382358952115e-07,
362
- "logits/chosen": -0.23435378074645996,
363
- "logits/rejected": 0.7262285351753235,
364
- "logps/chosen": -413.39495849609375,
365
- "logps/rejected": -610.934326171875,
366
- "loss": 0.2652,
367
- "rewards/accuracies": 0.831250011920929,
368
- "rewards/chosen": -1.7757046222686768,
369
- "rewards/margins": 2.6218485832214355,
370
- "rewards/rejected": -4.397553443908691,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
- "grad_norm": 19.35747892667393,
376
  "learning_rate": 3.272542485937368e-07,
377
- "logits/chosen": -0.03855214640498161,
378
- "logits/rejected": 1.1163934469223022,
379
- "logps/chosen": -422.90655517578125,
380
- "logps/rejected": -638.5581665039062,
381
- "loss": 0.2649,
382
- "rewards/accuracies": 0.8687499761581421,
383
- "rewards/chosen": -1.665327787399292,
384
- "rewards/margins": 2.9846928119659424,
385
- "rewards/rejected": -4.650020122528076,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
- "grad_norm": 21.48535535733013,
391
  "learning_rate": 3.096924887558854e-07,
392
- "logits/chosen": 0.25134754180908203,
393
- "logits/rejected": 1.690708875656128,
394
- "logps/chosen": -465.64324951171875,
395
- "logps/rejected": -718.0191650390625,
396
- "loss": 0.2576,
397
- "rewards/accuracies": 0.90625,
398
- "rewards/chosen": -1.8850457668304443,
399
- "rewards/margins": 3.3224689960479736,
400
- "rewards/rejected": -5.20751428604126,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
- "grad_norm": 21.422183287654935,
406
  "learning_rate": 2.9181224366319943e-07,
407
- "logits/chosen": 0.8468208312988281,
408
- "logits/rejected": 2.1126351356506348,
409
- "logps/chosen": -494.220703125,
410
- "logps/rejected": -765.5153198242188,
411
- "loss": 0.2597,
412
- "rewards/accuracies": 0.887499988079071,
413
- "rewards/chosen": -2.599130153656006,
414
- "rewards/margins": 3.3152058124542236,
415
- "rewards/rejected": -5.914336204528809,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
- "grad_norm": 20.426996979966383,
421
  "learning_rate": 2.7370891215954565e-07,
422
- "logits/chosen": -0.12486964464187622,
423
- "logits/rejected": 1.0274794101715088,
424
- "logps/chosen": -427.1630859375,
425
- "logps/rejected": -634.4889526367188,
426
- "loss": 0.2648,
427
- "rewards/accuracies": 0.8812500238418579,
428
- "rewards/chosen": -1.721609115600586,
429
- "rewards/margins": 2.7955689430236816,
430
- "rewards/rejected": -4.517177581787109,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
- "grad_norm": 25.70991400557029,
436
  "learning_rate": 2.55479083351317e-07,
437
- "logits/chosen": -0.1112385243177414,
438
- "logits/rejected": 1.0832737684249878,
439
- "logps/chosen": -449.45709228515625,
440
- "logps/rejected": -671.4852294921875,
441
- "loss": 0.2494,
442
- "rewards/accuracies": 0.8374999761581421,
443
- "rewards/chosen": -1.8507035970687866,
444
- "rewards/margins": 3.2304184436798096,
445
- "rewards/rejected": -5.081121921539307,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
- "grad_norm": 26.743751356983804,
451
  "learning_rate": 2.3722002126275822e-07,
452
- "logits/chosen": -0.3570239245891571,
453
- "logits/rejected": 0.8737660646438599,
454
- "logps/chosen": -427.79931640625,
455
- "logps/rejected": -680.0591430664062,
456
- "loss": 0.2351,
457
- "rewards/accuracies": 0.887499988079071,
458
- "rewards/chosen": -1.8343093395233154,
459
- "rewards/margins": 3.1806130409240723,
460
- "rewards/rejected": -5.014922142028809,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
- "grad_norm": 20.437919430365636,
466
  "learning_rate": 2.19029145890313e-07,
467
- "logits/chosen": -0.14857754111289978,
468
- "logits/rejected": 0.7723418474197388,
469
- "logps/chosen": -476.52099609375,
470
- "logps/rejected": -727.02734375,
471
- "loss": 0.2542,
472
- "rewards/accuracies": 0.84375,
473
- "rewards/chosen": -2.1171650886535645,
474
- "rewards/margins": 3.235008716583252,
475
- "rewards/rejected": -5.352174282073975,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
- "grad_norm": 18.45732619902188,
481
  "learning_rate": 2.0100351342479216e-07,
482
- "logits/chosen": -0.04501671344041824,
483
- "logits/rejected": 0.9899295568466187,
484
- "logps/chosen": -446.81451416015625,
485
- "logps/rejected": -742.0169677734375,
486
- "loss": 0.2265,
487
- "rewards/accuracies": 0.862500011920929,
488
- "rewards/chosen": -2.055858850479126,
489
- "rewards/margins": 3.6457011699676514,
490
- "rewards/rejected": -5.701560020446777,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
- "grad_norm": 18.065200464485685,
496
  "learning_rate": 1.8323929841460178e-07,
497
- "logits/chosen": -0.25825151801109314,
498
- "logits/rejected": 1.0551276206970215,
499
- "logps/chosen": -482.5679626464844,
500
- "logps/rejected": -777.6588134765625,
501
- "loss": 0.259,
502
- "rewards/accuracies": 0.893750011920929,
503
- "rewards/chosen": -2.0962753295898438,
504
- "rewards/margins": 3.666637897491455,
505
- "rewards/rejected": -5.762913227081299,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
- "eval_logits/chosen": -0.08032596111297607,
511
- "eval_logits/rejected": 1.257252812385559,
512
- "eval_logps/chosen": -480.0015563964844,
513
- "eval_logps/rejected": -791.1058959960938,
514
- "eval_loss": 0.21872717142105103,
515
- "eval_rewards/accuracies": 0.89453125,
516
- "eval_rewards/chosen": -2.2256827354431152,
517
- "eval_rewards/margins": 3.8858344554901123,
518
- "eval_rewards/rejected": -6.111516952514648,
519
- "eval_runtime": 97.4838,
520
- "eval_samples_per_second": 20.516,
521
- "eval_steps_per_second": 0.328,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
- "grad_norm": 52.672419540257096,
527
  "learning_rate": 1.6583128063291573e-07,
528
- "logits/chosen": -0.3141949772834778,
529
- "logits/rejected": 0.9004285931587219,
530
- "logps/chosen": -475.15936279296875,
531
- "logps/rejected": -737.10009765625,
532
- "loss": 0.2596,
533
- "rewards/accuracies": 0.90625,
534
- "rewards/chosen": -2.111830949783325,
535
- "rewards/margins": 3.3903605937957764,
536
- "rewards/rejected": -5.502191543579102,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
- "grad_norm": 22.65476006635999,
542
  "learning_rate": 1.488723393865766e-07,
543
- "logits/chosen": -0.16484542191028595,
544
- "logits/rejected": 1.090689778327942,
545
- "logps/chosen": -455.7721252441406,
546
- "logps/rejected": -757.0924072265625,
547
- "loss": 0.2429,
548
- "rewards/accuracies": 0.893750011920929,
549
- "rewards/chosen": -2.08683180809021,
550
- "rewards/margins": 3.6439883708953857,
551
- "rewards/rejected": -5.730820178985596,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
- "grad_norm": 19.80833079454062,
557
  "learning_rate": 1.3245295796480788e-07,
558
- "logits/chosen": -0.27468693256378174,
559
- "logits/rejected": 1.061683177947998,
560
- "logps/chosen": -481.45501708984375,
561
- "logps/rejected": -751.7589721679688,
562
- "loss": 0.2279,
563
- "rewards/accuracies": 0.887499988079071,
564
- "rewards/chosen": -2.233105421066284,
565
- "rewards/margins": 3.570781707763672,
566
- "rewards/rejected": -5.803887367248535,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
- "grad_norm": 24.881750772146,
572
  "learning_rate": 1.1666074087171627e-07,
573
- "logits/chosen": -0.03138185292482376,
574
- "logits/rejected": 1.0509425401687622,
575
- "logps/chosen": -470.1802673339844,
576
- "logps/rejected": -690.557373046875,
577
- "loss": 0.237,
578
- "rewards/accuracies": 0.875,
579
- "rewards/chosen": -2.2647526264190674,
580
- "rewards/margins": 3.035360097885132,
581
- "rewards/rejected": -5.300112724304199,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
- "grad_norm": 19.587851322719366,
587
  "learning_rate": 1.0157994641835734e-07,
588
- "logits/chosen": 0.006643450353294611,
589
- "logits/rejected": 1.0511295795440674,
590
- "logps/chosen": -478.51055908203125,
591
- "logps/rejected": -806.5072631835938,
592
- "loss": 0.2256,
593
- "rewards/accuracies": 0.862500011920929,
594
- "rewards/chosen": -2.348606586456299,
595
- "rewards/margins": 3.6702816486358643,
596
- "rewards/rejected": -6.018888473510742,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
- "grad_norm": 22.679114876876472,
602
  "learning_rate": 8.729103716819111e-08,
603
- "logits/chosen": -0.024983350187540054,
604
- "logits/rejected": 1.071063756942749,
605
- "logps/chosen": -459.17669677734375,
606
- "logps/rejected": -731.5552368164062,
607
- "loss": 0.2495,
608
- "rewards/accuracies": 0.8687499761581421,
609
- "rewards/chosen": -2.0910515785217285,
610
- "rewards/margins": 3.42362642288208,
611
- "rewards/rejected": -5.514677047729492,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
- "grad_norm": 24.62831310204124,
617
  "learning_rate": 7.387025063449081e-08,
618
- "logits/chosen": -0.07435999810695648,
619
- "logits/rejected": 1.1522341966629028,
620
- "logps/chosen": -457.25128173828125,
621
- "logps/rejected": -728.47412109375,
622
- "loss": 0.234,
623
- "rewards/accuracies": 0.8812500238418579,
624
- "rewards/chosen": -2.266648292541504,
625
- "rewards/margins": 3.339157819747925,
626
- "rewards/rejected": -5.60580587387085,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
- "grad_norm": 18.56178014954704,
632
  "learning_rate": 6.138919252022435e-08,
633
- "logits/chosen": -0.3609544634819031,
634
- "logits/rejected": 0.9311397671699524,
635
- "logps/chosen": -469.6280212402344,
636
- "logps/rejected": -802.5978393554688,
637
- "loss": 0.2329,
638
  "rewards/accuracies": 0.8999999761581421,
639
- "rewards/chosen": -2.1230244636535645,
640
- "rewards/margins": 4.024425506591797,
641
- "rewards/rejected": -6.147449016571045,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
- "grad_norm": 32.926290807333665,
647
  "learning_rate": 4.991445467064689e-08,
648
- "logits/chosen": -0.012720714323222637,
649
- "logits/rejected": 1.3005478382110596,
650
- "logps/chosen": -489.65802001953125,
651
- "logps/rejected": -779.3419799804688,
652
- "loss": 0.2265,
653
- "rewards/accuracies": 0.9125000238418579,
654
- "rewards/chosen": -2.43365740776062,
655
- "rewards/margins": 3.560826539993286,
656
- "rewards/rejected": -5.994483947753906,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
- "grad_norm": 30.149198169987162,
662
  "learning_rate": 3.9507259776993954e-08,
663
- "logits/chosen": 0.26025086641311646,
664
- "logits/rejected": 1.3578455448150635,
665
- "logps/chosen": -470.298095703125,
666
- "logps/rejected": -760.2213134765625,
667
- "loss": 0.2268,
668
- "rewards/accuracies": 0.856249988079071,
669
- "rewards/chosen": -2.2305097579956055,
670
- "rewards/margins": 3.6479294300079346,
671
- "rewards/rejected": -5.878438472747803,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
- "eval_logits/chosen": -0.19649244844913483,
677
- "eval_logits/rejected": 1.227736234664917,
678
- "eval_logps/chosen": -477.7560729980469,
679
- "eval_logps/rejected": -812.5330810546875,
680
- "eval_loss": 0.2144031673669815,
681
- "eval_rewards/accuracies": 0.8984375,
682
- "eval_rewards/chosen": -2.2032277584075928,
683
- "eval_rewards/margins": 4.122560977935791,
684
- "eval_rewards/rejected": -6.325788974761963,
685
- "eval_runtime": 97.384,
686
- "eval_samples_per_second": 20.537,
687
  "eval_steps_per_second": 0.329,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
- "grad_norm": 15.686531185394536,
693
  "learning_rate": 3.022313472693447e-08,
694
- "logits/chosen": -0.13663654029369354,
695
- "logits/rejected": 1.1233164072036743,
696
- "logps/chosen": -492.34588623046875,
697
- "logps/rejected": -787.8416137695312,
698
- "loss": 0.2073,
699
- "rewards/accuracies": 0.9125000238418579,
700
- "rewards/chosen": -2.1595890522003174,
701
- "rewards/margins": 3.8674533367156982,
702
- "rewards/rejected": -6.027042388916016,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
- "grad_norm": 25.36773289746514,
708
  "learning_rate": 2.2111614344599684e-08,
709
- "logits/chosen": 0.07713554799556732,
710
- "logits/rejected": 1.1611943244934082,
711
- "logps/chosen": -468.55523681640625,
712
- "logps/rejected": -737.3845825195312,
713
- "loss": 0.2294,
714
- "rewards/accuracies": 0.856249988079071,
715
- "rewards/chosen": -2.246133804321289,
716
- "rewards/margins": 3.3159337043762207,
717
- "rewards/rejected": -5.562067985534668,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
- "grad_norm": 23.64735863277161,
723
  "learning_rate": 1.521597710086439e-08,
724
- "logits/chosen": -0.11199776083230972,
725
- "logits/rejected": 1.2544041872024536,
726
- "logps/chosen": -487.00787353515625,
727
- "logps/rejected": -748.0345458984375,
728
- "loss": 0.2283,
729
- "rewards/accuracies": 0.8374999761581421,
730
- "rewards/chosen": -2.158412218093872,
731
- "rewards/margins": 3.493891954421997,
732
- "rewards/rejected": -5.652304649353027,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
- "grad_norm": 32.07046590554996,
738
  "learning_rate": 9.57301420397924e-09,
739
- "logits/chosen": -0.2779064476490021,
740
- "logits/rejected": 0.990594744682312,
741
- "logps/chosen": -476.5415954589844,
742
- "logps/rejected": -816.4949340820312,
743
- "loss": 0.2335,
744
- "rewards/accuracies": 0.925000011920929,
745
- "rewards/chosen": -2.1302719116210938,
746
- "rewards/margins": 4.236142158508301,
747
- "rewards/rejected": -6.366414546966553,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
- "grad_norm": 29.966664104398923,
753
  "learning_rate": 5.212833302556258e-09,
754
- "logits/chosen": -0.2724097967147827,
755
- "logits/rejected": 1.02344810962677,
756
- "logps/chosen": -477.5227966308594,
757
- "logps/rejected": -801.6556396484375,
758
- "loss": 0.235,
759
- "rewards/accuracies": 0.84375,
760
- "rewards/chosen": -2.263077735900879,
761
- "rewards/margins": 3.993912935256958,
762
- "rewards/rejected": -6.256990432739258,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
- "grad_norm": 54.30402743121173,
768
  "learning_rate": 2.158697848236607e-09,
769
- "logits/chosen": -0.18335244059562683,
770
- "logits/rejected": 0.9830573201179504,
771
- "logps/chosen": -479.820556640625,
772
- "logps/rejected": -772.7644653320312,
773
- "loss": 0.2201,
774
  "rewards/accuracies": 0.893750011920929,
775
- "rewards/chosen": -2.1699745655059814,
776
- "rewards/margins": 3.6132774353027344,
777
- "rewards/rejected": -5.783252239227295,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
- "grad_norm": 24.80298516842009,
783
  "learning_rate": 4.269029751107489e-10,
784
- "logits/chosen": -0.0621672198176384,
785
- "logits/rejected": 1.1333558559417725,
786
- "logps/chosen": -449.02386474609375,
787
- "logps/rejected": -767.277099609375,
788
- "loss": 0.2269,
789
- "rewards/accuracies": 0.887499988079071,
790
- "rewards/chosen": -2.1144473552703857,
791
- "rewards/margins": 3.668858289718628,
792
- "rewards/rejected": -5.783305644989014,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
- "train_loss": 0.3183088973476298,
800
- "train_runtime": 7617.3811,
801
- "train_samples_per_second": 8.026,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 579.7744748341399,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.408252239227295,
16
  "logits/rejected": -2.408294677734375,
17
  "logps/chosen": -208.4792022705078,
18
  "logps/rejected": -178.0951690673828,
19
+ "loss": 0.69,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.02,
28
+ "grad_norm": 535.1486260819535,
29
  "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -2.544614553451538,
31
+ "logits/rejected": -2.538756847381592,
32
+ "logps/chosen": -261.5119323730469,
33
+ "logps/rejected": -166.40280151367188,
34
+ "loss": 0.7009,
35
+ "rewards/accuracies": 0.4652777910232544,
36
+ "rewards/chosen": 0.008291814476251602,
37
+ "rewards/margins": 0.011442840099334717,
38
+ "rewards/rejected": -0.003151026088744402,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
+ "grad_norm": 384.83945719674614,
44
  "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -2.5195038318634033,
46
+ "logits/rejected": -2.5265002250671387,
47
+ "logps/chosen": -252.65768432617188,
48
+ "logps/rejected": -178.087158203125,
49
+ "loss": 0.5876,
50
+ "rewards/accuracies": 0.762499988079071,
51
+ "rewards/chosen": 0.10371126234531403,
52
+ "rewards/margins": 0.296107679605484,
53
+ "rewards/rejected": -0.1923964023590088,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
+ "grad_norm": 202.13874393663576,
59
  "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -2.4858596324920654,
61
+ "logits/rejected": -2.47269606590271,
62
+ "logps/chosen": -240.64242553710938,
63
+ "logps/rejected": -181.0841522216797,
64
+ "loss": 0.3862,
65
+ "rewards/accuracies": 0.8374999761581421,
66
+ "rewards/chosen": 0.3985957205295563,
67
+ "rewards/margins": 1.4220483303070068,
68
+ "rewards/rejected": -1.023452639579773,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
+ "grad_norm": 142.19955934070376,
74
  "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -2.4594264030456543,
76
+ "logits/rejected": -2.4308407306671143,
77
+ "logps/chosen": -283.8061828613281,
78
+ "logps/rejected": -208.5924835205078,
79
+ "loss": 0.3332,
80
+ "rewards/accuracies": 0.8687499761581421,
81
+ "rewards/chosen": 1.9889726638793945,
82
+ "rewards/margins": 3.029081344604492,
83
+ "rewards/rejected": -1.0401084423065186,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
+ "grad_norm": 713.6549875009309,
89
  "learning_rate": 4.999733114418725e-07,
90
+ "logits/chosen": -2.442108631134033,
91
+ "logits/rejected": -2.46040678024292,
92
+ "logps/chosen": -246.4988555908203,
93
+ "logps/rejected": -187.14138793945312,
94
+ "loss": 0.4454,
95
+ "rewards/accuracies": 0.8187500238418579,
96
+ "rewards/chosen": 2.2771620750427246,
97
+ "rewards/margins": 4.51132345199585,
98
+ "rewards/rejected": -2.234161853790283,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
+ "grad_norm": 206.28301377298783,
104
  "learning_rate": 4.990398100856366e-07,
105
+ "logits/chosen": -2.4876925945281982,
106
+ "logits/rejected": -2.449016571044922,
107
+ "logps/chosen": -271.1671142578125,
108
+ "logps/rejected": -196.48915100097656,
109
+ "loss": 0.282,
110
+ "rewards/accuracies": 0.887499988079071,
111
+ "rewards/chosen": 1.714887261390686,
112
+ "rewards/margins": 5.19891881942749,
113
+ "rewards/rejected": -3.4840316772460938,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
+ "grad_norm": 133.99852927659347,
119
  "learning_rate": 4.967775735898179e-07,
120
+ "logits/chosen": -2.4287192821502686,
121
+ "logits/rejected": -2.4621310234069824,
122
+ "logps/chosen": -274.3385925292969,
123
+ "logps/rejected": -198.64663696289062,
124
+ "loss": 0.3118,
125
+ "rewards/accuracies": 0.831250011920929,
126
+ "rewards/chosen": 3.02872371673584,
127
+ "rewards/margins": 6.160218238830566,
128
+ "rewards/rejected": -3.1314942836761475,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
+ "grad_norm": 274.89332519675594,
134
  "learning_rate": 4.931986719649298e-07,
135
+ "logits/chosen": -2.42030668258667,
136
+ "logits/rejected": -2.423424243927002,
137
+ "logps/chosen": -263.48272705078125,
138
+ "logps/rejected": -186.29991149902344,
139
+ "loss": 0.3017,
140
+ "rewards/accuracies": 0.893750011920929,
141
+ "rewards/chosen": 3.6749210357666016,
142
+ "rewards/margins": 6.233966827392578,
143
+ "rewards/rejected": -2.559046506881714,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
+ "grad_norm": 202.07196994453352,
149
  "learning_rate": 4.883222001996351e-07,
150
+ "logits/chosen": -2.4479732513427734,
151
+ "logits/rejected": -2.4088387489318848,
152
+ "logps/chosen": -267.5174255371094,
153
+ "logps/rejected": -193.828125,
154
+ "loss": 0.3603,
155
+ "rewards/accuracies": 0.8812500238418579,
156
+ "rewards/chosen": 3.906665802001953,
157
+ "rewards/margins": 7.1938323974609375,
158
+ "rewards/rejected": -3.2871665954589844,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
+ "grad_norm": 260.799637557766,
164
  "learning_rate": 4.821741763807186e-07,
165
+ "logits/chosen": -2.4610652923583984,
166
+ "logits/rejected": -2.454207181930542,
167
+ "logps/chosen": -231.1355743408203,
168
+ "logps/rejected": -183.55123901367188,
169
+ "loss": 0.3315,
170
+ "rewards/accuracies": 0.9125000238418579,
171
+ "rewards/chosen": 2.3220086097717285,
172
+ "rewards/margins": 5.911337852478027,
173
+ "rewards/rejected": -3.589329242706299,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
+ "eval_logits/chosen": -2.5300614833831787,
179
+ "eval_logits/rejected": -2.515779495239258,
180
+ "eval_logps/chosen": -250.55865478515625,
181
+ "eval_logps/rejected": -187.6100616455078,
182
+ "eval_loss": 0.297485888004303,
183
+ "eval_rewards/accuracies": 0.90234375,
184
+ "eval_rewards/chosen": 3.437309980392456,
185
+ "eval_rewards/margins": 7.265252113342285,
186
+ "eval_rewards/rejected": -3.82794189453125,
187
+ "eval_runtime": 97.5532,
188
+ "eval_samples_per_second": 20.502,
189
+ "eval_steps_per_second": 0.328,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
+ "grad_norm": 164.61060726491735,
195
  "learning_rate": 4.747874028753375e-07,
196
+ "logits/chosen": -2.36721134185791,
197
+ "logits/rejected": -2.398892879486084,
198
+ "logps/chosen": -238.1865692138672,
199
+ "logps/rejected": -193.205810546875,
200
+ "loss": 0.3602,
201
+ "rewards/accuracies": 0.893750011920929,
202
+ "rewards/chosen": 2.924015522003174,
203
+ "rewards/margins": 6.160517692565918,
204
+ "rewards/rejected": -3.2365028858184814,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
+ "grad_norm": 394.19842542709944,
210
  "learning_rate": 4.662012913161997e-07,
211
+ "logits/chosen": -2.4796204566955566,
212
+ "logits/rejected": -2.462428331375122,
213
+ "logps/chosen": -237.8124237060547,
214
+ "logps/rejected": -191.6023712158203,
215
+ "loss": 0.2323,
216
+ "rewards/accuracies": 0.8687499761581421,
217
+ "rewards/chosen": 3.661634922027588,
218
+ "rewards/margins": 6.902056694030762,
219
+ "rewards/rejected": -3.240421772003174,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
+ "grad_norm": 409.3218341794219,
225
  "learning_rate": 4.5646165232345103e-07,
226
+ "logits/chosen": -2.4416961669921875,
227
+ "logits/rejected": -2.473330497741699,
228
+ "logps/chosen": -259.2460632324219,
229
+ "logps/rejected": -185.64683532714844,
230
+ "loss": 0.3435,
231
+ "rewards/accuracies": 0.90625,
232
+ "rewards/chosen": 4.6567606925964355,
233
+ "rewards/margins": 7.721456050872803,
234
+ "rewards/rejected": -3.064694881439209,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
+ "grad_norm": 328.4389039905761,
240
  "learning_rate": 4.456204510851956e-07,
241
+ "logits/chosen": -2.326636791229248,
242
+ "logits/rejected": -2.3331127166748047,
243
+ "logps/chosen": -261.88006591796875,
244
+ "logps/rejected": -197.84036254882812,
245
+ "loss": 0.2942,
246
+ "rewards/accuracies": 0.862500011920929,
247
+ "rewards/chosen": 3.5752804279327393,
248
+ "rewards/margins": 6.8619065284729,
249
+ "rewards/rejected": -3.2866263389587402,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
+ "grad_norm": 664.6717326413732,
255
  "learning_rate": 4.337355301007335e-07,
256
+ "logits/chosen": -2.4599616527557373,
257
+ "logits/rejected": -2.4934587478637695,
258
+ "logps/chosen": -239.1993865966797,
259
+ "logps/rejected": -179.16249084472656,
260
+ "loss": 0.4201,
261
+ "rewards/accuracies": 0.875,
262
+ "rewards/chosen": 3.849210262298584,
263
+ "rewards/margins": 7.315499782562256,
264
+ "rewards/rejected": -3.46628999710083,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
+ "grad_norm": 389.31781863733255,
270
  "learning_rate": 4.2087030056579986e-07,
271
+ "logits/chosen": -2.4196219444274902,
272
+ "logits/rejected": -2.3700897693634033,
273
+ "logps/chosen": -260.79736328125,
274
+ "logps/rejected": -198.62042236328125,
275
+ "loss": 0.4699,
276
+ "rewards/accuracies": 0.856249988079071,
277
+ "rewards/chosen": 2.786912441253662,
278
+ "rewards/margins": 6.555578708648682,
279
+ "rewards/rejected": -3.768665313720703,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
+ "grad_norm": 450.98983026990686,
285
  "learning_rate": 4.070934040463998e-07,
286
+ "logits/chosen": -2.5323286056518555,
287
+ "logits/rejected": -2.534219741821289,
288
+ "logps/chosen": -251.9270782470703,
289
+ "logps/rejected": -188.5828094482422,
290
+ "loss": 0.3003,
291
+ "rewards/accuracies": 0.893750011920929,
292
+ "rewards/chosen": 2.2955946922302246,
293
+ "rewards/margins": 8.251626968383789,
294
+ "rewards/rejected": -5.956032752990723,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
+ "grad_norm": 222.8025881040315,
300
  "learning_rate": 3.9247834624635404e-07,
301
+ "logits/chosen": -2.4943268299102783,
302
+ "logits/rejected": -2.4797425270080566,
303
+ "logps/chosen": -261.6493835449219,
304
+ "logps/rejected": -186.13827514648438,
305
+ "loss": 0.3893,
306
+ "rewards/accuracies": 0.918749988079071,
307
+ "rewards/chosen": 3.1764323711395264,
308
+ "rewards/margins": 8.824117660522461,
309
+ "rewards/rejected": -5.6476850509643555,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
+ "grad_norm": 170.38903025294675,
315
  "learning_rate": 3.7710310482256523e-07,
316
+ "logits/chosen": -2.4573581218719482,
317
+ "logits/rejected": -2.4471302032470703,
318
+ "logps/chosen": -242.3362274169922,
319
+ "logps/rejected": -206.1191864013672,
320
+ "loss": 0.2873,
321
+ "rewards/accuracies": 0.8812500238418579,
322
+ "rewards/chosen": 2.354017496109009,
323
+ "rewards/margins": 8.386995315551758,
324
+ "rewards/rejected": -6.0329766273498535,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
+ "grad_norm": 233.638489796621,
330
  "learning_rate": 3.610497133404795e-07,
331
+ "logits/chosen": -2.4837393760681152,
332
+ "logits/rejected": -2.4798505306243896,
333
+ "logps/chosen": -252.32040405273438,
334
+ "logps/rejected": -173.71713256835938,
335
+ "loss": 0.2909,
336
+ "rewards/accuracies": 0.925000011920929,
337
+ "rewards/chosen": 3.7421042919158936,
338
+ "rewards/margins": 8.136415481567383,
339
+ "rewards/rejected": -4.394310474395752,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
+ "eval_logits/chosen": -2.5459139347076416,
345
+ "eval_logits/rejected": -2.5309653282165527,
346
+ "eval_logps/chosen": -247.70968627929688,
347
+ "eval_logps/rejected": -188.19419860839844,
348
+ "eval_loss": 0.2753676474094391,
349
+ "eval_rewards/accuracies": 0.91796875,
350
+ "eval_rewards/chosen": 4.861801624298096,
351
+ "eval_rewards/margins": 8.981813430786133,
352
+ "eval_rewards/rejected": -4.120011806488037,
353
+ "eval_runtime": 97.2888,
354
+ "eval_samples_per_second": 20.557,
355
+ "eval_steps_per_second": 0.329,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
+ "grad_norm": 337.30781066112155,
361
  "learning_rate": 3.4440382358952115e-07,
362
+ "logits/chosen": -2.4054043292999268,
363
+ "logits/rejected": -2.334240436553955,
364
+ "logps/chosen": -229.5664825439453,
365
+ "logps/rejected": -177.37989807128906,
366
+ "loss": 0.3982,
367
+ "rewards/accuracies": 0.824999988079071,
368
+ "rewards/chosen": 3.1289913654327393,
369
+ "rewards/margins": 6.229460716247559,
370
+ "rewards/rejected": -3.100468873977661,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
+ "grad_norm": 196.23824528381684,
376
  "learning_rate": 3.272542485937368e-07,
377
+ "logits/chosen": -2.4668684005737305,
378
+ "logits/rejected": -2.4853501319885254,
379
+ "logps/chosen": -247.2740478515625,
380
+ "logps/rejected": -182.8220977783203,
381
+ "loss": 0.564,
382
+ "rewards/accuracies": 0.925000011920929,
383
+ "rewards/chosen": 4.549849510192871,
384
+ "rewards/margins": 9.18285083770752,
385
+ "rewards/rejected": -4.633000373840332,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
+ "grad_norm": 188.59878410714853,
391
  "learning_rate": 3.096924887558854e-07,
392
+ "logits/chosen": -2.4467105865478516,
393
+ "logits/rejected": -2.394397735595703,
394
+ "logps/chosen": -269.22100830078125,
395
+ "logps/rejected": -207.54287719726562,
396
+ "loss": 0.61,
397
+ "rewards/accuracies": 0.8999999761581421,
398
+ "rewards/chosen": 3.9587960243225098,
399
+ "rewards/margins": 9.096400260925293,
400
+ "rewards/rejected": -5.137604713439941,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
+ "grad_norm": 248.56418253631352,
406
  "learning_rate": 2.9181224366319943e-07,
407
+ "logits/chosen": -2.438028335571289,
408
+ "logits/rejected": -2.4430088996887207,
409
+ "logps/chosen": -228.5868377685547,
410
+ "logps/rejected": -184.39010620117188,
411
+ "loss": 0.2929,
412
+ "rewards/accuracies": 0.90625,
413
+ "rewards/chosen": 2.860379934310913,
414
+ "rewards/margins": 8.014602661132812,
415
+ "rewards/rejected": -5.15422248840332,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
+ "grad_norm": 236.10656708079867,
421
  "learning_rate": 2.7370891215954565e-07,
422
+ "logits/chosen": -2.468085527420044,
423
+ "logits/rejected": -2.471389055252075,
424
+ "logps/chosen": -245.74221801757812,
425
+ "logps/rejected": -189.75096130371094,
426
+ "loss": 0.2793,
427
+ "rewards/accuracies": 0.918749988079071,
428
+ "rewards/chosen": 4.629996299743652,
429
+ "rewards/margins": 8.119891166687012,
430
+ "rewards/rejected": -3.489894151687622,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
+ "grad_norm": 536.510359732396,
436
  "learning_rate": 2.55479083351317e-07,
437
+ "logits/chosen": -2.4861550331115723,
438
+ "logits/rejected": -2.4899330139160156,
439
+ "logps/chosen": -254.806396484375,
440
+ "logps/rejected": -172.7798614501953,
441
+ "loss": 0.343,
442
+ "rewards/accuracies": 0.8500000238418579,
443
+ "rewards/chosen": 4.790152549743652,
444
+ "rewards/margins": 9.49361515045166,
445
+ "rewards/rejected": -4.703463077545166,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
+ "grad_norm": 203.6794572887938,
451
  "learning_rate": 2.3722002126275822e-07,
452
+ "logits/chosen": -2.456101179122925,
453
+ "logits/rejected": -2.407947063446045,
454
+ "logps/chosen": -236.4749755859375,
455
+ "logps/rejected": -187.09112548828125,
456
+ "loss": 0.2939,
457
+ "rewards/accuracies": 0.8687499761581421,
458
+ "rewards/chosen": 3.9466958045959473,
459
+ "rewards/margins": 8.208832740783691,
460
+ "rewards/rejected": -4.262135982513428,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
+ "grad_norm": 394.1475095150393,
466
  "learning_rate": 2.19029145890313e-07,
467
+ "logits/chosen": -2.343445301055908,
468
+ "logits/rejected": -2.3995354175567627,
469
+ "logps/chosen": -257.370361328125,
470
+ "logps/rejected": -201.5979461669922,
471
+ "loss": 0.3501,
472
+ "rewards/accuracies": 0.875,
473
+ "rewards/chosen": 3.717083692550659,
474
+ "rewards/margins": 8.61104965209961,
475
+ "rewards/rejected": -4.893965721130371,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
+ "grad_norm": 206.90087071186883,
481
  "learning_rate": 2.0100351342479216e-07,
482
+ "logits/chosen": -2.4418747425079346,
483
+ "logits/rejected": -2.428316593170166,
484
+ "logps/chosen": -234.2547607421875,
485
+ "logps/rejected": -182.3361053466797,
486
+ "loss": 0.3955,
487
+ "rewards/accuracies": 0.8687499761581421,
488
+ "rewards/chosen": 3.4869391918182373,
489
+ "rewards/margins": 8.724508285522461,
490
+ "rewards/rejected": -5.2375688552856445,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
+ "grad_norm": 232.39029956497873,
496
  "learning_rate": 1.8323929841460178e-07,
497
+ "logits/chosen": -2.434248208999634,
498
+ "logits/rejected": -2.357980251312256,
499
+ "logps/chosen": -265.85308837890625,
500
+ "logps/rejected": -211.96859741210938,
501
+ "loss": 0.6445,
502
+ "rewards/accuracies": 0.8999999761581421,
503
+ "rewards/chosen": 3.5436806678771973,
504
+ "rewards/margins": 8.844223976135254,
505
+ "rewards/rejected": -5.300544738769531,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
+ "eval_logits/chosen": -2.5059330463409424,
511
+ "eval_logits/rejected": -2.491495132446289,
512
+ "eval_logps/chosen": -249.0003662109375,
513
+ "eval_logps/rejected": -190.69680786132812,
514
+ "eval_loss": 0.22447091341018677,
515
+ "eval_rewards/accuracies": 0.91015625,
516
+ "eval_rewards/chosen": 4.216455936431885,
517
+ "eval_rewards/margins": 9.587770462036133,
518
+ "eval_rewards/rejected": -5.371314525604248,
519
+ "eval_runtime": 97.2649,
520
+ "eval_samples_per_second": 20.562,
521
+ "eval_steps_per_second": 0.329,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
+ "grad_norm": 216.5700858394941,
527
  "learning_rate": 1.6583128063291573e-07,
528
+ "logits/chosen": -2.4202470779418945,
529
+ "logits/rejected": -2.42097806930542,
530
+ "logps/chosen": -255.98904418945312,
531
+ "logps/rejected": -196.6630401611328,
532
+ "loss": 0.3789,
533
+ "rewards/accuracies": 0.8812500238418579,
534
+ "rewards/chosen": 3.9936203956604004,
535
+ "rewards/margins": 8.884687423706055,
536
+ "rewards/rejected": -4.891066551208496,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
+ "grad_norm": 232.75719945601446,
542
  "learning_rate": 1.488723393865766e-07,
543
+ "logits/chosen": -2.493706464767456,
544
+ "logits/rejected": -2.461465358734131,
545
+ "logps/chosen": -238.67599487304688,
546
+ "logps/rejected": -194.2203826904297,
547
+ "loss": 0.3515,
548
+ "rewards/accuracies": 0.8687499761581421,
549
+ "rewards/chosen": 4.206465244293213,
550
+ "rewards/margins": 9.311426162719727,
551
+ "rewards/rejected": -5.104961395263672,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
+ "grad_norm": 281.239048485679,
557
  "learning_rate": 1.3245295796480788e-07,
558
+ "logits/chosen": -2.4040043354034424,
559
+ "logits/rejected": -2.4380762577056885,
560
+ "logps/chosen": -247.7274169921875,
561
+ "logps/rejected": -179.15911865234375,
562
+ "loss": 0.4043,
563
+ "rewards/accuracies": 0.893750011920929,
564
+ "rewards/chosen": 5.208529472351074,
565
+ "rewards/margins": 9.102964401245117,
566
+ "rewards/rejected": -3.8944339752197266,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
+ "grad_norm": 183.00963279480595,
572
  "learning_rate": 1.1666074087171627e-07,
573
+ "logits/chosen": -2.3740360736846924,
574
+ "logits/rejected": -2.4260330200195312,
575
+ "logps/chosen": -235.00357055664062,
576
+ "logps/rejected": -166.9297637939453,
577
+ "loss": 0.2751,
578
+ "rewards/accuracies": 0.8687499761581421,
579
+ "rewards/chosen": 4.350711822509766,
580
+ "rewards/margins": 7.54253625869751,
581
+ "rewards/rejected": -3.1918249130249023,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
+ "grad_norm": 258.2328231454923,
587
  "learning_rate": 1.0157994641835734e-07,
588
+ "logits/chosen": -2.4078361988067627,
589
+ "logits/rejected": -2.4156928062438965,
590
+ "logps/chosen": -234.7695770263672,
591
+ "logps/rejected": -214.1270294189453,
592
+ "loss": 0.3005,
593
+ "rewards/accuracies": 0.887499988079071,
594
+ "rewards/chosen": 4.440161228179932,
595
+ "rewards/margins": 9.19445514678955,
596
+ "rewards/rejected": -4.754293918609619,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
+ "grad_norm": 260.2428729882813,
602
  "learning_rate": 8.729103716819111e-08,
603
+ "logits/chosen": -2.372528076171875,
604
+ "logits/rejected": -2.3916471004486084,
605
+ "logps/chosen": -241.23934936523438,
606
+ "logps/rejected": -187.92953491210938,
607
+ "loss": 0.2667,
608
+ "rewards/accuracies": 0.875,
609
+ "rewards/chosen": 4.416121482849121,
610
+ "rewards/margins": 8.337173461914062,
611
+ "rewards/rejected": -3.921051502227783,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
+ "grad_norm": 316.09399625633,
617
  "learning_rate": 7.387025063449081e-08,
618
+ "logits/chosen": -2.4834017753601074,
619
+ "logits/rejected": -2.428774118423462,
620
+ "logps/chosen": -223.4984893798828,
621
+ "logps/rejected": -176.99790954589844,
622
+ "loss": 0.3594,
623
+ "rewards/accuracies": 0.90625,
624
+ "rewards/chosen": 3.544004440307617,
625
+ "rewards/margins": 8.096233367919922,
626
+ "rewards/rejected": -4.552228927612305,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
+ "grad_norm": 229.93737644129538,
632
  "learning_rate": 6.138919252022435e-08,
633
+ "logits/chosen": -2.4690134525299072,
634
+ "logits/rejected": -2.4799323081970215,
635
+ "logps/chosen": -249.5747528076172,
636
+ "logps/rejected": -198.73275756835938,
637
+ "loss": 0.3121,
638
  "rewards/accuracies": 0.8999999761581421,
639
+ "rewards/chosen": 3.8754029273986816,
640
+ "rewards/margins": 9.315340042114258,
641
+ "rewards/rejected": -5.439937591552734,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
+ "grad_norm": 178.09073107787978,
647
  "learning_rate": 4.991445467064689e-08,
648
+ "logits/chosen": -2.463972568511963,
649
+ "logits/rejected": -2.4395322799682617,
650
+ "logps/chosen": -239.1776123046875,
651
+ "logps/rejected": -189.71803283691406,
652
+ "loss": 0.222,
653
+ "rewards/accuracies": 0.9375,
654
+ "rewards/chosen": 3.557317018508911,
655
+ "rewards/margins": 8.469549179077148,
656
+ "rewards/rejected": -4.912230968475342,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
+ "grad_norm": 211.56309254204024,
662
  "learning_rate": 3.9507259776993954e-08,
663
+ "logits/chosen": -2.3866400718688965,
664
+ "logits/rejected": -2.4389476776123047,
665
+ "logps/chosen": -239.34860229492188,
666
+ "logps/rejected": -181.77676391601562,
667
+ "loss": 0.2653,
668
+ "rewards/accuracies": 0.8812500238418579,
669
+ "rewards/chosen": 3.949272632598877,
670
+ "rewards/margins": 8.648920059204102,
671
+ "rewards/rejected": -4.699648857116699,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
+ "eval_logits/chosen": -2.512333631515503,
677
+ "eval_logits/rejected": -2.4965250492095947,
678
+ "eval_logps/chosen": -248.40835571289062,
679
+ "eval_logps/rejected": -190.5157470703125,
680
+ "eval_loss": 0.21030600368976593,
681
+ "eval_rewards/accuracies": 0.92578125,
682
+ "eval_rewards/chosen": 4.512471675872803,
683
+ "eval_rewards/margins": 9.793259620666504,
684
+ "eval_rewards/rejected": -5.280787944793701,
685
+ "eval_runtime": 97.2069,
686
+ "eval_samples_per_second": 20.575,
687
  "eval_steps_per_second": 0.329,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
+ "grad_norm": 337.4045735200527,
693
  "learning_rate": 3.022313472693447e-08,
694
+ "logits/chosen": -2.431990146636963,
695
+ "logits/rejected": -2.4296875,
696
+ "logps/chosen": -269.318603515625,
697
+ "logps/rejected": -194.96946716308594,
698
+ "loss": 0.6111,
699
+ "rewards/accuracies": 0.8999999761581421,
700
+ "rewards/chosen": 3.5342013835906982,
701
+ "rewards/margins": 8.450251579284668,
702
+ "rewards/rejected": -4.916050910949707,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
+ "grad_norm": 403.24647604289254,
708
  "learning_rate": 2.2111614344599684e-08,
709
+ "logits/chosen": -2.39760684967041,
710
+ "logits/rejected": -2.376661777496338,
711
+ "logps/chosen": -237.7588653564453,
712
+ "logps/rejected": -190.60279846191406,
713
+ "loss": 0.3801,
714
+ "rewards/accuracies": 0.875,
715
+ "rewards/chosen": 3.0914576053619385,
716
+ "rewards/margins": 7.803999423980713,
717
+ "rewards/rejected": -4.712540626525879,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
+ "grad_norm": 208.90618387532385,
723
  "learning_rate": 1.521597710086439e-08,
724
+ "logits/chosen": -2.4121103286743164,
725
+ "logits/rejected": -2.3859496116638184,
726
+ "logps/chosen": -263.43292236328125,
727
+ "logps/rejected": -191.71656799316406,
728
+ "loss": 0.3449,
729
+ "rewards/accuracies": 0.887499988079071,
730
+ "rewards/chosen": 3.8668575286865234,
731
+ "rewards/margins": 8.32303524017334,
732
+ "rewards/rejected": -4.456177234649658,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
+ "grad_norm": 221.6697110810539,
738
  "learning_rate": 9.57301420397924e-09,
739
+ "logits/chosen": -2.5127618312835693,
740
+ "logits/rejected": -2.4956202507019043,
741
+ "logps/chosen": -254.6200408935547,
742
+ "logps/rejected": -190.78817749023438,
743
+ "loss": 0.3624,
744
+ "rewards/accuracies": 0.918749988079071,
745
+ "rewards/chosen": 4.447186470031738,
746
+ "rewards/margins": 9.914505004882812,
747
+ "rewards/rejected": -5.467317581176758,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
+ "grad_norm": 304.27875154546507,
753
  "learning_rate": 5.212833302556258e-09,
754
+ "logits/chosen": -2.5326547622680664,
755
+ "logits/rejected": -2.4754910469055176,
756
+ "logps/chosen": -242.94784545898438,
757
+ "logps/rejected": -187.10147094726562,
758
+ "loss": 0.2887,
759
+ "rewards/accuracies": 0.918749988079071,
760
+ "rewards/chosen": 4.133586406707764,
761
+ "rewards/margins": 9.706035614013672,
762
+ "rewards/rejected": -5.572449207305908,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
+ "grad_norm": 480.62571484354635,
768
  "learning_rate": 2.158697848236607e-09,
769
+ "logits/chosen": -2.419916868209839,
770
+ "logits/rejected": -2.4316189289093018,
771
+ "logps/chosen": -255.69723510742188,
772
+ "logps/rejected": -203.59130859375,
773
+ "loss": 0.4414,
774
  "rewards/accuracies": 0.893750011920929,
775
+ "rewards/chosen": 3.5629382133483887,
776
+ "rewards/margins": 8.139005661010742,
777
+ "rewards/rejected": -4.576067924499512,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
+ "grad_norm": 258.3454560672873,
783
  "learning_rate": 4.269029751107489e-10,
784
+ "logits/chosen": -2.416884660720825,
785
+ "logits/rejected": -2.3896336555480957,
786
+ "logps/chosen": -230.7972412109375,
787
+ "logps/rejected": -200.25830078125,
788
+ "loss": 0.2781,
789
+ "rewards/accuracies": 0.862500011920929,
790
+ "rewards/chosen": 3.390946865081787,
791
+ "rewards/margins": 9.046823501586914,
792
+ "rewards/rejected": -5.655877113342285,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
+ "train_loss": 0.4018711235732713,
800
+ "train_runtime": 7633.33,
801
+ "train_samples_per_second": 8.009,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],
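
The `log_history` entries in `trainer_state.json` follow a fixed schema: training steps carry `loss`, `grad_norm`, and batch reward statistics, while evaluation steps (100, 200, 300, 400 above) carry `eval_*` keys. A minimal sketch for pulling the evaluation curve out of a local copy of the file:

```python
# Extract the eval metrics logged every 100 steps from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

eval_rows = [entry for entry in state["log_history"] if "eval_loss" in entry]
for row in eval_rows:
    print(row["step"], row["eval_loss"], row["eval_rewards/accuracies"], row["eval_rewards/margins"])
```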