Minbyul committed on
Commit
91fa69c
1 Parent(s): 3251ecb

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.2889
21
- - Rewards/chosen: -0.9091
22
- - Rewards/rejected: -3.8737
23
- - Rewards/accuracies: 0.8250
24
- - Rewards/margins: 2.9646
25
- - Logps/rejected: -1290.0824
26
- - Logps/chosen: -656.5975
27
- - Logits/rejected: -2.9198
28
- - Logits/chosen: -3.1186
29
 
30
  ## Model description
31
 
@@ -62,7 +62,7 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.1851 | 0.53 | 100 | 0.2889 | -0.9091 | -3.8737 | 0.8250 | 2.9646 | -1290.0824 | -656.5975 | -2.9198 | -3.1186 |
66
 
67
 
68
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.2503
21
+ - Rewards/chosen: -1.6026
22
+ - Rewards/rejected: -5.5026
23
+ - Rewards/accuracies: 0.8313
24
+ - Rewards/margins: 3.9001
25
+ - Logps/rejected: -1452.9772
26
+ - Logps/chosen: -725.9427
27
+ - Logits/rejected: -2.7934
28
+ - Logits/chosen: -3.0544
29
 
30
  ## Model description
31
 
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.1645 | 0.53 | 100 | 0.2503 | -1.6026 | -5.5026 | 0.8313 | 3.9001 | -1452.9772 | -725.9427 | -2.7934 | -3.0544 |
66
 
67
 
68
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.27576306701343967,
4
- "train_runtime": 3082.9663,
5
  "train_samples": 11996,
6
- "train_samples_per_second": 3.891,
7
- "train_steps_per_second": 0.061
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.2699868052719749,
4
+ "train_runtime": 2833.2764,
5
  "train_samples": 11996,
6
+ "train_samples_per_second": 4.234,
7
+ "train_steps_per_second": 0.066
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:267f7e96c3a254a09f752a3256ac512b2e248b9cfe7fa3017005c580906977d2
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eca7a7e2d10da3fc65162096af6021270b1b5deeecc1f58bb27f04941da24365
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:493c8e696fb4bdba9978b1cb053a743fe75c8954129d62aaec43baaa987cd934
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78281ee40099c3707606330f350cdb85ecb558b99e3d78e52e60a371a73a9d7f
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfc8ff93ae161def1d5859043c3dc04f83ca0519c89c4a8b0f846dd7572f64e8
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b45bed137d92aa903ceaed4637df045bd7b336959148997009a053d184c9ea4
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.27576306701343967,
4
- "train_runtime": 3082.9663,
5
  "train_samples": 11996,
6
- "train_samples_per_second": 3.891,
7
- "train_steps_per_second": 0.061
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.2699868052719749,
4
+ "train_runtime": 2833.2764,
5
  "train_samples": 11996,
6
+ "train_samples_per_second": 4.234,
7
+ "train_steps_per_second": 0.066
8
  }
trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 19.110474755913092,
14
  "learning_rate": 2.6315789473684208e-08,
15
  "logits/chosen": -2.964515209197998,
16
  "logits/rejected": -2.865140914916992,
@@ -25,298 +25,298 @@
25
  },
26
  {
27
  "epoch": 0.05,
28
- "grad_norm": 18.87772208802828,
29
  "learning_rate": 2.631578947368421e-07,
30
- "logits/chosen": -2.773491859436035,
31
- "logits/rejected": -2.7407619953155518,
32
- "logps/chosen": -604.6580810546875,
33
- "logps/rejected": -1056.211181640625,
34
- "loss": 0.6925,
35
- "rewards/accuracies": 0.5069444179534912,
36
- "rewards/chosen": 0.0016368491342291236,
37
- "rewards/margins": 0.001945263589732349,
38
- "rewards/rejected": -0.00030841471743769944,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.11,
43
- "grad_norm": 19.583298204247857,
44
  "learning_rate": 4.999562902281866e-07,
45
- "logits/chosen": -2.7963502407073975,
46
- "logits/rejected": -2.827116012573242,
47
- "logps/chosen": -571.3228759765625,
48
- "logps/rejected": -971.4885864257812,
49
- "loss": 0.675,
50
- "rewards/accuracies": 0.706250011920929,
51
- "rewards/chosen": 0.03118445910513401,
52
- "rewards/margins": 0.03232298418879509,
53
- "rewards/rejected": -0.0011385272955521941,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.16,
58
- "grad_norm": 23.803330324477223,
59
  "learning_rate": 4.947295864744121e-07,
60
- "logits/chosen": -2.8588290214538574,
61
- "logits/rejected": -2.8853542804718018,
62
- "logps/chosen": -529.5823974609375,
63
- "logps/rejected": -1093.4755859375,
64
- "loss": 0.6299,
65
  "rewards/accuracies": 0.7749999761581421,
66
- "rewards/chosen": 0.04894017428159714,
67
- "rewards/margins": 0.15211351215839386,
68
- "rewards/rejected": -0.10317333787679672,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.21,
73
- "grad_norm": 73.67534704558324,
74
  "learning_rate": 4.809698831278217e-07,
75
- "logits/chosen": -3.104024648666382,
76
- "logits/rejected": -3.1031734943389893,
77
- "logps/chosen": -630.6378173828125,
78
- "logps/rejected": -1098.404541015625,
79
- "loss": 0.5082,
80
  "rewards/accuracies": 0.7437499761581421,
81
- "rewards/chosen": -0.45488986372947693,
82
- "rewards/margins": 0.6105720400810242,
83
- "rewards/rejected": -1.0654619932174683,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.27,
88
- "grad_norm": 70.45752193577601,
89
  "learning_rate": 4.591569405016049e-07,
90
- "logits/chosen": -3.1391983032226562,
91
- "logits/rejected": -3.341365098953247,
92
- "logps/chosen": -616.8588256835938,
93
- "logps/rejected": -1329.1591796875,
94
- "loss": 0.3004,
95
  "rewards/accuracies": 0.856249988079071,
96
- "rewards/chosen": -0.7416905164718628,
97
- "rewards/margins": 2.4845588207244873,
98
- "rewards/rejected": -3.2262492179870605,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.32,
103
- "grad_norm": 39.13638850853986,
104
  "learning_rate": 4.3005131163403164e-07,
105
- "logits/chosen": -3.2234439849853516,
106
- "logits/rejected": -3.387814998626709,
107
- "logps/chosen": -613.5189208984375,
108
- "logps/rejected": -1566.4166259765625,
109
- "loss": 0.2494,
110
- "rewards/accuracies": 0.925000011920929,
111
- "rewards/chosen": -0.7437312006950378,
112
- "rewards/margins": 4.0797834396362305,
113
- "rewards/rejected": -4.823514938354492,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.37,
118
- "grad_norm": 64.03359322882075,
119
  "learning_rate": 3.946678240449515e-07,
120
- "logits/chosen": -2.9871840476989746,
121
- "logits/rejected": -3.163975715637207,
122
- "logps/chosen": -617.3074340820312,
123
- "logps/rejected": -1532.427978515625,
124
- "loss": 0.2231,
125
- "rewards/accuracies": 0.90625,
126
- "rewards/chosen": -0.8076885938644409,
127
- "rewards/margins": 4.495209693908691,
128
- "rewards/rejected": -5.302898406982422,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.43,
133
- "grad_norm": 30.339221409943534,
134
  "learning_rate": 3.5424019569033206e-07,
135
- "logits/chosen": -2.9922754764556885,
136
- "logits/rejected": -3.0162343978881836,
137
- "logps/chosen": -707.5257568359375,
138
- "logps/rejected": -1704.016357421875,
139
- "loss": 0.2364,
140
  "rewards/accuracies": 0.9312499761581421,
141
- "rewards/chosen": -1.1326261758804321,
142
- "rewards/margins": 5.4823126792907715,
143
- "rewards/rejected": -6.614939212799072,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.48,
148
- "grad_norm": 29.449732213039997,
149
  "learning_rate": 3.1017801885224326e-07,
150
- "logits/chosen": -3.0152523517608643,
151
- "logits/rejected": -3.0497183799743652,
152
- "logps/chosen": -653.091552734375,
153
- "logps/rejected": -1524.4569091796875,
154
- "loss": 0.2151,
155
- "rewards/accuracies": 0.918749988079071,
156
- "rewards/chosen": -0.900044322013855,
157
- "rewards/margins": 4.4186201095581055,
158
- "rewards/rejected": -5.318665027618408,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.53,
163
- "grad_norm": 30.099525435462873,
164
  "learning_rate": 2.640176118092979e-07,
165
- "logits/chosen": -2.930925130844116,
166
- "logits/rejected": -3.0645267963409424,
167
- "logps/chosen": -727.9111938476562,
168
- "logps/rejected": -1594.2391357421875,
169
- "loss": 0.1851,
170
- "rewards/accuracies": 0.8812500238418579,
171
- "rewards/chosen": -0.967258632183075,
172
- "rewards/margins": 4.204586505889893,
173
- "rewards/rejected": -5.171844482421875,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.53,
178
- "eval_logits/chosen": -3.1186375617980957,
179
- "eval_logits/rejected": -2.9198145866394043,
180
- "eval_logps/chosen": -656.5974731445312,
181
- "eval_logps/rejected": -1290.0823974609375,
182
- "eval_loss": 0.288867712020874,
183
- "eval_rewards/accuracies": 0.824999988079071,
184
- "eval_rewards/chosen": -0.9091285467147827,
185
- "eval_rewards/margins": 2.96455717086792,
186
- "eval_rewards/rejected": -3.873685836791992,
187
- "eval_runtime": 78.3784,
188
- "eval_samples_per_second": 7.859,
189
- "eval_steps_per_second": 0.255,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.59,
194
- "grad_norm": 44.578131376508864,
195
  "learning_rate": 2.1736845194498716e-07,
196
- "logits/chosen": -2.9603259563446045,
197
- "logits/rejected": -3.03521466255188,
198
- "logps/chosen": -594.0872802734375,
199
- "logps/rejected": -1612.428466796875,
200
- "loss": 0.1691,
201
  "rewards/accuracies": 0.925000011920929,
202
- "rewards/chosen": -0.8532097935676575,
203
- "rewards/margins": 5.7642388343811035,
204
- "rewards/rejected": -6.6174492835998535,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.64,
209
- "grad_norm": 45.452576155560585,
210
  "learning_rate": 1.718570580135889e-07,
211
- "logits/chosen": -2.975057363510132,
212
- "logits/rejected": -3.041111707687378,
213
- "logps/chosen": -633.195068359375,
214
- "logps/rejected": -1821.9937744140625,
215
- "loss": 0.1416,
216
- "rewards/accuracies": 0.9312499761581421,
217
- "rewards/chosen": -1.0681055784225464,
218
- "rewards/margins": 6.503140926361084,
219
- "rewards/rejected": -7.571246147155762,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.69,
224
- "grad_norm": 44.49014668597494,
225
  "learning_rate": 1.2907027822369005e-07,
226
- "logits/chosen": -2.963229179382324,
227
- "logits/rejected": -3.067150831222534,
228
- "logps/chosen": -721.2030029296875,
229
- "logps/rejected": -1949.7958984375,
230
- "loss": 0.1515,
231
- "rewards/accuracies": 0.918749988079071,
232
- "rewards/chosen": -1.216491937637329,
233
- "rewards/margins": 7.717434883117676,
234
- "rewards/rejected": -8.93392562866211,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.75,
239
- "grad_norm": 24.238718650060004,
240
  "learning_rate": 9.049996151674788e-08,
241
- "logits/chosen": -3.040541172027588,
242
- "logits/rejected": -3.041283130645752,
243
- "logps/chosen": -626.3173828125,
244
- "logps/rejected": -1778.029541015625,
245
- "loss": 0.185,
246
- "rewards/accuracies": 0.9312499761581421,
247
- "rewards/chosen": -0.9689780473709106,
248
- "rewards/margins": 6.321756362915039,
249
- "rewards/rejected": -7.29073429107666,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.8,
254
- "grad_norm": 25.38260287029375,
255
  "learning_rate": 5.74909411901843e-08,
256
- "logits/chosen": -2.9344351291656494,
257
- "logits/rejected": -2.9482829570770264,
258
- "logps/chosen": -602.518798828125,
259
- "logps/rejected": -1676.466796875,
260
- "loss": 0.141,
261
- "rewards/accuracies": 0.96875,
262
- "rewards/chosen": -0.8014429807662964,
263
- "rewards/margins": 5.64138126373291,
264
- "rewards/rejected": -6.442823886871338,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.85,
269
- "grad_norm": 37.72257002210603,
270
  "learning_rate": 3.119414452281158e-08,
271
- "logits/chosen": -2.971004009246826,
272
- "logits/rejected": -3.0024333000183105,
273
- "logps/chosen": -657.559326171875,
274
- "logps/rejected": -1856.759033203125,
275
- "loss": 0.1321,
276
- "rewards/accuracies": 0.9437500238418579,
277
- "rewards/chosen": -0.8327785730361938,
278
- "rewards/margins": 7.157062530517578,
279
- "rewards/rejected": -7.989840507507324,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.91,
284
- "grad_norm": 23.531414397024523,
285
  "learning_rate": 1.2526463331788501e-08,
286
- "logits/chosen": -3.06162428855896,
287
- "logits/rejected": -2.9571709632873535,
288
- "logps/chosen": -637.3997192382812,
289
- "logps/rejected": -1728.625732421875,
290
- "loss": 0.133,
291
  "rewards/accuracies": 0.9624999761581421,
292
- "rewards/chosen": -0.9682890772819519,
293
- "rewards/margins": 6.160744667053223,
294
- "rewards/rejected": -7.129033088684082,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.96,
299
- "grad_norm": 29.678702291223654,
300
  "learning_rate": 2.1387846565474044e-09,
301
- "logits/chosen": -3.0236430168151855,
302
- "logits/rejected": -2.9661448001861572,
303
- "logps/chosen": -605.8822021484375,
304
- "logps/rejected": -1746.495849609375,
305
- "loss": 0.1363,
306
- "rewards/accuracies": 0.9312499761581421,
307
- "rewards/chosen": -1.0128748416900635,
308
- "rewards/margins": 6.32363224029541,
309
- "rewards/rejected": -7.3365068435668945,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 1.0,
314
  "step": 187,
315
  "total_flos": 0.0,
316
- "train_loss": 0.27576306701343967,
317
- "train_runtime": 3082.9663,
318
- "train_samples_per_second": 3.891,
319
- "train_steps_per_second": 0.061
320
  }
321
  ],
322
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 19.109572167610484,
14
  "learning_rate": 2.6315789473684208e-08,
15
  "logits/chosen": -2.964515209197998,
16
  "logits/rejected": -2.865140914916992,
 
25
  },
26
  {
27
  "epoch": 0.05,
28
+ "grad_norm": 18.895223645335697,
29
  "learning_rate": 2.631578947368421e-07,
30
+ "logits/chosen": -2.7736825942993164,
31
+ "logits/rejected": -2.7408108711242676,
32
+ "logps/chosen": -604.7006225585938,
33
+ "logps/rejected": -1056.1942138671875,
34
+ "loss": 0.6926,
35
+ "rewards/accuracies": 0.5416666865348816,
36
+ "rewards/chosen": 0.0012125401990488172,
37
+ "rewards/margins": 0.001352548599243164,
38
+ "rewards/rejected": -0.00014000837109051645,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.11,
43
+ "grad_norm": 19.562748691217283,
44
  "learning_rate": 4.999562902281866e-07,
45
+ "logits/chosen": -2.7962822914123535,
46
+ "logits/rejected": -2.8271851539611816,
47
+ "logps/chosen": -571.3375854492188,
48
+ "logps/rejected": -971.5126953125,
49
+ "loss": 0.6749,
50
+ "rewards/accuracies": 0.7124999761581421,
51
+ "rewards/chosen": 0.03103743866086006,
52
+ "rewards/margins": 0.03241748735308647,
53
+ "rewards/rejected": -0.0013800484593957663,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.16,
58
+ "grad_norm": 23.57935669375875,
59
  "learning_rate": 4.947295864744121e-07,
60
+ "logits/chosen": -2.859532117843628,
61
+ "logits/rejected": -2.8859381675720215,
62
+ "logps/chosen": -529.7252197265625,
63
+ "logps/rejected": -1093.7412109375,
64
+ "loss": 0.6296,
65
  "rewards/accuracies": 0.7749999761581421,
66
+ "rewards/chosen": 0.047512348741292953,
67
+ "rewards/margins": 0.15334269404411316,
68
+ "rewards/rejected": -0.10583032667636871,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.21,
73
+ "grad_norm": 74.03794269111636,
74
  "learning_rate": 4.809698831278217e-07,
75
+ "logits/chosen": -3.1058590412139893,
76
+ "logits/rejected": -3.105548143386841,
77
+ "logps/chosen": -631.2692260742188,
78
+ "logps/rejected": -1100.1131591796875,
79
+ "loss": 0.5067,
80
  "rewards/accuracies": 0.7437499761581421,
81
+ "rewards/chosen": -0.4612053334712982,
82
+ "rewards/margins": 0.6213432550430298,
83
+ "rewards/rejected": -1.0825484991073608,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.27,
88
+ "grad_norm": 80.08928437177174,
89
  "learning_rate": 4.591569405016049e-07,
90
+ "logits/chosen": -3.1383297443389893,
91
+ "logits/rejected": -3.338413953781128,
92
+ "logps/chosen": -614.7294921875,
93
+ "logps/rejected": -1324.274658203125,
94
+ "loss": 0.3007,
95
  "rewards/accuracies": 0.856249988079071,
96
+ "rewards/chosen": -0.7203965187072754,
97
+ "rewards/margins": 2.4570107460021973,
98
+ "rewards/rejected": -3.1774070262908936,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.32,
103
+ "grad_norm": 52.8412534701194,
104
  "learning_rate": 4.3005131163403164e-07,
105
+ "logits/chosen": -3.232844829559326,
106
+ "logits/rejected": -3.4020397663116455,
107
+ "logps/chosen": -607.4974365234375,
108
+ "logps/rejected": -1571.42578125,
109
+ "loss": 0.2467,
110
+ "rewards/accuracies": 0.918749988079071,
111
+ "rewards/chosen": -0.6835159063339233,
112
+ "rewards/margins": 4.190090656280518,
113
+ "rewards/rejected": -4.8736066818237305,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.37,
118
+ "grad_norm": 45.803944170508274,
119
  "learning_rate": 3.946678240449515e-07,
120
+ "logits/chosen": -3.016165256500244,
121
+ "logits/rejected": -3.2087910175323486,
122
+ "logps/chosen": -602.6742553710938,
123
+ "logps/rejected": -1499.858154296875,
124
+ "loss": 0.2227,
125
+ "rewards/accuracies": 0.893750011920929,
126
+ "rewards/chosen": -0.6613572239875793,
127
+ "rewards/margins": 4.315842628479004,
128
+ "rewards/rejected": -4.977200031280518,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.43,
133
+ "grad_norm": 33.74568647416123,
134
  "learning_rate": 3.5424019569033206e-07,
135
+ "logits/chosen": -2.980517864227295,
136
+ "logits/rejected": -2.997511863708496,
137
+ "logps/chosen": -698.8486328125,
138
+ "logps/rejected": -1709.7763671875,
139
+ "loss": 0.2216,
140
  "rewards/accuracies": 0.9312499761581421,
141
+ "rewards/chosen": -1.0458548069000244,
142
+ "rewards/margins": 5.626683712005615,
143
+ "rewards/rejected": -6.672537803649902,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.48,
148
+ "grad_norm": 32.76518067019826,
149
  "learning_rate": 3.1017801885224326e-07,
150
+ "logits/chosen": -3.0111451148986816,
151
+ "logits/rejected": -3.0090878009796143,
152
+ "logps/chosen": -650.3148193359375,
153
+ "logps/rejected": -1498.55419921875,
154
+ "loss": 0.2021,
155
+ "rewards/accuracies": 0.90625,
156
+ "rewards/chosen": -0.8722761869430542,
157
+ "rewards/margins": 4.187361717224121,
158
+ "rewards/rejected": -5.059638023376465,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.53,
163
+ "grad_norm": 64.24324243411806,
164
  "learning_rate": 2.640176118092979e-07,
165
+ "logits/chosen": -2.9020493030548096,
166
+ "logits/rejected": -2.935757875442505,
167
+ "logps/chosen": -751.5125732421875,
168
+ "logps/rejected": -1689.5228271484375,
169
+ "loss": 0.1645,
170
+ "rewards/accuracies": 0.893750011920929,
171
+ "rewards/chosen": -1.2032721042633057,
172
+ "rewards/margins": 4.921408653259277,
173
+ "rewards/rejected": -6.124680995941162,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.53,
178
+ "eval_logits/chosen": -3.0544369220733643,
179
+ "eval_logits/rejected": -2.793405294418335,
180
+ "eval_logps/chosen": -725.9426879882812,
181
+ "eval_logps/rejected": -1452.9771728515625,
182
+ "eval_loss": 0.25031739473342896,
183
+ "eval_rewards/accuracies": 0.831250011920929,
184
+ "eval_rewards/chosen": -1.6025804281234741,
185
+ "eval_rewards/margins": 3.9000518321990967,
186
+ "eval_rewards/rejected": -5.502632141113281,
187
+ "eval_runtime": 65.7537,
188
+ "eval_samples_per_second": 9.368,
189
+ "eval_steps_per_second": 0.304,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.59,
194
+ "grad_norm": 41.59873680369454,
195
  "learning_rate": 2.1736845194498716e-07,
196
+ "logits/chosen": -2.9784274101257324,
197
+ "logits/rejected": -2.980086088180542,
198
+ "logps/chosen": -600.6064453125,
199
+ "logps/rejected": -1670.901611328125,
200
+ "loss": 0.1595,
201
  "rewards/accuracies": 0.925000011920929,
202
+ "rewards/chosen": -0.918400764465332,
203
+ "rewards/margins": 6.283780574798584,
204
+ "rewards/rejected": -7.202181339263916,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.64,
209
+ "grad_norm": 28.23680644032835,
210
  "learning_rate": 1.718570580135889e-07,
211
+ "logits/chosen": -3.0252156257629395,
212
+ "logits/rejected": -3.080897569656372,
213
+ "logps/chosen": -611.710693359375,
214
+ "logps/rejected": -1694.8226318359375,
215
+ "loss": 0.1391,
216
+ "rewards/accuracies": 0.9375,
217
+ "rewards/chosen": -0.8532626032829285,
218
+ "rewards/margins": 5.446272850036621,
219
+ "rewards/rejected": -6.299535751342773,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.69,
224
+ "grad_norm": 40.906944468121836,
225
  "learning_rate": 1.2907027822369005e-07,
226
+ "logits/chosen": -2.9933369159698486,
227
+ "logits/rejected": -3.124406576156616,
228
+ "logps/chosen": -700.328125,
229
+ "logps/rejected": -1804.997802734375,
230
+ "loss": 0.1477,
231
+ "rewards/accuracies": 0.925000011920929,
232
+ "rewards/chosen": -1.007743000984192,
233
+ "rewards/margins": 6.478204250335693,
234
+ "rewards/rejected": -7.485948085784912,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.75,
239
+ "grad_norm": 22.754078194499957,
240
  "learning_rate": 9.049996151674788e-08,
241
+ "logits/chosen": -3.086073875427246,
242
+ "logits/rejected": -3.1164612770080566,
243
+ "logps/chosen": -631.7467651367188,
244
+ "logps/rejected": -1740.2171630859375,
245
+ "loss": 0.1821,
246
+ "rewards/accuracies": 0.9125000238418579,
247
+ "rewards/chosen": -1.0232716798782349,
248
+ "rewards/margins": 5.889337539672852,
249
+ "rewards/rejected": -6.912609100341797,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.8,
254
+ "grad_norm": 20.144359719952234,
255
  "learning_rate": 5.74909411901843e-08,
256
+ "logits/chosen": -2.9675424098968506,
257
+ "logits/rejected": -2.990185499191284,
258
+ "logps/chosen": -617.1038818359375,
259
+ "logps/rejected": -1656.051513671875,
260
+ "loss": 0.1413,
261
+ "rewards/accuracies": 0.949999988079071,
262
+ "rewards/chosen": -0.9472934603691101,
263
+ "rewards/margins": 5.291378974914551,
264
+ "rewards/rejected": -6.238672733306885,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.85,
269
+ "grad_norm": 26.642508471840806,
270
  "learning_rate": 3.119414452281158e-08,
271
+ "logits/chosen": -2.9869649410247803,
272
+ "logits/rejected": -3.0431644916534424,
273
+ "logps/chosen": -662.4171142578125,
274
+ "logps/rejected": -1831.9390869140625,
275
+ "loss": 0.1189,
276
+ "rewards/accuracies": 0.949999988079071,
277
+ "rewards/chosen": -0.8813556432723999,
278
+ "rewards/margins": 6.860285758972168,
279
+ "rewards/rejected": -7.741641044616699,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.91,
284
+ "grad_norm": 18.842250875900756,
285
  "learning_rate": 1.2526463331788501e-08,
286
+ "logits/chosen": -3.083080291748047,
287
+ "logits/rejected": -2.9783942699432373,
288
+ "logps/chosen": -638.3408203125,
289
+ "logps/rejected": -1725.673583984375,
290
+ "loss": 0.1265,
291
  "rewards/accuracies": 0.9624999761581421,
292
+ "rewards/chosen": -0.9777008891105652,
293
+ "rewards/margins": 6.12181282043457,
294
+ "rewards/rejected": -7.099513053894043,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.96,
299
+ "grad_norm": 34.250119439829845,
300
  "learning_rate": 2.1387846565474044e-09,
301
+ "logits/chosen": -3.0460267066955566,
302
+ "logits/rejected": -2.9695019721984863,
303
+ "logps/chosen": -608.745849609375,
304
+ "logps/rejected": -1744.884521484375,
305
+ "loss": 0.1257,
306
+ "rewards/accuracies": 0.9437500238418579,
307
+ "rewards/chosen": -1.041512131690979,
308
+ "rewards/margins": 6.2788825035095215,
309
+ "rewards/rejected": -7.320394992828369,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 1.0,
314
  "step": 187,
315
  "total_flos": 0.0,
316
+ "train_loss": 0.2699868052719749,
317
+ "train_runtime": 2833.2764,
318
+ "train_samples_per_second": 4.234,
319
+ "train_steps_per_second": 0.066
320
  }
321
  ],
322
  "logging_steps": 10,