wzhouad committed on
Commit
418fa35
1 Parent(s): 8712cfb

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0712
21
- - Rewards/chosen: -2.3718
22
- - Rewards/rejected: -2.8225
23
  - Rewards/accuracies: 0.625
24
- - Rewards/margins: 0.4507
25
- - Logps/rejected: -539.6053
26
- - Logps/chosen: -494.2236
27
- - Logits/rejected: -2.2822
28
- - Logits/chosen: -2.3030
29
 
30
  ## Model description
31
 
@@ -47,7 +47,7 @@ The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
- - seed: 3
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
@@ -62,10 +62,10 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 0.0594 | 0.25 | 100 | 0.1035 | -1.7191 | -1.9450 | 0.6172 | 0.2259 | -451.8574 | -428.9503 | -2.3270 | -2.3408 |
66
- | 0.0329 | 0.49 | 200 | 0.0693 | -2.4492 | -2.8068 | 0.6094 | 0.3576 | -538.0304 | -501.9568 | -2.2147 | -2.2352 |
67
- | 0.0312 | 0.74 | 300 | 0.0689 | -2.4412 | -2.8616 | 0.6133 | 0.4204 | -543.5178 | -501.1634 | -2.2721 | -2.2933 |
68
- | 0.0331 | 0.99 | 400 | 0.0712 | -2.3718 | -2.8225 | 0.625 | 0.4507 | -539.6053 | -494.2236 | -2.2822 | -2.3030 |
69
 
70
 
71
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.0660
21
+ - Rewards/chosen: -2.5606
22
+ - Rewards/rejected: -2.9549
23
  - Rewards/accuracies: 0.625
24
+ - Rewards/margins: 0.3944
25
+ - Logps/rejected: -552.8470
26
+ - Logps/chosen: -513.0960
27
+ - Logits/rejected: -2.2459
28
+ - Logits/chosen: -2.2708
29
 
30
  ## Model description
31
 
 
47
  - learning_rate: 5e-07
48
  - train_batch_size: 8
49
  - eval_batch_size: 8
50
+ - seed: 4
51
  - distributed_type: multi-GPU
52
  - num_devices: 8
53
  - gradient_accumulation_steps: 2
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.0437 | 0.25 | 100 | 0.0824 | -2.2538 | -2.4741 | 0.5859 | 0.2203 | -504.7590 | -482.4154 | -2.3143 | -2.3260 |
66
+ | 0.0258 | 0.49 | 200 | 0.0581 | -2.8677 | -3.2192 | 0.5977 | 0.3515 | -579.2755 | -543.8072 | -2.1155 | -2.1394 |
67
+ | 0.0402 | 0.74 | 300 | 0.0837 | -2.0997 | -2.5006 | 0.6289 | 0.4009 | -507.4115 | -467.0057 | -2.2751 | -2.2980 |
68
+ | 0.0288 | 0.99 | 400 | 0.0660 | -2.5606 | -2.9549 | 0.625 | 0.3944 | -552.8470 | -513.0960 | -2.2459 | -2.2708 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.0722552685457983,
4
- "train_runtime": 3732.8792,
5
  "train_samples": 51894,
6
- "train_samples_per_second": 13.902,
7
  "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.07151281171374851,
4
+ "train_runtime": 3738.25,
5
  "train_samples": 51894,
6
+ "train_samples_per_second": 13.882,
7
  "train_steps_per_second": 0.108
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df5f8551f34bd5fa2c36c62a9e1e02db72d830f8b080213c70f6615d9f81b129
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb2741318d25fa010663fe61ed02f4f293fa8ad301934c24bbabaf6e60633fb3
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec4273703f96f7c3f2cf7aaa5e04be8cea024440c644d9e1a06f6ec8a234f06a
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f559ba79771ca5e5cdbae085a55b0de304927c43a4793b3f8234d1f33152354
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d1a5ffde550f3d000a899abb7c1f554363bee7053e537892516534d9b1b6cf9
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:901f2833dc2e0f3adb4f4bd18d3a372877da1018c193c779ed31f78d98f1f0a4
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.0722552685457983,
4
- "train_runtime": 3732.8792,
5
  "train_samples": 51894,
6
- "train_samples_per_second": 13.902,
7
  "train_steps_per_second": 0.108
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.07151281171374851,
4
+ "train_runtime": 3738.25,
5
  "train_samples": 51894,
6
+ "train_samples_per_second": 13.882,
7
  "train_steps_per_second": 0.108
8
  }
trainer_state.json CHANGED
@@ -11,11 +11,11 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.2195121951219512e-08,
14
- "logits/chosen": -2.8088459968566895,
15
- "logits/rejected": -2.7595884799957275,
16
- "logps/chosen": -368.90777587890625,
17
- "logps/rejected": -133.10202026367188,
18
- "loss": 0.3669,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -25,634 +25,634 @@
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.219512195121951e-07,
28
- "logits/chosen": -2.838677406311035,
29
- "logits/rejected": -2.8248190879821777,
30
- "logps/chosen": -433.822265625,
31
- "logps/rejected": -114.71543884277344,
32
- "loss": 0.3373,
33
- "rewards/accuracies": 0.5555555820465088,
34
- "rewards/chosen": 0.0010175479110330343,
35
- "rewards/margins": 0.0018583540804684162,
36
- "rewards/rejected": -0.0008408060530200601,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.05,
41
  "learning_rate": 2.439024390243902e-07,
42
- "logits/chosen": -2.798461437225342,
43
- "logits/rejected": -2.765454053878784,
44
- "logps/chosen": -436.7164001464844,
45
- "logps/rejected": -109.3239517211914,
46
- "loss": 0.3366,
47
- "rewards/accuracies": 0.71875,
48
- "rewards/chosen": 0.020252179354429245,
49
- "rewards/margins": 0.03614808991551399,
50
- "rewards/rejected": -0.015895914286375046,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.07,
55
  "learning_rate": 3.6585365853658536e-07,
56
- "logits/chosen": -2.7184653282165527,
57
- "logits/rejected": -2.6913540363311768,
58
- "logps/chosen": -422.36480712890625,
59
- "logps/rejected": -127.92415618896484,
60
- "loss": 0.3034,
61
- "rewards/accuracies": 0.78125,
62
- "rewards/chosen": 0.06996239721775055,
63
- "rewards/margins": 0.19669881463050842,
64
- "rewards/rejected": -0.12673643231391907,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.1,
69
  "learning_rate": 4.878048780487804e-07,
70
- "logits/chosen": -2.592528820037842,
71
- "logits/rejected": -2.5740997791290283,
72
- "logps/chosen": -396.34332275390625,
73
- "logps/rejected": -138.47140502929688,
74
- "loss": 0.2563,
75
- "rewards/accuracies": 0.768750011920929,
76
- "rewards/chosen": 0.023515433073043823,
77
- "rewards/margins": 0.41449323296546936,
78
- "rewards/rejected": -0.39097777009010315,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.12,
83
  "learning_rate": 4.992461696250783e-07,
84
- "logits/chosen": -2.425698757171631,
85
- "logits/rejected": -2.399880886077881,
86
- "logps/chosen": -445.71978759765625,
87
- "logps/rejected": -201.20761108398438,
88
- "loss": 0.1773,
89
- "rewards/accuracies": 0.800000011920929,
90
- "rewards/chosen": -0.034065067768096924,
91
- "rewards/margins": 0.8275578618049622,
92
- "rewards/rejected": -0.8616229295730591,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.15,
97
  "learning_rate": 4.966461721767899e-07,
98
- "logits/chosen": -2.4016242027282715,
99
- "logits/rejected": -2.3502964973449707,
100
- "logps/chosen": -424.775390625,
101
- "logps/rejected": -253.54776000976562,
102
- "loss": 0.1294,
103
- "rewards/accuracies": 0.75,
104
- "rewards/chosen": -0.3768869638442993,
105
- "rewards/margins": 0.9074532389640808,
106
- "rewards/rejected": -1.2843403816223145,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.17,
111
  "learning_rate": 4.922100518015975e-07,
112
- "logits/chosen": -2.43666410446167,
113
- "logits/rejected": -2.387927293777466,
114
- "logps/chosen": -420.531494140625,
115
- "logps/rejected": -273.5174255371094,
116
- "loss": 0.1116,
117
- "rewards/accuracies": 0.768750011920929,
118
- "rewards/chosen": -0.3666774034500122,
119
- "rewards/margins": 1.1816540956497192,
120
- "rewards/rejected": -1.548331618309021,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.2,
125
  "learning_rate": 4.859708325770919e-07,
126
- "logits/chosen": -2.37559175491333,
127
- "logits/rejected": -2.327603816986084,
128
- "logps/chosen": -472.6153259277344,
129
- "logps/rejected": -317.5882873535156,
130
- "loss": 0.0637,
131
- "rewards/accuracies": 0.7250000238418579,
132
- "rewards/chosen": -0.8155827522277832,
133
- "rewards/margins": 1.3035672903060913,
134
- "rewards/rejected": -2.119150161743164,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.22,
139
  "learning_rate": 4.779749614980225e-07,
140
- "logits/chosen": -2.3662772178649902,
141
- "logits/rejected": -2.3145246505737305,
142
- "logps/chosen": -546.580810546875,
143
- "logps/rejected": -391.6395263671875,
144
- "loss": 0.0501,
145
- "rewards/accuracies": 0.8374999761581421,
146
- "rewards/chosen": -0.7051855325698853,
147
- "rewards/margins": 1.912410020828247,
148
- "rewards/rejected": -2.617595672607422,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.25,
153
  "learning_rate": 4.682819627081427e-07,
154
- "logits/chosen": -2.3446455001831055,
155
- "logits/rejected": -2.278437852859497,
156
- "logps/chosen": -482.21063232421875,
157
- "logps/rejected": -363.7936096191406,
158
- "loss": 0.0594,
159
- "rewards/accuracies": 0.831250011920929,
160
- "rewards/chosen": -0.6942282915115356,
161
- "rewards/margins": 1.7591311931610107,
162
- "rewards/rejected": -2.4533591270446777,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.25,
167
- "eval_logits/chosen": -2.340813636779785,
168
- "eval_logits/rejected": -2.327035903930664,
169
- "eval_logps/chosen": -428.9503173828125,
170
- "eval_logps/rejected": -451.85736083984375,
171
- "eval_loss": 0.10351637005805969,
172
- "eval_rewards/accuracies": 0.6171875,
173
- "eval_rewards/chosen": -1.7191063165664673,
174
- "eval_rewards/margins": 0.22593416273593903,
175
- "eval_rewards/rejected": -1.9450405836105347,
176
- "eval_runtime": 53.3665,
177
- "eval_samples_per_second": 37.477,
178
  "eval_steps_per_second": 0.6,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.27,
183
  "learning_rate": 4.569639943810477e-07,
184
- "logits/chosen": -2.3267300128936768,
185
- "logits/rejected": -2.256336212158203,
186
- "logps/chosen": -502.18572998046875,
187
- "logps/rejected": -387.1337890625,
188
- "loss": 0.0472,
189
- "rewards/accuracies": 0.768750011920929,
190
- "rewards/chosen": -0.9502252340316772,
191
- "rewards/margins": 1.751552939414978,
192
- "rewards/rejected": -2.7017781734466553,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.3,
197
  "learning_rate": 4.4410531154874543e-07,
198
- "logits/chosen": -2.3445639610290527,
199
- "logits/rejected": -2.2553389072418213,
200
- "logps/chosen": -552.4199829101562,
201
- "logps/rejected": -416.80755615234375,
202
- "loss": 0.0477,
203
- "rewards/accuracies": 0.78125,
204
- "rewards/chosen": -1.0058166980743408,
205
- "rewards/margins": 1.8569440841674805,
206
- "rewards/rejected": -2.8627610206604004,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.32,
211
  "learning_rate": 4.298016388768561e-07,
212
- "logits/chosen": -2.396329641342163,
213
- "logits/rejected": -2.322551727294922,
214
- "logps/chosen": -542.0057373046875,
215
- "logps/rejected": -407.68634033203125,
216
- "loss": 0.0418,
217
- "rewards/accuracies": 0.8187500238418579,
218
- "rewards/chosen": -0.768031895160675,
219
- "rewards/margins": 2.077030658721924,
220
- "rewards/rejected": -2.845062017440796,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.35,
225
  "learning_rate": 4.1415945805573005e-07,
226
- "logits/chosen": -2.3263237476348877,
227
- "logits/rejected": -2.2574667930603027,
228
- "logps/chosen": -506.77471923828125,
229
- "logps/rejected": -388.97479248046875,
230
- "loss": 0.0506,
231
- "rewards/accuracies": 0.8125,
232
- "rewards/chosen": -0.8580313920974731,
233
- "rewards/margins": 1.7057987451553345,
234
- "rewards/rejected": -2.5638298988342285,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.37,
239
  "learning_rate": 3.972952151123984e-07,
240
- "logits/chosen": -2.3322761058807373,
241
- "logits/rejected": -2.2486355304718018,
242
- "logps/chosen": -450.03778076171875,
243
- "logps/rejected": -351.47064208984375,
244
- "loss": 0.0528,
245
- "rewards/accuracies": 0.824999988079071,
246
- "rewards/chosen": -0.7531972527503967,
247
- "rewards/margins": 1.7522554397583008,
248
- "rewards/rejected": -2.505452871322632,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.39,
253
  "learning_rate": 3.793344535444142e-07,
254
- "logits/chosen": -2.298706531524658,
255
- "logits/rejected": -2.205777168273926,
256
- "logps/chosen": -549.6655883789062,
257
- "logps/rejected": -407.4877624511719,
258
- "loss": 0.0361,
259
- "rewards/accuracies": 0.8187500238418579,
260
- "rewards/chosen": -0.8248310089111328,
261
- "rewards/margins": 2.1384449005126953,
262
- "rewards/rejected": -2.963275909423828,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.42,
267
  "learning_rate": 3.604108797288461e-07,
268
- "logits/chosen": -2.301478862762451,
269
- "logits/rejected": -2.199977397918701,
270
- "logps/chosen": -550.0228271484375,
271
- "logps/rejected": -447.4345703125,
272
- "loss": 0.0349,
273
- "rewards/accuracies": 0.831250011920929,
274
- "rewards/chosen": -1.1104724407196045,
275
- "rewards/margins": 2.2591710090637207,
276
- "rewards/rejected": -3.369643449783325,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.44,
281
  "learning_rate": 3.40665367563858e-07,
282
- "logits/chosen": -2.2790443897247314,
283
- "logits/rejected": -2.1830639839172363,
284
- "logps/chosen": -540.7822265625,
285
- "logps/rejected": -438.80816650390625,
286
- "loss": 0.0358,
287
- "rewards/accuracies": 0.7875000238418579,
288
- "rewards/chosen": -1.3068325519561768,
289
- "rewards/margins": 1.9258372783660889,
290
- "rewards/rejected": -3.2326698303222656,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.47,
295
  "learning_rate": 3.202449097526798e-07,
296
- "logits/chosen": -2.2940845489501953,
297
- "logits/rejected": -2.213531732559204,
298
- "logps/chosen": -518.0568237304688,
299
- "logps/rejected": -424.33331298828125,
300
- "loss": 0.0358,
301
- "rewards/accuracies": 0.800000011920929,
302
- "rewards/chosen": -1.1591523885726929,
303
- "rewards/margins": 2.0107340812683105,
304
- "rewards/rejected": -3.169886350631714,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.49,
309
  "learning_rate": 2.993015235369905e-07,
310
- "logits/chosen": -2.2501273155212402,
311
- "logits/rejected": -2.1389498710632324,
312
- "logps/chosen": -568.6901245117188,
313
- "logps/rejected": -470.89617919921875,
314
- "loss": 0.0329,
315
- "rewards/accuracies": 0.8062499761581421,
316
- "rewards/chosen": -1.2941691875457764,
317
- "rewards/margins": 2.236302375793457,
318
- "rewards/rejected": -3.5304713249206543,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.49,
323
- "eval_logits/chosen": -2.2352473735809326,
324
- "eval_logits/rejected": -2.214733362197876,
325
- "eval_logps/chosen": -501.9567565917969,
326
- "eval_logps/rejected": -538.0303955078125,
327
- "eval_loss": 0.06932022422552109,
328
- "eval_rewards/accuracies": 0.609375,
329
- "eval_rewards/chosen": -2.449171304702759,
330
- "eval_rewards/margins": 0.35759952664375305,
331
- "eval_rewards/rejected": -2.8067705631256104,
332
- "eval_runtime": 53.3061,
333
- "eval_samples_per_second": 37.519,
334
- "eval_steps_per_second": 0.6,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.52,
339
  "learning_rate": 2.7799111902582693e-07,
340
- "logits/chosen": -2.2516720294952393,
341
- "logits/rejected": -2.1468265056610107,
342
- "logps/chosen": -544.9647216796875,
343
- "logps/rejected": -425.84832763671875,
344
- "loss": 0.0319,
345
- "rewards/accuracies": 0.731249988079071,
346
- "rewards/chosen": -1.4447880983352661,
347
- "rewards/margins": 1.7926721572875977,
348
- "rewards/rejected": -3.2374606132507324,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.54,
353
  "learning_rate": 2.564723385445869e-07,
354
- "logits/chosen": -2.325510025024414,
355
- "logits/rejected": -2.2458481788635254,
356
- "logps/chosen": -532.0316772460938,
357
- "logps/rejected": -426.2433166503906,
358
- "loss": 0.0441,
359
- "rewards/accuracies": 0.8125,
360
- "rewards/chosen": -1.1441152095794678,
361
- "rewards/margins": 1.8752161264419556,
362
- "rewards/rejected": -3.019331455230713,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.57,
367
  "learning_rate": 2.3490537564442845e-07,
368
- "logits/chosen": -2.3061037063598633,
369
- "logits/rejected": -2.2063522338867188,
370
- "logps/chosen": -515.2584228515625,
371
- "logps/rejected": -387.2288818359375,
372
- "loss": 0.0536,
373
- "rewards/accuracies": 0.7749999761581421,
374
- "rewards/chosen": -1.2331289052963257,
375
- "rewards/margins": 1.573769211769104,
376
- "rewards/rejected": -2.806898355484009,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.59,
381
  "learning_rate": 2.1345078256378801e-07,
382
- "logits/chosen": -2.3259823322296143,
383
- "logits/rejected": -2.232604503631592,
384
- "logps/chosen": -529.44775390625,
385
- "logps/rejected": -442.9454040527344,
386
- "loss": 0.0384,
387
- "rewards/accuracies": 0.768750011920929,
388
- "rewards/chosen": -1.2063531875610352,
389
- "rewards/margins": 2.0420820713043213,
390
- "rewards/rejected": -3.2484352588653564,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.62,
395
  "learning_rate": 1.9226827501969865e-07,
396
- "logits/chosen": -2.310181140899658,
397
- "logits/rejected": -2.225755214691162,
398
- "logps/chosen": -569.6714477539062,
399
- "logps/rejected": -482.9613342285156,
400
- "loss": 0.0368,
401
- "rewards/accuracies": 0.8374999761581421,
402
- "rewards/chosen": -1.2699750661849976,
403
- "rewards/margins": 2.3776299953460693,
404
- "rewards/rejected": -3.6476047039031982,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.64,
409
  "learning_rate": 1.715155432264775e-07,
410
- "logits/chosen": -2.3007090091705322,
411
- "logits/rejected": -2.2159204483032227,
412
- "logps/chosen": -574.6656494140625,
413
- "logps/rejected": -473.60528564453125,
414
- "loss": 0.0275,
415
- "rewards/accuracies": 0.824999988079071,
416
- "rewards/chosen": -1.4263044595718384,
417
- "rewards/margins": 2.146233081817627,
418
- "rewards/rejected": -3.572537660598755,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.67,
423
  "learning_rate": 1.51347077992983e-07,
424
- "logits/chosen": -2.280165195465088,
425
- "logits/rejected": -2.1988308429718018,
426
- "logps/chosen": -573.0145874023438,
427
- "logps/rejected": -490.4935607910156,
428
- "loss": 0.024,
429
- "rewards/accuracies": 0.8187500238418579,
430
- "rewards/chosen": -1.6931577920913696,
431
- "rewards/margins": 1.988318681716919,
432
- "rewards/rejected": -3.68147611618042,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.69,
437
  "learning_rate": 1.3191302063739906e-07,
438
- "logits/chosen": -2.247427463531494,
439
- "logits/rejected": -2.1717417240142822,
440
- "logps/chosen": -552.9573364257812,
441
- "logps/rejected": -480.90435791015625,
442
- "loss": 0.0231,
443
- "rewards/accuracies": 0.7562500238418579,
444
- "rewards/chosen": -1.7376149892807007,
445
- "rewards/margins": 1.9405027627944946,
446
- "rewards/rejected": -3.678117275238037,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.72,
451
  "learning_rate": 1.1335804528119475e-07,
452
- "logits/chosen": -2.3430678844451904,
453
- "logits/rejected": -2.2265610694885254,
454
- "logps/chosen": -586.9962158203125,
455
- "logps/rejected": -472.01611328125,
456
- "loss": 0.0285,
457
- "rewards/accuracies": 0.7749999761581421,
458
- "rewards/chosen": -1.5123710632324219,
459
- "rewards/margins": 2.2006583213806152,
460
- "rewards/rejected": -3.713029384613037,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.74,
465
  "learning_rate": 9.582028184286423e-08,
466
- "logits/chosen": -2.2495548725128174,
467
- "logits/rejected": -2.186642646789551,
468
- "logps/chosen": -531.0364990234375,
469
- "logps/rejected": -480.0726623535156,
470
- "loss": 0.0312,
471
- "rewards/accuracies": 0.7437499761581421,
472
- "rewards/chosen": -1.7118114233016968,
473
- "rewards/margins": 1.8730456829071045,
474
- "rewards/rejected": -3.58485746383667,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.74,
479
- "eval_logits/chosen": -2.2933216094970703,
480
- "eval_logits/rejected": -2.2721123695373535,
481
- "eval_logps/chosen": -501.1633605957031,
482
- "eval_logps/rejected": -543.5177612304688,
483
- "eval_loss": 0.06885366886854172,
484
- "eval_rewards/accuracies": 0.61328125,
485
- "eval_rewards/chosen": -2.441237449645996,
486
- "eval_rewards/margins": 0.42040756344795227,
487
- "eval_rewards/rejected": -2.861644983291626,
488
- "eval_runtime": 53.2903,
489
- "eval_samples_per_second": 37.53,
490
  "eval_steps_per_second": 0.6,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.76,
495
  "learning_rate": 7.943028774907065e-08,
496
- "logits/chosen": -2.2719688415527344,
497
- "logits/rejected": -2.1988675594329834,
498
- "logps/chosen": -524.6929931640625,
499
- "logps/rejected": -446.8042907714844,
500
- "loss": 0.0349,
501
- "rewards/accuracies": 0.793749988079071,
502
- "rewards/chosen": -1.4022165536880493,
503
- "rewards/margins": 1.8806768655776978,
504
- "rewards/rejected": -3.282893419265747,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.79,
509
  "learning_rate": 6.431007601814637e-08,
510
- "logits/chosen": -2.2960824966430664,
511
- "logits/rejected": -2.2386252880096436,
512
- "logps/chosen": -477.001953125,
513
- "logps/rejected": -436.0245666503906,
514
- "loss": 0.0298,
515
- "rewards/accuracies": 0.8374999761581421,
516
- "rewards/chosen": -1.4929635524749756,
517
- "rewards/margins": 1.7944204807281494,
518
- "rewards/rejected": -3.287383556365967,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.81,
523
  "learning_rate": 5.0572206951246e-08,
524
- "logits/chosen": -2.277937650680542,
525
- "logits/rejected": -2.1940300464630127,
526
- "logps/chosen": -516.416015625,
527
- "logps/rejected": -444.90032958984375,
528
- "loss": 0.0329,
529
- "rewards/accuracies": 0.7749999761581421,
530
- "rewards/chosen": -1.4886820316314697,
531
- "rewards/margins": 1.8972896337509155,
532
- "rewards/rejected": -3.385971784591675,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.84,
537
  "learning_rate": 3.831895019292897e-08,
538
- "logits/chosen": -2.3472743034362793,
539
- "logits/rejected": -2.266993999481201,
540
- "logps/chosen": -560.1998291015625,
541
- "logps/rejected": -486.14801025390625,
542
- "loss": 0.0324,
543
- "rewards/accuracies": 0.800000011920929,
544
- "rewards/chosen": -1.25786554813385,
545
- "rewards/margins": 2.4262924194335938,
546
- "rewards/rejected": -3.6841578483581543,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.86,
551
  "learning_rate": 2.764152339909756e-08,
552
- "logits/chosen": -2.2894670963287354,
553
- "logits/rejected": -2.2070441246032715,
554
- "logps/chosen": -551.2086181640625,
555
- "logps/rejected": -415.3118591308594,
556
- "loss": 0.0328,
557
- "rewards/accuracies": 0.793749988079071,
558
- "rewards/chosen": -1.2593928575515747,
559
- "rewards/margins": 1.9064128398895264,
560
- "rewards/rejected": -3.1658055782318115,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.89,
565
  "learning_rate": 1.861941317991664e-08,
566
- "logits/chosen": -2.3396449089050293,
567
- "logits/rejected": -2.227651834487915,
568
- "logps/chosen": -571.0888061523438,
569
- "logps/rejected": -453.03277587890625,
570
- "loss": 0.0325,
571
- "rewards/accuracies": 0.8374999761581421,
572
- "rewards/chosen": -1.14793860912323,
573
- "rewards/margins": 2.2367420196533203,
574
- "rewards/rejected": -3.3846805095672607,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.91,
579
  "learning_rate": 1.13197833728636e-08,
580
- "logits/chosen": -2.2972564697265625,
581
- "logits/rejected": -2.215446710586548,
582
- "logps/chosen": -527.4664306640625,
583
- "logps/rejected": -465.6924743652344,
584
- "loss": 0.0288,
585
- "rewards/accuracies": 0.824999988079071,
586
- "rewards/chosen": -1.247899055480957,
587
- "rewards/margins": 2.289482355117798,
588
- "rewards/rejected": -3.537381410598755,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.94,
593
  "learning_rate": 5.79697505093521e-09,
594
- "logits/chosen": -2.293482542037964,
595
- "logits/rejected": -2.2097363471984863,
596
- "logps/chosen": -540.6966552734375,
597
- "logps/rejected": -439.814697265625,
598
- "loss": 0.0375,
599
- "rewards/accuracies": 0.768750011920929,
600
- "rewards/chosen": -1.383996605873108,
601
- "rewards/margins": 1.9607197046279907,
602
- "rewards/rejected": -3.3447163105010986,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.96,
607
  "learning_rate": 2.092101988131256e-09,
608
- "logits/chosen": -2.346567153930664,
609
- "logits/rejected": -2.220730781555176,
610
- "logps/chosen": -575.7041625976562,
611
- "logps/rejected": -463.69427490234375,
612
- "loss": 0.0315,
613
- "rewards/accuracies": 0.8687499761581421,
614
- "rewards/chosen": -1.1256561279296875,
615
- "rewards/margins": 2.420063018798828,
616
- "rewards/rejected": -3.5457186698913574,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.99,
621
  "learning_rate": 2.327445937151673e-10,
622
- "logits/chosen": -2.3339614868164062,
623
- "logits/rejected": -2.2517640590667725,
624
- "logps/chosen": -568.7457275390625,
625
- "logps/rejected": -479.13653564453125,
626
- "loss": 0.0331,
627
- "rewards/accuracies": 0.831250011920929,
628
- "rewards/chosen": -1.2012748718261719,
629
- "rewards/margins": 2.3051795959472656,
630
- "rewards/rejected": -3.5064544677734375,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.99,
635
- "eval_logits/chosen": -2.3029849529266357,
636
- "eval_logits/rejected": -2.282188892364502,
637
- "eval_logps/chosen": -494.22357177734375,
638
- "eval_logps/rejected": -539.6053466796875,
639
- "eval_loss": 0.07123579829931259,
640
  "eval_rewards/accuracies": 0.625,
641
- "eval_rewards/chosen": -2.3718395233154297,
642
- "eval_rewards/margins": 0.45068085193634033,
643
- "eval_rewards/rejected": -2.8225200176239014,
644
- "eval_runtime": 53.2767,
645
- "eval_samples_per_second": 37.54,
646
- "eval_steps_per_second": 0.601,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 1.0,
651
  "step": 405,
652
  "total_flos": 0.0,
653
- "train_loss": 0.0722552685457983,
654
- "train_runtime": 3732.8792,
655
- "train_samples_per_second": 13.902,
656
  "train_steps_per_second": 0.108
657
  }
658
  ],
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.2195121951219512e-08,
14
+ "logits/chosen": -2.8681135177612305,
15
+ "logits/rejected": -2.8858838081359863,
16
+ "logps/chosen": -518.1907958984375,
17
+ "logps/rejected": -109.31971740722656,
18
+ "loss": 0.3475,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.219512195121951e-07,
28
+ "logits/chosen": -2.7986178398132324,
29
+ "logits/rejected": -2.752176284790039,
30
+ "logps/chosen": -434.208251953125,
31
+ "logps/rejected": -114.19618225097656,
32
+ "loss": 0.3394,
33
+ "rewards/accuracies": 0.4930555522441864,
34
+ "rewards/chosen": 0.0002649651141837239,
35
+ "rewards/margins": 0.0009347840095870197,
36
+ "rewards/rejected": -0.0006698188371956348,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.05,
41
  "learning_rate": 2.439024390243902e-07,
42
+ "logits/chosen": -2.8215415477752686,
43
+ "logits/rejected": -2.7983882427215576,
44
+ "logps/chosen": -417.2633361816406,
45
+ "logps/rejected": -118.0062026977539,
46
+ "loss": 0.3373,
47
+ "rewards/accuracies": 0.7250000238418579,
48
+ "rewards/chosen": 0.019945567473769188,
49
+ "rewards/margins": 0.03575458750128746,
50
+ "rewards/rejected": -0.015809018164873123,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.07,
55
  "learning_rate": 3.6585365853658536e-07,
56
+ "logits/chosen": -2.6574862003326416,
57
+ "logits/rejected": -2.6451315879821777,
58
+ "logps/chosen": -398.87353515625,
59
+ "logps/rejected": -125.69970703125,
60
+ "loss": 0.3045,
61
+ "rewards/accuracies": 0.7562500238418579,
62
+ "rewards/chosen": 0.07569055259227753,
63
+ "rewards/margins": 0.19884702563285828,
64
+ "rewards/rejected": -0.12315647304058075,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.1,
69
  "learning_rate": 4.878048780487804e-07,
70
+ "logits/chosen": -2.54256272315979,
71
+ "logits/rejected": -2.5281729698181152,
72
+ "logps/chosen": -384.5321044921875,
73
+ "logps/rejected": -168.55758666992188,
74
+ "loss": 0.2564,
75
+ "rewards/accuracies": 0.737500011920929,
76
+ "rewards/chosen": 0.024631643667817116,
77
+ "rewards/margins": 0.41851943731307983,
78
+ "rewards/rejected": -0.39388787746429443,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.12,
83
  "learning_rate": 4.992461696250783e-07,
84
+ "logits/chosen": -2.4257261753082275,
85
+ "logits/rejected": -2.3928446769714355,
86
+ "logps/chosen": -436.45330810546875,
87
+ "logps/rejected": -219.0617218017578,
88
+ "loss": 0.1809,
89
+ "rewards/accuracies": 0.7749999761581421,
90
+ "rewards/chosen": -0.1671580970287323,
91
+ "rewards/margins": 0.7879143953323364,
92
+ "rewards/rejected": -0.9550724029541016,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.15,
97
  "learning_rate": 4.966461721767899e-07,
98
+ "logits/chosen": -2.3805835247039795,
99
+ "logits/rejected": -2.3364853858947754,
100
+ "logps/chosen": -437.4466857910156,
101
+ "logps/rejected": -240.6685791015625,
102
+ "loss": 0.1377,
103
+ "rewards/accuracies": 0.768750011920929,
104
+ "rewards/chosen": -0.32454290986061096,
105
+ "rewards/margins": 0.9316140413284302,
106
+ "rewards/rejected": -1.2561569213867188,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.17,
111
  "learning_rate": 4.922100518015975e-07,
112
+ "logits/chosen": -2.3752458095550537,
113
+ "logits/rejected": -2.3281030654907227,
114
+ "logps/chosen": -419.6747131347656,
115
+ "logps/rejected": -264.75787353515625,
116
+ "loss": 0.103,
117
+ "rewards/accuracies": 0.7562500238418579,
118
+ "rewards/chosen": -0.3937300443649292,
119
+ "rewards/margins": 1.1917842626571655,
120
+ "rewards/rejected": -1.5855143070220947,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.2,
125
  "learning_rate": 4.859708325770919e-07,
126
+ "logits/chosen": -2.4320530891418457,
127
+ "logits/rejected": -2.3738579750061035,
128
+ "logps/chosen": -472.10479736328125,
129
+ "logps/rejected": -330.32403564453125,
130
+ "loss": 0.0674,
131
+ "rewards/accuracies": 0.8187500238418579,
132
+ "rewards/chosen": -0.5765678286552429,
133
+ "rewards/margins": 1.5421369075775146,
134
+ "rewards/rejected": -2.1187047958374023,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.22,
139
  "learning_rate": 4.779749614980225e-07,
140
+ "logits/chosen": -2.3991949558258057,
141
+ "logits/rejected": -2.357053279876709,
142
+ "logps/chosen": -487.83074951171875,
143
+ "logps/rejected": -349.1925354003906,
144
+ "loss": 0.0553,
145
+ "rewards/accuracies": 0.8062499761581421,
146
+ "rewards/chosen": -0.666537880897522,
147
+ "rewards/margins": 1.7182201147079468,
148
+ "rewards/rejected": -2.3847577571868896,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.25,
153
  "learning_rate": 4.682819627081427e-07,
154
+ "logits/chosen": -2.3752927780151367,
155
+ "logits/rejected": -2.326216220855713,
156
+ "logps/chosen": -515.1549682617188,
157
+ "logps/rejected": -378.8877868652344,
158
+ "loss": 0.0437,
159
+ "rewards/accuracies": 0.862500011920929,
160
+ "rewards/chosen": -0.6667075157165527,
161
+ "rewards/margins": 2.000246524810791,
162
+ "rewards/rejected": -2.666954278945923,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.25,
167
+ "eval_logits/chosen": -2.3259778022766113,
168
+ "eval_logits/rejected": -2.314302682876587,
169
+ "eval_logps/chosen": -482.4153747558594,
170
+ "eval_logps/rejected": -504.759033203125,
171
+ "eval_loss": 0.08243285864591599,
172
+ "eval_rewards/accuracies": 0.5859375,
173
+ "eval_rewards/chosen": -2.2537574768066406,
174
+ "eval_rewards/margins": 0.22029951214790344,
175
+ "eval_rewards/rejected": -2.4740567207336426,
176
+ "eval_runtime": 53.3582,
177
+ "eval_samples_per_second": 37.483,
178
  "eval_steps_per_second": 0.6,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.27,
183
  "learning_rate": 4.569639943810477e-07,
184
+ "logits/chosen": -2.3144371509552,
185
+ "logits/rejected": -2.2340025901794434,
186
+ "logps/chosen": -490.12921142578125,
187
+ "logps/rejected": -419.07867431640625,
188
+ "loss": 0.0359,
189
+ "rewards/accuracies": 0.737500011920929,
190
+ "rewards/chosen": -1.3260681629180908,
191
+ "rewards/margins": 1.7610738277435303,
192
+ "rewards/rejected": -3.0871422290802,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.3,
197
  "learning_rate": 4.4410531154874543e-07,
198
+ "logits/chosen": -2.3716444969177246,
199
+ "logits/rejected": -2.3235533237457275,
200
+ "logps/chosen": -466.01702880859375,
201
+ "logps/rejected": -356.8735046386719,
202
+ "loss": 0.0575,
203
+ "rewards/accuracies": 0.7875000238418579,
204
+ "rewards/chosen": -0.7180399298667908,
205
+ "rewards/margins": 1.6505486965179443,
206
+ "rewards/rejected": -2.368588924407959,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.32,
211
  "learning_rate": 4.298016388768561e-07,
212
+ "logits/chosen": -2.3074584007263184,
213
+ "logits/rejected": -2.257930040359497,
214
+ "logps/chosen": -472.1845703125,
215
+ "logps/rejected": -373.66522216796875,
216
+ "loss": 0.0498,
217
+ "rewards/accuracies": 0.768750011920929,
218
+ "rewards/chosen": -1.0088322162628174,
219
+ "rewards/margins": 1.6445964574813843,
220
+ "rewards/rejected": -2.653428792953491,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.35,
225
  "learning_rate": 4.1415945805573005e-07,
226
+ "logits/chosen": -2.225804328918457,
227
+ "logits/rejected": -2.15400767326355,
228
+ "logps/chosen": -534.1700439453125,
229
+ "logps/rejected": -430.3104553222656,
230
+ "loss": 0.0361,
231
+ "rewards/accuracies": 0.8500000238418579,
232
+ "rewards/chosen": -1.042690634727478,
233
+ "rewards/margins": 2.092653751373291,
234
+ "rewards/rejected": -3.1353445053100586,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.37,
239
  "learning_rate": 3.972952151123984e-07,
240
+ "logits/chosen": -2.2562787532806396,
241
+ "logits/rejected": -2.164506673812866,
242
+ "logps/chosen": -522.7659912109375,
243
+ "logps/rejected": -425.18109130859375,
244
+ "loss": 0.0344,
245
+ "rewards/accuracies": 0.8500000238418579,
246
+ "rewards/chosen": -1.1226718425750732,
247
+ "rewards/margins": 2.0036892890930176,
248
+ "rewards/rejected": -3.12636137008667,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.39,
253
  "learning_rate": 3.793344535444142e-07,
254
+ "logits/chosen": -2.267565965652466,
255
+ "logits/rejected": -2.1969974040985107,
256
+ "logps/chosen": -530.3189086914062,
257
+ "logps/rejected": -426.72332763671875,
258
+ "loss": 0.0393,
259
+ "rewards/accuracies": 0.768750011920929,
260
+ "rewards/chosen": -1.2129985094070435,
261
+ "rewards/margins": 1.8231168985366821,
262
+ "rewards/rejected": -3.0361156463623047,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.42,
267
  "learning_rate": 3.604108797288461e-07,
268
+ "logits/chosen": -2.237342119216919,
269
+ "logits/rejected": -2.1961898803710938,
270
+ "logps/chosen": -448.13812255859375,
271
+ "logps/rejected": -372.9068298339844,
272
+ "loss": 0.0465,
273
+ "rewards/accuracies": 0.768750011920929,
274
+ "rewards/chosen": -1.092370629310608,
275
+ "rewards/margins": 1.5985119342803955,
276
+ "rewards/rejected": -2.690882444381714,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.44,
281
  "learning_rate": 3.40665367563858e-07,
282
+ "logits/chosen": -2.2571911811828613,
283
+ "logits/rejected": -2.140353202819824,
284
+ "logps/chosen": -548.1529541015625,
285
+ "logps/rejected": -449.4532165527344,
286
+ "loss": 0.035,
287
+ "rewards/accuracies": 0.8187500238418579,
288
+ "rewards/chosen": -0.8807584643363953,
289
+ "rewards/margins": 2.355053663253784,
290
+ "rewards/rejected": -3.2358124256134033,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.47,
295
  "learning_rate": 3.202449097526798e-07,
296
+ "logits/chosen": -2.1954236030578613,
297
+ "logits/rejected": -2.113832950592041,
298
+ "logps/chosen": -545.7277221679688,
299
+ "logps/rejected": -466.76580810546875,
300
+ "loss": 0.029,
301
+ "rewards/accuracies": 0.768750011920929,
302
+ "rewards/chosen": -1.4056795835494995,
303
+ "rewards/margins": 2.1022555828094482,
304
+ "rewards/rejected": -3.5079357624053955,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.49,
309
  "learning_rate": 2.993015235369905e-07,
310
+ "logits/chosen": -2.1386027336120605,
311
+ "logits/rejected": -2.0572166442871094,
312
+ "logps/chosen": -560.2534790039062,
313
+ "logps/rejected": -491.8816833496094,
314
+ "loss": 0.0258,
315
+ "rewards/accuracies": 0.7437499761581421,
316
+ "rewards/chosen": -1.810624361038208,
317
+ "rewards/margins": 1.9691530466079712,
318
+ "rewards/rejected": -3.7797775268554688,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.49,
323
+ "eval_logits/chosen": -2.1394448280334473,
324
+ "eval_logits/rejected": -2.1155476570129395,
325
+ "eval_logps/chosen": -543.8071899414062,
326
+ "eval_logps/rejected": -579.2755126953125,
327
+ "eval_loss": 0.0581156425178051,
328
+ "eval_rewards/accuracies": 0.59765625,
329
+ "eval_rewards/chosen": -2.86767578125,
330
+ "eval_rewards/margins": 0.35154610872268677,
331
+ "eval_rewards/rejected": -3.219222068786621,
332
+ "eval_runtime": 53.2701,
333
+ "eval_samples_per_second": 37.545,
334
+ "eval_steps_per_second": 0.601,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.52,
339
  "learning_rate": 2.7799111902582693e-07,
340
+ "logits/chosen": -2.1782305240631104,
341
+ "logits/rejected": -2.044674873352051,
342
+ "logps/chosen": -579.908935546875,
343
+ "logps/rejected": -500.6641540527344,
344
+ "loss": 0.0219,
345
+ "rewards/accuracies": 0.8187500238418579,
346
+ "rewards/chosen": -1.408044695854187,
347
+ "rewards/margins": 2.4992563724517822,
348
+ "rewards/rejected": -3.9073009490966797,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.54,
353
  "learning_rate": 2.564723385445869e-07,
354
+ "logits/chosen": -2.2589755058288574,
355
+ "logits/rejected": -2.156228542327881,
356
+ "logps/chosen": -563.1976318359375,
357
+ "logps/rejected": -475.75030517578125,
358
+ "loss": 0.038,
359
+ "rewards/accuracies": 0.762499988079071,
360
+ "rewards/chosen": -1.3078866004943848,
361
+ "rewards/margins": 2.1681323051452637,
362
+ "rewards/rejected": -3.4760184288024902,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.57,
367
  "learning_rate": 2.3490537564442845e-07,
368
+ "logits/chosen": -2.2288191318511963,
369
+ "logits/rejected": -2.136579751968384,
370
+ "logps/chosen": -507.54632568359375,
371
+ "logps/rejected": -419.88470458984375,
372
+ "loss": 0.0432,
373
+ "rewards/accuracies": 0.737500011920929,
374
+ "rewards/chosen": -1.2000774145126343,
375
+ "rewards/margins": 1.7510545253753662,
376
+ "rewards/rejected": -2.951131820678711,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.59,
381
  "learning_rate": 2.1345078256378801e-07,
382
+ "logits/chosen": -2.282217264175415,
383
+ "logits/rejected": -2.1927459239959717,
384
+ "logps/chosen": -539.92822265625,
385
+ "logps/rejected": -433.8241271972656,
386
+ "loss": 0.0373,
387
+ "rewards/accuracies": 0.7749999761581421,
388
+ "rewards/chosen": -1.2784963846206665,
389
+ "rewards/margins": 1.8950881958007812,
390
+ "rewards/rejected": -3.1735846996307373,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.62,
395
  "learning_rate": 1.9226827501969865e-07,
396
+ "logits/chosen": -2.2803092002868652,
397
+ "logits/rejected": -2.1990160942077637,
398
+ "logps/chosen": -537.9136962890625,
399
+ "logps/rejected": -442.28350830078125,
400
+ "loss": 0.04,
401
+ "rewards/accuracies": 0.800000011920929,
402
+ "rewards/chosen": -1.2320274114608765,
403
+ "rewards/margins": 2.1027939319610596,
404
+ "rewards/rejected": -3.3348212242126465,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.64,
409
  "learning_rate": 1.715155432264775e-07,
410
+ "logits/chosen": -2.2646355628967285,
411
+ "logits/rejected": -2.14613676071167,
412
+ "logps/chosen": -502.49664306640625,
413
+ "logps/rejected": -420.11004638671875,
414
+ "loss": 0.0396,
415
+ "rewards/accuracies": 0.800000011920929,
416
+ "rewards/chosen": -1.1264328956604004,
417
+ "rewards/margins": 2.006878614425659,
418
+ "rewards/rejected": -3.1333117485046387,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.67,
423
  "learning_rate": 1.51347077992983e-07,
424
+ "logits/chosen": -2.3088698387145996,
425
+ "logits/rejected": -2.2018628120422363,
426
+ "logps/chosen": -554.0256958007812,
427
+ "logps/rejected": -421.2101135253906,
428
+ "loss": 0.0375,
429
+ "rewards/accuracies": 0.831250011920929,
430
+ "rewards/chosen": -1.10258948802948,
431
+ "rewards/margins": 1.9626919031143188,
432
+ "rewards/rejected": -3.065281391143799,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.69,
437
  "learning_rate": 1.3191302063739906e-07,
438
+ "logits/chosen": -2.310133457183838,
439
+ "logits/rejected": -2.216827392578125,
440
+ "logps/chosen": -522.3606567382812,
441
+ "logps/rejected": -438.058349609375,
442
+ "loss": 0.043,
443
+ "rewards/accuracies": 0.800000011920929,
444
+ "rewards/chosen": -1.1809624433517456,
445
+ "rewards/margins": 1.9291051626205444,
446
+ "rewards/rejected": -3.110067844390869,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.72,
451
  "learning_rate": 1.1335804528119475e-07,
452
+ "logits/chosen": -2.3108785152435303,
453
+ "logits/rejected": -2.2141172885894775,
454
+ "logps/chosen": -544.7510986328125,
455
+ "logps/rejected": -427.60150146484375,
456
+ "loss": 0.044,
457
+ "rewards/accuracies": 0.831250011920929,
458
+ "rewards/chosen": -1.00501549243927,
459
+ "rewards/margins": 2.146829605102539,
460
+ "rewards/rejected": -3.1518452167510986,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.74,
465
  "learning_rate": 9.582028184286423e-08,
466
+ "logits/chosen": -2.350487470626831,
467
+ "logits/rejected": -2.307096481323242,
468
+ "logps/chosen": -554.42529296875,
469
+ "logps/rejected": -470.14434814453125,
470
+ "loss": 0.0402,
471
+ "rewards/accuracies": 0.8062499761581421,
472
+ "rewards/chosen": -1.1662867069244385,
473
+ "rewards/margins": 2.156501531600952,
474
+ "rewards/rejected": -3.3227882385253906,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.74,
479
+ "eval_logits/chosen": -2.2979543209075928,
480
+ "eval_logits/rejected": -2.2751243114471436,
481
+ "eval_logps/chosen": -467.0057067871094,
482
+ "eval_logps/rejected": -507.4114685058594,
483
+ "eval_loss": 0.08367828279733658,
484
+ "eval_rewards/accuracies": 0.62890625,
485
+ "eval_rewards/chosen": -2.099660634994507,
486
+ "eval_rewards/margins": 0.4009218215942383,
487
+ "eval_rewards/rejected": -2.500582218170166,
488
+ "eval_runtime": 53.3734,
489
+ "eval_samples_per_second": 37.472,
490
  "eval_steps_per_second": 0.6,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.76,
495
  "learning_rate": 7.943028774907065e-08,
496
+ "logits/chosen": -2.316253185272217,
497
+ "logits/rejected": -2.209606170654297,
498
+ "logps/chosen": -524.6145629882812,
499
+ "logps/rejected": -420.94671630859375,
500
+ "loss": 0.0324,
501
+ "rewards/accuracies": 0.768750011920929,
502
+ "rewards/chosen": -1.0568145513534546,
503
+ "rewards/margins": 2.0644707679748535,
504
+ "rewards/rejected": -3.1212852001190186,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.79,
509
  "learning_rate": 6.431007601814637e-08,
510
+ "logits/chosen": -2.2733869552612305,
511
+ "logits/rejected": -2.169506549835205,
512
+ "logps/chosen": -532.5906982421875,
513
+ "logps/rejected": -450.932373046875,
514
+ "loss": 0.0316,
515
+ "rewards/accuracies": 0.762499988079071,
516
+ "rewards/chosen": -1.5221502780914307,
517
+ "rewards/margins": 1.9199508428573608,
518
+ "rewards/rejected": -3.442101001739502,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.81,
523
  "learning_rate": 5.0572206951246e-08,
524
+ "logits/chosen": -2.290539503097534,
525
+ "logits/rejected": -2.193920850753784,
526
+ "logps/chosen": -562.043701171875,
527
+ "logps/rejected": -479.5208435058594,
528
+ "loss": 0.0278,
529
+ "rewards/accuracies": 0.793749988079071,
530
+ "rewards/chosen": -1.3767458200454712,
531
+ "rewards/margins": 2.1372973918914795,
532
+ "rewards/rejected": -3.514043092727661,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.84,
537
  "learning_rate": 3.831895019292897e-08,
538
+ "logits/chosen": -2.3263535499572754,
539
+ "logits/rejected": -2.207899570465088,
540
+ "logps/chosen": -619.2625122070312,
541
+ "logps/rejected": -520.6148071289062,
542
+ "loss": 0.0305,
543
+ "rewards/accuracies": 0.7562500238418579,
544
+ "rewards/chosen": -1.5666420459747314,
545
+ "rewards/margins": 2.300938844680786,
546
+ "rewards/rejected": -3.8675804138183594,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.86,
551
  "learning_rate": 2.764152339909756e-08,
552
+ "logits/chosen": -2.305875539779663,
553
+ "logits/rejected": -2.1924188137054443,
554
+ "logps/chosen": -568.1319580078125,
555
+ "logps/rejected": -475.6539611816406,
556
+ "loss": 0.0245,
557
+ "rewards/accuracies": 0.8062499761581421,
558
+ "rewards/chosen": -1.341552972793579,
559
+ "rewards/margins": 2.3198726177215576,
560
+ "rewards/rejected": -3.661425828933716,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.89,
565
  "learning_rate": 1.861941317991664e-08,
566
+ "logits/chosen": -2.31453800201416,
567
+ "logits/rejected": -2.209552049636841,
568
+ "logps/chosen": -574.0198974609375,
569
+ "logps/rejected": -498.809326171875,
570
+ "loss": 0.0246,
571
+ "rewards/accuracies": 0.8187500238418579,
572
+ "rewards/chosen": -1.3858083486557007,
573
+ "rewards/margins": 2.422987699508667,
574
+ "rewards/rejected": -3.80879545211792,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.91,
579
  "learning_rate": 1.13197833728636e-08,
580
+ "logits/chosen": -2.2876980304718018,
581
+ "logits/rejected": -2.1881823539733887,
582
+ "logps/chosen": -583.4609985351562,
583
+ "logps/rejected": -515.4216918945312,
584
+ "loss": 0.0274,
585
+ "rewards/accuracies": 0.8125,
586
+ "rewards/chosen": -1.2713569402694702,
587
+ "rewards/margins": 2.548645496368408,
588
+ "rewards/rejected": -3.820002317428589,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.94,
593
  "learning_rate": 5.79697505093521e-09,
594
+ "logits/chosen": -2.2938995361328125,
595
+ "logits/rejected": -2.161371946334839,
596
+ "logps/chosen": -567.2229614257812,
597
+ "logps/rejected": -493.6429138183594,
598
+ "loss": 0.0339,
599
+ "rewards/accuracies": 0.8187500238418579,
600
+ "rewards/chosen": -1.495012879371643,
601
+ "rewards/margins": 2.173337697982788,
602
+ "rewards/rejected": -3.6683506965637207,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.96,
607
  "learning_rate": 2.092101988131256e-09,
608
+ "logits/chosen": -2.3137059211730957,
609
+ "logits/rejected": -2.1986048221588135,
610
+ "logps/chosen": -600.1227416992188,
611
+ "logps/rejected": -496.6559143066406,
612
+ "loss": 0.0258,
613
+ "rewards/accuracies": 0.7875000238418579,
614
+ "rewards/chosen": -1.368238925933838,
615
+ "rewards/margins": 2.388805389404297,
616
+ "rewards/rejected": -3.7570443153381348,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.99,
621
  "learning_rate": 2.327445937151673e-10,
622
+ "logits/chosen": -2.316849708557129,
623
+ "logits/rejected": -2.1959729194641113,
624
+ "logps/chosen": -559.0263061523438,
625
+ "logps/rejected": -482.11773681640625,
626
+ "loss": 0.0288,
627
+ "rewards/accuracies": 0.78125,
628
+ "rewards/chosen": -1.659148931503296,
629
+ "rewards/margins": 2.0086750984191895,
630
+ "rewards/rejected": -3.6678237915039062,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.99,
635
+ "eval_logits/chosen": -2.2708253860473633,
636
+ "eval_logits/rejected": -2.245922565460205,
637
+ "eval_logps/chosen": -513.0960083007812,
638
+ "eval_logps/rejected": -552.8470458984375,
639
+ "eval_loss": 0.06599809229373932,
640
  "eval_rewards/accuracies": 0.625,
641
+ "eval_rewards/chosen": -2.560563564300537,
642
+ "eval_rewards/margins": 0.3943747282028198,
643
+ "eval_rewards/rejected": -2.9549384117126465,
644
+ "eval_runtime": 53.3482,
645
+ "eval_samples_per_second": 37.49,
646
+ "eval_steps_per_second": 0.6,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 1.0,
651
  "step": 405,
652
  "total_flos": 0.0,
653
+ "train_loss": 0.07151281171374851,
654
+ "train_runtime": 3738.25,
655
+ "train_samples_per_second": 13.882,
656
  "train_steps_per_second": 0.108
657
  }
658
  ],