wzhouad committed
Commit 96104a8
1 Parent(s): 1598ea5

Model save
README.md CHANGED
@@ -16,16 +16,6 @@ should probably proofread and complete it, then remove this comment. -->
 # zephyr-7b-dpo-full
 
 This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on the None dataset.
-It achieves the following results on the evaluation set:
-- Loss: 0.0270
-- Rewards/chosen: -1.1958
-- Rewards/rejected: -1.8757
-- Rewards/accuracies: 0.7266
-- Rewards/margins: 0.6799
-- Logps/rejected: -444.9223
-- Logps/chosen: -376.6192
-- Logits/rejected: -2.4111
-- Logits/chosen: -2.4251
 
 ## Model description
 
@@ -47,7 +37,7 @@ The following hyperparameters were used during training:
 - learning_rate: 5e-07
 - train_batch_size: 8
 - eval_batch_size: 8
-- seed: 5
+- seed: 42
 - distributed_type: multi-GPU
 - num_devices: 8
 - gradient_accumulation_steps: 2
@@ -60,12 +50,6 @@ The following hyperparameters were used during training:
 
 ### Training results
 
-| Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
-|:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
-| 0.0517 | 0.21 | 100 | 0.0427 | -0.6331 | -1.0207 | 0.7031 | 0.3876 | -359.4206 | -320.3455 | -2.7320 | -2.7523 |
-| 0.0305 | 0.42 | 200 | 0.0297 | -1.0902 | -1.6743 | 0.7227 | 0.5842 | -424.7881 | -366.0565 | -2.5649 | -2.5797 |
-| 0.0258 | 0.63 | 300 | 0.0274 | -1.2031 | -1.8719 | 0.7188 | 0.6688 | -444.5428 | -377.3524 | -2.4247 | -2.4384 |
-| 0.0234 | 0.84 | 400 | 0.0270 | -1.1958 | -1.8757 | 0.7266 | 0.6799 | -444.9223 | -376.6192 | -2.4111 | -2.4251 |
 
 
 ### Framework versions
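For readers skimming the hyperparameter hunk above: the effective global batch size follows from the per-device batch size, the number of GPUs, and gradient accumulation. A minimal sketch of that arithmetic, using only the values listed in the README (nothing else is assumed):

```python
# Effective global batch size implied by the README hyperparameters:
# per-device train batch size 8, 8 GPUs, gradient accumulation of 2 steps.
train_batch_size = 8             # per device
num_devices = 8                  # multi-GPU run with 8 processes
gradient_accumulation_steps = 2

effective_batch_size = train_batch_size * num_devices * gradient_accumulation_steps
print(effective_batch_size)      # 128 preference pairs per optimizer step
```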
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.04001489771085803,
-    "train_runtime": 4360.6473,
-    "train_samples": 61134,
-    "train_samples_per_second": 14.019,
-    "train_steps_per_second": 0.11
+    "train_loss": 0.3564033879424041,
+    "train_runtime": 425.1442,
+    "train_samples": 6750,
+    "train_samples_per_second": 15.877,
+    "train_steps_per_second": 0.125
 }
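The new run's throughput numbers are internally consistent, and together with the batch-size arithmetic above they imply the 53-step run recorded in trainer_state.json further down. A quick check, a sketch using only values shown in the updated all_results.json:

```python
import math

# Values from the updated all_results.json
train_samples = 6750
train_runtime = 425.1442              # seconds
train_samples_per_second = 15.877
train_steps_per_second = 0.125

# Throughput consistency: samples / runtime matches the reported rate.
assert round(train_samples / train_runtime, 3) == train_samples_per_second

# Step count implied by an effective batch size of 128 (8 x 8 GPUs x 2 accumulation).
effective_batch_size = 128
steps = math.ceil(train_samples / effective_batch_size)
print(steps)                          # 53, matching global_step / max_steps below
print(round(steps / train_runtime, 3))  # ~0.125 steps per second, as reported
```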
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa33fb1854ab3a907487f79fd925398e17227af5383adab3028603e3f44eaf9e
+oid sha256:91328ad569624c8b14dcdb0ab4fea32f800ac871d8cb5af98a27b4b7452a2c7d
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68702b69b734cbebb761073acc5f8012ce4ea55de3beb0ca6172c9f2a6e55165
+oid sha256:8d23390b5bd7cb83885da59402667db713e5665a092d2a0df3084a70f535e0af
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c15124ee20ee3554c60c2262b71a9104a1fd6a950e0b80f05e85eafd6e7cd14a
+oid sha256:cba4c24bce3c8d8bf4f70ba942459472c04e1245c651f98e9f1c6a6e2b7a4c6b
 size 4540516344
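Each shard entry above is a Git LFS pointer (spec version, sha256 oid, byte size) rather than the weights themselves; only the oids change in this commit. A minimal sketch of verifying a locally downloaded shard against its pointer, assuming the file sits in the current directory and using the new oid of the first shard shown above:

```python
import hashlib
import os

# Pointer values for model-00001-of-00003.safetensors after this commit.
expected_sha256 = "91328ad569624c8b14dcdb0ab4fea32f800ac871d8cb5af98a27b4b7452a2c7d"
expected_size = 4943162336
path = "model-00001-of-00003.safetensors"   # assumes the shard was downloaded locally

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size
assert h.hexdigest() == expected_sha256
print("shard matches its LFS pointer")
```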
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.04001489771085803,
-    "train_runtime": 4360.6473,
-    "train_samples": 61134,
-    "train_samples_per_second": 14.019,
-    "train_steps_per_second": 0.11
+    "train_loss": 0.3564033879424041,
+    "train_runtime": 425.1442,
+    "train_samples": 6750,
+    "train_samples_per_second": 15.877,
+    "train_steps_per_second": 0.125
 }
trainer_state.json CHANGED
@@ -3,19 +3,22 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 478,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0,
13
- "learning_rate": 1.0416666666666666e-08,
14
- "logits/chosen": -2.8386030197143555,
15
- "logits/rejected": -2.823939323425293,
16
- "logps/chosen": -324.3727722167969,
17
- "logps/rejected": -231.64634704589844,
18
- "loss": 0.1053,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,739 +26,102 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.02,
27
- "learning_rate": 1.0416666666666667e-07,
28
- "logits/chosen": -2.8247900009155273,
29
- "logits/rejected": -2.7507708072662354,
30
- "logps/chosen": -275.7712707519531,
31
- "logps/rejected": -253.4366912841797,
32
- "loss": 0.1049,
33
- "rewards/accuracies": 0.4305555522441864,
34
- "rewards/chosen": -0.00010182532423641533,
35
- "rewards/margins": -0.0002283406356582418,
36
- "rewards/rejected": 0.00012651534052565694,
37
- "step": 10
38
- },
39
- {
40
- "epoch": 0.04,
41
- "learning_rate": 2.0833333333333333e-07,
42
- "logits/chosen": -2.7973473072052,
43
- "logits/rejected": -2.7797961235046387,
44
- "logps/chosen": -261.93072509765625,
45
- "logps/rejected": -257.0435791015625,
46
- "loss": 0.1048,
47
- "rewards/accuracies": 0.59375,
48
- "rewards/chosen": 0.00023481599055230618,
49
- "rewards/margins": 0.0013333772076293826,
50
- "rewards/rejected": -0.0010985612170770764,
51
- "step": 20
52
- },
53
- {
54
- "epoch": 0.06,
55
- "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": -2.7847647666931152,
57
- "logits/rejected": -2.778301239013672,
58
- "logps/chosen": -295.0087890625,
59
- "logps/rejected": -259.0351257324219,
60
- "loss": 0.1026,
61
- "rewards/accuracies": 0.6499999761581421,
62
- "rewards/chosen": 0.0006326682632789016,
63
- "rewards/margins": 0.008715003728866577,
64
- "rewards/rejected": -0.008082335814833641,
65
- "step": 30
66
- },
67
- {
68
- "epoch": 0.08,
69
- "learning_rate": 4.1666666666666667e-07,
70
- "logits/chosen": -2.8043212890625,
71
- "logits/rejected": -2.773319959640503,
72
- "logps/chosen": -284.64312744140625,
73
- "logps/rejected": -264.53228759765625,
74
- "loss": 0.1011,
75
- "rewards/accuracies": 0.675000011920929,
76
- "rewards/chosen": -0.007321106735616922,
77
- "rewards/margins": 0.018408337607979774,
78
- "rewards/rejected": -0.02572944387793541,
79
- "step": 40
80
- },
81
- {
82
- "epoch": 0.1,
83
- "learning_rate": 4.999733114418725e-07,
84
- "logits/chosen": -2.779865264892578,
85
- "logits/rejected": -2.7175371646881104,
86
- "logps/chosen": -283.73773193359375,
87
- "logps/rejected": -253.7095184326172,
88
- "loss": 0.0944,
89
- "rewards/accuracies": 0.737500011920929,
90
- "rewards/chosen": -0.013570243492722511,
91
- "rewards/margins": 0.056820280849933624,
92
- "rewards/rejected": -0.07039052993059158,
93
- "step": 50
94
- },
95
- {
96
- "epoch": 0.13,
97
- "learning_rate": 4.990398100856366e-07,
98
- "logits/chosen": -2.805795192718506,
99
- "logits/rejected": -2.78430438041687,
100
- "logps/chosen": -293.5281677246094,
101
- "logps/rejected": -261.317626953125,
102
- "loss": 0.0893,
103
- "rewards/accuracies": 0.612500011920929,
104
- "rewards/chosen": -0.06878744065761566,
105
- "rewards/margins": 0.07276646792888641,
106
- "rewards/rejected": -0.14155389368534088,
107
- "step": 60
108
- },
109
- {
110
- "epoch": 0.15,
111
- "learning_rate": 4.967775735898179e-07,
112
- "logits/chosen": -2.7008605003356934,
113
- "logits/rejected": -2.6654582023620605,
114
- "logps/chosen": -272.4254455566406,
115
- "logps/rejected": -264.991455078125,
116
- "loss": 0.0818,
117
- "rewards/accuracies": 0.7124999761581421,
118
- "rewards/chosen": -0.12376339733600616,
119
- "rewards/margins": 0.13984574377536774,
120
- "rewards/rejected": -0.2636091411113739,
121
- "step": 70
122
- },
123
- {
124
- "epoch": 0.17,
125
- "learning_rate": 4.931986719649298e-07,
126
- "logits/chosen": -2.7169277667999268,
127
- "logits/rejected": -2.700068950653076,
128
- "logps/chosen": -293.3257751464844,
129
- "logps/rejected": -285.71270751953125,
130
- "loss": 0.0738,
131
- "rewards/accuracies": 0.65625,
132
- "rewards/chosen": -0.2549767792224884,
133
- "rewards/margins": 0.13869795203208923,
134
- "rewards/rejected": -0.39367473125457764,
135
- "step": 80
136
- },
137
- {
138
  "epoch": 0.19,
139
- "learning_rate": 4.883222001996351e-07,
140
- "logits/chosen": -2.7454025745391846,
141
- "logits/rejected": -2.721019983291626,
142
- "logps/chosen": -322.88800048828125,
143
- "logps/rejected": -335.7534484863281,
144
- "loss": 0.0572,
145
- "rewards/accuracies": 0.675000011920929,
146
- "rewards/chosen": -0.3772469460964203,
147
- "rewards/margins": 0.30307695269584656,
148
- "rewards/rejected": -0.6803240180015564,
149
- "step": 90
150
- },
151
- {
152
- "epoch": 0.21,
153
- "learning_rate": 4.821741763807186e-07,
154
- "logits/chosen": -2.6386475563049316,
155
- "logits/rejected": -2.6311068534851074,
156
- "logps/chosen": -348.7519226074219,
157
- "logps/rejected": -341.0511169433594,
158
- "loss": 0.0517,
159
- "rewards/accuracies": 0.6312500238418579,
160
- "rewards/chosen": -0.5442075729370117,
161
- "rewards/margins": 0.2245761901140213,
162
- "rewards/rejected": -0.7687837481498718,
163
- "step": 100
164
- },
165
- {
166
- "epoch": 0.21,
167
- "eval_logits/chosen": -2.752265691757202,
168
- "eval_logits/rejected": -2.7319908142089844,
169
- "eval_logps/chosen": -320.34552001953125,
170
- "eval_logps/rejected": -359.42059326171875,
171
- "eval_loss": 0.04265438765287399,
172
- "eval_rewards/accuracies": 0.703125,
173
- "eval_rewards/chosen": -0.6330589056015015,
174
- "eval_rewards/margins": 0.38761410117149353,
175
- "eval_rewards/rejected": -1.0206730365753174,
176
- "eval_runtime": 53.2642,
177
- "eval_samples_per_second": 37.549,
178
- "eval_steps_per_second": 0.601,
179
- "step": 100
180
- },
181
- {
182
- "epoch": 0.23,
183
- "learning_rate": 4.747874028753375e-07,
184
- "logits/chosen": -2.745008945465088,
185
- "logits/rejected": -2.6902968883514404,
186
- "logps/chosen": -342.7307434082031,
187
- "logps/rejected": -338.11322021484375,
188
- "loss": 0.0397,
189
- "rewards/accuracies": 0.6875,
190
- "rewards/chosen": -0.6825590133666992,
191
- "rewards/margins": 0.41266337037086487,
192
- "rewards/rejected": -1.0952222347259521,
193
- "step": 110
194
- },
195
- {
196
- "epoch": 0.25,
197
- "learning_rate": 4.662012913161997e-07,
198
- "logits/chosen": -2.6888184547424316,
199
- "logits/rejected": -2.6683411598205566,
200
- "logps/chosen": -343.5830078125,
201
- "logps/rejected": -382.2731628417969,
202
- "loss": 0.044,
203
- "rewards/accuracies": 0.6937500238418579,
204
- "rewards/chosen": -0.8013314008712769,
205
- "rewards/margins": 0.41239532828330994,
206
- "rewards/rejected": -1.2137267589569092,
207
- "step": 120
208
- },
209
- {
210
- "epoch": 0.27,
211
- "learning_rate": 4.5646165232345103e-07,
212
- "logits/chosen": -2.690122127532959,
213
- "logits/rejected": -2.673074245452881,
214
- "logps/chosen": -349.1435546875,
215
- "logps/rejected": -378.6903381347656,
216
- "loss": 0.0367,
217
- "rewards/accuracies": 0.6937500238418579,
218
- "rewards/chosen": -0.8628436326980591,
219
- "rewards/margins": 0.46390500664711,
220
- "rewards/rejected": -1.3267487287521362,
221
- "step": 130
222
- },
223
- {
224
- "epoch": 0.29,
225
- "learning_rate": 4.456204510851956e-07,
226
- "logits/chosen": -2.6322216987609863,
227
- "logits/rejected": -2.615206480026245,
228
- "logps/chosen": -335.52008056640625,
229
- "logps/rejected": -349.85638427734375,
230
- "loss": 0.0411,
231
- "rewards/accuracies": 0.65625,
232
- "rewards/chosen": -0.8122199773788452,
233
- "rewards/margins": 0.3598429262638092,
234
- "rewards/rejected": -1.1720629930496216,
235
- "step": 140
236
- },
237
- {
238
- "epoch": 0.31,
239
- "learning_rate": 4.337355301007335e-07,
240
- "logits/chosen": -2.546915292739868,
241
- "logits/rejected": -2.5243468284606934,
242
- "logps/chosen": -353.551025390625,
243
- "logps/rejected": -377.07989501953125,
244
- "loss": 0.0368,
245
- "rewards/accuracies": 0.6625000238418579,
246
- "rewards/chosen": -0.8309852480888367,
247
- "rewards/margins": 0.3872799277305603,
248
- "rewards/rejected": -1.2182650566101074,
249
- "step": 150
250
- },
251
- {
252
- "epoch": 0.33,
253
- "learning_rate": 4.2087030056579986e-07,
254
- "logits/chosen": -2.516842842102051,
255
- "logits/rejected": -2.509202480316162,
256
- "logps/chosen": -396.78466796875,
257
- "logps/rejected": -430.03662109375,
258
- "loss": 0.0276,
259
- "rewards/accuracies": 0.637499988079071,
260
- "rewards/chosen": -1.324944257736206,
261
- "rewards/margins": 0.2987218201160431,
262
- "rewards/rejected": -1.6236661672592163,
263
- "step": 160
264
- },
265
- {
266
- "epoch": 0.36,
267
- "learning_rate": 4.070934040463998e-07,
268
- "logits/chosen": -2.486632823944092,
269
- "logits/rejected": -2.4413046836853027,
270
- "logps/chosen": -419.40582275390625,
271
- "logps/rejected": -413.1378479003906,
272
- "loss": 0.0252,
273
- "rewards/accuracies": 0.574999988079071,
274
- "rewards/chosen": -1.4800517559051514,
275
- "rewards/margins": 0.3067554235458374,
276
- "rewards/rejected": -1.7868072986602783,
277
- "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
- "learning_rate": 3.9247834624635404e-07,
282
- "logits/chosen": -2.510317325592041,
283
- "logits/rejected": -2.5057549476623535,
284
- "logps/chosen": -372.1962585449219,
285
- "logps/rejected": -414.73858642578125,
286
- "loss": 0.0272,
287
- "rewards/accuracies": 0.6499999761581421,
288
- "rewards/chosen": -1.3809983730316162,
289
- "rewards/margins": 0.39020317792892456,
290
- "rewards/rejected": -1.771201491355896,
291
- "step": 180
292
- },
293
- {
294
- "epoch": 0.4,
295
- "learning_rate": 3.7710310482256523e-07,
296
- "logits/chosen": -2.5819058418273926,
297
- "logits/rejected": -2.580766439437866,
298
- "logps/chosen": -367.2718505859375,
299
- "logps/rejected": -395.99261474609375,
300
- "loss": 0.0276,
301
- "rewards/accuracies": 0.668749988079071,
302
- "rewards/chosen": -1.1196916103363037,
303
- "rewards/margins": 0.3849378228187561,
304
- "rewards/rejected": -1.504629373550415,
305
- "step": 190
306
- },
307
- {
308
- "epoch": 0.42,
309
- "learning_rate": 3.610497133404795e-07,
310
- "logits/chosen": -2.5476508140563965,
311
- "logits/rejected": -2.5647332668304443,
312
- "logps/chosen": -385.6960754394531,
313
- "logps/rejected": -413.998291015625,
314
- "loss": 0.0305,
315
- "rewards/accuracies": 0.6187499761581421,
316
- "rewards/chosen": -1.0930745601654053,
317
- "rewards/margins": 0.34844768047332764,
318
- "rewards/rejected": -1.4415223598480225,
319
- "step": 200
320
- },
321
- {
322
- "epoch": 0.42,
323
- "eval_logits/chosen": -2.579733371734619,
324
- "eval_logits/rejected": -2.5648751258850098,
325
- "eval_logps/chosen": -366.0565490722656,
326
- "eval_logps/rejected": -424.7881164550781,
327
- "eval_loss": 0.029730383306741714,
328
- "eval_rewards/accuracies": 0.72265625,
329
- "eval_rewards/chosen": -1.0901691913604736,
330
- "eval_rewards/margins": 0.5841794013977051,
331
- "eval_rewards/rejected": -1.6743484735488892,
332
- "eval_runtime": 53.2173,
333
- "eval_samples_per_second": 37.582,
334
- "eval_steps_per_second": 0.601,
335
- "step": 200
336
- },
337
- {
338
- "epoch": 0.44,
339
- "learning_rate": 3.4440382358952115e-07,
340
- "logits/chosen": -2.49862003326416,
341
- "logits/rejected": -2.4916858673095703,
342
- "logps/chosen": -370.3592834472656,
343
- "logps/rejected": -407.2472229003906,
344
- "loss": 0.0299,
345
- "rewards/accuracies": 0.675000011920929,
346
- "rewards/chosen": -1.1277533769607544,
347
- "rewards/margins": 0.5519038438796997,
348
- "rewards/rejected": -1.679657220840454,
349
- "step": 210
350
- },
351
- {
352
- "epoch": 0.46,
353
- "learning_rate": 3.272542485937368e-07,
354
- "logits/chosen": -2.5778369903564453,
355
- "logits/rejected": -2.558171510696411,
356
- "logps/chosen": -380.861328125,
357
- "logps/rejected": -401.4359130859375,
358
- "loss": 0.0296,
359
- "rewards/accuracies": 0.668749988079071,
360
- "rewards/chosen": -1.1397533416748047,
361
- "rewards/margins": 0.4481385350227356,
362
- "rewards/rejected": -1.587891936302185,
363
- "step": 220
364
- },
365
- {
366
- "epoch": 0.48,
367
- "learning_rate": 3.096924887558854e-07,
368
- "logits/chosen": -2.5747199058532715,
369
- "logits/rejected": -2.535149097442627,
370
- "logps/chosen": -425.2906188964844,
371
- "logps/rejected": -415.38677978515625,
372
- "loss": 0.0278,
373
  "rewards/accuracies": 0.6875,
374
- "rewards/chosen": -1.116990327835083,
375
- "rewards/margins": 0.44105783104896545,
376
- "rewards/rejected": -1.5580482482910156,
377
- "step": 230
378
  },
379
  {
380
- "epoch": 0.5,
381
- "learning_rate": 2.9181224366319943e-07,
382
- "logits/chosen": -2.4556021690368652,
383
- "logits/rejected": -2.422232151031494,
384
- "logps/chosen": -395.6624755859375,
385
- "logps/rejected": -402.35894775390625,
386
- "loss": 0.0272,
387
  "rewards/accuracies": 0.6937500238418579,
388
- "rewards/chosen": -1.2475910186767578,
389
- "rewards/margins": 0.4241558909416199,
390
- "rewards/rejected": -1.671746850013733,
391
- "step": 240
392
- },
393
- {
394
- "epoch": 0.52,
395
- "learning_rate": 2.7370891215954565e-07,
396
- "logits/chosen": -2.432041645050049,
397
- "logits/rejected": -2.4213993549346924,
398
- "logps/chosen": -398.3387451171875,
399
- "logps/rejected": -416.9798889160156,
400
- "loss": 0.0264,
401
- "rewards/accuracies": 0.706250011920929,
402
- "rewards/chosen": -1.2123913764953613,
403
- "rewards/margins": 0.5389462113380432,
404
- "rewards/rejected": -1.7513376474380493,
405
- "step": 250
406
- },
407
- {
408
- "epoch": 0.54,
409
- "learning_rate": 2.55479083351317e-07,
410
- "logits/chosen": -2.477323532104492,
411
- "logits/rejected": -2.4589943885803223,
412
- "logps/chosen": -404.10638427734375,
413
- "logps/rejected": -437.82330322265625,
414
- "loss": 0.0289,
415
- "rewards/accuracies": 0.7437499761581421,
416
- "rewards/chosen": -1.1455386877059937,
417
- "rewards/margins": 0.6531764268875122,
418
- "rewards/rejected": -1.7987149953842163,
419
- "step": 260
420
- },
421
- {
422
- "epoch": 0.56,
423
- "learning_rate": 2.3722002126275822e-07,
424
- "logits/chosen": -2.392612934112549,
425
- "logits/rejected": -2.3851914405822754,
426
- "logps/chosen": -389.7043151855469,
427
- "logps/rejected": -410.2840881347656,
428
- "loss": 0.0276,
429
- "rewards/accuracies": 0.65625,
430
- "rewards/chosen": -1.2070677280426025,
431
- "rewards/margins": 0.4757324755191803,
432
- "rewards/rejected": -1.68280029296875,
433
- "step": 270
434
- },
435
- {
436
- "epoch": 0.59,
437
- "learning_rate": 2.19029145890313e-07,
438
- "logits/chosen": -2.4097766876220703,
439
- "logits/rejected": -2.387056350708008,
440
- "logps/chosen": -399.73663330078125,
441
- "logps/rejected": -442.6588439941406,
442
- "loss": 0.0236,
443
- "rewards/accuracies": 0.7250000238418579,
444
- "rewards/chosen": -1.4455411434173584,
445
- "rewards/margins": 0.6014262437820435,
446
- "rewards/rejected": -2.046967029571533,
447
- "step": 280
448
- },
449
- {
450
- "epoch": 0.61,
451
- "learning_rate": 2.0100351342479216e-07,
452
- "logits/chosen": -2.4165215492248535,
453
- "logits/rejected": -2.3698172569274902,
454
- "logps/chosen": -431.76922607421875,
455
- "logps/rejected": -444.64080810546875,
456
- "loss": 0.025,
457
- "rewards/accuracies": 0.706250011920929,
458
- "rewards/chosen": -1.4643802642822266,
459
- "rewards/margins": 0.5780037045478821,
460
- "rewards/rejected": -2.0423836708068848,
461
- "step": 290
462
- },
463
- {
464
- "epoch": 0.63,
465
- "learning_rate": 1.8323929841460178e-07,
466
- "logits/chosen": -2.457998275756836,
467
- "logits/rejected": -2.436868190765381,
468
- "logps/chosen": -404.76409912109375,
469
- "logps/rejected": -460.781005859375,
470
- "loss": 0.0258,
471
- "rewards/accuracies": 0.7250000238418579,
472
- "rewards/chosen": -1.1497269868850708,
473
- "rewards/margins": 0.6188378930091858,
474
- "rewards/rejected": -1.7685649394989014,
475
- "step": 300
476
- },
477
- {
478
- "epoch": 0.63,
479
- "eval_logits/chosen": -2.438358783721924,
480
- "eval_logits/rejected": -2.4246902465820312,
481
- "eval_logps/chosen": -377.3524475097656,
482
- "eval_logps/rejected": -444.54278564453125,
483
- "eval_loss": 0.02741866372525692,
484
- "eval_rewards/accuracies": 0.71875,
485
- "eval_rewards/chosen": -1.2031283378601074,
486
- "eval_rewards/margins": 0.668766438961029,
487
- "eval_rewards/rejected": -1.8718948364257812,
488
- "eval_runtime": 53.272,
489
- "eval_samples_per_second": 37.543,
490
- "eval_steps_per_second": 0.601,
491
- "step": 300
492
- },
493
- {
494
- "epoch": 0.65,
495
- "learning_rate": 1.6583128063291573e-07,
496
- "logits/chosen": -2.4320359230041504,
497
- "logits/rejected": -2.4065451622009277,
498
- "logps/chosen": -390.3728332519531,
499
- "logps/rejected": -426.6919860839844,
500
- "loss": 0.025,
501
- "rewards/accuracies": 0.668749988079071,
502
- "rewards/chosen": -1.2841016054153442,
503
- "rewards/margins": 0.5093994736671448,
504
- "rewards/rejected": -1.7935012578964233,
505
- "step": 310
506
- },
507
- {
508
- "epoch": 0.67,
509
- "learning_rate": 1.488723393865766e-07,
510
- "logits/chosen": -2.36087965965271,
511
- "logits/rejected": -2.3556385040283203,
512
- "logps/chosen": -418.49493408203125,
513
- "logps/rejected": -450.12139892578125,
514
- "loss": 0.0232,
515
- "rewards/accuracies": 0.675000011920929,
516
- "rewards/chosen": -1.3573024272918701,
517
- "rewards/margins": 0.4884418547153473,
518
- "rewards/rejected": -1.8457443714141846,
519
- "step": 320
520
- },
521
- {
522
- "epoch": 0.69,
523
- "learning_rate": 1.3245295796480788e-07,
524
- "logits/chosen": -2.3685812950134277,
525
- "logits/rejected": -2.3668787479400635,
526
- "logps/chosen": -373.538330078125,
527
- "logps/rejected": -445.90625,
528
- "loss": 0.025,
529
- "rewards/accuracies": 0.7250000238418579,
530
- "rewards/chosen": -1.239685297012329,
531
- "rewards/margins": 0.6630219221115112,
532
- "rewards/rejected": -1.9027073383331299,
533
- "step": 330
534
- },
535
- {
536
- "epoch": 0.71,
537
- "learning_rate": 1.1666074087171627e-07,
538
- "logits/chosen": -2.38093900680542,
539
- "logits/rejected": -2.3856873512268066,
540
- "logps/chosen": -420.5098571777344,
541
- "logps/rejected": -464.68243408203125,
542
- "loss": 0.0268,
543
- "rewards/accuracies": 0.71875,
544
- "rewards/chosen": -1.2994890213012695,
545
- "rewards/margins": 0.645952045917511,
546
- "rewards/rejected": -1.9454412460327148,
547
- "step": 340
548
- },
549
- {
550
- "epoch": 0.73,
551
- "learning_rate": 1.0157994641835734e-07,
552
- "logits/chosen": -2.4149842262268066,
553
- "logits/rejected": -2.3791468143463135,
554
- "logps/chosen": -378.4749450683594,
555
- "logps/rejected": -417.32305908203125,
556
- "loss": 0.0256,
557
- "rewards/accuracies": 0.6812499761581421,
558
- "rewards/chosen": -1.2759901285171509,
559
- "rewards/margins": 0.5479945540428162,
560
- "rewards/rejected": -1.8239845037460327,
561
- "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
- "learning_rate": 8.729103716819111e-08,
566
- "logits/chosen": -2.410066604614258,
567
- "logits/rejected": -2.374561309814453,
568
- "logps/chosen": -394.406494140625,
569
- "logps/rejected": -416.733642578125,
570
- "loss": 0.0275,
571
- "rewards/accuracies": 0.675000011920929,
572
- "rewards/chosen": -1.3009366989135742,
573
- "rewards/margins": 0.5371383428573608,
574
- "rewards/rejected": -1.838075041770935,
575
- "step": 360
576
- },
577
- {
578
- "epoch": 0.77,
579
- "learning_rate": 7.387025063449081e-08,
580
- "logits/chosen": -2.397771120071411,
581
- "logits/rejected": -2.362600326538086,
582
- "logps/chosen": -412.84332275390625,
583
- "logps/rejected": -450.31048583984375,
584
- "loss": 0.0228,
585
- "rewards/accuracies": 0.6937500238418579,
586
- "rewards/chosen": -1.2637946605682373,
587
- "rewards/margins": 0.5941613912582397,
588
- "rewards/rejected": -1.8579561710357666,
589
- "step": 370
590
- },
591
- {
592
- "epoch": 0.79,
593
- "learning_rate": 6.138919252022435e-08,
594
- "logits/chosen": -2.4274802207946777,
595
- "logits/rejected": -2.420586347579956,
596
- "logps/chosen": -411.314208984375,
597
- "logps/rejected": -444.90869140625,
598
- "loss": 0.0251,
599
- "rewards/accuracies": 0.7250000238418579,
600
- "rewards/chosen": -1.256594181060791,
601
- "rewards/margins": 0.5403616428375244,
602
- "rewards/rejected": -1.7969558238983154,
603
- "step": 380
604
- },
605
- {
606
- "epoch": 0.82,
607
- "learning_rate": 4.991445467064689e-08,
608
- "logits/chosen": -2.3289995193481445,
609
- "logits/rejected": -2.306408405303955,
610
- "logps/chosen": -396.445556640625,
611
- "logps/rejected": -452.8165588378906,
612
- "loss": 0.024,
613
- "rewards/accuracies": 0.737500011920929,
614
- "rewards/chosen": -1.269308090209961,
615
- "rewards/margins": 0.6829138994216919,
616
- "rewards/rejected": -1.9522218704223633,
617
- "step": 390
618
- },
619
- {
620
- "epoch": 0.84,
621
- "learning_rate": 3.9507259776993954e-08,
622
- "logits/chosen": -2.41839337348938,
623
- "logits/rejected": -2.396681547164917,
624
- "logps/chosen": -400.5920715332031,
625
- "logps/rejected": -426.2327575683594,
626
- "loss": 0.0234,
627
- "rewards/accuracies": 0.65625,
628
- "rewards/chosen": -1.2493975162506104,
629
- "rewards/margins": 0.4738248288631439,
630
- "rewards/rejected": -1.7232223749160767,
631
- "step": 400
632
- },
633
- {
634
- "epoch": 0.84,
635
- "eval_logits/chosen": -2.4251461029052734,
636
- "eval_logits/rejected": -2.4111008644104004,
637
- "eval_logps/chosen": -376.6192321777344,
638
- "eval_logps/rejected": -444.92230224609375,
639
- "eval_loss": 0.027030915021896362,
640
- "eval_rewards/accuracies": 0.7265625,
641
- "eval_rewards/chosen": -1.1957957744598389,
642
- "eval_rewards/margins": 0.6798944473266602,
643
- "eval_rewards/rejected": -1.8756903409957886,
644
- "eval_runtime": 53.2021,
645
- "eval_samples_per_second": 37.593,
646
- "eval_steps_per_second": 0.601,
647
- "step": 400
648
- },
649
- {
650
- "epoch": 0.86,
651
- "learning_rate": 3.022313472693447e-08,
652
- "logits/chosen": -2.412111759185791,
653
- "logits/rejected": -2.3663253784179688,
654
- "logps/chosen": -387.081787109375,
655
- "logps/rejected": -392.51336669921875,
656
- "loss": 0.0248,
657
- "rewards/accuracies": 0.71875,
658
- "rewards/chosen": -1.1241955757141113,
659
- "rewards/margins": 0.5859023332595825,
660
- "rewards/rejected": -1.7100979089736938,
661
- "step": 410
662
- },
663
- {
664
- "epoch": 0.88,
665
- "learning_rate": 2.2111614344599684e-08,
666
- "logits/chosen": -2.420088291168213,
667
- "logits/rejected": -2.3837363719940186,
668
- "logps/chosen": -418.43719482421875,
669
- "logps/rejected": -451.14996337890625,
670
- "loss": 0.0256,
671
- "rewards/accuracies": 0.71875,
672
- "rewards/chosen": -1.2756112813949585,
673
- "rewards/margins": 0.7357032895088196,
674
- "rewards/rejected": -2.011314868927002,
675
- "step": 420
676
- },
677
- {
678
- "epoch": 0.9,
679
- "learning_rate": 1.521597710086439e-08,
680
- "logits/chosen": -2.376132011413574,
681
- "logits/rejected": -2.380525827407837,
682
- "logps/chosen": -408.94781494140625,
683
- "logps/rejected": -458.7674865722656,
684
- "loss": 0.0244,
685
- "rewards/accuracies": 0.7124999761581421,
686
- "rewards/chosen": -1.34002685546875,
687
- "rewards/margins": 0.5449159741401672,
688
- "rewards/rejected": -1.8849427700042725,
689
- "step": 430
690
- },
691
- {
692
- "epoch": 0.92,
693
- "learning_rate": 9.57301420397924e-09,
694
- "logits/chosen": -2.360464334487915,
695
- "logits/rejected": -2.3589634895324707,
696
- "logps/chosen": -394.01226806640625,
697
- "logps/rejected": -432.554443359375,
698
- "loss": 0.026,
699
- "rewards/accuracies": 0.625,
700
- "rewards/chosen": -1.3313329219818115,
701
- "rewards/margins": 0.5157779455184937,
702
- "rewards/rejected": -1.8471107482910156,
703
- "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
- "learning_rate": 5.212833302556258e-09,
708
- "logits/chosen": -2.3685426712036133,
709
- "logits/rejected": -2.3834328651428223,
710
- "logps/chosen": -390.4405212402344,
711
- "logps/rejected": -432.5458068847656,
712
- "loss": 0.0248,
713
- "rewards/accuracies": 0.675000011920929,
714
- "rewards/chosen": -1.3699880838394165,
715
- "rewards/margins": 0.4800630509853363,
716
- "rewards/rejected": -1.8500511646270752,
717
- "step": 450
718
- },
719
- {
720
- "epoch": 0.96,
721
- "learning_rate": 2.158697848236607e-09,
722
- "logits/chosen": -2.3582141399383545,
723
- "logits/rejected": -2.3275790214538574,
724
- "logps/chosen": -391.3001403808594,
725
- "logps/rejected": -415.3433532714844,
726
- "loss": 0.0249,
727
- "rewards/accuracies": 0.65625,
728
- "rewards/chosen": -1.381425380706787,
729
- "rewards/margins": 0.4009224474430084,
730
- "rewards/rejected": -1.7823479175567627,
731
- "step": 460
732
- },
733
- {
734
- "epoch": 0.98,
735
- "learning_rate": 4.269029751107489e-10,
736
- "logits/chosen": -2.3417115211486816,
737
- "logits/rejected": -2.3047232627868652,
738
- "logps/chosen": -387.1515197753906,
739
- "logps/rejected": -449.08837890625,
740
- "loss": 0.0252,
741
  "rewards/accuracies": 0.699999988079071,
742
- "rewards/chosen": -1.2849957942962646,
743
- "rewards/margins": 0.6696574091911316,
744
- "rewards/rejected": -1.9546531438827515,
745
- "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
- "step": 478,
750
  "total_flos": 0.0,
751
- "train_loss": 0.04001489771085803,
752
- "train_runtime": 4360.6473,
753
- "train_samples_per_second": 14.019,
754
- "train_steps_per_second": 0.11
755
  }
756
  ],
757
  "logging_steps": 10,
758
- "max_steps": 478,
759
  "num_train_epochs": 1,
760
  "save_steps": 100,
761
  "total_flos": 0.0,
 
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
+ "global_step": 53,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "debug/losses": 0.342240571975708,
+ "debug/policy_weights": 0.4937487840652466,
+ "debug/raw_losses": 0.6931471824645996,
+ "epoch": 0.02,
+ "learning_rate": 8.333333333333333e-08,
+ "logits/chosen": -2.8462421894073486,
+ "logits/rejected": -2.8283610343933105,
+ "logps/chosen": -274.7393798828125,
+ "logps/rejected": -204.42575073242188,
+ "loss": 0.3624,
  "rewards/accuracies": 0.0,
  "rewards/chosen": 0.0,
  "rewards/margins": 0.0,

  "step": 1
  },
  {
+ "debug/losses": 0.3810771703720093,
+ "debug/policy_weights": 0.5504893660545349,
+ "debug/raw_losses": 0.6920116543769836,
  "epoch": 0.19,
+ "learning_rate": 4.911172937635942e-07,
+ "logits/chosen": -2.852349281311035,
+ "logits/rejected": -2.83735990524292,
+ "logps/chosen": -306.01458740234375,
+ "logps/rejected": -295.93804931640625,
+ "loss": 0.3763,
+ "rewards/accuracies": 0.4861111044883728,
+ "rewards/chosen": 0.0012468346394598484,
+ "rewards/margins": 0.0023373025469481945,
+ "rewards/rejected": -0.0010904677910730243,
+ "step": 10
  },
  {
+ "debug/losses": 0.3667130768299103,
+ "debug/policy_weights": 0.5492504835128784,
+ "debug/raw_losses": 0.6667538285255432,
  "epoch": 0.38,
+ "learning_rate": 3.982949361823388e-07,
+ "logits/chosen": -2.8518126010894775,
+ "logits/rejected": -2.872077226638794,
+ "logps/chosen": -296.2869567871094,
+ "logps/rejected": -332.9769592285156,
+ "loss": 0.374,
  "rewards/accuracies": 0.6875,
+ "rewards/chosen": 0.017367612570524216,
+ "rewards/margins": 0.05825704336166382,
+ "rewards/rejected": -0.04088941961526871,
+ "step": 20
  },
  {
+ "debug/losses": 0.35409680008888245,
+ "debug/policy_weights": 0.5632873177528381,
+ "debug/raw_losses": 0.6232098340988159,
+ "epoch": 0.57,
+ "learning_rate": 2.416462557480814e-07,
+ "logits/chosen": -2.824850559234619,
+ "logits/rejected": -2.8103976249694824,
+ "logps/chosen": -312.3518981933594,
+ "logps/rejected": -323.0265197753906,
+ "loss": 0.3551,
  "rewards/accuracies": 0.6937500238418579,
+ "rewards/chosen": 0.014037144370377064,
+ "rewards/margins": 0.1827748715877533,
+ "rewards/rejected": -0.1687377393245697,
+ "step": 30
  },
  {
+ "debug/losses": 0.34752941131591797,
+ "debug/policy_weights": 0.550245463848114,
+ "debug/raw_losses": 0.6313939690589905,
  "epoch": 0.75,
+ "learning_rate": 8.859303711029939e-08,
+ "logits/chosen": -2.79345965385437,
+ "logits/rejected": -2.797208547592163,
+ "logps/chosen": -275.5638122558594,
+ "logps/rejected": -348.8089294433594,
+ "loss": 0.3454,
+ "rewards/accuracies": 0.6812499761581421,
+ "rewards/chosen": -0.0044286223128438,
+ "rewards/margins": 0.20287349820137024,
+ "rewards/rejected": -0.20730212330818176,
+ "step": 40
  },
  {
+ "debug/losses": 0.3295789361000061,
+ "debug/policy_weights": 0.5445331335067749,
+ "debug/raw_losses": 0.5927887558937073,
  "epoch": 0.94,
+ "learning_rate": 5.009573740853313e-09,
+ "logits/chosen": -2.829876184463501,
+ "logits/rejected": -2.8419814109802246,
+ "logps/chosen": -307.4209899902344,
+ "logps/rejected": -348.12298583984375,
+ "loss": 0.3378,
  "rewards/accuracies": 0.699999988079071,
+ "rewards/chosen": 0.0008583018789067864,
+ "rewards/margins": 0.31627127528190613,
+ "rewards/rejected": -0.3154129683971405,
+ "step": 50
  },
  {
  "epoch": 1.0,
+ "step": 53,
  "total_flos": 0.0,
+ "train_loss": 0.3564033879424041,
+ "train_runtime": 425.1442,
+ "train_samples_per_second": 15.877,
+ "train_steps_per_second": 0.125
  }
  ],
  "logging_steps": 10,
+ "max_steps": 53,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae4b3803601c546a6a85ea5d3bbbeb734308c9a2796685d117ece10803b4cd0f
+oid sha256:badf95e1b28a5ba1204144bfc8bd0b89ed4f9c81844941c1b6c826fe492f8ed6
 size 5944
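The training_args.bin change is just a new serialized arguments blob (same 5944-byte size, different hash). The Hugging Face Trainer writes this file with torch.save, so it can be inspected to confirm what changed, for example the seed flip from 5 to 42 seen in the README diff. A sketch, assuming a local copy, a compatible transformers install, and a torch version where loading full pickles must be enabled explicitly:

```python
import torch

# training_args.bin is a torch-saved TrainingArguments object; loading it needs
# transformers importable and, on recent torch, weights_only=False.
args = torch.load("training_args.bin", weights_only=False)

print(args.seed)                          # expected: 42 after this commit
print(args.learning_rate)                 # 5e-07
print(args.per_device_train_batch_size)   # 8
```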