RikkiXu committed on
Commit 83304fb
1 Parent(s): 915b0f3

Model save

README.md CHANGED
@@ -13,7 +13,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-dpo-full
 
-This model was trained from scratch on the None dataset.
+This model was trained from scratch on an unknown dataset.
 
 ## Model description
 
@@ -52,7 +52,7 @@ The following hyperparameters were used during training:
 
 ### Framework versions
 
-- Transformers 4.41.1
+- Transformers 4.39.3
 - Pytorch 2.1.2+cu118
-- Datasets 2.16.1
-- Tokenizers 0.19.1
+- Datasets 2.19.1
+- Tokenizers 0.15.2
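
The README change above downgrades the pinned framework versions. For anyone trying to reproduce the training environment, here is a minimal, illustrative sketch (not part of this commit) that checks locally installed packages against the new pins; the package names are the standard PyPI distributions and are an assumption on my part:

```python
# Illustrative sanity check: compare installed library versions against the
# "Framework versions" pins introduced in this commit's README.
from importlib.metadata import version

expected = {
    "transformers": "4.39.3",
    "torch": "2.1.2+cu118",   # CUDA 11.8 build of PyTorch 2.1.2
    "datasets": "2.19.1",
    "tokenizers": "0.15.2",
}

for package, pinned in expected.items():
    installed = version(package)
    status = "OK" if installed == pinned else f"MISMATCH (installed {installed})"
    print(f"{package}=={pinned}: {status}")
```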
all_results.json CHANGED
@@ -1,9 +1,8 @@
 {
-    "epoch": 0.9980806142034548,
-    "total_flos": 0.0,
-    "train_loss": 0.29645214692140237,
-    "train_runtime": 6440.0874,
+    "epoch": 1.0,
+    "train_loss": 0.20326648155848184,
+    "train_runtime": 5896.1189,
     "train_samples": 50000,
-    "train_samples_per_second": 7.764,
-    "train_steps_per_second": 0.061
+    "train_samples_per_second": 8.48,
+    "train_steps_per_second": 0.066
 }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 2,
-  "transformers_version": "4.41.1"
+  "transformers_version": "4.39.3"
 }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec9240d64ac00cd6eb3d828f3a6b877927f4e73531e0292161f3a9fb63dfbe81
+oid sha256:58bde9bfb67010c336cace37ab13ac39da6af79040fcdbdabd0a04935b66a870
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f71eb1b478b34a5d188dd277f1c68954a3090f452264f7aee953e94fffa874f7
+oid sha256:35d424b1b9f269fcdb54ade09434feadf354ef611fe440a2d936528908734919
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cadb0e9982bf9b0f8f3048618cb4654101dc9b30be3d02283ea1fc069abbe2d3
+oid sha256:073b6342526b72183cb12d771d63bd08d1440bc35898cc6d740a2bf32a97585b
 size 4540516344
runs/Jun22_07-28-38_n136-112-146/events.out.tfevents.1719013199.n136-112-146.3352140.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a78c114b19fab1e039d3a4be75335ff74ec60afbf850c3d771699bbeb6953590
-size 26098
+oid sha256:8ab2372bceb2d6c5b46f639ae89e3d4ccce0ce757ac17c449bfd5ab2298930cb
+size 32644
train_results.json CHANGED
@@ -1,9 +1,8 @@
 {
-    "epoch": 0.9980806142034548,
-    "total_flos": 0.0,
-    "train_loss": 0.29645214692140237,
-    "train_runtime": 6440.0874,
+    "epoch": 1.0,
+    "train_loss": 0.20326648155848184,
+    "train_runtime": 5896.1189,
     "train_samples": 50000,
-    "train_samples_per_second": 7.764,
-    "train_steps_per_second": 0.061
+    "train_samples_per_second": 8.48,
+    "train_steps_per_second": 0.066
 }
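
As a quick cross-check of the updated run metrics, the throughput figures follow directly from the other values in the new file. A small illustrative sketch, using only numbers that appear in this commit (50000 samples, 5896.1189 s runtime, 390 optimizer steps from trainer_state.json):

```python
# Re-derive the throughput figures reported in the new train_results.json (illustrative only).
train_samples = 50000
train_runtime = 5896.1189   # seconds
steps = 390                 # final "step" logged in trainer_state.json

print(round(train_samples / train_runtime, 2))  # 8.48  -> matches "train_samples_per_second"
print(round(steps / train_runtime, 3))          # 0.066 -> matches "train_steps_per_second"
```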
trainer_state.json CHANGED
@@ -9,13 +9,13 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0025591810620601407,
13
- "grad_norm": 709.6283481081026,
14
  "learning_rate": 7.692307692307691e-09,
15
- "logits/chosen": -2.5583817958831787,
16
- "logits/rejected": -2.4487552642822266,
17
  "logps/chosen": -258.1644592285156,
18
- "logps/rejected": -216.25729370117188,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,598 +24,598 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.025591810620601407,
28
- "grad_norm": 680.9594369005476,
29
  "learning_rate": 7.692307692307691e-08,
30
- "logits/chosen": -2.605868101119995,
31
- "logits/rejected": -2.5530831813812256,
32
- "logps/chosen": -267.5987548828125,
33
- "logps/rejected": -217.66183471679688,
34
- "loss": 0.698,
35
- "rewards/accuracies": 0.4444444477558136,
36
- "rewards/chosen": -0.01713324338197708,
37
- "rewards/margins": -0.0054442849941551685,
38
- "rewards/rejected": -0.011688957922160625,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.05118362124120281,
43
- "grad_norm": 521.8729090609377,
44
  "learning_rate": 1.5384615384615382e-07,
45
- "logits/chosen": -2.6261820793151855,
46
- "logits/rejected": -2.563920497894287,
47
- "logps/chosen": -260.98382568359375,
48
- "logps/rejected": -207.09121704101562,
49
- "loss": 0.6233,
50
- "rewards/accuracies": 0.65625,
51
- "rewards/chosen": 0.16019153594970703,
52
- "rewards/margins": 0.1787206381559372,
53
- "rewards/rejected": -0.018529098480939865,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.07677543186180422,
58
- "grad_norm": 587.1314648207136,
59
  "learning_rate": 2.3076923076923078e-07,
60
- "logits/chosen": -2.637615919113159,
61
- "logits/rejected": -2.5644373893737793,
62
- "logps/chosen": -252.89340209960938,
63
- "logps/rejected": -198.89572143554688,
64
- "loss": 0.4258,
65
- "rewards/accuracies": 0.8500000238418579,
66
- "rewards/chosen": 1.126920461654663,
67
- "rewards/margins": 1.1405318975448608,
68
- "rewards/rejected": -0.013611525297164917,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.10236724248240563,
73
- "grad_norm": 348.96467920767,
74
  "learning_rate": 2.999939918069778e-07,
75
- "logits/chosen": -2.648346424102783,
76
- "logits/rejected": -2.5705723762512207,
77
- "logps/chosen": -245.802001953125,
78
- "logps/rejected": -193.64395141601562,
79
- "loss": 0.3346,
80
- "rewards/accuracies": 0.793749988079071,
81
- "rewards/chosen": 2.2736613750457764,
82
- "rewards/margins": 2.252485513687134,
83
- "rewards/rejected": 0.021175961941480637,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.12795905310300704,
88
- "grad_norm": 372.4722713208355,
89
  "learning_rate": 2.9927359084964875e-07,
90
- "logits/chosen": -2.681694746017456,
91
- "logits/rejected": -2.61210560798645,
92
- "logps/chosen": -259.555908203125,
93
- "logps/rejected": -203.13912963867188,
94
- "loss": 0.3226,
95
- "rewards/accuracies": 0.8812500238418579,
96
- "rewards/chosen": 4.340622901916504,
97
- "rewards/margins": 3.3357081413269043,
98
- "rewards/rejected": 1.0049149990081787,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.15355086372360843,
103
- "grad_norm": 406.3298611207908,
104
  "learning_rate": 2.9735816061234966e-07,
105
- "logits/chosen": -2.6559042930603027,
106
- "logits/rejected": -2.586275577545166,
107
- "logps/chosen": -260.26336669921875,
108
- "logps/rejected": -206.2169647216797,
109
- "loss": 0.3104,
110
- "rewards/accuracies": 0.8500000238418579,
111
- "rewards/chosen": 4.282172203063965,
112
- "rewards/margins": 3.3613991737365723,
113
- "rewards/rejected": 0.9207728505134583,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.17914267434420986,
118
- "grad_norm": 326.0546143118845,
119
  "learning_rate": 2.942630353226844e-07,
120
- "logits/chosen": -2.6311216354370117,
121
- "logits/rejected": -2.563368320465088,
122
- "logps/chosen": -289.8287048339844,
123
- "logps/rejected": -225.3174591064453,
124
- "loss": 0.2976,
125
- "rewards/accuracies": 0.893750011920929,
126
- "rewards/chosen": 4.040958404541016,
127
- "rewards/margins": 4.104364395141602,
128
- "rewards/rejected": -0.0634058266878128,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.20473448496481125,
133
- "grad_norm": 419.2531192048535,
134
  "learning_rate": 2.900129934114876e-07,
135
- "logits/chosen": -2.576873302459717,
136
- "logits/rejected": -2.505291700363159,
137
- "logps/chosen": -260.63385009765625,
138
- "logps/rejected": -204.9154510498047,
139
- "loss": 0.255,
140
- "rewards/accuracies": 0.90625,
141
- "rewards/chosen": 3.4621634483337402,
142
- "rewards/margins": 4.38175106048584,
143
- "rewards/rejected": -0.9195877909660339,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.23032629558541268,
148
- "grad_norm": 526.7607200099269,
149
  "learning_rate": 2.8464205914585213e-07,
150
- "logits/chosen": -2.578784942626953,
151
- "logits/rejected": -2.520214080810547,
152
- "logps/chosen": -256.7921447753906,
153
- "logps/rejected": -216.86544799804688,
154
- "loss": 0.2605,
155
- "rewards/accuracies": 0.84375,
156
- "rewards/chosen": 2.476574182510376,
157
- "rewards/margins": 3.6615688800811768,
158
- "rewards/rejected": -1.1849944591522217,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.2559181062060141,
163
- "grad_norm": 765.903861721633,
164
  "learning_rate": 2.78193230243403e-07,
165
- "logits/chosen": -2.6087539196014404,
166
- "logits/rejected": -2.529031276702881,
167
- "logps/chosen": -236.2876434326172,
168
- "logps/rejected": -200.96026611328125,
169
- "loss": 0.3552,
170
- "rewards/accuracies": 0.862500011920929,
171
- "rewards/chosen": 2.9406275749206543,
172
- "rewards/margins": 4.085367679595947,
173
- "rewards/rejected": -1.1447399854660034,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.28150991682661547,
178
- "grad_norm": 341.8125232177188,
179
  "learning_rate": 2.707181336484383e-07,
180
- "logits/chosen": -2.6070523262023926,
181
- "logits/rejected": -2.5225558280944824,
182
- "logps/chosen": -257.5608825683594,
183
- "logps/rejected": -209.5845489501953,
184
- "loss": 0.2939,
185
- "rewards/accuracies": 0.875,
186
- "rewards/chosen": 4.056784629821777,
187
- "rewards/margins": 4.63706636428833,
188
- "rewards/rejected": -0.5802817344665527,
189
  "step": 110
190
  },
191
  {
192
- "epoch": 0.30710172744721687,
193
- "grad_norm": 404.74851579206114,
194
  "learning_rate": 2.622766122256652e-07,
195
- "logits/chosen": -2.597041606903076,
196
- "logits/rejected": -2.524170398712158,
197
- "logps/chosen": -261.7186279296875,
198
- "logps/rejected": -207.9816436767578,
199
- "loss": 0.2833,
200
- "rewards/accuracies": 0.887499988079071,
201
- "rewards/chosen": 3.8718819618225098,
202
- "rewards/margins": 4.679508686065674,
203
- "rewards/rejected": -0.8076267242431641,
204
  "step": 120
205
  },
206
  {
207
- "epoch": 0.3326935380678183,
208
- "grad_norm": 236.84495240691993,
209
  "learning_rate": 2.5293624568031005e-07,
210
- "logits/chosen": -2.575899600982666,
211
- "logits/rejected": -2.50651216506958,
212
- "logps/chosen": -250.2094268798828,
213
- "logps/rejected": -195.43930053710938,
214
- "loss": 0.2701,
215
- "rewards/accuracies": 0.84375,
216
- "rewards/chosen": 3.546905994415283,
217
- "rewards/margins": 4.399945259094238,
218
- "rewards/rejected": -0.8530394434928894,
219
  "step": 130
220
  },
221
  {
222
- "epoch": 0.3582853486884197,
223
- "grad_norm": 316.10574756623254,
224
  "learning_rate": 2.4277180953993823e-07,
225
- "logits/chosen": -2.597114086151123,
226
- "logits/rejected": -2.526017904281616,
227
- "logps/chosen": -267.56597900390625,
228
- "logps/rejected": -214.47607421875,
229
- "loss": 0.3022,
230
- "rewards/accuracies": 0.862500011920929,
231
- "rewards/chosen": 3.8051650524139404,
232
- "rewards/margins": 4.264659881591797,
233
- "rewards/rejected": -0.4594948887825012,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.3838771593090211,
238
- "grad_norm": 309.0947994138828,
239
  "learning_rate": 2.3186467652917566e-07,
240
- "logits/chosen": -2.5764989852905273,
241
- "logits/rejected": -2.5064079761505127,
242
- "logps/chosen": -261.96685791015625,
243
- "logps/rejected": -211.4801025390625,
244
- "loss": 0.2803,
245
- "rewards/accuracies": 0.893750011920929,
246
- "rewards/chosen": 4.31245756149292,
247
- "rewards/margins": 4.854833126068115,
248
- "rewards/rejected": -0.542374849319458,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.4094689699296225,
253
- "grad_norm": 269.7166459422843,
254
  "learning_rate": 2.2030216512970552e-07,
255
- "logits/chosen": -2.574291467666626,
256
- "logits/rejected": -2.503854274749756,
257
- "logps/chosen": -241.0322723388672,
258
- "logps/rejected": -194.2302703857422,
259
- "loss": 0.2695,
260
- "rewards/accuracies": 0.862500011920929,
261
- "rewards/chosen": 3.9528305530548096,
262
- "rewards/margins": 4.557216167449951,
263
- "rewards/rejected": -0.604385256767273,
264
  "step": 160
265
  },
266
  {
267
- "epoch": 0.4350607805502239,
268
- "grad_norm": 316.51801045522615,
269
  "learning_rate": 2.0817684054072823e-07,
270
- "logits/chosen": -2.578244924545288,
271
- "logits/rejected": -2.512073040008545,
272
- "logps/chosen": -262.7640075683594,
273
- "logps/rejected": -207.98153686523438,
274
- "loss": 0.2347,
275
- "rewards/accuracies": 0.8812500238418579,
276
- "rewards/chosen": 3.963876247406006,
277
- "rewards/margins": 4.765882968902588,
278
- "rewards/rejected": -0.8020064234733582,
279
  "step": 170
280
  },
281
  {
282
- "epoch": 0.46065259117082535,
283
- "grad_norm": 308.7222349543346,
284
  "learning_rate": 1.9558577363613703e-07,
285
- "logits/chosen": -2.6352927684783936,
286
- "logits/rejected": -2.549041271209717,
287
- "logps/chosen": -235.12112426757812,
288
- "logps/rejected": -184.7916717529297,
289
- "loss": 0.2985,
290
- "rewards/accuracies": 0.8812500238418579,
291
- "rewards/chosen": 3.456139326095581,
292
- "rewards/margins": 4.226747035980225,
293
- "rewards/rejected": -0.770608127117157,
294
  "step": 180
295
  },
296
  {
297
- "epoch": 0.48624440179142675,
298
- "grad_norm": 258.43311774583907,
299
  "learning_rate": 1.826297638509251e-07,
300
- "logits/chosen": -2.5956501960754395,
301
- "logits/rejected": -2.5205705165863037,
302
- "logps/chosen": -257.1447448730469,
303
- "logps/rejected": -201.02076721191406,
304
- "loss": 0.2441,
305
- "rewards/accuracies": 0.887499988079071,
306
- "rewards/chosen": 3.8379158973693848,
307
- "rewards/margins": 4.68334436416626,
308
- "rewards/rejected": -0.8454282879829407,
309
  "step": 190
310
  },
311
  {
312
- "epoch": 0.5118362124120281,
313
- "grad_norm": 345.320567765688,
314
  "learning_rate": 1.694125322181083e-07,
315
- "logits/chosen": -2.6293721199035645,
316
- "logits/rejected": -2.5405123233795166,
317
- "logps/chosen": -272.0235290527344,
318
- "logps/rejected": -215.8284912109375,
319
- "loss": 0.2397,
320
- "rewards/accuracies": 0.893750011920929,
321
- "rewards/chosen": 4.249785423278809,
322
- "rewards/margins": 5.354866981506348,
323
- "rewards/rejected": -1.1050812005996704,
324
  "step": 200
325
  },
326
  {
327
- "epoch": 0.5374280230326296,
328
- "grad_norm": 544.5614360009855,
329
  "learning_rate": 1.5603989101641228e-07,
330
- "logits/chosen": -2.6099467277526855,
331
- "logits/rejected": -2.5365748405456543,
332
- "logps/chosen": -264.88690185546875,
333
- "logps/rejected": -209.3201141357422,
334
- "loss": 0.3574,
335
- "rewards/accuracies": 0.8374999761581421,
336
- "rewards/chosen": 3.0834546089172363,
337
- "rewards/margins": 4.563128471374512,
338
- "rewards/rejected": -1.4796737432479858,
339
  "step": 210
340
  },
341
  {
342
- "epoch": 0.5630198336532309,
343
- "grad_norm": 355.4652179934555,
344
  "learning_rate": 1.4261889667621828e-07,
345
- "logits/chosen": -2.620954751968384,
346
- "logits/rejected": -2.556044578552246,
347
- "logps/chosen": -255.65048217773438,
348
- "logps/rejected": -211.9694366455078,
349
- "loss": 0.2686,
350
- "rewards/accuracies": 0.893750011920929,
351
- "rewards/chosen": 3.0131924152374268,
352
- "rewards/margins": 4.114481449127197,
353
- "rewards/rejected": -1.1012890338897705,
354
  "step": 220
355
  },
356
  {
357
- "epoch": 0.5886116442738324,
358
- "grad_norm": 308.3809131002459,
359
  "learning_rate": 1.2925699272529007e-07,
360
- "logits/chosen": -2.6484158039093018,
361
- "logits/rejected": -2.5617594718933105,
362
- "logps/chosen": -265.15606689453125,
363
- "logps/rejected": -213.99850463867188,
364
- "loss": 0.253,
365
- "rewards/accuracies": 0.856249988079071,
366
- "rewards/chosen": 4.123552322387695,
367
- "rewards/margins": 4.665673732757568,
368
- "rewards/rejected": -0.542121171951294,
369
  "step": 230
370
  },
371
  {
372
- "epoch": 0.6142034548944337,
373
- "grad_norm": 534.1086121087117,
374
  "learning_rate": 1.160611496355417e-07,
375
- "logits/chosen": -2.634096145629883,
376
- "logits/rejected": -2.5853183269500732,
377
- "logps/chosen": -259.93743896484375,
378
- "logps/rejected": -216.4737548828125,
379
- "loss": 0.2542,
380
- "rewards/accuracies": 0.893750011920929,
381
- "rewards/chosen": 3.896925687789917,
382
- "rewards/margins": 5.348150730133057,
383
- "rewards/rejected": -1.451224684715271,
384
  "step": 240
385
  },
386
  {
387
- "epoch": 0.6397952655150352,
388
- "grad_norm": 369.496952926306,
389
  "learning_rate": 1.0313700845691635e-07,
390
- "logits/chosen": -2.64689040184021,
391
- "logits/rejected": -2.5748291015625,
392
- "logps/chosen": -265.5174255371094,
393
- "logps/rejected": -219.63558959960938,
394
- "loss": 0.263,
395
- "rewards/accuracies": 0.875,
396
- "rewards/chosen": 3.130638837814331,
397
- "rewards/margins": 4.757687568664551,
398
- "rewards/rejected": -1.6270482540130615,
399
  "step": 250
400
  },
401
  {
402
- "epoch": 0.6653870761356366,
403
- "grad_norm": 277.7533336484834,
404
  "learning_rate": 9.058803509412647e-08,
405
- "logits/chosen": -2.639646530151367,
406
- "logits/rejected": -2.577847719192505,
407
- "logps/chosen": -259.77166748046875,
408
- "logps/rejected": -206.2452392578125,
409
- "loss": 0.2401,
410
- "rewards/accuracies": 0.893750011920929,
411
- "rewards/chosen": 3.0441479682922363,
412
- "rewards/margins": 4.5638251304626465,
413
- "rewards/rejected": -1.5196778774261475,
414
  "step": 260
415
  },
416
  {
417
- "epoch": 0.690978886756238,
418
- "grad_norm": 399.30433913889635,
419
  "learning_rate": 7.851469199680381e-08,
420
- "logits/chosen": -2.6187744140625,
421
- "logits/rejected": -2.5289955139160156,
422
- "logps/chosen": -270.216796875,
423
- "logps/rejected": -225.6618194580078,
424
- "loss": 0.267,
425
- "rewards/accuracies": 0.90625,
426
- "rewards/chosen": 3.1835665702819824,
427
- "rewards/margins": 5.6915483474731445,
428
- "rewards/rejected": -2.507981777191162,
429
  "step": 270
430
  },
431
  {
432
- "epoch": 0.7165706973768394,
433
- "grad_norm": 272.0486827380912,
434
  "learning_rate": 6.701363389420295e-08,
435
- "logits/chosen": -2.643656015396118,
436
- "logits/rejected": -2.585157871246338,
437
- "logps/chosen": -267.4632568359375,
438
- "logps/rejected": -216.541015625,
439
- "loss": 0.2361,
440
- "rewards/accuracies": 0.887499988079071,
441
- "rewards/chosen": 3.1215426921844482,
442
- "rewards/margins": 4.919692039489746,
443
- "rewards/rejected": -1.7981488704681396,
444
  "step": 280
445
  },
446
  {
447
- "epoch": 0.7421625079974408,
448
- "grad_norm": 328.48611808044626,
449
  "learning_rate": 5.617693401310837e-08,
450
- "logits/chosen": -2.6063172817230225,
451
- "logits/rejected": -2.5578410625457764,
452
- "logps/chosen": -265.97723388671875,
453
- "logps/rejected": -214.73867797851562,
454
- "loss": 0.234,
455
- "rewards/accuracies": 0.893750011920929,
456
- "rewards/chosen": 2.9844164848327637,
457
- "rewards/margins": 4.7947001457214355,
458
- "rewards/rejected": -1.810283899307251,
459
  "step": 290
460
  },
461
  {
462
- "epoch": 0.7677543186180422,
463
- "grad_norm": 485.80687185291606,
464
  "learning_rate": 4.609134697356009e-08,
465
- "logits/chosen": -2.6260290145874023,
466
- "logits/rejected": -2.570155620574951,
467
- "logps/chosen": -272.4629211425781,
468
- "logps/rejected": -218.36135864257812,
469
- "loss": 0.2579,
470
- "rewards/accuracies": 0.887499988079071,
471
- "rewards/chosen": 3.055912971496582,
472
- "rewards/margins": 4.725813388824463,
473
- "rewards/rejected": -1.66990065574646,
474
  "step": 300
475
  },
476
  {
477
- "epoch": 0.7933461292386437,
478
- "grad_norm": 242.33931597612025,
479
  "learning_rate": 3.683761426338148e-08,
480
- "logits/chosen": -2.5885746479034424,
481
- "logits/rejected": -2.5180420875549316,
482
- "logps/chosen": -275.88812255859375,
483
- "logps/rejected": -213.2559814453125,
484
- "loss": 0.2684,
485
- "rewards/accuracies": 0.8500000238418579,
486
- "rewards/chosen": 3.1929287910461426,
487
- "rewards/margins": 5.026190757751465,
488
- "rewards/rejected": -1.8332622051239014,
489
  "step": 310
490
  },
491
  {
492
- "epoch": 0.818937939859245,
493
- "grad_norm": 363.0806683357755,
494
  "learning_rate": 2.8489817851625024e-08,
495
- "logits/chosen": -2.610605001449585,
496
- "logits/rejected": -2.562117099761963,
497
- "logps/chosen": -259.27264404296875,
498
- "logps/rejected": -202.4815673828125,
499
- "loss": 0.2424,
500
- "rewards/accuracies": 0.887499988079071,
501
- "rewards/chosen": 2.8195748329162598,
502
- "rewards/margins": 4.644892692565918,
503
- "rewards/rejected": -1.8253180980682373,
504
  "step": 320
505
  },
506
  {
507
- "epoch": 0.8445297504798465,
508
- "grad_norm": 367.20877637580975,
509
  "learning_rate": 2.1114787115667477e-08,
510
- "logits/chosen": -2.633531332015991,
511
- "logits/rejected": -2.5729198455810547,
512
- "logps/chosen": -262.47796630859375,
513
- "logps/rejected": -215.085205078125,
514
- "loss": 0.2445,
515
- "rewards/accuracies": 0.9125000238418579,
516
- "rewards/chosen": 3.610386610031128,
517
- "rewards/margins": 4.788306713104248,
518
- "rewards/rejected": -1.177919864654541,
519
  "step": 330
520
  },
521
  {
522
- "epoch": 0.8701215611004478,
523
- "grad_norm": 293.8708439770587,
524
  "learning_rate": 1.4771563829877598e-08,
525
- "logits/chosen": -2.6198182106018066,
526
- "logits/rejected": -2.561091661453247,
527
- "logps/chosen": -258.06500244140625,
528
- "logps/rejected": -208.5477752685547,
529
- "loss": 0.2419,
530
- "rewards/accuracies": 0.9437500238418579,
531
- "rewards/chosen": 3.373103380203247,
532
- "rewards/margins": 5.2115888595581055,
533
- "rewards/rejected": -1.8384857177734375,
534
  "step": 340
535
  },
536
  {
537
- "epoch": 0.8957133717210493,
538
- "grad_norm": 723.7045832401957,
539
  "learning_rate": 9.510929498959268e-09,
540
- "logits/chosen": -2.6300716400146484,
541
- "logits/rejected": -2.58100962638855,
542
- "logps/chosen": -267.67926025390625,
543
- "logps/rejected": -224.3400421142578,
544
- "loss": 0.2628,
545
- "rewards/accuracies": 0.8812500238418579,
546
- "rewards/chosen": 3.285783290863037,
547
- "rewards/margins": 4.712864398956299,
548
- "rewards/rejected": -1.4270811080932617,
549
  "step": 350
550
  },
551
  {
552
- "epoch": 0.9213051823416507,
553
- "grad_norm": 345.1677659124253,
554
  "learning_rate": 5.374998819965654e-09,
555
- "logits/chosen": -2.6322882175445557,
556
- "logits/rejected": -2.561767101287842,
557
- "logps/chosen": -272.7427673339844,
558
- "logps/rejected": -216.9067840576172,
559
- "loss": 0.291,
560
- "rewards/accuracies": 0.893750011920929,
561
- "rewards/chosen": 3.2370834350585938,
562
- "rewards/margins": 4.826039791107178,
563
- "rewards/rejected": -1.5889561176300049,
564
  "step": 360
565
  },
566
  {
567
- "epoch": 0.946896992962252,
568
- "grad_norm": 308.7073524078868,
569
  "learning_rate": 2.396882527576477e-09,
570
- "logits/chosen": -2.5925230979919434,
571
- "logits/rejected": -2.5545706748962402,
572
- "logps/chosen": -271.9848327636719,
573
- "logps/rejected": -226.7042999267578,
574
- "loss": 0.2664,
575
- "rewards/accuracies": 0.875,
576
- "rewards/chosen": 3.643012523651123,
577
- "rewards/margins": 4.708712577819824,
578
- "rewards/rejected": -1.065699815750122,
579
  "step": 370
580
  },
581
  {
582
- "epoch": 0.9724888035828535,
583
- "grad_norm": 182.6217744497864,
584
  "learning_rate": 6.004223217757509e-10,
585
- "logits/chosen": -2.6407010555267334,
586
- "logits/rejected": -2.5988292694091797,
587
- "logps/chosen": -261.07269287109375,
588
- "logps/rejected": -228.239013671875,
589
- "loss": 0.2581,
590
- "rewards/accuracies": 0.8687499761581421,
591
- "rewards/chosen": 3.656334638595581,
592
- "rewards/margins": 4.874091148376465,
593
- "rewards/rejected": -1.2177565097808838,
594
  "step": 380
595
  },
596
  {
597
- "epoch": 0.9980806142034548,
598
- "grad_norm": 472.83951094562434,
599
  "learning_rate": 0.0,
600
- "logits/chosen": -2.653154134750366,
601
- "logits/rejected": -2.5895044803619385,
602
- "logps/chosen": -248.54092407226562,
603
- "logps/rejected": -207.9895477294922,
604
- "loss": 0.2561,
605
- "rewards/accuracies": 0.8999999761581421,
606
- "rewards/chosen": 3.2267730236053467,
607
- "rewards/margins": 4.816933631896973,
608
- "rewards/rejected": -1.5901600122451782,
609
  "step": 390
610
  },
611
  {
612
- "epoch": 0.9980806142034548,
613
  "step": 390,
614
  "total_flos": 0.0,
615
- "train_loss": 0.29645214692140237,
616
- "train_runtime": 6440.0874,
617
- "train_samples_per_second": 7.764,
618
- "train_steps_per_second": 0.061
619
  }
620
  ],
621
  "logging_steps": 10,
@@ -623,18 +623,6 @@
623
  "num_input_tokens_seen": 0,
624
  "num_train_epochs": 1,
625
  "save_steps": 100,
626
- "stateful_callbacks": {
627
- "TrainerControl": {
628
- "args": {
629
- "should_epoch_stop": false,
630
- "should_evaluate": false,
631
- "should_log": false,
632
- "should_save": true,
633
- "should_training_stop": false
634
- },
635
- "attributes": {}
636
- }
637
- },
638
  "total_flos": 0.0,
639
  "train_batch_size": 4,
640
  "trial_name": null,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "grad_norm": 747.0977926336889,
14
  "learning_rate": 7.692307692307691e-09,
15
+ "logits/chosen": -2.5617921352386475,
16
+ "logits/rejected": -2.415619373321533,
17
  "logps/chosen": -258.1644592285156,
18
+ "logps/rejected": -191.65736389160156,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.03,
28
+ "grad_norm": 746.141595669296,
29
  "learning_rate": 7.692307692307691e-08,
30
+ "logits/chosen": -2.6110925674438477,
31
+ "logits/rejected": -2.524423122406006,
32
+ "logps/chosen": -267.3368225097656,
33
+ "logps/rejected": -198.19520568847656,
34
+ "loss": 0.6963,
35
+ "rewards/accuracies": 0.3888888955116272,
36
+ "rewards/chosen": -0.02406422607600689,
37
+ "rewards/margins": -0.021091409027576447,
38
+ "rewards/rejected": -0.0029728179797530174,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.05,
43
+ "grad_norm": 433.45696577907285,
44
  "learning_rate": 1.5384615384615382e-07,
45
+ "logits/chosen": -2.628641128540039,
46
+ "logits/rejected": -2.5271899700164795,
47
+ "logps/chosen": -260.9211120605469,
48
+ "logps/rejected": -198.38711547851562,
49
+ "loss": 0.5697,
50
+ "rewards/accuracies": 0.768750011920929,
51
+ "rewards/chosen": 0.20215623080730438,
52
+ "rewards/margins": 0.35489505529403687,
53
+ "rewards/rejected": -0.1527387946844101,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.08,
58
+ "grad_norm": 466.53027664588734,
59
  "learning_rate": 2.3076923076923078e-07,
60
+ "logits/chosen": -2.6438043117523193,
61
+ "logits/rejected": -2.5429482460021973,
62
+ "logps/chosen": -252.55557250976562,
63
+ "logps/rejected": -195.11138916015625,
64
+ "loss": 0.306,
65
+ "rewards/accuracies": 0.90625,
66
+ "rewards/chosen": 1.301897406578064,
67
+ "rewards/margins": 1.880059838294983,
68
+ "rewards/rejected": -0.5781622529029846,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.1,
73
+ "grad_norm": 451.3374046706128,
74
  "learning_rate": 2.999939918069778e-07,
75
+ "logits/chosen": -2.6604888439178467,
76
+ "logits/rejected": -2.5366878509521484,
77
+ "logps/chosen": -244.97402954101562,
78
+ "logps/rejected": -199.42355346679688,
79
+ "loss": 0.2052,
80
+ "rewards/accuracies": 0.875,
81
+ "rewards/chosen": 2.6914100646972656,
82
+ "rewards/margins": 3.822247266769409,
83
+ "rewards/rejected": -1.130837321281433,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.13,
88
+ "grad_norm": 268.00932309883206,
89
  "learning_rate": 2.9927359084964875e-07,
90
+ "logits/chosen": -2.7027461528778076,
91
+ "logits/rejected": -2.5904927253723145,
92
+ "logps/chosen": -258.02886962890625,
93
+ "logps/rejected": -204.94631958007812,
94
+ "loss": 0.2326,
95
+ "rewards/accuracies": 0.887499988079071,
96
+ "rewards/chosen": 4.602013111114502,
97
+ "rewards/margins": 5.5924835205078125,
98
+ "rewards/rejected": -0.9904701113700867,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.15,
103
+ "grad_norm": 340.360656955259,
104
  "learning_rate": 2.9735816061234966e-07,
105
+ "logits/chosen": -2.6768908500671387,
106
+ "logits/rejected": -2.557954788208008,
107
+ "logps/chosen": -259.3023681640625,
108
+ "logps/rejected": -192.10040283203125,
109
+ "loss": 0.1987,
110
+ "rewards/accuracies": 0.9312499761581421,
111
+ "rewards/chosen": 4.70054817199707,
112
+ "rewards/margins": 6.1066999435424805,
113
+ "rewards/rejected": -1.4061520099639893,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.18,
118
+ "grad_norm": 344.3937897868754,
119
  "learning_rate": 2.942630353226844e-07,
120
+ "logits/chosen": -2.650172472000122,
121
+ "logits/rejected": -2.500756025314331,
122
+ "logps/chosen": -288.3818054199219,
123
+ "logps/rejected": -222.72811889648438,
124
+ "loss": 0.1904,
125
+ "rewards/accuracies": 0.918749988079071,
126
+ "rewards/chosen": 4.768267631530762,
127
+ "rewards/margins": 7.4695234298706055,
128
+ "rewards/rejected": -2.7012553215026855,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.2,
133
+ "grad_norm": 453.28474416843744,
134
  "learning_rate": 2.900129934114876e-07,
135
+ "logits/chosen": -2.5939252376556396,
136
+ "logits/rejected": -2.458728790283203,
137
+ "logps/chosen": -260.589111328125,
138
+ "logps/rejected": -212.63925170898438,
139
+ "loss": 0.1791,
140
+ "rewards/accuracies": 0.9312499761581421,
141
+ "rewards/chosen": 3.8766517639160156,
142
+ "rewards/margins": 7.434275150299072,
143
+ "rewards/rejected": -3.5576236248016357,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.23,
148
+ "grad_norm": 598.8098573854415,
149
  "learning_rate": 2.8464205914585213e-07,
150
+ "logits/chosen": -2.6328907012939453,
151
+ "logits/rejected": -2.507690906524658,
152
+ "logps/chosen": -253.75570678710938,
153
+ "logps/rejected": -193.23147583007812,
154
+ "loss": 0.19,
155
+ "rewards/accuracies": 0.90625,
156
+ "rewards/chosen": 3.2832133769989014,
157
+ "rewards/margins": 6.802037715911865,
158
+ "rewards/rejected": -3.518825054168701,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.26,
163
+ "grad_norm": 242.78091859451033,
164
  "learning_rate": 2.78193230243403e-07,
165
+ "logits/chosen": -2.6586058139801025,
166
+ "logits/rejected": -2.52885103225708,
167
+ "logps/chosen": -234.36068725585938,
168
+ "logps/rejected": -197.55322265625,
169
+ "loss": 0.1766,
170
+ "rewards/accuracies": 0.9125000238418579,
171
+ "rewards/chosen": 4.212424278259277,
172
+ "rewards/margins": 7.145503997802734,
173
+ "rewards/rejected": -2.933079242706299,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.28,
178
+ "grad_norm": 288.7802754186709,
179
  "learning_rate": 2.707181336484383e-07,
180
+ "logits/chosen": -2.6430606842041016,
181
+ "logits/rejected": -2.5142664909362793,
182
+ "logps/chosen": -253.27975463867188,
183
+ "logps/rejected": -192.4219207763672,
184
+ "loss": 0.1616,
185
+ "rewards/accuracies": 0.925000011920929,
186
+ "rewards/chosen": 5.699277400970459,
187
+ "rewards/margins": 7.844499111175537,
188
+ "rewards/rejected": -2.1452219486236572,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.31,
193
+ "grad_norm": 242.33756453841693,
194
  "learning_rate": 2.622766122256652e-07,
195
+ "logits/chosen": -2.6139981746673584,
196
+ "logits/rejected": -2.516448497772217,
197
+ "logps/chosen": -256.9673767089844,
198
+ "logps/rejected": -211.84988403320312,
199
+ "loss": 0.1786,
200
+ "rewards/accuracies": 0.9437500238418579,
201
+ "rewards/chosen": 5.386040210723877,
202
+ "rewards/margins": 8.150399208068848,
203
+ "rewards/rejected": -2.7643585205078125,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.33,
208
+ "grad_norm": 181.17712779023478,
209
  "learning_rate": 2.5293624568031005e-07,
210
+ "logits/chosen": -2.5844571590423584,
211
+ "logits/rejected": -2.4461381435394287,
212
+ "logps/chosen": -248.85513305664062,
213
+ "logps/rejected": -187.93716430664062,
214
+ "loss": 0.172,
215
+ "rewards/accuracies": 0.918749988079071,
216
+ "rewards/chosen": 4.1653828620910645,
217
+ "rewards/margins": 7.870238304138184,
218
+ "rewards/rejected": -3.7048561573028564,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.36,
223
+ "grad_norm": 386.4455316752931,
224
  "learning_rate": 2.4277180953993823e-07,
225
+ "logits/chosen": -2.6205365657806396,
226
+ "logits/rejected": -2.5077974796295166,
227
+ "logps/chosen": -269.39251708984375,
228
+ "logps/rejected": -207.6556854248047,
229
+ "loss": 0.2112,
230
+ "rewards/accuracies": 0.9312499761581421,
231
+ "rewards/chosen": 3.9067764282226562,
232
+ "rewards/margins": 7.915855407714844,
233
+ "rewards/rejected": -4.0090789794921875,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.38,
238
+ "grad_norm": 264.0646768310831,
239
  "learning_rate": 2.3186467652917566e-07,
240
+ "logits/chosen": -2.607632875442505,
241
+ "logits/rejected": -2.5044326782226562,
242
+ "logps/chosen": -261.9036560058594,
243
+ "logps/rejected": -212.12777709960938,
244
+ "loss": 0.2286,
245
+ "rewards/accuracies": 0.918749988079071,
246
+ "rewards/chosen": 4.435807704925537,
247
+ "rewards/margins": 8.641576766967773,
248
+ "rewards/rejected": -4.2057695388793945,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.41,
253
+ "grad_norm": 571.6419774908388,
254
  "learning_rate": 2.2030216512970552e-07,
255
+ "logits/chosen": -2.5919671058654785,
256
+ "logits/rejected": -2.48041033744812,
257
+ "logps/chosen": -240.66891479492188,
258
+ "logps/rejected": -183.67489624023438,
259
+ "loss": 0.2024,
260
+ "rewards/accuracies": 0.956250011920929,
261
+ "rewards/chosen": 4.69089412689209,
262
+ "rewards/margins": 7.830643653869629,
263
+ "rewards/rejected": -3.139749526977539,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 0.44,
268
+ "grad_norm": 351.0335983478419,
269
  "learning_rate": 2.0817684054072823e-07,
270
+ "logits/chosen": -2.594038724899292,
271
+ "logits/rejected": -2.479696273803711,
272
+ "logps/chosen": -259.5168151855469,
273
+ "logps/rejected": -199.40609741210938,
274
+ "loss": 0.1492,
275
+ "rewards/accuracies": 0.9375,
276
+ "rewards/chosen": 4.83851432800293,
277
+ "rewards/margins": 7.966272830963135,
278
+ "rewards/rejected": -3.127758741378784,
279
  "step": 170
280
  },
281
  {
282
+ "epoch": 0.46,
283
+ "grad_norm": 332.9543090868915,
284
  "learning_rate": 1.9558577363613703e-07,
285
+ "logits/chosen": -2.643381357192993,
286
+ "logits/rejected": -2.5339646339416504,
287
+ "logps/chosen": -235.37637329101562,
288
+ "logps/rejected": -190.86380004882812,
289
+ "loss": 0.1789,
290
+ "rewards/accuracies": 0.9437500238418579,
291
+ "rewards/chosen": 3.2347798347473145,
292
+ "rewards/margins": 7.21783971786499,
293
+ "rewards/rejected": -3.983060359954834,
294
  "step": 180
295
  },
296
  {
297
+ "epoch": 0.49,
298
+ "grad_norm": 413.79495264463003,
299
  "learning_rate": 1.826297638509251e-07,
300
+ "logits/chosen": -2.595829725265503,
301
+ "logits/rejected": -2.500739812850952,
302
+ "logps/chosen": -256.819580078125,
303
+ "logps/rejected": -206.32376098632812,
304
+ "loss": 0.179,
305
+ "rewards/accuracies": 0.918749988079071,
306
+ "rewards/chosen": 3.607396364212036,
307
+ "rewards/margins": 7.752522945404053,
308
+ "rewards/rejected": -4.145126819610596,
309
  "step": 190
310
  },
311
  {
312
+ "epoch": 0.51,
313
+ "grad_norm": 458.13511698305706,
314
  "learning_rate": 1.694125322181083e-07,
315
+ "logits/chosen": -2.63800311088562,
316
+ "logits/rejected": -2.5028953552246094,
317
+ "logps/chosen": -269.77081298828125,
318
+ "logps/rejected": -201.87176513671875,
319
+ "loss": 0.1642,
320
+ "rewards/accuracies": 0.949999988079071,
321
+ "rewards/chosen": 5.391812801361084,
322
+ "rewards/margins": 9.271406173706055,
323
+ "rewards/rejected": -3.8795933723449707,
324
  "step": 200
325
  },
326
  {
327
+ "epoch": 0.54,
328
+ "grad_norm": 449.95115191482023,
329
  "learning_rate": 1.5603989101641228e-07,
330
+ "logits/chosen": -2.620668888092041,
331
+ "logits/rejected": -2.5066120624542236,
332
+ "logps/chosen": -262.0960693359375,
333
+ "logps/rejected": -210.2143096923828,
334
+ "loss": 0.158,
335
+ "rewards/accuracies": 0.9375,
336
+ "rewards/chosen": 4.4803595542907715,
337
+ "rewards/margins": 8.733253479003906,
338
+ "rewards/rejected": -4.252893924713135,
339
  "step": 210
340
  },
341
  {
342
+ "epoch": 0.56,
343
+ "grad_norm": 386.8925888622673,
344
  "learning_rate": 1.4261889667621828e-07,
345
+ "logits/chosen": -2.623037815093994,
346
+ "logits/rejected": -2.5237972736358643,
347
+ "logps/chosen": -254.78390502929688,
348
+ "logps/rejected": -206.7708282470703,
349
+ "loss": 0.2113,
350
+ "rewards/accuracies": 0.925000011920929,
351
+ "rewards/chosen": 3.867889881134033,
352
+ "rewards/margins": 8.484498023986816,
353
+ "rewards/rejected": -4.616608142852783,
354
  "step": 220
355
  },
356
  {
357
+ "epoch": 0.59,
358
+ "grad_norm": 264.4396878771059,
359
  "learning_rate": 1.2925699272529007e-07,
360
+ "logits/chosen": -2.6517319679260254,
361
+ "logits/rejected": -2.5196144580841064,
362
+ "logps/chosen": -263.8755798339844,
363
+ "logps/rejected": -206.9174346923828,
364
+ "loss": 0.1518,
365
+ "rewards/accuracies": 0.8999999761581421,
366
+ "rewards/chosen": 4.761946201324463,
367
+ "rewards/margins": 8.56539249420166,
368
+ "rewards/rejected": -3.8034462928771973,
369
  "step": 230
370
  },
371
  {
372
+ "epoch": 0.61,
373
+ "grad_norm": 419.86487391831014,
374
  "learning_rate": 1.160611496355417e-07,
375
+ "logits/chosen": -2.6276602745056152,
376
+ "logits/rejected": -2.5167899131774902,
377
+ "logps/chosen": -257.7350769042969,
378
+ "logps/rejected": -210.53292846679688,
379
+ "loss": 0.1539,
380
+ "rewards/accuracies": 0.9375,
381
+ "rewards/chosen": 4.4489850997924805,
382
+ "rewards/margins": 9.010086059570312,
383
+ "rewards/rejected": -4.561100482940674,
384
  "step": 240
385
  },
386
  {
387
+ "epoch": 0.64,
388
+ "grad_norm": 233.18563601164297,
389
  "learning_rate": 1.0313700845691635e-07,
390
+ "logits/chosen": -2.6395657062530518,
391
+ "logits/rejected": -2.5249786376953125,
392
+ "logps/chosen": -263.78375244140625,
393
+ "logps/rejected": -216.53323364257812,
394
+ "loss": 0.1755,
395
+ "rewards/accuracies": 0.949999988079071,
396
+ "rewards/chosen": 4.129330158233643,
397
+ "rewards/margins": 8.881619453430176,
398
+ "rewards/rejected": -4.752288341522217,
399
  "step": 250
400
  },
401
  {
402
+ "epoch": 0.67,
403
+ "grad_norm": 301.32208500782895,
404
  "learning_rate": 9.058803509412647e-08,
405
+ "logits/chosen": -2.63856840133667,
406
+ "logits/rejected": -2.5386815071105957,
407
+ "logps/chosen": -257.0624084472656,
408
+ "logps/rejected": -209.1647186279297,
409
+ "loss": 0.1345,
410
+ "rewards/accuracies": 0.949999988079071,
411
+ "rewards/chosen": 4.407042503356934,
412
+ "rewards/margins": 8.488649368286133,
413
+ "rewards/rejected": -4.081605434417725,
414
  "step": 260
415
  },
416
  {
417
+ "epoch": 0.69,
418
+ "grad_norm": 262.72645974554706,
419
  "learning_rate": 7.851469199680381e-08,
420
+ "logits/chosen": -2.6127829551696777,
421
+ "logits/rejected": -2.4841771125793457,
422
+ "logps/chosen": -268.802490234375,
423
+ "logps/rejected": -217.36740112304688,
424
+ "loss": 0.1877,
425
+ "rewards/accuracies": 0.9375,
426
+ "rewards/chosen": 4.018976211547852,
427
+ "rewards/margins": 9.022808074951172,
428
+ "rewards/rejected": -5.00383186340332,
429
  "step": 270
430
  },
431
  {
432
+ "epoch": 0.72,
433
+ "grad_norm": 216.84234596530857,
434
  "learning_rate": 6.701363389420295e-08,
435
+ "logits/chosen": -2.649754762649536,
436
+ "logits/rejected": -2.5276429653167725,
437
+ "logps/chosen": -265.546875,
438
+ "logps/rejected": -207.27426147460938,
439
+ "loss": 0.1499,
440
+ "rewards/accuracies": 0.9312499761581421,
441
+ "rewards/chosen": 4.076898574829102,
442
+ "rewards/margins": 9.239118576049805,
443
+ "rewards/rejected": -5.1622209548950195,
444
  "step": 280
445
  },
446
  {
447
+ "epoch": 0.74,
448
+ "grad_norm": 267.2442999173217,
449
  "learning_rate": 5.617693401310837e-08,
450
+ "logits/chosen": -2.6028785705566406,
451
+ "logits/rejected": -2.526676654815674,
452
+ "logps/chosen": -266.23406982421875,
453
+ "logps/rejected": -225.2560577392578,
454
+ "loss": 0.1785,
455
+ "rewards/accuracies": 0.9437500238418579,
456
+ "rewards/chosen": 3.791701555252075,
457
+ "rewards/margins": 7.995067596435547,
458
+ "rewards/rejected": -4.203365802764893,
459
  "step": 290
460
  },
461
  {
462
+ "epoch": 0.77,
463
+ "grad_norm": 304.40926278591246,
464
  "learning_rate": 4.609134697356009e-08,
465
+ "logits/chosen": -2.6325907707214355,
466
+ "logits/rejected": -2.513869524002075,
467
+ "logps/chosen": -270.08087158203125,
468
+ "logps/rejected": -216.07247924804688,
469
+ "loss": 0.1539,
470
+ "rewards/accuracies": 0.956250011920929,
471
+ "rewards/chosen": 4.105973720550537,
472
+ "rewards/margins": 9.096407890319824,
473
+ "rewards/rejected": -4.990433692932129,
474
  "step": 300
475
  },
476
  {
477
+ "epoch": 0.79,
478
+ "grad_norm": 183.46140347071307,
479
  "learning_rate": 3.683761426338148e-08,
480
+ "logits/chosen": -2.5910542011260986,
481
+ "logits/rejected": -2.4951541423797607,
482
+ "logps/chosen": -275.2190246582031,
483
+ "logps/rejected": -210.9887237548828,
484
+ "loss": 0.1539,
485
+ "rewards/accuracies": 0.925000011920929,
486
+ "rewards/chosen": 4.16552209854126,
487
+ "rewards/margins": 8.900744438171387,
488
+ "rewards/rejected": -4.735221862792969,
489
  "step": 310
490
  },
491
  {
492
+ "epoch": 0.82,
493
+ "grad_norm": 336.8108974655118,
494
  "learning_rate": 2.8489817851625024e-08,
495
+ "logits/chosen": -2.6144814491271973,
496
+ "logits/rejected": -2.5162534713745117,
497
+ "logps/chosen": -258.4134826660156,
498
+ "logps/rejected": -210.2257080078125,
499
+ "loss": 0.1866,
500
+ "rewards/accuracies": 0.875,
501
+ "rewards/chosen": 3.252080202102661,
502
+ "rewards/margins": 8.461040496826172,
503
+ "rewards/rejected": -5.20896053314209,
504
  "step": 320
505
  },
506
  {
507
+ "epoch": 0.84,
508
+ "grad_norm": 222.30131769249033,
509
  "learning_rate": 2.1114787115667477e-08,
510
+ "logits/chosen": -2.634732484817505,
511
+ "logits/rejected": -2.5385124683380127,
512
+ "logps/chosen": -260.74530029296875,
513
+ "logps/rejected": -209.8925323486328,
514
+ "loss": 0.1579,
515
+ "rewards/accuracies": 0.981249988079071,
516
+ "rewards/chosen": 4.701260566711426,
517
+ "rewards/margins": 9.207574844360352,
518
+ "rewards/rejected": -4.506315231323242,
519
  "step": 330
520
  },
521
  {
522
+ "epoch": 0.87,
523
+ "grad_norm": 298.91033859930906,
524
  "learning_rate": 1.4771563829877598e-08,
525
+ "logits/chosen": -2.620940685272217,
526
+ "logits/rejected": -2.5207433700561523,
527
+ "logps/chosen": -256.2254638671875,
528
+ "logps/rejected": -197.62445068359375,
529
+ "loss": 0.1936,
530
+ "rewards/accuracies": 0.956250011920929,
531
+ "rewards/chosen": 4.292969703674316,
532
+ "rewards/margins": 9.223516464233398,
533
+ "rewards/rejected": -4.930546760559082,
534
  "step": 340
535
  },
536
  {
537
+ "epoch": 0.9,
538
+ "grad_norm": 322.2905254048829,
539
  "learning_rate": 9.510929498959268e-09,
540
+ "logits/chosen": -2.636793851852417,
541
+ "logits/rejected": -2.522016763687134,
542
+ "logps/chosen": -265.2098388671875,
543
+ "logps/rejected": -215.28469848632812,
544
+ "loss": 0.1724,
545
+ "rewards/accuracies": 0.918749988079071,
546
+ "rewards/chosen": 3.8617148399353027,
547
+ "rewards/margins": 8.617898941040039,
548
+ "rewards/rejected": -4.7561845779418945,
549
  "step": 350
550
  },
551
  {
552
+ "epoch": 0.92,
553
+ "grad_norm": 389.7582997593581,
554
  "learning_rate": 5.374998819965654e-09,
555
+ "logits/chosen": -2.6430556774139404,
556
+ "logits/rejected": -2.5382204055786133,
557
+ "logps/chosen": -270.87615966796875,
558
+ "logps/rejected": -212.25949096679688,
559
+ "loss": 0.1727,
560
+ "rewards/accuracies": 0.9437500238418579,
561
+ "rewards/chosen": 4.255101680755615,
562
+ "rewards/margins": 8.951577186584473,
563
+ "rewards/rejected": -4.696475028991699,
564
  "step": 360
565
  },
566
  {
567
+ "epoch": 0.95,
568
+ "grad_norm": 224.62737396153923,
569
  "learning_rate": 2.396882527576477e-09,
570
+ "logits/chosen": -2.5986270904541016,
571
+ "logits/rejected": -2.492842197418213,
572
+ "logps/chosen": -269.3092346191406,
573
+ "logps/rejected": -214.992919921875,
574
+ "loss": 0.1462,
575
+ "rewards/accuracies": 0.9312499761581421,
576
+ "rewards/chosen": 4.158980846405029,
577
+ "rewards/margins": 8.108312606811523,
578
+ "rewards/rejected": -3.9493324756622314,
579
  "step": 370
580
  },
581
  {
582
+ "epoch": 0.97,
583
+ "grad_norm": 174.48647090969504,
584
  "learning_rate": 6.004223217757509e-10,
585
+ "logits/chosen": -2.653160572052002,
586
+ "logits/rejected": -2.5599982738494873,
587
+ "logps/chosen": -260.55279541015625,
588
+ "logps/rejected": -219.11105346679688,
589
+ "loss": 0.1559,
590
+ "rewards/accuracies": 0.925000011920929,
591
+ "rewards/chosen": 4.291203498840332,
592
+ "rewards/margins": 8.267863273620605,
593
+ "rewards/rejected": -3.9766602516174316,
594
  "step": 380
595
  },
596
  {
597
+ "epoch": 1.0,
598
+ "grad_norm": 384.8395002541089,
599
  "learning_rate": 0.0,
600
+ "logits/chosen": -2.6584863662719727,
601
+ "logits/rejected": -2.5489349365234375,
602
+ "logps/chosen": -248.6810302734375,
603
+ "logps/rejected": -207.1171875,
604
+ "loss": 0.1633,
605
+ "rewards/accuracies": 0.9375,
606
+ "rewards/chosen": 3.9972071647644043,
607
+ "rewards/margins": 9.053262710571289,
608
+ "rewards/rejected": -5.056054592132568,
609
  "step": 390
610
  },
611
  {
612
+ "epoch": 1.0,
613
  "step": 390,
614
  "total_flos": 0.0,
615
+ "train_loss": 0.20326648155848184,
616
+ "train_runtime": 5896.1189,
617
+ "train_samples_per_second": 8.48,
618
+ "train_steps_per_second": 0.066
619
  }
620
  ],
621
  "logging_steps": 10,
 
623
  "num_input_tokens_seen": 0,
624
  "num_train_epochs": 1,
625
  "save_steps": 100,
 
 
 
 
 
 
 
 
 
 
 
 
626
  "total_flos": 0.0,
627
  "train_batch_size": 4,
628
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b7346ad29a6b9b0903d845abfe58994ac8a80348e425e0e321ee83abcfcb035e
-size 6264
+oid sha256:7d90343e793e5916c6afac01760eb7b5a30707ec90a3ec177dfdeda931df024a
+size 6328