File size: 23,253 Bytes
0b4ce1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 500,
  "global_step": 468,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 1.0638297872340426e-07,
      "logits/chosen": 0.18604117631912231,
      "logits/rejected": 0.34631967544555664,
      "logps/chosen": -460.0769348144531,
      "logps/rejected": -351.57135009765625,
      "loss": 0.3612,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.02,
      "learning_rate": 1.0638297872340427e-06,
      "logits/chosen": 0.05747946724295616,
      "logits/rejected": 0.16539901494979858,
      "logps/chosen": -330.9287414550781,
      "logps/rejected": -328.71575927734375,
      "loss": 0.3819,
      "rewards/accuracies": 0.3888888955116272,
      "rewards/chosen": 2.3913149561849423e-05,
      "rewards/margins": -1.7764228687155992e-05,
      "rewards/rejected": 4.167737643001601e-05,
      "step": 10
    },
    {
      "epoch": 0.04,
      "learning_rate": 2.1276595744680853e-06,
      "logits/chosen": 0.15373219549655914,
      "logits/rejected": 0.19493858516216278,
      "logps/chosen": -325.29803466796875,
      "logps/rejected": -315.1011047363281,
      "loss": 0.3718,
      "rewards/accuracies": 0.4312500059604645,
      "rewards/chosen": -1.974666338355746e-05,
      "rewards/margins": 1.2878153654583002e-07,
      "rewards/rejected": -1.9875456928275526e-05,
      "step": 20
    },
    {
      "epoch": 0.06,
      "learning_rate": 3.191489361702128e-06,
      "logits/chosen": 0.11452794075012207,
      "logits/rejected": 0.1842522919178009,
      "logps/chosen": -371.60943603515625,
      "logps/rejected": -327.19366455078125,
      "loss": 0.3676,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -5.7096302043646574e-05,
      "rewards/margins": 3.7131747376406565e-05,
      "rewards/rejected": -9.422805305803195e-05,
      "step": 30
    },
    {
      "epoch": 0.09,
      "learning_rate": 4.255319148936171e-06,
      "logits/chosen": 0.18893569707870483,
      "logits/rejected": 0.2084759920835495,
      "logps/chosen": -337.7607727050781,
      "logps/rejected": -353.72503662109375,
      "loss": 0.3678,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": 0.00010834180284291506,
      "rewards/margins": 0.00026063303812406957,
      "rewards/rejected": -0.0001522912352811545,
      "step": 40
    },
    {
      "epoch": 0.11,
      "learning_rate": 4.999373573764188e-06,
      "logits/chosen": 0.08207504451274872,
      "logits/rejected": 0.20786412060260773,
      "logps/chosen": -384.91656494140625,
      "logps/rejected": -347.6080627441406,
      "loss": 0.3741,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.0010863704374060035,
      "rewards/margins": 0.0014893051702529192,
      "rewards/rejected": -0.0004029346746392548,
      "step": 50
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.988245838331339e-06,
      "logits/chosen": 0.13469652831554413,
      "logits/rejected": 0.17153413593769073,
      "logps/chosen": -372.34124755859375,
      "logps/rejected": -329.98626708984375,
      "loss": 0.3646,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.0023965700529515743,
      "rewards/margins": 0.0026933744084089994,
      "rewards/rejected": -0.0002968042972497642,
      "step": 60
    },
    {
      "epoch": 0.15,
      "learning_rate": 4.963268819535228e-06,
      "logits/chosen": 0.15689334273338318,
      "logits/rejected": 0.20978419482707977,
      "logps/chosen": -364.29888916015625,
      "logps/rejected": -377.21087646484375,
      "loss": 0.3705,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": 0.0023818810004740953,
      "rewards/margins": 0.0072565278969705105,
      "rewards/rejected": -0.0048746466636657715,
      "step": 70
    },
    {
      "epoch": 0.17,
      "learning_rate": 4.9245815365216115e-06,
      "logits/chosen": 0.11595060676336288,
      "logits/rejected": 0.18608702719211578,
      "logps/chosen": -328.6256103515625,
      "logps/rejected": -365.93896484375,
      "loss": 0.3663,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.002104334533214569,
      "rewards/margins": 0.011472588405013084,
      "rewards/rejected": -0.013576922006905079,
      "step": 80
    },
    {
      "epoch": 0.19,
      "learning_rate": 4.872399318152594e-06,
      "logits/chosen": 0.08527339994907379,
      "logits/rejected": 0.11540959030389786,
      "logps/chosen": -352.0487060546875,
      "logps/rejected": -361.36578369140625,
      "loss": 0.3456,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.011672710999846458,
      "rewards/margins": 0.01901327446103096,
      "rewards/rejected": -0.03068598173558712,
      "step": 90
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.807012604511542e-06,
      "logits/chosen": 0.14592930674552917,
      "logits/rejected": 0.18039533495903015,
      "logps/chosen": -414.8818359375,
      "logps/rejected": -435.3639221191406,
      "loss": 0.34,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.04601982235908508,
      "rewards/margins": 0.034013133496046066,
      "rewards/rejected": -0.08003295958042145,
      "step": 100
    },
    {
      "epoch": 0.23,
      "learning_rate": 4.728785330347771e-06,
      "logits/chosen": 0.13395074009895325,
      "logits/rejected": 0.2120208740234375,
      "logps/chosen": -432.98486328125,
      "logps/rejected": -553.6598510742188,
      "loss": 0.305,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.11783289909362793,
      "rewards/margins": 0.08183668553829193,
      "rewards/rejected": -0.19966959953308105,
      "step": 110
    },
    {
      "epoch": 0.26,
      "learning_rate": 4.63815289945858e-06,
      "logits/chosen": 0.14807412028312683,
      "logits/rejected": 0.21360798180103302,
      "logps/chosen": -565.0419311523438,
      "logps/rejected": -595.3816528320312,
      "loss": 0.3434,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.18253007531166077,
      "rewards/margins": 0.060289014130830765,
      "rewards/rejected": -0.24281907081604004,
      "step": 120
    },
    {
      "epoch": 0.28,
      "learning_rate": 4.535619761282989e-06,
      "logits/chosen": 0.10818658024072647,
      "logits/rejected": 0.18194182217121124,
      "logps/chosen": -462.3949279785156,
      "logps/rejected": -513.8490600585938,
      "loss": 0.3173,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.1326686441898346,
      "rewards/margins": 0.0804656594991684,
      "rewards/rejected": -0.2131342887878418,
      "step": 130
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.42175660319555e-06,
      "logits/chosen": 0.13493295013904572,
      "logits/rejected": 0.22737479209899902,
      "logps/chosen": -532.4095458984375,
      "logps/rejected": -585.3455200195312,
      "loss": 0.3196,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.16629299521446228,
      "rewards/margins": 0.08414360880851746,
      "rewards/rejected": -0.25043657422065735,
      "step": 140
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.297197174127619e-06,
      "logits/chosen": 0.17478415369987488,
      "logits/rejected": 0.24990789592266083,
      "logps/chosen": -479.77862548828125,
      "logps/rejected": -553.7377319335938,
      "loss": 0.3207,
      "rewards/accuracies": 0.6187499761581421,
      "rewards/chosen": -0.14474426209926605,
      "rewards/margins": 0.09155549854040146,
      "rewards/rejected": -0.2362997829914093,
      "step": 150
    },
    {
      "epoch": 0.34,
      "learning_rate": 4.162634757195418e-06,
      "logits/chosen": 0.13402113318443298,
      "logits/rejected": 0.2551622688770294,
      "logps/chosen": -496.40081787109375,
      "logps/rejected": -558.84326171875,
      "loss": 0.3187,
      "rewards/accuracies": 0.65625,
      "rewards/chosen": -0.13809171319007874,
      "rewards/margins": 0.08086591213941574,
      "rewards/rejected": -0.21895763278007507,
      "step": 160
    },
    {
      "epoch": 0.36,
      "learning_rate": 4.018818310967843e-06,
      "logits/chosen": 0.12252243608236313,
      "logits/rejected": 0.16481925547122955,
      "logps/chosen": -484.56353759765625,
      "logps/rejected": -554.2274780273438,
      "loss": 0.3255,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.14209917187690735,
      "rewards/margins": 0.09574152529239655,
      "rewards/rejected": -0.2378406971693039,
      "step": 170
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.866548300851254e-06,
      "logits/chosen": 0.08216498792171478,
      "logits/rejected": 0.17952165007591248,
      "logps/chosen": -482.576171875,
      "logps/rejected": -579.781982421875,
      "loss": 0.3047,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": -0.14626096189022064,
      "rewards/margins": 0.08570893108844757,
      "rewards/rejected": -0.23196987807750702,
      "step": 180
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.706672243793271e-06,
      "logits/chosen": 0.07855963706970215,
      "logits/rejected": 0.07844971120357513,
      "logps/chosen": -468.69061279296875,
      "logps/rejected": -544.0549926757812,
      "loss": 0.2935,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.1354297697544098,
      "rewards/margins": 0.09144213050603867,
      "rewards/rejected": -0.22687189280986786,
      "step": 190
    },
    {
      "epoch": 0.43,
      "learning_rate": 3.5400799911032357e-06,
      "logits/chosen": 0.10545216500759125,
      "logits/rejected": 0.1882828176021576,
      "logps/chosen": -501.1815490722656,
      "logps/rejected": -601.7250366210938,
      "loss": 0.3041,
      "rewards/accuracies": 0.6312500238418579,
      "rewards/chosen": -0.15607957541942596,
      "rewards/margins": 0.10436417162418365,
      "rewards/rejected": -0.2604437470436096,
      "step": 200
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.3676987756445894e-06,
      "logits/chosen": 0.10487540811300278,
      "logits/rejected": 0.11818546056747437,
      "logps/chosen": -470.6344299316406,
      "logps/rejected": -565.8145751953125,
      "loss": 0.3148,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.16682696342468262,
      "rewards/margins": 0.09452913701534271,
      "rewards/rejected": -0.26135605573654175,
      "step": 210
    },
    {
      "epoch": 0.47,
      "learning_rate": 3.1904880509659397e-06,
      "logits/chosen": 0.13482534885406494,
      "logits/rejected": 0.20024776458740234,
      "logps/chosen": -510.10528564453125,
      "logps/rejected": -605.6468505859375,
      "loss": 0.3172,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.16119703650474548,
      "rewards/margins": 0.1006912812590599,
      "rewards/rejected": -0.261888325214386,
      "step": 220
    },
    {
      "epoch": 0.49,
      "learning_rate": 3.0094341510955697e-06,
      "logits/chosen": 0.10005593299865723,
      "logits/rejected": 0.1791614145040512,
      "logps/chosen": -532.1925048828125,
      "logps/rejected": -624.0726318359375,
      "loss": 0.3106,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -0.17168700695037842,
      "rewards/margins": 0.09216924011707306,
      "rewards/rejected": -0.26385626196861267,
      "step": 230
    },
    {
      "epoch": 0.51,
      "learning_rate": 2.825544800722376e-06,
      "logits/chosen": 0.10918021202087402,
      "logits/rejected": 0.18382051587104797,
      "logps/chosen": -508.88494873046875,
      "logps/rejected": -571.0011596679688,
      "loss": 0.309,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": -0.15795882046222687,
      "rewards/margins": 0.09128745645284653,
      "rewards/rejected": -0.2492462694644928,
      "step": 240
    },
    {
      "epoch": 0.53,
      "learning_rate": 2.639843506318899e-06,
      "logits/chosen": 0.12444597482681274,
      "logits/rejected": 0.16049379110336304,
      "logps/chosen": -509.65631103515625,
      "logps/rejected": -609.0398559570312,
      "loss": 0.2924,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.1560695320367813,
      "rewards/margins": 0.11202052980661392,
      "rewards/rejected": -0.26809003949165344,
      "step": 250
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.4533638594248094e-06,
      "logits/chosen": 0.11506851017475128,
      "logits/rejected": 0.1052849292755127,
      "logps/chosen": -535.1851806640625,
      "logps/rejected": -609.4058837890625,
      "loss": 0.3139,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.18579542636871338,
      "rewards/margins": 0.10177616029977798,
      "rewards/rejected": -0.28757157921791077,
      "step": 260
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.2671437837980943e-06,
      "logits/chosen": 0.11827238649129868,
      "logits/rejected": 0.12189098447561264,
      "logps/chosen": -542.7433471679688,
      "logps/rejected": -633.5802612304688,
      "loss": 0.2867,
      "rewards/accuracies": 0.606249988079071,
      "rewards/chosen": -0.19100908935070038,
      "rewards/margins": 0.09670811891555786,
      "rewards/rejected": -0.28771719336509705,
      "step": 270
    },
    {
      "epoch": 0.6,
      "learning_rate": 2.082219758453629e-06,
      "logits/chosen": 0.0904841274023056,
      "logits/rejected": 0.20232203602790833,
      "logps/chosen": -575.5374755859375,
      "logps/rejected": -672.4290161132812,
      "loss": 0.3105,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.2171885222196579,
      "rewards/margins": 0.10769243538379669,
      "rewards/rejected": -0.3248809278011322,
      "step": 280
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.899621048743019e-06,
      "logits/chosen": 0.07801838964223862,
      "logits/rejected": 0.16570156812667847,
      "logps/chosen": -556.6237182617188,
      "logps/rejected": -643.4578857421875,
      "loss": 0.3145,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.20063337683677673,
      "rewards/margins": 0.1107235923409462,
      "rewards/rejected": -0.31135696172714233,
      "step": 290
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.7203639775848423e-06,
      "logits/chosen": 0.07458348572254181,
      "logits/rejected": 0.08251482248306274,
      "logps/chosen": -524.4200439453125,
      "logps/rejected": -666.1168823242188,
      "loss": 0.2934,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -0.18233875930309296,
      "rewards/margins": 0.13505366444587708,
      "rewards/rejected": -0.3173924386501312,
      "step": 300
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.5454462687309445e-06,
      "logits/chosen": 0.06379405409097672,
      "logits/rejected": 0.14568018913269043,
      "logps/chosen": -504.7530822753906,
      "logps/rejected": -650.941650390625,
      "loss": 0.2814,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.17345736920833588,
      "rewards/margins": 0.1400579959154129,
      "rewards/rejected": -0.3135153353214264,
      "step": 310
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.3758414935535147e-06,
      "logits/chosen": 0.0773477703332901,
      "logits/rejected": 0.12101063877344131,
      "logps/chosen": -536.0679931640625,
      "logps/rejected": -659.2662353515625,
      "loss": 0.3014,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -0.18032298982143402,
      "rewards/margins": 0.11869337409734726,
      "rewards/rejected": -0.29901641607284546,
      "step": 320
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.2124936522614622e-06,
      "logits/chosen": 0.09862785786390305,
      "logits/rejected": 0.14109982550144196,
      "logps/chosen": -487.79071044921875,
      "logps/rejected": -610.7267456054688,
      "loss": 0.2935,
      "rewards/accuracies": 0.65625,
      "rewards/chosen": -0.18224193155765533,
      "rewards/margins": 0.10070188343524933,
      "rewards/rejected": -0.2829437851905823,
      "step": 330
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.0563119197063934e-06,
      "logits/chosen": 0.09601452201604843,
      "logits/rejected": 0.13355228304862976,
      "logps/chosen": -478.78143310546875,
      "logps/rejected": -622.6370239257812,
      "loss": 0.31,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -0.17518463730812073,
      "rewards/margins": 0.11871640384197235,
      "rewards/rejected": -0.2939010262489319,
      "step": 340
    },
    {
      "epoch": 0.75,
      "learning_rate": 9.081655850224449e-07,
      "logits/chosen": 0.14273716509342194,
      "logits/rejected": 0.12122112512588501,
      "logps/chosen": -535.0431518554688,
      "logps/rejected": -643.0104370117188,
      "loss": 0.3196,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.19643327593803406,
      "rewards/margins": 0.10987784713506699,
      "rewards/rejected": -0.30631113052368164,
      "step": 350
    },
    {
      "epoch": 0.77,
      "learning_rate": 7.688792132653111e-07,
      "logits/chosen": 0.1597367525100708,
      "logits/rejected": 0.18071278929710388,
      "logps/chosen": -597.0608520507812,
      "logps/rejected": -692.2689208984375,
      "loss": 0.2965,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.20307877659797668,
      "rewards/margins": 0.12918424606323242,
      "rewards/rejected": -0.3322630524635315,
      "step": 360
    },
    {
      "epoch": 0.79,
      "learning_rate": 6.392280559802341e-07,
      "logits/chosen": 0.1371072232723236,
      "logits/rejected": 0.16206106543540955,
      "logps/chosen": -573.9052124023438,
      "logps/rejected": -700.1799926757812,
      "loss": 0.2689,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.2099401205778122,
      "rewards/margins": 0.13856378197669983,
      "rewards/rejected": -0.34850388765335083,
      "step": 370
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.199337362431792e-07,
      "logits/chosen": 0.10267746448516846,
      "logits/rejected": 0.1483933925628662,
      "logps/chosen": -554.7241821289062,
      "logps/rejected": -637.0789184570312,
      "loss": 0.2999,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -0.19773730635643005,
      "rewards/margins": 0.11398313194513321,
      "rewards/rejected": -0.31172046065330505,
      "step": 380
    },
    {
      "epoch": 0.83,
      "learning_rate": 4.1166023219176176e-07,
      "logits/chosen": 0.11247100681066513,
      "logits/rejected": 0.1309679001569748,
      "logps/chosen": -557.1490478515625,
      "logps/rejected": -678.8016357421875,
      "loss": 0.2945,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.1877584308385849,
      "rewards/margins": 0.12902548909187317,
      "rewards/rejected": -0.3167839050292969,
      "step": 390
    },
    {
      "epoch": 0.85,
      "learning_rate": 3.150101814011136e-07,
      "logits/chosen": 0.16857033967971802,
      "logits/rejected": 0.18106935918331146,
      "logps/chosen": -571.0828857421875,
      "logps/rejected": -638.4654541015625,
      "loss": 0.3045,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -0.19992712140083313,
      "rewards/margins": 0.10413169860839844,
      "rewards/rejected": -0.30405884981155396,
      "step": 400
    },
    {
      "epoch": 0.87,
      "learning_rate": 2.3052152667409289e-07,
      "logits/chosen": 0.09823437035083771,
      "logits/rejected": 0.23346427083015442,
      "logps/chosen": -537.1575317382812,
      "logps/rejected": -657.79931640625,
      "loss": 0.3101,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": -0.19983352720737457,
      "rewards/margins": 0.11520209163427353,
      "rewards/rejected": -0.3150356113910675,
      "step": 410
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.5866452191498488e-07,
      "logits/chosen": 0.13151055574417114,
      "logits/rejected": 0.17384907603263855,
      "logps/chosen": -562.8436279296875,
      "logps/rejected": -688.2677612304688,
      "loss": 0.2892,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.21342253684997559,
      "rewards/margins": 0.1261134147644043,
      "rewards/rejected": -0.3395359516143799,
      "step": 420
    },
    {
      "epoch": 0.92,
      "learning_rate": 9.983911475163727e-08,
      "logits/chosen": 0.11608059704303741,
      "logits/rejected": 0.13708294928073883,
      "logps/chosen": -503.0716857910156,
      "logps/rejected": -606.0911865234375,
      "loss": 0.2983,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.18081562221050262,
      "rewards/margins": 0.11758317053318024,
      "rewards/rejected": -0.29839879274368286,
      "step": 430
    },
    {
      "epoch": 0.94,
      "learning_rate": 5.437272047405712e-08,
      "logits/chosen": 0.11100079119205475,
      "logits/rejected": 0.13695240020751953,
      "logps/chosen": -530.9528198242188,
      "logps/rejected": -668.3643798828125,
      "loss": 0.3095,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -0.2087739259004593,
      "rewards/margins": 0.13447535037994385,
      "rewards/rejected": -0.34324929118156433,
      "step": 440
    },
    {
      "epoch": 0.96,
      "learning_rate": 2.251839967945535e-08,
      "logits/chosen": 0.022059569135308266,
      "logits/rejected": 0.12616530060768127,
      "logps/chosen": -492.80181884765625,
      "logps/rejected": -643.1315307617188,
      "loss": 0.2834,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.18623578548431396,
      "rewards/margins": 0.14532844722270966,
      "rewards/rejected": -0.3315642178058624,
      "step": 450
    },
    {
      "epoch": 0.98,
      "learning_rate": 4.453449766758933e-09,
      "logits/chosen": 0.10801200568675995,
      "logits/rejected": 0.11458040773868561,
      "logps/chosen": -531.0709228515625,
      "logps/rejected": -645.5343627929688,
      "loss": 0.3043,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.1950865238904953,
      "rewards/margins": 0.11565764993429184,
      "rewards/rejected": -0.3107442259788513,
      "step": 460
    },
    {
      "epoch": 1.0,
      "step": 468,
      "total_flos": 0.0,
      "train_loss": 0.2399272450015076,
      "train_runtime": 4417.3525,
      "train_samples_per_second": 3.396,
      "train_steps_per_second": 0.106
    }
  ],
  "logging_steps": 10,
  "max_steps": 468,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}