File size: 36,704 Bytes
bd4aa30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 42,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "debug/policy_chosen_logits": 0.09027537703514099,
      "debug/policy_chosen_logps": -162.730224609375,
      "debug/policy_rejected_logits": 0.5158556699752808,
      "debug/policy_rejected_logps": -184.16571044921875,
      "debug/reference_chosen_logps": -162.730224609375,
      "debug/reference_rejected_logps": -184.16571044921875,
      "epoch": 0.023809523809523808,
      "grad_norm": 4.685966665550777,
      "learning_rate": 5e-07,
      "logits/chosen": 0.09027537703514099,
      "logits/rejected": 0.5158556699752808,
      "logps/chosen": -162.730224609375,
      "logps/rejected": -184.16571044921875,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "debug/policy_chosen_logits": 0.7074397802352905,
      "debug/policy_chosen_logps": -150.46807861328125,
      "debug/policy_rejected_logits": 0.43174317479133606,
      "debug/policy_rejected_logps": -140.48440551757812,
      "debug/reference_chosen_logps": -153.92564392089844,
      "debug/reference_rejected_logps": -142.85406494140625,
      "epoch": 0.047619047619047616,
      "grad_norm": 15.044621835270732,
      "learning_rate": 5e-07,
      "logits/chosen": 0.7074397802352905,
      "logits/rejected": 0.43174317479133606,
      "logps/chosen": -150.46807861328125,
      "logps/rejected": -140.48440551757812,
      "loss": 0.6973,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.034575510770082474,
      "rewards/margins": 0.010879031382501125,
      "rewards/rejected": 0.023696478456258774,
      "step": 2
    },
    {
      "debug/policy_chosen_logits": 0.4280781149864197,
      "debug/policy_chosen_logps": -153.80137634277344,
      "debug/policy_rejected_logits": 1.081570029258728,
      "debug/policy_rejected_logps": -173.27056884765625,
      "debug/reference_chosen_logps": -151.54473876953125,
      "debug/reference_rejected_logps": -169.95703125,
      "epoch": 0.07142857142857142,
      "grad_norm": 4.933146485934281,
      "learning_rate": 5e-07,
      "logits/chosen": 0.4280781149864197,
      "logits/rejected": 1.081570029258728,
      "logps/chosen": -153.80137634277344,
      "logps/rejected": -173.27056884765625,
      "loss": 0.6923,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.022566460072994232,
      "rewards/margins": 0.010568867437541485,
      "rewards/rejected": -0.03313532844185829,
      "step": 3
    },
    {
      "debug/policy_chosen_logits": 0.49108535051345825,
      "debug/policy_chosen_logps": -155.66534423828125,
      "debug/policy_rejected_logits": 0.4322296977043152,
      "debug/policy_rejected_logps": -149.7783203125,
      "debug/reference_chosen_logps": -153.9282989501953,
      "debug/reference_rejected_logps": -148.75108337402344,
      "epoch": 0.09523809523809523,
      "grad_norm": 6.198349828531256,
      "learning_rate": 5e-07,
      "logits/chosen": 0.49108535051345825,
      "logits/rejected": 0.4322296977043152,
      "logps/chosen": -155.66534423828125,
      "logps/rejected": -149.7783203125,
      "loss": 0.6899,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.0173704344779253,
      "rewards/margins": -0.0070981215685606,
      "rewards/rejected": -0.010272311978042126,
      "step": 4
    },
    {
      "debug/policy_chosen_logits": 1.0142210721969604,
      "debug/policy_chosen_logps": -158.026611328125,
      "debug/policy_rejected_logits": 1.0418132543563843,
      "debug/policy_rejected_logps": -199.23785400390625,
      "debug/reference_chosen_logps": -155.4060516357422,
      "debug/reference_rejected_logps": -194.74618530273438,
      "epoch": 0.11904761904761904,
      "grad_norm": 4.004423522491588,
      "learning_rate": 5e-07,
      "logits/chosen": 1.0142210721969604,
      "logits/rejected": 1.0418132543563843,
      "logps/chosen": -158.026611328125,
      "logps/rejected": -199.23785400390625,
      "loss": 0.6916,
      "rewards/accuracies": 0.875,
      "rewards/chosen": -0.026205480098724365,
      "rewards/margins": 0.018711339682340622,
      "rewards/rejected": -0.04491681978106499,
      "step": 5
    },
    {
      "debug/policy_chosen_logits": 0.803213894367218,
      "debug/policy_chosen_logps": -156.5592041015625,
      "debug/policy_rejected_logits": 0.9607799649238586,
      "debug/policy_rejected_logps": -173.58987426757812,
      "debug/reference_chosen_logps": -146.985595703125,
      "debug/reference_rejected_logps": -163.15786743164062,
      "epoch": 0.14285714285714285,
      "grad_norm": 10.059361542630269,
      "learning_rate": 5e-07,
      "logits/chosen": 0.803213894367218,
      "logits/rejected": 0.9607799649238586,
      "logps/chosen": -156.5592041015625,
      "logps/rejected": -173.58987426757812,
      "loss": 0.6869,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.09573620557785034,
      "rewards/margins": 0.008583765476942062,
      "rewards/rejected": -0.1043199747800827,
      "step": 6
    },
    {
      "debug/policy_chosen_logits": 0.5479093194007874,
      "debug/policy_chosen_logps": -176.60946655273438,
      "debug/policy_rejected_logits": 0.11254727840423584,
      "debug/policy_rejected_logps": -175.84767150878906,
      "debug/reference_chosen_logps": -174.020751953125,
      "debug/reference_rejected_logps": -170.24949645996094,
      "epoch": 0.16666666666666666,
      "grad_norm": 20.07889018294013,
      "learning_rate": 5e-07,
      "logits/chosen": 0.5479093194007874,
      "logits/rejected": 0.11254727840423584,
      "logps/chosen": -176.60946655273438,
      "logps/rejected": -175.84767150878906,
      "loss": 0.6892,
      "rewards/accuracies": 0.875,
      "rewards/chosen": -0.025887146592140198,
      "rewards/margins": 0.030094660818576813,
      "rewards/rejected": -0.05598180741071701,
      "step": 7
    },
    {
      "debug/policy_chosen_logits": 0.5916139483451843,
      "debug/policy_chosen_logps": -164.50384521484375,
      "debug/policy_rejected_logits": 0.5801162123680115,
      "debug/policy_rejected_logps": -156.9475860595703,
      "debug/reference_chosen_logps": -153.96173095703125,
      "debug/reference_rejected_logps": -147.9217071533203,
      "epoch": 0.19047619047619047,
      "grad_norm": 4.5921375475915776,
      "learning_rate": 5e-07,
      "logits/chosen": 0.5916139483451843,
      "logits/rejected": 0.5801162123680115,
      "logps/chosen": -164.50384521484375,
      "logps/rejected": -156.9475860595703,
      "loss": 0.69,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.10542111098766327,
      "rewards/margins": -0.015162268653512001,
      "rewards/rejected": -0.09025884419679642,
      "step": 8
    },
    {
      "debug/policy_chosen_logits": 0.2481817752122879,
      "debug/policy_chosen_logps": -169.9642333984375,
      "debug/policy_rejected_logits": 0.48347601294517517,
      "debug/policy_rejected_logps": -190.4657440185547,
      "debug/reference_chosen_logps": -162.74264526367188,
      "debug/reference_rejected_logps": -181.60940551757812,
      "epoch": 0.21428571428571427,
      "grad_norm": 5.023010276429253,
      "learning_rate": 5e-07,
      "logits/chosen": 0.2481817752122879,
      "logits/rejected": 0.48347601294517517,
      "logps/chosen": -169.9642333984375,
      "logps/rejected": -190.4657440185547,
      "loss": 0.6877,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.07221580296754837,
      "rewards/margins": 0.01634763740003109,
      "rewards/rejected": -0.08856344223022461,
      "step": 9
    },
    {
      "debug/policy_chosen_logits": 0.2575492560863495,
      "debug/policy_chosen_logps": -184.9705810546875,
      "debug/policy_rejected_logits": 0.5005592703819275,
      "debug/policy_rejected_logps": -172.09518432617188,
      "debug/reference_chosen_logps": -172.8156280517578,
      "debug/reference_rejected_logps": -160.72515869140625,
      "epoch": 0.23809523809523808,
      "grad_norm": 8.326836844842857,
      "learning_rate": 5e-07,
      "logits/chosen": 0.2575492560863495,
      "logits/rejected": 0.5005592703819275,
      "logps/chosen": -184.9705810546875,
      "logps/rejected": -172.09518432617188,
      "loss": 0.6811,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.12154942750930786,
      "rewards/margins": -0.00784902460873127,
      "rewards/rejected": -0.11370040476322174,
      "step": 10
    },
    {
      "debug/policy_chosen_logits": 0.02329457737505436,
      "debug/policy_chosen_logps": -151.5535888671875,
      "debug/policy_rejected_logits": 0.5039985179901123,
      "debug/policy_rejected_logps": -164.93890380859375,
      "debug/reference_chosen_logps": -145.47381591796875,
      "debug/reference_rejected_logps": -155.04107666015625,
      "epoch": 0.2619047619047619,
      "grad_norm": 8.956479762651878,
      "learning_rate": 5e-07,
      "logits/chosen": 0.02329457737505436,
      "logits/rejected": 0.5039985179901123,
      "logps/chosen": -151.5535888671875,
      "logps/rejected": -164.93890380859375,
      "loss": 0.6868,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.06079769879579544,
      "rewards/margins": 0.038180749863386154,
      "rewards/rejected": -0.0989784449338913,
      "step": 11
    },
    {
      "debug/policy_chosen_logits": 0.47942259907722473,
      "debug/policy_chosen_logps": -159.60877990722656,
      "debug/policy_rejected_logits": 0.5704939365386963,
      "debug/policy_rejected_logps": -154.61744689941406,
      "debug/reference_chosen_logps": -147.24301147460938,
      "debug/reference_rejected_logps": -141.2715301513672,
      "epoch": 0.2857142857142857,
      "grad_norm": 11.992421788281984,
      "learning_rate": 5e-07,
      "logits/chosen": 0.47942259907722473,
      "logits/rejected": 0.5704939365386963,
      "logps/chosen": -159.60877990722656,
      "logps/rejected": -154.61744689941406,
      "loss": 0.6853,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.12365761399269104,
      "rewards/margins": 0.009801514446735382,
      "rewards/rejected": -0.13345913589000702,
      "step": 12
    },
    {
      "debug/policy_chosen_logits": -0.20997528731822968,
      "debug/policy_chosen_logps": -142.75146484375,
      "debug/policy_rejected_logits": 0.3726802468299866,
      "debug/policy_rejected_logps": -175.70962524414062,
      "debug/reference_chosen_logps": -134.545166015625,
      "debug/reference_rejected_logps": -164.0076141357422,
      "epoch": 0.30952380952380953,
      "grad_norm": 5.358671233182435,
      "learning_rate": 5e-07,
      "logits/chosen": -0.20997528731822968,
      "logits/rejected": 0.3726802468299866,
      "logps/chosen": -142.75146484375,
      "logps/rejected": -175.70962524414062,
      "loss": 0.689,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.08206304907798767,
      "rewards/margins": 0.0349571518599987,
      "rewards/rejected": -0.11702020466327667,
      "step": 13
    },
    {
      "debug/policy_chosen_logits": 0.20442210137844086,
      "debug/policy_chosen_logps": -177.51991271972656,
      "debug/policy_rejected_logits": 0.34923601150512695,
      "debug/policy_rejected_logps": -161.62881469726562,
      "debug/reference_chosen_logps": -164.69485473632812,
      "debug/reference_rejected_logps": -150.70733642578125,
      "epoch": 0.3333333333333333,
      "grad_norm": 6.471200581198782,
      "learning_rate": 5e-07,
      "logits/chosen": 0.20442210137844086,
      "logits/rejected": 0.34923601150512695,
      "logps/chosen": -177.51991271972656,
      "logps/rejected": -161.62881469726562,
      "loss": 0.6899,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.12825068831443787,
      "rewards/margins": -0.019035786390304565,
      "rewards/rejected": -0.1092148944735527,
      "step": 14
    },
    {
      "debug/policy_chosen_logits": 0.20042140781879425,
      "debug/policy_chosen_logps": -198.75933837890625,
      "debug/policy_rejected_logits": 0.24180738627910614,
      "debug/policy_rejected_logps": -177.14825439453125,
      "debug/reference_chosen_logps": -178.46697998046875,
      "debug/reference_rejected_logps": -158.2596435546875,
      "epoch": 0.35714285714285715,
      "grad_norm": 15.202276429910315,
      "learning_rate": 5e-07,
      "logits/chosen": 0.20042140781879425,
      "logits/rejected": 0.24180738627910614,
      "logps/chosen": -198.75933837890625,
      "logps/rejected": -177.14825439453125,
      "loss": 0.6823,
      "rewards/accuracies": 0.25,
      "rewards/chosen": -0.2029237002134323,
      "rewards/margins": -0.014037556946277618,
      "rewards/rejected": -0.1888861358165741,
      "step": 15
    },
    {
      "debug/policy_chosen_logits": 0.3575197160243988,
      "debug/policy_chosen_logps": -159.78720092773438,
      "debug/policy_rejected_logits": 0.6836833953857422,
      "debug/policy_rejected_logps": -167.8487548828125,
      "debug/reference_chosen_logps": -148.35433959960938,
      "debug/reference_rejected_logps": -153.20465087890625,
      "epoch": 0.38095238095238093,
      "grad_norm": 5.011772584899409,
      "learning_rate": 5e-07,
      "logits/chosen": 0.3575197160243988,
      "logits/rejected": 0.6836833953857422,
      "logps/chosen": -159.78720092773438,
      "logps/rejected": -167.8487548828125,
      "loss": 0.6846,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.11432872712612152,
      "rewards/margins": 0.0321124903857708,
      "rewards/rejected": -0.14644122123718262,
      "step": 16
    },
    {
      "debug/policy_chosen_logits": 0.3418176472187042,
      "debug/policy_chosen_logps": -153.46934509277344,
      "debug/policy_rejected_logits": 0.33436495065689087,
      "debug/policy_rejected_logps": -164.7410888671875,
      "debug/reference_chosen_logps": -145.3973388671875,
      "debug/reference_rejected_logps": -149.6763458251953,
      "epoch": 0.40476190476190477,
      "grad_norm": 20.39016346970483,
      "learning_rate": 5e-07,
      "logits/chosen": 0.3418176472187042,
      "logits/rejected": 0.33436495065689087,
      "logps/chosen": -153.46934509277344,
      "logps/rejected": -164.7410888671875,
      "loss": 0.6823,
      "rewards/accuracies": 0.875,
      "rewards/chosen": -0.08072000741958618,
      "rewards/margins": 0.06992734968662262,
      "rewards/rejected": -0.1506473571062088,
      "step": 17
    },
    {
      "debug/policy_chosen_logits": 0.21861077845096588,
      "debug/policy_chosen_logps": -184.25680541992188,
      "debug/policy_rejected_logits": 0.3139030635356903,
      "debug/policy_rejected_logps": -186.92042541503906,
      "debug/reference_chosen_logps": -167.13250732421875,
      "debug/reference_rejected_logps": -167.22145080566406,
      "epoch": 0.42857142857142855,
      "grad_norm": 9.604003163834712,
      "learning_rate": 5e-07,
      "logits/chosen": 0.21861077845096588,
      "logits/rejected": 0.3139030635356903,
      "logps/chosen": -184.25680541992188,
      "logps/rejected": -186.92042541503906,
      "loss": 0.6871,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.17124298214912415,
      "rewards/margins": 0.025746773928403854,
      "rewards/rejected": -0.1969897449016571,
      "step": 18
    },
    {
      "debug/policy_chosen_logits": 0.4868623614311218,
      "debug/policy_chosen_logps": -164.40745544433594,
      "debug/policy_rejected_logits": 0.407953143119812,
      "debug/policy_rejected_logps": -192.24801635742188,
      "debug/reference_chosen_logps": -149.22422790527344,
      "debug/reference_rejected_logps": -176.274658203125,
      "epoch": 0.4523809523809524,
      "grad_norm": 5.467573091749328,
      "learning_rate": 5e-07,
      "logits/chosen": 0.4868623614311218,
      "logits/rejected": 0.407953143119812,
      "logps/chosen": -164.40745544433594,
      "logps/rejected": -192.24801635742188,
      "loss": 0.6788,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.1518322378396988,
      "rewards/margins": 0.007901255041360855,
      "rewards/rejected": -0.15973350405693054,
      "step": 19
    },
    {
      "debug/policy_chosen_logits": 0.059103213250637054,
      "debug/policy_chosen_logps": -174.8370361328125,
      "debug/policy_rejected_logits": 0.5786897540092468,
      "debug/policy_rejected_logps": -199.56698608398438,
      "debug/reference_chosen_logps": -161.7840118408203,
      "debug/reference_rejected_logps": -185.17050170898438,
      "epoch": 0.47619047619047616,
      "grad_norm": 5.234500875119642,
      "learning_rate": 5e-07,
      "logits/chosen": 0.059103213250637054,
      "logits/rejected": 0.5786897540092468,
      "logps/chosen": -174.8370361328125,
      "logps/rejected": -199.56698608398438,
      "loss": 0.6848,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.13053017854690552,
      "rewards/margins": 0.013434587977826595,
      "rewards/rejected": -0.1439647674560547,
      "step": 20
    },
    {
      "debug/policy_chosen_logits": 0.5743213891983032,
      "debug/policy_chosen_logps": -178.57594299316406,
      "debug/policy_rejected_logits": 0.20286375284194946,
      "debug/policy_rejected_logps": -179.84762573242188,
      "debug/reference_chosen_logps": -158.3350067138672,
      "debug/reference_rejected_logps": -160.45053100585938,
      "epoch": 0.5,
      "grad_norm": 6.102878645835385,
      "learning_rate": 5e-07,
      "logits/chosen": 0.5743213891983032,
      "logits/rejected": 0.20286375284194946,
      "logps/chosen": -178.57594299316406,
      "logps/rejected": -179.84762573242188,
      "loss": 0.6845,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.20240947604179382,
      "rewards/margins": -0.008438415825366974,
      "rewards/rejected": -0.19397103786468506,
      "step": 21
    },
    {
      "debug/policy_chosen_logits": 0.3122093677520752,
      "debug/policy_chosen_logps": -164.51431274414062,
      "debug/policy_rejected_logits": 0.3530707061290741,
      "debug/policy_rejected_logps": -176.93601989746094,
      "debug/reference_chosen_logps": -147.92042541503906,
      "debug/reference_rejected_logps": -156.74851989746094,
      "epoch": 0.5238095238095238,
      "grad_norm": 6.315550057559611,
      "learning_rate": 5e-07,
      "logits/chosen": 0.3122093677520752,
      "logits/rejected": 0.3530707061290741,
      "logps/chosen": -164.51431274414062,
      "logps/rejected": -176.93601989746094,
      "loss": 0.6815,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.16593879461288452,
      "rewards/margins": 0.03593616560101509,
      "rewards/rejected": -0.20187495648860931,
      "step": 22
    },
    {
      "debug/policy_chosen_logits": -0.044268831610679626,
      "debug/policy_chosen_logps": -157.81039428710938,
      "debug/policy_rejected_logits": 0.3559872508049011,
      "debug/policy_rejected_logps": -147.35525512695312,
      "debug/reference_chosen_logps": -150.7872314453125,
      "debug/reference_rejected_logps": -138.82229614257812,
      "epoch": 0.5476190476190477,
      "grad_norm": 23.620495593781406,
      "learning_rate": 5e-07,
      "logits/chosen": -0.044268831610679626,
      "logits/rejected": 0.3559872508049011,
      "logps/chosen": -157.81039428710938,
      "logps/rejected": -147.35525512695312,
      "loss": 0.6842,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.07023164629936218,
      "rewards/margins": 0.015097856521606445,
      "rewards/rejected": -0.08532950282096863,
      "step": 23
    },
    {
      "debug/policy_chosen_logits": 0.018966468051075935,
      "debug/policy_chosen_logps": -159.8534698486328,
      "debug/policy_rejected_logits": 0.1428254395723343,
      "debug/policy_rejected_logps": -164.7286834716797,
      "debug/reference_chosen_logps": -147.0450439453125,
      "debug/reference_rejected_logps": -149.85382080078125,
      "epoch": 0.5714285714285714,
      "grad_norm": 4.677349017880479,
      "learning_rate": 5e-07,
      "logits/chosen": 0.018966468051075935,
      "logits/rejected": 0.1428254395723343,
      "logps/chosen": -159.8534698486328,
      "logps/rejected": -164.7286834716797,
      "loss": 0.6819,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.1280841827392578,
      "rewards/margins": 0.02066453918814659,
      "rewards/rejected": -0.1487487256526947,
      "step": 24
    },
    {
      "debug/policy_chosen_logits": 0.2853807806968689,
      "debug/policy_chosen_logps": -176.42611694335938,
      "debug/policy_rejected_logits": 0.13074414432048798,
      "debug/policy_rejected_logps": -155.15415954589844,
      "debug/reference_chosen_logps": -158.4669647216797,
      "debug/reference_rejected_logps": -136.29364013671875,
      "epoch": 0.5952380952380952,
      "grad_norm": 11.079802876278416,
      "learning_rate": 5e-07,
      "logits/chosen": 0.2853807806968689,
      "logits/rejected": 0.13074414432048798,
      "logps/chosen": -176.42611694335938,
      "logps/rejected": -155.15415954589844,
      "loss": 0.6821,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.179591566324234,
      "rewards/margins": 0.00901371892541647,
      "rewards/rejected": -0.18860529363155365,
      "step": 25
    },
    {
      "debug/policy_chosen_logits": 0.40262606739997864,
      "debug/policy_chosen_logps": -153.15701293945312,
      "debug/policy_rejected_logits": 0.7936873435974121,
      "debug/policy_rejected_logps": -169.62689208984375,
      "debug/reference_chosen_logps": -140.70889282226562,
      "debug/reference_rejected_logps": -152.8561248779297,
      "epoch": 0.6190476190476191,
      "grad_norm": 5.32694876840012,
      "learning_rate": 5e-07,
      "logits/chosen": 0.40262606739997864,
      "logits/rejected": 0.7936873435974121,
      "logps/chosen": -153.15701293945312,
      "logps/rejected": -169.62689208984375,
      "loss": 0.6733,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.12448111921548843,
      "rewards/margins": 0.04322664067149162,
      "rewards/rejected": -0.16770777106285095,
      "step": 26
    },
    {
      "debug/policy_chosen_logits": 0.4326091408729553,
      "debug/policy_chosen_logps": -157.39126586914062,
      "debug/policy_rejected_logits": 0.607225239276886,
      "debug/policy_rejected_logps": -188.34918212890625,
      "debug/reference_chosen_logps": -145.33380126953125,
      "debug/reference_rejected_logps": -162.48890686035156,
      "epoch": 0.6428571428571429,
      "grad_norm": 4.700163204340666,
      "learning_rate": 5e-07,
      "logits/chosen": 0.4326091408729553,
      "logits/rejected": 0.607225239276886,
      "logps/chosen": -157.39126586914062,
      "logps/rejected": -188.34918212890625,
      "loss": 0.6642,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.12057456374168396,
      "rewards/margins": 0.13802826404571533,
      "rewards/rejected": -0.2586028277873993,
      "step": 27
    },
    {
      "debug/policy_chosen_logits": 0.031958021223545074,
      "debug/policy_chosen_logps": -159.67169189453125,
      "debug/policy_rejected_logits": 0.1899116486310959,
      "debug/policy_rejected_logps": -187.28189086914062,
      "debug/reference_chosen_logps": -141.6376953125,
      "debug/reference_rejected_logps": -162.19659423828125,
      "epoch": 0.6666666666666666,
      "grad_norm": 6.569512152499291,
      "learning_rate": 5e-07,
      "logits/chosen": 0.031958021223545074,
      "logits/rejected": 0.1899116486310959,
      "logps/chosen": -159.67169189453125,
      "logps/rejected": -187.28189086914062,
      "loss": 0.6824,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.180339977145195,
      "rewards/margins": 0.07051312178373337,
      "rewards/rejected": -0.2508530914783478,
      "step": 28
    },
    {
      "debug/policy_chosen_logits": 0.10809233784675598,
      "debug/policy_chosen_logps": -176.20724487304688,
      "debug/policy_rejected_logits": 0.41801854968070984,
      "debug/policy_rejected_logps": -197.15541076660156,
      "debug/reference_chosen_logps": -158.1036834716797,
      "debug/reference_rejected_logps": -176.26634216308594,
      "epoch": 0.6904761904761905,
      "grad_norm": 6.541965417685165,
      "learning_rate": 5e-07,
      "logits/chosen": 0.10809233784675598,
      "logits/rejected": 0.41801854968070984,
      "logps/chosen": -176.20724487304688,
      "logps/rejected": -197.15541076660156,
      "loss": 0.6681,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.181035578250885,
      "rewards/margins": 0.02785516157746315,
      "rewards/rejected": -0.20889073610305786,
      "step": 29
    },
    {
      "debug/policy_chosen_logits": -0.15116974711418152,
      "debug/policy_chosen_logps": -164.75247192382812,
      "debug/policy_rejected_logits": 0.12009341269731522,
      "debug/policy_rejected_logps": -175.12867736816406,
      "debug/reference_chosen_logps": -146.2644805908203,
      "debug/reference_rejected_logps": -149.75460815429688,
      "epoch": 0.7142857142857143,
      "grad_norm": 4.855645858354973,
      "learning_rate": 5e-07,
      "logits/chosen": -0.15116974711418152,
      "logits/rejected": 0.12009341269731522,
      "logps/chosen": -164.75247192382812,
      "logps/rejected": -175.12867736816406,
      "loss": 0.6786,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.18488001823425293,
      "rewards/margins": 0.06886060535907745,
      "rewards/rejected": -0.2537406086921692,
      "step": 30
    },
    {
      "debug/policy_chosen_logits": 0.17149780690670013,
      "debug/policy_chosen_logps": -188.273193359375,
      "debug/policy_rejected_logits": 0.3072517216205597,
      "debug/policy_rejected_logps": -187.52651977539062,
      "debug/reference_chosen_logps": -165.44805908203125,
      "debug/reference_rejected_logps": -158.7125244140625,
      "epoch": 0.7380952380952381,
      "grad_norm": 6.097067478609718,
      "learning_rate": 5e-07,
      "logits/chosen": 0.17149780690670013,
      "logits/rejected": 0.3072517216205597,
      "logps/chosen": -188.273193359375,
      "logps/rejected": -187.52651977539062,
      "loss": 0.6829,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.22825142741203308,
      "rewards/margins": 0.059888482093811035,
      "rewards/rejected": -0.2881399095058441,
      "step": 31
    },
    {
      "debug/policy_chosen_logits": 0.46266281604766846,
      "debug/policy_chosen_logps": -169.596435546875,
      "debug/policy_rejected_logits": 0.4477306604385376,
      "debug/policy_rejected_logps": -173.21981811523438,
      "debug/reference_chosen_logps": -152.52273559570312,
      "debug/reference_rejected_logps": -151.03050231933594,
      "epoch": 0.7619047619047619,
      "grad_norm": 10.520633954833,
      "learning_rate": 5e-07,
      "logits/chosen": 0.46266281604766846,
      "logits/rejected": 0.4477306604385376,
      "logps/chosen": -169.596435546875,
      "logps/rejected": -173.21981811523438,
      "loss": 0.6878,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.17073702812194824,
      "rewards/margins": 0.05115606635808945,
      "rewards/rejected": -0.2218931019306183,
      "step": 32
    },
    {
      "debug/policy_chosen_logits": 0.7084560394287109,
      "debug/policy_chosen_logps": -161.38735961914062,
      "debug/policy_rejected_logits": 0.6357196569442749,
      "debug/policy_rejected_logps": -164.42718505859375,
      "debug/reference_chosen_logps": -154.86412048339844,
      "debug/reference_rejected_logps": -157.79238891601562,
      "epoch": 0.7857142857142857,
      "grad_norm": 15.939572697711974,
      "learning_rate": 5e-07,
      "logits/chosen": 0.7084560394287109,
      "logits/rejected": 0.6357196569442749,
      "logps/chosen": -161.38735961914062,
      "logps/rejected": -164.42718505859375,
      "loss": 0.6803,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.06523235142230988,
      "rewards/margins": 0.001115655992180109,
      "rewards/rejected": -0.06634800881147385,
      "step": 33
    },
    {
      "debug/policy_chosen_logits": 0.12151144444942474,
      "debug/policy_chosen_logps": -161.50515747070312,
      "debug/policy_rejected_logits": 1.0420786142349243,
      "debug/policy_rejected_logps": -164.21615600585938,
      "debug/reference_chosen_logps": -150.59689331054688,
      "debug/reference_rejected_logps": -152.6244354248047,
      "epoch": 0.8095238095238095,
      "grad_norm": 9.868780868712001,
      "learning_rate": 5e-07,
      "logits/chosen": 0.12151144444942474,
      "logits/rejected": 1.0420786142349243,
      "logps/chosen": -161.50515747070312,
      "logps/rejected": -164.21615600585938,
      "loss": 0.6786,
      "rewards/accuracies": 0.375,
      "rewards/chosen": -0.10908253490924835,
      "rewards/margins": 0.006834707222878933,
      "rewards/rejected": -0.11591724306344986,
      "step": 34
    },
    {
      "debug/policy_chosen_logits": 0.027947237715125084,
      "debug/policy_chosen_logps": -157.77456665039062,
      "debug/policy_rejected_logits": 0.31475889682769775,
      "debug/policy_rejected_logps": -159.06002807617188,
      "debug/reference_chosen_logps": -136.69996643066406,
      "debug/reference_rejected_logps": -138.58349609375,
      "epoch": 0.8333333333333334,
      "grad_norm": 8.177811902764494,
      "learning_rate": 5e-07,
      "logits/chosen": 0.027947237715125084,
      "logits/rejected": 0.31475889682769775,
      "logps/chosen": -157.77456665039062,
      "logps/rejected": -159.06002807617188,
      "loss": 0.6874,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.21074604988098145,
      "rewards/margins": -0.005980661138892174,
      "rewards/rejected": -0.20476537942886353,
      "step": 35
    },
    {
      "debug/policy_chosen_logits": 0.2593013346195221,
      "debug/policy_chosen_logps": -165.31509399414062,
      "debug/policy_rejected_logits": -0.04938528686761856,
      "debug/policy_rejected_logps": -168.42660522460938,
      "debug/reference_chosen_logps": -145.93374633789062,
      "debug/reference_rejected_logps": -145.29168701171875,
      "epoch": 0.8571428571428571,
      "grad_norm": 12.91232630873052,
      "learning_rate": 5e-07,
      "logits/chosen": 0.2593013346195221,
      "logits/rejected": -0.04938528686761856,
      "logps/chosen": -165.31509399414062,
      "logps/rejected": -168.42660522460938,
      "loss": 0.6712,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.19381342828273773,
      "rewards/margins": 0.03753571957349777,
      "rewards/rejected": -0.2313491404056549,
      "step": 36
    },
    {
      "debug/policy_chosen_logits": 1.0012390613555908,
      "debug/policy_chosen_logps": -165.09347534179688,
      "debug/policy_rejected_logits": 1.0178093910217285,
      "debug/policy_rejected_logps": -171.16152954101562,
      "debug/reference_chosen_logps": -150.7286834716797,
      "debug/reference_rejected_logps": -150.47354125976562,
      "epoch": 0.8809523809523809,
      "grad_norm": 4.747576829430422,
      "learning_rate": 5e-07,
      "logits/chosen": 1.0012390613555908,
      "logits/rejected": 1.0178093910217285,
      "logps/chosen": -165.09347534179688,
      "logps/rejected": -171.16152954101562,
      "loss": 0.6849,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.1436479538679123,
      "rewards/margins": 0.06323190778493881,
      "rewards/rejected": -0.2068798542022705,
      "step": 37
    },
    {
      "debug/policy_chosen_logits": -0.11002390831708908,
      "debug/policy_chosen_logps": -173.82029724121094,
      "debug/policy_rejected_logits": 0.446510910987854,
      "debug/policy_rejected_logps": -212.50643920898438,
      "debug/reference_chosen_logps": -150.2817840576172,
      "debug/reference_rejected_logps": -185.846923828125,
      "epoch": 0.9047619047619048,
      "grad_norm": 5.476549746954031,
      "learning_rate": 5e-07,
      "logits/chosen": -0.11002390831708908,
      "logits/rejected": 0.446510910987854,
      "logps/chosen": -173.82029724121094,
      "logps/rejected": -212.50643920898438,
      "loss": 0.6724,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.23538516461849213,
      "rewards/margins": 0.031209895387291908,
      "rewards/rejected": -0.2665950655937195,
      "step": 38
    },
    {
      "debug/policy_chosen_logits": -0.20052273571491241,
      "debug/policy_chosen_logps": -163.867431640625,
      "debug/policy_rejected_logits": 0.5886087417602539,
      "debug/policy_rejected_logps": -185.58941650390625,
      "debug/reference_chosen_logps": -141.18801879882812,
      "debug/reference_rejected_logps": -159.76058959960938,
      "epoch": 0.9285714285714286,
      "grad_norm": 12.5233373844755,
      "learning_rate": 5e-07,
      "logits/chosen": -0.20052273571491241,
      "logits/rejected": 0.5886087417602539,
      "logps/chosen": -163.867431640625,
      "logps/rejected": -185.58941650390625,
      "loss": 0.6685,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.22679391503334045,
      "rewards/margins": 0.0314943790435791,
      "rewards/rejected": -0.25828829407691956,
      "step": 39
    },
    {
      "debug/policy_chosen_logits": 0.25073912739753723,
      "debug/policy_chosen_logps": -157.51876831054688,
      "debug/policy_rejected_logits": 0.30381596088409424,
      "debug/policy_rejected_logps": -176.589599609375,
      "debug/reference_chosen_logps": -138.98110961914062,
      "debug/reference_rejected_logps": -154.03880310058594,
      "epoch": 0.9523809523809523,
      "grad_norm": 5.529655141820795,
      "learning_rate": 5e-07,
      "logits/chosen": 0.25073912739753723,
      "logits/rejected": 0.30381596088409424,
      "logps/chosen": -157.51876831054688,
      "logps/rejected": -176.589599609375,
      "loss": 0.6694,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.1853766143321991,
      "rewards/margins": 0.040131378918886185,
      "rewards/rejected": -0.22550798952579498,
      "step": 40
    },
    {
      "debug/policy_chosen_logits": 0.23757268488407135,
      "debug/policy_chosen_logps": -167.54112243652344,
      "debug/policy_rejected_logits": 0.40399065613746643,
      "debug/policy_rejected_logps": -191.4041748046875,
      "debug/reference_chosen_logps": -145.42752075195312,
      "debug/reference_rejected_logps": -164.51507568359375,
      "epoch": 0.9761904761904762,
      "grad_norm": 5.959237498635192,
      "learning_rate": 5e-07,
      "logits/chosen": 0.23757268488407135,
      "logits/rejected": 0.40399065613746643,
      "logps/chosen": -167.54112243652344,
      "logps/rejected": -191.4041748046875,
      "loss": 0.6706,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.22113589942455292,
      "rewards/margins": 0.047754913568496704,
      "rewards/rejected": -0.2688907980918884,
      "step": 41
    },
    {
      "debug/policy_chosen_logits": 0.029359659180045128,
      "debug/policy_chosen_logps": -178.32794189453125,
      "debug/policy_rejected_logits": 0.20693586766719818,
      "debug/policy_rejected_logps": -185.86007690429688,
      "debug/reference_chosen_logps": -151.25294494628906,
      "debug/reference_rejected_logps": -157.61886596679688,
      "epoch": 1.0,
      "grad_norm": 15.740573011032756,
      "learning_rate": 5e-07,
      "logits/chosen": 0.029359659180045128,
      "logits/rejected": 0.20693586766719818,
      "logps/chosen": -178.32794189453125,
      "logps/rejected": -185.86007690429688,
      "loss": 0.6925,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.27075010538101196,
      "rewards/margins": 0.01166202500462532,
      "rewards/rejected": -0.2824121117591858,
      "step": 42
    },
    {
      "epoch": 1.0,
      "step": 42,
      "total_flos": 0.0,
      "train_loss": 0.6828021520660037,
      "train_runtime": 390.3052,
      "train_samples_per_second": 6.784,
      "train_steps_per_second": 0.108
    }
  ],
  "logging_steps": 1,
  "max_steps": 42,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}