{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999345977763244,
  "eval_steps": 100,
  "global_step": 764,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 6.493506493506494e-09,
      "logits/chosen": -2.0615594387054443,
      "logits/rejected": -1.9222214221954346,
      "logps/chosen": -3380.6083984375,
      "logps/rejected": -2521.2978515625,
      "loss": 0.0001,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.01,
      "learning_rate": 6.493506493506492e-08,
      "logits/chosen": -1.674426555633545,
      "logits/rejected": -1.637134313583374,
      "logps/chosen": -2549.3515625,
      "logps/rejected": -2319.4013671875,
      "loss": 10.0505,
      "rewards/accuracies": 0.4333333373069763,
      "rewards/chosen": 0.0008169158827513456,
      "rewards/margins": 0.0011402772506698966,
      "rewards/rejected": -0.00032336192089132965,
      "step": 10
    },
    {
      "epoch": 0.03,
      "learning_rate": 1.2987012987012984e-07,
      "logits/chosen": -1.6043205261230469,
      "logits/rejected": -1.5535523891448975,
      "logps/chosen": -2340.101318359375,
      "logps/rejected": -2224.145263671875,
      "loss": 7.4843,
      "rewards/accuracies": 0.5200000405311584,
      "rewards/chosen": 0.00018432810611557215,
      "rewards/margins": 0.0009077669237740338,
      "rewards/rejected": -0.0007234388613142073,
      "step": 20
    },
    {
      "epoch": 0.04,
      "learning_rate": 1.948051948051948e-07,
      "logits/chosen": -1.6847426891326904,
      "logits/rejected": -1.6577625274658203,
      "logps/chosen": -2983.23681640625,
      "logps/rejected": -2513.237060546875,
      "loss": 9.1379,
      "rewards/accuracies": 0.48000001907348633,
      "rewards/chosen": 0.010261936113238335,
      "rewards/margins": 0.004135974682867527,
      "rewards/rejected": 0.006125961430370808,
      "step": 30
    },
    {
      "epoch": 0.05,
      "learning_rate": 2.597402597402597e-07,
      "logits/chosen": -1.859400987625122,
      "logits/rejected": -1.8100417852401733,
      "logps/chosen": -2879.57470703125,
      "logps/rejected": -2273.878173828125,
      "loss": 12.271,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.012033696286380291,
      "rewards/margins": 0.005555520299822092,
      "rewards/rejected": 0.006478174589574337,
      "step": 40
    },
    {
      "epoch": 0.07,
      "learning_rate": 3.2467532467532465e-07,
      "logits/chosen": -1.828608751296997,
      "logits/rejected": -1.805625319480896,
      "logps/chosen": -2893.784423828125,
      "logps/rejected": -2551.77294921875,
      "loss": 8.7411,
      "rewards/accuracies": 0.5600000023841858,
      "rewards/chosen": 0.02166888490319252,
      "rewards/margins": 0.007775471545755863,
      "rewards/rejected": 0.013893413357436657,
      "step": 50
    },
    {
      "epoch": 0.08,
      "learning_rate": 3.896103896103896e-07,
      "logits/chosen": -1.7459495067596436,
      "logits/rejected": -1.6628999710083008,
      "logps/chosen": -3231.689697265625,
      "logps/rejected": -2554.42919921875,
      "loss": 9.758,
      "rewards/accuracies": 0.559999942779541,
      "rewards/chosen": 0.027519574388861656,
      "rewards/margins": 0.008895651437342167,
      "rewards/rejected": 0.018623923882842064,
      "step": 60
    },
    {
      "epoch": 0.09,
      "learning_rate": 4.545454545454545e-07,
      "logits/chosen": -1.8072433471679688,
      "logits/rejected": -1.7838470935821533,
      "logps/chosen": -2829.386474609375,
      "logps/rejected": -2542.68701171875,
      "loss": 11.0017,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.024034958332777023,
      "rewards/margins": 0.006175906863063574,
      "rewards/rejected": 0.017859051004052162,
      "step": 70
    },
    {
      "epoch": 0.1,
      "learning_rate": 4.99976474872689e-07,
      "logits/chosen": -1.7730411291122437,
      "logits/rejected": -1.7399647235870361,
      "logps/chosen": -2769.705322265625,
      "logps/rejected": -2476.75634765625,
      "loss": 15.623,
      "rewards/accuracies": 0.5400000214576721,
      "rewards/chosen": 0.008623984642326832,
      "rewards/margins": 0.008157819509506226,
      "rewards/rejected": 0.0004661638231482357,
      "step": 80
    },
    {
      "epoch": 0.12,
      "learning_rate": 4.995583735427465e-07,
      "logits/chosen": -1.790204644203186,
      "logits/rejected": -1.7226215600967407,
      "logps/chosen": -2688.0732421875,
      "logps/rejected": -2436.649658203125,
      "loss": 11.9811,
      "rewards/accuracies": 0.6100000143051147,
      "rewards/chosen": 0.017978714779019356,
      "rewards/margins": 0.017238261178135872,
      "rewards/rejected": 0.0007404519128613174,
      "step": 90
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.986184978516146e-07,
      "logits/chosen": -1.7211675643920898,
      "logits/rejected": -1.6991230249404907,
      "logps/chosen": -2611.177001953125,
      "logps/rejected": -2212.4033203125,
      "loss": 16.8403,
      "rewards/accuracies": 0.5200001001358032,
      "rewards/chosen": 0.024822045117616653,
      "rewards/margins": 0.00336282467469573,
      "rewards/rejected": 0.021459218114614487,
      "step": 100
    },
    {
      "epoch": 0.13,
      "eval_logits/chosen": -1.6729556322097778,
      "eval_logits/rejected": -1.6068017482757568,
      "eval_logps/chosen": -2806.55517578125,
      "eval_logps/rejected": -2491.901123046875,
      "eval_loss": 19.51178741455078,
      "eval_rewards/accuracies": 0.52734375,
      "eval_rewards/chosen": 0.025559017434716225,
      "eval_rewards/margins": 0.008243386633694172,
      "eval_rewards/rejected": 0.017315629869699478,
      "eval_runtime": 115.2508,
      "eval_samples_per_second": 17.353,
      "eval_steps_per_second": 0.278,
      "step": 100
    },
    {
      "epoch": 0.14,
      "learning_rate": 4.971588128827782e-07,
      "logits/chosen": -1.7473026514053345,
      "logits/rejected": -1.6806236505508423,
      "logps/chosen": -3125.757080078125,
      "logps/rejected": -2645.337158203125,
      "loss": 26.9149,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.016931097954511642,
      "rewards/margins": 0.002864243695512414,
      "rewards/rejected": 0.014066850766539574,
      "step": 110
    },
    {
      "epoch": 0.16,
      "learning_rate": 4.951823705321981e-07,
      "logits/chosen": -1.7069530487060547,
      "logits/rejected": -1.6579583883285522,
      "logps/chosen": -2828.78662109375,
      "logps/rejected": -2442.76416015625,
      "loss": 33.872,
      "rewards/accuracies": 0.5600000023841858,
      "rewards/chosen": 0.013961514458060265,
      "rewards/margins": 0.00896529946476221,
      "rewards/rejected": 0.004996216390281916,
      "step": 120
    },
    {
      "epoch": 0.17,
      "learning_rate": 4.926933031274343e-07,
      "logits/chosen": -1.7224699258804321,
      "logits/rejected": -1.6934731006622314,
      "logps/chosen": -2923.9306640625,
      "logps/rejected": -2566.210693359375,
      "loss": 39.0757,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.038237668573856354,
      "rewards/margins": 0.006029448006302118,
      "rewards/rejected": 0.03220822289586067,
      "step": 130
    },
    {
      "epoch": 0.18,
      "learning_rate": 4.896968147878145e-07,
      "logits/chosen": -1.7280409336090088,
      "logits/rejected": -1.7070726156234741,
      "logps/chosen": -2737.75927734375,
      "logps/rejected": -2486.45751953125,
      "loss": 18.5231,
      "rewards/accuracies": 0.6299999952316284,
      "rewards/chosen": 0.031596291810274124,
      "rewards/margins": 0.04216960817575455,
      "rewards/rejected": -0.010573318228125572,
      "step": 140
    },
    {
      "epoch": 0.2,
      "learning_rate": 4.861991705437081e-07,
      "logits/chosen": -1.7859830856323242,
      "logits/rejected": -1.7191492319107056,
      "logps/chosen": -2743.43310546875,
      "logps/rejected": -2297.162109375,
      "loss": 20.7835,
      "rewards/accuracies": 0.5800000429153442,
      "rewards/chosen": 0.0336376316845417,
      "rewards/margins": 0.011832155287265778,
      "rewards/rejected": 0.021805476397275925,
      "step": 150
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.822076832376586e-07,
      "logits/chosen": -1.8132251501083374,
      "logits/rejected": -1.7665789127349854,
      "logps/chosen": -2841.165771484375,
      "logps/rejected": -2748.486572265625,
      "loss": 57.9401,
      "rewards/accuracies": 0.5099999904632568,
      "rewards/chosen": 0.006118610501289368,
      "rewards/margins": 0.0022155127953737974,
      "rewards/rejected": 0.0039030970074236393,
      "step": 160
    },
    {
      "epoch": 0.22,
      "learning_rate": 4.777306982347594e-07,
      "logits/chosen": -1.6557657718658447,
      "logits/rejected": -1.5996118783950806,
      "logps/chosen": -3055.95361328125,
      "logps/rejected": -2603.83642578125,
      "loss": 23.1296,
      "rewards/accuracies": 0.6200000047683716,
      "rewards/chosen": 0.028251701965928078,
      "rewards/margins": 0.020935241132974625,
      "rewards/rejected": 0.007316464092582464,
      "step": 170
    },
    {
      "epoch": 0.24,
      "learning_rate": 4.7277757597424075e-07,
      "logits/chosen": -1.8335905075073242,
      "logits/rejected": -1.7595329284667969,
      "logps/chosen": -2963.73779296875,
      "logps/rejected": -2540.163818359375,
      "loss": 40.5046,
      "rewards/accuracies": 0.5400000810623169,
      "rewards/chosen": 0.018591446802020073,
      "rewards/margins": -0.0024230503477156162,
      "rewards/rejected": 0.02101449854671955,
      "step": 180
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.6735867239874904e-07,
      "logits/chosen": -1.8637840747833252,
      "logits/rejected": -1.7640159130096436,
      "logps/chosen": -3237.434814453125,
      "logps/rejected": -2429.197998046875,
      "loss": 36.3042,
      "rewards/accuracies": 0.6200000047683716,
      "rewards/chosen": 0.04794805496931076,
      "rewards/margins": 0.019117821007966995,
      "rewards/rejected": 0.028830235823988914,
      "step": 190
    },
    {
      "epoch": 0.26,
      "learning_rate": 4.6148531730223733e-07,
      "logits/chosen": -1.6909841299057007,
      "logits/rejected": -1.6915366649627686,
      "logps/chosen": -2649.89404296875,
      "logps/rejected": -2436.87353515625,
      "loss": 28.1241,
      "rewards/accuracies": 0.5300000309944153,
      "rewards/chosen": 0.007661645300686359,
      "rewards/margins": 0.0055509163066744804,
      "rewards/rejected": 0.0021107294596731663,
      "step": 200
    },
    {
      "epoch": 0.26,
      "eval_logits/chosen": -1.681164264678955,
      "eval_logits/rejected": -1.618328332901001,
      "eval_logps/chosen": -2808.258056640625,
      "eval_logps/rejected": -2494.01953125,
      "eval_loss": 32.517486572265625,
      "eval_rewards/accuracies": 0.5234375,
      "eval_rewards/chosen": 0.008527392521500587,
      "eval_rewards/margins": 0.012391308322548866,
      "eval_rewards/rejected": -0.0038639232516288757,
      "eval_runtime": 113.682,
      "eval_samples_per_second": 17.593,
      "eval_steps_per_second": 0.281,
      "step": 200
    },
    {
      "epoch": 0.27,
      "learning_rate": 4.5516979064173524e-07,
      "logits/chosen": -1.749903917312622,
      "logits/rejected": -1.7615283727645874,
      "logps/chosen": -2285.7451171875,
      "logps/rejected": -2269.229736328125,
      "loss": 25.9535,
      "rewards/accuracies": 0.6100000143051147,
      "rewards/chosen": 0.011981850489974022,
      "rewards/margins": 0.014764687046408653,
      "rewards/rejected": -0.0027828349266201258,
      "step": 210
    },
    {
      "epoch": 0.29,
      "learning_rate": 4.484252968625277e-07,
      "logits/chosen": -1.716509222984314,
      "logits/rejected": -1.6396989822387695,
      "logps/chosen": -2435.95556640625,
      "logps/rejected": -1922.770751953125,
      "loss": 28.3739,
      "rewards/accuracies": 0.6200000047683716,
      "rewards/chosen": 0.004359879065304995,
      "rewards/margins": 0.007711753249168396,
      "rewards/rejected": -0.0033518739510327578,
      "step": 220
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.4126593729042193e-07,
      "logits/chosen": -1.799469232559204,
      "logits/rejected": -1.757004737854004,
      "logps/chosen": -3254.6396484375,
      "logps/rejected": -2515.59619140625,
      "loss": 39.4707,
      "rewards/accuracies": 0.5900000333786011,
      "rewards/chosen": 0.03561704605817795,
      "rewards/margins": 0.019830647855997086,
      "rewards/rejected": 0.015786398202180862,
      "step": 230
    },
    {
      "epoch": 0.31,
      "learning_rate": 4.3370668064882397e-07,
      "logits/chosen": -1.7325947284698486,
      "logits/rejected": -1.7474550008773804,
      "logps/chosen": -2579.47412109375,
      "logps/rejected": -2328.500732421875,
      "loss": 44.2727,
      "rewards/accuracies": 0.5100000500679016,
      "rewards/chosen": 0.04269097000360489,
      "rewards/margins": 0.02060030959546566,
      "rewards/rejected": 0.02209065482020378,
      "step": 240
    },
    {
      "epoch": 0.33,
      "learning_rate": 4.2576333176226944e-07,
      "logits/chosen": -1.7366650104522705,
      "logits/rejected": -1.706789255142212,
      "logps/chosen": -2479.5576171875,
      "logps/rejected": -2277.726318359375,
      "loss": 29.5758,
      "rewards/accuracies": 0.5300000309944153,
      "rewards/chosen": 0.1058274507522583,
      "rewards/margins": 0.013660475611686707,
      "rewards/rejected": 0.0921669602394104,
      "step": 250
    },
    {
      "epoch": 0.34,
      "learning_rate": 4.17452498511841e-07,
      "logits/chosen": -1.7807962894439697,
      "logits/rejected": -1.7134149074554443,
      "logps/chosen": -2989.12841796875,
      "logps/rejected": -2354.25830078125,
      "loss": 38.7316,
      "rewards/accuracies": 0.5200000405311584,
      "rewards/chosen": 0.023859605193138123,
      "rewards/margins": 0.005521018523722887,
      "rewards/rejected": 0.018338587135076523,
      "step": 260
    },
    {
      "epoch": 0.35,
      "learning_rate": 4.087915571115629e-07,
      "logits/chosen": -1.8165556192398071,
      "logits/rejected": -1.7687098979949951,
      "logps/chosen": -2833.55859375,
      "logps/rejected": -2183.32470703125,
      "loss": 330.4642,
      "rewards/accuracies": 0.5699999928474426,
      "rewards/chosen": 0.031318746507167816,
      "rewards/margins": 0.029996121302247047,
      "rewards/rejected": 0.0013226259034126997,
      "step": 270
    },
    {
      "epoch": 0.37,
      "learning_rate": 3.997986157783715e-07,
      "logits/chosen": -1.6980018615722656,
      "logits/rejected": -1.589050531387329,
      "logps/chosen": -3510.792236328125,
      "logps/rejected": -2689.208984375,
      "loss": 58.1646,
      "rewards/accuracies": 0.5200000405311584,
      "rewards/chosen": 0.014776378870010376,
      "rewards/margins": 0.011294273659586906,
      "rewards/rejected": 0.0034821047447621822,
      "step": 280
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.9049247687162155e-07,
      "logits/chosen": -1.7791646718978882,
      "logits/rejected": -1.7399044036865234,
      "logps/chosen": -2478.590576171875,
      "logps/rejected": -2269.01416015625,
      "loss": 31.6725,
      "rewards/accuracies": 0.5699999928474426,
      "rewards/chosen": 0.04884537309408188,
      "rewards/margins": 0.0339895561337471,
      "rewards/rejected": 0.014855814166367054,
      "step": 290
    },
    {
      "epoch": 0.39,
      "learning_rate": 3.8089259758128543e-07,
      "logits/chosen": -1.670789361000061,
      "logits/rejected": -1.6030629873275757,
      "logps/chosen": -2726.465576171875,
      "logps/rejected": -2119.26123046875,
      "loss": 84.7591,
      "rewards/accuracies": 0.5699999928474426,
      "rewards/chosen": 0.021672677248716354,
      "rewards/margins": -0.010785548016428947,
      "rewards/rejected": 0.03245822712779045,
      "step": 300
    },
    {
      "epoch": 0.39,
      "eval_logits/chosen": -1.660080909729004,
      "eval_logits/rejected": -1.596778154373169,
      "eval_logps/chosen": -2806.140625,
      "eval_logps/rejected": -2492.270263671875,
      "eval_loss": 47.80431365966797,
      "eval_rewards/accuracies": 0.5390625,
      "eval_rewards/chosen": 0.029702020809054375,
      "eval_rewards/margins": 0.01607733778655529,
      "eval_rewards/rejected": 0.013624681159853935,
      "eval_runtime": 116.3019,
      "eval_samples_per_second": 17.197,
      "eval_steps_per_second": 0.275,
      "step": 300
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.710190492470386e-07,
      "logits/chosen": -1.6620228290557861,
      "logits/rejected": -1.7311099767684937,
      "logps/chosen": -2315.977294921875,
      "logps/rejected": -2199.08251953125,
      "loss": 43.6013,
      "rewards/accuracies": 0.5400000214576721,
      "rewards/chosen": 0.032384876161813736,
      "rewards/margins": 0.008921505883336067,
      "rewards/rejected": 0.02346337027847767,
      "step": 310
    },
    {
      "epoch": 0.42,
      "learning_rate": 3.6089247539328616e-07,
      "logits/chosen": -1.7675050497055054,
      "logits/rejected": -1.7156997919082642,
      "logps/chosen": -2859.810791015625,
      "logps/rejected": -2569.75537109375,
      "loss": 38.8904,
      "rewards/accuracies": 0.559999942779541,
      "rewards/chosen": 0.020630866289138794,
      "rewards/margins": 0.0018306337296962738,
      "rewards/rejected": 0.01880022883415222,
      "step": 320
    },
    {
      "epoch": 0.43,
      "learning_rate": 3.5053404856787166e-07,
      "logits/chosen": -1.6446609497070312,
      "logits/rejected": -1.5918724536895752,
      "logps/chosen": -3104.72802734375,
      "logps/rejected": -2430.239013671875,
      "loss": 84.9753,
      "rewards/accuracies": 0.47999995946884155,
      "rewards/chosen": 0.053660690784454346,
      "rewards/margins": -0.005831834394484758,
      "rewards/rejected": 0.05949252098798752,
      "step": 330
    },
    {
      "epoch": 0.44,
      "learning_rate": 3.399654260747078e-07,
      "logits/chosen": -1.699196219444275,
      "logits/rejected": -1.7045748233795166,
      "logps/chosen": -2584.699462890625,
      "logps/rejected": -2263.678466796875,
      "loss": 38.1532,
      "rewards/accuracies": 0.5300000309944153,
      "rewards/chosen": 0.02709970250725746,
      "rewards/margins": 0.01412280835211277,
      "rewards/rejected": 0.012976895086467266,
      "step": 340
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.2920870469288373e-07,
      "logits/chosen": -1.7267532348632812,
      "logits/rejected": -1.6659395694732666,
      "logps/chosen": -2935.341796875,
      "logps/rejected": -2503.583984375,
      "loss": 47.1836,
      "rewards/accuracies": 0.5199999809265137,
      "rewards/chosen": 0.031215447932481766,
      "rewards/margins": 0.022346725687384605,
      "rewards/rejected": 0.008868719451129436,
      "step": 350
    },
    {
      "epoch": 0.47,
      "learning_rate": 3.182863744769218e-07,
      "logits/chosen": -1.7288787364959717,
      "logits/rejected": -1.6928844451904297,
      "logps/chosen": -2811.489501953125,
      "logps/rejected": -2596.68310546875,
      "loss": 36.8176,
      "rewards/accuracies": 0.5099999904632568,
      "rewards/chosen": 0.1375296413898468,
      "rewards/margins": 0.0820910781621933,
      "rewards/rejected": 0.05543852597475052,
      "step": 360
    },
    {
      "epoch": 0.48,
      "learning_rate": 3.072212717347776e-07,
      "logits/chosen": -1.7680120468139648,
      "logits/rejected": -1.6781940460205078,
      "logps/chosen": -3101.98583984375,
      "logps/rejected": -2426.4716796875,
      "loss": 36.7837,
      "rewards/accuracies": 0.5199999809265137,
      "rewards/chosen": 0.022122934460639954,
      "rewards/margins": 0.011781491339206696,
      "rewards/rejected": 0.010341441258788109,
      "step": 370
    },
    {
      "epoch": 0.5,
      "learning_rate": 2.9603653128189665e-07,
      "logits/chosen": -1.6812299489974976,
      "logits/rejected": -1.7215496301651,
      "logps/chosen": -2823.8291015625,
      "logps/rejected": -2762.53076171875,
      "loss": 42.732,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.028385426849126816,
      "rewards/margins": -0.006515379063785076,
      "rewards/rejected": 0.03490080684423447,
      "step": 380
    },
    {
      "epoch": 0.51,
      "learning_rate": 2.8475553807115387e-07,
      "logits/chosen": -1.8070951700210571,
      "logits/rejected": -1.7426990270614624,
      "logps/chosen": -2697.833251953125,
      "logps/rejected": -2263.9990234375,
      "loss": 55.8683,
      "rewards/accuracies": 0.5099999904632568,
      "rewards/chosen": 0.012206131592392921,
      "rewards/margins": 0.011461116373538971,
      "rewards/rejected": 0.0007450145785696805,
      "step": 390
    },
    {
      "epoch": 0.52,
      "learning_rate": 2.7340187829980883e-07,
      "logits/chosen": -1.8249183893203735,
      "logits/rejected": -1.7130759954452515,
      "logps/chosen": -2940.11181640625,
      "logps/rejected": -2463.068359375,
      "loss": 40.7835,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.0059137181378901005,
      "rewards/margins": 0.01795141212642193,
      "rewards/rejected": -0.012037692591547966,
      "step": 400
    },
    {
      "epoch": 0.52,
      "eval_logits/chosen": -1.6917269229888916,
      "eval_logits/rejected": -1.628839373588562,
      "eval_logps/chosen": -2807.42626953125,
      "eval_logps/rejected": -2493.92041015625,
      "eval_loss": 30.672218322753906,
      "eval_rewards/accuracies": 0.5546875,
      "eval_rewards/chosen": 0.016848012804985046,
      "eval_rewards/margins": 0.019721925258636475,
      "eval_rewards/rejected": -0.0028739143162965775,
      "eval_runtime": 110.0303,
      "eval_samples_per_second": 18.177,
      "eval_steps_per_second": 0.291,
      "step": 400
    },
    {
      "epoch": 0.54,
      "learning_rate": 2.6199929009569996e-07,
      "logits/chosen": -1.7034717798233032,
      "logits/rejected": -1.707564353942871,
      "logps/chosen": -2599.38330078125,
      "logps/rejected": -2273.864990234375,
      "loss": 43.9981,
      "rewards/accuracies": 0.5600000023841858,
      "rewards/chosen": 0.02160579524934292,
      "rewards/margins": 0.0038177832029759884,
      "rewards/rejected": 0.017788011580705643,
      "step": 410
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.5057161388578505e-07,
      "logits/chosen": -1.7964134216308594,
      "logits/rejected": -1.730661392211914,
      "logps/chosen": -3038.08740234375,
      "logps/rejected": -2405.333740234375,
      "loss": 31.4477,
      "rewards/accuracies": 0.5600000619888306,
      "rewards/chosen": 0.05013390630483627,
      "rewards/margins": 0.029845798388123512,
      "rewards/rejected": 0.02028810977935791,
      "step": 420
    },
    {
      "epoch": 0.56,
      "learning_rate": 2.391427425507943e-07,
      "logits/chosen": -1.6959331035614014,
      "logits/rejected": -1.6784296035766602,
      "logps/chosen": -2696.2236328125,
      "logps/rejected": -2173.48583984375,
      "loss": 32.2174,
      "rewards/accuracies": 0.5600000619888306,
      "rewards/chosen": 0.01727980561554432,
      "rewards/margins": 0.013602805323898792,
      "rewards/rejected": 0.0036770000588148832,
      "step": 430
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.2773657147021465e-07,
      "logits/chosen": -1.8469693660736084,
      "logits/rejected": -1.7459551095962524,
      "logps/chosen": -3117.762451171875,
      "logps/rejected": -2390.564208984375,
      "loss": 37.6526,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.01079155970364809,
      "rewards/margins": 0.011436818167567253,
      "rewards/rejected": -0.00064525764901191,
      "step": 440
    },
    {
      "epoch": 0.59,
      "learning_rate": 2.1637694856204885e-07,
      "logits/chosen": -1.751587152481079,
      "logits/rejected": -1.6395552158355713,
      "logps/chosen": -2887.770751953125,
      "logps/rejected": -2129.771728515625,
      "loss": 53.6906,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.004514098167419434,
      "rewards/margins": 0.00042482782737351954,
      "rewards/rejected": 0.004089272115379572,
      "step": 450
    },
    {
      "epoch": 0.6,
      "learning_rate": 2.0508762442180743e-07,
      "logits/chosen": -1.8443762063980103,
      "logits/rejected": -1.792295217514038,
      "logps/chosen": -2964.06494140625,
      "logps/rejected": -2577.50244140625,
      "loss": 62.4137,
      "rewards/accuracies": 0.5600000023841858,
      "rewards/chosen": 0.04290894791483879,
      "rewards/margins": 0.01085699163377285,
      "rewards/rejected": 0.03205195814371109,
      "step": 460
    },
    {
      "epoch": 0.61,
      "learning_rate": 1.93892202664981e-07,
      "logits/chosen": -1.6403262615203857,
      "logits/rejected": -1.712969183921814,
      "logps/chosen": -2689.706787109375,
      "logps/rejected": -2513.2998046875,
      "loss": 31.7885,
      "rewards/accuracies": 0.5100000500679016,
      "rewards/chosen": 0.01223880797624588,
      "rewards/margins": 0.011182873509824276,
      "rewards/rejected": 0.0010559323709458113,
      "step": 470
    },
    {
      "epoch": 0.63,
      "learning_rate": 1.8281409057681686e-07,
      "logits/chosen": -1.651449203491211,
      "logits/rejected": -1.5920675992965698,
      "logps/chosen": -3211.50341796875,
      "logps/rejected": -2753.0322265625,
      "loss": 103.2519,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.0340498685836792,
      "rewards/margins": 0.005449384916573763,
      "rewards/rejected": 0.028600484132766724,
      "step": 480
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.7187645017258195e-07,
      "logits/chosen": -1.823428750038147,
      "logits/rejected": -1.7740917205810547,
      "logps/chosen": -2745.991455078125,
      "logps/rejected": -2407.27978515625,
      "loss": 48.2582,
      "rewards/accuracies": 0.5300000309944153,
      "rewards/chosen": 0.03057839907705784,
      "rewards/margins": 0.003420495195314288,
      "rewards/rejected": 0.027157902717590332,
      "step": 490
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.6110214977063343e-07,
      "logits/chosen": -1.7967636585235596,
      "logits/rejected": -1.7410199642181396,
      "logps/chosen": -2905.251708984375,
      "logps/rejected": -2435.401611328125,
      "loss": 36.2204,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.013903191313147545,
      "rewards/margins": 0.00013892585411667824,
      "rewards/rejected": 0.013764267787337303,
      "step": 500
    },
    {
      "epoch": 0.65,
      "eval_logits/chosen": -1.6842743158340454,
      "eval_logits/rejected": -1.6236169338226318,
      "eval_logps/chosen": -2806.076171875,
      "eval_logps/rejected": -2491.544677734375,
      "eval_loss": 31.220157623291016,
      "eval_rewards/accuracies": 0.53515625,
      "eval_rewards/chosen": 0.030346479266881943,
      "eval_rewards/margins": 0.009465347044169903,
      "eval_rewards/rejected": 0.020881133154034615,
      "eval_runtime": 112.3374,
      "eval_samples_per_second": 17.804,
      "eval_steps_per_second": 0.285,
      "step": 500
    },
    {
      "epoch": 0.67,
      "learning_rate": 1.5051371617954777e-07,
      "logits/chosen": -1.6810442209243774,
      "logits/rejected": -1.6596931219100952,
      "logps/chosen": -2559.396728515625,
      "logps/rejected": -2228.120361328125,
      "loss": 44.2046,
      "rewards/accuracies": 0.5399999618530273,
      "rewards/chosen": 0.016220757737755775,
      "rewards/margins": 0.007862111553549767,
      "rewards/rejected": 0.00835864432156086,
      "step": 510
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.4013328759927622e-07,
      "logits/chosen": -1.6315361261367798,
      "logits/rejected": -1.6191142797470093,
      "logps/chosen": -2893.280029296875,
      "logps/rejected": -2805.344970703125,
      "loss": 31.2508,
      "rewards/accuracies": 0.6299999952316284,
      "rewards/chosen": 0.02805119752883911,
      "rewards/margins": 0.01223050244152546,
      "rewards/rejected": 0.0158206969499588,
      "step": 520
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.2998256733479896e-07,
      "logits/chosen": -1.810739278793335,
      "logits/rejected": -1.8173195123672485,
      "logps/chosen": -2332.383056640625,
      "logps/rejected": -1922.5443115234375,
      "loss": 226.0483,
      "rewards/accuracies": 0.5600000023841858,
      "rewards/chosen": 0.020228227600455284,
      "rewards/margins": 0.009395391680300236,
      "rewards/rejected": 0.010832836851477623,
      "step": 530
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.200827784190537e-07,
      "logits/chosen": -1.6795597076416016,
      "logits/rejected": -1.6883628368377686,
      "logps/chosen": -3027.91796875,
      "logps/rejected": -2619.313232421875,
      "loss": 29.3654,
      "rewards/accuracies": 0.5600000619888306,
      "rewards/chosen": 0.01968817412853241,
      "rewards/margins": 0.008831174112856388,
      "rewards/rejected": 0.010857000946998596,
      "step": 540
    },
    {
      "epoch": 0.72,
      "learning_rate": 1.1045461924001323e-07,
      "logits/chosen": -1.791738748550415,
      "logits/rejected": -1.8031442165374756,
      "logps/chosen": -2852.6904296875,
      "logps/rejected": -2462.853271484375,
      "loss": 45.3966,
      "rewards/accuracies": 0.46000003814697266,
      "rewards/chosen": 0.010838394984602928,
      "rewards/margins": 0.003685446921736002,
      "rewards/rejected": 0.007152946200221777,
      "step": 550
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.0111822026468514e-07,
      "logits/chosen": -1.7872514724731445,
      "logits/rejected": -1.658860445022583,
      "logps/chosen": -2903.530029296875,
      "logps/rejected": -2319.64599609375,
      "loss": 67.4473,
      "rewards/accuracies": 0.5700000524520874,
      "rewards/chosen": 0.009608490392565727,
      "rewards/margins": 0.004570655524730682,
      "rewards/rejected": 0.0050378344021737576,
      "step": 560
    },
    {
      "epoch": 0.75,
      "learning_rate": 9.209310195051581e-08,
      "logits/chosen": -1.8252109289169312,
      "logits/rejected": -1.6855742931365967,
      "logps/chosen": -2538.860107421875,
      "logps/rejected": -1955.6048583984375,
      "loss": 63.0174,
      "rewards/accuracies": 0.6299999952316284,
      "rewards/chosen": 0.04829864576458931,
      "rewards/margins": 0.022980675101280212,
      "rewards/rejected": 0.025317972525954247,
      "step": 570
    },
    {
      "epoch": 0.76,
      "learning_rate": 8.339813393219713e-08,
      "logits/chosen": -1.739793062210083,
      "logits/rejected": -1.641005516052246,
      "logps/chosen": -2791.561767578125,
      "logps/rejected": -2475.72998046875,
      "loss": 59.8369,
      "rewards/accuracies": 0.5699999928474426,
      "rewards/chosen": 0.05231914669275284,
      "rewards/margins": 0.021858692169189453,
      "rewards/rejected": 0.030460450798273087,
      "step": 580
    },
    {
      "epoch": 0.77,
      "learning_rate": 7.505149556920698e-08,
      "logits/chosen": -1.8431494235992432,
      "logits/rejected": -1.7774893045425415,
      "logps/chosen": -2542.13427734375,
      "logps/rejected": -2193.825927734375,
      "loss": 29.3999,
      "rewards/accuracies": 0.5800000429153442,
      "rewards/chosen": 0.04752471297979355,
      "rewards/margins": 0.017444033175706863,
      "rewards/rejected": 0.030080681666731834,
      "step": 590
    },
    {
      "epoch": 0.78,
      "learning_rate": 6.707063793657064e-08,
      "logits/chosen": -1.7773969173431396,
      "logits/rejected": -1.6891686916351318,
      "logps/chosen": -2942.21240234375,
      "logps/rejected": -2429.352294921875,
      "loss": 99.7738,
      "rewards/accuracies": 0.6200000047683716,
      "rewards/chosen": 0.03306427597999573,
      "rewards/margins": 0.01405587512999773,
      "rewards/rejected": 0.019008399918675423,
      "step": 600
    },
    {
      "epoch": 0.78,
      "eval_logits/chosen": -1.6827195882797241,
      "eval_logits/rejected": -1.6222153902053833,
      "eval_logps/chosen": -2804.348388671875,
      "eval_logps/rejected": -2489.908935546875,
      "eval_loss": 33.74028778076172,
      "eval_rewards/accuracies": 0.5390625,
      "eval_rewards/chosen": 0.04762275516986847,
      "eval_rewards/margins": 0.010385587811470032,
      "eval_rewards/rejected": 0.037237171083688736,
      "eval_runtime": 106.3716,
      "eval_samples_per_second": 18.802,
      "eval_steps_per_second": 0.301,
      "step": 600
    },
    {
      "epoch": 0.8,
      "learning_rate": 5.947224733831363e-08,
      "logits/chosen": -1.759399175643921,
      "logits/rejected": -1.7431520223617554,
      "logps/chosen": -2756.701416015625,
      "logps/rejected": -2470.905029296875,
      "loss": 51.5387,
      "rewards/accuracies": 0.5199999809265137,
      "rewards/chosen": 0.011415710672736168,
      "rewards/margins": 0.009652274660766125,
      "rewards/rejected": 0.0017634350806474686,
      "step": 610
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.227221041988955e-08,
      "logits/chosen": -1.7857062816619873,
      "logits/rejected": -1.725630760192871,
      "logps/chosen": -2520.410400390625,
      "logps/rejected": -2319.78564453125,
      "loss": 28.3912,
      "rewards/accuracies": 0.5900000333786011,
      "rewards/chosen": 0.01811736635863781,
      "rewards/margins": 0.01297797542065382,
      "rewards/rejected": 0.005139390472322702,
      "step": 620
    },
    {
      "epoch": 0.82,
      "learning_rate": 4.548558095252758e-08,
      "logits/chosen": -1.6374757289886475,
      "logits/rejected": -1.673044204711914,
      "logps/chosen": -2845.2119140625,
      "logps/rejected": -2698.294677734375,
      "loss": 42.0619,
      "rewards/accuracies": 0.48000001907348633,
      "rewards/chosen": 0.030720695853233337,
      "rewards/margins": 0.020541973412036896,
      "rewards/rejected": 0.010178723372519016,
      "step": 630
    },
    {
      "epoch": 0.84,
      "learning_rate": 3.9126548358945635e-08,
      "logits/chosen": -1.7063062191009521,
      "logits/rejected": -1.6988853216171265,
      "logps/chosen": -3136.520263671875,
      "logps/rejected": -2731.25927734375,
      "loss": 46.0608,
      "rewards/accuracies": 0.5600000619888306,
      "rewards/chosen": 0.029350021854043007,
      "rewards/margins": 0.010425332933664322,
      "rewards/rejected": 0.018924688920378685,
      "step": 640
    },
    {
      "epoch": 0.85,
      "learning_rate": 3.3208408046234896e-08,
      "logits/chosen": -1.8164535760879517,
      "logits/rejected": -1.7656440734863281,
      "logps/chosen": -2538.8046875,
      "logps/rejected": -2061.81591796875,
      "loss": 40.0838,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.01678399369120598,
      "rewards/margins": 0.009666666388511658,
      "rewards/rejected": 0.007117328234016895,
      "step": 650
    },
    {
      "epoch": 0.86,
      "learning_rate": 2.774353360794493e-08,
      "logits/chosen": -1.7155154943466187,
      "logits/rejected": -1.7442939281463623,
      "logps/chosen": -2761.740966796875,
      "logps/rejected": -2534.80419921875,
      "loss": 36.8374,
      "rewards/accuracies": 0.64000004529953,
      "rewards/chosen": 0.03588343411684036,
      "rewards/margins": 0.037469957023859024,
      "rewards/rejected": -0.0015865217428654432,
      "step": 660
    },
    {
      "epoch": 0.88,
      "learning_rate": 2.2743350953487422e-08,
      "logits/chosen": -1.6992709636688232,
      "logits/rejected": -1.7416222095489502,
      "logps/chosen": -2850.97705078125,
      "logps/rejected": -2569.76611328125,
      "loss": 86.4259,
      "rewards/accuracies": 0.5300000905990601,
      "rewards/chosen": 0.018378589302301407,
      "rewards/margins": 0.004103804472833872,
      "rewards/rejected": 0.014274786226451397,
      "step": 670
    },
    {
      "epoch": 0.89,
      "learning_rate": 1.8218314418949387e-08,
      "logits/chosen": -1.718764305114746,
      "logits/rejected": -1.6741740703582764,
      "logps/chosen": -2353.093017578125,
      "logps/rejected": -2190.77001953125,
      "loss": 46.1473,
      "rewards/accuracies": 0.5699999928474426,
      "rewards/chosen": 0.0031641994137316942,
      "rewards/margins": 0.0004996396601200104,
      "rewards/rejected": 0.002664559753611684,
      "step": 680
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.4177884909263277e-08,
      "logits/chosen": -1.6867101192474365,
      "logits/rejected": -1.652515172958374,
      "logps/chosen": -2937.97412109375,
      "logps/rejected": -2552.07177734375,
      "loss": 37.0029,
      "rewards/accuracies": 0.5199999809265137,
      "rewards/chosen": 0.007288885302841663,
      "rewards/margins": -0.0001541988895041868,
      "rewards/rejected": 0.007443083915859461,
      "step": 690
    },
    {
      "epoch": 0.92,
      "learning_rate": 1.063051011743335e-08,
      "logits/chosen": -1.7554800510406494,
      "logits/rejected": -1.7419729232788086,
      "logps/chosen": -2755.109619140625,
      "logps/rejected": -2368.246337890625,
      "loss": 41.8506,
      "rewards/accuracies": 0.46000003814697266,
      "rewards/chosen": 0.006798497401177883,
      "rewards/margins": 0.010020612739026546,
      "rewards/rejected": -0.0032221146393567324,
      "step": 700
    },
    {
      "epoch": 0.92,
      "eval_logits/chosen": -1.682308554649353,
      "eval_logits/rejected": -1.6210675239562988,
      "eval_logps/chosen": -2806.1005859375,
      "eval_logps/rejected": -2491.68505859375,
      "eval_loss": 32.91334915161133,
      "eval_rewards/accuracies": 0.5546875,
      "eval_rewards/chosen": 0.030103469267487526,
      "eval_rewards/margins": 0.01062812004238367,
      "eval_rewards/rejected": 0.01947534643113613,
      "eval_runtime": 110.9725,
      "eval_samples_per_second": 18.022,
      "eval_steps_per_second": 0.288,
      "step": 700
    },
    {
      "epoch": 0.93,
      "learning_rate": 7.58360686217671e-09,
      "logits/chosen": -1.7843902111053467,
      "logits/rejected": -1.6857761144638062,
      "logps/chosen": -2821.310546875,
      "logps/rejected": -2445.586669921875,
      "loss": 44.0616,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.025277357548475266,
      "rewards/margins": 0.014521745964884758,
      "rewards/rejected": 0.010755611583590508,
      "step": 710
    },
    {
      "epoch": 0.94,
      "learning_rate": 5.043545580906694e-09,
      "logits/chosen": -1.7102206945419312,
      "logits/rejected": -1.6009165048599243,
      "logps/chosen": -2682.336669921875,
      "logps/rejected": -2234.36279296875,
      "loss": 44.9558,
      "rewards/accuracies": 0.6299999952316284,
      "rewards/chosen": 0.018401915207505226,
      "rewards/margins": 0.030992329120635986,
      "rewards/rejected": -0.012590417638421059,
      "step": 720
    },
    {
      "epoch": 0.95,
      "learning_rate": 3.015637010480576e-09,
      "logits/chosen": -1.7701480388641357,
      "logits/rejected": -1.7436597347259521,
      "logps/chosen": -3042.88330078125,
      "logps/rejected": -2501.55615234375,
      "loss": 35.3819,
      "rewards/accuracies": 0.5199999809265137,
      "rewards/chosen": 0.012798592448234558,
      "rewards/margins": -0.0020437492057681084,
      "rewards/rejected": 0.014842341654002666,
      "step": 730
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.5041210835596285e-09,
      "logits/chosen": -1.703537940979004,
      "logits/rejected": -1.6897165775299072,
      "logps/chosen": -2817.1484375,
      "logps/rejected": -2390.23095703125,
      "loss": 55.7217,
      "rewards/accuracies": 0.5800000429153442,
      "rewards/chosen": 0.022753870114684105,
      "rewards/margins": 0.017634030431509018,
      "rewards/rejected": 0.005119838751852512,
      "step": 740
    },
    {
      "epoch": 0.98,
      "learning_rate": 5.121580637968137e-10,
      "logits/chosen": -1.7322509288787842,
      "logits/rejected": -1.6345192193984985,
      "logps/chosen": -2836.63623046875,
      "logps/rejected": -2363.93505859375,
      "loss": 67.2692,
      "rewards/accuracies": 0.6300000548362732,
      "rewards/chosen": 0.02889620140194893,
      "rewards/margins": 0.014323192648589611,
      "rewards/rejected": 0.014573007822036743,
      "step": 750
    },
    {
      "epoch": 0.99,
      "learning_rate": 4.1821938386477075e-11,
      "logits/chosen": -1.7962977886199951,
      "logits/rejected": -1.7127879858016968,
      "logps/chosen": -2801.806640625,
      "logps/rejected": -2341.844970703125,
      "loss": 46.0674,
      "rewards/accuracies": 0.5700000524520874,
      "rewards/chosen": 0.009056088514626026,
      "rewards/margins": 0.013271180912852287,
      "rewards/rejected": -0.004215092398226261,
      "step": 760
    },
    {
      "epoch": 1.0,
      "step": 764,
      "total_flos": 0.0,
      "train_loss": 46.613421885577296,
      "train_runtime": 4597.6924,
      "train_samples_per_second": 13.297,
      "train_steps_per_second": 0.166
    }
  ],
  "logging_steps": 10,
  "max_steps": 764,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 0.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}