File size: 50,895 Bytes
609a020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.998691442030882,
  "eval_steps": 500,
  "global_step": 477,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010468463752944255,
      "grad_norm": 11.782889401902718,
      "learning_rate": 6.25e-08,
      "logits/chosen": -1.444485068321228,
      "logits/rejected": -1.4456722736358643,
      "logps/chosen": -7.9825921058654785,
      "logps/rejected": -8.156225204467773,
      "loss": 8.9796,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -7.9825921058654785,
      "rewards/margins": 0.17363198101520538,
      "rewards/rejected": -8.156225204467773,
      "step": 5
    },
    {
      "epoch": 0.02093692750588851,
      "grad_norm": 9.749361718413306,
      "learning_rate": 1.25e-07,
      "logits/chosen": -1.447454810142517,
      "logits/rejected": -1.4387584924697876,
      "logps/chosen": -8.047009468078613,
      "logps/rejected": -7.960066795349121,
      "loss": 8.9813,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -8.047009468078613,
      "rewards/margins": -0.08694207668304443,
      "rewards/rejected": -7.960066795349121,
      "step": 10
    },
    {
      "epoch": 0.031405391258832765,
      "grad_norm": 16.533988717004068,
      "learning_rate": 1.875e-07,
      "logits/chosen": -1.4474663734436035,
      "logits/rejected": -1.4442191123962402,
      "logps/chosen": -7.851595401763916,
      "logps/rejected": -7.866987705230713,
      "loss": 8.8899,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -7.851595401763916,
      "rewards/margins": 0.01539215724915266,
      "rewards/rejected": -7.866987705230713,
      "step": 15
    },
    {
      "epoch": 0.04187385501177702,
      "grad_norm": 13.917496227050558,
      "learning_rate": 2.5e-07,
      "logits/chosen": -1.440216064453125,
      "logits/rejected": -1.4452197551727295,
      "logps/chosen": -8.178640365600586,
      "logps/rejected": -8.201952934265137,
      "loss": 9.0475,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -8.178640365600586,
      "rewards/margins": 0.023312047123908997,
      "rewards/rejected": -8.201952934265137,
      "step": 20
    },
    {
      "epoch": 0.05234231876472128,
      "grad_norm": 13.358220692601913,
      "learning_rate": 3.125e-07,
      "logits/chosen": -1.474110722541809,
      "logits/rejected": -1.463666558265686,
      "logps/chosen": -8.079231262207031,
      "logps/rejected": -7.98193883895874,
      "loss": 9.124,
      "rewards/accuracies": 0.4375,
      "rewards/chosen": -8.079231262207031,
      "rewards/margins": -0.09729210287332535,
      "rewards/rejected": -7.98193883895874,
      "step": 25
    },
    {
      "epoch": 0.06281078251766553,
      "grad_norm": 11.375823739582524,
      "learning_rate": 3.75e-07,
      "logits/chosen": -1.4473092555999756,
      "logits/rejected": -1.4344959259033203,
      "logps/chosen": -7.780773162841797,
      "logps/rejected": -7.703455448150635,
      "loss": 9.0197,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -7.780773162841797,
      "rewards/margins": -0.07731723040342331,
      "rewards/rejected": -7.703455448150635,
      "step": 30
    },
    {
      "epoch": 0.07327924627060979,
      "grad_norm": 10.213017154182484,
      "learning_rate": 4.3749999999999994e-07,
      "logits/chosen": -1.4583995342254639,
      "logits/rejected": -1.431770920753479,
      "logps/chosen": -8.027624130249023,
      "logps/rejected": -7.8937225341796875,
      "loss": 8.9843,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -8.027624130249023,
      "rewards/margins": -0.13390299677848816,
      "rewards/rejected": -7.8937225341796875,
      "step": 35
    },
    {
      "epoch": 0.08374771002355404,
      "grad_norm": 10.12652288345569,
      "learning_rate": 5e-07,
      "logits/chosen": -1.4447615146636963,
      "logits/rejected": -1.458698034286499,
      "logps/chosen": -7.983005523681641,
      "logps/rejected": -8.174285888671875,
      "loss": 9.0094,
      "rewards/accuracies": 0.48124998807907104,
      "rewards/chosen": -7.983005523681641,
      "rewards/margins": 0.19128072261810303,
      "rewards/rejected": -8.174285888671875,
      "step": 40
    },
    {
      "epoch": 0.0942161737764983,
      "grad_norm": 10.9885005835532,
      "learning_rate": 5.625e-07,
      "logits/chosen": -1.4630662202835083,
      "logits/rejected": -1.4628698825836182,
      "logps/chosen": -8.03730583190918,
      "logps/rejected": -7.831875801086426,
      "loss": 8.9878,
      "rewards/accuracies": 0.45625001192092896,
      "rewards/chosen": -8.03730583190918,
      "rewards/margins": -0.20542971789836884,
      "rewards/rejected": -7.831875801086426,
      "step": 45
    },
    {
      "epoch": 0.10468463752944256,
      "grad_norm": 13.872196323961617,
      "learning_rate": 5.999678242522831e-07,
      "logits/chosen": -1.4442825317382812,
      "logits/rejected": -1.4613512754440308,
      "logps/chosen": -8.217935562133789,
      "logps/rejected": -8.252190589904785,
      "loss": 9.0757,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -8.217935562133789,
      "rewards/margins": 0.03425510972738266,
      "rewards/rejected": -8.252190589904785,
      "step": 50
    },
    {
      "epoch": 0.11515310128238682,
      "grad_norm": 10.905494395813982,
      "learning_rate": 5.996059263493219e-07,
      "logits/chosen": -1.4492484331130981,
      "logits/rejected": -1.4467532634735107,
      "logps/chosen": -8.046092987060547,
      "logps/rejected": -8.062843322753906,
      "loss": 9.1036,
      "rewards/accuracies": 0.4312500059604645,
      "rewards/chosen": -8.046092987060547,
      "rewards/margins": 0.01675090566277504,
      "rewards/rejected": -8.062843322753906,
      "step": 55
    },
    {
      "epoch": 0.12562156503533106,
      "grad_norm": 15.995330684554988,
      "learning_rate": 5.988423976115163e-07,
      "logits/chosen": -1.443290114402771,
      "logits/rejected": -1.4562170505523682,
      "logps/chosen": -8.026491165161133,
      "logps/rejected": -8.317246437072754,
      "loss": 8.9008,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -8.026491165161133,
      "rewards/margins": 0.29075488448143005,
      "rewards/rejected": -8.317246437072754,
      "step": 60
    },
    {
      "epoch": 0.1360900287882753,
      "grad_norm": 24.861886587620123,
      "learning_rate": 5.976782615723061e-07,
      "logits/chosen": -1.392534613609314,
      "logits/rejected": -1.4108682870864868,
      "logps/chosen": -7.828791618347168,
      "logps/rejected": -8.337072372436523,
      "loss": 8.934,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": -7.828791618347168,
      "rewards/margins": 0.5082817673683167,
      "rewards/rejected": -8.337072372436523,
      "step": 65
    },
    {
      "epoch": 0.14655849254121958,
      "grad_norm": 35.209412870115344,
      "learning_rate": 5.961150787913738e-07,
      "logits/chosen": -1.39071524143219,
      "logits/rejected": -1.3853540420532227,
      "logps/chosen": -7.945198059082031,
      "logps/rejected": -8.038311004638672,
      "loss": 8.9653,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -7.945198059082031,
      "rewards/margins": 0.0931134819984436,
      "rewards/rejected": -8.038311004638672,
      "step": 70
    },
    {
      "epoch": 0.15702695629416383,
      "grad_norm": 12.413941901156766,
      "learning_rate": 5.941549447626671e-07,
      "logits/chosen": -1.3913167715072632,
      "logits/rejected": -1.3984179496765137,
      "logps/chosen": -7.823273658752441,
      "logps/rejected": -7.864768981933594,
      "loss": 8.9142,
      "rewards/accuracies": 0.48124998807907104,
      "rewards/chosen": -7.823273658752441,
      "rewards/margins": 0.04149458184838295,
      "rewards/rejected": -7.864768981933594,
      "step": 75
    },
    {
      "epoch": 0.16749542004710807,
      "grad_norm": 21.221667512587725,
      "learning_rate": 5.918004871053251e-07,
      "logits/chosen": -1.3923091888427734,
      "logits/rejected": -1.4085341691970825,
      "logps/chosen": -7.852835178375244,
      "logps/rejected": -7.9230217933654785,
      "loss": 8.9088,
      "rewards/accuracies": 0.4625000059604645,
      "rewards/chosen": -7.852835178375244,
      "rewards/margins": 0.07018764317035675,
      "rewards/rejected": -7.9230217933654785,
      "step": 80
    },
    {
      "epoch": 0.17796388380005235,
      "grad_norm": 12.603711372215182,
      "learning_rate": 5.890548620412763e-07,
      "logits/chosen": -1.4011937379837036,
      "logits/rejected": -1.39864182472229,
      "logps/chosen": -7.970945835113525,
      "logps/rejected": -8.160429000854492,
      "loss": 9.0488,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": -7.970945835113525,
      "rewards/margins": 0.18948234617710114,
      "rewards/rejected": -8.160429000854492,
      "step": 85
    },
    {
      "epoch": 0.1884323475529966,
      "grad_norm": 13.164098047063113,
      "learning_rate": 5.859217501642258e-07,
      "logits/chosen": -1.375800371170044,
      "logits/rejected": -1.389070749282837,
      "logps/chosen": -7.946028232574463,
      "logps/rejected": -8.130967140197754,
      "loss": 9.0141,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -7.946028232574463,
      "rewards/margins": 0.18493881821632385,
      "rewards/rejected": -8.130967140197754,
      "step": 90
    },
    {
      "epoch": 0.19890081130594087,
      "grad_norm": 11.129043830781203,
      "learning_rate": 5.824053515057091e-07,
      "logits/chosen": -1.384723424911499,
      "logits/rejected": -1.3767420053482056,
      "logps/chosen": -8.055198669433594,
      "logps/rejected": -7.921385288238525,
      "loss": 9.0835,
      "rewards/accuracies": 0.4124999940395355,
      "rewards/chosen": -8.055198669433594,
      "rewards/margins": -0.13381320238113403,
      "rewards/rejected": -7.921385288238525,
      "step": 95
    },
    {
      "epoch": 0.2093692750588851,
      "grad_norm": 19.959846628166616,
      "learning_rate": 5.785103799048218e-07,
      "logits/chosen": -1.4132357835769653,
      "logits/rejected": -1.418881893157959,
      "logps/chosen": -8.033044815063477,
      "logps/rejected": -8.07997989654541,
      "loss": 9.0153,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -8.033044815063477,
      "rewards/margins": 0.04693456366658211,
      "rewards/rejected": -8.07997989654541,
      "step": 100
    },
    {
      "epoch": 0.21983773881182936,
      "grad_norm": 12.843923135972,
      "learning_rate": 5.742420566891749e-07,
      "logits/chosen": -1.413010835647583,
      "logits/rejected": -1.4074172973632812,
      "logps/chosen": -7.718166351318359,
      "logps/rejected": -7.9243879318237305,
      "loss": 8.9445,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -7.718166351318359,
      "rewards/margins": 0.206221342086792,
      "rewards/rejected": -7.9243879318237305,
      "step": 105
    },
    {
      "epoch": 0.23030620256477363,
      "grad_norm": 13.825481188162163,
      "learning_rate": 5.696061036755478e-07,
      "logits/chosen": -1.4453760385513306,
      "logits/rejected": -1.4452683925628662,
      "logps/chosen": -7.982637882232666,
      "logps/rejected": -8.220747947692871,
      "loss": 9.0144,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -7.982637882232666,
      "rewards/margins": 0.23810970783233643,
      "rewards/rejected": -8.220747947692871,
      "step": 110
    },
    {
      "epoch": 0.24077466631771788,
      "grad_norm": 1525.1356967991103,
      "learning_rate": 5.64608735499618e-07,
      "logits/chosen": -1.3860673904418945,
      "logits/rejected": -1.3894257545471191,
      "logps/chosen": -7.8776044845581055,
      "logps/rejected": -8.189804077148438,
      "loss": 8.9598,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": -7.8776044845581055,
      "rewards/margins": 0.31219929456710815,
      "rewards/rejected": -8.189804077148438,
      "step": 115
    },
    {
      "epoch": 0.2512431300706621,
      "grad_norm": 13.913132246096096,
      "learning_rate": 5.592566512850545e-07,
      "logits/chosen": -1.3590507507324219,
      "logits/rejected": -1.3622348308563232,
      "logps/chosen": -8.100934982299805,
      "logps/rejected": -8.155590057373047,
      "loss": 8.9501,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -8.100934982299805,
      "rewards/margins": 0.054654598236083984,
      "rewards/rejected": -8.155590057373047,
      "step": 120
    },
    {
      "epoch": 0.26171159382360637,
      "grad_norm": 14.57715484351377,
      "learning_rate": 5.535570256631384e-07,
      "logits/chosen": -1.4173157215118408,
      "logits/rejected": -1.411921739578247,
      "logps/chosen": -8.191034317016602,
      "logps/rejected": -8.077339172363281,
      "loss": 9.0651,
      "rewards/accuracies": 0.46875,
      "rewards/chosen": -8.191034317016602,
      "rewards/margins": -0.11369502544403076,
      "rewards/rejected": -8.077339172363281,
      "step": 125
    },
    {
      "epoch": 0.2721800575765506,
      "grad_norm": 14.54742842440625,
      "learning_rate": 5.475174991549528e-07,
      "logits/chosen": -1.37632417678833,
      "logits/rejected": -1.3858749866485596,
      "logps/chosen": -8.046875953674316,
      "logps/rejected": -8.172870635986328,
      "loss": 8.9777,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -8.046875953674316,
      "rewards/margins": 0.12599456310272217,
      "rewards/rejected": -8.172870635986328,
      "step": 130
    },
    {
      "epoch": 0.2826485213294949,
      "grad_norm": 18.256408848890032,
      "learning_rate": 5.411461679290317e-07,
      "logits/chosen": -1.3864247798919678,
      "logits/rejected": -1.4004995822906494,
      "logps/chosen": -7.979268550872803,
      "logps/rejected": -8.406595230102539,
      "loss": 8.9672,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": -7.979268550872803,
      "rewards/margins": 0.4273262023925781,
      "rewards/rejected": -8.406595230102539,
      "step": 135
    },
    {
      "epoch": 0.29311698508243916,
      "grad_norm": 14.27521931097187,
      "learning_rate": 5.34451572948201e-07,
      "logits/chosen": -1.4093233346939087,
      "logits/rejected": -1.4172067642211914,
      "logps/chosen": -7.903810977935791,
      "logps/rejected": -7.975949287414551,
      "loss": 8.9533,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -7.903810977935791,
      "rewards/margins": 0.07213909924030304,
      "rewards/rejected": -7.975949287414551,
      "step": 140
    },
    {
      "epoch": 0.3035854488353834,
      "grad_norm": 10.523105376926537,
      "learning_rate": 5.274426885201582e-07,
      "logits/chosen": -1.4147297143936157,
      "logits/rejected": -1.4396823644638062,
      "logps/chosen": -7.8977460861206055,
      "logps/rejected": -8.05931568145752,
      "loss": 8.915,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": -7.8977460861206055,
      "rewards/margins": 0.16156847774982452,
      "rewards/rejected": -8.05931568145752,
      "step": 145
    },
    {
      "epoch": 0.31405391258832765,
      "grad_norm": 14.122907500033074,
      "learning_rate": 5.201289102671411e-07,
      "logits/chosen": -1.4332246780395508,
      "logits/rejected": -1.436842679977417,
      "logps/chosen": -7.895875453948975,
      "logps/rejected": -8.0299072265625,
      "loss": 8.9785,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -7.895875453948975,
      "rewards/margins": 0.13403132557868958,
      "rewards/rejected": -8.0299072265625,
      "step": 150
    },
    {
      "epoch": 0.3245223763412719,
      "grad_norm": 12.92310774863363,
      "learning_rate": 5.12520042530811e-07,
      "logits/chosen": -1.402719259262085,
      "logits/rejected": -1.3787992000579834,
      "logps/chosen": -7.979246616363525,
      "logps/rejected": -7.966032981872559,
      "loss": 9.0256,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -7.979246616363525,
      "rewards/margins": -0.013212683610618114,
      "rewards/rejected": -7.966032981872559,
      "step": 155
    },
    {
      "epoch": 0.33499084009421615,
      "grad_norm": 15.237628673130487,
      "learning_rate": 5.046262852292346e-07,
      "logits/chosen": -1.3872135877609253,
      "logits/rejected": -1.395935297012329,
      "logps/chosen": -8.034635543823242,
      "logps/rejected": -8.069303512573242,
      "loss": 9.0268,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -8.034635543823242,
      "rewards/margins": 0.03466759994626045,
      "rewards/rejected": -8.069303512573242,
      "step": 160
    },
    {
      "epoch": 0.34545930384716045,
      "grad_norm": 11.298592435998462,
      "learning_rate": 4.964582201835856e-07,
      "logits/chosen": -1.396750569343567,
      "logits/rejected": -1.3891570568084717,
      "logps/chosen": -7.99398946762085,
      "logps/rejected": -8.040716171264648,
      "loss": 9.0073,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -7.99398946762085,
      "rewards/margins": 0.04672648385167122,
      "rewards/rejected": -8.040716171264648,
      "step": 165
    },
    {
      "epoch": 0.3559277676001047,
      "grad_norm": 12.492415372530475,
      "learning_rate": 4.880267969328908e-07,
      "logits/chosen": -1.3683674335479736,
      "logits/rejected": -1.3726252317428589,
      "logps/chosen": -8.114925384521484,
      "logps/rejected": -8.097586631774902,
      "loss": 9.0856,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": -8.114925384521484,
      "rewards/margins": -0.01733933761715889,
      "rewards/rejected": -8.097586631774902,
      "step": 170
    },
    {
      "epoch": 0.36639623135304894,
      "grad_norm": 12.610496367889976,
      "learning_rate": 4.793433180558423e-07,
      "logits/chosen": -1.3843915462493896,
      "logits/rejected": -1.3853034973144531,
      "logps/chosen": -7.956766605377197,
      "logps/rejected": -7.944356441497803,
      "loss": 9.0054,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -7.956766605377197,
      "rewards/margins": -0.012410154566168785,
      "rewards/rejected": -7.944356441497803,
      "step": 175
    },
    {
      "epoch": 0.3768646951059932,
      "grad_norm": 15.598692092405715,
      "learning_rate": 4.704194240193467e-07,
      "logits/chosen": -1.3554438352584839,
      "logits/rejected": -1.372804880142212,
      "logps/chosen": -8.031749725341797,
      "logps/rejected": -8.155205726623535,
      "loss": 8.9878,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -8.031749725341797,
      "rewards/margins": 0.12345610558986664,
      "rewards/rejected": -8.155205726623535,
      "step": 180
    },
    {
      "epoch": 0.38733315885893743,
      "grad_norm": 13.474501957199323,
      "learning_rate": 4.6126707757412686e-07,
      "logits/chosen": -1.3345744609832764,
      "logits/rejected": -1.3397581577301025,
      "logps/chosen": -7.977494716644287,
      "logps/rejected": -8.02932357788086,
      "loss": 8.9482,
      "rewards/accuracies": 0.5562499761581421,
      "rewards/chosen": -7.977494716644287,
      "rewards/margins": 0.051828037947416306,
      "rewards/rejected": -8.02932357788086,
      "step": 185
    },
    {
      "epoch": 0.39780162261188173,
      "grad_norm": 11.929724403265839,
      "learning_rate": 4.5189854771829086e-07,
      "logits/chosen": -1.3528499603271484,
      "logits/rejected": -1.3492704629898071,
      "logps/chosen": -7.803788661956787,
      "logps/rejected": -7.93734073638916,
      "loss": 8.9516,
      "rewards/accuracies": 0.44999998807907104,
      "rewards/chosen": -7.803788661956787,
      "rewards/margins": 0.1335521936416626,
      "rewards/rejected": -7.93734073638916,
      "step": 190
    },
    {
      "epoch": 0.408270086364826,
      "grad_norm": 14.327437395286285,
      "learning_rate": 4.4232639325036807e-07,
      "logits/chosen": -1.3263393640518188,
      "logits/rejected": -1.3331449031829834,
      "logps/chosen": -8.183530807495117,
      "logps/rejected": -8.074382781982422,
      "loss": 9.054,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -8.183530807495117,
      "rewards/margins": -0.10914800316095352,
      "rewards/rejected": -8.074382781982422,
      "step": 195
    },
    {
      "epoch": 0.4187385501177702,
      "grad_norm": 12.623357323327125,
      "learning_rate": 4.32563445933859e-07,
      "logits/chosen": -1.3866218328475952,
      "logits/rejected": -1.376103401184082,
      "logps/chosen": -7.869284152984619,
      "logps/rejected": -7.980343818664551,
      "loss": 9.0216,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -7.869284152984619,
      "rewards/margins": 0.11105932295322418,
      "rewards/rejected": -7.980343818664551,
      "step": 200
    },
    {
      "epoch": 0.42920701387071447,
      "grad_norm": 15.673764218634288,
      "learning_rate": 4.226227932958664e-07,
      "logits/chosen": -1.3467977046966553,
      "logits/rejected": -1.3465808629989624,
      "logps/chosen": -7.946604251861572,
      "logps/rejected": -8.12873363494873,
      "loss": 8.9418,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -7.946604251861572,
      "rewards/margins": 0.18213000893592834,
      "rewards/rejected": -8.12873363494873,
      "step": 205
    },
    {
      "epoch": 0.4396754776236587,
      "grad_norm": 20.82547017360473,
      "learning_rate": 4.1251776108286854e-07,
      "logits/chosen": -1.3276244401931763,
      "logits/rejected": -1.3366806507110596,
      "logps/chosen": -7.942746639251709,
      "logps/rejected": -8.075704574584961,
      "loss": 8.992,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -7.942746639251709,
      "rewards/margins": 0.13295890390872955,
      "rewards/rejected": -8.075704574584961,
      "step": 210
    },
    {
      "epoch": 0.45014394137660296,
      "grad_norm": 11.77567830972404,
      "learning_rate": 4.022618953971514e-07,
      "logits/chosen": -1.3542811870574951,
      "logits/rejected": -1.3621467351913452,
      "logps/chosen": -7.741019248962402,
      "logps/rejected": -8.169224739074707,
      "loss": 8.9028,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -7.741019248962402,
      "rewards/margins": 0.42820531129837036,
      "rewards/rejected": -8.169224739074707,
      "step": 215
    },
    {
      "epoch": 0.46061240512954726,
      "grad_norm": 13.792748846310712,
      "learning_rate": 3.918689445378477e-07,
      "logits/chosen": -1.3647044897079468,
      "logits/rejected": -1.3888493776321411,
      "logps/chosen": -7.679605960845947,
      "logps/rejected": -7.820864677429199,
      "loss": 9.0059,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": -7.679605960845947,
      "rewards/margins": 0.14125962555408478,
      "rewards/rejected": -7.820864677429199,
      "step": 220
    },
    {
      "epoch": 0.4710808688824915,
      "grad_norm": 10.698537268464346,
      "learning_rate": 3.813528405709251e-07,
      "logits/chosen": -1.3668994903564453,
      "logits/rejected": -1.370476484298706,
      "logps/chosen": -7.723212242126465,
      "logps/rejected": -7.974145412445068,
      "loss": 8.9131,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -7.723212242126465,
      "rewards/margins": 0.25093379616737366,
      "rewards/rejected": -7.974145412445068,
      "step": 225
    },
    {
      "epoch": 0.48154933263543576,
      "grad_norm": 12.476277662413903,
      "learning_rate": 3.707276806528282e-07,
      "logits/chosen": -1.37067449092865,
      "logits/rejected": -1.3700437545776367,
      "logps/chosen": -8.093690872192383,
      "logps/rejected": -8.251599311828613,
      "loss": 9.068,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -8.093690872192383,
      "rewards/margins": 0.157908633351326,
      "rewards/rejected": -8.251599311828613,
      "step": 230
    },
    {
      "epoch": 0.49201779638838,
      "grad_norm": 12.703214615987921,
      "learning_rate": 3.6000770813281334e-07,
      "logits/chosen": -1.3918092250823975,
      "logits/rejected": -1.3941457271575928,
      "logps/chosen": -7.891854286193848,
      "logps/rejected": -8.121790885925293,
      "loss": 8.9911,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -7.891854286193848,
      "rewards/margins": 0.22993668913841248,
      "rewards/rejected": -8.121790885925293,
      "step": 235
    },
    {
      "epoch": 0.5024862601413242,
      "grad_norm": 18.912862114031174,
      "learning_rate": 3.4920729345930654e-07,
      "logits/chosen": -1.3598334789276123,
      "logits/rejected": -1.3656227588653564,
      "logps/chosen": -7.972811698913574,
      "logps/rejected": -8.120051383972168,
      "loss": 9.0708,
      "rewards/accuracies": 0.5687500238418579,
      "rewards/chosen": -7.972811698913574,
      "rewards/margins": 0.14723989367485046,
      "rewards/rejected": -8.120051383972168,
      "step": 240
    },
    {
      "epoch": 0.5129547238942685,
      "grad_norm": 15.322600609417346,
      "learning_rate": 3.383409149158814e-07,
      "logits/chosen": -1.3441493511199951,
      "logits/rejected": -1.3492319583892822,
      "logps/chosen": -8.092975616455078,
      "logps/rejected": -8.160036087036133,
      "loss": 8.9194,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -8.092975616455078,
      "rewards/margins": 0.06706006824970245,
      "rewards/rejected": -8.160036087036133,
      "step": 245
    },
    {
      "epoch": 0.5234231876472127,
      "grad_norm": 16.724538535729355,
      "learning_rate": 3.2742313921268035e-07,
      "logits/chosen": -1.3152296543121338,
      "logits/rejected": -1.3239524364471436,
      "logps/chosen": -7.889418601989746,
      "logps/rejected": -8.20849323272705,
      "loss": 8.8184,
      "rewards/accuracies": 0.581250011920929,
      "rewards/chosen": -7.889418601989746,
      "rewards/margins": 0.31907448172569275,
      "rewards/rejected": -8.20849323272705,
      "step": 250
    },
    {
      "epoch": 0.533891651400157,
      "grad_norm": 12.327867536896116,
      "learning_rate": 3.1646860195929825e-07,
      "logits/chosen": -1.3065917491912842,
      "logits/rejected": -1.3107439279556274,
      "logps/chosen": -8.116486549377441,
      "logps/rejected": -8.308655738830566,
      "loss": 8.9949,
      "rewards/accuracies": 0.5687500238418579,
      "rewards/chosen": -8.116486549377441,
      "rewards/margins": 0.19216908514499664,
      "rewards/rejected": -8.308655738830566,
      "step": 255
    },
    {
      "epoch": 0.5443601151531012,
      "grad_norm": 14.17754725379555,
      "learning_rate": 3.054919880453032e-07,
      "logits/chosen": -1.246124029159546,
      "logits/rejected": -1.2508999109268188,
      "logps/chosen": -7.7648186683654785,
      "logps/rejected": -8.22431755065918,
      "loss": 8.941,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": -7.7648186683654785,
      "rewards/margins": 0.4594977796077728,
      "rewards/rejected": -8.22431755065918,
      "step": 260
    },
    {
      "epoch": 0.5548285789060455,
      "grad_norm": 11.969966746660198,
      "learning_rate": 2.9450801195469686e-07,
      "logits/chosen": -1.3018732070922852,
      "logits/rejected": -1.3149497509002686,
      "logps/chosen": -7.904818058013916,
      "logps/rejected": -8.152360916137695,
      "loss": 8.9657,
      "rewards/accuracies": 0.581250011920929,
      "rewards/chosen": -7.904818058013916,
      "rewards/margins": 0.24754443764686584,
      "rewards/rejected": -8.152360916137695,
      "step": 265
    },
    {
      "epoch": 0.5652970426589898,
      "grad_norm": 14.47186665684816,
      "learning_rate": 2.835313980407017e-07,
      "logits/chosen": -1.3108150959014893,
      "logits/rejected": -1.288703441619873,
      "logps/chosen": -8.249927520751953,
      "logps/rejected": -8.318041801452637,
      "loss": 9.0073,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -8.249927520751953,
      "rewards/margins": 0.06811434030532837,
      "rewards/rejected": -8.318041801452637,
      "step": 270
    },
    {
      "epoch": 0.575765506411934,
      "grad_norm": 26.602745593974163,
      "learning_rate": 2.7257686078731973e-07,
      "logits/chosen": -1.337909460067749,
      "logits/rejected": -1.348547339439392,
      "logps/chosen": -7.881032466888428,
      "logps/rejected": -8.068848609924316,
      "loss": 8.8981,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -7.881032466888428,
      "rewards/margins": 0.18781575560569763,
      "rewards/rejected": -8.068848609924316,
      "step": 275
    },
    {
      "epoch": 0.5862339701648783,
      "grad_norm": 14.906273538361356,
      "learning_rate": 2.6165908508411857e-07,
      "logits/chosen": -1.3503994941711426,
      "logits/rejected": -1.3676143884658813,
      "logps/chosen": -7.861943244934082,
      "logps/rejected": -8.101309776306152,
      "loss": 8.9213,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -7.861943244934082,
      "rewards/margins": 0.23936741054058075,
      "rewards/rejected": -8.101309776306152,
      "step": 280
    },
    {
      "epoch": 0.5967024339178225,
      "grad_norm": 14.643252229490672,
      "learning_rate": 2.5079270654069354e-07,
      "logits/chosen": -1.3024542331695557,
      "logits/rejected": -1.3081843852996826,
      "logps/chosen": -7.836719512939453,
      "logps/rejected": -8.08849048614502,
      "loss": 8.8721,
      "rewards/accuracies": 0.5562499761581421,
      "rewards/chosen": -7.836719512939453,
      "rewards/margins": 0.251770943403244,
      "rewards/rejected": -8.08849048614502,
      "step": 285
    },
    {
      "epoch": 0.6071708976707668,
      "grad_norm": 12.350106404715637,
      "learning_rate": 2.399922918671867e-07,
      "logits/chosen": -1.337571620941162,
      "logits/rejected": -1.3552089929580688,
      "logps/chosen": -7.821458339691162,
      "logps/rejected": -8.146204948425293,
      "loss": 8.9032,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": -7.821458339691162,
      "rewards/margins": 0.3247470557689667,
      "rewards/rejected": -8.146204948425293,
      "step": 290
    },
    {
      "epoch": 0.6176393614237111,
      "grad_norm": 13.544262102627407,
      "learning_rate": 2.2927231934717176e-07,
      "logits/chosen": -1.331067442893982,
      "logits/rejected": -1.3430246114730835,
      "logps/chosen": -7.9300737380981445,
      "logps/rejected": -8.060845375061035,
      "loss": 8.9735,
      "rewards/accuracies": 0.574999988079071,
      "rewards/chosen": -7.9300737380981445,
      "rewards/margins": 0.1307719349861145,
      "rewards/rejected": -8.060845375061035,
      "step": 295
    },
    {
      "epoch": 0.6281078251766553,
      "grad_norm": 12.721635836612304,
      "learning_rate": 2.1864715942907487e-07,
      "logits/chosen": -1.299328088760376,
      "logits/rejected": -1.3065472841262817,
      "logps/chosen": -7.961094856262207,
      "logps/rejected": -8.206907272338867,
      "loss": 8.9027,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": -7.961094856262207,
      "rewards/margins": 0.24581179022789001,
      "rewards/rejected": -8.206907272338867,
      "step": 300
    },
    {
      "epoch": 0.6385762889295996,
      "grad_norm": 12.545308362098348,
      "learning_rate": 2.081310554621522e-07,
      "logits/chosen": -1.3111393451690674,
      "logits/rejected": -1.335069179534912,
      "logps/chosen": -8.182366371154785,
      "logps/rejected": -8.390935897827148,
      "loss": 9.0314,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -8.182366371154785,
      "rewards/margins": 0.20856896042823792,
      "rewards/rejected": -8.390935897827148,
      "step": 305
    },
    {
      "epoch": 0.6490447526825438,
      "grad_norm": 15.575558913925711,
      "learning_rate": 1.9773810460284862e-07,
      "logits/chosen": -1.3477294445037842,
      "logits/rejected": -1.3550546169281006,
      "logps/chosen": -8.07054328918457,
      "logps/rejected": -8.061942100524902,
      "loss": 9.0612,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -8.07054328918457,
      "rewards/margins": -0.008599767461419106,
      "rewards/rejected": -8.061942100524902,
      "step": 310
    },
    {
      "epoch": 0.6595132164354881,
      "grad_norm": 20.765211421302535,
      "learning_rate": 1.874822389171314e-07,
      "logits/chosen": -1.3256926536560059,
      "logits/rejected": -1.3409112691879272,
      "logps/chosen": -7.885645389556885,
      "logps/rejected": -8.124526977539062,
      "loss": 8.8864,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -7.885645389556885,
      "rewards/margins": 0.2388812005519867,
      "rewards/rejected": -8.124526977539062,
      "step": 315
    },
    {
      "epoch": 0.6699816801884323,
      "grad_norm": 14.208218028523063,
      "learning_rate": 1.7737720670413356e-07,
      "logits/chosen": -1.344118595123291,
      "logits/rejected": -1.336096167564392,
      "logps/chosen": -8.159255981445312,
      "logps/rejected": -8.0567045211792,
      "loss": 8.9837,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -8.159255981445312,
      "rewards/margins": -0.10255154222249985,
      "rewards/rejected": -8.0567045211792,
      "step": 320
    },
    {
      "epoch": 0.6804501439413766,
      "grad_norm": 14.61227257116642,
      "learning_rate": 1.6743655406614095e-07,
      "logits/chosen": -1.340541958808899,
      "logits/rejected": -1.3474371433258057,
      "logps/chosen": -8.056330680847168,
      "logps/rejected": -8.348928451538086,
      "loss": 8.9222,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -8.056330680847168,
      "rewards/margins": 0.29259705543518066,
      "rewards/rejected": -8.348928451538086,
      "step": 325
    },
    {
      "epoch": 0.6909186076943209,
      "grad_norm": 13.778075151913542,
      "learning_rate": 1.5767360674963198e-07,
      "logits/chosen": -1.3218133449554443,
      "logits/rejected": -1.3337442874908447,
      "logps/chosen": -7.961134910583496,
      "logps/rejected": -7.996614933013916,
      "loss": 9.0247,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -7.961134910583496,
      "rewards/margins": 0.035479746758937836,
      "rewards/rejected": -7.996614933013916,
      "step": 330
    },
    {
      "epoch": 0.7013870714472651,
      "grad_norm": 13.684086792814428,
      "learning_rate": 1.4810145228170922e-07,
      "logits/chosen": -1.3398381471633911,
      "logits/rejected": -1.3437585830688477,
      "logps/chosen": -7.856637001037598,
      "logps/rejected": -8.111886978149414,
      "loss": 8.8913,
      "rewards/accuracies": 0.5687500238418579,
      "rewards/chosen": -7.856637001037598,
      "rewards/margins": 0.25525030493736267,
      "rewards/rejected": -8.111886978149414,
      "step": 335
    },
    {
      "epoch": 0.7118555352002094,
      "grad_norm": 15.39649445200101,
      "learning_rate": 1.3873292242587306e-07,
      "logits/chosen": -1.3376450538635254,
      "logits/rejected": -1.3476964235305786,
      "logps/chosen": -8.228338241577148,
      "logps/rejected": -8.340727806091309,
      "loss": 9.0269,
      "rewards/accuracies": 0.5687500238418579,
      "rewards/chosen": -8.228338241577148,
      "rewards/margins": 0.11239071190357208,
      "rewards/rejected": -8.340727806091309,
      "step": 340
    },
    {
      "epoch": 0.7223239989531536,
      "grad_norm": 15.302013253785537,
      "learning_rate": 1.295805759806533e-07,
      "logits/chosen": -1.3724461793899536,
      "logits/rejected": -1.3841075897216797,
      "logps/chosen": -8.054750442504883,
      "logps/rejected": -8.403682708740234,
      "loss": 9.0089,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -8.054750442504883,
      "rewards/margins": 0.3489326238632202,
      "rewards/rejected": -8.403682708740234,
      "step": 345
    },
    {
      "epoch": 0.7327924627060979,
      "grad_norm": 18.608453972243662,
      "learning_rate": 1.2065668194415777e-07,
      "logits/chosen": -1.3417284488677979,
      "logits/rejected": -1.3348530530929565,
      "logps/chosen": -7.915482997894287,
      "logps/rejected": -8.044729232788086,
      "loss": 8.9016,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -7.915482997894287,
      "rewards/margins": 0.12924641370773315,
      "rewards/rejected": -8.044729232788086,
      "step": 350
    },
    {
      "epoch": 0.7432609264590422,
      "grad_norm": 14.900748845819772,
      "learning_rate": 1.1197320306710923e-07,
      "logits/chosen": -1.3621351718902588,
      "logits/rejected": -1.3541442155838013,
      "logps/chosen": -8.007196426391602,
      "logps/rejected": -7.965734004974365,
      "loss": 8.9062,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": -8.007196426391602,
      "rewards/margins": -0.04146287590265274,
      "rewards/rejected": -7.965734004974365,
      "step": 355
    },
    {
      "epoch": 0.7537293902119864,
      "grad_norm": 11.569520650790327,
      "learning_rate": 1.035417798164145e-07,
      "logits/chosen": -1.3260619640350342,
      "logits/rejected": -1.3356263637542725,
      "logps/chosen": -7.753990173339844,
      "logps/rejected": -8.039525985717773,
      "loss": 8.8536,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": -7.753990173339844,
      "rewards/margins": 0.2855362296104431,
      "rewards/rejected": -8.039525985717773,
      "step": 360
    },
    {
      "epoch": 0.7641978539649307,
      "grad_norm": 13.480030507608214,
      "learning_rate": 9.537371477076535e-08,
      "logits/chosen": -1.2944828271865845,
      "logits/rejected": -1.2956254482269287,
      "logps/chosen": -7.926826477050781,
      "logps/rejected": -7.9895477294921875,
      "loss": 8.9487,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -7.926826477050781,
      "rewards/margins": 0.06272158026695251,
      "rewards/rejected": -7.9895477294921875,
      "step": 365
    },
    {
      "epoch": 0.7746663177178749,
      "grad_norm": 14.634365970472302,
      "learning_rate": 8.747995746918898e-08,
      "logits/chosen": -1.3467233180999756,
      "logits/rejected": -1.3351846933364868,
      "logps/chosen": -8.043527603149414,
      "logps/rejected": -8.186015129089355,
      "loss": 8.9627,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -8.043527603149414,
      "rewards/margins": 0.14248715341091156,
      "rewards/rejected": -8.186015129089355,
      "step": 370
    },
    {
      "epoch": 0.7851347814708192,
      "grad_norm": 28.38170473677795,
      "learning_rate": 7.987108973285888e-08,
      "logits/chosen": -1.3258306980133057,
      "logits/rejected": -1.3155487775802612,
      "logps/chosen": -8.005027770996094,
      "logps/rejected": -8.246636390686035,
      "loss": 8.9413,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -8.005027770996094,
      "rewards/margins": 0.24160809814929962,
      "rewards/rejected": -8.246636390686035,
      "step": 375
    },
    {
      "epoch": 0.7956032452237635,
      "grad_norm": 13.711915418794124,
      "learning_rate": 7.255731147984174e-08,
      "logits/chosen": -1.3438084125518799,
      "logits/rejected": -1.297163963317871,
      "logps/chosen": -8.208559036254883,
      "logps/rejected": -8.363499641418457,
      "loss": 8.942,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -8.208559036254883,
      "rewards/margins": 0.15493938326835632,
      "rewards/rejected": -8.363499641418457,
      "step": 380
    },
    {
      "epoch": 0.8060717089767077,
      "grad_norm": 13.642711731891415,
      "learning_rate": 6.554842705179898e-08,
      "logits/chosen": -1.3352845907211304,
      "logits/rejected": -1.3314430713653564,
      "logps/chosen": -8.112469673156738,
      "logps/rejected": -8.209820747375488,
      "loss": 8.9588,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -8.112469673156738,
      "rewards/margins": 0.09735036641359329,
      "rewards/rejected": -8.209820747375488,
      "step": 385
    },
    {
      "epoch": 0.816540172729652,
      "grad_norm": 14.269345053816819,
      "learning_rate": 5.885383207096832e-08,
      "logits/chosen": -1.3467012643814087,
      "logits/rejected": -1.3490493297576904,
      "logps/chosen": -7.833376884460449,
      "logps/rejected": -8.030352592468262,
      "loss": 8.8689,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -7.833376884460449,
      "rewards/margins": 0.1969761848449707,
      "rewards/rejected": -8.030352592468262,
      "step": 390
    },
    {
      "epoch": 0.8270086364825961,
      "grad_norm": 16.14618551872646,
      "learning_rate": 5.2482500845047165e-08,
      "logits/chosen": -1.3177175521850586,
      "logits/rejected": -1.3296372890472412,
      "logps/chosen": -7.635066032409668,
      "logps/rejected": -7.791895866394043,
      "loss": 8.9076,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -7.635066032409668,
      "rewards/margins": 0.15682990849018097,
      "rewards/rejected": -7.791895866394043,
      "step": 395
    },
    {
      "epoch": 0.8374771002355405,
      "grad_norm": 12.278193076130206,
      "learning_rate": 4.644297433686162e-08,
      "logits/chosen": -1.3246910572052002,
      "logits/rejected": -1.315019965171814,
      "logps/chosen": -7.837827205657959,
      "logps/rejected": -7.908313751220703,
      "loss": 8.951,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -7.837827205657959,
      "rewards/margins": 0.0704866498708725,
      "rewards/rejected": -7.908313751220703,
      "step": 400
    },
    {
      "epoch": 0.8479455639884846,
      "grad_norm": 12.786235556241849,
      "learning_rate": 4.074334871494558e-08,
      "logits/chosen": -1.3545995950698853,
      "logits/rejected": -1.3624496459960938,
      "logps/chosen": -8.024687767028809,
      "logps/rejected": -8.172109603881836,
      "loss": 8.9198,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -8.024687767028809,
      "rewards/margins": 0.1474229097366333,
      "rewards/rejected": -8.172109603881836,
      "step": 405
    },
    {
      "epoch": 0.8584140277414289,
      "grad_norm": 30.758577870183032,
      "learning_rate": 3.5391264500382e-08,
      "logits/chosen": -1.360478401184082,
      "logits/rejected": -1.3552910089492798,
      "logps/chosen": -7.844922065734863,
      "logps/rejected": -7.852625846862793,
      "loss": 8.8997,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -7.844922065734863,
      "rewards/margins": 0.0077047706581652164,
      "rewards/rejected": -7.852625846862793,
      "step": 410
    },
    {
      "epoch": 0.8688824914943732,
      "grad_norm": 14.260374307768236,
      "learning_rate": 3.0393896324452226e-08,
      "logits/chosen": -1.372036337852478,
      "logits/rejected": -1.3762390613555908,
      "logps/chosen": -7.982748508453369,
      "logps/rejected": -8.225188255310059,
      "loss": 8.9748,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -7.982748508453369,
      "rewards/margins": 0.24244041740894318,
      "rewards/rejected": -8.225188255310059,
      "step": 415
    },
    {
      "epoch": 0.8793509552473174,
      "grad_norm": 14.09879602927017,
      "learning_rate": 2.5757943310825026e-08,
      "logits/chosen": -1.3225996494293213,
      "logits/rejected": -1.3161330223083496,
      "logps/chosen": -7.865872859954834,
      "logps/rejected": -7.931491851806641,
      "loss": 8.9372,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -7.865872859954834,
      "rewards/margins": 0.06561894714832306,
      "rewards/rejected": -7.931491851806641,
      "step": 420
    },
    {
      "epoch": 0.8898194190002617,
      "grad_norm": 13.351401390808332,
      "learning_rate": 2.148962009517823e-08,
      "logits/chosen": -1.342071771621704,
      "logits/rejected": -1.337024450302124,
      "logps/chosen": -8.03447151184082,
      "logps/rejected": -8.085325241088867,
      "loss": 8.9767,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -8.03447151184082,
      "rewards/margins": 0.050852321088314056,
      "rewards/rejected": -8.085325241088867,
      "step": 425
    },
    {
      "epoch": 0.9002878827532059,
      "grad_norm": 16.02345328859732,
      "learning_rate": 1.759464849429082e-08,
      "logits/chosen": -1.3405394554138184,
      "logits/rejected": -1.3419816493988037,
      "logps/chosen": -7.878898620605469,
      "logps/rejected": -8.002215385437012,
      "loss": 8.9292,
      "rewards/accuracies": 0.518750011920929,
      "rewards/chosen": -7.878898620605469,
      "rewards/margins": 0.123316690325737,
      "rewards/rejected": -8.002215385437012,
      "step": 430
    },
    {
      "epoch": 0.9107563465061502,
      "grad_norm": 13.971661978504134,
      "learning_rate": 1.4078249835774169e-08,
      "logits/chosen": -1.3646373748779297,
      "logits/rejected": -1.3699538707733154,
      "logps/chosen": -7.937603950500488,
      "logps/rejected": -8.069661140441895,
      "loss": 8.8372,
      "rewards/accuracies": 0.4937500059604645,
      "rewards/chosen": -7.937603950500488,
      "rewards/margins": 0.13205692172050476,
      "rewards/rejected": -8.069661140441895,
      "step": 435
    },
    {
      "epoch": 0.9212248102590945,
      "grad_norm": 13.636923891581842,
      "learning_rate": 1.0945137958723705e-08,
      "logits/chosen": -1.3303980827331543,
      "logits/rejected": -1.3274564743041992,
      "logps/chosen": -8.00455379486084,
      "logps/rejected": -8.096671104431152,
      "loss": 8.9997,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -8.00455379486084,
      "rewards/margins": 0.09211695194244385,
      "rewards/rejected": -8.096671104431152,
      "step": 440
    },
    {
      "epoch": 0.9316932740120387,
      "grad_norm": 14.66331138432002,
      "learning_rate": 8.19951289467482e-09,
      "logits/chosen": -1.3527616262435913,
      "logits/rejected": -1.352975606918335,
      "logps/chosen": -7.898123741149902,
      "logps/rejected": -8.020647048950195,
      "loss": 8.9114,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -7.898123741149902,
      "rewards/margins": 0.1225227564573288,
      "rewards/rejected": -8.020647048950195,
      "step": 445
    },
    {
      "epoch": 0.942161737764983,
      "grad_norm": 34.64920022108061,
      "learning_rate": 5.84505523733293e-09,
      "logits/chosen": -1.3027703762054443,
      "logits/rejected": -1.2922091484069824,
      "logps/chosen": -8.017878532409668,
      "logps/rejected": -8.019991874694824,
      "loss": 9.0038,
      "rewards/accuracies": 0.46875,
      "rewards/chosen": -8.017878532409668,
      "rewards/margins": 0.0021121830213814974,
      "rewards/rejected": -8.019991874694824,
      "step": 450
    },
    {
      "epoch": 0.9526302015179272,
      "grad_norm": 14.096689301269398,
      "learning_rate": 3.8849212086261466e-09,
      "logits/chosen": -1.3568954467773438,
      "logits/rejected": -1.345536231994629,
      "logps/chosen": -7.817251682281494,
      "logps/rejected": -8.18480110168457,
      "loss": 8.9022,
      "rewards/accuracies": 0.6187499761581421,
      "rewards/chosen": -7.817251682281494,
      "rewards/margins": 0.3675496578216553,
      "rewards/rejected": -8.18480110168457,
      "step": 455
    },
    {
      "epoch": 0.9630986652708715,
      "grad_norm": 21.948748802651522,
      "learning_rate": 2.3217384276938756e-09,
      "logits/chosen": -1.3387937545776367,
      "logits/rejected": -1.349258542060852,
      "logps/chosen": -7.9868292808532715,
      "logps/rejected": -8.197335243225098,
      "loss": 8.8854,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -7.9868292808532715,
      "rewards/margins": 0.21050508320331573,
      "rewards/rejected": -8.197335243225098,
      "step": 460
    },
    {
      "epoch": 0.9735671290238157,
      "grad_norm": 12.715751305789052,
      "learning_rate": 1.1576023884836472e-09,
      "logits/chosen": -1.3674533367156982,
      "logits/rejected": -1.3665874004364014,
      "logps/chosen": -8.10934066772461,
      "logps/rejected": -8.27099323272705,
      "loss": 8.9853,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -8.10934066772461,
      "rewards/margins": 0.16165266931056976,
      "rewards/rejected": -8.27099323272705,
      "step": 465
    },
    {
      "epoch": 0.98403559277676,
      "grad_norm": 13.20358280327505,
      "learning_rate": 3.940736506780395e-10,
      "logits/chosen": -1.348550796508789,
      "logits/rejected": -1.3657060861587524,
      "logps/chosen": -7.707891941070557,
      "logps/rejected": -7.990015983581543,
      "loss": 8.9804,
      "rewards/accuracies": 0.543749988079071,
      "rewards/chosen": -7.707891941070557,
      "rewards/margins": 0.2821243703365326,
      "rewards/rejected": -7.990015983581543,
      "step": 470
    },
    {
      "epoch": 0.9945040565297043,
      "grad_norm": 17.668181816444864,
      "learning_rate": 3.2175747716822744e-11,
      "logits/chosen": -1.3433798551559448,
      "logits/rejected": -1.3304665088653564,
      "logps/chosen": -8.101046562194824,
      "logps/rejected": -8.15410041809082,
      "loss": 8.9813,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -8.101046562194824,
      "rewards/margins": 0.05305204540491104,
      "rewards/rejected": -8.15410041809082,
      "step": 475
    },
    {
      "epoch": 0.998691442030882,
      "step": 477,
      "total_flos": 0.0,
      "train_loss": 8.967987340451286,
      "train_runtime": 8184.2286,
      "train_samples_per_second": 7.47,
      "train_steps_per_second": 0.058
    }
  ],
  "logging_steps": 5,
  "max_steps": 477,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000000,
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}