diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5294 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.3333333333333334e-08, + "logits/chosen": -1.381319522857666, + "logits/rejected": -0.9757366180419922, + "logps/chosen": -223.25863647460938, + "logps/rejected": -830.5400390625, + "loss": 0.2593, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.3333333333333336e-07, + "logits/chosen": -1.736572504043579, + "logits/rejected": -1.0549728870391846, + "logps/chosen": -406.9079284667969, + "logps/rejected": -761.596435546875, + "loss": 0.1822, + "rewards/accuracies": 0.5555555820465088, + "rewards/chosen": 0.00039627417572773993, + "rewards/margins": 0.000484730233438313, + "rewards/rejected": -8.845605771057308e-05, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.666666666666667e-07, + "logits/chosen": -1.6399459838867188, + "logits/rejected": -1.0379071235656738, + "logps/chosen": -483.6226501464844, + "logps/rejected": -819.0009765625, + "loss": 0.1801, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 3.848170308629051e-05, + "rewards/margins": 0.00036858199746347964, + "rewards/rejected": -0.00033010030165314674, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -1.7753417491912842, + "logits/rejected": -1.3355859518051147, + "logps/chosen": -443.94390869140625, + "logps/rejected": -788.3363647460938, + "loss": 0.2323, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0017691084649413824, + "rewards/margins": 0.0024432786740362644, + "rewards/rejected": -0.0006741699180565774, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.333333333333335e-07, + "logits/chosen": -1.5635123252868652, + "logits/rejected": -0.9124569892883301, + "logps/chosen": -458.33428955078125, + "logps/rejected": -747.6420288085938, + "loss": 0.2195, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004033350385725498, + "rewards/margins": 0.006722611375153065, + "rewards/rejected": -0.0026892595924437046, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -1.631588339805603, + "logits/rejected": -0.8681947588920593, + "logps/chosen": -465.05731201171875, + "logps/rejected": -838.4075927734375, + "loss": 0.2014, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.008273603394627571, + "rewards/margins": 0.015597726218402386, + "rewards/rejected": -0.0073241242207586765, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -1.4628058671951294, + "logits/rejected": -1.2347371578216553, + "logps/chosen": -343.9599304199219, + "logps/rejected": -739.0056762695312, + "loss": 0.1761, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0036266068927943707, + "rewards/margins": 0.022512439638376236, + "rewards/rejected": -0.01888582855463028, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.333333333333334e-07, + "logits/chosen": -1.8094412088394165, + "logits/rejected": -0.9877569079399109, + "logps/chosen": -497.0489807128906, + "logps/rejected": -864.1619262695312, + "loss": 0.1809, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003260440658777952, + "rewards/margins": 0.046338800340890884, + "rewards/rejected": -0.04307835176587105, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.066666666666667e-06, + "logits/chosen": -1.6897491216659546, + "logits/rejected": -1.0848586559295654, + "logps/chosen": -560.68017578125, + "logps/rejected": -1089.6458740234375, + "loss": 0.1443, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.006865059025585651, + "rewards/margins": 0.08565281331539154, + "rewards/rejected": -0.09251787513494492, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -1.7690013647079468, + "logits/rejected": -0.9375957250595093, + "logps/chosen": -427.4967346191406, + "logps/rejected": -953.2610473632812, + "loss": 0.1582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.010322836227715015, + "rewards/margins": 0.07507754862308502, + "rewards/rejected": -0.08540038764476776, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -1.4934628009796143, + "logits/rejected": -0.9881563186645508, + "logps/chosen": -397.26727294921875, + "logps/rejected": -905.0123901367188, + "loss": 0.1339, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.01196499913930893, + "rewards/margins": 0.11481380462646484, + "rewards/rejected": -0.12677881121635437, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.4666666666666669e-06, + "logits/chosen": -1.559560775756836, + "logits/rejected": -0.9702051877975464, + "logps/chosen": -446.76849365234375, + "logps/rejected": -964.1668090820312, + "loss": 0.1009, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.029861677438020706, + "rewards/margins": 0.14626939594745636, + "rewards/rejected": -0.17613105475902557, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -1.7105035781860352, + "logits/rejected": -0.9925721287727356, + "logps/chosen": -542.6316528320312, + "logps/rejected": -977.3997192382812, + "loss": 0.1034, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06414582580327988, + "rewards/margins": 0.1390438973903656, + "rewards/rejected": -0.2031897008419037, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.7333333333333336e-06, + "logits/chosen": -1.7129449844360352, + "logits/rejected": -0.9808734655380249, + "logps/chosen": -639.7268676757812, + "logps/rejected": -1264.408203125, + "loss": 0.0778, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1564100980758667, + "rewards/margins": 0.22008244693279266, + "rewards/rejected": -0.37649255990982056, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 1.8666666666666669e-06, + "logits/chosen": -1.4957599639892578, + "logits/rejected": -0.9900957345962524, + "logps/chosen": -606.5774536132812, + "logps/rejected": -1158.996826171875, + "loss": 0.1186, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1333167403936386, + "rewards/margins": 0.18605293333530426, + "rewards/rejected": -0.31936967372894287, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.7749484777450562, + "logits/rejected": -1.1498210430145264, + "logps/chosen": -588.8472900390625, + "logps/rejected": -1247.8353271484375, + "loss": 0.0621, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15223875641822815, + "rewards/margins": 0.2504242956638336, + "rewards/rejected": -0.40266305208206177, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -1.4668447971343994, + "logits/rejected": -0.9629266858100891, + "logps/chosen": -740.5608520507812, + "logps/rejected": -1320.8753662109375, + "loss": 0.074, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22708892822265625, + "rewards/margins": 0.23111894726753235, + "rewards/rejected": -0.458207905292511, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 2.266666666666667e-06, + "logits/chosen": -1.5278871059417725, + "logits/rejected": -1.1116211414337158, + "logps/chosen": -571.50390625, + "logps/rejected": -1168.722412109375, + "loss": 0.1131, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1509171426296234, + "rewards/margins": 0.27128082513809204, + "rewards/rejected": -0.4221979081630707, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -1.6129930019378662, + "logits/rejected": -1.0707186460494995, + "logps/chosen": -591.6637573242188, + "logps/rejected": -1284.7354736328125, + "loss": 0.0784, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12449514865875244, + "rewards/margins": 0.2544993758201599, + "rewards/rejected": -0.37899452447891235, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 2.5333333333333338e-06, + "logits/chosen": -1.6850178241729736, + "logits/rejected": -1.1943457126617432, + "logps/chosen": -514.7299194335938, + "logps/rejected": -1000.5671997070312, + "loss": 0.1249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09646569192409515, + "rewards/margins": 0.17868806421756744, + "rewards/rejected": -0.2751538157463074, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -1.5830456018447876, + "logits/rejected": -1.097068428993225, + "logps/chosen": -658.3897705078125, + "logps/rejected": -1211.879150390625, + "loss": 0.0931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16921699047088623, + "rewards/margins": 0.20347478985786438, + "rewards/rejected": -0.3726917505264282, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -1.765363097190857, + "logits/rejected": -0.8959721326828003, + "logps/chosen": -716.1063842773438, + "logps/rejected": -1217.0675048828125, + "loss": 0.1018, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21076449751853943, + "rewards/margins": 0.23035843670368195, + "rewards/rejected": -0.4411229193210602, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 2.9333333333333338e-06, + "logits/chosen": -1.4971026182174683, + "logits/rejected": -1.0308849811553955, + "logps/chosen": -635.874267578125, + "logps/rejected": -1254.032470703125, + "loss": 0.0944, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1963960826396942, + "rewards/margins": 0.2612842321395874, + "rewards/rejected": -0.4576803147792816, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 3.066666666666667e-06, + "logits/chosen": -1.3114674091339111, + "logits/rejected": -1.1226143836975098, + "logps/chosen": -669.163818359375, + "logps/rejected": -1447.716064453125, + "loss": 0.083, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2657012939453125, + "rewards/margins": 0.3253920078277588, + "rewards/rejected": -0.5910933613777161, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -1.5960338115692139, + "logits/rejected": -0.8448678255081177, + "logps/chosen": -783.5925903320312, + "logps/rejected": -1367.287841796875, + "loss": 0.0798, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2770916819572449, + "rewards/margins": 0.2540797293186188, + "rewards/rejected": -0.531171441078186, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.6775119304656982, + "logits/rejected": -1.2753263711929321, + "logps/chosen": -694.5948486328125, + "logps/rejected": -1378.860107421875, + "loss": 0.0801, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2328498661518097, + "rewards/margins": 0.26127415895462036, + "rewards/rejected": -0.49412399530410767, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 3.4666666666666672e-06, + "logits/chosen": -1.4416381120681763, + "logits/rejected": -0.9755349159240723, + "logps/chosen": -681.4529418945312, + "logps/rejected": -1261.5875244140625, + "loss": 0.1169, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18144458532333374, + "rewards/margins": 0.2220906764268875, + "rewards/rejected": -0.40353527665138245, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -1.776125192642212, + "logits/rejected": -1.1443500518798828, + "logps/chosen": -665.0440673828125, + "logps/rejected": -1173.3468017578125, + "loss": 0.1028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19433431327342987, + "rewards/margins": 0.21727688610553741, + "rewards/rejected": -0.4116111695766449, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 3.7333333333333337e-06, + "logits/chosen": -1.6408843994140625, + "logits/rejected": -1.2362545728683472, + "logps/chosen": -652.2579956054688, + "logps/rejected": -1271.698974609375, + "loss": 0.0917, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1795671433210373, + "rewards/margins": 0.24397559463977814, + "rewards/rejected": -0.4235427975654602, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 3.866666666666667e-06, + "logits/chosen": -1.8528718948364258, + "logits/rejected": -1.1004583835601807, + "logps/chosen": -762.6512451171875, + "logps/rejected": -1343.5460205078125, + "loss": 0.0868, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.226405531167984, + "rewards/margins": 0.2510288953781128, + "rewards/rejected": -0.4774344861507416, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.7013801336288452, + "logits/rejected": -1.2125957012176514, + "logps/chosen": -613.29345703125, + "logps/rejected": -1406.8970947265625, + "loss": 0.0761, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17005790770053864, + "rewards/margins": 0.32780343294143677, + "rewards/rejected": -0.4978613257408142, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 4.133333333333333e-06, + "logits/chosen": -1.415290117263794, + "logits/rejected": -0.9908515810966492, + "logps/chosen": -712.7332763671875, + "logps/rejected": -1258.039306640625, + "loss": 0.1258, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25694146752357483, + "rewards/margins": 0.21659043431282043, + "rewards/rejected": -0.4735318720340729, + "step": 310 + }, + { + "epoch": 0.09, + "learning_rate": 4.266666666666668e-06, + "logits/chosen": -1.5041309595108032, + "logits/rejected": -1.0038108825683594, + "logps/chosen": -661.9385986328125, + "logps/rejected": -1160.186767578125, + "loss": 0.0992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21888642013072968, + "rewards/margins": 0.21484248340129852, + "rewards/rejected": -0.4337288737297058, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 4.4e-06, + "logits/chosen": -1.6438214778900146, + "logits/rejected": -1.0989625453948975, + "logps/chosen": -537.155517578125, + "logps/rejected": -1078.1251220703125, + "loss": 0.0751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11467760801315308, + "rewards/margins": 0.25429314374923706, + "rewards/rejected": -0.36897072196006775, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 4.533333333333334e-06, + "logits/chosen": -1.7438217401504517, + "logits/rejected": -1.0444936752319336, + "logps/chosen": -644.1298828125, + "logps/rejected": -1348.093505859375, + "loss": 0.0711, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19465377926826477, + "rewards/margins": 0.32639193534851074, + "rewards/rejected": -0.5210457444190979, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -1.7205537557601929, + "logits/rejected": -1.1176466941833496, + "logps/chosen": -653.7063598632812, + "logps/rejected": -1317.8218994140625, + "loss": 0.0812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15995605289936066, + "rewards/margins": 0.2710942327976227, + "rewards/rejected": -0.43105024099349976, + "step": 350 + }, + { + "epoch": 0.1, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -1.8885447978973389, + "logits/rejected": -1.4283367395401, + "logps/chosen": -435.2276916503906, + "logps/rejected": -1041.008056640625, + "loss": 0.0918, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05559268593788147, + "rewards/margins": 0.2262849360704422, + "rewards/rejected": -0.2818776071071625, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 4.933333333333334e-06, + "logits/chosen": -1.7745654582977295, + "logits/rejected": -1.2009865045547485, + "logps/chosen": -606.5958251953125, + "logps/rejected": -1145.1015625, + "loss": 0.0968, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11594484001398087, + "rewards/margins": 0.24505428969860077, + "rewards/rejected": -0.36099910736083984, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 4.999972922944898e-06, + "logits/chosen": -1.6557433605194092, + "logits/rejected": -1.1534380912780762, + "logps/chosen": -643.7879028320312, + "logps/rejected": -1236.194091796875, + "loss": 0.091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16532504558563232, + "rewards/margins": 0.24116845428943634, + "rewards/rejected": -0.40649348497390747, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 4.999756310023261e-06, + "logits/chosen": -1.6974895000457764, + "logits/rejected": -1.2435563802719116, + "logps/chosen": -619.0020751953125, + "logps/rejected": -1253.426513671875, + "loss": 0.0496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11514924466609955, + "rewards/margins": 0.26920756697654724, + "rewards/rejected": -0.3843567967414856, + "step": 390 + }, + { + "epoch": 0.11, + "learning_rate": 4.999323102948655e-06, + "logits/chosen": -1.6725631952285767, + "logits/rejected": -0.9952858686447144, + "logps/chosen": -683.99072265625, + "logps/rejected": -1263.6319580078125, + "loss": 0.0985, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22003349661827087, + "rewards/margins": 0.21788537502288818, + "rewards/rejected": -0.4379189610481262, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 4.998673339256785e-06, + "logits/chosen": -1.6918662786483765, + "logits/rejected": -0.9807602167129517, + "logps/chosen": -646.4641723632812, + "logps/rejected": -1105.5599365234375, + "loss": 0.1276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13202176988124847, + "rewards/margins": 0.22680577635765076, + "rewards/rejected": -0.35882753133773804, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 4.997807075247147e-06, + "logits/chosen": -1.4633402824401855, + "logits/rejected": -0.8964066505432129, + "logps/chosen": -623.4297485351562, + "logps/rejected": -1171.818359375, + "loss": 0.0798, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17750394344329834, + "rewards/margins": 0.23941746354103088, + "rewards/rejected": -0.4169214367866516, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 4.996724385978142e-06, + "logits/chosen": -1.7313741445541382, + "logits/rejected": -1.088205099105835, + "logps/chosen": -618.5508422851562, + "logps/rejected": -1335.5269775390625, + "loss": 0.0565, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15107488632202148, + "rewards/margins": 0.30174392461776733, + "rewards/rejected": -0.4528188109397888, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 4.995425365260585e-06, + "logits/chosen": -1.6465423107147217, + "logits/rejected": -1.1094882488250732, + "logps/chosen": -621.9539794921875, + "logps/rejected": -1221.4598388671875, + "loss": 0.0909, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15349408984184265, + "rewards/margins": 0.25984710454940796, + "rewards/rejected": -0.4133411943912506, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -1.8044379949569702, + "logits/rejected": -1.1311860084533691, + "logps/chosen": -721.5858764648438, + "logps/rejected": -1256.863037109375, + "loss": 0.0915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21946442127227783, + "rewards/margins": 0.22845225036144257, + "rewards/rejected": -0.4479166567325592, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 4.992178798434684e-06, + "logits/chosen": -1.76088547706604, + "logits/rejected": -1.2385786771774292, + "logps/chosen": -657.9778442382812, + "logps/rejected": -1414.336669921875, + "loss": 0.0575, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16792455315589905, + "rewards/margins": 0.3237282633781433, + "rewards/rejected": -0.49165281653404236, + "step": 460 + }, + { + "epoch": 0.13, + "learning_rate": 4.990231533628719e-06, + "logits/chosen": -1.5809530019760132, + "logits/rejected": -1.1684823036193848, + "logps/chosen": -623.2067260742188, + "logps/rejected": -1329.0098876953125, + "loss": 0.067, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15883824229240417, + "rewards/margins": 0.3003080189228058, + "rewards/rejected": -0.45914632081985474, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 4.988068499954578e-06, + "logits/chosen": -1.5603920221328735, + "logits/rejected": -1.0103719234466553, + "logps/chosen": -745.253173828125, + "logps/rejected": -1386.312744140625, + "loss": 0.0639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2308214157819748, + "rewards/margins": 0.3027498126029968, + "rewards/rejected": -0.5335712432861328, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 4.985689884830711e-06, + "logits/chosen": -1.7204630374908447, + "logits/rejected": -1.0981186628341675, + "logps/chosen": -663.6007080078125, + "logps/rejected": -1271.954833984375, + "loss": 0.0546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23924314975738525, + "rewards/margins": 0.3018389344215393, + "rewards/rejected": -0.5410820841789246, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -1.6816179752349854, + "logits/rejected": -1.2458436489105225, + "logps/chosen": -812.2794189453125, + "logps/rejected": -1452.4508056640625, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3606022000312805, + "rewards/margins": 0.2518552541732788, + "rewards/rejected": -0.6124575138092041, + "step": 500 + }, + { + "epoch": 0.14, + "learning_rate": 4.980286753286196e-06, + "logits/chosen": -1.675920844078064, + "logits/rejected": -1.2157505750656128, + "logps/chosen": -622.9227294921875, + "logps/rejected": -1268.478271484375, + "loss": 0.0997, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.23038606345653534, + "rewards/margins": 0.280417799949646, + "rewards/rejected": -0.5108038783073425, + "step": 510 + }, + { + "epoch": 0.14, + "learning_rate": 4.97726270502586e-06, + "logits/chosen": -1.6523020267486572, + "logits/rejected": -1.1194841861724854, + "logps/chosen": -536.9530029296875, + "logps/rejected": -1284.30517578125, + "loss": 0.0562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15182599425315857, + "rewards/margins": 0.308131605386734, + "rewards/rejected": -0.4599575400352478, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 4.974024011595864e-06, + "logits/chosen": -1.5846920013427734, + "logits/rejected": -1.2534643411636353, + "logps/chosen": -688.9484252929688, + "logps/rejected": -1183.958984375, + "loss": 0.1024, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21223464608192444, + "rewards/margins": 0.19630616903305054, + "rewards/rejected": -0.408540815114975, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 4.970570953616383e-06, + "logits/chosen": -1.7657943964004517, + "logits/rejected": -1.2593281269073486, + "logps/chosen": -591.6409912109375, + "logps/rejected": -1163.0576171875, + "loss": 0.1017, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17924083769321442, + "rewards/margins": 0.2396513670682907, + "rewards/rejected": -0.4188922345638275, + "step": 540 + }, + { + "epoch": 0.15, + "learning_rate": 4.966903830281449e-06, + "logits/chosen": -1.7614797353744507, + "logits/rejected": -1.300492286682129, + "logps/chosen": -588.6466064453125, + "logps/rejected": -1244.7601318359375, + "loss": 0.0925, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16072975099086761, + "rewards/margins": 0.29418593645095825, + "rewards/rejected": -0.4549156725406647, + "step": 550 + }, + { + "epoch": 0.15, + "learning_rate": 4.9630229593330226e-06, + "logits/chosen": -1.5666195154190063, + "logits/rejected": -0.924557089805603, + "logps/chosen": -734.8566284179688, + "logps/rejected": -1351.8369140625, + "loss": 0.0874, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.236628919839859, + "rewards/margins": 0.24980910122394562, + "rewards/rejected": -0.48643797636032104, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 4.958928677033465e-06, + "logits/chosen": -1.5746439695358276, + "logits/rejected": -1.0514501333236694, + "logps/chosen": -673.4780883789062, + "logps/rejected": -1367.333740234375, + "loss": 0.0705, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.23988430202007294, + "rewards/margins": 0.26904696226119995, + "rewards/rejected": -0.5089312791824341, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 4.954621338136399e-06, + "logits/chosen": -1.5549921989440918, + "logits/rejected": -0.825292706489563, + "logps/chosen": -724.3428955078125, + "logps/rejected": -1314.2396240234375, + "loss": 0.1148, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2795710563659668, + "rewards/margins": 0.24751707911491394, + "rewards/rejected": -0.5270881652832031, + "step": 580 + }, + { + "epoch": 0.16, + "learning_rate": 4.95010131585597e-06, + "logits/chosen": -1.4858559370040894, + "logits/rejected": -1.164233922958374, + "logps/chosen": -720.483642578125, + "logps/rejected": -1470.8189697265625, + "loss": 0.0644, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.28550735116004944, + "rewards/margins": 0.27395910024642944, + "rewards/rejected": -0.5594664812088013, + "step": 590 + }, + { + "epoch": 0.16, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -1.7503721714019775, + "logits/rejected": -1.0189541578292847, + "logps/chosen": -725.6884765625, + "logps/rejected": -1330.4554443359375, + "loss": 0.0731, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19357889890670776, + "rewards/margins": 0.26617223024368286, + "rewards/rejected": -0.4597511887550354, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 4.940424806108619e-06, + "logits/chosen": -1.7605764865875244, + "logits/rejected": -0.9754387140274048, + "logps/chosen": -736.2041625976562, + "logps/rejected": -1220.272216796875, + "loss": 0.0966, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16418889164924622, + "rewards/margins": 0.2352021038532257, + "rewards/rejected": -0.3993909955024719, + "step": 610 + }, + { + "epoch": 0.17, + "learning_rate": 4.935269157073597e-06, + "logits/chosen": -1.6696975231170654, + "logits/rejected": -1.1732470989227295, + "logps/chosen": -546.5994262695312, + "logps/rejected": -1076.2138671875, + "loss": 0.1142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17854368686676025, + "rewards/margins": 0.22630243003368378, + "rewards/rejected": -0.40484610199928284, + "step": 620 + }, + { + "epoch": 0.17, + "learning_rate": 4.9299025014463665e-06, + "logits/chosen": -1.6130393743515015, + "logits/rejected": -0.9944950342178345, + "logps/chosen": -607.0363159179688, + "logps/rejected": -1301.4781494140625, + "loss": 0.0704, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19457684457302094, + "rewards/margins": 0.2787570357322693, + "rewards/rejected": -0.47333383560180664, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.924325304226745e-06, + "logits/chosen": -1.6456743478775024, + "logits/rejected": -1.2997629642486572, + "logps/chosen": -683.3946533203125, + "logps/rejected": -1356.107177734375, + "loss": 0.0818, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2882430851459503, + "rewards/margins": 0.2655082941055298, + "rewards/rejected": -0.5537513494491577, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.91853804865716e-06, + "logits/chosen": -1.406205654144287, + "logits/rejected": -1.0480941534042358, + "logps/chosen": -837.6101684570312, + "logps/rejected": -1426.9271240234375, + "loss": 0.0759, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3547836244106293, + "rewards/margins": 0.24995502829551697, + "rewards/rejected": -0.6047386527061462, + "step": 650 + }, + { + "epoch": 0.18, + "learning_rate": 4.912541236180779e-06, + "logits/chosen": -1.6352602243423462, + "logits/rejected": -1.0264801979064941, + "logps/chosen": -687.4603271484375, + "logps/rejected": -1298.3892822265625, + "loss": 0.0835, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.25935202836990356, + "rewards/margins": 0.28063350915908813, + "rewards/rejected": -0.5399855375289917, + "step": 660 + }, + { + "epoch": 0.18, + "learning_rate": 4.9063353863980565e-06, + "logits/chosen": -1.652361512184143, + "logits/rejected": -1.3324909210205078, + "logps/chosen": -703.0999755859375, + "logps/rejected": -1310.7120361328125, + "loss": 0.0822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2579403519630432, + "rewards/margins": 0.26742976903915405, + "rewards/rejected": -0.525370180606842, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.899921037021719e-06, + "logits/chosen": -1.8630950450897217, + "logits/rejected": -1.117851734161377, + "logps/chosen": -572.1090087890625, + "logps/rejected": -1142.517822265625, + "loss": 0.0669, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13034145534038544, + "rewards/margins": 0.2702890932559967, + "rewards/rejected": -0.40063056349754333, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.893298743830168e-06, + "logits/chosen": -1.5307347774505615, + "logits/rejected": -1.1395881175994873, + "logps/chosen": -570.9049072265625, + "logps/rejected": -1325.3486328125, + "loss": 0.0776, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1899944543838501, + "rewards/margins": 0.31518790125846863, + "rewards/rejected": -0.5051823854446411, + "step": 690 + }, + { + "epoch": 0.19, + "learning_rate": 4.88646908061933e-06, + "logits/chosen": -1.6429624557495117, + "logits/rejected": -0.952468991279602, + "logps/chosen": -504.73321533203125, + "logps/rejected": -1041.1776123046875, + "loss": 0.1034, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.08391048014163971, + "rewards/margins": 0.24313923716545105, + "rewards/rejected": -0.3270496726036072, + "step": 700 + }, + { + "epoch": 0.19, + "learning_rate": 4.879432639152935e-06, + "logits/chosen": -1.8941549062728882, + "logits/rejected": -1.1734158992767334, + "logps/chosen": -530.647216796875, + "logps/rejected": -1243.784912109375, + "loss": 0.0827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03668345510959625, + "rewards/margins": 0.30672526359558105, + "rewards/rejected": -0.3434087336063385, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.8721900291112415e-06, + "logits/chosen": -1.6430208683013916, + "logits/rejected": -1.1618727445602417, + "logps/chosen": -664.0516357421875, + "logps/rejected": -1358.1705322265625, + "loss": 0.064, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1712174415588379, + "rewards/margins": 0.27873173356056213, + "rewards/rejected": -0.44994911551475525, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.864741878038218e-06, + "logits/chosen": -1.5911110639572144, + "logits/rejected": -0.9319060444831848, + "logps/chosen": -551.3786010742188, + "logps/rejected": -1062.9332275390625, + "loss": 0.0848, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13672541081905365, + "rewards/margins": 0.23177051544189453, + "rewards/rejected": -0.368495911359787, + "step": 730 + }, + { + "epoch": 0.2, + "learning_rate": 4.857088831287158e-06, + "logits/chosen": -1.3883923292160034, + "logits/rejected": -0.8421472311019897, + "logps/chosen": -622.4197387695312, + "logps/rejected": -1147.806884765625, + "loss": 0.0948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19367149472236633, + "rewards/margins": 0.24927671253681183, + "rewards/rejected": -0.44294825196266174, + "step": 740 + }, + { + "epoch": 0.2, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -1.6589374542236328, + "logits/rejected": -1.0507800579071045, + "logps/chosen": -580.9486083984375, + "logps/rejected": -1298.055908203125, + "loss": 0.078, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1833840161561966, + "rewards/margins": 0.29099351167678833, + "rewards/rejected": -0.4743775427341461, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.841170720873723e-06, + "logits/chosen": -1.9193570613861084, + "logits/rejected": -1.29689359664917, + "logps/chosen": -593.7999267578125, + "logps/rejected": -1074.827880859375, + "loss": 0.1027, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13971452414989471, + "rewards/margins": 0.2339230477809906, + "rewards/rejected": -0.3736375570297241, + "step": 760 + }, + { + "epoch": 0.21, + "learning_rate": 4.832907036453647e-06, + "logits/chosen": -1.7203441858291626, + "logits/rejected": -1.0238596200942993, + "logps/chosen": -575.1504516601562, + "logps/rejected": -1223.0185546875, + "loss": 0.0779, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1228102445602417, + "rewards/margins": 0.28532546758651733, + "rewards/rejected": -0.4081357419490814, + "step": 770 + }, + { + "epoch": 0.21, + "learning_rate": 4.824441214720629e-06, + "logits/chosen": -1.5191177129745483, + "logits/rejected": -1.0029339790344238, + "logps/chosen": -579.9059448242188, + "logps/rejected": -1316.46142578125, + "loss": 0.0519, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1901233047246933, + "rewards/margins": 0.3058861494064331, + "rewards/rejected": -0.4960094392299652, + "step": 780 + }, + { + "epoch": 0.21, + "learning_rate": 4.815773989205165e-06, + "logits/chosen": -1.5836925506591797, + "logits/rejected": -0.9203447103500366, + "logps/chosen": -776.8148803710938, + "logps/rejected": -1455.188720703125, + "loss": 0.0819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23546621203422546, + "rewards/margins": 0.2814113199710846, + "rewards/rejected": -0.5168775916099548, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.806906110888606e-06, + "logits/chosen": -1.6255607604980469, + "logits/rejected": -1.1398379802703857, + "logps/chosen": -536.3740234375, + "logps/rejected": -1251.212646484375, + "loss": 0.0639, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11835892498493195, + "rewards/margins": 0.3064490854740143, + "rewards/rejected": -0.4248080849647522, + "step": 800 + }, + { + "epoch": 0.22, + "learning_rate": 4.7978383481380865e-06, + "logits/chosen": -1.5587496757507324, + "logits/rejected": -1.0958257913589478, + "logps/chosen": -577.5291748046875, + "logps/rejected": -1253.976806640625, + "loss": 0.0724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15895147621631622, + "rewards/margins": 0.29505571722984314, + "rewards/rejected": -0.45400720834732056, + "step": 810 + }, + { + "epoch": 0.22, + "learning_rate": 4.788571486639948e-06, + "logits/chosen": -1.5678253173828125, + "logits/rejected": -1.0112650394439697, + "logps/chosen": -582.6275634765625, + "logps/rejected": -1258.8802490234375, + "loss": 0.0726, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16134771704673767, + "rewards/margins": 0.3210769593715668, + "rewards/rejected": -0.48242464661598206, + "step": 820 + }, + { + "epoch": 0.22, + "learning_rate": 4.779106329331665e-06, + "logits/chosen": -1.5958459377288818, + "logits/rejected": -1.0422935485839844, + "logps/chosen": -604.3923950195312, + "logps/rejected": -1268.2470703125, + "loss": 0.0656, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17159458994865417, + "rewards/margins": 0.29530078172683716, + "rewards/rejected": -0.4668954014778137, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.769443696332272e-06, + "logits/chosen": -1.6635258197784424, + "logits/rejected": -1.1757241487503052, + "logps/chosen": -576.9772338867188, + "logps/rejected": -1184.485595703125, + "loss": 0.0987, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10074315965175629, + "rewards/margins": 0.25036129355430603, + "rewards/rejected": -0.3511044681072235, + "step": 840 + }, + { + "epoch": 0.23, + "learning_rate": 4.759584424871302e-06, + "logits/chosen": -1.4811336994171143, + "logits/rejected": -0.9249873161315918, + "logps/chosen": -593.8721313476562, + "logps/rejected": -1355.8583984375, + "loss": 0.06, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1529252678155899, + "rewards/margins": 0.3431011736392975, + "rewards/rejected": -0.4960264265537262, + "step": 850 + }, + { + "epoch": 0.23, + "learning_rate": 4.749529369216246e-06, + "logits/chosen": -1.5341050624847412, + "logits/rejected": -0.9583051800727844, + "logps/chosen": -680.0247802734375, + "logps/rejected": -1330.1800537109375, + "loss": 0.0771, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18310308456420898, + "rewards/margins": 0.31003543734550476, + "rewards/rejected": -0.49313855171203613, + "step": 860 + }, + { + "epoch": 0.23, + "learning_rate": 4.7392794005985324e-06, + "logits/chosen": -1.3542709350585938, + "logits/rejected": -0.9462020993232727, + "logps/chosen": -521.2957763671875, + "logps/rejected": -1280.841552734375, + "loss": 0.0739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12296704202890396, + "rewards/margins": 0.3202964663505554, + "rewards/rejected": -0.44326353073120117, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.7288354071380415e-06, + "logits/chosen": -1.4632813930511475, + "logits/rejected": -1.0232326984405518, + "logps/chosen": -572.9237060546875, + "logps/rejected": -1308.82666015625, + "loss": 0.0532, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.11455889046192169, + "rewards/margins": 0.3093631863594055, + "rewards/rejected": -0.423922061920166, + "step": 880 + }, + { + "epoch": 0.24, + "learning_rate": 4.7181982937661485e-06, + "logits/chosen": -1.8567373752593994, + "logits/rejected": -0.8586881756782532, + "logps/chosen": -683.0592651367188, + "logps/rejected": -1194.775634765625, + "loss": 0.0782, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15534023940563202, + "rewards/margins": 0.24916231632232666, + "rewards/rejected": -0.4045025706291199, + "step": 890 + }, + { + "epoch": 0.24, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -1.5189939737319946, + "logits/rejected": -1.0900559425354004, + "logps/chosen": -617.9810791015625, + "logps/rejected": -1228.6695556640625, + "loss": 0.0811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16101650893688202, + "rewards/margins": 0.2594471573829651, + "rewards/rejected": -0.4204636514186859, + "step": 900 + }, + { + "epoch": 0.24, + "learning_rate": 4.696348410599244e-06, + "logits/chosen": -1.609279990196228, + "logits/rejected": -0.9329544901847839, + "logps/chosen": -649.111328125, + "logps/rejected": -1244.6878662109375, + "loss": 0.0947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1598513424396515, + "rewards/margins": 0.2521916627883911, + "rewards/rejected": -0.4120430052280426, + "step": 910 + }, + { + "epoch": 0.25, + "learning_rate": 4.685137534011549e-06, + "logits/chosen": -1.5942234992980957, + "logits/rejected": -0.9433167576789856, + "logps/chosen": -600.16796875, + "logps/rejected": -1137.0755615234375, + "loss": 0.0973, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1486530601978302, + "rewards/margins": 0.2408868372440338, + "rewards/rejected": -0.389539897441864, + "step": 920 + }, + { + "epoch": 0.25, + "learning_rate": 4.673737323763048e-06, + "logits/chosen": -1.757784128189087, + "logits/rejected": -0.9788764715194702, + "logps/chosen": -526.8590087890625, + "logps/rejected": -1164.993408203125, + "loss": 0.0518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06887698173522949, + "rewards/margins": 0.3250022530555725, + "rewards/rejected": -0.3938792049884796, + "step": 930 + }, + { + "epoch": 0.25, + "learning_rate": 4.662148767637578e-06, + "logits/chosen": -1.695051908493042, + "logits/rejected": -1.0422875881195068, + "logps/chosen": -673.8726806640625, + "logps/rejected": -1251.0634765625, + "loss": 0.0824, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1298268437385559, + "rewards/margins": 0.2638325095176697, + "rewards/rejected": -0.3936593532562256, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.650372869738415e-06, + "logits/chosen": -1.817731261253357, + "logits/rejected": -1.1714346408843994, + "logps/chosen": -632.3103637695312, + "logps/rejected": -1204.101806640625, + "loss": 0.0757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08371297270059586, + "rewards/margins": 0.2807037830352783, + "rewards/rejected": -0.36441677808761597, + "step": 950 + }, + { + "epoch": 0.26, + "learning_rate": 4.638410650401267e-06, + "logits/chosen": -1.6917314529418945, + "logits/rejected": -1.2975494861602783, + "logps/chosen": -501.3326110839844, + "logps/rejected": -1110.889404296875, + "loss": 0.1007, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.010560419410467148, + "rewards/margins": 0.2687085270881653, + "rewards/rejected": -0.27926892042160034, + "step": 960 + }, + { + "epoch": 0.26, + "learning_rate": 4.626263146105875e-06, + "logits/chosen": -1.681670904159546, + "logits/rejected": -1.0678811073303223, + "logps/chosen": -548.2903442382812, + "logps/rejected": -1203.9267578125, + "loss": 0.0685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08060415089130402, + "rewards/margins": 0.297444224357605, + "rewards/rejected": -0.3780483603477478, + "step": 970 + }, + { + "epoch": 0.26, + "learning_rate": 4.613931409386196e-06, + "logits/chosen": -1.469982385635376, + "logits/rejected": -1.1716678142547607, + "logps/chosen": -675.4676513671875, + "logps/rejected": -1349.0924072265625, + "loss": 0.087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18839700520038605, + "rewards/margins": 0.28368327021598816, + "rewards/rejected": -0.4720802903175354, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.601416508739211e-06, + "logits/chosen": -1.589691162109375, + "logits/rejected": -1.0157541036605835, + "logps/chosen": -608.4013061523438, + "logps/rejected": -1338.4168701171875, + "loss": 0.0375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15417249500751495, + "rewards/margins": 0.30883660912513733, + "rewards/rejected": -0.4630090594291687, + "step": 990 + }, + { + "epoch": 0.27, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -1.6860065460205078, + "logits/rejected": -1.0934185981750488, + "logps/chosen": -706.9453735351562, + "logps/rejected": -1332.388916015625, + "loss": 0.0972, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2328319251537323, + "rewards/margins": 0.26258260011672974, + "rewards/rejected": -0.49541449546813965, + "step": 1000 + }, + { + "epoch": 0.27, + "learning_rate": 4.575841568909494e-06, + "logits/chosen": -1.433650016784668, + "logits/rejected": -1.1477049589157104, + "logps/chosen": -688.2756958007812, + "logps/rejected": -1245.33447265625, + "loss": 0.0894, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22389158606529236, + "rewards/margins": 0.24267525970935822, + "rewards/rejected": -0.46656686067581177, + "step": 1010 + }, + { + "epoch": 0.27, + "learning_rate": 4.562783745695738e-06, + "logits/chosen": -1.5687224864959717, + "logits/rejected": -0.853603720664978, + "logps/chosen": -791.3260498046875, + "logps/rejected": -1367.469482421875, + "loss": 0.0939, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21331918239593506, + "rewards/margins": 0.2616121768951416, + "rewards/rejected": -0.4749313294887543, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.549547190300622e-06, + "logits/chosen": -1.7121455669403076, + "logits/rejected": -0.8818023800849915, + "logps/chosen": -657.982177734375, + "logps/rejected": -1258.4931640625, + "loss": 0.0877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12182845175266266, + "rewards/margins": 0.31102484464645386, + "rewards/rejected": -0.4328532814979553, + "step": 1030 + }, + { + "epoch": 0.28, + "learning_rate": 4.536133049620143e-06, + "logits/chosen": -1.4799646139144897, + "logits/rejected": -1.1651620864868164, + "logps/chosen": -479.6021423339844, + "logps/rejected": -1181.6322021484375, + "loss": 0.0789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09868054836988449, + "rewards/margins": 0.26966673135757446, + "rewards/rejected": -0.36834731698036194, + "step": 1040 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -1.827845573425293, + "logits/rejected": -1.0856597423553467, + "logps/chosen": -625.0631103515625, + "logps/rejected": -1238.838134765625, + "loss": 0.0703, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1401088982820511, + "rewards/margins": 0.30787450075149536, + "rewards/rejected": -0.44798341393470764, + "step": 1050 + }, + { + "epoch": 0.28, + "learning_rate": 4.508776676821739e-06, + "logits/chosen": -1.5721492767333984, + "logits/rejected": -0.8746267557144165, + "logps/chosen": -652.6207275390625, + "logps/rejected": -1226.645751953125, + "loss": 0.0661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17216768860816956, + "rewards/margins": 0.27679505944252014, + "rewards/rejected": -0.4489627778530121, + "step": 1060 + }, + { + "epoch": 0.29, + "learning_rate": 4.494836815027022e-06, + "logits/chosen": -1.6152639389038086, + "logits/rejected": -1.128306269645691, + "logps/chosen": -588.0072021484375, + "logps/rejected": -1203.587890625, + "loss": 0.0879, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15817685425281525, + "rewards/margins": 0.2696138024330139, + "rewards/rejected": -0.42779064178466797, + "step": 1070 + }, + { + "epoch": 0.29, + "learning_rate": 4.4807241083879774e-06, + "logits/chosen": -1.3238633871078491, + "logits/rejected": -0.7138497233390808, + "logps/chosen": -601.8871459960938, + "logps/rejected": -1312.838623046875, + "loss": 0.0552, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15420369803905487, + "rewards/margins": 0.3349114656448364, + "rewards/rejected": -0.4891151785850525, + "step": 1080 + }, + { + "epoch": 0.29, + "learning_rate": 4.466439779715696e-06, + "logits/chosen": -1.2504911422729492, + "logits/rejected": -0.7397804856300354, + "logps/chosen": -631.8212890625, + "logps/rejected": -1243.5828857421875, + "loss": 0.0868, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19839642941951752, + "rewards/margins": 0.29437923431396484, + "rewards/rejected": -0.49277567863464355, + "step": 1090 + }, + { + "epoch": 0.29, + "learning_rate": 4.451985066691649e-06, + "logits/chosen": -1.809372901916504, + "logits/rejected": -0.8999295234680176, + "logps/chosen": -633.7000122070312, + "logps/rejected": -1235.184814453125, + "loss": 0.0684, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1565568596124649, + "rewards/margins": 0.30287352204322815, + "rewards/rejected": -0.45943036675453186, + "step": 1100 + }, + { + "epoch": 0.3, + "learning_rate": 4.437361221760449e-06, + "logits/chosen": -1.5753552913665771, + "logits/rejected": -0.8743413090705872, + "logps/chosen": -684.4049072265625, + "logps/rejected": -1305.688232421875, + "loss": 0.0796, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17457641661167145, + "rewards/margins": 0.2981758415699005, + "rewards/rejected": -0.47275224328041077, + "step": 1110 + }, + { + "epoch": 0.3, + "learning_rate": 4.422569512021332e-06, + "logits/chosen": -1.5101526975631714, + "logits/rejected": -0.977883517742157, + "logps/chosen": -588.9329223632812, + "logps/rejected": -1159.4969482421875, + "loss": 0.0763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1433287113904953, + "rewards/margins": 0.25360313057899475, + "rewards/rejected": -0.39693182706832886, + "step": 1120 + }, + { + "epoch": 0.3, + "learning_rate": 4.407611219118363e-06, + "logits/chosen": -1.4618273973464966, + "logits/rejected": -1.0363489389419556, + "logps/chosen": -448.98089599609375, + "logps/rejected": -1272.61865234375, + "loss": 0.0453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09960681945085526, + "rewards/margins": 0.31163084506988525, + "rewards/rejected": -0.4112376570701599, + "step": 1130 + }, + { + "epoch": 0.3, + "learning_rate": 4.3924876391293915e-06, + "logits/chosen": -1.6461549997329712, + "logits/rejected": -0.9912912249565125, + "logps/chosen": -611.0677490234375, + "logps/rejected": -1183.2215576171875, + "loss": 0.0846, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1306958794593811, + "rewards/margins": 0.28848105669021606, + "rewards/rejected": -0.41917696595191956, + "step": 1140 + }, + { + "epoch": 0.31, + "learning_rate": 4.377200082453748e-06, + "logits/chosen": -1.8383325338363647, + "logits/rejected": -0.9174288511276245, + "logps/chosen": -618.29638671875, + "logps/rejected": -1216.172119140625, + "loss": 0.0708, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11510708183050156, + "rewards/margins": 0.29687556624412537, + "rewards/rejected": -0.41198262572288513, + "step": 1150 + }, + { + "epoch": 0.31, + "learning_rate": 4.361749873698707e-06, + "logits/chosen": -1.3221898078918457, + "logits/rejected": -0.9603347778320312, + "logps/chosen": -517.3427124023438, + "logps/rejected": -1317.27099609375, + "loss": 0.0551, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09739838540554047, + "rewards/margins": 0.3246908485889435, + "rewards/rejected": -0.42208918929100037, + "step": 1160 + }, + { + "epoch": 0.31, + "learning_rate": 4.346138351564711e-06, + "logits/chosen": -1.7003717422485352, + "logits/rejected": -0.7912822961807251, + "logps/chosen": -598.9830932617188, + "logps/rejected": -1176.973388671875, + "loss": 0.0841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18871501088142395, + "rewards/margins": 0.29334282875061035, + "rewards/rejected": -0.4820578098297119, + "step": 1170 + }, + { + "epoch": 0.31, + "learning_rate": 4.330366868729376e-06, + "logits/chosen": -1.4258317947387695, + "logits/rejected": -1.199805498123169, + "logps/chosen": -769.9146728515625, + "logps/rejected": -1417.9521484375, + "loss": 0.0696, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.25572648644447327, + "rewards/margins": 0.25885215401649475, + "rewards/rejected": -0.5145785808563232, + "step": 1180 + }, + { + "epoch": 0.32, + "learning_rate": 4.3144367917302964e-06, + "logits/chosen": -1.580244779586792, + "logits/rejected": -0.9348461031913757, + "logps/chosen": -604.8896484375, + "logps/rejected": -1246.4227294921875, + "loss": 0.0574, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14286699891090393, + "rewards/margins": 0.30815887451171875, + "rewards/rejected": -0.4510258734226227, + "step": 1190 + }, + { + "epoch": 0.32, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -1.449998140335083, + "logits/rejected": -1.0776797533035278, + "logps/chosen": -581.5048217773438, + "logps/rejected": -1115.7093505859375, + "loss": 0.1337, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0915089100599289, + "rewards/margins": 0.22939009964466095, + "rewards/rejected": -0.32089897990226746, + "step": 1200 + }, + { + "epoch": 0.32, + "learning_rate": 4.2821063899795015e-06, + "logits/chosen": -1.1581242084503174, + "logits/rejected": -0.6956531405448914, + "logps/chosen": -486.95257568359375, + "logps/rejected": -1211.6070556640625, + "loss": 0.0813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02962355688214302, + "rewards/margins": 0.285904198884964, + "rewards/rejected": -0.3155277669429779, + "step": 1210 + }, + { + "epoch": 0.33, + "learning_rate": 4.265708866531238e-06, + "logits/chosen": -1.6472032070159912, + "logits/rejected": -1.1526950597763062, + "logps/chosen": -458.576171875, + "logps/rejected": -1106.2225341796875, + "loss": 0.0867, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08721883594989777, + "rewards/margins": 0.26627764105796814, + "rewards/rejected": -0.3534964919090271, + "step": 1220 + }, + { + "epoch": 0.33, + "learning_rate": 4.249158351283414e-06, + "logits/chosen": -1.5008890628814697, + "logits/rejected": -0.9523450136184692, + "logps/chosen": -564.3670654296875, + "logps/rejected": -1191.93798828125, + "loss": 0.0889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12786266207695007, + "rewards/margins": 0.2664950489997864, + "rewards/rejected": -0.39435768127441406, + "step": 1230 + }, + { + "epoch": 0.33, + "learning_rate": 4.232456278273743e-06, + "logits/chosen": -1.5925250053405762, + "logits/rejected": -0.787007749080658, + "logps/chosen": -634.4956665039062, + "logps/rejected": -1199.644287109375, + "loss": 0.0832, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1421954333782196, + "rewards/margins": 0.27115732431411743, + "rewards/rejected": -0.4133527874946594, + "step": 1240 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.5216588973999023, + "logits/rejected": -0.8067516088485718, + "logps/chosen": -736.2872314453125, + "logps/rejected": -1336.3583984375, + "loss": 0.0645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2195323407649994, + "rewards/margins": 0.2807646691799164, + "rewards/rejected": -0.500296950340271, + "step": 1250 + }, + { + "epoch": 0.34, + "learning_rate": 4.198603260653792e-06, + "logits/chosen": -1.6377366781234741, + "logits/rejected": -1.1248198747634888, + "logps/chosen": -589.9413452148438, + "logps/rejected": -1174.857666015625, + "loss": 0.0898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1545807123184204, + "rewards/margins": 0.25077009201049805, + "rewards/rejected": -0.40535083413124084, + "step": 1260 + }, + { + "epoch": 0.34, + "learning_rate": 4.181455249275701e-06, + "logits/chosen": -1.359490990638733, + "logits/rejected": -0.7116638422012329, + "logps/chosen": -482.0816345214844, + "logps/rejected": -1281.95068359375, + "loss": 0.0926, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06251558661460876, + "rewards/margins": 0.3599362075328827, + "rewards/rejected": -0.42245182394981384, + "step": 1270 + }, + { + "epoch": 0.34, + "learning_rate": 4.1641615463459926e-06, + "logits/chosen": -1.4103469848632812, + "logits/rejected": -0.9721433520317078, + "logps/chosen": -495.11639404296875, + "logps/rejected": -1221.6544189453125, + "loss": 0.0611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09844540059566498, + "rewards/margins": 0.32037508487701416, + "rewards/rejected": -0.41882047057151794, + "step": 1280 + }, + { + "epoch": 0.34, + "learning_rate": 4.146723650296701e-06, + "logits/chosen": -1.5504339933395386, + "logits/rejected": -0.8985152244567871, + "logps/chosen": -516.6334228515625, + "logps/rejected": -1175.6920166015625, + "loss": 0.0714, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15037675201892853, + "rewards/margins": 0.27001953125, + "rewards/rejected": -0.42039623856544495, + "step": 1290 + }, + { + "epoch": 0.35, + "learning_rate": 4.129143072053639e-06, + "logits/chosen": -1.5418593883514404, + "logits/rejected": -0.9249173402786255, + "logps/chosen": -479.78521728515625, + "logps/rejected": -1107.382568359375, + "loss": 0.0675, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1212129220366478, + "rewards/margins": 0.27643007040023804, + "rewards/rejected": -0.3976430296897888, + "step": 1300 + }, + { + "epoch": 0.35, + "learning_rate": 4.111421334905468e-06, + "logits/chosen": -1.461808443069458, + "logits/rejected": -0.8558281660079956, + "logps/chosen": -667.7941284179688, + "logps/rejected": -1245.40234375, + "loss": 0.0659, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1607908010482788, + "rewards/margins": 0.2721284031867981, + "rewards/rejected": -0.4329192638397217, + "step": 1310 + }, + { + "epoch": 0.35, + "learning_rate": 4.093559974371725e-06, + "logits/chosen": -1.3010141849517822, + "logits/rejected": -0.9021800756454468, + "logps/chosen": -656.9991455078125, + "logps/rejected": -1318.10205078125, + "loss": 0.0846, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20327822864055634, + "rewards/margins": 0.2836567759513855, + "rewards/rejected": -0.48693498969078064, + "step": 1320 + }, + { + "epoch": 0.35, + "learning_rate": 4.075560538069767e-06, + "logits/chosen": -1.2061702013015747, + "logits/rejected": -0.8291865587234497, + "logps/chosen": -608.1866455078125, + "logps/rejected": -1327.5355224609375, + "loss": 0.0806, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16998444497585297, + "rewards/margins": 0.32111790776252747, + "rewards/rejected": -0.49110230803489685, + "step": 1330 + }, + { + "epoch": 0.36, + "learning_rate": 4.05742458558068e-06, + "logits/chosen": -1.6475965976715088, + "logits/rejected": -0.8810272216796875, + "logps/chosen": -616.7311401367188, + "logps/rejected": -1327.117431640625, + "loss": 0.062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1568661332130432, + "rewards/margins": 0.3473976254463196, + "rewards/rejected": -0.5042637586593628, + "step": 1340 + }, + { + "epoch": 0.36, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -1.3039356470108032, + "logits/rejected": -0.9500153660774231, + "logps/chosen": -611.8599853515625, + "logps/rejected": -1257.874267578125, + "loss": 0.0689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1369098573923111, + "rewards/margins": 0.29038089513778687, + "rewards/rejected": -0.42729073762893677, + "step": 1350 + }, + { + "epoch": 0.36, + "learning_rate": 4.020749429372286e-06, + "logits/chosen": -1.4324769973754883, + "logits/rejected": -0.807245135307312, + "logps/chosen": -625.339111328125, + "logps/rejected": -1257.253173828125, + "loss": 0.089, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14000260829925537, + "rewards/margins": 0.28776806592941284, + "rewards/rejected": -0.4277706742286682, + "step": 1360 + }, + { + "epoch": 0.37, + "learning_rate": 4.002213403412492e-06, + "logits/chosen": -1.445261001586914, + "logits/rejected": -0.9508928060531616, + "logps/chosen": -594.2431030273438, + "logps/rejected": -1150.235595703125, + "loss": 0.073, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15963387489318848, + "rewards/margins": 0.2395774871110916, + "rewards/rejected": -0.3992113471031189, + "step": 1370 + }, + { + "epoch": 0.37, + "learning_rate": 3.983547216509254e-06, + "logits/chosen": -1.33284592628479, + "logits/rejected": -0.7798209190368652, + "logps/chosen": -605.1170654296875, + "logps/rejected": -1115.391845703125, + "loss": 0.0769, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17733311653137207, + "rewards/margins": 0.23692724108695984, + "rewards/rejected": -0.4142603278160095, + "step": 1380 + }, + { + "epoch": 0.37, + "learning_rate": 3.964752486015001e-06, + "logits/chosen": -1.353686809539795, + "logits/rejected": -0.9458833932876587, + "logps/chosen": -541.7138671875, + "logps/rejected": -1133.408935546875, + "loss": 0.0832, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1459619700908661, + "rewards/margins": 0.25547298789024353, + "rewards/rejected": -0.4014349579811096, + "step": 1390 + }, + { + "epoch": 0.37, + "learning_rate": 3.945830840419966e-06, + "logits/chosen": -1.3489134311676025, + "logits/rejected": -1.114639163017273, + "logps/chosen": -650.7469482421875, + "logps/rejected": -1327.571044921875, + "loss": 0.1023, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19746045768260956, + "rewards/margins": 0.3029775023460388, + "rewards/rejected": -0.5004379749298096, + "step": 1400 + }, + { + "epoch": 0.38, + "learning_rate": 3.92678391921108e-06, + "logits/chosen": -1.3552124500274658, + "logits/rejected": -0.8964862823486328, + "logps/chosen": -505.401123046875, + "logps/rejected": -1367.124267578125, + "loss": 0.0499, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12019245326519012, + "rewards/margins": 0.3561457395553589, + "rewards/rejected": -0.4763382375240326, + "step": 1410 + }, + { + "epoch": 0.38, + "learning_rate": 3.907613372729916e-06, + "logits/chosen": -1.5101354122161865, + "logits/rejected": -1.0879840850830078, + "logps/chosen": -619.8988037109375, + "logps/rejected": -1368.367919921875, + "loss": 0.0501, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15163154900074005, + "rewards/margins": 0.34400704503059387, + "rewards/rejected": -0.49563854932785034, + "step": 1420 + }, + { + "epoch": 0.38, + "learning_rate": 3.888320862029699e-06, + "logits/chosen": -1.5360214710235596, + "logits/rejected": -0.9855524897575378, + "logps/chosen": -748.1507568359375, + "logps/rejected": -1296.6070556640625, + "loss": 0.0988, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1940506398677826, + "rewards/margins": 0.2233462780714035, + "rewards/rejected": -0.4173968732357025, + "step": 1430 + }, + { + "epoch": 0.38, + "learning_rate": 3.868908058731376e-06, + "logits/chosen": -1.5095126628875732, + "logits/rejected": -0.8970023989677429, + "logps/chosen": -497.6952209472656, + "logps/rejected": -1121.1767578125, + "loss": 0.07, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10547561943531036, + "rewards/margins": 0.27518388628959656, + "rewards/rejected": -0.3806595206260681, + "step": 1440 + }, + { + "epoch": 0.39, + "learning_rate": 3.849376644878783e-06, + "logits/chosen": -1.4751381874084473, + "logits/rejected": -0.9001661539077759, + "logps/chosen": -588.8636474609375, + "logps/rejected": -1271.7371826171875, + "loss": 0.0558, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1327972114086151, + "rewards/margins": 0.305908739566803, + "rewards/rejected": -0.4387059211730957, + "step": 1450 + }, + { + "epoch": 0.39, + "learning_rate": 3.829728312792895e-06, + "logits/chosen": -1.616092324256897, + "logits/rejected": -1.0537126064300537, + "logps/chosen": -540.6871337890625, + "logps/rejected": -1179.548828125, + "loss": 0.0725, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09595973044633865, + "rewards/margins": 0.3011319935321808, + "rewards/rejected": -0.39709168672561646, + "step": 1460 + }, + { + "epoch": 0.39, + "learning_rate": 3.8099647649251984e-06, + "logits/chosen": -1.423906683921814, + "logits/rejected": -0.7600029706954956, + "logps/chosen": -594.6055908203125, + "logps/rejected": -1255.1416015625, + "loss": 0.0918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12480314821004868, + "rewards/margins": 0.3190528154373169, + "rewards/rejected": -0.44385600090026855, + "step": 1470 + }, + { + "epoch": 0.39, + "learning_rate": 3.790087713710179e-06, + "logits/chosen": -1.5575999021530151, + "logits/rejected": -1.1269023418426514, + "logps/chosen": -627.612548828125, + "logps/rejected": -1399.1263427734375, + "loss": 0.0576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1461525857448578, + "rewards/margins": 0.3743034303188324, + "rewards/rejected": -0.5204560160636902, + "step": 1480 + }, + { + "epoch": 0.4, + "learning_rate": 3.770098881416945e-06, + "logits/chosen": -1.4309592247009277, + "logits/rejected": -0.8114882707595825, + "logps/chosen": -639.8936767578125, + "logps/rejected": -1324.5836181640625, + "loss": 0.0548, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15769222378730774, + "rewards/margins": 0.3088419735431671, + "rewards/rejected": -0.46653419733047485, + "step": 1490 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.3537265062332153, + "logits/rejected": -0.9575905799865723, + "logps/chosen": -644.993408203125, + "logps/rejected": -1161.0167236328125, + "loss": 0.0985, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13782911002635956, + "rewards/margins": 0.23069393634796143, + "rewards/rejected": -0.3685230612754822, + "step": 1500 + }, + { + "epoch": 0.4, + "learning_rate": 3.7297928109491765e-06, + "logits/chosen": -1.7066447734832764, + "logits/rejected": -0.8981353044509888, + "logps/chosen": -499.3677673339844, + "logps/rejected": -1228.71533203125, + "loss": 0.0597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10399389266967773, + "rewards/margins": 0.3211382031440735, + "rewards/rejected": -0.42513203620910645, + "step": 1510 + }, + { + "epoch": 0.41, + "learning_rate": 3.7094790651387414e-06, + "logits/chosen": -1.5993268489837646, + "logits/rejected": -0.943587601184845, + "logps/chosen": -549.6283569335938, + "logps/rejected": -1147.561279296875, + "loss": 0.0736, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09537569433450699, + "rewards/margins": 0.2917521297931671, + "rewards/rejected": -0.3871277868747711, + "step": 1520 + }, + { + "epoch": 0.41, + "learning_rate": 3.689060522675689e-06, + "logits/chosen": -1.4540773630142212, + "logits/rejected": -0.9544679522514343, + "logps/chosen": -567.1152954101562, + "logps/rejected": -1310.57080078125, + "loss": 0.067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13753007352352142, + "rewards/margins": 0.3150814175605774, + "rewards/rejected": -0.45261150598526, + "step": 1530 + }, + { + "epoch": 0.41, + "learning_rate": 3.668538952747236e-06, + "logits/chosen": -1.5060861110687256, + "logits/rejected": -1.0383261442184448, + "logps/chosen": -541.1341552734375, + "logps/rejected": -1345.200927734375, + "loss": 0.0495, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08431238681077957, + "rewards/margins": 0.34140679240226746, + "rewards/rejected": -0.42571917176246643, + "step": 1540 + }, + { + "epoch": 0.41, + "learning_rate": 3.6479161334675294e-06, + "logits/chosen": -1.6825485229492188, + "logits/rejected": -0.8775063753128052, + "logps/chosen": -636.63427734375, + "logps/rejected": -1235.6170654296875, + "loss": 0.0867, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09943681955337524, + "rewards/margins": 0.2878590226173401, + "rewards/rejected": -0.38729584217071533, + "step": 1550 + }, + { + "epoch": 0.42, + "learning_rate": 3.627193851723577e-06, + "logits/chosen": -1.686977744102478, + "logits/rejected": -1.0781195163726807, + "logps/chosen": -594.205078125, + "logps/rejected": -1142.4056396484375, + "loss": 0.084, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11142469942569733, + "rewards/margins": 0.26075294613838196, + "rewards/rejected": -0.3721776604652405, + "step": 1560 + }, + { + "epoch": 0.42, + "learning_rate": 3.6063739030204226e-06, + "logits/chosen": -1.571839451789856, + "logits/rejected": -1.1530930995941162, + "logps/chosen": -549.628173828125, + "logps/rejected": -1172.0718994140625, + "loss": 0.0807, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10474500805139542, + "rewards/margins": 0.269944429397583, + "rewards/rejected": -0.37468940019607544, + "step": 1570 + }, + { + "epoch": 0.42, + "learning_rate": 3.5854580913255706e-06, + "logits/chosen": -1.61894953250885, + "logits/rejected": -0.9551402926445007, + "logps/chosen": -607.7701416015625, + "logps/rejected": -1296.424072265625, + "loss": 0.053, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.13554790616035461, + "rewards/margins": 0.3056618571281433, + "rewards/rejected": -0.4412097930908203, + "step": 1580 + }, + { + "epoch": 0.42, + "learning_rate": 3.564448228912682e-06, + "logits/chosen": -1.6686357259750366, + "logits/rejected": -1.0033533573150635, + "logps/chosen": -650.9888916015625, + "logps/rejected": -1215.917724609375, + "loss": 0.0889, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1597142517566681, + "rewards/margins": 0.26373302936553955, + "rewards/rejected": -0.42344728112220764, + "step": 1590 + }, + { + "epoch": 0.43, + "learning_rate": 3.543346136204545e-06, + "logits/chosen": -1.3500854969024658, + "logits/rejected": -0.8943287134170532, + "logps/chosen": -594.9763793945312, + "logps/rejected": -1269.090576171875, + "loss": 0.0873, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14664295315742493, + "rewards/margins": 0.29260388016700745, + "rewards/rejected": -0.4392468333244324, + "step": 1600 + }, + { + "epoch": 0.43, + "learning_rate": 3.522153641615345e-06, + "logits/chosen": -1.5656368732452393, + "logits/rejected": -0.917197048664093, + "logps/chosen": -623.9747924804688, + "logps/rejected": -1312.198486328125, + "loss": 0.0674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1256953328847885, + "rewards/margins": 0.3076288104057312, + "rewards/rejected": -0.4333241581916809, + "step": 1610 + }, + { + "epoch": 0.43, + "learning_rate": 3.5008725813922383e-06, + "logits/chosen": -1.4567426443099976, + "logits/rejected": -1.0509058237075806, + "logps/chosen": -514.6878051757812, + "logps/rejected": -1177.8046875, + "loss": 0.0862, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1288285255432129, + "rewards/margins": 0.2865757346153259, + "rewards/rejected": -0.4154042601585388, + "step": 1620 + }, + { + "epoch": 0.43, + "learning_rate": 3.4795047994562463e-06, + "logits/chosen": -1.6101102828979492, + "logits/rejected": -1.2119176387786865, + "logps/chosen": -549.9823608398438, + "logps/rejected": -1237.9873046875, + "loss": 0.0912, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1458604633808136, + "rewards/margins": 0.2935812771320343, + "rewards/rejected": -0.4394417405128479, + "step": 1630 + }, + { + "epoch": 0.44, + "learning_rate": 3.458052147242494e-06, + "logits/chosen": -1.6634056568145752, + "logits/rejected": -0.9373501539230347, + "logps/chosen": -565.6293334960938, + "logps/rejected": -1192.64208984375, + "loss": 0.065, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15129828453063965, + "rewards/margins": 0.2895079553127289, + "rewards/rejected": -0.44080623984336853, + "step": 1640 + }, + { + "epoch": 0.44, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -1.4864572286605835, + "logits/rejected": -1.038694143295288, + "logps/chosen": -491.3564453125, + "logps/rejected": -1235.2786865234375, + "loss": 0.0685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09666456282138824, + "rewards/margins": 0.3150130808353424, + "rewards/rejected": -0.41167759895324707, + "step": 1650 + }, + { + "epoch": 0.44, + "learning_rate": 3.4148996743295305e-06, + "logits/chosen": -1.561033010482788, + "logits/rejected": -0.8384987711906433, + "logps/chosen": -697.5499877929688, + "logps/rejected": -1347.208251953125, + "loss": 0.0675, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17049038410186768, + "rewards/margins": 0.30814796686172485, + "rewards/rejected": -0.47863835096359253, + "step": 1660 + }, + { + "epoch": 0.45, + "learning_rate": 3.3932035926241103e-06, + "logits/chosen": -1.4081476926803589, + "logits/rejected": -1.0466349124908447, + "logps/chosen": -630.5411376953125, + "logps/rejected": -1300.04541015625, + "loss": 0.0695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12465560436248779, + "rewards/margins": 0.29317888617515564, + "rewards/rejected": -0.41783446073532104, + "step": 1670 + }, + { + "epoch": 0.45, + "learning_rate": 3.3714301183045382e-06, + "logits/chosen": -1.6413304805755615, + "logits/rejected": -0.9996837377548218, + "logps/chosen": -584.4578857421875, + "logps/rejected": -1310.6175537109375, + "loss": 0.0728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16229207813739777, + "rewards/margins": 0.3190585970878601, + "rewards/rejected": -0.4813506603240967, + "step": 1680 + }, + { + "epoch": 0.45, + "learning_rate": 3.349581137957604e-06, + "logits/chosen": -1.5459994077682495, + "logits/rejected": -0.8717397451400757, + "logps/chosen": -689.1311645507812, + "logps/rejected": -1347.6954345703125, + "loss": 0.0896, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18276944756507874, + "rewards/margins": 0.30370309948921204, + "rewards/rejected": -0.48647254705429077, + "step": 1690 + }, + { + "epoch": 0.45, + "learning_rate": 3.3276585447123957e-06, + "logits/chosen": -1.5712751150131226, + "logits/rejected": -0.9994010925292969, + "logps/chosen": -587.7183227539062, + "logps/rejected": -1291.80322265625, + "loss": 0.0678, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13140062987804413, + "rewards/margins": 0.2967793345451355, + "rewards/rejected": -0.4281799793243408, + "step": 1700 + }, + { + "epoch": 0.46, + "learning_rate": 3.3056642380762783e-06, + "logits/chosen": -1.45646071434021, + "logits/rejected": -0.9559444189071655, + "logps/chosen": -743.5758056640625, + "logps/rejected": -1357.35693359375, + "loss": 0.0537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1634978950023651, + "rewards/margins": 0.26821058988571167, + "rewards/rejected": -0.4317084848880768, + "step": 1710 + }, + { + "epoch": 0.46, + "learning_rate": 3.2836001237702993e-06, + "logits/chosen": -1.2642626762390137, + "logits/rejected": -0.8956004977226257, + "logps/chosen": -672.7369995117188, + "logps/rejected": -1317.1905517578125, + "loss": 0.0843, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19635051488876343, + "rewards/margins": 0.26600882411003113, + "rewards/rejected": -0.46235933899879456, + "step": 1720 + }, + { + "epoch": 0.46, + "learning_rate": 3.2614681135640696e-06, + "logits/chosen": -1.538914442062378, + "logits/rejected": -0.9134441614151001, + "logps/chosen": -672.8499755859375, + "logps/rejected": -1275.136474609375, + "loss": 0.0585, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17318633198738098, + "rewards/margins": 0.288135290145874, + "rewards/rejected": -0.4613215923309326, + "step": 1730 + }, + { + "epoch": 0.46, + "learning_rate": 3.2392701251101172e-06, + "logits/chosen": -1.4427398443222046, + "logits/rejected": -0.8872078061103821, + "logps/chosen": -686.1043701171875, + "logps/rejected": -1366.374755859375, + "loss": 0.0687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19739912450313568, + "rewards/margins": 0.28218263387680054, + "rewards/rejected": -0.479581743478775, + "step": 1740 + }, + { + "epoch": 0.47, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -1.3497573137283325, + "logits/rejected": -0.872177004814148, + "logps/chosen": -589.72412109375, + "logps/rejected": -1277.909423828125, + "loss": 0.0563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13301566243171692, + "rewards/margins": 0.2993132770061493, + "rewards/rejected": -0.4323289394378662, + "step": 1750 + }, + { + "epoch": 0.47, + "learning_rate": 3.1946839124862873e-06, + "logits/chosen": -1.3833153247833252, + "logits/rejected": -1.0205485820770264, + "logps/chosen": -539.8531494140625, + "logps/rejected": -1183.308837890625, + "loss": 0.0925, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1366974115371704, + "rewards/margins": 0.2687934637069702, + "rewards/rejected": -0.4054908752441406, + "step": 1760 + }, + { + "epoch": 0.47, + "learning_rate": 3.1722995515381644e-06, + "logits/chosen": -1.4850012063980103, + "logits/rejected": -0.8387139439582825, + "logps/chosen": -636.983642578125, + "logps/rejected": -1304.3221435546875, + "loss": 0.0795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1878032237291336, + "rewards/margins": 0.2711120843887329, + "rewards/rejected": -0.4589153230190277, + "step": 1770 + }, + { + "epoch": 0.47, + "learning_rate": 3.149856938451094e-06, + "logits/chosen": -1.0989512205123901, + "logits/rejected": -0.8349654078483582, + "logps/chosen": -627.0206298828125, + "logps/rejected": -1307.218505859375, + "loss": 0.0903, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1822880506515503, + "rewards/margins": 0.3011923134326935, + "rewards/rejected": -0.4834803640842438, + "step": 1780 + }, + { + "epoch": 0.48, + "learning_rate": 3.127358017790132e-06, + "logits/chosen": -1.485824704170227, + "logits/rejected": -0.8337934613227844, + "logps/chosen": -623.2086791992188, + "logps/rejected": -1302.7957763671875, + "loss": 0.0511, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15663902461528778, + "rewards/margins": 0.3056802451610565, + "rewards/rejected": -0.4623193144798279, + "step": 1790 + }, + { + "epoch": 0.48, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -1.7094755172729492, + "logits/rejected": -1.041133165359497, + "logps/chosen": -671.2548217773438, + "logps/rejected": -1311.30419921875, + "loss": 0.1018, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.167900949716568, + "rewards/margins": 0.27187058329582214, + "rewards/rejected": -0.43977150321006775, + "step": 1800 + }, + { + "epoch": 0.48, + "learning_rate": 3.082199056232015e-06, + "logits/chosen": -1.6966993808746338, + "logits/rejected": -1.220529556274414, + "logps/chosen": -596.4708251953125, + "logps/rejected": -1238.605224609375, + "loss": 0.0583, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12312579154968262, + "rewards/margins": 0.272055447101593, + "rewards/rejected": -0.39518123865127563, + "step": 1810 + }, + { + "epoch": 0.49, + "learning_rate": 3.059542928183079e-06, + "logits/chosen": -1.661625623703003, + "logits/rejected": -1.1181296110153198, + "logps/chosen": -575.0912475585938, + "logps/rejected": -1266.91064453125, + "loss": 0.0801, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11585699021816254, + "rewards/margins": 0.2694869041442871, + "rewards/rejected": -0.38534384965896606, + "step": 1820 + }, + { + "epoch": 0.49, + "learning_rate": 3.0368383179176584e-06, + "logits/chosen": -1.606603980064392, + "logits/rejected": -1.0587247610092163, + "logps/chosen": -512.89013671875, + "logps/rejected": -1128.80859375, + "loss": 0.0617, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10761525481939316, + "rewards/margins": 0.2635376751422882, + "rewards/rejected": -0.37115293741226196, + "step": 1830 + }, + { + "epoch": 0.49, + "learning_rate": 3.0140871927018466e-06, + "logits/chosen": -1.5754063129425049, + "logits/rejected": -0.8801782727241516, + "logps/chosen": -655.8692626953125, + "logps/rejected": -1180.616943359375, + "loss": 0.07, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.153736412525177, + "rewards/margins": 0.2592002749443054, + "rewards/rejected": -0.41293662786483765, + "step": 1840 + }, + { + "epoch": 0.49, + "learning_rate": 2.9912915238320755e-06, + "logits/chosen": -1.427549123764038, + "logits/rejected": -1.0166289806365967, + "logps/chosen": -586.8533325195312, + "logps/rejected": -1178.782958984375, + "loss": 0.0907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15467192232608795, + "rewards/margins": 0.2657596468925476, + "rewards/rejected": -0.420431524515152, + "step": 1850 + }, + { + "epoch": 0.5, + "learning_rate": 2.9684532864643123e-06, + "logits/chosen": -1.580718755722046, + "logits/rejected": -1.1227762699127197, + "logps/chosen": -620.2764892578125, + "logps/rejected": -1323.167236328125, + "loss": 0.0675, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14635826647281647, + "rewards/margins": 0.3277556300163269, + "rewards/rejected": -0.4741138815879822, + "step": 1860 + }, + { + "epoch": 0.5, + "learning_rate": 2.945574459442917e-06, + "logits/chosen": -1.291585922241211, + "logits/rejected": -0.7484699487686157, + "logps/chosen": -530.9237060546875, + "logps/rejected": -1149.4744873046875, + "loss": 0.0707, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1255730241537094, + "rewards/margins": 0.28315025568008423, + "rewards/rejected": -0.40872329473495483, + "step": 1870 + }, + { + "epoch": 0.5, + "learning_rate": 2.922657025129185e-06, + "logits/chosen": -1.3349201679229736, + "logits/rejected": -0.9772024154663086, + "logps/chosen": -620.5244140625, + "logps/rejected": -1263.1776123046875, + "loss": 0.0585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16309909522533417, + "rewards/margins": 0.273107647895813, + "rewards/rejected": -0.43620675802230835, + "step": 1880 + }, + { + "epoch": 0.5, + "learning_rate": 2.8997029692295875e-06, + "logits/chosen": -1.4110755920410156, + "logits/rejected": -0.9908379316329956, + "logps/chosen": -516.9703369140625, + "logps/rejected": -1351.016357421875, + "loss": 0.0628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14381906390190125, + "rewards/margins": 0.3800903558731079, + "rewards/rejected": -0.5239094495773315, + "step": 1890 + }, + { + "epoch": 0.51, + "learning_rate": 2.876714280623708e-06, + "logits/chosen": -1.4153320789337158, + "logits/rejected": -0.820611298084259, + "logps/chosen": -487.5252990722656, + "logps/rejected": -1135.842529296875, + "loss": 0.0915, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10841517150402069, + "rewards/margins": 0.29948437213897705, + "rewards/rejected": -0.40789952874183655, + "step": 1900 + }, + { + "epoch": 0.51, + "learning_rate": 2.8536929511919227e-06, + "logits/chosen": -1.4119293689727783, + "logits/rejected": -0.8228232264518738, + "logps/chosen": -627.8701171875, + "logps/rejected": -1294.2296142578125, + "loss": 0.0469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12193576991558075, + "rewards/margins": 0.32117849588394165, + "rewards/rejected": -0.4431142807006836, + "step": 1910 + }, + { + "epoch": 0.51, + "learning_rate": 2.8306409756428067e-06, + "logits/chosen": -1.527777075767517, + "logits/rejected": -0.8934208154678345, + "logps/chosen": -581.7477416992188, + "logps/rejected": -1238.830810546875, + "loss": 0.0816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1308506429195404, + "rewards/margins": 0.29394230246543884, + "rewards/rejected": -0.42479294538497925, + "step": 1920 + }, + { + "epoch": 0.51, + "learning_rate": 2.807560351340302e-06, + "logits/chosen": -1.3228596448898315, + "logits/rejected": -0.7611247897148132, + "logps/chosen": -601.160400390625, + "logps/rejected": -1213.851806640625, + "loss": 0.0707, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1343916952610016, + "rewards/margins": 0.28090834617614746, + "rewards/rejected": -0.4152999818325043, + "step": 1930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7844530781306544e-06, + "logits/chosen": -1.4402358531951904, + "logits/rejected": -0.8715489506721497, + "logps/chosen": -518.7476806640625, + "logps/rejected": -1256.0328369140625, + "loss": 0.0528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11650246381759644, + "rewards/margins": 0.3238303065299988, + "rewards/rejected": -0.4403327405452728, + "step": 1940 + }, + { + "epoch": 0.52, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -1.5251938104629517, + "logits/rejected": -1.1043987274169922, + "logps/chosen": -656.2462158203125, + "logps/rejected": -1206.697021484375, + "loss": 0.0894, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17089466750621796, + "rewards/margins": 0.22701752185821533, + "rewards/rejected": -0.3979122042655945, + "step": 1950 + }, + { + "epoch": 0.52, + "learning_rate": 2.738166595746554e-06, + "logits/chosen": -1.3905115127563477, + "logits/rejected": -0.9935697317123413, + "logps/chosen": -628.3776245117188, + "logps/rejected": -1096.552734375, + "loss": 0.0795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15120725333690643, + "rewards/margins": 0.24288305640220642, + "rewards/rejected": -0.39409032464027405, + "step": 1960 + }, + { + "epoch": 0.53, + "learning_rate": 2.7149913971156105e-06, + "logits/chosen": -1.722516655921936, + "logits/rejected": -1.0052350759506226, + "logps/chosen": -496.67657470703125, + "logps/rejected": -1112.5921630859375, + "loss": 0.0733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10522119700908661, + "rewards/margins": 0.28908994793891907, + "rewards/rejected": -0.3943111300468445, + "step": 1970 + }, + { + "epoch": 0.53, + "learning_rate": 2.6917975703170466e-06, + "logits/chosen": -1.4045814275741577, + "logits/rejected": -0.9305141568183899, + "logps/chosen": -512.0284423828125, + "logps/rejected": -1209.204345703125, + "loss": 0.0672, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1334155797958374, + "rewards/margins": 0.2813461720943451, + "rewards/rejected": -0.4147617220878601, + "step": 1980 + }, + { + "epoch": 0.53, + "learning_rate": 2.668587125005663e-06, + "logits/chosen": -1.343185305595398, + "logits/rejected": -1.0273711681365967, + "logps/chosen": -549.5206298828125, + "logps/rejected": -1240.9766845703125, + "loss": 0.0707, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1306164562702179, + "rewards/margins": 0.29476845264434814, + "rewards/rejected": -0.4253849387168884, + "step": 1990 + }, + { + "epoch": 0.53, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -1.6594680547714233, + "logits/rejected": -0.850638210773468, + "logps/chosen": -591.6373291015625, + "logps/rejected": -1358.3460693359375, + "loss": 0.0625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10016246140003204, + "rewards/margins": 0.33138564229011536, + "rewards/rejected": -0.43154802918434143, + "step": 2000 + }, + { + "epoch": 0.54, + "learning_rate": 2.6221244244890336e-06, + "logits/chosen": -1.5611286163330078, + "logits/rejected": -0.799461841583252, + "logps/chosen": -587.806640625, + "logps/rejected": -1161.3482666015625, + "loss": 0.0633, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12262831628322601, + "rewards/margins": 0.2799530327320099, + "rewards/rejected": -0.4025813639163971, + "step": 2010 + }, + { + "epoch": 0.54, + "learning_rate": 2.5988761950959133e-06, + "logits/chosen": -1.6644928455352783, + "logits/rejected": -0.9483749270439148, + "logps/chosen": -535.2498168945312, + "logps/rejected": -1164.773681640625, + "loss": 0.0748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1117793545126915, + "rewards/margins": 0.281157910823822, + "rewards/rejected": -0.3929373323917389, + "step": 2020 + }, + { + "epoch": 0.54, + "learning_rate": 2.575619398465402e-06, + "logits/chosen": -1.5217034816741943, + "logits/rejected": -0.715064287185669, + "logps/chosen": -589.2874755859375, + "logps/rejected": -1261.570556640625, + "loss": 0.0622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1293199360370636, + "rewards/margins": 0.3084322214126587, + "rewards/rejected": -0.4377521574497223, + "step": 2030 + }, + { + "epoch": 0.54, + "learning_rate": 2.5523560497083927e-06, + "logits/chosen": -1.6095244884490967, + "logits/rejected": -0.9723415374755859, + "logps/chosen": -610.6204223632812, + "logps/rejected": -1346.6697998046875, + "loss": 0.0527, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18110117316246033, + "rewards/margins": 0.3312448561191559, + "rewards/rejected": -0.5123459696769714, + "step": 2040 + }, + { + "epoch": 0.55, + "learning_rate": 2.5290881645034932e-06, + "logits/chosen": -1.4780033826828003, + "logits/rejected": -0.983650803565979, + "logps/chosen": -654.4927978515625, + "logps/rejected": -1217.2103271484375, + "loss": 0.0931, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1973283588886261, + "rewards/margins": 0.2564861476421356, + "rewards/rejected": -0.4538145065307617, + "step": 2050 + }, + { + "epoch": 0.55, + "learning_rate": 2.5058177589223766e-06, + "logits/chosen": -1.6766622066497803, + "logits/rejected": -0.907199501991272, + "logps/chosen": -659.3814086914062, + "logps/rejected": -1295.118896484375, + "loss": 0.0769, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14497703313827515, + "rewards/margins": 0.3324897885322571, + "rewards/rejected": -0.477466881275177, + "step": 2060 + }, + { + "epoch": 0.55, + "learning_rate": 2.482546849255096e-06, + "logits/chosen": -1.5557941198349, + "logits/rejected": -0.8595023155212402, + "logps/chosen": -599.567626953125, + "logps/rejected": -1293.6220703125, + "loss": 0.038, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14183931052684784, + "rewards/margins": 0.32466521859169006, + "rewards/rejected": -0.4665044844150543, + "step": 2070 + }, + { + "epoch": 0.55, + "learning_rate": 2.4592774518353858e-06, + "logits/chosen": -1.3870598077774048, + "logits/rejected": -0.7746745944023132, + "logps/chosen": -578.613525390625, + "logps/rejected": -1236.041748046875, + "loss": 0.0582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1255788505077362, + "rewards/margins": 0.2901487350463867, + "rewards/rejected": -0.41572752594947815, + "step": 2080 + }, + { + "epoch": 0.56, + "learning_rate": 2.436011582865945e-06, + "logits/chosen": -1.539902925491333, + "logits/rejected": -0.8010295629501343, + "logps/chosen": -680.379638671875, + "logps/rejected": -1223.6986083984375, + "loss": 0.0678, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15240542590618134, + "rewards/margins": 0.2671021819114685, + "rewards/rejected": -0.41950759291648865, + "step": 2090 + }, + { + "epoch": 0.56, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -1.5265008211135864, + "logits/rejected": -1.3408691883087158, + "logps/chosen": -541.078125, + "logps/rejected": -1169.788818359375, + "loss": 0.0992, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15979784727096558, + "rewards/margins": 0.2517135739326477, + "rewards/rejected": -0.4115114212036133, + "step": 2100 + }, + { + "epoch": 0.56, + "learning_rate": 2.3894984933853734e-06, + "logits/chosen": -1.4429771900177002, + "logits/rejected": -0.9880257844924927, + "logps/chosen": -528.2786254882812, + "logps/rejected": -1244.5059814453125, + "loss": 0.0763, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14250726997852325, + "rewards/margins": 0.30752262473106384, + "rewards/rejected": -0.4500298500061035, + "step": 2110 + }, + { + "epoch": 0.57, + "learning_rate": 2.366255303052377e-06, + "logits/chosen": -1.4017646312713623, + "logits/rejected": -0.8113569021224976, + "logps/chosen": -604.7733154296875, + "logps/rejected": -1221.2369384765625, + "loss": 0.094, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15813498198986053, + "rewards/margins": 0.2982472777366638, + "rewards/rejected": -0.45638221502304077, + "step": 2120 + }, + { + "epoch": 0.57, + "learning_rate": 2.3430237011767166e-06, + "logits/chosen": -1.5092931985855103, + "logits/rejected": -0.8236274719238281, + "logps/chosen": -533.9609985351562, + "logps/rejected": -1168.628173828125, + "loss": 0.0722, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13640852272510529, + "rewards/margins": 0.28761720657348633, + "rewards/rejected": -0.4240257740020752, + "step": 2130 + }, + { + "epoch": 0.57, + "learning_rate": 2.319805700686257e-06, + "logits/chosen": -1.5162551403045654, + "logits/rejected": -0.786509096622467, + "logps/chosen": -623.4132080078125, + "logps/rejected": -1179.5946044921875, + "loss": 0.0579, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1392001360654831, + "rewards/margins": 0.26958781480789185, + "rewards/rejected": -0.40878796577453613, + "step": 2140 + }, + { + "epoch": 0.57, + "learning_rate": 2.296603313330355e-06, + "logits/chosen": -1.1993951797485352, + "logits/rejected": -0.6298279166221619, + "logps/chosen": -582.5997314453125, + "logps/rejected": -1334.009765625, + "loss": 0.0706, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17245307564735413, + "rewards/margins": 0.29515841603279114, + "rewards/rejected": -0.46761149168014526, + "step": 2150 + }, + { + "epoch": 0.58, + "learning_rate": 2.2734185495055503e-06, + "logits/chosen": -1.7049709558486938, + "logits/rejected": -0.9636220932006836, + "logps/chosen": -705.0452270507812, + "logps/rejected": -1329.1993408203125, + "loss": 0.0595, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15186749398708344, + "rewards/margins": 0.3114302158355713, + "rewards/rejected": -0.46329769492149353, + "step": 2160 + }, + { + "epoch": 0.58, + "learning_rate": 2.250253418081373e-06, + "logits/chosen": -1.4931375980377197, + "logits/rejected": -1.0522197484970093, + "logps/chosen": -568.1475830078125, + "logps/rejected": -1231.364013671875, + "loss": 0.0732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13682815432548523, + "rewards/margins": 0.29910990595817566, + "rewards/rejected": -0.4359380602836609, + "step": 2170 + }, + { + "epoch": 0.58, + "learning_rate": 2.22710992622628e-06, + "logits/chosen": -1.4385493993759155, + "logits/rejected": -1.0058460235595703, + "logps/chosen": -473.60498046875, + "logps/rejected": -1201.9755859375, + "loss": 0.0674, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10265137255191803, + "rewards/margins": 0.32189419865608215, + "rewards/rejected": -0.4245455265045166, + "step": 2180 + }, + { + "epoch": 0.58, + "learning_rate": 2.2039900792337477e-06, + "logits/chosen": -1.3410050868988037, + "logits/rejected": -1.1295228004455566, + "logps/chosen": -553.0806274414062, + "logps/rejected": -1231.395263671875, + "loss": 0.0821, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.16361066699028015, + "rewards/margins": 0.2729097008705139, + "rewards/rejected": -0.43652039766311646, + "step": 2190 + }, + { + "epoch": 0.59, + "learning_rate": 2.1808958803485134e-06, + "logits/chosen": -1.3688017129898071, + "logits/rejected": -0.9007024765014648, + "logps/chosen": -651.8567504882812, + "logps/rejected": -1406.11572265625, + "loss": 0.0535, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18758761882781982, + "rewards/margins": 0.3379947543144226, + "rewards/rejected": -0.5255824327468872, + "step": 2200 + }, + { + "epoch": 0.59, + "learning_rate": 2.157829330593008e-06, + "logits/chosen": -1.6441303491592407, + "logits/rejected": -1.0162547826766968, + "logps/chosen": -711.4053955078125, + "logps/rejected": -1393.6256103515625, + "loss": 0.0708, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2044825553894043, + "rewards/margins": 0.30536073446273804, + "rewards/rejected": -0.5098432302474976, + "step": 2210 + }, + { + "epoch": 0.59, + "learning_rate": 2.134792428593971e-06, + "logits/chosen": -1.6138668060302734, + "logits/rejected": -0.9830889701843262, + "logps/chosen": -678.9330444335938, + "logps/rejected": -1337.688720703125, + "loss": 0.0712, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16158784925937653, + "rewards/margins": 0.30953675508499146, + "rewards/rejected": -0.4711245596408844, + "step": 2220 + }, + { + "epoch": 0.59, + "learning_rate": 2.1117871704092818e-06, + "logits/chosen": -1.5652110576629639, + "logits/rejected": -0.7780826687812805, + "logps/chosen": -496.6578674316406, + "logps/rejected": -1144.76171875, + "loss": 0.0663, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11985839903354645, + "rewards/margins": 0.3066459596157074, + "rewards/rejected": -0.42650431394577026, + "step": 2230 + }, + { + "epoch": 0.6, + "learning_rate": 2.0888155493550027e-06, + "logits/chosen": -1.516342282295227, + "logits/rejected": -1.1366102695465088, + "logps/chosen": -601.8604125976562, + "logps/rejected": -1439.399658203125, + "loss": 0.0535, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15253353118896484, + "rewards/margins": 0.3515278697013855, + "rewards/rejected": -0.5040613412857056, + "step": 2240 + }, + { + "epoch": 0.6, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.4972165822982788, + "logits/rejected": -1.1287825107574463, + "logps/chosen": -551.3253784179688, + "logps/rejected": -1201.3675537109375, + "loss": 0.0887, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14231975376605988, + "rewards/margins": 0.2838347554206848, + "rewards/rejected": -0.4261545240879059, + "step": 2250 + }, + { + "epoch": 0.6, + "learning_rate": 2.0429811771568468e-06, + "logits/chosen": -1.283483862876892, + "logits/rejected": -0.8047486543655396, + "logps/chosen": -674.7145385742188, + "logps/rejected": -1273.704345703125, + "loss": 0.0626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19594672322273254, + "rewards/margins": 0.2617323696613312, + "rewards/rejected": -0.4576791226863861, + "step": 2260 + }, + { + "epoch": 0.61, + "learning_rate": 2.0201223973828917e-06, + "logits/chosen": -1.3640494346618652, + "logits/rejected": -0.9720889329910278, + "logps/chosen": -654.1539306640625, + "logps/rejected": -1382.0146484375, + "loss": 0.0744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17939691245555878, + "rewards/margins": 0.31711509823799133, + "rewards/rejected": -0.4965119957923889, + "step": 2270 + }, + { + "epoch": 0.61, + "learning_rate": 1.997305197135089e-06, + "logits/chosen": -1.557839274406433, + "logits/rejected": -0.9444772601127625, + "logps/chosen": -631.0563354492188, + "logps/rejected": -1341.0086669921875, + "loss": 0.0733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16753628849983215, + "rewards/margins": 0.30379122495651245, + "rewards/rejected": -0.4713274836540222, + "step": 2280 + }, + { + "epoch": 0.61, + "learning_rate": 1.9745315534350157e-06, + "logits/chosen": -1.2147436141967773, + "logits/rejected": -0.681081235408783, + "logps/chosen": -712.3760986328125, + "logps/rejected": -1318.22265625, + "loss": 0.0959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23314671218395233, + "rewards/margins": 0.2763899266719818, + "rewards/rejected": -0.509536623954773, + "step": 2290 + }, + { + "epoch": 0.61, + "learning_rate": 1.9518034395302413e-06, + "logits/chosen": -1.6583919525146484, + "logits/rejected": -1.0387184619903564, + "logps/chosen": -607.9590454101562, + "logps/rejected": -1096.9937744140625, + "loss": 0.1013, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1655300408601761, + "rewards/margins": 0.24450743198394775, + "rewards/rejected": -0.41003745794296265, + "step": 2300 + }, + { + "epoch": 0.62, + "learning_rate": 1.9291228247233607e-06, + "logits/chosen": -1.4778130054473877, + "logits/rejected": -0.7460058331489563, + "logps/chosen": -651.7698974609375, + "logps/rejected": -1243.669921875, + "loss": 0.0769, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1902499794960022, + "rewards/margins": 0.2825292944908142, + "rewards/rejected": -0.4727793335914612, + "step": 2310 + }, + { + "epoch": 0.62, + "learning_rate": 1.9064916742013515e-06, + "logits/chosen": -1.3330655097961426, + "logits/rejected": -0.9275220036506653, + "logps/chosen": -523.3305053710938, + "logps/rejected": -1225.2471923828125, + "loss": 0.0641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15531578660011292, + "rewards/margins": 0.31203147768974304, + "rewards/rejected": -0.46734723448753357, + "step": 2320 + }, + { + "epoch": 0.62, + "learning_rate": 1.883911948865306e-06, + "logits/chosen": -1.3421717882156372, + "logits/rejected": -1.1515899896621704, + "logps/chosen": -492.34246826171875, + "logps/rejected": -1202.8974609375, + "loss": 0.0831, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15640634298324585, + "rewards/margins": 0.2934826612472534, + "rewards/rejected": -0.44988900423049927, + "step": 2330 + }, + { + "epoch": 0.62, + "learning_rate": 1.8613856051605242e-06, + "logits/chosen": -1.4181009531021118, + "logits/rejected": -0.8086174130439758, + "logps/chosen": -602.2988891601562, + "logps/rejected": -1168.888916015625, + "loss": 0.07, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15534570813179016, + "rewards/margins": 0.2883809208869934, + "rewards/rejected": -0.44372662901878357, + "step": 2340 + }, + { + "epoch": 0.63, + "learning_rate": 1.8389145949069953e-06, + "logits/chosen": -1.6121231317520142, + "logits/rejected": -0.8654192090034485, + "logps/chosen": -598.3814697265625, + "logps/rejected": -1284.2470703125, + "loss": 0.0595, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14926204085350037, + "rewards/margins": 0.3212565779685974, + "rewards/rejected": -0.47051864862442017, + "step": 2350 + }, + { + "epoch": 0.63, + "learning_rate": 1.816500865130279e-06, + "logits/chosen": -1.4523346424102783, + "logits/rejected": -0.9201906323432922, + "logps/chosen": -600.6221923828125, + "logps/rejected": -1303.9447021484375, + "loss": 0.0641, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17756803333759308, + "rewards/margins": 0.3046211898326874, + "rewards/rejected": -0.48218923807144165, + "step": 2360 + }, + { + "epoch": 0.63, + "learning_rate": 1.7941463578928088e-06, + "logits/chosen": -1.5082757472991943, + "logits/rejected": -0.9013730883598328, + "logps/chosen": -610.1536254882812, + "logps/rejected": -1315.129638671875, + "loss": 0.0639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.176839679479599, + "rewards/margins": 0.3339093327522278, + "rewards/rejected": -0.5107490420341492, + "step": 2370 + }, + { + "epoch": 0.63, + "learning_rate": 1.7718530101256115e-06, + "logits/chosen": -1.6840633153915405, + "logits/rejected": -0.9501806497573853, + "logps/chosen": -662.0902709960938, + "logps/rejected": -1296.606689453125, + "loss": 0.0698, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17285804450511932, + "rewards/margins": 0.3210682272911072, + "rewards/rejected": -0.4939262866973877, + "step": 2380 + }, + { + "epoch": 0.64, + "learning_rate": 1.7496227534604859e-06, + "logits/chosen": -1.4562785625457764, + "logits/rejected": -1.0628981590270996, + "logps/chosen": -594.0131225585938, + "logps/rejected": -1322.223876953125, + "loss": 0.0512, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.185628280043602, + "rewards/margins": 0.31836962699890137, + "rewards/rejected": -0.5039979219436646, + "step": 2390 + }, + { + "epoch": 0.64, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -1.622641921043396, + "logits/rejected": -0.7408018112182617, + "logps/chosen": -671.2892456054688, + "logps/rejected": -1338.8228759765625, + "loss": 0.0585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19210806488990784, + "rewards/margins": 0.3319090008735657, + "rewards/rejected": -0.5240170359611511, + "step": 2400 + }, + { + "epoch": 0.64, + "learning_rate": 1.7053592124637557e-06, + "logits/chosen": -1.6081740856170654, + "logits/rejected": -0.7799841165542603, + "logps/chosen": -656.5260009765625, + "logps/rejected": -1301.16650390625, + "loss": 0.0568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21345119178295135, + "rewards/margins": 0.30246636271476746, + "rewards/rejected": -0.5159175992012024, + "step": 2410 + }, + { + "epoch": 0.65, + "learning_rate": 1.6833297633956647e-06, + "logits/chosen": -1.5897592306137085, + "logits/rejected": -0.830175518989563, + "logps/chosen": -643.302734375, + "logps/rejected": -1318.0699462890625, + "loss": 0.0593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17795221507549286, + "rewards/margins": 0.332580029964447, + "rewards/rejected": -0.5105322599411011, + "step": 2420 + }, + { + "epoch": 0.65, + "learning_rate": 1.661371075624363e-06, + "logits/chosen": -1.5620427131652832, + "logits/rejected": -1.090867519378662, + "logps/chosen": -677.5968627929688, + "logps/rejected": -1375.831787109375, + "loss": 0.0626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.185464009642601, + "rewards/margins": 0.3168545365333557, + "rewards/rejected": -0.5023185014724731, + "step": 2430 + }, + { + "epoch": 0.65, + "learning_rate": 1.6394850517846621e-06, + "logits/chosen": -1.4541980028152466, + "logits/rejected": -0.972217857837677, + "logps/chosen": -705.5538940429688, + "logps/rejected": -1215.7884521484375, + "loss": 0.1066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19816702604293823, + "rewards/margins": 0.24639299511909485, + "rewards/rejected": -0.4445599615573883, + "step": 2440 + }, + { + "epoch": 0.65, + "learning_rate": 1.6176735882153284e-06, + "logits/chosen": -1.5021053552627563, + "logits/rejected": -0.8954359292984009, + "logps/chosen": -642.7943115234375, + "logps/rejected": -1351.9677734375, + "loss": 0.0746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18909773230552673, + "rewards/margins": 0.3134486675262451, + "rewards/rejected": -0.5025463104248047, + "step": 2450 + }, + { + "epoch": 0.66, + "learning_rate": 1.5959385747947697e-06, + "logits/chosen": -1.4700965881347656, + "logits/rejected": -0.7698783874511719, + "logps/chosen": -592.5593872070312, + "logps/rejected": -1228.227783203125, + "loss": 0.055, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16944539546966553, + "rewards/margins": 0.2940993309020996, + "rewards/rejected": -0.46354469656944275, + "step": 2460 + }, + { + "epoch": 0.66, + "learning_rate": 1.5742818947772875e-06, + "logits/chosen": -1.6665054559707642, + "logits/rejected": -0.948663592338562, + "logps/chosen": -769.6544799804688, + "logps/rejected": -1263.137451171875, + "loss": 0.1006, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.24728581309318542, + "rewards/margins": 0.25588518381118774, + "rewards/rejected": -0.5031709671020508, + "step": 2470 + }, + { + "epoch": 0.66, + "learning_rate": 1.552705424629898e-06, + "logits/chosen": -1.4320557117462158, + "logits/rejected": -0.8846480250358582, + "logps/chosen": -672.8453369140625, + "logps/rejected": -1423.1922607421875, + "loss": 0.0535, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1967121660709381, + "rewards/margins": 0.31136855483055115, + "rewards/rejected": -0.508080780506134, + "step": 2480 + }, + { + "epoch": 0.66, + "learning_rate": 1.5312110338697427e-06, + "logits/chosen": -1.596573829650879, + "logits/rejected": -0.9393990635871887, + "logps/chosen": -719.8648681640625, + "logps/rejected": -1458.17919921875, + "loss": 0.0633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19690574705600739, + "rewards/margins": 0.33237752318382263, + "rewards/rejected": -0.5292832851409912, + "step": 2490 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.6579780578613281, + "logits/rejected": -1.0232713222503662, + "logps/chosen": -629.9151611328125, + "logps/rejected": -1246.3726806640625, + "loss": 0.0641, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1578701287508011, + "rewards/margins": 0.3020227253437042, + "rewards/rejected": -0.45989280939102173, + "step": 2500 + }, + { + "epoch": 0.67, + "learning_rate": 1.4884759328590476e-06, + "logits/chosen": -1.6255989074707031, + "logits/rejected": -0.9438311457633972, + "logps/chosen": -571.7470703125, + "logps/rejected": -1234.297119140625, + "loss": 0.0744, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15460793673992157, + "rewards/margins": 0.3083663582801819, + "rewards/rejected": -0.4629742503166199, + "step": 2510 + }, + { + "epoch": 0.67, + "learning_rate": 1.467238925438646e-06, + "logits/chosen": -1.4770749807357788, + "logits/rejected": -0.936165452003479, + "logps/chosen": -617.3800048828125, + "logps/rejected": -1241.4713134765625, + "loss": 0.0783, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15939846634864807, + "rewards/margins": 0.28489136695861816, + "rewards/rejected": -0.44428983330726624, + "step": 2520 + }, + { + "epoch": 0.67, + "learning_rate": 1.446091402744923e-06, + "logits/chosen": -1.6321996450424194, + "logits/rejected": -1.2910716533660889, + "logps/chosen": -621.7635498046875, + "logps/rejected": -1340.450439453125, + "loss": 0.0587, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16068853437900543, + "rewards/margins": 0.31573906540870667, + "rewards/rejected": -0.4764275550842285, + "step": 2530 + }, + { + "epoch": 0.68, + "learning_rate": 1.4250351971283937e-06, + "logits/chosen": -1.790464162826538, + "logits/rejected": -0.8707693219184875, + "logps/chosen": -630.2117919921875, + "logps/rejected": -1443.947509765625, + "loss": 0.0503, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.12107650935649872, + "rewards/margins": 0.3619995713233948, + "rewards/rejected": -0.4830760955810547, + "step": 2540 + }, + { + "epoch": 0.68, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -1.4005483388900757, + "logits/rejected": -0.8707137107849121, + "logps/chosen": -572.6295166015625, + "logps/rejected": -1228.290283203125, + "loss": 0.0709, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13189435005187988, + "rewards/margins": 0.30900174379348755, + "rewards/rejected": -0.44089609384536743, + "step": 2550 + }, + { + "epoch": 0.68, + "learning_rate": 1.3832040268095589e-06, + "logits/chosen": -1.5691678524017334, + "logits/rejected": -0.9897274971008301, + "logps/chosen": -598.7499389648438, + "logps/rejected": -1142.5048828125, + "loss": 0.0866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1460307091474533, + "rewards/margins": 0.24934545159339905, + "rewards/rejected": -0.39537614583969116, + "step": 2560 + }, + { + "epoch": 0.69, + "learning_rate": 1.362432686615316e-06, + "logits/chosen": -1.69058358669281, + "logits/rejected": -1.319437861442566, + "logps/chosen": -554.8807373046875, + "logps/rejected": -1080.8026123046875, + "loss": 0.0886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13216087222099304, + "rewards/margins": 0.22610945999622345, + "rewards/rejected": -0.3582703471183777, + "step": 2570 + }, + { + "epoch": 0.69, + "learning_rate": 1.3417599122003464e-06, + "logits/chosen": -1.4841969013214111, + "logits/rejected": -0.8709270358085632, + "logps/chosen": -590.7588500976562, + "logps/rejected": -1269.95556640625, + "loss": 0.0861, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.156617671251297, + "rewards/margins": 0.2556094229221344, + "rewards/rejected": -0.4122270941734314, + "step": 2580 + }, + { + "epoch": 0.69, + "learning_rate": 1.3211874947800747e-06, + "logits/chosen": -1.7067668437957764, + "logits/rejected": -0.8970105051994324, + "logps/chosen": -637.1722412109375, + "logps/rejected": -1236.7254638671875, + "loss": 0.0759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15255072712898254, + "rewards/margins": 0.2996431291103363, + "rewards/rejected": -0.45219388604164124, + "step": 2590 + }, + { + "epoch": 0.69, + "learning_rate": 1.3007172168743854e-06, + "logits/chosen": -1.722249984741211, + "logits/rejected": -0.9102805256843567, + "logps/chosen": -568.446533203125, + "logps/rejected": -1270.4002685546875, + "loss": 0.0711, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13264772295951843, + "rewards/margins": 0.33025822043418884, + "rewards/rejected": -0.4629059433937073, + "step": 2600 + }, + { + "epoch": 0.7, + "learning_rate": 1.280350852153168e-06, + "logits/chosen": -1.4858075380325317, + "logits/rejected": -1.0102977752685547, + "logps/chosen": -668.56201171875, + "logps/rejected": -1335.7791748046875, + "loss": 0.0851, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20782490074634552, + "rewards/margins": 0.29307177662849426, + "rewards/rejected": -0.5008966326713562, + "step": 2610 + }, + { + "epoch": 0.7, + "learning_rate": 1.260090165282645e-06, + "logits/chosen": -1.3048267364501953, + "logits/rejected": -0.79096519947052, + "logps/chosen": -672.3717041015625, + "logps/rejected": -1276.05224609375, + "loss": 0.091, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2136264592409134, + "rewards/margins": 0.26162099838256836, + "rewards/rejected": -0.47524747252464294, + "step": 2620 + }, + { + "epoch": 0.7, + "learning_rate": 1.2399369117724582e-06, + "logits/chosen": -1.5877363681793213, + "logits/rejected": -0.9525176882743835, + "logps/chosen": -706.8630981445312, + "logps/rejected": -1383.3653564453125, + "loss": 0.0601, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2066899538040161, + "rewards/margins": 0.30500850081443787, + "rewards/rejected": -0.5116984248161316, + "step": 2630 + }, + { + "epoch": 0.7, + "learning_rate": 1.2198928378235717e-06, + "logits/chosen": -1.7394685745239258, + "logits/rejected": -1.1289197206497192, + "logps/chosen": -770.9951171875, + "logps/rejected": -1358.626220703125, + "loss": 0.0663, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2261141985654831, + "rewards/margins": 0.2788589596748352, + "rewards/rejected": -0.5049731135368347, + "step": 2640 + }, + { + "epoch": 0.71, + "learning_rate": 1.1999596801769617e-06, + "logits/chosen": -1.5435702800750732, + "logits/rejected": -1.0047051906585693, + "logps/chosen": -634.2347412109375, + "logps/rejected": -1342.857177734375, + "loss": 0.0383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18071284890174866, + "rewards/margins": 0.3370632231235504, + "rewards/rejected": -0.5177761316299438, + "step": 2650 + }, + { + "epoch": 0.71, + "learning_rate": 1.1801391659631423e-06, + "logits/chosen": -1.4378631114959717, + "logits/rejected": -1.1336383819580078, + "logps/chosen": -650.482177734375, + "logps/rejected": -1228.133056640625, + "loss": 0.1003, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21179573237895966, + "rewards/margins": 0.25491851568222046, + "rewards/rejected": -0.46671420335769653, + "step": 2660 + }, + { + "epoch": 0.71, + "learning_rate": 1.160433012552508e-06, + "logits/chosen": -1.5026006698608398, + "logits/rejected": -1.0078703165054321, + "logps/chosen": -657.7984619140625, + "logps/rejected": -1241.7266845703125, + "loss": 0.0907, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18873688578605652, + "rewards/margins": 0.2727685272693634, + "rewards/rejected": -0.4615054130554199, + "step": 2670 + }, + { + "epoch": 0.71, + "learning_rate": 1.1408429274065418e-06, + "logits/chosen": -1.5700013637542725, + "logits/rejected": -1.132730484008789, + "logps/chosen": -637.8956298828125, + "logps/rejected": -1278.509033203125, + "loss": 0.0772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19238412380218506, + "rewards/margins": 0.2825482189655304, + "rewards/rejected": -0.47493234276771545, + "step": 2680 + }, + { + "epoch": 0.72, + "learning_rate": 1.1213706079298566e-06, + "logits/chosen": -1.2392457723617554, + "logits/rejected": -0.6231773495674133, + "logps/chosen": -654.0189208984375, + "logps/rejected": -1259.2901611328125, + "loss": 0.0774, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2084679901599884, + "rewards/margins": 0.2823956310749054, + "rewards/rejected": -0.4908636212348938, + "step": 2690 + }, + { + "epoch": 0.72, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -1.4837418794631958, + "logits/rejected": -1.0266939401626587, + "logps/chosen": -575.24169921875, + "logps/rejected": -1292.1416015625, + "loss": 0.0814, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17716486752033234, + "rewards/margins": 0.29848557710647583, + "rewards/rejected": -0.4756503999233246, + "step": 2700 + }, + { + "epoch": 0.72, + "learning_rate": 1.0827860044369226e-06, + "logits/chosen": -1.7130857706069946, + "logits/rejected": -1.1947839260101318, + "logps/chosen": -707.26171875, + "logps/rejected": -1313.96484375, + "loss": 0.0685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20284132659435272, + "rewards/margins": 0.28745827078819275, + "rewards/rejected": -0.49029961228370667, + "step": 2710 + }, + { + "epoch": 0.73, + "learning_rate": 1.06367706362636e-06, + "logits/chosen": -1.60333251953125, + "logits/rejected": -1.095365047454834, + "logps/chosen": -590.465576171875, + "logps/rejected": -1228.563720703125, + "loss": 0.0736, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1857512891292572, + "rewards/margins": 0.2781633734703064, + "rewards/rejected": -0.463914692401886, + "step": 2720 + }, + { + "epoch": 0.73, + "learning_rate": 1.0446925746067768e-06, + "logits/chosen": -1.579377293586731, + "logits/rejected": -0.8978742361068726, + "logps/chosen": -583.1848754882812, + "logps/rejected": -1341.265380859375, + "loss": 0.0439, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1685558259487152, + "rewards/margins": 0.35025161504745483, + "rewards/rejected": -0.5188074111938477, + "step": 2730 + }, + { + "epoch": 0.73, + "learning_rate": 1.0258341823102418e-06, + "logits/chosen": -1.5291283130645752, + "logits/rejected": -1.0524482727050781, + "logps/chosen": -632.1546020507812, + "logps/rejected": -1308.994384765625, + "loss": 0.0785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19686022400856018, + "rewards/margins": 0.2875005602836609, + "rewards/rejected": -0.48436084389686584, + "step": 2740 + }, + { + "epoch": 0.73, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -1.6773452758789062, + "logits/rejected": -0.8837090730667114, + "logps/chosen": -650.5315551757812, + "logps/rejected": -1348.764404296875, + "loss": 0.0673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16877111792564392, + "rewards/margins": 0.34151607751846313, + "rewards/rejected": -0.5102871656417847, + "step": 2750 + }, + { + "epoch": 0.74, + "learning_rate": 9.88502212844063e-07, + "logits/chosen": -1.2815361022949219, + "logits/rejected": -0.6873558163642883, + "logps/chosen": -566.9096069335938, + "logps/rejected": -1250.682373046875, + "loss": 0.0637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18210506439208984, + "rewards/margins": 0.3151671588420868, + "rewards/rejected": -0.497272253036499, + "step": 2760 + }, + { + "epoch": 0.74, + "learning_rate": 9.700318703442437e-07, + "logits/chosen": -1.480957269668579, + "logits/rejected": -1.109178066253662, + "logps/chosen": -599.2630615234375, + "logps/rejected": -1315.3450927734375, + "loss": 0.0725, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16557563841342926, + "rewards/margins": 0.32593974471092224, + "rewards/rejected": -0.49151545763015747, + "step": 2770 + }, + { + "epoch": 0.74, + "learning_rate": 9.516940936268504e-07, + "logits/chosen": -1.5238648653030396, + "logits/rejected": -0.9741169214248657, + "logps/chosen": -644.1940307617188, + "logps/rejected": -1286.3187255859375, + "loss": 0.0661, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1868639439344406, + "rewards/margins": 0.286797434091568, + "rewards/rejected": -0.4736614227294922, + "step": 2780 + }, + { + "epoch": 0.74, + "learning_rate": 9.334904715888496e-07, + "logits/chosen": -1.5936267375946045, + "logits/rejected": -1.060530424118042, + "logps/chosen": -593.8538208007812, + "logps/rejected": -1255.338623046875, + "loss": 0.0778, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17814789712429047, + "rewards/margins": 0.27660074830055237, + "rewards/rejected": -0.45474863052368164, + "step": 2790 + }, + { + "epoch": 0.75, + "learning_rate": 9.154225815032242e-07, + "logits/chosen": -1.590341329574585, + "logits/rejected": -0.7449840903282166, + "logps/chosen": -610.4566650390625, + "logps/rejected": -1260.689208984375, + "loss": 0.0591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1576831042766571, + "rewards/margins": 0.33318689465522766, + "rewards/rejected": -0.49086999893188477, + "step": 2800 + }, + { + "epoch": 0.75, + "learning_rate": 8.974919888823164e-07, + "logits/chosen": -1.3408777713775635, + "logits/rejected": -0.7836991548538208, + "logps/chosen": -589.0643920898438, + "logps/rejected": -1239.4324951171875, + "loss": 0.0914, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15606389939785004, + "rewards/margins": 0.29545897245407104, + "rewards/rejected": -0.45152291655540466, + "step": 2810 + }, + { + "epoch": 0.75, + "learning_rate": 8.797002473421729e-07, + "logits/chosen": -1.6109354496002197, + "logits/rejected": -1.0465881824493408, + "logps/chosen": -709.0189208984375, + "logps/rejected": -1308.754638671875, + "loss": 0.0682, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20454378426074982, + "rewards/margins": 0.263343870639801, + "rewards/rejected": -0.4678876996040344, + "step": 2820 + }, + { + "epoch": 0.75, + "learning_rate": 8.620488984679378e-07, + "logits/chosen": -1.621872901916504, + "logits/rejected": -0.963768482208252, + "logps/chosen": -608.9706420898438, + "logps/rejected": -1198.4373779296875, + "loss": 0.0642, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16753293573856354, + "rewards/margins": 0.2994327247142792, + "rewards/rejected": -0.4669656753540039, + "step": 2830 + }, + { + "epoch": 0.76, + "learning_rate": 8.445394716802754e-07, + "logits/chosen": -1.4504549503326416, + "logits/rejected": -0.7841233015060425, + "logps/chosen": -669.5154418945312, + "logps/rejected": -1335.7557373046875, + "loss": 0.0643, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19036135077476501, + "rewards/margins": 0.3035343289375305, + "rewards/rejected": -0.49389567971229553, + "step": 2840 + }, + { + "epoch": 0.76, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -1.371618390083313, + "logits/rejected": -0.9690683484077454, + "logps/chosen": -578.9434814453125, + "logps/rejected": -1268.2574462890625, + "loss": 0.0852, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17845505475997925, + "rewards/margins": 0.30898019671440125, + "rewards/rejected": -0.4874352812767029, + "step": 2850 + }, + { + "epoch": 0.76, + "learning_rate": 8.099524404308948e-07, + "logits/chosen": -1.3634693622589111, + "logits/rejected": -1.2364251613616943, + "logps/chosen": -655.4860229492188, + "logps/rejected": -1382.656005859375, + "loss": 0.0849, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21539123356342316, + "rewards/margins": 0.2801755666732788, + "rewards/rejected": -0.49556678533554077, + "step": 2860 + }, + { + "epoch": 0.77, + "learning_rate": 7.928778328007918e-07, + "logits/chosen": -1.6657747030258179, + "logits/rejected": -1.1524592638015747, + "logps/chosen": -609.6968994140625, + "logps/rejected": -1228.130859375, + "loss": 0.1019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18815730512142181, + "rewards/margins": 0.2728883922100067, + "rewards/rejected": -0.46104568243026733, + "step": 2870 + }, + { + "epoch": 0.77, + "learning_rate": 7.759511406608255e-07, + "logits/chosen": -1.521481990814209, + "logits/rejected": -1.0144175291061401, + "logps/chosen": -666.2223510742188, + "logps/rejected": -1294.4681396484375, + "loss": 0.0567, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1811828464269638, + "rewards/margins": 0.2964962124824524, + "rewards/rejected": -0.4776790142059326, + "step": 2880 + }, + { + "epoch": 0.77, + "learning_rate": 7.591738306429769e-07, + "logits/chosen": -1.5954043865203857, + "logits/rejected": -0.9687323570251465, + "logps/chosen": -608.4232177734375, + "logps/rejected": -1266.69482421875, + "loss": 0.0699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1722910851240158, + "rewards/margins": 0.308040052652359, + "rewards/rejected": -0.4803311228752136, + "step": 2890 + }, + { + "epoch": 0.77, + "learning_rate": 7.425473564358457e-07, + "logits/chosen": -1.4824254512786865, + "logits/rejected": -0.898513913154602, + "logps/chosen": -605.569091796875, + "logps/rejected": -1327.9998779296875, + "loss": 0.0638, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16081413626670837, + "rewards/margins": 0.3248310089111328, + "rewards/rejected": -0.4856451451778412, + "step": 2900 + }, + { + "epoch": 0.78, + "learning_rate": 7.260731586586983e-07, + "logits/chosen": -1.2875173091888428, + "logits/rejected": -0.9592302441596985, + "logps/chosen": -512.1587524414062, + "logps/rejected": -1235.3233642578125, + "loss": 0.0696, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13009069859981537, + "rewards/margins": 0.3230075538158417, + "rewards/rejected": -0.45309823751449585, + "step": 2910 + }, + { + "epoch": 0.78, + "learning_rate": 7.097526647366379e-07, + "logits/chosen": -1.526106357574463, + "logits/rejected": -0.9157294034957886, + "logps/chosen": -611.5623779296875, + "logps/rejected": -1331.778076171875, + "loss": 0.0441, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1644577980041504, + "rewards/margins": 0.3466406762599945, + "rewards/rejected": -0.5110985040664673, + "step": 2920 + }, + { + "epoch": 0.78, + "learning_rate": 6.935872887769299e-07, + "logits/chosen": -1.4740724563598633, + "logits/rejected": -1.2198327779769897, + "logps/chosen": -515.4468994140625, + "logps/rejected": -1197.4677734375, + "loss": 0.0739, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13031090795993805, + "rewards/margins": 0.29615822434425354, + "rewards/rejected": -0.42646917700767517, + "step": 2930 + }, + { + "epoch": 0.78, + "learning_rate": 6.775784314464717e-07, + "logits/chosen": -1.691954255104065, + "logits/rejected": -1.2317649126052856, + "logps/chosen": -551.2989501953125, + "logps/rejected": -1318.2874755859375, + "loss": 0.0574, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15544219315052032, + "rewards/margins": 0.31455904245376587, + "rewards/rejected": -0.4700012803077698, + "step": 2940 + }, + { + "epoch": 0.79, + "learning_rate": 6.617274798504286e-07, + "logits/chosen": -1.5783944129943848, + "logits/rejected": -0.9322364926338196, + "logps/chosen": -619.6995849609375, + "logps/rejected": -1310.8206787109375, + "loss": 0.0682, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13858993351459503, + "rewards/margins": 0.35597696900367737, + "rewards/rejected": -0.4945669174194336, + "step": 2950 + }, + { + "epoch": 0.79, + "learning_rate": 6.460358074120518e-07, + "logits/chosen": -1.6260960102081299, + "logits/rejected": -1.0405943393707275, + "logps/chosen": -588.0743408203125, + "logps/rejected": -1362.0108642578125, + "loss": 0.037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14083652198314667, + "rewards/margins": 0.32858163118362427, + "rewards/rejected": -0.46941813826560974, + "step": 2960 + }, + { + "epoch": 0.79, + "learning_rate": 6.305047737536707e-07, + "logits/chosen": -1.509161353111267, + "logits/rejected": -1.0090100765228271, + "logps/chosen": -575.589599609375, + "logps/rejected": -1260.2518310546875, + "loss": 0.0557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16428951919078827, + "rewards/margins": 0.31566599011421204, + "rewards/rejected": -0.4799554944038391, + "step": 2970 + }, + { + "epoch": 0.79, + "learning_rate": 6.151357245788917e-07, + "logits/chosen": -1.3317458629608154, + "logits/rejected": -0.9225482940673828, + "logps/chosen": -795.397705078125, + "logps/rejected": -1355.7197265625, + "loss": 0.0627, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2432664930820465, + "rewards/margins": 0.2584363520145416, + "rewards/rejected": -0.5017029047012329, + "step": 2980 + }, + { + "epoch": 0.8, + "learning_rate": 5.999299915559956e-07, + "logits/chosen": -1.482460379600525, + "logits/rejected": -1.0576423406600952, + "logps/chosen": -619.4547119140625, + "logps/rejected": -1274.428955078125, + "loss": 0.0574, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16743937134742737, + "rewards/margins": 0.3212641477584839, + "rewards/rejected": -0.48870354890823364, + "step": 2990 + }, + { + "epoch": 0.8, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -1.5611276626586914, + "logits/rejected": -1.225208044052124, + "logps/chosen": -633.8873901367188, + "logps/rejected": -1392.9622802734375, + "loss": 0.0583, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15855436027050018, + "rewards/margins": 0.31555289030075073, + "rewards/rejected": -0.4741072654724121, + "step": 3000 + }, + { + "epoch": 0.8, + "learning_rate": 5.700137297712749e-07, + "logits/chosen": -1.5436227321624756, + "logits/rejected": -0.7761337161064148, + "logps/chosen": -619.49609375, + "logps/rejected": -1352.654052734375, + "loss": 0.0673, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16255274415016174, + "rewards/margins": 0.3418361246585846, + "rewards/rejected": -0.5043889284133911, + "step": 3010 + }, + { + "epoch": 0.81, + "learning_rate": 5.553057931370729e-07, + "logits/chosen": -1.5770825147628784, + "logits/rejected": -0.8190025091171265, + "logps/chosen": -633.076416015625, + "logps/rejected": -1185.306396484375, + "loss": 0.0764, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14611086249351501, + "rewards/margins": 0.2796880602836609, + "rewards/rejected": -0.4257989525794983, + "step": 3020 + }, + { + "epoch": 0.81, + "learning_rate": 5.407663566854008e-07, + "logits/chosen": -1.6848455667495728, + "logits/rejected": -0.7886224985122681, + "logps/chosen": -664.6588745117188, + "logps/rejected": -1323.16064453125, + "loss": 0.0494, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15056583285331726, + "rewards/margins": 0.34144124388694763, + "rewards/rejected": -0.4920070767402649, + "step": 3030 + }, + { + "epoch": 0.81, + "learning_rate": 5.263966802018275e-07, + "logits/chosen": -1.736999750137329, + "logits/rejected": -0.8889113664627075, + "logps/chosen": -563.3355712890625, + "logps/rejected": -1182.220947265625, + "loss": 0.0538, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10529695451259613, + "rewards/margins": 0.3400992751121521, + "rewards/rejected": -0.4453962445259094, + "step": 3040 + }, + { + "epoch": 0.81, + "learning_rate": 5.121980087628802e-07, + "logits/chosen": -1.3477979898452759, + "logits/rejected": -0.9392274618148804, + "logps/chosen": -666.6458740234375, + "logps/rejected": -1304.849853515625, + "loss": 0.0825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18823906779289246, + "rewards/margins": 0.28339654207229614, + "rewards/rejected": -0.471635639667511, + "step": 3050 + }, + { + "epoch": 0.82, + "learning_rate": 4.981715726281666e-07, + "logits/chosen": -1.468207597732544, + "logits/rejected": -0.750462532043457, + "logps/chosen": -549.2374877929688, + "logps/rejected": -1129.191650390625, + "loss": 0.0837, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1359584927558899, + "rewards/margins": 0.2849787175655365, + "rewards/rejected": -0.4209372103214264, + "step": 3060 + }, + { + "epoch": 0.82, + "learning_rate": 4.843185871337722e-07, + "logits/chosen": -1.5334383249282837, + "logits/rejected": -1.020437240600586, + "logps/chosen": -542.3077392578125, + "logps/rejected": -1162.928955078125, + "loss": 0.0704, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14567705988883972, + "rewards/margins": 0.2968258261680603, + "rewards/rejected": -0.4425028860569, + "step": 3070 + }, + { + "epoch": 0.82, + "learning_rate": 4.706402525869633e-07, + "logits/chosen": -1.614105463027954, + "logits/rejected": -0.9975967407226562, + "logps/chosen": -592.6704711914062, + "logps/rejected": -1195.4998779296875, + "loss": 0.0767, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14713343977928162, + "rewards/margins": 0.293200820684433, + "rewards/rejected": -0.4403342306613922, + "step": 3080 + }, + { + "epoch": 0.82, + "learning_rate": 4.5713775416217884e-07, + "logits/chosen": -1.5890741348266602, + "logits/rejected": -1.1050251722335815, + "logps/chosen": -633.64990234375, + "logps/rejected": -1316.699462890625, + "loss": 0.0565, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15724198520183563, + "rewards/margins": 0.3219819962978363, + "rewards/rejected": -0.47922396659851074, + "step": 3090 + }, + { + "epoch": 0.83, + "learning_rate": 4.438122617983442e-07, + "logits/chosen": -1.4880859851837158, + "logits/rejected": -1.0814130306243896, + "logps/chosen": -556.4613647460938, + "logps/rejected": -1175.9158935546875, + "loss": 0.0847, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1583164930343628, + "rewards/margins": 0.27797532081604004, + "rewards/rejected": -0.43629178404808044, + "step": 3100 + }, + { + "epoch": 0.83, + "learning_rate": 4.3066493009749853e-07, + "logits/chosen": -1.5819056034088135, + "logits/rejected": -0.8545023798942566, + "logps/chosen": -625.5091552734375, + "logps/rejected": -1260.5357666015625, + "loss": 0.056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14933553338050842, + "rewards/margins": 0.3165150284767151, + "rewards/rejected": -0.4658505916595459, + "step": 3110 + }, + { + "epoch": 0.83, + "learning_rate": 4.1769689822475147e-07, + "logits/chosen": -1.4295012950897217, + "logits/rejected": -0.9826697111129761, + "logps/chosen": -595.7457885742188, + "logps/rejected": -1152.125, + "loss": 0.0926, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1720040887594223, + "rewards/margins": 0.25839370489120483, + "rewards/rejected": -0.4303978383541107, + "step": 3120 + }, + { + "epoch": 0.83, + "learning_rate": 4.049092898095816e-07, + "logits/chosen": -1.69967782497406, + "logits/rejected": -1.0549393892288208, + "logps/chosen": -673.6993408203125, + "logps/rejected": -1231.6971435546875, + "loss": 0.0544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1993969976902008, + "rewards/margins": 0.26704469323158264, + "rewards/rejected": -0.46644172072410583, + "step": 3130 + }, + { + "epoch": 0.84, + "learning_rate": 3.9230321284847856e-07, + "logits/chosen": -1.347398042678833, + "logits/rejected": -0.7797093391418457, + "logps/chosen": -616.3978271484375, + "logps/rejected": -1361.00732421875, + "loss": 0.0547, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1867767572402954, + "rewards/margins": 0.33997079730033875, + "rewards/rejected": -0.5267475247383118, + "step": 3140 + }, + { + "epoch": 0.84, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.627018928527832, + "logits/rejected": -0.8644296526908875, + "logps/chosen": -735.9021606445312, + "logps/rejected": -1377.241455078125, + "loss": 0.0526, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19497773051261902, + "rewards/margins": 0.33351653814315796, + "rewards/rejected": -0.5284942388534546, + "step": 3150 + }, + { + "epoch": 0.84, + "learning_rate": 3.6764000653481263e-07, + "logits/chosen": -1.6354873180389404, + "logits/rejected": -0.8436982035636902, + "logps/chosen": -638.96728515625, + "logps/rejected": -1231.9659423828125, + "loss": 0.0799, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19029106199741364, + "rewards/margins": 0.27815455198287964, + "rewards/rejected": -0.4684455990791321, + "step": 3160 + }, + { + "epoch": 0.85, + "learning_rate": 3.555850141530659e-07, + "logits/chosen": -1.9651466608047485, + "logits/rejected": -1.0768983364105225, + "logps/chosen": -751.0382690429688, + "logps/rejected": -1339.02783203125, + "loss": 0.0791, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1946159154176712, + "rewards/margins": 0.29727649688720703, + "rewards/rejected": -0.49189239740371704, + "step": 3170 + }, + { + "epoch": 0.85, + "learning_rate": 3.4371582698185636e-07, + "logits/chosen": -1.511791467666626, + "logits/rejected": -1.1664907932281494, + "logps/chosen": -508.4298400878906, + "logps/rejected": -1147.3773193359375, + "loss": 0.0805, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.149129718542099, + "rewards/margins": 0.2729035019874573, + "rewards/rejected": -0.4220332205295563, + "step": 3180 + }, + { + "epoch": 0.85, + "learning_rate": 3.3203347344004737e-07, + "logits/chosen": -1.4468176364898682, + "logits/rejected": -1.2824242115020752, + "logps/chosen": -515.1135864257812, + "logps/rejected": -1145.444580078125, + "loss": 0.1051, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1785714328289032, + "rewards/margins": 0.23533880710601807, + "rewards/rejected": -0.41391023993492126, + "step": 3190 + }, + { + "epoch": 0.85, + "learning_rate": 3.2053896575809426e-07, + "logits/chosen": -1.5166642665863037, + "logits/rejected": -0.9769018292427063, + "logps/chosen": -721.0452880859375, + "logps/rejected": -1224.28125, + "loss": 0.0988, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19355639815330505, + "rewards/margins": 0.24941392242908478, + "rewards/rejected": -0.442970335483551, + "step": 3200 + }, + { + "epoch": 0.86, + "learning_rate": 3.092332998903416e-07, + "logits/chosen": -1.5955547094345093, + "logits/rejected": -0.8334843516349792, + "logps/chosen": -624.3068237304688, + "logps/rejected": -1299.734130859375, + "loss": 0.0705, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1941680610179901, + "rewards/margins": 0.31897804141044617, + "rewards/rejected": -0.5131461024284363, + "step": 3210 + }, + { + "epoch": 0.86, + "learning_rate": 2.981174554287239e-07, + "logits/chosen": -1.361081600189209, + "logits/rejected": -0.7655187845230103, + "logps/chosen": -657.2144775390625, + "logps/rejected": -1261.987548828125, + "loss": 0.0805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19083121418952942, + "rewards/margins": 0.29023870825767517, + "rewards/rejected": -0.481069952249527, + "step": 3220 + }, + { + "epoch": 0.86, + "learning_rate": 2.871923955178918e-07, + "logits/chosen": -1.5999078750610352, + "logits/rejected": -0.7431113719940186, + "logps/chosen": -730.16455078125, + "logps/rejected": -1300.1298828125, + "loss": 0.0675, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21212446689605713, + "rewards/margins": 0.29901638627052307, + "rewards/rejected": -0.5111408829689026, + "step": 3230 + }, + { + "epoch": 0.86, + "learning_rate": 2.764590667717562e-07, + "logits/chosen": -1.7554610967636108, + "logits/rejected": -1.0210330486297607, + "logps/chosen": -691.9470825195312, + "logps/rejected": -1321.851806640625, + "loss": 0.0668, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17403197288513184, + "rewards/margins": 0.3116183876991272, + "rewards/rejected": -0.4856503903865814, + "step": 3240 + }, + { + "epoch": 0.87, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -1.366562843322754, + "logits/rejected": -0.8948714137077332, + "logps/chosen": -622.9371948242188, + "logps/rejected": -1320.292724609375, + "loss": 0.068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1862497329711914, + "rewards/margins": 0.32431843876838684, + "rewards/rejected": -0.5105680823326111, + "step": 3250 + }, + { + "epoch": 0.87, + "learning_rate": 2.555713060848433e-07, + "logits/chosen": -1.5601285696029663, + "logits/rejected": -0.8055307269096375, + "logps/chosen": -620.6419067382812, + "logps/rejected": -1285.7567138671875, + "loss": 0.0621, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.187259703874588, + "rewards/margins": 0.3192376494407654, + "rewards/rejected": -0.506497323513031, + "step": 3260 + }, + { + "epoch": 0.87, + "learning_rate": 2.454186839872158e-07, + "logits/chosen": -1.2374000549316406, + "logits/rejected": -0.9511027336120605, + "logps/chosen": -584.9683837890625, + "logps/rejected": -1257.975341796875, + "loss": 0.086, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20605416595935822, + "rewards/margins": 0.264712929725647, + "rewards/rejected": -0.4707671105861664, + "step": 3270 + }, + { + "epoch": 0.87, + "learning_rate": 2.3546141258376786e-07, + "logits/chosen": -1.4687381982803345, + "logits/rejected": -1.0029032230377197, + "logps/chosen": -693.2366943359375, + "logps/rejected": -1394.9901123046875, + "loss": 0.0572, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20397081971168518, + "rewards/margins": 0.31658852100372314, + "rewards/rejected": -0.5205592513084412, + "step": 3280 + }, + { + "epoch": 0.88, + "learning_rate": 2.257003546333042e-07, + "logits/chosen": -1.8167024850845337, + "logits/rejected": -0.9430249929428101, + "logps/chosen": -649.8558959960938, + "logps/rejected": -1447.1058349609375, + "loss": 0.0456, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18286794424057007, + "rewards/margins": 0.35680800676345825, + "rewards/rejected": -0.5396759510040283, + "step": 3290 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.5668996572494507, + "logits/rejected": -0.9328106045722961, + "logps/chosen": -549.9508056640625, + "logps/rejected": -1223.6793212890625, + "loss": 0.0725, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.156307652592659, + "rewards/margins": 0.3133383095264435, + "rewards/rejected": -0.46964597702026367, + "step": 3300 + }, + { + "epoch": 0.88, + "learning_rate": 2.0677024504760752e-07, + "logits/chosen": -1.8043934106826782, + "logits/rejected": -1.106671929359436, + "logps/chosen": -603.4810791015625, + "logps/rejected": -1286.2061767578125, + "loss": 0.0659, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16138550639152527, + "rewards/margins": 0.31447383761405945, + "rewards/rejected": -0.4758593440055847, + "step": 3310 + }, + { + "epoch": 0.89, + "learning_rate": 1.9760283363267684e-07, + "logits/chosen": -1.6394857168197632, + "logits/rejected": -1.1434743404388428, + "logps/chosen": -560.7457885742188, + "logps/rejected": -1270.900390625, + "loss": 0.0668, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18648633360862732, + "rewards/margins": 0.3022904694080353, + "rewards/rejected": -0.4887767732143402, + "step": 3320 + }, + { + "epoch": 0.89, + "learning_rate": 1.8863491596921745e-07, + "logits/chosen": -1.4652307033538818, + "logits/rejected": -0.8056305646896362, + "logps/chosen": -588.2001342773438, + "logps/rejected": -1267.324951171875, + "loss": 0.0493, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17455193400382996, + "rewards/margins": 0.3139300048351288, + "rewards/rejected": -0.48848190903663635, + "step": 3330 + }, + { + "epoch": 0.89, + "learning_rate": 1.798672690923828e-07, + "logits/chosen": -1.417265772819519, + "logits/rejected": -0.8639974594116211, + "logps/chosen": -566.65234375, + "logps/rejected": -1115.6005859375, + "loss": 0.0645, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1619497835636139, + "rewards/margins": 0.26331502199172974, + "rewards/rejected": -0.42526477575302124, + "step": 3340 + }, + { + "epoch": 0.89, + "learning_rate": 1.713006526846439e-07, + "logits/chosen": -1.53346848487854, + "logits/rejected": -0.9233430027961731, + "logps/chosen": -593.0252075195312, + "logps/rejected": -1353.1142578125, + "loss": 0.0667, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17712654173374176, + "rewards/margins": 0.3622104525566101, + "rewards/rejected": -0.5393369793891907, + "step": 3350 + }, + { + "epoch": 0.9, + "learning_rate": 1.629358090099639e-07, + "logits/chosen": -1.4553884267807007, + "logits/rejected": -0.867672324180603, + "logps/chosen": -564.8086547851562, + "logps/rejected": -1151.0306396484375, + "loss": 0.0901, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18404200673103333, + "rewards/margins": 0.25011754035949707, + "rewards/rejected": -0.4341595768928528, + "step": 3360 + }, + { + "epoch": 0.9, + "learning_rate": 1.5477346284948292e-07, + "logits/chosen": -1.5238468647003174, + "logits/rejected": -1.0101631879806519, + "logps/chosen": -620.8792724609375, + "logps/rejected": -1464.6549072265625, + "loss": 0.0397, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1736445277929306, + "rewards/margins": 0.37397870421409607, + "rewards/rejected": -0.5476232171058655, + "step": 3370 + }, + { + "epoch": 0.9, + "learning_rate": 1.4681432143872133e-07, + "logits/chosen": -1.4745080471038818, + "logits/rejected": -1.0695136785507202, + "logps/chosen": -757.6289672851562, + "logps/rejected": -1409.22802734375, + "loss": 0.0673, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22448639571666718, + "rewards/margins": 0.2988077998161316, + "rewards/rejected": -0.5232942700386047, + "step": 3380 + }, + { + "epoch": 0.9, + "learning_rate": 1.3905907440629752e-07, + "logits/chosen": -1.6097753047943115, + "logits/rejected": -1.02309250831604, + "logps/chosen": -702.1261596679688, + "logps/rejected": -1313.4244384765625, + "loss": 0.0943, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2056199014186859, + "rewards/margins": 0.2749633491039276, + "rewards/rejected": -0.4805833399295807, + "step": 3390 + }, + { + "epoch": 0.91, + "learning_rate": 1.31508393714177e-07, + "logits/chosen": -1.528530478477478, + "logits/rejected": -1.0698211193084717, + "logps/chosen": -605.3930053710938, + "logps/rejected": -1313.823486328125, + "loss": 0.0457, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1993333399295807, + "rewards/margins": 0.3147328794002533, + "rewards/rejected": -0.514066219329834, + "step": 3400 + }, + { + "epoch": 0.91, + "learning_rate": 1.241629335994471e-07, + "logits/chosen": -1.638108253479004, + "logits/rejected": -0.8232443928718567, + "logps/chosen": -782.85986328125, + "logps/rejected": -1360.0474853515625, + "loss": 0.0739, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2265842854976654, + "rewards/margins": 0.2933647036552429, + "rewards/rejected": -0.5199490785598755, + "step": 3410 + }, + { + "epoch": 0.91, + "learning_rate": 1.1702333051763271e-07, + "logits/chosen": -1.5153647661209106, + "logits/rejected": -0.7482441663742065, + "logps/chosen": -806.5430908203125, + "logps/rejected": -1379.1104736328125, + "loss": 0.1037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21187452971935272, + "rewards/margins": 0.2870264947414398, + "rewards/rejected": -0.4989010691642761, + "step": 3420 + }, + { + "epoch": 0.91, + "learning_rate": 1.1009020308754587e-07, + "logits/chosen": -1.422628402709961, + "logits/rejected": -1.1864253282546997, + "logps/chosen": -631.9682006835938, + "logps/rejected": -1306.8543701171875, + "loss": 0.1056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20772810280323029, + "rewards/margins": 0.2739812731742859, + "rewards/rejected": -0.48170939087867737, + "step": 3430 + }, + { + "epoch": 0.92, + "learning_rate": 1.0336415203768962e-07, + "logits/chosen": -1.5100951194763184, + "logits/rejected": -0.9938896894454956, + "logps/chosen": -725.32373046875, + "logps/rejected": -1294.2386474609375, + "loss": 0.0828, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20762935280799866, + "rewards/margins": 0.26330476999282837, + "rewards/rejected": -0.47093409299850464, + "step": 3440 + }, + { + "epoch": 0.92, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -1.4899585247039795, + "logits/rejected": -0.9078477025032043, + "logps/chosen": -672.1417846679688, + "logps/rejected": -1225.94140625, + "loss": 0.0873, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17601068317890167, + "rewards/margins": 0.2746294140815735, + "rewards/rejected": -0.4506400525569916, + "step": 3450 + }, + { + "epoch": 0.92, + "learning_rate": 9.053559223036746e-08, + "logits/chosen": -1.6049926280975342, + "logits/rejected": -0.8928203582763672, + "logps/chosen": -690.5973510742188, + "logps/rejected": -1238.628173828125, + "loss": 0.0812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20545156300067902, + "rewards/margins": 0.27740758657455444, + "rewards/rejected": -0.48285919427871704, + "step": 3460 + }, + { + "epoch": 0.93, + "learning_rate": 8.44341950176683e-08, + "logits/chosen": -1.3215292692184448, + "logits/rejected": -0.9712142944335938, + "logps/chosen": -696.58349609375, + "logps/rejected": -1317.867431640625, + "loss": 0.0799, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18639104068279266, + "rewards/margins": 0.28709009289741516, + "rewards/rejected": -0.47348111867904663, + "step": 3470 + }, + { + "epoch": 0.93, + "learning_rate": 7.854209717842231e-08, + "logits/chosen": -1.5110366344451904, + "logits/rejected": -0.9558350443840027, + "logps/chosen": -647.8221435546875, + "logps/rejected": -1398.7818603515625, + "loss": 0.0389, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1730073243379593, + "rewards/margins": 0.34914129972457886, + "rewards/rejected": -0.522148609161377, + "step": 3480 + }, + { + "epoch": 0.93, + "learning_rate": 7.285980923996989e-08, + "logits/chosen": -1.4418294429779053, + "logits/rejected": -0.9980441331863403, + "logps/chosen": -586.1068115234375, + "logps/rejected": -1378.889892578125, + "loss": 0.0612, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17401185631752014, + "rewards/margins": 0.3377479314804077, + "rewards/rejected": -0.5117597579956055, + "step": 3490 + }, + { + "epoch": 0.93, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -1.775757074356079, + "logits/rejected": -1.016351342201233, + "logps/chosen": -715.5010986328125, + "logps/rejected": -1282.7650146484375, + "loss": 0.0603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18584254384040833, + "rewards/margins": 0.28814181685447693, + "rewards/rejected": -0.47398439049720764, + "step": 3500 + }, + { + "epoch": 0.94, + "learning_rate": 6.212661423609184e-08, + "logits/chosen": -1.5122634172439575, + "logits/rejected": -0.9780920743942261, + "logps/chosen": -668.7765502929688, + "logps/rejected": -1279.484619140625, + "loss": 0.0737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18126599490642548, + "rewards/margins": 0.28931209444999695, + "rewards/rejected": -0.47057804465293884, + "step": 3510 + }, + { + "epoch": 0.94, + "learning_rate": 5.707663716023021e-08, + "logits/chosen": -1.7019774913787842, + "logits/rejected": -0.9764993786811829, + "logps/chosen": -598.6218872070312, + "logps/rejected": -1202.2509765625, + "loss": 0.0736, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16811831295490265, + "rewards/margins": 0.292553186416626, + "rewards/rejected": -0.4606715142726898, + "step": 3520 + }, + { + "epoch": 0.94, + "learning_rate": 5.22383298837098e-08, + "logits/chosen": -1.589091420173645, + "logits/rejected": -1.0338377952575684, + "logps/chosen": -595.810302734375, + "logps/rejected": -1203.7371826171875, + "loss": 0.08, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1645013988018036, + "rewards/margins": 0.29109999537467957, + "rewards/rejected": -0.45560139417648315, + "step": 3530 + }, + { + "epoch": 0.94, + "learning_rate": 4.761211162702117e-08, + "logits/chosen": -1.7127647399902344, + "logits/rejected": -0.8404116630554199, + "logps/chosen": -608.0611572265625, + "logps/rejected": -1101.7435302734375, + "loss": 0.0955, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16957123577594757, + "rewards/margins": 0.2522638440132141, + "rewards/rejected": -0.4218350946903229, + "step": 3540 + }, + { + "epoch": 0.95, + "learning_rate": 4.319838323396691e-08, + "logits/chosen": -1.3308387994766235, + "logits/rejected": -0.711550772190094, + "logps/chosen": -609.5896606445312, + "logps/rejected": -1312.685302734375, + "loss": 0.0725, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17916107177734375, + "rewards/margins": 0.2821735739707947, + "rewards/rejected": -0.4613346457481384, + "step": 3550 + }, + { + "epoch": 0.95, + "learning_rate": 3.8997527136930004e-08, + "logits/chosen": -1.469405174255371, + "logits/rejected": -0.9255521893501282, + "logps/chosen": -650.4967041015625, + "logps/rejected": -1233.0570068359375, + "loss": 0.0673, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19022206962108612, + "rewards/margins": 0.2924087345600128, + "rewards/rejected": -0.48263078927993774, + "step": 3560 + }, + { + "epoch": 0.95, + "learning_rate": 3.5009907323737826e-08, + "logits/chosen": -1.5496537685394287, + "logits/rejected": -1.0329219102859497, + "logps/chosen": -653.6101684570312, + "logps/rejected": -1362.90478515625, + "loss": 0.0495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19138206541538239, + "rewards/margins": 0.3122704029083252, + "rewards/rejected": -0.5036525130271912, + "step": 3570 + }, + { + "epoch": 0.95, + "learning_rate": 3.1235869306123766e-08, + "logits/chosen": -1.4933403730392456, + "logits/rejected": -0.8053463101387024, + "logps/chosen": -725.1935424804688, + "logps/rejected": -1382.274169921875, + "loss": 0.0651, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21708261966705322, + "rewards/margins": 0.3156852126121521, + "rewards/rejected": -0.5327678322792053, + "step": 3580 + }, + { + "epoch": 0.96, + "learning_rate": 2.767574008979007e-08, + "logits/chosen": -1.3949609994888306, + "logits/rejected": -0.9586105346679688, + "logps/chosen": -533.8550415039062, + "logps/rejected": -1206.543212890625, + "loss": 0.0737, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15743830800056458, + "rewards/margins": 0.2968447208404541, + "rewards/rejected": -0.4542829990386963, + "step": 3590 + }, + { + "epoch": 0.96, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -1.6258302927017212, + "logits/rejected": -1.0097240209579468, + "logps/chosen": -639.5706176757812, + "logps/rejected": -1331.8486328125, + "loss": 0.0453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17417296767234802, + "rewards/margins": 0.3265232443809509, + "rewards/rejected": -0.5006962418556213, + "step": 3600 + }, + { + "epoch": 0.96, + "learning_rate": 2.1198423385220822e-08, + "logits/chosen": -1.5256226062774658, + "logits/rejected": -0.8268339037895203, + "logps/chosen": -662.9381103515625, + "logps/rejected": -1204.4327392578125, + "loss": 0.0715, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19520241022109985, + "rewards/margins": 0.27870461344718933, + "rewards/rejected": -0.47390708327293396, + "step": 3610 + }, + { + "epoch": 0.97, + "learning_rate": 1.82817971312621e-08, + "logits/chosen": -1.7607667446136475, + "logits/rejected": -1.1123876571655273, + "logps/chosen": -596.0337524414062, + "logps/rejected": -1304.044921875, + "loss": 0.0639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14941272139549255, + "rewards/margins": 0.33007779717445374, + "rewards/rejected": -0.47949057817459106, + "step": 3620 + }, + { + "epoch": 0.97, + "learning_rate": 1.5580202098509078e-08, + "logits/chosen": -1.6048189401626587, + "logits/rejected": -0.984302818775177, + "logps/chosen": -657.6131591796875, + "logps/rejected": -1466.190185546875, + "loss": 0.0442, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18197950720787048, + "rewards/margins": 0.34665971994400024, + "rewards/rejected": -0.5286391973495483, + "step": 3630 + }, + { + "epoch": 0.97, + "learning_rate": 1.3093872369654148e-08, + "logits/chosen": -1.5646374225616455, + "logits/rejected": -0.8982056379318237, + "logps/chosen": -580.00146484375, + "logps/rejected": -1101.258544921875, + "loss": 0.1123, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1632545292377472, + "rewards/margins": 0.24513199925422668, + "rewards/rejected": -0.4083865284919739, + "step": 3640 + }, + { + "epoch": 0.97, + "learning_rate": 1.0823023375489128e-08, + "logits/chosen": -1.5190411806106567, + "logits/rejected": -1.027479887008667, + "logps/chosen": -595.8035888671875, + "logps/rejected": -1325.04443359375, + "loss": 0.0629, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17017218470573425, + "rewards/margins": 0.305239737033844, + "rewards/rejected": -0.47541195154190063, + "step": 3650 + }, + { + "epoch": 0.98, + "learning_rate": 8.767851876239075e-09, + "logits/chosen": -1.6967909336090088, + "logits/rejected": -0.9378561973571777, + "logps/chosen": -561.2920532226562, + "logps/rejected": -1203.197021484375, + "loss": 0.0622, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14759351313114166, + "rewards/margins": 0.31226032972335815, + "rewards/rejected": -0.4598538279533386, + "step": 3660 + }, + { + "epoch": 0.98, + "learning_rate": 6.9285359445145366e-09, + "logits/chosen": -1.6613948345184326, + "logits/rejected": -1.1440684795379639, + "logps/chosen": -600.6172485351562, + "logps/rejected": -1154.182861328125, + "loss": 0.0852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17384423315525055, + "rewards/margins": 0.26200392842292786, + "rewards/rejected": -0.4358481466770172, + "step": 3670 + }, + { + "epoch": 0.98, + "learning_rate": 5.305234949880001e-09, + "logits/chosen": -1.5972778797149658, + "logits/rejected": -0.801485538482666, + "logps/chosen": -688.7052001953125, + "logps/rejected": -1276.092041015625, + "loss": 0.0613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18506471812725067, + "rewards/margins": 0.29814431071281433, + "rewards/rejected": -0.4832090437412262, + "step": 3680 + }, + { + "epoch": 0.98, + "learning_rate": 3.8980895450474455e-09, + "logits/chosen": -1.405221700668335, + "logits/rejected": -0.8206748962402344, + "logps/chosen": -693.5679931640625, + "logps/rejected": -1329.2283935546875, + "loss": 0.0479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19921013712882996, + "rewards/margins": 0.30057448148727417, + "rewards/rejected": -0.4997846186161041, + "step": 3690 + }, + { + "epoch": 0.99, + "learning_rate": 2.7072216536885855e-09, + "logits/chosen": -1.4513555765151978, + "logits/rejected": -0.7634484767913818, + "logps/chosen": -613.1361083984375, + "logps/rejected": -1215.9375, + "loss": 0.0642, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18391281366348267, + "rewards/margins": 0.2894715964794159, + "rewards/rejected": -0.47338438034057617, + "step": 3700 + }, + { + "epoch": 0.99, + "learning_rate": 1.7327344598702667e-09, + "logits/chosen": -1.5481555461883545, + "logits/rejected": -0.75025874376297, + "logps/chosen": -655.292236328125, + "logps/rejected": -1392.68115234375, + "loss": 0.0376, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18728521466255188, + "rewards/margins": 0.3374475836753845, + "rewards/rejected": -0.524732768535614, + "step": 3710 + }, + { + "epoch": 0.99, + "learning_rate": 9.747123991141193e-10, + "logits/chosen": -1.4917545318603516, + "logits/rejected": -1.0273144245147705, + "logps/chosen": -579.1278686523438, + "logps/rejected": -1233.640869140625, + "loss": 0.0868, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17262960970401764, + "rewards/margins": 0.2730409801006317, + "rewards/rejected": -0.44567054510116577, + "step": 3720 + }, + { + "epoch": 0.99, + "learning_rate": 4.332211510807427e-10, + "logits/chosen": -1.4177316427230835, + "logits/rejected": -0.9999169111251831, + "logps/chosen": -677.0986328125, + "logps/rejected": -1409.077880859375, + "loss": 0.0463, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19204583764076233, + "rewards/margins": 0.3291808068752289, + "rewards/rejected": -0.5212266445159912, + "step": 3730 + }, + { + "epoch": 1.0, + "learning_rate": 1.0830763387897902e-10, + "logits/chosen": -1.4876052141189575, + "logits/rejected": -0.8847878575325012, + "logps/chosen": -651.7437744140625, + "logps/rejected": -1331.395751953125, + "loss": 0.0532, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15324924886226654, + "rewards/margins": 0.33165210485458374, + "rewards/rejected": -0.4849013388156891, + "step": 3740 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.6436065435409546, + "logits/rejected": -0.8282138705253601, + "logps/chosen": -606.6455688476562, + "logps/rejected": -1475.3424072265625, + "loss": 0.0392, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18834789097309113, + "rewards/margins": 0.36257123947143555, + "rewards/rejected": -0.5509191751480103, + "step": 3750 + }, + { + "epoch": 1.0, + "step": 3750, + "total_flos": 0.0, + "train_loss": 0.07734704875151316, + "train_runtime": 15655.3296, + "train_samples_per_second": 0.958, + "train_steps_per_second": 0.24 + } + ], + "logging_steps": 10, + "max_steps": 3750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}