{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 183.98587765287522, "learning_rate": 8.710801393728223e-10, "logits/chosen": -2.9629170894622803, "logits/rejected": -2.8386623859405518, "logps/chosen": -359.1232604980469, "logps/rejected": -423.54595947265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 167.87221508350268, "learning_rate": 8.710801393728223e-09, "logits/chosen": -2.615884304046631, "logits/rejected": -2.5635154247283936, "logps/chosen": -315.01708984375, "logps/rejected": -260.0672302246094, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.0022036631125956774, "rewards/margins": 0.009863314218819141, "rewards/rejected": -0.007659651339054108, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 154.0126096202216, "learning_rate": 1.7421602787456446e-08, "logits/chosen": -2.6068050861358643, "logits/rejected": -2.5919718742370605, "logps/chosen": -218.543212890625, "logps/rejected": -218.17190551757812, "loss": 0.692, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015194631414487958, "rewards/margins": 0.008878999389708042, "rewards/rejected": -0.007359535899013281, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 176.1128975668413, "learning_rate": 2.6132404181184667e-08, "logits/chosen": -2.5653188228607178, "logits/rejected": -2.580052614212036, "logps/chosen": -281.95220947265625, "logps/rejected": -287.4288024902344, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.003514764364808798, "rewards/margins": 0.004516353365033865, "rewards/rejected": -0.0010015892330557108, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 171.8129308647673, "learning_rate": 3.484320557491289e-08, "logits/chosen": -2.6622366905212402, "logits/rejected": -2.5720410346984863, "logps/chosen": -284.25567626953125, "logps/rejected": -277.15838623046875, "loss": 0.6902, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.003352329134941101, "rewards/margins": 0.0006043304456397891, "rewards/rejected": -0.00395665830001235, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 167.64074454350632, "learning_rate": 4.355400696864111e-08, "logits/chosen": -2.7314631938934326, "logits/rejected": -2.696852207183838, "logps/chosen": -257.3500061035156, "logps/rejected": -267.8942565917969, "loss": 0.6918, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.002391135785728693, "rewards/margins": 0.0023752849083393812, "rewards/rejected": -0.004766420926898718, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 160.88259681415948, "learning_rate": 5.2264808362369334e-08, "logits/chosen": -2.6548471450805664, "logits/rejected": -2.641181468963623, "logps/chosen": -283.1813659667969, "logps/rejected": -285.5431823730469, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010778693482279778, "rewards/margins": 0.0168532095849514, "rewards/rejected": -0.006074518896639347, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 179.43805291752176, "learning_rate": 6.097560975609756e-08, "logits/chosen": -2.7097113132476807, "logits/rejected": -2.6384758949279785, "logps/chosen": -305.2355041503906, "logps/rejected": -285.43377685546875, "loss": 0.6856, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.017004575580358505, "rewards/margins": 0.019567107781767845, "rewards/rejected": -0.002562532667070627, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 176.46253282950096, "learning_rate": 6.968641114982578e-08, "logits/chosen": -2.6290106773376465, "logits/rejected": -2.5502569675445557, "logps/chosen": -276.4088439941406, "logps/rejected": -278.7929992675781, "loss": 0.685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.014232590794563293, "rewards/margins": 0.019482804462313652, "rewards/rejected": -0.005250214599072933, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 162.969010570062, "learning_rate": 7.8397212543554e-08, "logits/chosen": -2.642940044403076, "logits/rejected": -2.598159074783325, "logps/chosen": -243.7359619140625, "logps/rejected": -237.2580108642578, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.018116420134902, "rewards/margins": 0.01501198299229145, "rewards/rejected": 0.0031044380739331245, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 159.0928564282926, "learning_rate": 8.710801393728223e-08, "logits/chosen": -2.7249038219451904, "logits/rejected": -2.6030564308166504, "logps/chosen": -290.7940368652344, "logps/rejected": -235.9745635986328, "loss": 0.6673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.061399661004543304, "rewards/margins": 0.06571947038173676, "rewards/rejected": -0.004319812171161175, "step": 100 }, { "epoch": 0.052328623757195186, "eval_logits/chosen": -2.62892746925354, "eval_logits/rejected": -2.5742084980010986, "eval_logps/chosen": -274.2127685546875, "eval_logps/rejected": -264.3204345703125, "eval_loss": 0.6670113205909729, "eval_rewards/accuracies": 0.6796875, "eval_rewards/chosen": 0.06990836560726166, "eval_rewards/margins": 0.06019680202007294, "eval_rewards/rejected": 0.009711564518511295, "eval_runtime": 218.3359, "eval_samples_per_second": 9.16, "eval_steps_per_second": 0.147, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 147.0037502096744, "learning_rate": 9.581881533101045e-08, "logits/chosen": -2.601004123687744, "logits/rejected": -2.5575811862945557, "logps/chosen": -257.3785705566406, "logps/rejected": -236.22900390625, "loss": 0.6595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11586113274097443, "rewards/margins": 0.10850280523300171, "rewards/rejected": 0.007358331233263016, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 144.62623151189092, "learning_rate": 1.0452961672473867e-07, "logits/chosen": -2.7150049209594727, "logits/rejected": -2.6897425651550293, "logps/chosen": -347.8599548339844, "logps/rejected": -314.2908020019531, "loss": 0.6561, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1537223905324936, "rewards/margins": 0.06817600131034851, "rewards/rejected": 0.08554638922214508, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 396.6233809581414, "learning_rate": 1.132404181184669e-07, "logits/chosen": -2.55329966545105, "logits/rejected": -2.5485997200012207, "logps/chosen": -216.77621459960938, "logps/rejected": -243.31454467773438, "loss": 0.654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13126204907894135, "rewards/margins": 0.13412313163280487, "rewards/rejected": -0.0028610914014279842, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 153.51508852061852, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.6019041538238525, "logits/rejected": -2.5934832096099854, "logps/chosen": -299.33465576171875, "logps/rejected": -273.23834228515625, "loss": 0.6256, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1934138387441635, "rewards/margins": 0.20421452820301056, "rewards/rejected": -0.010800689458847046, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 163.7060760889637, "learning_rate": 1.3066202090592334e-07, "logits/chosen": -2.6243631839752197, "logits/rejected": -2.5580031871795654, "logps/chosen": -265.50518798828125, "logps/rejected": -250.77645874023438, "loss": 0.6198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21493752300739288, "rewards/margins": 0.2061689794063568, "rewards/rejected": 0.008768541738390923, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 156.7774326910388, "learning_rate": 1.3937282229965157e-07, "logits/chosen": -2.599414110183716, "logits/rejected": -2.5340352058410645, "logps/chosen": -281.4360656738281, "logps/rejected": -251.8666534423828, "loss": 0.609, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3013482391834259, "rewards/margins": 0.31039100885391235, "rewards/rejected": -0.009042812511324883, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 162.12146483359885, "learning_rate": 1.480836236933798e-07, "logits/chosen": -2.6282248497009277, "logits/rejected": -2.62455153465271, "logps/chosen": -259.92987060546875, "logps/rejected": -274.9657897949219, "loss": 0.6118, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3490704596042633, "rewards/margins": 0.11296037584543228, "rewards/rejected": 0.23611004650592804, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 114.03745902544327, "learning_rate": 1.56794425087108e-07, "logits/chosen": -2.543210744857788, "logits/rejected": -2.484558582305908, "logps/chosen": -245.36703491210938, "logps/rejected": -251.70455932617188, "loss": 0.6177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2426120489835739, "rewards/margins": 0.20732513070106506, "rewards/rejected": 0.03528692573308945, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 157.936808405019, "learning_rate": 1.6550522648083622e-07, "logits/chosen": -2.632174015045166, "logits/rejected": -2.5556230545043945, "logps/chosen": -318.48028564453125, "logps/rejected": -272.4574890136719, "loss": 0.576, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39713579416275024, "rewards/margins": 0.46693238615989685, "rewards/rejected": -0.06979658454656601, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 133.53183529149425, "learning_rate": 1.7421602787456445e-07, "logits/chosen": -2.5309154987335205, "logits/rejected": -2.47617769241333, "logps/chosen": -273.1667785644531, "logps/rejected": -287.5762939453125, "loss": 0.5806, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.40918025374412537, "rewards/margins": 0.34663549065589905, "rewards/rejected": 0.0625447928905487, "step": 200 }, { "epoch": 0.10465724751439037, "eval_logits/chosen": -2.58058762550354, "eval_logits/rejected": -2.522460460662842, "eval_logps/chosen": -271.1104431152344, "eval_logps/rejected": -264.5256042480469, "eval_loss": 0.5925512909889221, "eval_rewards/accuracies": 0.705078125, "eval_rewards/chosen": 0.3801417350769043, "eval_rewards/margins": 0.3909473419189453, "eval_rewards/rejected": -0.010805574245750904, "eval_runtime": 218.4019, "eval_samples_per_second": 9.157, "eval_steps_per_second": 0.147, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 221.83994817426583, "learning_rate": 1.8292682926829268e-07, "logits/chosen": -2.611128091812134, "logits/rejected": -2.5472967624664307, "logps/chosen": -226.0545196533203, "logps/rejected": -219.01443481445312, "loss": 0.625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2898949384689331, "rewards/margins": 0.3430555462837219, "rewards/rejected": -0.053160618990659714, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 120.50438689186194, "learning_rate": 1.916376306620209e-07, "logits/chosen": -2.638408660888672, "logits/rejected": -2.6073029041290283, "logps/chosen": -250.3545379638672, "logps/rejected": -256.3841552734375, "loss": 0.5637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.35777509212493896, "rewards/margins": 0.3942762613296509, "rewards/rejected": -0.03650114685297012, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 284.0145438996966, "learning_rate": 2.003484320557491e-07, "logits/chosen": -2.605532169342041, "logits/rejected": -2.586738109588623, "logps/chosen": -275.60284423828125, "logps/rejected": -312.9735107421875, "loss": 0.5722, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7563163638114929, "rewards/margins": 0.4808167517185211, "rewards/rejected": 0.2754996716976166, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 152.56981237328114, "learning_rate": 2.0905923344947734e-07, "logits/chosen": -2.534872055053711, "logits/rejected": -2.4644360542297363, "logps/chosen": -239.1423797607422, "logps/rejected": -262.83331298828125, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2813827395439148, "rewards/margins": 0.43976646661758423, "rewards/rejected": -0.15838375687599182, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 128.66498664663672, "learning_rate": 2.1777003484320556e-07, "logits/chosen": -2.555589199066162, "logits/rejected": -2.5107057094573975, "logps/chosen": -289.6865539550781, "logps/rejected": -269.06890869140625, "loss": 0.5432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.25498753786087036, "rewards/margins": 0.6274275779724121, "rewards/rejected": -0.37244001030921936, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 178.37695249436223, "learning_rate": 2.264808362369338e-07, "logits/chosen": -2.564396619796753, "logits/rejected": -2.5406436920166016, "logps/chosen": -268.16241455078125, "logps/rejected": -248.9013214111328, "loss": 0.5735, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.20303396880626678, "rewards/margins": 0.750515878200531, "rewards/rejected": -0.547481894493103, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 110.63597263745346, "learning_rate": 2.3519163763066202e-07, "logits/chosen": -2.4621379375457764, "logits/rejected": -2.415996789932251, "logps/chosen": -270.91864013671875, "logps/rejected": -299.12884521484375, "loss": 0.4954, "rewards/accuracies": 0.75, "rewards/chosen": 0.32311588525772095, "rewards/margins": 0.6306180953979492, "rewards/rejected": -0.30750221014022827, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 123.42335213451238, "learning_rate": 2.439024390243902e-07, "logits/chosen": -2.50434947013855, "logits/rejected": -2.483466625213623, "logps/chosen": -303.1611022949219, "logps/rejected": -284.998046875, "loss": 0.5749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4917478561401367, "rewards/margins": 0.8094803690910339, "rewards/rejected": -0.3177325129508972, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 131.57352309258098, "learning_rate": 2.526132404181184e-07, "logits/chosen": -2.6253631114959717, "logits/rejected": -2.5721564292907715, "logps/chosen": -290.97186279296875, "logps/rejected": -252.8906707763672, "loss": 0.5574, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2963283956050873, "rewards/margins": 0.6972683668136597, "rewards/rejected": -0.40094003081321716, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 138.8312888625021, "learning_rate": 2.613240418118467e-07, "logits/chosen": -2.6122896671295166, "logits/rejected": -2.556002140045166, "logps/chosen": -278.6819763183594, "logps/rejected": -248.638916015625, "loss": 0.554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3047786355018616, "rewards/margins": 0.6601444482803345, "rewards/rejected": -0.3553658425807953, "step": 300 }, { "epoch": 0.15698587127158556, "eval_logits/chosen": -2.5603127479553223, "eval_logits/rejected": -2.4975221157073975, "eval_logps/chosen": -271.81622314453125, "eval_logps/rejected": -268.9031677246094, "eval_loss": 0.5668895244598389, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": 0.30956095457077026, "eval_rewards/margins": 0.7581228613853455, "eval_rewards/rejected": -0.4485618770122528, "eval_runtime": 223.8878, "eval_samples_per_second": 8.933, "eval_steps_per_second": 0.143, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 163.99010545451193, "learning_rate": 2.700348432055749e-07, "logits/chosen": -2.548293113708496, "logits/rejected": -2.459908962249756, "logps/chosen": -278.00604248046875, "logps/rejected": -256.2436218261719, "loss": 0.5329, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6239417195320129, "rewards/margins": 0.7823342084884644, "rewards/rejected": -0.15839248895645142, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 141.38198908428052, "learning_rate": 2.7874564459930313e-07, "logits/chosen": -2.5683250427246094, "logits/rejected": -2.476428508758545, "logps/chosen": -301.060302734375, "logps/rejected": -257.2955017089844, "loss": 0.5493, "rewards/accuracies": 0.75, "rewards/chosen": 0.6340880990028381, "rewards/margins": 0.6276373267173767, "rewards/rejected": 0.006450694985687733, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 178.4179869457342, "learning_rate": 2.874564459930314e-07, "logits/chosen": -2.498730421066284, "logits/rejected": -2.3893237113952637, "logps/chosen": -278.46588134765625, "logps/rejected": -239.07302856445312, "loss": 0.5422, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2538575530052185, "rewards/margins": 1.0127308368682861, "rewards/rejected": -0.7588733434677124, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 167.7896002385607, "learning_rate": 2.961672473867596e-07, "logits/chosen": -2.565402030944824, "logits/rejected": -2.505615472793579, "logps/chosen": -277.89129638671875, "logps/rejected": -259.83502197265625, "loss": 0.5275, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.19900047779083252, "rewards/margins": 0.7730966806411743, "rewards/rejected": -0.9720970988273621, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 153.8946835516047, "learning_rate": 3.048780487804878e-07, "logits/chosen": -2.50780987739563, "logits/rejected": -2.50248646736145, "logps/chosen": -206.43246459960938, "logps/rejected": -229.46044921875, "loss": 0.4942, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.31379157304763794, "rewards/margins": 0.7969471216201782, "rewards/rejected": -1.110738754272461, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 138.78773401761865, "learning_rate": 3.13588850174216e-07, "logits/chosen": -2.5323219299316406, "logits/rejected": -2.4537017345428467, "logps/chosen": -277.50054931640625, "logps/rejected": -284.33819580078125, "loss": 0.5471, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09428353607654572, "rewards/margins": 0.817980170249939, "rewards/rejected": -0.9122638702392578, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 125.44923459666191, "learning_rate": 3.2229965156794425e-07, "logits/chosen": -2.434256076812744, "logits/rejected": -2.3915438652038574, "logps/chosen": -240.3540802001953, "logps/rejected": -232.7808837890625, "loss": 0.5364, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27087968587875366, "rewards/margins": 0.6792318224906921, "rewards/rejected": -0.4083520472049713, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 140.4084345641121, "learning_rate": 3.3101045296167245e-07, "logits/chosen": -2.405693531036377, "logits/rejected": -2.416766405105591, "logps/chosen": -229.98385620117188, "logps/rejected": -281.7201232910156, "loss": 0.4936, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10076479613780975, "rewards/margins": 0.9603910446166992, "rewards/rejected": -1.061155915260315, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 120.39877294055975, "learning_rate": 3.3972125435540065e-07, "logits/chosen": -2.577932596206665, "logits/rejected": -2.521261692047119, "logps/chosen": -311.5802307128906, "logps/rejected": -271.8887634277344, "loss": 0.5437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0765007957816124, "rewards/margins": 0.864869236946106, "rewards/rejected": -0.9413700103759766, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 139.92660158161289, "learning_rate": 3.484320557491289e-07, "logits/chosen": -2.5955634117126465, "logits/rejected": -2.5011191368103027, "logps/chosen": -303.11602783203125, "logps/rejected": -252.5283966064453, "loss": 0.5674, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7355321645736694, "rewards/margins": 0.8152166604995728, "rewards/rejected": -0.07968443632125854, "step": 400 }, { "epoch": 0.20931449502878074, "eval_logits/chosen": -2.538734197616577, "eval_logits/rejected": -2.4794044494628906, "eval_logps/chosen": -267.778564453125, "eval_logps/rejected": -265.08099365234375, "eval_loss": 0.5521309971809387, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": 0.7133291959762573, "eval_rewards/margins": 0.7796748876571655, "eval_rewards/rejected": -0.06634563207626343, "eval_runtime": 221.6021, "eval_samples_per_second": 9.025, "eval_steps_per_second": 0.144, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 140.5180095597033, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -2.583953857421875, "logits/rejected": -2.5235819816589355, "logps/chosen": -294.8075866699219, "logps/rejected": -294.6155700683594, "loss": 0.5796, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6494255065917969, "rewards/margins": 0.6041531562805176, "rewards/rejected": 0.04527236148715019, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 238.85465934404633, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -2.5574214458465576, "logits/rejected": -2.4819633960723877, "logps/chosen": -262.40264892578125, "logps/rejected": -254.68276977539062, "loss": 0.5645, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19096426665782928, "rewards/margins": 0.7092164754867554, "rewards/rejected": -0.5182522535324097, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 250.27963516801734, "learning_rate": 3.7456445993031356e-07, "logits/chosen": -2.5694403648376465, "logits/rejected": -2.4888291358947754, "logps/chosen": -279.59307861328125, "logps/rejected": -241.97488403320312, "loss": 0.5328, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.06417352706193924, "rewards/margins": 0.8508540987968445, "rewards/rejected": -0.9150276184082031, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 165.00367206006922, "learning_rate": 3.832752613240418e-07, "logits/chosen": -2.54233717918396, "logits/rejected": -2.4655821323394775, "logps/chosen": -297.9071044921875, "logps/rejected": -251.12423706054688, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -0.2488856017589569, "rewards/margins": 0.9149578213691711, "rewards/rejected": -1.1638433933258057, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 162.05705890028784, "learning_rate": 3.9198606271777e-07, "logits/chosen": -2.441164970397949, "logits/rejected": -2.4501419067382812, "logps/chosen": -252.2966766357422, "logps/rejected": -266.63372802734375, "loss": 0.5251, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.31691259145736694, "rewards/margins": 0.7372081875801086, "rewards/rejected": -1.0541207790374756, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 140.45668801877844, "learning_rate": 4.006968641114982e-07, "logits/chosen": -2.5211620330810547, "logits/rejected": -2.4626450538635254, "logps/chosen": -326.5841064453125, "logps/rejected": -286.4332580566406, "loss": 0.5447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17609867453575134, "rewards/margins": 0.7172503471374512, "rewards/rejected": -0.5411517024040222, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 380.8895726074156, "learning_rate": 4.0940766550522647e-07, "logits/chosen": -2.4646973609924316, "logits/rejected": -2.465517044067383, "logps/chosen": -285.59893798828125, "logps/rejected": -283.36883544921875, "loss": 0.5188, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04073848947882652, "rewards/margins": 1.0120255947113037, "rewards/rejected": -0.9712872505187988, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 177.32309008049933, "learning_rate": 4.1811846689895467e-07, "logits/chosen": -2.485696792602539, "logits/rejected": -2.4581103324890137, "logps/chosen": -243.02377319335938, "logps/rejected": -237.8154296875, "loss": 0.5133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6118694543838501, "rewards/margins": 0.9234670400619507, "rewards/rejected": -0.31159764528274536, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 131.74480562683152, "learning_rate": 4.268292682926829e-07, "logits/chosen": -2.5157337188720703, "logits/rejected": -2.4771084785461426, "logps/chosen": -282.98004150390625, "logps/rejected": -262.6257629394531, "loss": 0.4904, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8928958773612976, "rewards/margins": 1.0961697101593018, "rewards/rejected": -0.20327381789684296, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 131.6010797674407, "learning_rate": 4.3554006968641113e-07, "logits/chosen": -2.4228482246398926, "logits/rejected": -2.4214601516723633, "logps/chosen": -261.0464782714844, "logps/rejected": -261.32183837890625, "loss": 0.512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.25835180282592773, "rewards/margins": 0.9328727722167969, "rewards/rejected": -0.6745210886001587, "step": 500 }, { "epoch": 0.2616431187859759, "eval_logits/chosen": -2.484192371368408, "eval_logits/rejected": -2.4185428619384766, "eval_logps/chosen": -272.9901428222656, "eval_logps/rejected": -273.6878967285156, "eval_loss": 0.5478084683418274, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": 0.19216710329055786, "eval_rewards/margins": 1.1192002296447754, "eval_rewards/rejected": -0.9270331263542175, "eval_runtime": 221.5316, "eval_samples_per_second": 9.028, "eval_steps_per_second": 0.144, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 167.96943023226484, "learning_rate": 4.442508710801394e-07, "logits/chosen": -2.5195443630218506, "logits/rejected": -2.431065559387207, "logps/chosen": -267.10491943359375, "logps/rejected": -261.60894775390625, "loss": 0.5391, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20528289675712585, "rewards/margins": 0.8989221453666687, "rewards/rejected": -0.6936392784118652, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 159.70275813017813, "learning_rate": 4.529616724738676e-07, "logits/chosen": -2.59523344039917, "logits/rejected": -2.5851051807403564, "logps/chosen": -305.79949951171875, "logps/rejected": -314.739990234375, "loss": 0.5167, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2992744445800781, "rewards/margins": 0.785306990146637, "rewards/rejected": -0.4860325753688812, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 317.67349937656803, "learning_rate": 4.616724738675958e-07, "logits/chosen": -2.4469475746154785, "logits/rejected": -2.42167592048645, "logps/chosen": -298.7344665527344, "logps/rejected": -252.8144989013672, "loss": 0.5212, "rewards/accuracies": 0.75, "rewards/chosen": 0.15460513532161713, "rewards/margins": 0.9266587495803833, "rewards/rejected": -0.7720536589622498, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 174.13717212110464, "learning_rate": 4.7038327526132404e-07, "logits/chosen": -2.4723916053771973, "logits/rejected": -2.4589004516601562, "logps/chosen": -300.64276123046875, "logps/rejected": -275.3119201660156, "loss": 0.4821, "rewards/accuracies": 0.75, "rewards/chosen": -0.15326401591300964, "rewards/margins": 1.1747413873672485, "rewards/rejected": -1.3280054330825806, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 142.14037628701686, "learning_rate": 4.790940766550523e-07, "logits/chosen": -2.5600087642669678, "logits/rejected": -2.4618473052978516, "logps/chosen": -306.9172058105469, "logps/rejected": -301.33258056640625, "loss": 0.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12298767268657684, "rewards/margins": 1.2676466703414917, "rewards/rejected": -1.390634298324585, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 123.51077743884832, "learning_rate": 4.878048780487804e-07, "logits/chosen": -2.4466552734375, "logits/rejected": -2.440688371658325, "logps/chosen": -283.785888671875, "logps/rejected": -287.18048095703125, "loss": 0.5097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.29718270897865295, "rewards/margins": 1.1045414209365845, "rewards/rejected": -0.8073585629463196, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 132.23370731427892, "learning_rate": 4.965156794425087e-07, "logits/chosen": -2.47137713432312, "logits/rejected": -2.3778138160705566, "logps/chosen": -292.77386474609375, "logps/rejected": -275.23260498046875, "loss": 0.5786, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2698545455932617, "rewards/margins": 0.7487947344779968, "rewards/rejected": -1.0186494588851929, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 154.0273553192033, "learning_rate": 4.994184919558054e-07, "logits/chosen": -2.4509811401367188, "logits/rejected": -2.372676134109497, "logps/chosen": -254.5933380126953, "logps/rejected": -216.241943359375, "loss": 0.492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08447350561618805, "rewards/margins": 0.8071562051773071, "rewards/rejected": -0.7226825952529907, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 307.85069023156603, "learning_rate": 4.984493118821477e-07, "logits/chosen": -2.4190568923950195, "logits/rejected": -2.41286039352417, "logps/chosen": -262.74920654296875, "logps/rejected": -255.7161102294922, "loss": 0.5559, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22470727562904358, "rewards/margins": 0.8006716966629028, "rewards/rejected": -1.025378942489624, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 154.84743173436397, "learning_rate": 4.9748013180849e-07, "logits/chosen": -2.5416674613952637, "logits/rejected": -2.5267162322998047, "logps/chosen": -277.2581481933594, "logps/rejected": -299.3721008300781, "loss": 0.5511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.02236304245889187, "rewards/margins": 0.9567183256149292, "rewards/rejected": -0.9343553781509399, "step": 600 }, { "epoch": 0.3139717425431711, "eval_logits/chosen": -2.430830240249634, "eval_logits/rejected": -2.364751100540161, "eval_logps/chosen": -275.02703857421875, "eval_logps/rejected": -275.7374572753906, "eval_loss": 0.5388508439064026, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": -0.011518077924847603, "eval_rewards/margins": 1.1204721927642822, "eval_rewards/rejected": -1.1319903135299683, "eval_runtime": 219.9218, "eval_samples_per_second": 9.094, "eval_steps_per_second": 0.146, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 147.86578883953283, "learning_rate": 4.965109517348324e-07, "logits/chosen": -2.470914125442505, "logits/rejected": -2.406186103820801, "logps/chosen": -235.50119018554688, "logps/rejected": -233.675048828125, "loss": 0.5052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12222044169902802, "rewards/margins": 0.983404278755188, "rewards/rejected": -0.8611838221549988, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 176.40071933424915, "learning_rate": 4.955417716611746e-07, "logits/chosen": -2.549196720123291, "logits/rejected": -2.385798931121826, "logps/chosen": -330.9555969238281, "logps/rejected": -253.33108520507812, "loss": 0.6099, "rewards/accuracies": 0.6875, "rewards/chosen": -0.020737838000059128, "rewards/margins": 1.2608206272125244, "rewards/rejected": -1.2815585136413574, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 144.90403500220893, "learning_rate": 4.94572591587517e-07, "logits/chosen": -2.442561149597168, "logits/rejected": -2.4166173934936523, "logps/chosen": -217.47470092773438, "logps/rejected": -256.1766357421875, "loss": 0.6043, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.15434017777442932, "rewards/margins": 1.1347166299819946, "rewards/rejected": -0.9803764224052429, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 142.64272120157057, "learning_rate": 4.936034115138592e-07, "logits/chosen": -2.550363540649414, "logits/rejected": -2.4924328327178955, "logps/chosen": -288.17535400390625, "logps/rejected": -266.46893310546875, "loss": 0.6018, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11004707962274551, "rewards/margins": 1.057800054550171, "rewards/rejected": -1.1678471565246582, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 150.1018625307248, "learning_rate": 4.926342314402016e-07, "logits/chosen": -2.526475667953491, "logits/rejected": -2.490504026412964, "logps/chosen": -283.5575256347656, "logps/rejected": -259.7370910644531, "loss": 0.5361, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.36602357029914856, "rewards/margins": 1.0414791107177734, "rewards/rejected": -1.4075026512145996, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 147.4332401225207, "learning_rate": 4.916650513665439e-07, "logits/chosen": -2.4335074424743652, "logits/rejected": -2.3871591091156006, "logps/chosen": -233.3837432861328, "logps/rejected": -229.06698608398438, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -0.058051299303770065, "rewards/margins": 0.9938095211982727, "rewards/rejected": -1.0518608093261719, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 158.75242079704003, "learning_rate": 4.906958712928862e-07, "logits/chosen": -2.428741931915283, "logits/rejected": -2.3815505504608154, "logps/chosen": -266.20452880859375, "logps/rejected": -246.29867553710938, "loss": 0.6359, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5916377305984497, "rewards/margins": 0.9054499864578247, "rewards/rejected": -0.3138122260570526, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 173.76361516615546, "learning_rate": 4.897266912192285e-07, "logits/chosen": -2.395550489425659, "logits/rejected": -2.3644254207611084, "logps/chosen": -264.17083740234375, "logps/rejected": -232.5367889404297, "loss": 0.5816, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2797853946685791, "rewards/margins": 0.6415343284606934, "rewards/rejected": -0.36174899339675903, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 151.57147781557686, "learning_rate": 4.887575111455709e-07, "logits/chosen": -2.505558967590332, "logits/rejected": -2.462104082107544, "logps/chosen": -285.12237548828125, "logps/rejected": -270.3896484375, "loss": 0.5308, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16674380004405975, "rewards/margins": 0.9051195383071899, "rewards/rejected": -0.7383755445480347, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 205.88365963434188, "learning_rate": 4.877883310719131e-07, "logits/chosen": -2.4890451431274414, "logits/rejected": -2.4678356647491455, "logps/chosen": -299.48541259765625, "logps/rejected": -260.42340087890625, "loss": 0.5851, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3837422728538513, "rewards/margins": 0.7338034510612488, "rewards/rejected": -1.1175458431243896, "step": 700 }, { "epoch": 0.3663003663003663, "eval_logits/chosen": -2.4621548652648926, "eval_logits/rejected": -2.405531644821167, "eval_logps/chosen": -274.4615478515625, "eval_logps/rejected": -275.8707580566406, "eval_loss": 0.5447643995285034, "eval_rewards/accuracies": 0.740234375, "eval_rewards/chosen": 0.04502829909324646, "eval_rewards/margins": 1.190348744392395, "eval_rewards/rejected": -1.1453205347061157, "eval_runtime": 224.7292, "eval_samples_per_second": 8.9, "eval_steps_per_second": 0.142, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 140.7526326371456, "learning_rate": 4.868191509982554e-07, "logits/chosen": -2.4812631607055664, "logits/rejected": -2.4220027923583984, "logps/chosen": -318.1250305175781, "logps/rejected": -293.35595703125, "loss": 0.4455, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.4442598819732666, "rewards/margins": 1.610219955444336, "rewards/rejected": -1.1659600734710693, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 148.89154306839464, "learning_rate": 4.858499709245977e-07, "logits/chosen": -2.4213387966156006, "logits/rejected": -2.456634521484375, "logps/chosen": -249.0806427001953, "logps/rejected": -254.9530029296875, "loss": 0.5009, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01653282716870308, "rewards/margins": 1.1262192726135254, "rewards/rejected": -1.1096864938735962, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 183.17248509699854, "learning_rate": 4.848807908509401e-07, "logits/chosen": -2.532348871231079, "logits/rejected": -2.4976563453674316, "logps/chosen": -273.73114013671875, "logps/rejected": -273.97625732421875, "loss": 0.5266, "rewards/accuracies": 0.75, "rewards/chosen": -0.1789076030254364, "rewards/margins": 1.1574863195419312, "rewards/rejected": -1.3363940715789795, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 138.44938969131164, "learning_rate": 4.839116107772823e-07, "logits/chosen": -2.533747911453247, "logits/rejected": -2.5028321743011475, "logps/chosen": -259.8409729003906, "logps/rejected": -223.9192657470703, "loss": 0.5387, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5174254775047302, "rewards/margins": 0.8043993711471558, "rewards/rejected": -1.3218247890472412, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 158.664290799275, "learning_rate": 4.829424307036247e-07, "logits/chosen": -2.481804370880127, "logits/rejected": -2.4438273906707764, "logps/chosen": -269.47418212890625, "logps/rejected": -283.1268005371094, "loss": 0.499, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6957629919052124, "rewards/margins": 0.9171501994132996, "rewards/rejected": -1.6129131317138672, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 169.17783774203113, "learning_rate": 4.81973250629967e-07, "logits/chosen": -2.5377438068389893, "logits/rejected": -2.5006866455078125, "logps/chosen": -251.92428588867188, "logps/rejected": -241.5601348876953, "loss": 0.5735, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5146152973175049, "rewards/margins": 1.0794529914855957, "rewards/rejected": -1.594068169593811, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 117.34348963430396, "learning_rate": 4.810040705563093e-07, "logits/chosen": -2.5264742374420166, "logits/rejected": -2.517835855484009, "logps/chosen": -291.6723937988281, "logps/rejected": -303.6868896484375, "loss": 0.5569, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22133469581604004, "rewards/margins": 0.7371618747711182, "rewards/rejected": -0.9584965705871582, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 160.29773868249387, "learning_rate": 4.800348904826517e-07, "logits/chosen": -2.580953359603882, "logits/rejected": -2.51811146736145, "logps/chosen": -276.2479553222656, "logps/rejected": -255.4117431640625, "loss": 0.5142, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25065022706985474, "rewards/margins": 1.0290964841842651, "rewards/rejected": -1.2797467708587646, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 140.77713244741025, "learning_rate": 4.79065710408994e-07, "logits/chosen": -2.5539793968200684, "logits/rejected": -2.5606868267059326, "logps/chosen": -234.91757202148438, "logps/rejected": -260.0947570800781, "loss": 0.5363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5232092142105103, "rewards/margins": 0.7950447797775269, "rewards/rejected": -1.318253993988037, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 179.29698066632878, "learning_rate": 4.780965303353363e-07, "logits/chosen": -2.5527491569519043, "logits/rejected": -2.5650885105133057, "logps/chosen": -312.03228759765625, "logps/rejected": -303.5509948730469, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -0.18354937434196472, "rewards/margins": 1.3258657455444336, "rewards/rejected": -1.5094151496887207, "step": 800 }, { "epoch": 0.4186289900575615, "eval_logits/chosen": -2.574220657348633, "eval_logits/rejected": -2.5103955268859863, "eval_logps/chosen": -277.17022705078125, "eval_logps/rejected": -277.32940673828125, "eval_loss": 0.5568873286247253, "eval_rewards/accuracies": 0.732421875, "eval_rewards/chosen": -0.22583983838558197, "eval_rewards/margins": 1.0653458833694458, "eval_rewards/rejected": -1.291185736656189, "eval_runtime": 218.0695, "eval_samples_per_second": 9.171, "eval_steps_per_second": 0.147, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 164.53102629046592, "learning_rate": 4.771273502616786e-07, "logits/chosen": -2.5001702308654785, "logits/rejected": -2.4676594734191895, "logps/chosen": -282.761474609375, "logps/rejected": -275.35638427734375, "loss": 0.5393, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24931776523590088, "rewards/margins": 1.0307122468948364, "rewards/rejected": -1.2800301313400269, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 127.6041182696833, "learning_rate": 4.761581701880209e-07, "logits/chosen": -2.585120677947998, "logits/rejected": -2.51393985748291, "logps/chosen": -266.7093811035156, "logps/rejected": -245.558349609375, "loss": 0.5682, "rewards/accuracies": 0.75, "rewards/chosen": -0.34990614652633667, "rewards/margins": 1.096731424331665, "rewards/rejected": -1.4466376304626465, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 148.30939710614686, "learning_rate": 4.7518899011436326e-07, "logits/chosen": -2.5504064559936523, "logits/rejected": -2.4718239307403564, "logps/chosen": -245.26181030273438, "logps/rejected": -248.34164428710938, "loss": 0.5804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08449772745370865, "rewards/margins": 1.0684163570404053, "rewards/rejected": -1.152914047241211, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 228.9800430595574, "learning_rate": 4.7421981004070556e-07, "logits/chosen": -2.547473430633545, "logits/rejected": -2.434157609939575, "logps/chosen": -314.70843505859375, "logps/rejected": -278.4574279785156, "loss": 0.5026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5256937146186829, "rewards/margins": 1.6434406042099, "rewards/rejected": -1.1177470684051514, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 175.3886717463151, "learning_rate": 4.7325062996704787e-07, "logits/chosen": -2.494997501373291, "logits/rejected": -2.4436986446380615, "logps/chosen": -257.09246826171875, "logps/rejected": -237.9566650390625, "loss": 0.5266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2028721570968628, "rewards/margins": 1.315305471420288, "rewards/rejected": -1.5181777477264404, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 169.4983578601498, "learning_rate": 4.722814498933902e-07, "logits/chosen": -2.5793604850769043, "logits/rejected": -2.582852840423584, "logps/chosen": -251.5225067138672, "logps/rejected": -289.56671142578125, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09693838655948639, "rewards/margins": 1.2970662117004395, "rewards/rejected": -1.2001278400421143, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 102.47321073340156, "learning_rate": 4.7131226981973247e-07, "logits/chosen": -2.481945276260376, "logits/rejected": -2.482484817504883, "logps/chosen": -282.3057556152344, "logps/rejected": -271.061767578125, "loss": 0.6625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21059174835681915, "rewards/margins": 0.4407234191894531, "rewards/rejected": -0.2301316261291504, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 211.78934042520572, "learning_rate": 4.7034308974607477e-07, "logits/chosen": -2.615574836730957, "logits/rejected": -2.5486385822296143, "logps/chosen": -306.647216796875, "logps/rejected": -252.2527313232422, "loss": 0.5356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.054035067558288574, "rewards/margins": 0.9619921445846558, "rewards/rejected": -1.0160272121429443, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 162.1422266669909, "learning_rate": 4.693739096724171e-07, "logits/chosen": -2.6489675045013428, "logits/rejected": -2.5830769538879395, "logps/chosen": -300.5631408691406, "logps/rejected": -276.5767517089844, "loss": 0.5657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2942334711551666, "rewards/margins": 0.731018602848053, "rewards/rejected": -1.0252519845962524, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 189.36302973081814, "learning_rate": 4.684047295987594e-07, "logits/chosen": -2.458033800125122, "logits/rejected": -2.4546303749084473, "logps/chosen": -267.4068908691406, "logps/rejected": -308.43585205078125, "loss": 0.518, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.023354459553956985, "rewards/margins": 1.8015449047088623, "rewards/rejected": -1.824899435043335, "step": 900 }, { "epoch": 0.47095761381475665, "eval_logits/chosen": -2.4910223484039307, "eval_logits/rejected": -2.4297759532928467, "eval_logps/chosen": -277.4685363769531, "eval_logps/rejected": -278.74957275390625, "eval_loss": 0.5607163310050964, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.2556682527065277, "eval_rewards/margins": 1.177531361579895, "eval_rewards/rejected": -1.4331996440887451, "eval_runtime": 217.205, "eval_samples_per_second": 9.208, "eval_steps_per_second": 0.147, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 216.23092008047914, "learning_rate": 4.6743554952510173e-07, "logits/chosen": -2.402878999710083, "logits/rejected": -2.3379149436950684, "logps/chosen": -234.92129516601562, "logps/rejected": -241.1740264892578, "loss": 0.5736, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3084966540336609, "rewards/margins": 1.3116391897201538, "rewards/rejected": -1.6201359033584595, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 224.5090794641475, "learning_rate": 4.664663694514441e-07, "logits/chosen": -2.542755365371704, "logits/rejected": -2.4887800216674805, "logps/chosen": -286.3699645996094, "logps/rejected": -280.3407287597656, "loss": 0.5029, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.32557255029678345, "rewards/margins": 1.2060874700546265, "rewards/rejected": -1.5316599607467651, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 138.603049737207, "learning_rate": 4.654971893777864e-07, "logits/chosen": -2.5329623222351074, "logits/rejected": -2.5001254081726074, "logps/chosen": -288.963623046875, "logps/rejected": -266.41497802734375, "loss": 0.5249, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1772511899471283, "rewards/margins": 0.7718427181243896, "rewards/rejected": -0.9490938186645508, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 110.75992426741142, "learning_rate": 4.645280093041287e-07, "logits/chosen": -2.5486257076263428, "logits/rejected": -2.4839091300964355, "logps/chosen": -281.23321533203125, "logps/rejected": -258.21923828125, "loss": 0.5179, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08490540087223053, "rewards/margins": 1.4410486221313477, "rewards/rejected": -1.525954008102417, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 141.68507214211544, "learning_rate": 4.6355882923047104e-07, "logits/chosen": -2.530820369720459, "logits/rejected": -2.4536337852478027, "logps/chosen": -275.2161865234375, "logps/rejected": -264.17926025390625, "loss": 0.5038, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2539115250110626, "rewards/margins": 1.5458872318267822, "rewards/rejected": -1.7997987270355225, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 129.9989750112544, "learning_rate": 4.625896491568133e-07, "logits/chosen": -2.4915084838867188, "logits/rejected": -2.4303746223449707, "logps/chosen": -265.11297607421875, "logps/rejected": -272.45611572265625, "loss": 0.5302, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.33651041984558105, "rewards/margins": 1.264890432357788, "rewards/rejected": -1.6014009714126587, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 186.6278529584037, "learning_rate": 4.616204690831556e-07, "logits/chosen": -2.518186092376709, "logits/rejected": -2.4276421070098877, "logps/chosen": -319.1684265136719, "logps/rejected": -263.4048767089844, "loss": 0.5328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04223860055208206, "rewards/margins": 1.2940385341644287, "rewards/rejected": -1.3362772464752197, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 121.60757725401304, "learning_rate": 4.6065128900949794e-07, "logits/chosen": -2.5751535892486572, "logits/rejected": -2.4032208919525146, "logps/chosen": -305.0018310546875, "logps/rejected": -276.8344421386719, "loss": 0.5082, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.010190332308411598, "rewards/margins": 1.4215805530548096, "rewards/rejected": -1.4113901853561401, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 157.7746989079604, "learning_rate": 4.5968210893584024e-07, "logits/chosen": -2.438528060913086, "logits/rejected": -2.3997128009796143, "logps/chosen": -269.225341796875, "logps/rejected": -244.0732421875, "loss": 0.5519, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.40925294160842896, "rewards/margins": 1.4047729969024658, "rewards/rejected": -1.81402587890625, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 161.94289038525295, "learning_rate": 4.5871292886218254e-07, "logits/chosen": -2.452096462249756, "logits/rejected": -2.4424660205841064, "logps/chosen": -257.41094970703125, "logps/rejected": -244.45645141601562, "loss": 0.5525, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.518764853477478, "rewards/margins": 1.0366418361663818, "rewards/rejected": -1.5554068088531494, "step": 1000 }, { "epoch": 0.5232862375719518, "eval_logits/chosen": -2.5088629722595215, "eval_logits/rejected": -2.4482171535491943, "eval_logps/chosen": -282.6304931640625, "eval_logps/rejected": -284.3083801269531, "eval_loss": 0.5600619912147522, "eval_rewards/accuracies": 0.748046875, "eval_rewards/chosen": -0.7718652486801147, "eval_rewards/margins": 1.2172174453735352, "eval_rewards/rejected": -1.9890825748443604, "eval_runtime": 218.8562, "eval_samples_per_second": 9.138, "eval_steps_per_second": 0.146, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 127.367468841812, "learning_rate": 4.577437487885249e-07, "logits/chosen": -2.5522303581237793, "logits/rejected": -2.5477395057678223, "logps/chosen": -293.200439453125, "logps/rejected": -304.0264892578125, "loss": 0.5219, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5539871454238892, "rewards/margins": 1.2164762020111084, "rewards/rejected": -1.7704633474349976, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 141.18868129887247, "learning_rate": 4.567745687148672e-07, "logits/chosen": -2.5796492099761963, "logits/rejected": -2.5616557598114014, "logps/chosen": -282.737060546875, "logps/rejected": -259.65960693359375, "loss": 0.4778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.25293225049972534, "rewards/margins": 1.591460943222046, "rewards/rejected": -1.8443934917449951, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 135.78629838030577, "learning_rate": 4.5580538864120955e-07, "logits/chosen": -2.6245040893554688, "logits/rejected": -2.549217700958252, "logps/chosen": -293.6830749511719, "logps/rejected": -243.7079620361328, "loss": 0.5453, "rewards/accuracies": 0.75, "rewards/chosen": -0.7508270144462585, "rewards/margins": 1.1780718564987183, "rewards/rejected": -1.9288990497589111, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 108.01386934118759, "learning_rate": 4.5483620856755186e-07, "logits/chosen": -2.5785679817199707, "logits/rejected": -2.5940213203430176, "logps/chosen": -268.609619140625, "logps/rejected": -297.02496337890625, "loss": 0.5605, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0627683401107788, "rewards/margins": 1.0904523134231567, "rewards/rejected": -2.1532206535339355, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 181.76817664601157, "learning_rate": 4.5386702849389416e-07, "logits/chosen": -2.640252113342285, "logits/rejected": -2.603022336959839, "logps/chosen": -269.426513671875, "logps/rejected": -283.8904113769531, "loss": 0.5966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6111410856246948, "rewards/margins": 1.449350357055664, "rewards/rejected": -2.0604913234710693, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 213.19601611633826, "learning_rate": 4.5289784842023646e-07, "logits/chosen": -2.630995273590088, "logits/rejected": -2.579375743865967, "logps/chosen": -278.83831787109375, "logps/rejected": -263.08514404296875, "loss": 0.5813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4633944630622864, "rewards/margins": 1.1730378866195679, "rewards/rejected": -1.6364322900772095, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 98.93208047512609, "learning_rate": 4.5192866834657876e-07, "logits/chosen": -2.597029447555542, "logits/rejected": -2.555540084838867, "logps/chosen": -306.2710266113281, "logps/rejected": -294.3712158203125, "loss": 0.4978, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07860786467790604, "rewards/margins": 1.4149057865142822, "rewards/rejected": -1.4935137033462524, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 164.20094848775878, "learning_rate": 4.5095948827292106e-07, "logits/chosen": -2.6048460006713867, "logits/rejected": -2.53227162361145, "logps/chosen": -275.6731262207031, "logps/rejected": -251.59286499023438, "loss": 0.5735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.039272479712963104, "rewards/margins": 0.93670654296875, "rewards/rejected": -0.9759789705276489, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 542.3832348831099, "learning_rate": 4.499903081992634e-07, "logits/chosen": -2.4844672679901123, "logits/rejected": -2.4187238216400146, "logps/chosen": -298.60284423828125, "logps/rejected": -242.6894989013672, "loss": 0.7684, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7479456067085266, "rewards/margins": 1.1190924644470215, "rewards/rejected": -1.867038369178772, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 133.457680817441, "learning_rate": 4.490211281256057e-07, "logits/chosen": -2.5487215518951416, "logits/rejected": -2.5282652378082275, "logps/chosen": -253.854736328125, "logps/rejected": -248.1874542236328, "loss": 0.5189, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24323770403862, "rewards/margins": 1.1156004667282104, "rewards/rejected": -1.3588382005691528, "step": 1100 }, { "epoch": 0.5756148613291471, "eval_logits/chosen": -2.542973518371582, "eval_logits/rejected": -2.481628894805908, "eval_logps/chosen": -278.9518127441406, "eval_logps/rejected": -280.3683166503906, "eval_loss": 0.5514706969261169, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -0.40399906039237976, "eval_rewards/margins": 1.1910773515701294, "eval_rewards/rejected": -1.595076322555542, "eval_runtime": 222.5131, "eval_samples_per_second": 8.988, "eval_steps_per_second": 0.144, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 187.72474092229655, "learning_rate": 4.48051948051948e-07, "logits/chosen": -2.472424030303955, "logits/rejected": -2.41933274269104, "logps/chosen": -253.4503631591797, "logps/rejected": -251.7877960205078, "loss": 0.6088, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3762842118740082, "rewards/margins": 0.9124935269355774, "rewards/rejected": -1.2887777090072632, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 149.94083439188466, "learning_rate": 4.4708276797829037e-07, "logits/chosen": -2.5426857471466064, "logits/rejected": -2.4825961589813232, "logps/chosen": -319.66729736328125, "logps/rejected": -287.60595703125, "loss": 0.5429, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3697783052921295, "rewards/margins": 1.2993946075439453, "rewards/rejected": -1.6691728830337524, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 145.4404221618554, "learning_rate": 4.461135879046327e-07, "logits/chosen": -2.5043511390686035, "logits/rejected": -2.4660134315490723, "logps/chosen": -275.49090576171875, "logps/rejected": -299.2298278808594, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": -0.7244709730148315, "rewards/margins": 1.1518573760986328, "rewards/rejected": -1.8763282299041748, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 163.50058824419256, "learning_rate": 4.45144407830975e-07, "logits/chosen": -2.6152803897857666, "logits/rejected": -2.5187859535217285, "logps/chosen": -293.48724365234375, "logps/rejected": -264.94671630859375, "loss": 0.5208, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2766650319099426, "rewards/margins": 1.0300482511520386, "rewards/rejected": -1.306713342666626, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 166.89966526329172, "learning_rate": 4.4417522775731733e-07, "logits/chosen": -2.5654802322387695, "logits/rejected": -2.485625743865967, "logps/chosen": -253.51626586914062, "logps/rejected": -246.95986938476562, "loss": 0.5099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06844454258680344, "rewards/margins": 1.433528184890747, "rewards/rejected": -1.5019727945327759, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 149.02789239965162, "learning_rate": 4.432060476836596e-07, "logits/chosen": -2.4950003623962402, "logits/rejected": -2.4438886642456055, "logps/chosen": -257.9818420410156, "logps/rejected": -207.91921997070312, "loss": 0.5785, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4198984205722809, "rewards/margins": 0.9951489567756653, "rewards/rejected": -1.4150474071502686, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 200.49918344050434, "learning_rate": 4.422368676100019e-07, "logits/chosen": -2.4990642070770264, "logits/rejected": -2.4279208183288574, "logps/chosen": -287.23858642578125, "logps/rejected": -309.6932373046875, "loss": 0.5873, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6417319774627686, "rewards/margins": 1.1336843967437744, "rewards/rejected": -1.775416612625122, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 153.9979222384669, "learning_rate": 4.4126768753634423e-07, "logits/chosen": -2.510612726211548, "logits/rejected": -2.4535515308380127, "logps/chosen": -283.4455871582031, "logps/rejected": -265.0210266113281, "loss": 0.5284, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8293108940124512, "rewards/margins": 1.124691128730774, "rewards/rejected": -1.954002022743225, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 108.74027991128183, "learning_rate": 4.4029850746268654e-07, "logits/chosen": -2.4637205600738525, "logits/rejected": -2.4155681133270264, "logps/chosen": -282.4340515136719, "logps/rejected": -275.81719970703125, "loss": 0.4782, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9284334182739258, "rewards/margins": 1.182985544204712, "rewards/rejected": -2.1114187240600586, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 145.93096403142889, "learning_rate": 4.3932932738902884e-07, "logits/chosen": -2.4681918621063232, "logits/rejected": -2.4034788608551025, "logps/chosen": -299.88116455078125, "logps/rejected": -283.5958557128906, "loss": 0.5331, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.792948842048645, "rewards/margins": 1.0235211849212646, "rewards/rejected": -1.8164701461791992, "step": 1200 }, { "epoch": 0.6279434850863422, "eval_logits/chosen": -2.50797700881958, "eval_logits/rejected": -2.452104091644287, "eval_logps/chosen": -280.2539978027344, "eval_logps/rejected": -282.08856201171875, "eval_loss": 0.5453246831893921, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.5342170596122742, "eval_rewards/margins": 1.232883334159851, "eval_rewards/rejected": -1.76710045337677, "eval_runtime": 216.5497, "eval_samples_per_second": 9.236, "eval_steps_per_second": 0.148, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 158.6368584010347, "learning_rate": 4.383601473153712e-07, "logits/chosen": -2.5414161682128906, "logits/rejected": -2.4675703048706055, "logps/chosen": -306.38787841796875, "logps/rejected": -255.7639617919922, "loss": 0.5578, "rewards/accuracies": 0.8125, "rewards/chosen": -0.44307613372802734, "rewards/margins": 1.5332390069961548, "rewards/rejected": -1.976314902305603, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 156.75390038786026, "learning_rate": 4.373909672417135e-07, "logits/chosen": -2.527590274810791, "logits/rejected": -2.4596567153930664, "logps/chosen": -311.61627197265625, "logps/rejected": -267.5633544921875, "loss": 0.539, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23937828838825226, "rewards/margins": 1.1805975437164307, "rewards/rejected": -1.419975996017456, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 120.44454584630857, "learning_rate": 4.3642178716805585e-07, "logits/chosen": -2.5047621726989746, "logits/rejected": -2.473341941833496, "logps/chosen": -317.7254333496094, "logps/rejected": -271.3211669921875, "loss": 0.4928, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16988685727119446, "rewards/margins": 1.519685983657837, "rewards/rejected": -1.6895729303359985, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 138.43010840584344, "learning_rate": 4.3545260709439815e-07, "logits/chosen": -2.483609199523926, "logits/rejected": -2.4897103309631348, "logps/chosen": -226.516845703125, "logps/rejected": -230.9214630126953, "loss": 0.5139, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.050329409539699554, "rewards/margins": 1.203236699104309, "rewards/rejected": -1.2535661458969116, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 132.7984346148871, "learning_rate": 4.344834270207404e-07, "logits/chosen": -2.6118252277374268, "logits/rejected": -2.526855707168579, "logps/chosen": -271.97576904296875, "logps/rejected": -277.47381591796875, "loss": 0.5371, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17879578471183777, "rewards/margins": 1.348672866821289, "rewards/rejected": -1.5274686813354492, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 115.39623310487656, "learning_rate": 4.3351424694708275e-07, "logits/chosen": -2.5626518726348877, "logits/rejected": -2.4910178184509277, "logps/chosen": -274.8258361816406, "logps/rejected": -223.31787109375, "loss": 0.4722, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.4395284056663513, "rewards/margins": 1.7968591451644897, "rewards/rejected": -1.3573306798934937, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 128.9326237647205, "learning_rate": 4.3254506687342505e-07, "logits/chosen": -2.4830269813537598, "logits/rejected": -2.409470319747925, "logps/chosen": -277.0553894042969, "logps/rejected": -252.94778442382812, "loss": 0.4705, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.622801661491394, "rewards/margins": 1.2426730394363403, "rewards/rejected": -1.8654747009277344, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 135.38537056196282, "learning_rate": 4.3157588679976735e-07, "logits/chosen": -2.4172778129577637, "logits/rejected": -2.3863511085510254, "logps/chosen": -261.90948486328125, "logps/rejected": -235.4448699951172, "loss": 0.5044, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.12738476693630219, "rewards/margins": 1.3675730228424072, "rewards/rejected": -1.2401882410049438, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 162.31197034986923, "learning_rate": 4.306067067261097e-07, "logits/chosen": -2.530656337738037, "logits/rejected": -2.504903554916382, "logps/chosen": -252.63095092773438, "logps/rejected": -273.94525146484375, "loss": 0.565, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5331031084060669, "rewards/margins": 1.0316709280014038, "rewards/rejected": -0.49856796860694885, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 192.41755429559154, "learning_rate": 4.29637526652452e-07, "logits/chosen": -2.499812602996826, "logits/rejected": -2.496819019317627, "logps/chosen": -276.8181457519531, "logps/rejected": -286.4149475097656, "loss": 0.5104, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05198407173156738, "rewards/margins": 1.2382978200912476, "rewards/rejected": -1.1863137483596802, "step": 1300 }, { "epoch": 0.6802721088435374, "eval_logits/chosen": -2.490921974182129, "eval_logits/rejected": -2.428142547607422, "eval_logps/chosen": -279.54595947265625, "eval_logps/rejected": -283.33392333984375, "eval_loss": 0.5510684847831726, "eval_rewards/accuracies": 0.736328125, "eval_rewards/chosen": -0.4634132385253906, "eval_rewards/margins": 1.4282209873199463, "eval_rewards/rejected": -1.891634225845337, "eval_runtime": 220.722, "eval_samples_per_second": 9.061, "eval_steps_per_second": 0.145, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 239.01905328546653, "learning_rate": 4.286683465787943e-07, "logits/chosen": -2.533567428588867, "logits/rejected": -2.458756685256958, "logps/chosen": -297.37127685546875, "logps/rejected": -281.47052001953125, "loss": 0.5413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6005761623382568, "rewards/margins": 1.7634689807891846, "rewards/rejected": -2.3640449047088623, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 166.48098637858817, "learning_rate": 4.2769916650513666e-07, "logits/chosen": -2.441638469696045, "logits/rejected": -2.384211540222168, "logps/chosen": -285.3301696777344, "logps/rejected": -284.38592529296875, "loss": 0.5301, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2250484228134155, "rewards/margins": 1.5056023597717285, "rewards/rejected": -2.7306506633758545, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 169.79963486813256, "learning_rate": 4.2672998643147897e-07, "logits/chosen": -2.493367910385132, "logits/rejected": -2.4358468055725098, "logps/chosen": -302.03424072265625, "logps/rejected": -342.2644958496094, "loss": 0.5093, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1555052995681763, "rewards/margins": 1.876401662826538, "rewards/rejected": -3.031907320022583, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 184.25313625509744, "learning_rate": 4.2576080635782127e-07, "logits/chosen": -2.551159620285034, "logits/rejected": -2.445282459259033, "logps/chosen": -297.99407958984375, "logps/rejected": -275.96868896484375, "loss": 0.5166, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1398738622665405, "rewards/margins": 1.0286223888397217, "rewards/rejected": -2.1684963703155518, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 213.30967051409797, "learning_rate": 4.247916262841636e-07, "logits/chosen": -2.5155346393585205, "logits/rejected": -2.5223095417022705, "logps/chosen": -313.26446533203125, "logps/rejected": -303.25994873046875, "loss": 0.562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6499028205871582, "rewards/margins": 1.3472754955291748, "rewards/rejected": -1.997178316116333, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 147.07948472859036, "learning_rate": 4.2382244621050587e-07, "logits/chosen": -2.5422043800354004, "logits/rejected": -2.5028204917907715, "logps/chosen": -319.7339782714844, "logps/rejected": -310.37408447265625, "loss": 0.5532, "rewards/accuracies": 0.75, "rewards/chosen": -0.30898821353912354, "rewards/margins": 1.4711482524871826, "rewards/rejected": -1.7801364660263062, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 167.13008652691835, "learning_rate": 4.2285326613684817e-07, "logits/chosen": -2.5040907859802246, "logits/rejected": -2.4313254356384277, "logps/chosen": -294.1799621582031, "logps/rejected": -259.5082092285156, "loss": 0.5003, "rewards/accuracies": 0.75, "rewards/chosen": -0.681867241859436, "rewards/margins": 1.1391408443450928, "rewards/rejected": -1.8210079669952393, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 135.66062646682138, "learning_rate": 4.218840860631905e-07, "logits/chosen": -2.544250011444092, "logits/rejected": -2.454240322113037, "logps/chosen": -256.4124450683594, "logps/rejected": -242.14236450195312, "loss": 0.4746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24650950729846954, "rewards/margins": 1.5627297163009644, "rewards/rejected": -1.809239149093628, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 141.98887165416852, "learning_rate": 4.2091490598953283e-07, "logits/chosen": -2.455064296722412, "logits/rejected": -2.4052960872650146, "logps/chosen": -255.6050262451172, "logps/rejected": -254.6734619140625, "loss": 0.5219, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.19802771508693695, "rewards/margins": 1.0098148584365845, "rewards/rejected": -1.2078425884246826, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 142.7179033190639, "learning_rate": 4.1994572591587513e-07, "logits/chosen": -2.5321285724639893, "logits/rejected": -2.495742082595825, "logps/chosen": -259.5890197753906, "logps/rejected": -253.5409698486328, "loss": 0.4976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5032232403755188, "rewards/margins": 1.396849513053894, "rewards/rejected": -1.9000728130340576, "step": 1400 }, { "epoch": 0.7326007326007326, "eval_logits/chosen": -2.49470591545105, "eval_logits/rejected": -2.439544439315796, "eval_logps/chosen": -278.6596374511719, "eval_logps/rejected": -282.0693664550781, "eval_loss": 0.5413034558296204, "eval_rewards/accuracies": 0.736328125, "eval_rewards/chosen": -0.37477824091911316, "eval_rewards/margins": 1.3904030323028564, "eval_rewards/rejected": -1.7651811838150024, "eval_runtime": 222.103, "eval_samples_per_second": 9.005, "eval_steps_per_second": 0.144, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 183.67264397036737, "learning_rate": 4.189765458422175e-07, "logits/chosen": -2.4353489875793457, "logits/rejected": -2.3600656986236572, "logps/chosen": -257.83807373046875, "logps/rejected": -264.26593017578125, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": -0.42009425163269043, "rewards/margins": 1.4070782661437988, "rewards/rejected": -1.8271725177764893, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 173.65943947761392, "learning_rate": 4.180073657685598e-07, "logits/chosen": -2.501720905303955, "logits/rejected": -2.4469172954559326, "logps/chosen": -268.50128173828125, "logps/rejected": -258.08306884765625, "loss": 0.6925, "rewards/accuracies": 0.75, "rewards/chosen": -0.5226182341575623, "rewards/margins": 1.2983338832855225, "rewards/rejected": -1.8209521770477295, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 124.72824505769084, "learning_rate": 4.170381856949021e-07, "logits/chosen": -2.45367169380188, "logits/rejected": -2.461057186126709, "logps/chosen": -260.8750915527344, "logps/rejected": -246.1831512451172, "loss": 0.489, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3254598379135132, "rewards/margins": 1.1876275539398193, "rewards/rejected": -1.5130873918533325, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 163.60416275256887, "learning_rate": 4.1606900562124444e-07, "logits/chosen": -2.56366229057312, "logits/rejected": -2.4962949752807617, "logps/chosen": -308.81915283203125, "logps/rejected": -278.96429443359375, "loss": 0.5358, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.10903312265872955, "rewards/margins": 1.6170003414154053, "rewards/rejected": -1.507967233657837, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 116.22615804519079, "learning_rate": 4.150998255475867e-07, "logits/chosen": -2.5434489250183105, "logits/rejected": -2.48994779586792, "logps/chosen": -275.26190185546875, "logps/rejected": -266.0856018066406, "loss": 0.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5633369088172913, "rewards/margins": 1.0945422649383545, "rewards/rejected": -1.657879114151001, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 107.91899142906288, "learning_rate": 4.1413064547392904e-07, "logits/chosen": -2.535365581512451, "logits/rejected": -2.475520610809326, "logps/chosen": -270.49273681640625, "logps/rejected": -245.5968017578125, "loss": 0.5415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9368154406547546, "rewards/margins": 1.172522783279419, "rewards/rejected": -2.1093382835388184, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 111.26705191458912, "learning_rate": 4.1316146540027134e-07, "logits/chosen": -2.5238022804260254, "logits/rejected": -2.484795093536377, "logps/chosen": -267.14642333984375, "logps/rejected": -250.4296875, "loss": 0.5245, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7022029161453247, "rewards/margins": 1.3742183446884155, "rewards/rejected": -2.0764214992523193, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 98.64525904119165, "learning_rate": 4.1219228532661365e-07, "logits/chosen": -2.466796398162842, "logits/rejected": -2.4675745964050293, "logps/chosen": -255.50003051757812, "logps/rejected": -252.94491577148438, "loss": 0.6562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9924467206001282, "rewards/margins": 0.8215951919555664, "rewards/rejected": -1.8140418529510498, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 215.97158598679155, "learning_rate": 4.11223105252956e-07, "logits/chosen": -2.4044337272644043, "logits/rejected": -2.374552011489868, "logps/chosen": -276.64208984375, "logps/rejected": -267.0618896484375, "loss": 0.5068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8586554527282715, "rewards/margins": 1.2865362167358398, "rewards/rejected": -2.1451916694641113, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 141.09987055763153, "learning_rate": 4.102539251792983e-07, "logits/chosen": -2.4593889713287354, "logits/rejected": -2.4257540702819824, "logps/chosen": -235.09231567382812, "logps/rejected": -265.6543273925781, "loss": 0.4814, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9406391978263855, "rewards/margins": 1.449033260345459, "rewards/rejected": -2.3896727561950684, "step": 1500 }, { "epoch": 0.7849293563579278, "eval_logits/chosen": -2.4908220767974854, "eval_logits/rejected": -2.437608242034912, "eval_logps/chosen": -283.7967834472656, "eval_logps/rejected": -285.9394226074219, "eval_loss": 0.5446794629096985, "eval_rewards/accuracies": 0.73046875, "eval_rewards/chosen": -0.8884966969490051, "eval_rewards/margins": 1.26369047164917, "eval_rewards/rejected": -2.1521873474121094, "eval_runtime": 221.5083, "eval_samples_per_second": 9.029, "eval_steps_per_second": 0.144, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 166.65650172550812, "learning_rate": 4.092847451056406e-07, "logits/chosen": -2.47309947013855, "logits/rejected": -2.4738082885742188, "logps/chosen": -264.8742980957031, "logps/rejected": -289.4710388183594, "loss": 0.5707, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1635754108428955, "rewards/margins": 1.1796767711639404, "rewards/rejected": -2.343251943588257, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 77.74620582397179, "learning_rate": 4.0831556503198296e-07, "logits/chosen": -2.5175364017486572, "logits/rejected": -2.4560086727142334, "logps/chosen": -233.7759552001953, "logps/rejected": -238.1402130126953, "loss": 0.5091, "rewards/accuracies": 0.75, "rewards/chosen": -0.429145485162735, "rewards/margins": 1.3892393112182617, "rewards/rejected": -1.8183845281600952, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 191.63904232247225, "learning_rate": 4.0734638495832526e-07, "logits/chosen": -2.5004396438598633, "logits/rejected": -2.4902970790863037, "logps/chosen": -299.67523193359375, "logps/rejected": -370.71868896484375, "loss": 0.5567, "rewards/accuracies": 0.75, "rewards/chosen": -0.4221988618373871, "rewards/margins": 1.0676653385162354, "rewards/rejected": -1.4898641109466553, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 96.8941436061373, "learning_rate": 4.0637720488466756e-07, "logits/chosen": -2.5628292560577393, "logits/rejected": -2.5676965713500977, "logps/chosen": -316.46875, "logps/rejected": -304.85223388671875, "loss": 0.5157, "rewards/accuracies": 0.75, "rewards/chosen": -0.5200304388999939, "rewards/margins": 1.159173607826233, "rewards/rejected": -1.679203987121582, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 122.35852337824512, "learning_rate": 4.0540802481100986e-07, "logits/chosen": -2.4833531379699707, "logits/rejected": -2.465716600418091, "logps/chosen": -339.81494140625, "logps/rejected": -260.31304931640625, "loss": 0.6352, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2978518605232239, "rewards/margins": 1.162075161933899, "rewards/rejected": -1.4599268436431885, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 143.1385629582226, "learning_rate": 4.0443884473735216e-07, "logits/chosen": -2.5210728645324707, "logits/rejected": -2.478987216949463, "logps/chosen": -306.0292053222656, "logps/rejected": -305.5182800292969, "loss": 0.5756, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4977649748325348, "rewards/margins": 1.1192047595977783, "rewards/rejected": -1.6169697046279907, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 124.27814642519493, "learning_rate": 4.0346966466369446e-07, "logits/chosen": -2.5749659538269043, "logits/rejected": -2.506140947341919, "logps/chosen": -252.3812713623047, "logps/rejected": -244.5928955078125, "loss": 0.507, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4428938031196594, "rewards/margins": 1.0747992992401123, "rewards/rejected": -1.5176931619644165, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 166.8607273254739, "learning_rate": 4.025004845900368e-07, "logits/chosen": -2.5749292373657227, "logits/rejected": -2.4504244327545166, "logps/chosen": -314.1010437011719, "logps/rejected": -281.9030456542969, "loss": 0.5884, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.4914395809173584, "rewards/margins": 1.583787202835083, "rewards/rejected": -2.0752265453338623, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 143.91808310394666, "learning_rate": 4.015313045163791e-07, "logits/chosen": -2.4221293926239014, "logits/rejected": -2.398528814315796, "logps/chosen": -279.8281555175781, "logps/rejected": -266.41253662109375, "loss": 0.5022, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31418636441230774, "rewards/margins": 1.6013147830963135, "rewards/rejected": -1.9155009984970093, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 170.05849132865947, "learning_rate": 4.005621244427214e-07, "logits/chosen": -2.449965715408325, "logits/rejected": -2.453136920928955, "logps/chosen": -259.78948974609375, "logps/rejected": -295.80865478515625, "loss": 0.5075, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4218333661556244, "rewards/margins": 1.2140836715698242, "rewards/rejected": -1.6359169483184814, "step": 1600 }, { "epoch": 0.837257980115123, "eval_logits/chosen": -2.481595516204834, "eval_logits/rejected": -2.4315741062164307, "eval_logps/chosen": -277.9630126953125, "eval_logps/rejected": -279.6702880859375, "eval_loss": 0.5422512888908386, "eval_rewards/accuracies": 0.734375, "eval_rewards/chosen": -0.3051190972328186, "eval_rewards/margins": 1.220156192779541, "eval_rewards/rejected": -1.5252752304077148, "eval_runtime": 221.7688, "eval_samples_per_second": 9.018, "eval_steps_per_second": 0.144, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 164.42822880388547, "learning_rate": 3.995929443690638e-07, "logits/chosen": -2.5344762802124023, "logits/rejected": -2.4833996295928955, "logps/chosen": -320.34088134765625, "logps/rejected": -287.2344055175781, "loss": 0.5716, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.043704282492399216, "rewards/margins": 1.3076083660125732, "rewards/rejected": -1.3513127565383911, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 97.57314842046294, "learning_rate": 3.986237642954061e-07, "logits/chosen": -2.470188856124878, "logits/rejected": -2.431584119796753, "logps/chosen": -266.12139892578125, "logps/rejected": -233.32315063476562, "loss": 0.5204, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3589034676551819, "rewards/margins": 1.0557621717453003, "rewards/rejected": -1.4146654605865479, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 148.30955288710922, "learning_rate": 3.976545842217484e-07, "logits/chosen": -2.5123963356018066, "logits/rejected": -2.505862236022949, "logps/chosen": -281.48358154296875, "logps/rejected": -296.37115478515625, "loss": 0.5409, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.04674839228391647, "rewards/margins": 1.3475592136383057, "rewards/rejected": -1.3943077325820923, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 146.7703277900639, "learning_rate": 3.9668540414809073e-07, "logits/chosen": -2.5619356632232666, "logits/rejected": -2.5297129154205322, "logps/chosen": -327.06719970703125, "logps/rejected": -296.89935302734375, "loss": 0.5395, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1449117362499237, "rewards/margins": 1.44657301902771, "rewards/rejected": -1.3016613721847534, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 140.51534876556468, "learning_rate": 3.95716224074433e-07, "logits/chosen": -2.5154123306274414, "logits/rejected": -2.4927542209625244, "logps/chosen": -277.987060546875, "logps/rejected": -282.1713562011719, "loss": 0.5698, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.45631998777389526, "rewards/margins": 1.1773972511291504, "rewards/rejected": -1.6337172985076904, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 113.27839119737939, "learning_rate": 3.9474704400077533e-07, "logits/chosen": -2.5088717937469482, "logits/rejected": -2.498750925064087, "logps/chosen": -244.3797607421875, "logps/rejected": -248.1291961669922, "loss": 0.5557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1549554169178009, "rewards/margins": 1.2866535186767578, "rewards/rejected": -1.4416089057922363, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 170.6188313767065, "learning_rate": 3.9377786392711764e-07, "logits/chosen": -2.566392183303833, "logits/rejected": -2.5167078971862793, "logps/chosen": -306.20819091796875, "logps/rejected": -277.7020263671875, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -0.25892359018325806, "rewards/margins": 1.329339623451233, "rewards/rejected": -1.5882632732391357, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 155.83134604858014, "learning_rate": 3.9280868385345994e-07, "logits/chosen": -2.463184356689453, "logits/rejected": -2.428490400314331, "logps/chosen": -259.9377136230469, "logps/rejected": -286.1490478515625, "loss": 0.4824, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09590394049882889, "rewards/margins": 1.6791601181030273, "rewards/rejected": -1.5832561254501343, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 153.72202428955256, "learning_rate": 3.918395037798023e-07, "logits/chosen": -2.4354023933410645, "logits/rejected": -2.4248557090759277, "logps/chosen": -268.0959777832031, "logps/rejected": -256.25738525390625, "loss": 0.4914, "rewards/accuracies": 0.75, "rewards/chosen": -0.2448834478855133, "rewards/margins": 1.4674071073532104, "rewards/rejected": -1.7122905254364014, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 129.81123016583018, "learning_rate": 3.908703237061446e-07, "logits/chosen": -2.5093679428100586, "logits/rejected": -2.392098903656006, "logps/chosen": -304.1024169921875, "logps/rejected": -251.80810546875, "loss": 0.4906, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8067069053649902, "rewards/margins": 1.5915721654891968, "rewards/rejected": -2.3982789516448975, "step": 1700 }, { "epoch": 0.8895866038723181, "eval_logits/chosen": -2.543774127960205, "eval_logits/rejected": -2.4876222610473633, "eval_logps/chosen": -289.7531433105469, "eval_logps/rejected": -294.62957763671875, "eval_loss": 0.5805842280387878, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -1.4841312170028687, "eval_rewards/margins": 1.5370687246322632, "eval_rewards/rejected": -3.021199941635132, "eval_runtime": 211.989, "eval_samples_per_second": 9.434, "eval_steps_per_second": 0.151, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 214.77108108383771, "learning_rate": 3.899011436324869e-07, "logits/chosen": -2.5504047870635986, "logits/rejected": -2.4949657917022705, "logps/chosen": -305.72149658203125, "logps/rejected": -300.49371337890625, "loss": 0.5528, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5784311294555664, "rewards/margins": 1.343805193901062, "rewards/rejected": -2.9222359657287598, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 102.43980242703353, "learning_rate": 3.8893196355882925e-07, "logits/chosen": -2.565277576446533, "logits/rejected": -2.5153427124023438, "logps/chosen": -302.40545654296875, "logps/rejected": -280.21630859375, "loss": 0.5296, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5779753923416138, "rewards/margins": 1.3034757375717163, "rewards/rejected": -2.88145112991333, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 170.38195664347597, "learning_rate": 3.8796278348517155e-07, "logits/chosen": -2.587348461151123, "logits/rejected": -2.565417766571045, "logps/chosen": -264.1243591308594, "logps/rejected": -274.12713623046875, "loss": 0.466, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.85944002866745, "rewards/margins": 1.6328818798065186, "rewards/rejected": -2.4923219680786133, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 395.8000734909668, "learning_rate": 3.869936034115138e-07, "logits/chosen": -2.70135498046875, "logits/rejected": -2.6039390563964844, "logps/chosen": -298.68853759765625, "logps/rejected": -267.6843566894531, "loss": 0.6612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.062886118888855, "rewards/margins": 1.324129343032837, "rewards/rejected": -2.3870153427124023, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 170.5489257816093, "learning_rate": 3.8602442333785615e-07, "logits/chosen": -2.6394999027252197, "logits/rejected": -2.622602701187134, "logps/chosen": -339.5021057128906, "logps/rejected": -314.21630859375, "loss": 0.5511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.590509295463562, "rewards/margins": 1.4463316202163696, "rewards/rejected": -2.0368409156799316, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 145.519032885826, "learning_rate": 3.8505524326419845e-07, "logits/chosen": -2.682774782180786, "logits/rejected": -2.6845996379852295, "logps/chosen": -281.56475830078125, "logps/rejected": -313.89385986328125, "loss": 0.4835, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.693749189376831, "rewards/margins": 1.1486456394195557, "rewards/rejected": -1.8423948287963867, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 143.04943520338836, "learning_rate": 3.8408606319054076e-07, "logits/chosen": -2.606842517852783, "logits/rejected": -2.587186813354492, "logps/chosen": -301.6654968261719, "logps/rejected": -308.6470642089844, "loss": 0.5769, "rewards/accuracies": 0.75, "rewards/chosen": -0.7865618467330933, "rewards/margins": 1.0558887720108032, "rewards/rejected": -1.842450499534607, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 180.58804947072625, "learning_rate": 3.831168831168831e-07, "logits/chosen": -2.638139009475708, "logits/rejected": -2.6697072982788086, "logps/chosen": -287.81903076171875, "logps/rejected": -287.5301513671875, "loss": 0.5225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6316205263137817, "rewards/margins": 1.101332426071167, "rewards/rejected": -1.7329530715942383, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 135.31196899691398, "learning_rate": 3.821477030432254e-07, "logits/chosen": -2.6357929706573486, "logits/rejected": -2.6347250938415527, "logps/chosen": -309.68280029296875, "logps/rejected": -305.3419494628906, "loss": 0.4791, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3846716284751892, "rewards/margins": 1.1850051879882812, "rewards/rejected": -1.5696769952774048, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 128.45934589699183, "learning_rate": 3.811785229695677e-07, "logits/chosen": -2.529435634613037, "logits/rejected": -2.53661847114563, "logps/chosen": -262.2623291015625, "logps/rejected": -294.1903381347656, "loss": 0.536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7408838868141174, "rewards/margins": 1.352765679359436, "rewards/rejected": -2.093649387359619, "step": 1800 }, { "epoch": 0.9419152276295133, "eval_logits/chosen": -2.6123387813568115, "eval_logits/rejected": -2.5694358348846436, "eval_logps/chosen": -280.86248779296875, "eval_logps/rejected": -286.1272277832031, "eval_loss": 0.5602679252624512, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.5950646996498108, "eval_rewards/margins": 1.575901985168457, "eval_rewards/rejected": -2.170966863632202, "eval_runtime": 221.2848, "eval_samples_per_second": 9.038, "eval_steps_per_second": 0.145, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 124.99661874057675, "learning_rate": 3.8020934289591007e-07, "logits/chosen": -2.530865430831909, "logits/rejected": -2.490518093109131, "logps/chosen": -241.8687286376953, "logps/rejected": -236.9424285888672, "loss": 0.5387, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6466684341430664, "rewards/margins": 1.3744982481002808, "rewards/rejected": -2.0211665630340576, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 149.58383871625242, "learning_rate": 3.7924016282225237e-07, "logits/chosen": -2.5045065879821777, "logits/rejected": -2.514580249786377, "logps/chosen": -215.92105102539062, "logps/rejected": -227.4513397216797, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": -0.4417724013328552, "rewards/margins": 1.4011589288711548, "rewards/rejected": -1.8429313898086548, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 197.94498159035226, "learning_rate": 3.7827098274859467e-07, "logits/chosen": -2.645369052886963, "logits/rejected": -2.6039271354675293, "logps/chosen": -293.3324890136719, "logps/rejected": -272.63433837890625, "loss": 0.5996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5344354510307312, "rewards/margins": 1.3509337902069092, "rewards/rejected": -1.885369062423706, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 204.2051463674645, "learning_rate": 3.77301802674937e-07, "logits/chosen": -2.602388858795166, "logits/rejected": -2.5755600929260254, "logps/chosen": -289.50811767578125, "logps/rejected": -266.75994873046875, "loss": 0.5347, "rewards/accuracies": 0.75, "rewards/chosen": -0.42006921768188477, "rewards/margins": 1.3145217895507812, "rewards/rejected": -1.7345911264419556, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 158.0448017832084, "learning_rate": 3.7633262260127927e-07, "logits/chosen": -2.5859758853912354, "logits/rejected": -2.511589527130127, "logps/chosen": -353.5319519042969, "logps/rejected": -305.66925048828125, "loss": 0.51, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.03327353671193123, "rewards/margins": 1.8203575611114502, "rewards/rejected": -1.7870838642120361, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 113.14731360390215, "learning_rate": 3.753634425276216e-07, "logits/chosen": -2.533775568008423, "logits/rejected": -2.4630985260009766, "logps/chosen": -279.3914489746094, "logps/rejected": -258.51043701171875, "loss": 0.4871, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3653745651245117, "rewards/margins": 1.4163885116577148, "rewards/rejected": -1.7817630767822266, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 173.915707126604, "learning_rate": 3.7439426245396393e-07, "logits/chosen": -2.5380027294158936, "logits/rejected": -2.483966827392578, "logps/chosen": -264.63861083984375, "logps/rejected": -245.9993896484375, "loss": 0.4995, "rewards/accuracies": 0.75, "rewards/chosen": -0.15691910684108734, "rewards/margins": 1.3518301248550415, "rewards/rejected": -1.50874924659729, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 106.96520907906732, "learning_rate": 3.7342508238030623e-07, "logits/chosen": -2.5838708877563477, "logits/rejected": -2.6150050163269043, "logps/chosen": -266.4748229980469, "logps/rejected": -305.1725769042969, "loss": 0.5023, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3127604126930237, "rewards/margins": 1.3518893718719482, "rewards/rejected": -1.6646497249603271, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 139.82772790798523, "learning_rate": 3.724559023066486e-07, "logits/chosen": -2.548110008239746, "logits/rejected": -2.5657029151916504, "logps/chosen": -358.4595031738281, "logps/rejected": -333.7761535644531, "loss": 0.4851, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5128358602523804, "rewards/margins": 1.105817198753357, "rewards/rejected": -1.6186530590057373, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 156.99395418520152, "learning_rate": 3.714867222329909e-07, "logits/chosen": -2.608574628829956, "logits/rejected": -2.567964553833008, "logps/chosen": -258.54132080078125, "logps/rejected": -271.852294921875, "loss": 0.5164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6199243664741516, "rewards/margins": 1.3309756517410278, "rewards/rejected": -1.9508998394012451, "step": 1900 }, { "epoch": 0.9942438513867086, "eval_logits/chosen": -2.589801788330078, "eval_logits/rejected": -2.548985004425049, "eval_logps/chosen": -280.3160400390625, "eval_logps/rejected": -284.59088134765625, "eval_loss": 0.5567057132720947, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -0.5404185652732849, "eval_rewards/margins": 1.4769104719161987, "eval_rewards/rejected": -2.017328977584839, "eval_runtime": 217.3305, "eval_samples_per_second": 9.203, "eval_steps_per_second": 0.147, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 142.67679178971133, "learning_rate": 3.705175421593332e-07, "logits/chosen": -2.566635847091675, "logits/rejected": -2.5628764629364014, "logps/chosen": -331.7229309082031, "logps/rejected": -292.23760986328125, "loss": 0.4741, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.46179676055908203, "rewards/margins": 1.625088095664978, "rewards/rejected": -2.0868849754333496, "step": 1910 }, { "epoch": 1.0047095761381475, "grad_norm": 83.99064142989266, "learning_rate": 3.6954836208567554e-07, "logits/chosen": -2.5690746307373047, "logits/rejected": -2.5656113624572754, "logps/chosen": -224.0857696533203, "logps/rejected": -249.86416625976562, "loss": 0.1819, "rewards/accuracies": 0.875, "rewards/chosen": 0.8771098852157593, "rewards/margins": 3.3883590698242188, "rewards/rejected": -2.511249303817749, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 19.844852979405353, "learning_rate": 3.6857918201201784e-07, "logits/chosen": -2.6455276012420654, "logits/rejected": -2.529503107070923, "logps/chosen": -310.1123352050781, "logps/rejected": -309.6076354980469, "loss": 0.0726, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.769474744796753, "rewards/margins": 6.855545997619629, "rewards/rejected": -5.086070537567139, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 30.685161117919282, "learning_rate": 3.676100019383601e-07, "logits/chosen": -2.513944625854492, "logits/rejected": -2.5131325721740723, "logps/chosen": -292.4463806152344, "logps/rejected": -338.8755187988281, "loss": 0.0889, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9417669177055359, "rewards/margins": 5.887537479400635, "rewards/rejected": -4.945770740509033, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 29.284045084919008, "learning_rate": 3.6664082186470244e-07, "logits/chosen": -2.538607120513916, "logits/rejected": -2.5373408794403076, "logps/chosen": -233.18399047851562, "logps/rejected": -278.76324462890625, "loss": 0.0897, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0127862691879272, "rewards/margins": 5.298192024230957, "rewards/rejected": -4.285406112670898, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 27.25427937693851, "learning_rate": 3.6567164179104475e-07, "logits/chosen": -2.595907688140869, "logits/rejected": -2.507371187210083, "logps/chosen": -266.329345703125, "logps/rejected": -262.8670349121094, "loss": 0.0986, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9820111989974976, "rewards/margins": 5.609177589416504, "rewards/rejected": -4.627166748046875, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 53.8535007077475, "learning_rate": 3.6470246171738705e-07, "logits/chosen": -2.6404824256896973, "logits/rejected": -2.5781140327453613, "logps/chosen": -318.8059997558594, "logps/rejected": -326.7923278808594, "loss": 0.0682, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.434751033782959, "rewards/margins": 7.253692626953125, "rewards/rejected": -5.81894063949585, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 26.67680243764992, "learning_rate": 3.637332816437294e-07, "logits/chosen": -2.70832896232605, "logits/rejected": -2.680431842803955, "logps/chosen": -237.98770141601562, "logps/rejected": -279.08843994140625, "loss": 0.0857, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26068323850631714, "rewards/margins": 4.834329605102539, "rewards/rejected": -4.57364559173584, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 44.9170405164323, "learning_rate": 3.627641015700717e-07, "logits/chosen": -2.647665500640869, "logits/rejected": -2.562319040298462, "logps/chosen": -318.2755126953125, "logps/rejected": -355.1710205078125, "loss": 0.1216, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9323193430900574, "rewards/margins": 6.488001823425293, "rewards/rejected": -5.555683612823486, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 16.758345514955288, "learning_rate": 3.61794921496414e-07, "logits/chosen": -2.7348294258117676, "logits/rejected": -2.667773723602295, "logps/chosen": -269.48968505859375, "logps/rejected": -291.2582092285156, "loss": 0.0947, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1493604183197021, "rewards/margins": 5.979822635650635, "rewards/rejected": -4.830462455749512, "step": 2000 }, { "epoch": 1.0465724751439036, "eval_logits/chosen": -2.6140401363372803, "eval_logits/rejected": -2.562195301055908, "eval_logps/chosen": -285.5295715332031, "eval_logps/rejected": -294.4039306640625, "eval_loss": 0.5942206978797913, "eval_rewards/accuracies": 0.734375, "eval_rewards/chosen": -1.061771273612976, "eval_rewards/margins": 1.936864972114563, "eval_rewards/rejected": -2.998636484146118, "eval_runtime": 218.9003, "eval_samples_per_second": 9.137, "eval_steps_per_second": 0.146, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 33.2492102121073, "learning_rate": 3.6082574142275636e-07, "logits/chosen": -2.6531131267547607, "logits/rejected": -2.5694422721862793, "logps/chosen": -255.3903350830078, "logps/rejected": -285.6134948730469, "loss": 0.0867, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8171914219856262, "rewards/margins": 6.1431965827941895, "rewards/rejected": -5.326004981994629, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 118.35474437663159, "learning_rate": 3.5985656134909866e-07, "logits/chosen": -2.571728229522705, "logits/rejected": -2.5225942134857178, "logps/chosen": -267.7909851074219, "logps/rejected": -276.00494384765625, "loss": 0.0927, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.693240761756897, "rewards/margins": 6.16481876373291, "rewards/rejected": -5.4715776443481445, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 97.02309892702286, "learning_rate": 3.5888738127544096e-07, "logits/chosen": -2.634601593017578, "logits/rejected": -2.5507116317749023, "logps/chosen": -282.5007629394531, "logps/rejected": -336.6316833496094, "loss": 0.0952, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10788961499929428, "rewards/margins": 6.554953575134277, "rewards/rejected": -6.447064399719238, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 20.893925248508094, "learning_rate": 3.5791820120178326e-07, "logits/chosen": -2.6180758476257324, "logits/rejected": -2.5799307823181152, "logps/chosen": -326.1679382324219, "logps/rejected": -339.5617370605469, "loss": 0.067, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7429353594779968, "rewards/margins": 6.354781150817871, "rewards/rejected": -5.61184549331665, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 46.72842660080384, "learning_rate": 3.5694902112812556e-07, "logits/chosen": -2.678995132446289, "logits/rejected": -2.5777716636657715, "logps/chosen": -296.65460205078125, "logps/rejected": -318.64141845703125, "loss": 0.0885, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.59366375207901, "rewards/margins": 6.667717933654785, "rewards/rejected": -6.074053764343262, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 68.19996679219317, "learning_rate": 3.5597984105446787e-07, "logits/chosen": -2.559574604034424, "logits/rejected": -2.552882671356201, "logps/chosen": -242.74929809570312, "logps/rejected": -316.19903564453125, "loss": 0.0891, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6113213300704956, "rewards/margins": 6.956334590911865, "rewards/rejected": -6.345013618469238, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 28.789002996755432, "learning_rate": 3.550106609808102e-07, "logits/chosen": -2.5868875980377197, "logits/rejected": -2.5035669803619385, "logps/chosen": -224.7720947265625, "logps/rejected": -252.1869354248047, "loss": 0.0827, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6460567116737366, "rewards/margins": 5.362710475921631, "rewards/rejected": -4.7166547775268555, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 15.20769976333776, "learning_rate": 3.540414809071525e-07, "logits/chosen": -2.6275830268859863, "logits/rejected": -2.5398662090301514, "logps/chosen": -279.9439697265625, "logps/rejected": -320.3835754394531, "loss": 0.0935, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3664414882659912, "rewards/margins": 5.956545829772949, "rewards/rejected": -4.590105056762695, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 46.41216511557465, "learning_rate": 3.530723008334949e-07, "logits/chosen": -2.5760316848754883, "logits/rejected": -2.5261178016662598, "logps/chosen": -226.85537719726562, "logps/rejected": -293.6641540527344, "loss": 0.1003, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9136565923690796, "rewards/margins": 6.967381954193115, "rewards/rejected": -6.053724765777588, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 64.29466076025598, "learning_rate": 3.521031207598372e-07, "logits/chosen": -2.5962941646575928, "logits/rejected": -2.508232593536377, "logps/chosen": -265.7822265625, "logps/rejected": -303.4806213378906, "loss": 0.068, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2600649297237396, "rewards/margins": 6.674195289611816, "rewards/rejected": -6.414130210876465, "step": 2100 }, { "epoch": 1.098901098901099, "eval_logits/chosen": -2.504183053970337, "eval_logits/rejected": -2.4360804557800293, "eval_logps/chosen": -291.368896484375, "eval_logps/rejected": -303.51092529296875, "eval_loss": 0.6230309009552002, "eval_rewards/accuracies": 0.751953125, "eval_rewards/chosen": -1.6457051038742065, "eval_rewards/margins": 2.2636306285858154, "eval_rewards/rejected": -3.9093360900878906, "eval_runtime": 219.837, "eval_samples_per_second": 9.098, "eval_steps_per_second": 0.146, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 10.860328755306922, "learning_rate": 3.511339406861795e-07, "logits/chosen": -2.6003963947296143, "logits/rejected": -2.4945778846740723, "logps/chosen": -292.38177490234375, "logps/rejected": -307.3580322265625, "loss": 0.0814, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6319004893302917, "rewards/margins": 6.906917572021484, "rewards/rejected": -6.275016784667969, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 101.30142938815428, "learning_rate": 3.5016476061252183e-07, "logits/chosen": -2.443326473236084, "logits/rejected": -2.3479177951812744, "logps/chosen": -237.6367950439453, "logps/rejected": -298.344970703125, "loss": 0.1, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5111889243125916, "rewards/margins": 5.8057355880737305, "rewards/rejected": -5.294546604156494, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 103.06888470526997, "learning_rate": 3.4919558053886413e-07, "logits/chosen": -2.5572869777679443, "logits/rejected": -2.438095808029175, "logps/chosen": -290.11907958984375, "logps/rejected": -283.16680908203125, "loss": 0.1022, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.14488485455513, "rewards/margins": 6.002516746520996, "rewards/rejected": -5.857631683349609, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 87.69003326173915, "learning_rate": 3.482264004652064e-07, "logits/chosen": -2.5851757526397705, "logits/rejected": -2.531933307647705, "logps/chosen": -262.0079345703125, "logps/rejected": -331.93231201171875, "loss": 0.1159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.35401642322540283, "rewards/margins": 7.553130149841309, "rewards/rejected": -7.1991143226623535, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 59.103360795418816, "learning_rate": 3.4725722039154874e-07, "logits/chosen": -2.5091516971588135, "logits/rejected": -2.539517641067505, "logps/chosen": -231.0509490966797, "logps/rejected": -331.1637268066406, "loss": 0.1328, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.25902068614959717, "rewards/margins": 6.4642462730407715, "rewards/rejected": -6.723267555236816, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 61.44895007516442, "learning_rate": 3.4628804031789104e-07, "logits/chosen": -2.5413658618927, "logits/rejected": -2.479661703109741, "logps/chosen": -267.0455627441406, "logps/rejected": -309.9064636230469, "loss": 0.1103, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4228421151638031, "rewards/margins": 6.322303771972656, "rewards/rejected": -5.8994622230529785, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 108.90727378063659, "learning_rate": 3.4531886024423334e-07, "logits/chosen": -2.5645623207092285, "logits/rejected": -2.5462088584899902, "logps/chosen": -285.64056396484375, "logps/rejected": -322.49359130859375, "loss": 0.0821, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4664950370788574, "rewards/margins": 6.3856706619262695, "rewards/rejected": -5.919175148010254, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 20.123588926719382, "learning_rate": 3.443496801705757e-07, "logits/chosen": -2.5720646381378174, "logits/rejected": -2.518656015396118, "logps/chosen": -251.54025268554688, "logps/rejected": -321.843994140625, "loss": 0.1013, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.45924264192581177, "rewards/margins": 6.617946624755859, "rewards/rejected": -6.158703327178955, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 23.96653285607074, "learning_rate": 3.43380500096918e-07, "logits/chosen": -2.41733980178833, "logits/rejected": -2.4011635780334473, "logps/chosen": -257.10015869140625, "logps/rejected": -312.45989990234375, "loss": 0.0803, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0887957438826561, "rewards/margins": 6.416755676269531, "rewards/rejected": -6.32796049118042, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 57.090672557206666, "learning_rate": 3.424113200232603e-07, "logits/chosen": -2.492663860321045, "logits/rejected": -2.3912405967712402, "logps/chosen": -304.73773193359375, "logps/rejected": -326.7081604003906, "loss": 0.0747, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7975115180015564, "rewards/margins": 7.34287166595459, "rewards/rejected": -6.545360565185547, "step": 2200 }, { "epoch": 1.1512297226582942, "eval_logits/chosen": -2.454184055328369, "eval_logits/rejected": -2.384427547454834, "eval_logps/chosen": -288.1795349121094, "eval_logps/rejected": -299.36212158203125, "eval_loss": 0.6290668845176697, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": -1.3267689943313599, "eval_rewards/margins": 2.1676881313323975, "eval_rewards/rejected": -3.494457244873047, "eval_runtime": 219.5418, "eval_samples_per_second": 9.11, "eval_steps_per_second": 0.146, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 62.362264101565046, "learning_rate": 3.4144213994960265e-07, "logits/chosen": -2.4853081703186035, "logits/rejected": -2.3752617835998535, "logps/chosen": -266.93377685546875, "logps/rejected": -283.7090148925781, "loss": 0.059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8152819871902466, "rewards/margins": 5.81587028503418, "rewards/rejected": -5.000587463378906, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 20.89884977628219, "learning_rate": 3.4047295987594495e-07, "logits/chosen": -2.4830286502838135, "logits/rejected": -2.3888659477233887, "logps/chosen": -240.0126495361328, "logps/rejected": -312.7715759277344, "loss": 0.0608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8533409237861633, "rewards/margins": 7.290159702301025, "rewards/rejected": -6.4368181228637695, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 144.8801741116916, "learning_rate": 3.395037798022872e-07, "logits/chosen": -2.5233261585235596, "logits/rejected": -2.495266914367676, "logps/chosen": -304.90948486328125, "logps/rejected": -326.25531005859375, "loss": 0.0995, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9265473484992981, "rewards/margins": 6.685955047607422, "rewards/rejected": -5.7594075202941895, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 19.323429873039665, "learning_rate": 3.3853459972862955e-07, "logits/chosen": -2.463965892791748, "logits/rejected": -2.3838164806365967, "logps/chosen": -244.03466796875, "logps/rejected": -324.40484619140625, "loss": 0.1055, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5131000280380249, "rewards/margins": 7.011256217956543, "rewards/rejected": -7.524355888366699, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 79.53171515380589, "learning_rate": 3.3756541965497186e-07, "logits/chosen": -2.609351634979248, "logits/rejected": -2.5427794456481934, "logps/chosen": -295.57427978515625, "logps/rejected": -350.0030517578125, "loss": 0.0826, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009426573291420937, "rewards/margins": 6.208491325378418, "rewards/rejected": -6.1990647315979, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 29.475988185437906, "learning_rate": 3.3659623958131416e-07, "logits/chosen": -2.6280598640441895, "logits/rejected": -2.5272316932678223, "logps/chosen": -274.50177001953125, "logps/rejected": -330.94268798828125, "loss": 0.0887, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23656003177165985, "rewards/margins": 7.026780128479004, "rewards/rejected": -6.790220737457275, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 70.763134919508, "learning_rate": 3.356270595076565e-07, "logits/chosen": -2.5598199367523193, "logits/rejected": -2.506298780441284, "logps/chosen": -310.0873718261719, "logps/rejected": -354.651123046875, "loss": 0.1039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0659801959991455, "rewards/margins": 7.446995735168457, "rewards/rejected": -6.381015777587891, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 44.459780975318864, "learning_rate": 3.346578794339988e-07, "logits/chosen": -2.6559979915618896, "logits/rejected": -2.628723621368408, "logps/chosen": -286.8047180175781, "logps/rejected": -319.1341247558594, "loss": 0.0748, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8429773449897766, "rewards/margins": 7.0614166259765625, "rewards/rejected": -6.218438625335693, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 63.46197054845441, "learning_rate": 3.3368869936034117e-07, "logits/chosen": -2.562051296234131, "logits/rejected": -2.5249762535095215, "logps/chosen": -252.7982635498047, "logps/rejected": -332.78155517578125, "loss": 0.0713, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8328564763069153, "rewards/margins": 7.555397033691406, "rewards/rejected": -6.722540378570557, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 7.208044596433599, "learning_rate": 3.3271951928668347e-07, "logits/chosen": -2.670645236968994, "logits/rejected": -2.624894857406616, "logps/chosen": -271.76171875, "logps/rejected": -338.7787170410156, "loss": 0.0553, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4662889540195465, "rewards/margins": 7.662364959716797, "rewards/rejected": -7.196076393127441, "step": 2300 }, { "epoch": 1.2035583464154893, "eval_logits/chosen": -2.5616238117218018, "eval_logits/rejected": -2.488901138305664, "eval_logps/chosen": -297.120849609375, "eval_logps/rejected": -310.91986083984375, "eval_loss": 0.6764951944351196, "eval_rewards/accuracies": 0.734375, "eval_rewards/chosen": -2.220900774002075, "eval_rewards/margins": 2.4293301105499268, "eval_rewards/rejected": -4.650230884552002, "eval_runtime": 218.4357, "eval_samples_per_second": 9.156, "eval_steps_per_second": 0.146, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 181.62997151878236, "learning_rate": 3.3175033921302577e-07, "logits/chosen": -2.595334768295288, "logits/rejected": -2.4850056171417236, "logps/chosen": -260.6838684082031, "logps/rejected": -310.8998718261719, "loss": 0.1524, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2681838572025299, "rewards/margins": 7.936936855316162, "rewards/rejected": -7.668752193450928, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 36.910826971613254, "learning_rate": 3.307811591393681e-07, "logits/chosen": -2.6260647773742676, "logits/rejected": -2.514893054962158, "logps/chosen": -276.207763671875, "logps/rejected": -312.2389831542969, "loss": 0.1142, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.653515636920929, "rewards/margins": 7.620291709899902, "rewards/rejected": -6.966776371002197, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 76.49036029628826, "learning_rate": 3.298119790657104e-07, "logits/chosen": -2.576707601547241, "logits/rejected": -2.550014019012451, "logps/chosen": -276.0647888183594, "logps/rejected": -316.4588928222656, "loss": 0.0755, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9700978994369507, "rewards/margins": 7.391635894775391, "rewards/rejected": -6.42153787612915, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 28.595978990938736, "learning_rate": 3.288427989920527e-07, "logits/chosen": -2.5998644828796387, "logits/rejected": -2.558825969696045, "logps/chosen": -242.4022674560547, "logps/rejected": -294.2389221191406, "loss": 0.077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7205726504325867, "rewards/margins": 7.46164608001709, "rewards/rejected": -6.7410736083984375, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 54.80316559515418, "learning_rate": 3.2787361891839503e-07, "logits/chosen": -2.5806260108947754, "logits/rejected": -2.5337390899658203, "logps/chosen": -269.5099792480469, "logps/rejected": -315.20086669921875, "loss": 0.1014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5756562948226929, "rewards/margins": 7.119746208190918, "rewards/rejected": -6.544090270996094, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 56.088315382432135, "learning_rate": 3.2690443884473733e-07, "logits/chosen": -2.6411519050598145, "logits/rejected": -2.6007795333862305, "logps/chosen": -280.2635803222656, "logps/rejected": -315.6443176269531, "loss": 0.1622, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3262324631214142, "rewards/margins": 6.335809230804443, "rewards/rejected": -6.00957727432251, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 106.35211259583042, "learning_rate": 3.2593525877107963e-07, "logits/chosen": -2.684727907180786, "logits/rejected": -2.6050662994384766, "logps/chosen": -299.05804443359375, "logps/rejected": -330.98114013671875, "loss": 0.0975, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39265066385269165, "rewards/margins": 7.139306545257568, "rewards/rejected": -6.7466559410095215, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 61.25351439760001, "learning_rate": 3.24966078697422e-07, "logits/chosen": -2.6198008060455322, "logits/rejected": -2.542973279953003, "logps/chosen": -271.54669189453125, "logps/rejected": -313.18157958984375, "loss": 0.0887, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.22702760994434357, "rewards/margins": 6.568183898925781, "rewards/rejected": -6.7952117919921875, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 48.025970507472586, "learning_rate": 3.239968986237643e-07, "logits/chosen": -2.4945690631866455, "logits/rejected": -2.4734139442443848, "logps/chosen": -335.92254638671875, "logps/rejected": -384.5530090332031, "loss": 0.095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5907285809516907, "rewards/margins": 7.474919319152832, "rewards/rejected": -6.884189605712891, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 70.50763246133272, "learning_rate": 3.230277185501066e-07, "logits/chosen": -2.592679500579834, "logits/rejected": -2.5237956047058105, "logps/chosen": -308.12725830078125, "logps/rejected": -343.26483154296875, "loss": 0.1207, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.452779620885849, "rewards/margins": 7.165650367736816, "rewards/rejected": -6.7128705978393555, "step": 2400 }, { "epoch": 1.2558869701726845, "eval_logits/chosen": -2.509161949157715, "eval_logits/rejected": -2.445741891860962, "eval_logps/chosen": -292.0694885253906, "eval_logps/rejected": -304.0018005371094, "eval_loss": 0.6530308723449707, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -1.7157639265060425, "eval_rewards/margins": 2.242658853530884, "eval_rewards/rejected": -3.9584226608276367, "eval_runtime": 221.5696, "eval_samples_per_second": 9.027, "eval_steps_per_second": 0.144, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 46.20818051070898, "learning_rate": 3.2205853847644894e-07, "logits/chosen": -2.5348258018493652, "logits/rejected": -2.4469754695892334, "logps/chosen": -335.8922119140625, "logps/rejected": -360.198974609375, "loss": 0.0799, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5666877031326294, "rewards/margins": 7.051473140716553, "rewards/rejected": -6.484785556793213, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 93.66630285832011, "learning_rate": 3.2108935840279124e-07, "logits/chosen": -2.6216418743133545, "logits/rejected": -2.4733779430389404, "logps/chosen": -336.0015869140625, "logps/rejected": -307.5792236328125, "loss": 0.0987, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5304969549179077, "rewards/margins": 7.135917663574219, "rewards/rejected": -6.605420112609863, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 78.6521806849575, "learning_rate": 3.201201783291335e-07, "logits/chosen": -2.5466980934143066, "logits/rejected": -2.4219024181365967, "logps/chosen": -312.2503662109375, "logps/rejected": -296.533447265625, "loss": 0.1393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.34462395310401917, "rewards/margins": 6.4234299659729, "rewards/rejected": -6.078805446624756, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 80.21114755996408, "learning_rate": 3.1915099825547585e-07, "logits/chosen": -2.485072135925293, "logits/rejected": -2.4991774559020996, "logps/chosen": -275.27752685546875, "logps/rejected": -322.7217102050781, "loss": 0.0808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5848476886749268, "rewards/margins": 7.007977485656738, "rewards/rejected": -6.423130989074707, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 96.73172725528431, "learning_rate": 3.1818181818181815e-07, "logits/chosen": -2.5032427310943604, "logits/rejected": -2.4806182384490967, "logps/chosen": -273.73406982421875, "logps/rejected": -307.1004333496094, "loss": 0.0772, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20320355892181396, "rewards/margins": 6.753357887268066, "rewards/rejected": -6.550154209136963, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 22.937573494951316, "learning_rate": 3.1721263810816045e-07, "logits/chosen": -2.5175888538360596, "logits/rejected": -2.469468593597412, "logps/chosen": -249.15975952148438, "logps/rejected": -276.5986328125, "loss": 0.0858, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.050439536571502686, "rewards/margins": 6.269559383392334, "rewards/rejected": -6.219120502471924, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 26.72063373196107, "learning_rate": 3.162434580345028e-07, "logits/chosen": -2.468705415725708, "logits/rejected": -2.435086488723755, "logps/chosen": -249.83798217773438, "logps/rejected": -321.27923583984375, "loss": 0.0642, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5983337759971619, "rewards/margins": 7.224898338317871, "rewards/rejected": -6.626564979553223, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 36.395344016057415, "learning_rate": 3.152742779608451e-07, "logits/chosen": -2.4750609397888184, "logits/rejected": -2.406247854232788, "logps/chosen": -254.42294311523438, "logps/rejected": -293.39990234375, "loss": 0.129, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7551592588424683, "rewards/margins": 6.792397975921631, "rewards/rejected": -6.037239074707031, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 64.4646865625765, "learning_rate": 3.143050978871874e-07, "logits/chosen": -2.5852818489074707, "logits/rejected": -2.4546711444854736, "logps/chosen": -335.017578125, "logps/rejected": -325.03485107421875, "loss": 0.0609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9923974871635437, "rewards/margins": 7.436348915100098, "rewards/rejected": -6.4439520835876465, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 30.266949265395027, "learning_rate": 3.1333591781352976e-07, "logits/chosen": -2.43589448928833, "logits/rejected": -2.391376256942749, "logps/chosen": -238.7588653564453, "logps/rejected": -302.181396484375, "loss": 0.152, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2925378382205963, "rewards/margins": 6.823611259460449, "rewards/rejected": -6.531073570251465, "step": 2500 }, { "epoch": 1.3082155939298796, "eval_logits/chosen": -2.491743803024292, "eval_logits/rejected": -2.4231762886047363, "eval_logps/chosen": -293.7032165527344, "eval_logps/rejected": -308.2236633300781, "eval_loss": 0.6881604194641113, "eval_rewards/accuracies": 0.720703125, "eval_rewards/chosen": -1.879141092300415, "eval_rewards/margins": 2.5014708042144775, "eval_rewards/rejected": -4.380611896514893, "eval_runtime": 224.8569, "eval_samples_per_second": 8.895, "eval_steps_per_second": 0.142, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 230.09793105069184, "learning_rate": 3.1236673773987206e-07, "logits/chosen": -2.500148057937622, "logits/rejected": -2.4602465629577637, "logps/chosen": -239.11972045898438, "logps/rejected": -361.43670654296875, "loss": 0.0891, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11347372829914093, "rewards/margins": 7.480711936950684, "rewards/rejected": -7.367237091064453, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 41.78355494957457, "learning_rate": 3.113975576662144e-07, "logits/chosen": -2.531987190246582, "logits/rejected": -2.4801089763641357, "logps/chosen": -282.7210998535156, "logps/rejected": -292.29833984375, "loss": 0.0835, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5220044851303101, "rewards/margins": 6.0406813621521, "rewards/rejected": -6.562685489654541, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 75.54615714665115, "learning_rate": 3.1042837759255666e-07, "logits/chosen": -2.5856757164001465, "logits/rejected": -2.4948112964630127, "logps/chosen": -290.7344665527344, "logps/rejected": -352.19696044921875, "loss": 0.0761, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7986549735069275, "rewards/margins": 8.642539978027344, "rewards/rejected": -7.8438849449157715, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 112.64804974386416, "learning_rate": 3.0945919751889897e-07, "logits/chosen": -2.4174349308013916, "logits/rejected": -2.3898119926452637, "logps/chosen": -239.66421508789062, "logps/rejected": -302.56207275390625, "loss": 0.0802, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7849119901657104, "rewards/margins": 7.3279571533203125, "rewards/rejected": -6.543044090270996, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 15.665686118177556, "learning_rate": 3.084900174452413e-07, "logits/chosen": -2.539119005203247, "logits/rejected": -2.416522741317749, "logps/chosen": -269.84197998046875, "logps/rejected": -314.6531066894531, "loss": 0.06, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6195009350776672, "rewards/margins": 7.703807830810547, "rewards/rejected": -7.0843071937561035, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 19.696516695757254, "learning_rate": 3.075208373715836e-07, "logits/chosen": -2.4958226680755615, "logits/rejected": -2.4016003608703613, "logps/chosen": -274.4857482910156, "logps/rejected": -306.93695068359375, "loss": 0.0663, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2058308869600296, "rewards/margins": 7.211268424987793, "rewards/rejected": -7.417099952697754, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 110.21258446820791, "learning_rate": 3.065516572979259e-07, "logits/chosen": -2.508181571960449, "logits/rejected": -2.4227676391601562, "logps/chosen": -297.3214416503906, "logps/rejected": -333.6142578125, "loss": 0.0877, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06509654223918915, "rewards/margins": 7.6079607009887695, "rewards/rejected": -7.542864799499512, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 87.52458037512258, "learning_rate": 3.055824772242683e-07, "logits/chosen": -2.573064088821411, "logits/rejected": -2.4985415935516357, "logps/chosen": -282.5545654296875, "logps/rejected": -327.1505432128906, "loss": 0.0885, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3802102208137512, "rewards/margins": 7.264068603515625, "rewards/rejected": -7.644278526306152, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 53.31332950130767, "learning_rate": 3.046132971506106e-07, "logits/chosen": -2.6358108520507812, "logits/rejected": -2.5826923847198486, "logps/chosen": -299.3256530761719, "logps/rejected": -379.79156494140625, "loss": 0.1358, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.10961616039276123, "rewards/margins": 7.289822578430176, "rewards/rejected": -7.39943790435791, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 59.64728356392312, "learning_rate": 3.036441170769529e-07, "logits/chosen": -2.5668880939483643, "logits/rejected": -2.5128886699676514, "logps/chosen": -232.7214813232422, "logps/rejected": -295.72052001953125, "loss": 0.1114, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.16606758534908295, "rewards/margins": 6.52313232421875, "rewards/rejected": -6.689200401306152, "step": 2600 }, { "epoch": 1.3605442176870748, "eval_logits/chosen": -2.6189308166503906, "eval_logits/rejected": -2.5713179111480713, "eval_logps/chosen": -297.245849609375, "eval_logps/rejected": -308.30743408203125, "eval_loss": 0.6422358155250549, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -2.2333996295928955, "eval_rewards/margins": 2.1555910110473633, "eval_rewards/rejected": -4.388990879058838, "eval_runtime": 221.5622, "eval_samples_per_second": 9.027, "eval_steps_per_second": 0.144, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 75.1660417496763, "learning_rate": 3.0267493700329523e-07, "logits/chosen": -2.6917107105255127, "logits/rejected": -2.5845510959625244, "logps/chosen": -292.08868408203125, "logps/rejected": -306.9923095703125, "loss": 0.0962, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.23268970847129822, "rewards/margins": 5.808815956115723, "rewards/rejected": -6.041506290435791, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 86.67771431389309, "learning_rate": 3.0170575692963754e-07, "logits/chosen": -2.5917203426361084, "logits/rejected": -2.5115408897399902, "logps/chosen": -275.4058837890625, "logps/rejected": -300.29974365234375, "loss": 0.0966, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.16756343841552734, "rewards/margins": 6.576737880706787, "rewards/rejected": -6.744301795959473, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 39.1723061529081, "learning_rate": 3.007365768559798e-07, "logits/chosen": -2.478318691253662, "logits/rejected": -2.451003313064575, "logps/chosen": -233.59805297851562, "logps/rejected": -318.4455261230469, "loss": 0.0803, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.21545524895191193, "rewards/margins": 6.664320945739746, "rewards/rejected": -6.879776954650879, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 64.77995632765672, "learning_rate": 2.9976739678232214e-07, "logits/chosen": -2.63944411277771, "logits/rejected": -2.5723836421966553, "logps/chosen": -265.38006591796875, "logps/rejected": -297.82373046875, "loss": 0.0867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3697351813316345, "rewards/margins": 6.2464280128479, "rewards/rejected": -6.616163730621338, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 55.87308592719947, "learning_rate": 2.9879821670866444e-07, "logits/chosen": -2.5315511226654053, "logits/rejected": -2.500164747238159, "logps/chosen": -239.0774383544922, "logps/rejected": -325.2108154296875, "loss": 0.1045, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014102265238761902, "rewards/margins": 7.3664350509643555, "rewards/rejected": -7.35233211517334, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 66.58069933377543, "learning_rate": 2.9782903663500674e-07, "logits/chosen": -2.6597371101379395, "logits/rejected": -2.6133480072021484, "logps/chosen": -302.06500244140625, "logps/rejected": -385.7315979003906, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5828040242195129, "rewards/margins": 6.979525566101074, "rewards/rejected": -6.396721839904785, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 35.57799287734959, "learning_rate": 2.968598565613491e-07, "logits/chosen": -2.6301932334899902, "logits/rejected": -2.559492826461792, "logps/chosen": -282.4701843261719, "logps/rejected": -291.48907470703125, "loss": 0.073, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21462485194206238, "rewards/margins": 5.958996772766113, "rewards/rejected": -5.744372367858887, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 48.75444167837231, "learning_rate": 2.958906764876914e-07, "logits/chosen": -2.6060492992401123, "logits/rejected": -2.5475142002105713, "logps/chosen": -299.4150085449219, "logps/rejected": -345.13958740234375, "loss": 0.0808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.35182443261146545, "rewards/margins": 7.494786262512207, "rewards/rejected": -7.142962455749512, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 72.33194190990972, "learning_rate": 2.949214964140337e-07, "logits/chosen": -2.5479860305786133, "logits/rejected": -2.55568265914917, "logps/chosen": -224.351318359375, "logps/rejected": -318.9046630859375, "loss": 0.0714, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08370213210582733, "rewards/margins": 6.050145626068115, "rewards/rejected": -6.133847713470459, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 68.27001568378579, "learning_rate": 2.9395231634037605e-07, "logits/chosen": -2.6008198261260986, "logits/rejected": -2.522848606109619, "logps/chosen": -308.95941162109375, "logps/rejected": -332.6445007324219, "loss": 0.1173, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.016947561874985695, "rewards/margins": 7.837620735168457, "rewards/rejected": -7.854567050933838, "step": 2700 }, { "epoch": 1.41287284144427, "eval_logits/chosen": -2.5718796253204346, "eval_logits/rejected": -2.5152196884155273, "eval_logps/chosen": -290.61187744140625, "eval_logps/rejected": -304.6990966796875, "eval_loss": 0.6618513464927673, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -1.5700048208236694, "eval_rewards/margins": 2.458146810531616, "eval_rewards/rejected": -4.028151512145996, "eval_runtime": 222.9776, "eval_samples_per_second": 8.97, "eval_steps_per_second": 0.144, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 91.5367689537771, "learning_rate": 2.9298313626671835e-07, "logits/chosen": -2.5641961097717285, "logits/rejected": -2.4925408363342285, "logps/chosen": -273.421630859375, "logps/rejected": -308.81231689453125, "loss": 0.1154, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8237888216972351, "rewards/margins": 7.317458152770996, "rewards/rejected": -6.4936699867248535, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 92.8801865768772, "learning_rate": 2.920139561930607e-07, "logits/chosen": -2.5656228065490723, "logits/rejected": -2.520338535308838, "logps/chosen": -277.97454833984375, "logps/rejected": -366.7353515625, "loss": 0.11, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.19112356007099152, "rewards/margins": 7.258709907531738, "rewards/rejected": -7.067586421966553, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 144.83041689220596, "learning_rate": 2.9104477611940296e-07, "logits/chosen": -2.5771210193634033, "logits/rejected": -2.5248379707336426, "logps/chosen": -256.5364074707031, "logps/rejected": -309.40423583984375, "loss": 0.1272, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5018315315246582, "rewards/margins": 7.365907192230225, "rewards/rejected": -6.864075660705566, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 57.97368299219418, "learning_rate": 2.9007559604574526e-07, "logits/chosen": -2.4883127212524414, "logits/rejected": -2.45086669921875, "logps/chosen": -278.98187255859375, "logps/rejected": -302.96075439453125, "loss": 0.1091, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.38574856519699097, "rewards/margins": 5.372960567474365, "rewards/rejected": -5.758708953857422, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 53.11692134684997, "learning_rate": 2.891064159720876e-07, "logits/chosen": -2.5422847270965576, "logits/rejected": -2.5045361518859863, "logps/chosen": -228.661865234375, "logps/rejected": -339.9106750488281, "loss": 0.0926, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9592034220695496, "rewards/margins": 7.379068851470947, "rewards/rejected": -6.419865608215332, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 62.25256144895041, "learning_rate": 2.881372358984299e-07, "logits/chosen": -2.530738115310669, "logits/rejected": -2.4734299182891846, "logps/chosen": -303.04913330078125, "logps/rejected": -272.4736328125, "loss": 0.0778, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3232582211494446, "rewards/margins": 5.88210916519165, "rewards/rejected": -6.205367088317871, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 27.548907848011336, "learning_rate": 2.871680558247722e-07, "logits/chosen": -2.555623769760132, "logits/rejected": -2.4618635177612305, "logps/chosen": -308.28497314453125, "logps/rejected": -326.5250549316406, "loss": 0.1084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10055677592754364, "rewards/margins": 6.703555107116699, "rewards/rejected": -6.804111480712891, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 56.42435265881367, "learning_rate": 2.8619887575111457e-07, "logits/chosen": -2.6364588737487793, "logits/rejected": -2.5728986263275146, "logps/chosen": -265.3021240234375, "logps/rejected": -289.7735900878906, "loss": 0.1111, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1004844456911087, "rewards/margins": 6.44378137588501, "rewards/rejected": -6.5442657470703125, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 37.74308401687933, "learning_rate": 2.8522969567745687e-07, "logits/chosen": -2.582435131072998, "logits/rejected": -2.594123125076294, "logps/chosen": -269.3876953125, "logps/rejected": -369.3157043457031, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3501587510108948, "rewards/margins": 6.793074607849121, "rewards/rejected": -7.143233299255371, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 19.254053415340408, "learning_rate": 2.8426051560379917e-07, "logits/chosen": -2.547100067138672, "logits/rejected": -2.545382499694824, "logps/chosen": -305.5469970703125, "logps/rejected": -364.5526428222656, "loss": 0.0925, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03878812864422798, "rewards/margins": 7.066806793212891, "rewards/rejected": -7.105594635009766, "step": 2800 }, { "epoch": 1.4652014652014653, "eval_logits/chosen": -2.5711400508880615, "eval_logits/rejected": -2.5141141414642334, "eval_logps/chosen": -298.14239501953125, "eval_logps/rejected": -310.6962890625, "eval_loss": 0.6523012518882751, "eval_rewards/accuracies": 0.720703125, "eval_rewards/chosen": -2.3230550289154053, "eval_rewards/margins": 2.3048183917999268, "eval_rewards/rejected": -4.627873420715332, "eval_runtime": 217.2629, "eval_samples_per_second": 9.205, "eval_steps_per_second": 0.147, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 33.845795550613104, "learning_rate": 2.8329133553014153e-07, "logits/chosen": -2.6134302616119385, "logits/rejected": -2.5887703895568848, "logps/chosen": -280.33245849609375, "logps/rejected": -343.8913269042969, "loss": 0.0742, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2494930773973465, "rewards/margins": 7.015024662017822, "rewards/rejected": -7.264517307281494, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 229.83950125977404, "learning_rate": 2.823221554564838e-07, "logits/chosen": -2.621290922164917, "logits/rejected": -2.564849376678467, "logps/chosen": -259.19927978515625, "logps/rejected": -328.11309814453125, "loss": 0.0746, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09396837651729584, "rewards/margins": 7.9216413497924805, "rewards/rejected": -8.015609741210938, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 21.56200058056173, "learning_rate": 2.813529753828261e-07, "logits/chosen": -2.4473283290863037, "logits/rejected": -2.4102425575256348, "logps/chosen": -260.61480712890625, "logps/rejected": -321.08221435546875, "loss": 0.0636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10894875228404999, "rewards/margins": 7.229304313659668, "rewards/rejected": -7.338253021240234, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 38.02154052013364, "learning_rate": 2.8038379530916843e-07, "logits/chosen": -2.555478572845459, "logits/rejected": -2.4827041625976562, "logps/chosen": -317.4620056152344, "logps/rejected": -348.6655578613281, "loss": 0.1135, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.26374882459640503, "rewards/margins": 7.8946709632873535, "rewards/rejected": -7.630922794342041, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 22.33371966733499, "learning_rate": 2.7941461523551073e-07, "logits/chosen": -2.561192512512207, "logits/rejected": -2.5197577476501465, "logps/chosen": -280.72332763671875, "logps/rejected": -314.71234130859375, "loss": 0.0716, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23514780402183533, "rewards/margins": 7.475598335266113, "rewards/rejected": -7.24044942855835, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 203.83777430561207, "learning_rate": 2.7844543516185303e-07, "logits/chosen": -2.5503902435302734, "logits/rejected": -2.48992657661438, "logps/chosen": -311.14410400390625, "logps/rejected": -390.2708740234375, "loss": 0.1097, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.002077382756397128, "rewards/margins": 8.404695510864258, "rewards/rejected": -8.402618408203125, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 32.30514631790578, "learning_rate": 2.774762550881954e-07, "logits/chosen": -2.4867565631866455, "logits/rejected": -2.4186935424804688, "logps/chosen": -285.90631103515625, "logps/rejected": -358.36053466796875, "loss": 0.0892, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.11714434623718262, "rewards/margins": 8.043123245239258, "rewards/rejected": -8.16026782989502, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 15.675717765796625, "learning_rate": 2.765070750145377e-07, "logits/chosen": -2.595289945602417, "logits/rejected": -2.5084784030914307, "logps/chosen": -300.2441711425781, "logps/rejected": -339.9832458496094, "loss": 0.1015, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.429696649312973, "rewards/margins": 7.699285984039307, "rewards/rejected": -8.128982543945312, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 82.45075332035442, "learning_rate": 2.7553789494088e-07, "logits/chosen": -2.5196218490600586, "logits/rejected": -2.450827121734619, "logps/chosen": -316.9833984375, "logps/rejected": -341.6518859863281, "loss": 0.0811, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022137140855193138, "rewards/margins": 7.913984775543213, "rewards/rejected": -7.891848564147949, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 46.199326936141425, "learning_rate": 2.7456871486722234e-07, "logits/chosen": -2.607149839401245, "logits/rejected": -2.526585102081299, "logps/chosen": -310.09637451171875, "logps/rejected": -375.9559631347656, "loss": 0.1221, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.44808870553970337, "rewards/margins": 8.173684120178223, "rewards/rejected": -7.725594520568848, "step": 2900 }, { "epoch": 1.5175300889586603, "eval_logits/chosen": -2.541378974914551, "eval_logits/rejected": -2.473323106765747, "eval_logps/chosen": -303.68231201171875, "eval_logps/rejected": -315.8546142578125, "eval_loss": 0.6496403813362122, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -2.8770499229431152, "eval_rewards/margins": 2.266655683517456, "eval_rewards/rejected": -5.14370584487915, "eval_runtime": 218.9478, "eval_samples_per_second": 9.135, "eval_steps_per_second": 0.146, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 54.16394312279491, "learning_rate": 2.7359953479356465e-07, "logits/chosen": -2.5798938274383545, "logits/rejected": -2.5200395584106445, "logps/chosen": -258.32257080078125, "logps/rejected": -325.27764892578125, "loss": 0.0788, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6097877621650696, "rewards/margins": 6.7790045738220215, "rewards/rejected": -7.388791561126709, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 16.02529549820956, "learning_rate": 2.726303547199069e-07, "logits/chosen": -2.527121067047119, "logits/rejected": -2.5244174003601074, "logps/chosen": -304.4008483886719, "logps/rejected": -389.8269958496094, "loss": 0.0453, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7094684839248657, "rewards/margins": 7.107405662536621, "rewards/rejected": -7.8168745040893555, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 35.20795926556654, "learning_rate": 2.7166117464624925e-07, "logits/chosen": -2.5798916816711426, "logits/rejected": -2.559600353240967, "logps/chosen": -319.4151306152344, "logps/rejected": -410.05096435546875, "loss": 0.0892, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.46024084091186523, "rewards/margins": 7.860762596130371, "rewards/rejected": -8.321002960205078, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 20.08740067679489, "learning_rate": 2.7069199457259155e-07, "logits/chosen": -2.6361541748046875, "logits/rejected": -2.5917553901672363, "logps/chosen": -299.72821044921875, "logps/rejected": -352.30181884765625, "loss": 0.0894, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.10643943399190903, "rewards/margins": 7.499425411224365, "rewards/rejected": -7.605864524841309, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 78.00186043411945, "learning_rate": 2.697228144989339e-07, "logits/chosen": -2.5725674629211426, "logits/rejected": -2.47165584564209, "logps/chosen": -274.11968994140625, "logps/rejected": -273.965576171875, "loss": 0.1159, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0664923191070557, "rewards/margins": 6.723793983459473, "rewards/rejected": -7.790287017822266, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 33.17590357249773, "learning_rate": 2.687536344252762e-07, "logits/chosen": -2.5554099082946777, "logits/rejected": -2.478506565093994, "logps/chosen": -257.4358215332031, "logps/rejected": -295.0519104003906, "loss": 0.0897, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.24494671821594238, "rewards/margins": 6.950389862060547, "rewards/rejected": -7.19533634185791, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 87.16201833335705, "learning_rate": 2.677844543516185e-07, "logits/chosen": -2.5158534049987793, "logits/rejected": -2.440523624420166, "logps/chosen": -272.75457763671875, "logps/rejected": -325.4996643066406, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 0.06438750773668289, "rewards/margins": 7.242406368255615, "rewards/rejected": -7.178019046783447, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 27.72210929912796, "learning_rate": 2.6681527427796086e-07, "logits/chosen": -2.5243163108825684, "logits/rejected": -2.443297863006592, "logps/chosen": -314.2002868652344, "logps/rejected": -333.82830810546875, "loss": 0.0793, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.24555449187755585, "rewards/margins": 6.220225811004639, "rewards/rejected": -6.465781211853027, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 70.5627216452905, "learning_rate": 2.6584609420430316e-07, "logits/chosen": -2.4665582180023193, "logits/rejected": -2.4132871627807617, "logps/chosen": -266.46868896484375, "logps/rejected": -361.470947265625, "loss": 0.0592, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5595269799232483, "rewards/margins": 7.706968784332275, "rewards/rejected": -8.266495704650879, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 21.42260359989903, "learning_rate": 2.6487691413064546e-07, "logits/chosen": -2.511399745941162, "logits/rejected": -2.4704878330230713, "logps/chosen": -311.36309814453125, "logps/rejected": -362.10406494140625, "loss": 0.0807, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5450899600982666, "rewards/margins": 8.695852279663086, "rewards/rejected": -8.150762557983398, "step": 3000 }, { "epoch": 1.5698587127158556, "eval_logits/chosen": -2.4140665531158447, "eval_logits/rejected": -2.326695442199707, "eval_logps/chosen": -302.6737060546875, "eval_logps/rejected": -317.76776123046875, "eval_loss": 0.6925140619277954, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -2.7761871814727783, "eval_rewards/margins": 2.5588321685791016, "eval_rewards/rejected": -5.335019588470459, "eval_runtime": 224.8834, "eval_samples_per_second": 8.893, "eval_steps_per_second": 0.142, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 263.252770577149, "learning_rate": 2.639077340569878e-07, "logits/chosen": -2.517911434173584, "logits/rejected": -2.4659979343414307, "logps/chosen": -319.09149169921875, "logps/rejected": -386.6375732421875, "loss": 0.0589, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2917242646217346, "rewards/margins": 7.740719795227051, "rewards/rejected": -8.03244400024414, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 76.38521598926056, "learning_rate": 2.6293855398333007e-07, "logits/chosen": -2.398588180541992, "logits/rejected": -2.3212904930114746, "logps/chosen": -296.66461181640625, "logps/rejected": -343.5479431152344, "loss": 0.0981, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3628515601158142, "rewards/margins": 7.391815185546875, "rewards/rejected": -7.75466775894165, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 21.15431896007464, "learning_rate": 2.6196937390967237e-07, "logits/chosen": -2.353174924850464, "logits/rejected": -2.2482330799102783, "logps/chosen": -307.61651611328125, "logps/rejected": -350.83892822265625, "loss": 0.0827, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6314824819564819, "rewards/margins": 8.731412887573242, "rewards/rejected": -8.099929809570312, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 39.491523707673544, "learning_rate": 2.610001938360147e-07, "logits/chosen": -2.4090259075164795, "logits/rejected": -2.3596339225769043, "logps/chosen": -263.66094970703125, "logps/rejected": -364.0358581542969, "loss": 0.0641, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19557060301303864, "rewards/margins": 7.483084201812744, "rewards/rejected": -7.28751277923584, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 42.632478750442935, "learning_rate": 2.60031013762357e-07, "logits/chosen": -2.4412076473236084, "logits/rejected": -2.3332931995391846, "logps/chosen": -235.74221801757812, "logps/rejected": -266.3738708496094, "loss": 0.0703, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07606669515371323, "rewards/margins": 6.573895454406738, "rewards/rejected": -6.497828483581543, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 15.159863698746499, "learning_rate": 2.590618336886993e-07, "logits/chosen": -2.551246166229248, "logits/rejected": -2.436537981033325, "logps/chosen": -330.43609619140625, "logps/rejected": -326.2865295410156, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 0.876664936542511, "rewards/margins": 7.715054988861084, "rewards/rejected": -6.8383893966674805, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 12.101404215019047, "learning_rate": 2.580926536150417e-07, "logits/chosen": -2.512691020965576, "logits/rejected": -2.429511070251465, "logps/chosen": -291.51800537109375, "logps/rejected": -356.90679931640625, "loss": 0.0812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9892548322677612, "rewards/margins": 8.316636085510254, "rewards/rejected": -7.3273820877075195, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 41.32277101687688, "learning_rate": 2.57123473541384e-07, "logits/chosen": -2.499072790145874, "logits/rejected": -2.328218936920166, "logps/chosen": -304.9527282714844, "logps/rejected": -314.92919921875, "loss": 0.0754, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.531328558921814, "rewards/margins": 7.224859714508057, "rewards/rejected": -7.75618839263916, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 43.14842699459146, "learning_rate": 2.561542934677263e-07, "logits/chosen": -2.4981369972229004, "logits/rejected": -2.4126057624816895, "logps/chosen": -324.19342041015625, "logps/rejected": -386.69049072265625, "loss": 0.0859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.18604479730129242, "rewards/margins": 8.45301342010498, "rewards/rejected": -8.639059066772461, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 43.170924365250634, "learning_rate": 2.5518511339406864e-07, "logits/chosen": -2.4399654865264893, "logits/rejected": -2.321230888366699, "logps/chosen": -299.34698486328125, "logps/rejected": -368.11474609375, "loss": 0.105, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41577863693237305, "rewards/margins": 7.590156555175781, "rewards/rejected": -8.00593376159668, "step": 3100 }, { "epoch": 1.6221873364730508, "eval_logits/chosen": -2.439500570297241, "eval_logits/rejected": -2.3682920932769775, "eval_logps/chosen": -301.769775390625, "eval_logps/rejected": -314.4846496582031, "eval_loss": 0.6539994478225708, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -2.6857962608337402, "eval_rewards/margins": 2.3209123611450195, "eval_rewards/rejected": -5.006708145141602, "eval_runtime": 223.4815, "eval_samples_per_second": 8.949, "eval_steps_per_second": 0.143, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 106.85483080846494, "learning_rate": 2.5421593332041094e-07, "logits/chosen": -2.4206371307373047, "logits/rejected": -2.337944507598877, "logps/chosen": -284.30767822265625, "logps/rejected": -342.24169921875, "loss": 0.0971, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6263729333877563, "rewards/margins": 6.642604827880859, "rewards/rejected": -7.268977165222168, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 63.0615458228801, "learning_rate": 2.532467532467532e-07, "logits/chosen": -2.4785897731781006, "logits/rejected": -2.4087796211242676, "logps/chosen": -320.53424072265625, "logps/rejected": -340.9963684082031, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 1.025964617729187, "rewards/margins": 8.140347480773926, "rewards/rejected": -7.114382266998291, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 6.6412012730149295, "learning_rate": 2.5227757317309554e-07, "logits/chosen": -2.54270601272583, "logits/rejected": -2.410787343978882, "logps/chosen": -272.6456298828125, "logps/rejected": -307.1575927734375, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": 0.19680127501487732, "rewards/margins": 7.607694149017334, "rewards/rejected": -7.410892486572266, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 59.26829228941831, "learning_rate": 2.5130839309943784e-07, "logits/chosen": -2.4212193489074707, "logits/rejected": -2.299006223678589, "logps/chosen": -249.61087036132812, "logps/rejected": -321.0813903808594, "loss": 0.1115, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2435237467288971, "rewards/margins": 7.435656547546387, "rewards/rejected": -7.679180145263672, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 72.82976405549597, "learning_rate": 2.503392130257802e-07, "logits/chosen": -2.442901611328125, "logits/rejected": -2.299041986465454, "logps/chosen": -274.131591796875, "logps/rejected": -307.36968994140625, "loss": 0.0781, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4332135319709778, "rewards/margins": 7.724754333496094, "rewards/rejected": -7.291542053222656, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 82.07746801489996, "learning_rate": 2.493700329521225e-07, "logits/chosen": -2.430314779281616, "logits/rejected": -2.369117259979248, "logps/chosen": -281.3594665527344, "logps/rejected": -335.70269775390625, "loss": 0.1212, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.48696714639663696, "rewards/margins": 7.057797431945801, "rewards/rejected": -6.570830345153809, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 38.7027944652355, "learning_rate": 2.484008528784648e-07, "logits/chosen": -2.3618900775909424, "logits/rejected": -2.2716853618621826, "logps/chosen": -289.3221130371094, "logps/rejected": -304.17816162109375, "loss": 0.0753, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01573336124420166, "rewards/margins": 7.370008945465088, "rewards/rejected": -7.354274749755859, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 62.92310098320559, "learning_rate": 2.474316728048071e-07, "logits/chosen": -2.459293842315674, "logits/rejected": -2.376004457473755, "logps/chosen": -287.3421325683594, "logps/rejected": -339.1846923828125, "loss": 0.0975, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16361963748931885, "rewards/margins": 6.482570648193359, "rewards/rejected": -6.6461896896362305, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 46.125042228261364, "learning_rate": 2.4646249273114945e-07, "logits/chosen": -2.4897594451904297, "logits/rejected": -2.393815040588379, "logps/chosen": -277.3688659667969, "logps/rejected": -328.25775146484375, "loss": 0.0761, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1545456349849701, "rewards/margins": 6.827078342437744, "rewards/rejected": -6.981623649597168, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 66.17865627135092, "learning_rate": 2.4549331265749176e-07, "logits/chosen": -2.3612403869628906, "logits/rejected": -2.286970376968384, "logps/chosen": -323.9811706542969, "logps/rejected": -324.7330627441406, "loss": 0.1162, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2701718807220459, "rewards/margins": 6.600011348724365, "rewards/rejected": -6.870182991027832, "step": 3200 }, { "epoch": 1.674515960230246, "eval_logits/chosen": -2.4378576278686523, "eval_logits/rejected": -2.3670148849487305, "eval_logps/chosen": -293.04461669921875, "eval_logps/rejected": -304.86517333984375, "eval_loss": 0.648138165473938, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -1.813275933265686, "eval_rewards/margins": 2.2314834594726562, "eval_rewards/rejected": -4.044759750366211, "eval_runtime": 220.3346, "eval_samples_per_second": 9.077, "eval_steps_per_second": 0.145, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 106.32482794241598, "learning_rate": 2.4452413258383406e-07, "logits/chosen": -2.477659225463867, "logits/rejected": -2.39676570892334, "logps/chosen": -303.52032470703125, "logps/rejected": -351.10955810546875, "loss": 0.1002, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011722063645720482, "rewards/margins": 6.779210090637207, "rewards/rejected": -6.790932655334473, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 26.768754474889526, "learning_rate": 2.4355495251017636e-07, "logits/chosen": -2.4913134574890137, "logits/rejected": -2.3384642601013184, "logps/chosen": -309.5846252441406, "logps/rejected": -372.6744689941406, "loss": 0.0698, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8764934539794922, "rewards/margins": 8.39664363861084, "rewards/rejected": -7.520150661468506, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 35.701485026349, "learning_rate": 2.425857724365187e-07, "logits/chosen": -2.491218328475952, "logits/rejected": -2.428218364715576, "logps/chosen": -271.5113830566406, "logps/rejected": -308.07904052734375, "loss": 0.0872, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3707517385482788, "rewards/margins": 7.114607334136963, "rewards/rejected": -6.7438554763793945, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 90.74847209253667, "learning_rate": 2.41616592362861e-07, "logits/chosen": -2.391489267349243, "logits/rejected": -2.295874834060669, "logps/chosen": -282.61053466796875, "logps/rejected": -336.82342529296875, "loss": 0.1248, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.14906617999076843, "rewards/margins": 6.938790798187256, "rewards/rejected": -7.087856292724609, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 39.61846170539721, "learning_rate": 2.406474122892033e-07, "logits/chosen": -2.401337146759033, "logits/rejected": -2.3837599754333496, "logps/chosen": -242.72836303710938, "logps/rejected": -321.4362487792969, "loss": 0.0825, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5290157794952393, "rewards/margins": 7.116967678070068, "rewards/rejected": -6.58795166015625, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 174.68407428542716, "learning_rate": 2.3967823221554567e-07, "logits/chosen": -2.442564010620117, "logits/rejected": -2.3995108604431152, "logps/chosen": -257.47503662109375, "logps/rejected": -299.9869689941406, "loss": 0.139, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1015794649720192, "rewards/margins": 6.381906032562256, "rewards/rejected": -6.483485221862793, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 35.17071417525662, "learning_rate": 2.387090521418879e-07, "logits/chosen": -2.570798397064209, "logits/rejected": -2.520289897918701, "logps/chosen": -298.4034423828125, "logps/rejected": -406.50732421875, "loss": 0.0917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3961904048919678, "rewards/margins": 8.779169082641602, "rewards/rejected": -7.382978916168213, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 76.74561375210627, "learning_rate": 2.3773987206823027e-07, "logits/chosen": -2.444225549697876, "logits/rejected": -2.391583204269409, "logps/chosen": -295.3659362792969, "logps/rejected": -344.78057861328125, "loss": 0.0891, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5461596250534058, "rewards/margins": 7.7500410079956055, "rewards/rejected": -7.20388126373291, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 70.61844693834938, "learning_rate": 2.3677069199457257e-07, "logits/chosen": -2.473740577697754, "logits/rejected": -2.395437717437744, "logps/chosen": -275.5171813964844, "logps/rejected": -342.0962219238281, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 0.5014622807502747, "rewards/margins": 7.515198707580566, "rewards/rejected": -7.013735771179199, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 30.928275991831935, "learning_rate": 2.358015119209149e-07, "logits/chosen": -2.2972145080566406, "logits/rejected": -2.315185546875, "logps/chosen": -303.7619323730469, "logps/rejected": -337.3390197753906, "loss": 0.0667, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0274298191070557, "rewards/margins": 8.70716667175293, "rewards/rejected": -7.679737091064453, "step": 3300 }, { "epoch": 1.7268445839874411, "eval_logits/chosen": -2.3589165210723877, "eval_logits/rejected": -2.279379367828369, "eval_logps/chosen": -295.2762756347656, "eval_logps/rejected": -308.3506164550781, "eval_loss": 0.654128909111023, "eval_rewards/accuracies": 0.736328125, "eval_rewards/chosen": -2.0364439487457275, "eval_rewards/margins": 2.356860637664795, "eval_rewards/rejected": -4.393304824829102, "eval_runtime": 217.1949, "eval_samples_per_second": 9.208, "eval_steps_per_second": 0.147, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 95.84556633807539, "learning_rate": 2.348323318472572e-07, "logits/chosen": -2.3497447967529297, "logits/rejected": -2.2458086013793945, "logps/chosen": -267.6063232421875, "logps/rejected": -301.579345703125, "loss": 0.0897, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.037819329649209976, "rewards/margins": 6.551515102386475, "rewards/rejected": -6.513696193695068, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 80.04800338108818, "learning_rate": 2.3386315177359953e-07, "logits/chosen": -2.335570812225342, "logits/rejected": -2.3833823204040527, "logps/chosen": -266.31451416015625, "logps/rejected": -351.53662109375, "loss": 0.0886, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.09156922996044159, "rewards/margins": 8.318611145019531, "rewards/rejected": -8.227041244506836, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 7.013715124757597, "learning_rate": 2.3289397169994186e-07, "logits/chosen": -2.428406000137329, "logits/rejected": -2.2926154136657715, "logps/chosen": -288.1254577636719, "logps/rejected": -339.63140869140625, "loss": 0.0711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6113865375518799, "rewards/margins": 8.690730094909668, "rewards/rejected": -8.07934284210205, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 25.414550929814464, "learning_rate": 2.3192479162628413e-07, "logits/chosen": -2.2973508834838867, "logits/rejected": -2.238509178161621, "logps/chosen": -270.40240478515625, "logps/rejected": -340.88775634765625, "loss": 0.0944, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6471933126449585, "rewards/margins": 7.3522796630859375, "rewards/rejected": -7.999472141265869, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 23.919815353951442, "learning_rate": 2.3095561155262646e-07, "logits/chosen": -2.399322748184204, "logits/rejected": -2.323655605316162, "logps/chosen": -252.65402221679688, "logps/rejected": -299.607177734375, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 0.019119098782539368, "rewards/margins": 7.975864410400391, "rewards/rejected": -7.956745147705078, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 25.648202707293823, "learning_rate": 2.299864314789688e-07, "logits/chosen": -2.2908546924591064, "logits/rejected": -2.267071008682251, "logps/chosen": -248.2761993408203, "logps/rejected": -314.4021911621094, "loss": 0.0841, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.13074228167533875, "rewards/margins": 7.338799953460693, "rewards/rejected": -7.469542503356934, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 9.072030805563431, "learning_rate": 2.290172514053111e-07, "logits/chosen": -2.5191829204559326, "logits/rejected": -2.398791551589966, "logps/chosen": -329.37445068359375, "logps/rejected": -334.0418701171875, "loss": 0.0671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9050701260566711, "rewards/margins": 7.781418800354004, "rewards/rejected": -6.876348972320557, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 66.4668777172541, "learning_rate": 2.2804807133165342e-07, "logits/chosen": -2.3169493675231934, "logits/rejected": -2.217641592025757, "logps/chosen": -266.71002197265625, "logps/rejected": -306.2403869628906, "loss": 0.093, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2115212380886078, "rewards/margins": 7.699152946472168, "rewards/rejected": -7.487631320953369, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 14.034966220169443, "learning_rate": 2.2707889125799572e-07, "logits/chosen": -2.375394105911255, "logits/rejected": -2.362278938293457, "logps/chosen": -252.7733154296875, "logps/rejected": -357.8332214355469, "loss": 0.0932, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25256091356277466, "rewards/margins": 7.7929487228393555, "rewards/rejected": -7.5403876304626465, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 131.52858648777178, "learning_rate": 2.2610971118433805e-07, "logits/chosen": -2.4172284603118896, "logits/rejected": -2.3040413856506348, "logps/chosen": -290.61334228515625, "logps/rejected": -303.82904052734375, "loss": 0.0935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6121436357498169, "rewards/margins": 6.604330539703369, "rewards/rejected": -7.2164740562438965, "step": 3400 }, { "epoch": 1.7791732077446363, "eval_logits/chosen": -2.369400978088379, "eval_logits/rejected": -2.2855379581451416, "eval_logps/chosen": -302.2036437988281, "eval_logps/rejected": -317.0096435546875, "eval_loss": 0.6689784526824951, "eval_rewards/accuracies": 0.744140625, "eval_rewards/chosen": -2.729180097579956, "eval_rewards/margins": 2.5300278663635254, "eval_rewards/rejected": -5.259207725524902, "eval_runtime": 220.9012, "eval_samples_per_second": 9.054, "eval_steps_per_second": 0.145, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 18.90711425888832, "learning_rate": 2.2514053111068035e-07, "logits/chosen": -2.4939043521881104, "logits/rejected": -2.4023773670196533, "logps/chosen": -300.4817810058594, "logps/rejected": -343.0694580078125, "loss": 0.0957, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1343466341495514, "rewards/margins": 8.02270221710205, "rewards/rejected": -8.157048225402832, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 82.20789554623013, "learning_rate": 2.2417135103702268e-07, "logits/chosen": -2.3543498516082764, "logits/rejected": -2.214665651321411, "logps/chosen": -345.53643798828125, "logps/rejected": -322.7984924316406, "loss": 0.0996, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.901275634765625, "rewards/margins": 7.487607002258301, "rewards/rejected": -8.388882637023926, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 22.329378927752163, "learning_rate": 2.23202170963365e-07, "logits/chosen": -2.3646130561828613, "logits/rejected": -2.3578739166259766, "logps/chosen": -253.9729766845703, "logps/rejected": -307.57281494140625, "loss": 0.0983, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8083534240722656, "rewards/margins": 6.52506160736084, "rewards/rejected": -7.333415985107422, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 53.71780222761663, "learning_rate": 2.2223299088970728e-07, "logits/chosen": -2.420081377029419, "logits/rejected": -2.360835313796997, "logps/chosen": -339.54559326171875, "logps/rejected": -369.76025390625, "loss": 0.0524, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.42823439836502075, "rewards/margins": 8.150211334228516, "rewards/rejected": -7.721977233886719, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 46.587478366839186, "learning_rate": 2.212638108160496e-07, "logits/chosen": -2.4080426692962646, "logits/rejected": -2.344693660736084, "logps/chosen": -275.427490234375, "logps/rejected": -315.14520263671875, "loss": 0.0802, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03373689576983452, "rewards/margins": 7.2949113845825195, "rewards/rejected": -7.261174201965332, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 79.66280035330131, "learning_rate": 2.2029463074239194e-07, "logits/chosen": -2.4254508018493652, "logits/rejected": -2.3108010292053223, "logps/chosen": -295.00714111328125, "logps/rejected": -336.64288330078125, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": 0.22496457397937775, "rewards/margins": 7.46133279800415, "rewards/rejected": -7.236368656158447, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 87.05005926055823, "learning_rate": 2.1932545066873424e-07, "logits/chosen": -2.4628663063049316, "logits/rejected": -2.3696205615997314, "logps/chosen": -291.80084228515625, "logps/rejected": -309.8385009765625, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": -0.030597954988479614, "rewards/margins": 7.370604515075684, "rewards/rejected": -7.401203155517578, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 75.41438136659319, "learning_rate": 2.1835627059507656e-07, "logits/chosen": -2.361757755279541, "logits/rejected": -2.2893853187561035, "logps/chosen": -316.33123779296875, "logps/rejected": -349.1077575683594, "loss": 0.0918, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9195256233215332, "rewards/margins": 7.185220241546631, "rewards/rejected": -8.104745864868164, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 29.53941695996506, "learning_rate": 2.1738709052141887e-07, "logits/chosen": -2.504779100418091, "logits/rejected": -2.3570313453674316, "logps/chosen": -306.0981750488281, "logps/rejected": -327.40203857421875, "loss": 0.106, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6305055618286133, "rewards/margins": 7.211207389831543, "rewards/rejected": -7.84171199798584, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 67.33543416598376, "learning_rate": 2.1641791044776117e-07, "logits/chosen": -2.435098171234131, "logits/rejected": -2.359255075454712, "logps/chosen": -297.4574279785156, "logps/rejected": -313.97955322265625, "loss": 0.095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1143710613250732, "rewards/margins": 6.484988212585449, "rewards/rejected": -7.599359035491943, "step": 3500 }, { "epoch": 1.8315018315018317, "eval_logits/chosen": -2.452993392944336, "eval_logits/rejected": -2.3827455043792725, "eval_logps/chosen": -304.2197570800781, "eval_logps/rejected": -316.009033203125, "eval_loss": 0.6361427903175354, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -2.930795192718506, "eval_rewards/margins": 2.2283527851104736, "eval_rewards/rejected": -5.1591477394104, "eval_runtime": 219.7693, "eval_samples_per_second": 9.1, "eval_steps_per_second": 0.146, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 32.92880973196361, "learning_rate": 2.154487303741035e-07, "logits/chosen": -2.385526180267334, "logits/rejected": -2.3790879249572754, "logps/chosen": -246.4010467529297, "logps/rejected": -322.9360046386719, "loss": 0.0858, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8317921757698059, "rewards/margins": 6.951634883880615, "rewards/rejected": -7.7834272384643555, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 53.249417414942656, "learning_rate": 2.1447955030044582e-07, "logits/chosen": -2.4895927906036377, "logits/rejected": -2.4101321697235107, "logps/chosen": -314.7476501464844, "logps/rejected": -345.0580749511719, "loss": 0.0728, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7081344127655029, "rewards/margins": 7.008575439453125, "rewards/rejected": -7.716709136962891, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 33.36679243942122, "learning_rate": 2.1351037022678815e-07, "logits/chosen": -2.486656427383423, "logits/rejected": -2.4397740364074707, "logps/chosen": -287.84735107421875, "logps/rejected": -341.68487548828125, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": -0.2925586700439453, "rewards/margins": 7.375280857086182, "rewards/rejected": -7.667839050292969, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 49.51139031337744, "learning_rate": 2.1254119015313043e-07, "logits/chosen": -2.4790091514587402, "logits/rejected": -2.363152027130127, "logps/chosen": -337.1871032714844, "logps/rejected": -336.05389404296875, "loss": 0.0684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6646552085876465, "rewards/margins": 8.15034008026123, "rewards/rejected": -7.4856858253479, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 12.4785168177443, "learning_rate": 2.1157201007947275e-07, "logits/chosen": -2.507765769958496, "logits/rejected": -2.366875171661377, "logps/chosen": -317.2574768066406, "logps/rejected": -381.31524658203125, "loss": 0.1007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5521291494369507, "rewards/margins": 8.218782424926758, "rewards/rejected": -7.666652679443359, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 32.18520466114337, "learning_rate": 2.1060283000581508e-07, "logits/chosen": -2.46165132522583, "logits/rejected": -2.4455678462982178, "logps/chosen": -291.321533203125, "logps/rejected": -369.7472839355469, "loss": 0.0722, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.29962074756622314, "rewards/margins": 8.607133865356445, "rewards/rejected": -8.307514190673828, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 78.25594152703285, "learning_rate": 2.0963364993215738e-07, "logits/chosen": -2.4028453826904297, "logits/rejected": -2.35805082321167, "logps/chosen": -271.29888916015625, "logps/rejected": -327.28118896484375, "loss": 0.086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0012101649772375822, "rewards/margins": 7.6421098709106445, "rewards/rejected": -7.643320560455322, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 22.521625947630632, "learning_rate": 2.0866446985849968e-07, "logits/chosen": -2.457045793533325, "logits/rejected": -2.3392226696014404, "logps/chosen": -298.2098693847656, "logps/rejected": -330.40386962890625, "loss": 0.1244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.18404272198677063, "rewards/margins": 7.69598388671875, "rewards/rejected": -7.880026817321777, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 40.06962110181783, "learning_rate": 2.07695289784842e-07, "logits/chosen": -2.445855140686035, "logits/rejected": -2.4187183380126953, "logps/chosen": -292.2087707519531, "logps/rejected": -338.23712158203125, "loss": 0.0844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4895866811275482, "rewards/margins": 7.064423561096191, "rewards/rejected": -7.554009914398193, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 169.356679228375, "learning_rate": 2.0672610971118431e-07, "logits/chosen": -2.44105863571167, "logits/rejected": -2.354998826980591, "logps/chosen": -231.13571166992188, "logps/rejected": -292.83953857421875, "loss": 0.0719, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.39349091053009033, "rewards/margins": 8.045453071594238, "rewards/rejected": -7.6519622802734375, "step": 3600 }, { "epoch": 1.8838304552590266, "eval_logits/chosen": -2.50181245803833, "eval_logits/rejected": -2.4285097122192383, "eval_logps/chosen": -298.52783203125, "eval_logps/rejected": -312.6893310546875, "eval_loss": 0.6777711510658264, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -2.3615970611572266, "eval_rewards/margins": 2.4655814170837402, "eval_rewards/rejected": -4.827178478240967, "eval_runtime": 218.295, "eval_samples_per_second": 9.162, "eval_steps_per_second": 0.147, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 94.25017412224327, "learning_rate": 2.0575692963752664e-07, "logits/chosen": -2.4225707054138184, "logits/rejected": -2.3844242095947266, "logps/chosen": -263.97174072265625, "logps/rejected": -318.4921875, "loss": 0.0774, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10253627598285675, "rewards/margins": 7.648558616638184, "rewards/rejected": -7.546021938323975, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 41.754897248999335, "learning_rate": 2.0478774956386897e-07, "logits/chosen": -2.5245964527130127, "logits/rejected": -2.4918785095214844, "logps/chosen": -260.8522033691406, "logps/rejected": -342.6175231933594, "loss": 0.0809, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1182936578989029, "rewards/margins": 8.538101196289062, "rewards/rejected": -8.419808387756348, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 105.04781497917497, "learning_rate": 2.038185694902113e-07, "logits/chosen": -2.461094617843628, "logits/rejected": -2.392970561981201, "logps/chosen": -266.72265625, "logps/rejected": -334.2546691894531, "loss": 0.0888, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9561098217964172, "rewards/margins": 6.487905979156494, "rewards/rejected": -7.444014549255371, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 84.64924727475301, "learning_rate": 2.0284938941655357e-07, "logits/chosen": -2.5671029090881348, "logits/rejected": -2.438215732574463, "logps/chosen": -285.7683410644531, "logps/rejected": -311.52447509765625, "loss": 0.0736, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9772365689277649, "rewards/margins": 7.358864784240723, "rewards/rejected": -8.336100578308105, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 46.704253934827264, "learning_rate": 2.018802093428959e-07, "logits/chosen": -2.4497196674346924, "logits/rejected": -2.478346347808838, "logps/chosen": -242.05612182617188, "logps/rejected": -366.3129577636719, "loss": 0.0995, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5538001656532288, "rewards/margins": 7.660309791564941, "rewards/rejected": -8.214109420776367, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 162.19875364124934, "learning_rate": 2.0091102926923823e-07, "logits/chosen": -2.426657199859619, "logits/rejected": -2.3765947818756104, "logps/chosen": -274.36651611328125, "logps/rejected": -329.5214538574219, "loss": 0.1106, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9210758209228516, "rewards/margins": 6.786985874176025, "rewards/rejected": -7.708061218261719, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 46.99543650476144, "learning_rate": 1.9994184919558053e-07, "logits/chosen": -2.485370635986328, "logits/rejected": -2.418940305709839, "logps/chosen": -295.74371337890625, "logps/rejected": -334.06756591796875, "loss": 0.0983, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.088950976729393, "rewards/margins": 8.242818832397461, "rewards/rejected": -8.331769943237305, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 83.32040764365088, "learning_rate": 1.9897266912192283e-07, "logits/chosen": -2.590080738067627, "logits/rejected": -2.4805331230163574, "logps/chosen": -309.9513244628906, "logps/rejected": -346.6572265625, "loss": 0.0654, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7206801772117615, "rewards/margins": 6.5927629470825195, "rewards/rejected": -7.313443183898926, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 80.63428033555078, "learning_rate": 1.9800348904826516e-07, "logits/chosen": -2.3929603099823, "logits/rejected": -2.4050092697143555, "logps/chosen": -246.9309844970703, "logps/rejected": -312.17950439453125, "loss": 0.0807, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5777016282081604, "rewards/margins": 7.253002166748047, "rewards/rejected": -7.8307037353515625, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 24.866109203764363, "learning_rate": 1.9703430897460746e-07, "logits/chosen": -2.569267749786377, "logits/rejected": -2.481532573699951, "logps/chosen": -304.2950134277344, "logps/rejected": -358.37884521484375, "loss": 0.0729, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.11949795484542847, "rewards/margins": 8.500961303710938, "rewards/rejected": -8.381464004516602, "step": 3700 }, { "epoch": 1.936159079016222, "eval_logits/chosen": -2.5049147605895996, "eval_logits/rejected": -2.4286677837371826, "eval_logps/chosen": -304.19158935546875, "eval_logps/rejected": -318.77740478515625, "eval_loss": 0.6754004955291748, "eval_rewards/accuracies": 0.728515625, "eval_rewards/chosen": -2.927976131439209, "eval_rewards/margins": 2.50801157951355, "eval_rewards/rejected": -5.4359869956970215, "eval_runtime": 218.5388, "eval_samples_per_second": 9.152, "eval_steps_per_second": 0.146, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 206.27341705974183, "learning_rate": 1.960651289009498e-07, "logits/chosen": -2.4961371421813965, "logits/rejected": -2.453371524810791, "logps/chosen": -249.11203002929688, "logps/rejected": -303.84808349609375, "loss": 0.089, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5694076418876648, "rewards/margins": 6.572709560394287, "rewards/rejected": -7.142117500305176, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 44.430301450966475, "learning_rate": 1.9509594882729212e-07, "logits/chosen": -2.4783265590667725, "logits/rejected": -2.3802292346954346, "logps/chosen": -281.66632080078125, "logps/rejected": -302.57257080078125, "loss": 0.0969, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8343402147293091, "rewards/margins": 7.264143943786621, "rewards/rejected": -8.09848403930664, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 26.86248306012849, "learning_rate": 1.9412676875363442e-07, "logits/chosen": -2.490718126296997, "logits/rejected": -2.384291172027588, "logps/chosen": -328.6300964355469, "logps/rejected": -340.6988830566406, "loss": 0.0734, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2870655357837677, "rewards/margins": 7.721086025238037, "rewards/rejected": -8.00815200805664, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 78.62790474470229, "learning_rate": 1.9315758867997672e-07, "logits/chosen": -2.4921441078186035, "logits/rejected": -2.376638650894165, "logps/chosen": -335.1251525878906, "logps/rejected": -356.53326416015625, "loss": 0.063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3051922917366028, "rewards/margins": 7.896398067474365, "rewards/rejected": -7.5912065505981445, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 57.2221755440599, "learning_rate": 1.9218840860631905e-07, "logits/chosen": -2.4793710708618164, "logits/rejected": -2.3860833644866943, "logps/chosen": -276.76470947265625, "logps/rejected": -350.911865234375, "loss": 0.0999, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.07188873738050461, "rewards/margins": 7.617722988128662, "rewards/rejected": -7.689610958099365, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 51.37408650674115, "learning_rate": 1.9121922853266137e-07, "logits/chosen": -2.515812635421753, "logits/rejected": -2.4258525371551514, "logps/chosen": -296.86260986328125, "logps/rejected": -312.52349853515625, "loss": 0.1461, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7149745225906372, "rewards/margins": 7.467611789703369, "rewards/rejected": -8.182586669921875, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 40.7746712717558, "learning_rate": 1.9025004845900368e-07, "logits/chosen": -2.502448797225952, "logits/rejected": -2.413639783859253, "logps/chosen": -258.8537902832031, "logps/rejected": -308.92584228515625, "loss": 0.1748, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1207481622695923, "rewards/margins": 6.454583168029785, "rewards/rejected": -7.5753302574157715, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 53.91219146211383, "learning_rate": 1.8928086838534598e-07, "logits/chosen": -2.5364058017730713, "logits/rejected": -2.462099552154541, "logps/chosen": -307.55291748046875, "logps/rejected": -355.0628967285156, "loss": 0.0778, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.04229793697595596, "rewards/margins": 7.642873287200928, "rewards/rejected": -7.685171604156494, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 41.20902880598341, "learning_rate": 1.883116883116883e-07, "logits/chosen": -2.4417710304260254, "logits/rejected": -2.3728907108306885, "logps/chosen": -330.58697509765625, "logps/rejected": -380.87030029296875, "loss": 0.0521, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6773049235343933, "rewards/margins": 8.481683731079102, "rewards/rejected": -9.158989906311035, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 79.34803662382411, "learning_rate": 1.873425082380306e-07, "logits/chosen": -2.3979761600494385, "logits/rejected": -2.3808693885803223, "logps/chosen": -282.55322265625, "logps/rejected": -339.8221435546875, "loss": 0.0867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7748892307281494, "rewards/margins": 7.687046051025391, "rewards/rejected": -8.461935043334961, "step": 3800 }, { "epoch": 1.988487702773417, "eval_logits/chosen": -2.4301087856292725, "eval_logits/rejected": -2.354205369949341, "eval_logps/chosen": -305.86749267578125, "eval_logps/rejected": -319.8756408691406, "eval_loss": 0.6743524670600891, "eval_rewards/accuracies": 0.732421875, "eval_rewards/chosen": -3.0955631732940674, "eval_rewards/margins": 2.450246810913086, "eval_rewards/rejected": -5.545810222625732, "eval_runtime": 215.0631, "eval_samples_per_second": 9.3, "eval_steps_per_second": 0.149, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 27.00336519126764, "learning_rate": 1.8637332816437293e-07, "logits/chosen": -2.4655070304870605, "logits/rejected": -2.3845348358154297, "logps/chosen": -306.236572265625, "logps/rejected": -340.4774169921875, "loss": 0.0885, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0291368961334229, "rewards/margins": 7.342482566833496, "rewards/rejected": -8.37161922454834, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 24.994075688132376, "learning_rate": 1.8540414809071526e-07, "logits/chosen": -2.3605875968933105, "logits/rejected": -2.2828755378723145, "logps/chosen": -297.90057373046875, "logps/rejected": -332.27227783203125, "loss": 0.0575, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.185561403632164, "rewards/margins": 7.867689609527588, "rewards/rejected": -8.053250312805176, "step": 3820 }, { "epoch": 2.004186289900576, "grad_norm": 1.636388593061243, "learning_rate": 1.8443496801705756e-07, "logits/chosen": -2.4610435962677, "logits/rejected": -2.340324640274048, "logps/chosen": -316.91583251953125, "logps/rejected": -362.2328796386719, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1787187159061432, "rewards/margins": 9.193592071533203, "rewards/rejected": -9.37231159210205, "step": 3830 }, { "epoch": 2.009419152276295, "grad_norm": 22.744195467010766, "learning_rate": 1.8346578794339986e-07, "logits/chosen": -2.33302640914917, "logits/rejected": -2.230029344558716, "logps/chosen": -313.8927307128906, "logps/rejected": -369.19342041015625, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.39618638157844543, "rewards/margins": 10.058914184570312, "rewards/rejected": -10.455101013183594, "step": 3840 }, { "epoch": 2.0146520146520146, "grad_norm": 2.378364186751786, "learning_rate": 1.824966078697422e-07, "logits/chosen": -2.244232177734375, "logits/rejected": -2.0912654399871826, "logps/chosen": -271.7301025390625, "logps/rejected": -331.6318359375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.0886969119310379, "rewards/margins": 10.079996109008789, "rewards/rejected": -9.991299629211426, "step": 3850 }, { "epoch": 2.0198848770277342, "grad_norm": 43.55805797720129, "learning_rate": 1.8152742779608452e-07, "logits/chosen": -2.1049273014068604, "logits/rejected": -2.080423355102539, "logps/chosen": -238.1123504638672, "logps/rejected": -341.15216064453125, "loss": 0.0167, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011350142769515514, "rewards/margins": 9.55983829498291, "rewards/rejected": -9.548487663269043, "step": 3860 }, { "epoch": 2.025117739403454, "grad_norm": 32.64914585509329, "learning_rate": 1.8055824772242682e-07, "logits/chosen": -2.201404094696045, "logits/rejected": -2.0952985286712646, "logps/chosen": -271.05181884765625, "logps/rejected": -347.10675048828125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -0.11333910375833511, "rewards/margins": 9.036820411682129, "rewards/rejected": -9.15015983581543, "step": 3870 }, { "epoch": 2.030350601779173, "grad_norm": 2.123761501302455, "learning_rate": 1.7958906764876912e-07, "logits/chosen": -2.164815902709961, "logits/rejected": -2.0035622119903564, "logps/chosen": -253.6443328857422, "logps/rejected": -314.2805480957031, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 0.08712639659643173, "rewards/margins": 9.217796325683594, "rewards/rejected": -9.130668640136719, "step": 3880 }, { "epoch": 2.0355834641548927, "grad_norm": 4.179971815800507, "learning_rate": 1.7861988757511145e-07, "logits/chosen": -2.1586110591888428, "logits/rejected": -1.9041106700897217, "logps/chosen": -296.4727478027344, "logps/rejected": -328.80078125, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.46136999130249023, "rewards/margins": 10.083131790161133, "rewards/rejected": -10.544503211975098, "step": 3890 }, { "epoch": 2.0408163265306123, "grad_norm": 16.654223495325816, "learning_rate": 1.7765070750145375e-07, "logits/chosen": -1.8256515264511108, "logits/rejected": -1.5629663467407227, "logps/chosen": -258.5280456542969, "logps/rejected": -383.2508850097656, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3603250980377197, "rewards/margins": 11.8051118850708, "rewards/rejected": -13.165435791015625, "step": 3900 }, { "epoch": 2.0408163265306123, "eval_logits/chosen": -1.7154812812805176, "eval_logits/rejected": -1.5130733251571655, "eval_logps/chosen": -324.9953308105469, "eval_logps/rejected": -352.1912841796875, "eval_loss": 0.8833321928977966, "eval_rewards/accuracies": 0.732421875, "eval_rewards/chosen": -5.008349418640137, "eval_rewards/margins": 3.769028902053833, "eval_rewards/rejected": -8.77737808227539, "eval_runtime": 221.996, "eval_samples_per_second": 9.009, "eval_steps_per_second": 0.144, "step": 3900 }, { "epoch": 2.046049188906332, "grad_norm": 2.5810656652855384, "learning_rate": 1.7668152742779608e-07, "logits/chosen": -1.6466734409332275, "logits/rejected": -1.5182273387908936, "logps/chosen": -281.562255859375, "logps/rejected": -412.06085205078125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.672607660293579, "rewards/margins": 10.606077194213867, "rewards/rejected": -13.27868366241455, "step": 3910 }, { "epoch": 2.051282051282051, "grad_norm": 0.445319168244973, "learning_rate": 1.757123473541384e-07, "logits/chosen": -1.5864505767822266, "logits/rejected": -1.4270035028457642, "logps/chosen": -256.88311767578125, "logps/rejected": -369.2041320800781, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.8064348697662354, "rewards/margins": 11.58574390411377, "rewards/rejected": -13.392178535461426, "step": 3920 }, { "epoch": 2.0565149136577707, "grad_norm": 3.238249767413015, "learning_rate": 1.747431672804807e-07, "logits/chosen": -1.784555435180664, "logits/rejected": -1.3615097999572754, "logps/chosen": -327.14666748046875, "logps/rejected": -376.7530822753906, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1112983226776123, "rewards/margins": 11.805521011352539, "rewards/rejected": -13.91681957244873, "step": 3930 }, { "epoch": 2.0617477760334904, "grad_norm": 70.54381166190825, "learning_rate": 1.73773987206823e-07, "logits/chosen": -1.7269071340560913, "logits/rejected": -1.555687427520752, "logps/chosen": -271.82305908203125, "logps/rejected": -399.38311767578125, "loss": 0.0153, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.31229567527771, "rewards/margins": 11.910050392150879, "rewards/rejected": -13.222345352172852, "step": 3940 }, { "epoch": 2.06698063840921, "grad_norm": 3.291806022868679, "learning_rate": 1.7280480713316534e-07, "logits/chosen": -1.7775026559829712, "logits/rejected": -1.4059690237045288, "logps/chosen": -306.10614013671875, "logps/rejected": -393.19305419921875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -1.917095422744751, "rewards/margins": 12.099812507629395, "rewards/rejected": -14.0169095993042, "step": 3950 }, { "epoch": 2.072213500784929, "grad_norm": 62.2774370145982, "learning_rate": 1.7183562705950767e-07, "logits/chosen": -1.5891709327697754, "logits/rejected": -1.3685919046401978, "logps/chosen": -290.0743713378906, "logps/rejected": -407.771728515625, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.109947919845581, "rewards/margins": 13.229748725891113, "rewards/rejected": -15.339696884155273, "step": 3960 }, { "epoch": 2.077446363160649, "grad_norm": 5.641209400340198, "learning_rate": 1.7086644698584997e-07, "logits/chosen": -1.6096198558807373, "logits/rejected": -1.187403678894043, "logps/chosen": -284.5003662109375, "logps/rejected": -341.6573486328125, "loss": 0.0104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.268524169921875, "rewards/margins": 10.223438262939453, "rewards/rejected": -13.491961479187012, "step": 3970 }, { "epoch": 2.0826792255363684, "grad_norm": 31.54323883253278, "learning_rate": 1.6989726691219227e-07, "logits/chosen": -1.8186414241790771, "logits/rejected": -1.568420171737671, "logps/chosen": -344.41131591796875, "logps/rejected": -438.26129150390625, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2491374015808105, "rewards/margins": 12.659438133239746, "rewards/rejected": -14.908575057983398, "step": 3980 }, { "epoch": 2.087912087912088, "grad_norm": 3.915503505330991, "learning_rate": 1.689280868385346e-07, "logits/chosen": -1.8544843196868896, "logits/rejected": -1.4267140626907349, "logps/chosen": -276.9905700683594, "logps/rejected": -376.9084777832031, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -2.799527645111084, "rewards/margins": 11.97034740447998, "rewards/rejected": -14.769874572753906, "step": 3990 }, { "epoch": 2.0931449502878072, "grad_norm": 1.437142342100717, "learning_rate": 1.679589067648769e-07, "logits/chosen": -1.7917182445526123, "logits/rejected": -1.4444924592971802, "logps/chosen": -293.43511962890625, "logps/rejected": -396.1407165527344, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.091099977493286, "rewards/margins": 12.69747543334961, "rewards/rejected": -14.788576126098633, "step": 4000 }, { "epoch": 2.0931449502878072, "eval_logits/chosen": -1.8694313764572144, "eval_logits/rejected": -1.6158483028411865, "eval_logps/chosen": -336.1759338378906, "eval_logps/rejected": -367.97119140625, "eval_loss": 0.9721545577049255, "eval_rewards/accuracies": 0.744140625, "eval_rewards/chosen": -6.126408576965332, "eval_rewards/margins": 4.228956699371338, "eval_rewards/rejected": -10.355364799499512, "eval_runtime": 222.3106, "eval_samples_per_second": 8.996, "eval_steps_per_second": 0.144, "step": 4000 }, { "epoch": 2.098377812663527, "grad_norm": 0.17279173017440724, "learning_rate": 1.6698972669121923e-07, "logits/chosen": -1.9071719646453857, "logits/rejected": -1.5385925769805908, "logps/chosen": -281.7474670410156, "logps/rejected": -411.5989685058594, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -2.8792343139648438, "rewards/margins": 13.827468872070312, "rewards/rejected": -16.70670509338379, "step": 4010 }, { "epoch": 2.1036106750392465, "grad_norm": 0.7701655675507049, "learning_rate": 1.6602054661756155e-07, "logits/chosen": -2.026282548904419, "logits/rejected": -1.6024436950683594, "logps/chosen": -345.68316650390625, "logps/rejected": -470.46014404296875, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -1.5267884731292725, "rewards/margins": 14.93049430847168, "rewards/rejected": -16.457286834716797, "step": 4020 }, { "epoch": 2.108843537414966, "grad_norm": 1.1433747742501128, "learning_rate": 1.6505136654390383e-07, "logits/chosen": -1.849346399307251, "logits/rejected": -1.4481465816497803, "logps/chosen": -304.0299072265625, "logps/rejected": -403.6064758300781, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -3.943570613861084, "rewards/margins": 11.937836647033691, "rewards/rejected": -15.88140869140625, "step": 4030 }, { "epoch": 2.1140763997906853, "grad_norm": 11.672941648617368, "learning_rate": 1.6408218647024616e-07, "logits/chosen": -1.8534587621688843, "logits/rejected": -1.491811990737915, "logps/chosen": -312.2471618652344, "logps/rejected": -428.0400390625, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.438689708709717, "rewards/margins": 12.467877388000488, "rewards/rejected": -15.90656566619873, "step": 4040 }, { "epoch": 2.119309262166405, "grad_norm": 0.3558212308850055, "learning_rate": 1.6311300639658848e-07, "logits/chosen": -1.974471092224121, "logits/rejected": -1.5567271709442139, "logps/chosen": -326.0544738769531, "logps/rejected": -400.6845397949219, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -3.207573413848877, "rewards/margins": 12.100391387939453, "rewards/rejected": -15.307965278625488, "step": 4050 }, { "epoch": 2.1245421245421245, "grad_norm": 0.17473487952057576, "learning_rate": 1.621438263229308e-07, "logits/chosen": -2.0456089973449707, "logits/rejected": -1.6989820003509521, "logps/chosen": -300.3196105957031, "logps/rejected": -408.50677490234375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.033864974975586, "rewards/margins": 13.19610595703125, "rewards/rejected": -16.229970932006836, "step": 4060 }, { "epoch": 2.129774986917844, "grad_norm": 1.0542140766338064, "learning_rate": 1.6117464624927309e-07, "logits/chosen": -1.954842209815979, "logits/rejected": -1.6923853158950806, "logps/chosen": -285.2176513671875, "logps/rejected": -427.909912109375, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8270351886749268, "rewards/margins": 14.359907150268555, "rewards/rejected": -18.18694305419922, "step": 4070 }, { "epoch": 2.1350078492935634, "grad_norm": 0.8534481512795538, "learning_rate": 1.6020546617561541e-07, "logits/chosen": -1.8923704624176025, "logits/rejected": -1.6065568923950195, "logps/chosen": -302.41827392578125, "logps/rejected": -444.46551513671875, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1700589656829834, "rewards/margins": 15.152372360229492, "rewards/rejected": -18.322429656982422, "step": 4080 }, { "epoch": 2.140240711669283, "grad_norm": 2.82019033036379, "learning_rate": 1.5923628610195774e-07, "logits/chosen": -1.9063475131988525, "logits/rejected": -1.4251835346221924, "logps/chosen": -328.1401062011719, "logps/rejected": -423.852783203125, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.718710899353027, "rewards/margins": 12.693868637084961, "rewards/rejected": -17.412578582763672, "step": 4090 }, { "epoch": 2.1454735740450026, "grad_norm": 7.673220279225905, "learning_rate": 1.5826710602830004e-07, "logits/chosen": -1.8191146850585938, "logits/rejected": -1.3422605991363525, "logps/chosen": -301.18994140625, "logps/rejected": -436.37884521484375, "loss": 0.0144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.484773635864258, "rewards/margins": 13.255203247070312, "rewards/rejected": -16.739978790283203, "step": 4100 }, { "epoch": 2.1454735740450026, "eval_logits/chosen": -1.702190637588501, "eval_logits/rejected": -1.3816593885421753, "eval_logps/chosen": -352.7836608886719, "eval_logps/rejected": -390.5073547363281, "eval_loss": 1.0865466594696045, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -7.7871832847595215, "eval_rewards/margins": 4.821796417236328, "eval_rewards/rejected": -12.608980178833008, "eval_runtime": 220.3856, "eval_samples_per_second": 9.075, "eval_steps_per_second": 0.145, "step": 4100 }, { "epoch": 2.1507064364207222, "grad_norm": 1.978183534759451, "learning_rate": 1.5729792595464237e-07, "logits/chosen": -1.71258544921875, "logits/rejected": -1.1272776126861572, "logps/chosen": -330.2572937011719, "logps/rejected": -399.00933837890625, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.0442047119140625, "rewards/margins": 14.103948593139648, "rewards/rejected": -18.148151397705078, "step": 4110 }, { "epoch": 2.155939298796442, "grad_norm": 5.349890228659826, "learning_rate": 1.563287458809847e-07, "logits/chosen": -1.691204309463501, "logits/rejected": -1.2004318237304688, "logps/chosen": -363.68157958984375, "logps/rejected": -537.2859497070312, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9943907260894775, "rewards/margins": 14.924301147460938, "rewards/rejected": -17.918689727783203, "step": 4120 }, { "epoch": 2.161172161172161, "grad_norm": 2.269403112184859, "learning_rate": 1.5535956580732697e-07, "logits/chosen": -1.28058922290802, "logits/rejected": -1.0815701484680176, "logps/chosen": -305.71246337890625, "logps/rejected": -466.3035583496094, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -4.936121463775635, "rewards/margins": 14.189343452453613, "rewards/rejected": -19.125465393066406, "step": 4130 }, { "epoch": 2.1664050235478807, "grad_norm": 3.2987483859445272, "learning_rate": 1.543903857336693e-07, "logits/chosen": -1.4532785415649414, "logits/rejected": -0.7421761751174927, "logps/chosen": -338.6584167480469, "logps/rejected": -438.45379638671875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -5.7122321128845215, "rewards/margins": 14.551777839660645, "rewards/rejected": -20.264007568359375, "step": 4140 }, { "epoch": 2.1716378859236003, "grad_norm": 20.70730094179457, "learning_rate": 1.5342120566001163e-07, "logits/chosen": -1.708059310913086, "logits/rejected": -1.1626859903335571, "logps/chosen": -319.282958984375, "logps/rejected": -427.8634338378906, "loss": 0.0083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.242346286773682, "rewards/margins": 13.834518432617188, "rewards/rejected": -18.07686424255371, "step": 4150 }, { "epoch": 2.17687074829932, "grad_norm": 2.6413289391601062, "learning_rate": 1.5245202558635396e-07, "logits/chosen": -1.8341776132583618, "logits/rejected": -1.434983491897583, "logps/chosen": -344.68536376953125, "logps/rejected": -487.75701904296875, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.430539131164551, "rewards/margins": 14.86828899383545, "rewards/rejected": -19.298826217651367, "step": 4160 }, { "epoch": 2.182103610675039, "grad_norm": 0.24255087459023358, "learning_rate": 1.5148284551269623e-07, "logits/chosen": -1.8398370742797852, "logits/rejected": -1.532501459121704, "logps/chosen": -350.470947265625, "logps/rejected": -482.94744873046875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -4.043599605560303, "rewards/margins": 14.743319511413574, "rewards/rejected": -18.78691864013672, "step": 4170 }, { "epoch": 2.1873364730507587, "grad_norm": 1.170617780077935, "learning_rate": 1.5051366543903856e-07, "logits/chosen": -1.8944003582000732, "logits/rejected": -1.506667137145996, "logps/chosen": -330.27496337890625, "logps/rejected": -467.59600830078125, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -4.2945332527160645, "rewards/margins": 14.99769401550293, "rewards/rejected": -19.292226791381836, "step": 4180 }, { "epoch": 2.1925693354264784, "grad_norm": 1.7420348764338498, "learning_rate": 1.495444853653809e-07, "logits/chosen": -1.7570146322250366, "logits/rejected": -1.4628037214279175, "logps/chosen": -293.8160095214844, "logps/rejected": -419.6346740722656, "loss": 0.011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7349791526794434, "rewards/margins": 14.405634880065918, "rewards/rejected": -18.140613555908203, "step": 4190 }, { "epoch": 2.197802197802198, "grad_norm": 86.47505505971033, "learning_rate": 1.485753052917232e-07, "logits/chosen": -1.7744470834732056, "logits/rejected": -1.2162775993347168, "logps/chosen": -346.13409423828125, "logps/rejected": -467.5338439941406, "loss": 0.0222, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2589449882507324, "rewards/margins": 16.249040603637695, "rewards/rejected": -19.507984161376953, "step": 4200 }, { "epoch": 2.197802197802198, "eval_logits/chosen": -1.6966558694839478, "eval_logits/rejected": -1.3909358978271484, "eval_logps/chosen": -354.8811340332031, "eval_logps/rejected": -392.9280090332031, "eval_loss": 1.1129672527313232, "eval_rewards/accuracies": 0.708984375, "eval_rewards/chosen": -7.9969329833984375, "eval_rewards/margins": 4.8541131019592285, "eval_rewards/rejected": -12.851046562194824, "eval_runtime": 217.2469, "eval_samples_per_second": 9.206, "eval_steps_per_second": 0.147, "step": 4200 }, { "epoch": 2.203035060177917, "grad_norm": 0.37676404567744676, "learning_rate": 1.4760612521806552e-07, "logits/chosen": -1.8915624618530273, "logits/rejected": -1.4956997632980347, "logps/chosen": -344.36322021484375, "logps/rejected": -421.96484375, "loss": 0.0267, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.695669174194336, "rewards/margins": 13.718816757202148, "rewards/rejected": -16.414485931396484, "step": 4210 }, { "epoch": 2.208267922553637, "grad_norm": 357.2201049755124, "learning_rate": 1.4663694514440782e-07, "logits/chosen": -2.0087883472442627, "logits/rejected": -1.5528895854949951, "logps/chosen": -320.63018798828125, "logps/rejected": -428.7660217285156, "loss": 0.0158, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.470980644226074, "rewards/margins": 13.398353576660156, "rewards/rejected": -17.869335174560547, "step": 4220 }, { "epoch": 2.2135007849293564, "grad_norm": 0.26092655455197766, "learning_rate": 1.4566776507075012e-07, "logits/chosen": -1.7861378192901611, "logits/rejected": -1.4332597255706787, "logps/chosen": -286.4765930175781, "logps/rejected": -425.73394775390625, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -3.6701064109802246, "rewards/margins": 15.1123628616333, "rewards/rejected": -18.782468795776367, "step": 4230 }, { "epoch": 2.218733647305076, "grad_norm": 3.6573910963693477, "learning_rate": 1.4469858499709245e-07, "logits/chosen": -2.0419600009918213, "logits/rejected": -1.6704151630401611, "logps/chosen": -346.13641357421875, "logps/rejected": -479.1158752441406, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.7138047218322754, "rewards/margins": 16.241077423095703, "rewards/rejected": -19.954883575439453, "step": 4240 }, { "epoch": 2.2239665096807952, "grad_norm": 1.4039592111520158, "learning_rate": 1.4372940492343478e-07, "logits/chosen": -1.8262889385223389, "logits/rejected": -1.4481077194213867, "logps/chosen": -301.0556945800781, "logps/rejected": -425.95904541015625, "loss": 0.0381, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3887314796447754, "rewards/margins": 14.301095962524414, "rewards/rejected": -17.689823150634766, "step": 4250 }, { "epoch": 2.229199372056515, "grad_norm": 1.171242136798712, "learning_rate": 1.427602248497771e-07, "logits/chosen": -1.6064163446426392, "logits/rejected": -1.2176098823547363, "logps/chosen": -326.8307800292969, "logps/rejected": -457.1583557128906, "loss": 0.017, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7317757606506348, "rewards/margins": 13.812063217163086, "rewards/rejected": -17.543840408325195, "step": 4260 }, { "epoch": 2.2344322344322345, "grad_norm": 1.63034121160787, "learning_rate": 1.4179104477611938e-07, "logits/chosen": -1.8732599020004272, "logits/rejected": -1.5643693208694458, "logps/chosen": -297.6470642089844, "logps/rejected": -453.55926513671875, "loss": 0.003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3548221588134766, "rewards/margins": 14.426854133605957, "rewards/rejected": -17.78167724609375, "step": 4270 }, { "epoch": 2.239665096807954, "grad_norm": 13.116296991156315, "learning_rate": 1.408218647024617e-07, "logits/chosen": -1.880324363708496, "logits/rejected": -1.4374394416809082, "logps/chosen": -317.56622314453125, "logps/rejected": -442.3705139160156, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.6204328536987305, "rewards/margins": 14.214098930358887, "rewards/rejected": -18.834529876708984, "step": 4280 }, { "epoch": 2.2448979591836733, "grad_norm": 0.44457115904317396, "learning_rate": 1.3985268462880403e-07, "logits/chosen": -1.8249366283416748, "logits/rejected": -1.495435357093811, "logps/chosen": -352.5448303222656, "logps/rejected": -453.1702575683594, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -4.074585914611816, "rewards/margins": 14.624841690063477, "rewards/rejected": -18.699426651000977, "step": 4290 }, { "epoch": 2.250130821559393, "grad_norm": 4.942528003469965, "learning_rate": 1.3888350455514634e-07, "logits/chosen": -1.997428297996521, "logits/rejected": -1.5461194515228271, "logps/chosen": -371.10760498046875, "logps/rejected": -472.3599548339844, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -3.297682523727417, "rewards/margins": 14.771507263183594, "rewards/rejected": -18.069189071655273, "step": 4300 }, { "epoch": 2.250130821559393, "eval_logits/chosen": -1.7458730936050415, "eval_logits/rejected": -1.5071812868118286, "eval_logps/chosen": -362.7955322265625, "eval_logps/rejected": -399.19024658203125, "eval_loss": 1.0722272396087646, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -8.788372993469238, "eval_rewards/margins": 4.688893795013428, "eval_rewards/rejected": -13.477266311645508, "eval_runtime": 220.6095, "eval_samples_per_second": 9.066, "eval_steps_per_second": 0.145, "step": 4300 }, { "epoch": 2.2553636839351126, "grad_norm": 36.211946661068495, "learning_rate": 1.3791432448148866e-07, "logits/chosen": -1.8661454916000366, "logits/rejected": -1.619972586631775, "logps/chosen": -317.17779541015625, "logps/rejected": -454.04913330078125, "loss": 0.0232, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.826526641845703, "rewards/margins": 13.073823928833008, "rewards/rejected": -17.90035057067871, "step": 4310 }, { "epoch": 2.260596546310832, "grad_norm": 18.269775624790526, "learning_rate": 1.3694514440783096e-07, "logits/chosen": -1.7545273303985596, "logits/rejected": -1.3490523099899292, "logps/chosen": -324.13751220703125, "logps/rejected": -457.81878662109375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -4.073645114898682, "rewards/margins": 15.571569442749023, "rewards/rejected": -19.645214080810547, "step": 4320 }, { "epoch": 2.2658294086865514, "grad_norm": 0.9322271018028229, "learning_rate": 1.3597596433417327e-07, "logits/chosen": -1.7316324710845947, "logits/rejected": -1.3258253335952759, "logps/chosen": -357.536865234375, "logps/rejected": -424.33221435546875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -4.320875644683838, "rewards/margins": 13.340585708618164, "rewards/rejected": -17.661460876464844, "step": 4330 }, { "epoch": 2.271062271062271, "grad_norm": 0.2191395375617181, "learning_rate": 1.350067842605156e-07, "logits/chosen": -1.8539575338363647, "logits/rejected": -1.4873892068862915, "logps/chosen": -327.79925537109375, "logps/rejected": -436.9449768066406, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -4.021679878234863, "rewards/margins": 14.80450439453125, "rewards/rejected": -18.826183319091797, "step": 4340 }, { "epoch": 2.2762951334379906, "grad_norm": 5.978714716362344, "learning_rate": 1.3403760418685792e-07, "logits/chosen": -1.8472375869750977, "logits/rejected": -1.527571201324463, "logps/chosen": -399.43121337890625, "logps/rejected": -451.82733154296875, "loss": 0.0172, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.133065223693848, "rewards/margins": 14.115396499633789, "rewards/rejected": -18.24846076965332, "step": 4350 }, { "epoch": 2.2815279958137102, "grad_norm": 3.3228633164346912, "learning_rate": 1.3306842411320025e-07, "logits/chosen": -1.7037369012832642, "logits/rejected": -1.34806227684021, "logps/chosen": -332.9041442871094, "logps/rejected": -457.6480407714844, "loss": 0.0137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.2194719314575195, "rewards/margins": 13.215705871582031, "rewards/rejected": -18.435176849365234, "step": 4360 }, { "epoch": 2.2867608581894294, "grad_norm": 7.32780309122918, "learning_rate": 1.3209924403954252e-07, "logits/chosen": -1.723406195640564, "logits/rejected": -1.4339897632598877, "logps/chosen": -303.0953369140625, "logps/rejected": -463.48333740234375, "loss": 0.0113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.372370481491089, "rewards/margins": 14.321685791015625, "rewards/rejected": -17.69405746459961, "step": 4370 }, { "epoch": 2.291993720565149, "grad_norm": 0.8790841751055948, "learning_rate": 1.3113006396588485e-07, "logits/chosen": -1.713115930557251, "logits/rejected": -1.5139821767807007, "logps/chosen": -296.68280029296875, "logps/rejected": -468.450927734375, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.422327995300293, "rewards/margins": 13.840237617492676, "rewards/rejected": -18.26256561279297, "step": 4380 }, { "epoch": 2.2972265829408687, "grad_norm": 8.875224770141891, "learning_rate": 1.3016088389222718e-07, "logits/chosen": -1.7016446590423584, "logits/rejected": -1.4579055309295654, "logps/chosen": -345.4739074707031, "logps/rejected": -406.3683776855469, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9114882946014404, "rewards/margins": 12.688652992248535, "rewards/rejected": -16.600141525268555, "step": 4390 }, { "epoch": 2.3024594453165883, "grad_norm": 4.546643192487832, "learning_rate": 1.2919170381856948e-07, "logits/chosen": -1.4630672931671143, "logits/rejected": -1.0828077793121338, "logps/chosen": -284.3868408203125, "logps/rejected": -376.69073486328125, "loss": 0.0164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.4605889320373535, "rewards/margins": 12.329212188720703, "rewards/rejected": -16.78980255126953, "step": 4400 }, { "epoch": 2.3024594453165883, "eval_logits/chosen": -1.51815927028656, "eval_logits/rejected": -1.2294021844863892, "eval_logps/chosen": -362.73248291015625, "eval_logps/rejected": -400.1004638671875, "eval_loss": 1.0993417501449585, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -8.782060623168945, "eval_rewards/margins": 4.786232948303223, "eval_rewards/rejected": -13.568293571472168, "eval_runtime": 221.8899, "eval_samples_per_second": 9.013, "eval_steps_per_second": 0.144, "step": 4400 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5661978551905269, "learning_rate": 1.282225237449118e-07, "logits/chosen": -1.6409820318222046, "logits/rejected": -1.243800163269043, "logps/chosen": -362.60980224609375, "logps/rejected": -427.58551025390625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.7056097984313965, "rewards/margins": 12.95887565612793, "rewards/rejected": -17.664485931396484, "step": 4410 }, { "epoch": 2.312925170068027, "grad_norm": 1.7608635736698999, "learning_rate": 1.272533436712541e-07, "logits/chosen": -1.4827959537506104, "logits/rejected": -0.8618787527084351, "logps/chosen": -291.10009765625, "logps/rejected": -391.69805908203125, "loss": 0.0094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.803543567657471, "rewards/margins": 13.09046459197998, "rewards/rejected": -17.894006729125977, "step": 4420 }, { "epoch": 2.3181580324437467, "grad_norm": 0.4731166253410567, "learning_rate": 1.262841635975964e-07, "logits/chosen": -1.4394365549087524, "logits/rejected": -1.088076114654541, "logps/chosen": -318.5534973144531, "logps/rejected": -432.033935546875, "loss": 0.0145, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.159219741821289, "rewards/margins": 14.869272232055664, "rewards/rejected": -19.028493881225586, "step": 4430 }, { "epoch": 2.3233908948194664, "grad_norm": 0.41773603978025636, "learning_rate": 1.2531498352393874e-07, "logits/chosen": -1.5039451122283936, "logits/rejected": -1.1101276874542236, "logps/chosen": -355.6140441894531, "logps/rejected": -494.3036193847656, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5550899505615234, "rewards/margins": 14.830436706542969, "rewards/rejected": -18.38552474975586, "step": 4440 }, { "epoch": 2.328623757195186, "grad_norm": 18.841776560217824, "learning_rate": 1.2434580345028107e-07, "logits/chosen": -1.395017385482788, "logits/rejected": -1.060880184173584, "logps/chosen": -324.5738220214844, "logps/rejected": -439.9986877441406, "loss": 0.0136, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6189403533935547, "rewards/margins": 15.243639945983887, "rewards/rejected": -18.862581253051758, "step": 4450 }, { "epoch": 2.333856619570905, "grad_norm": 3.61677138060637, "learning_rate": 1.2337662337662337e-07, "logits/chosen": -1.3702367544174194, "logits/rejected": -1.1147315502166748, "logps/chosen": -300.3222351074219, "logps/rejected": -441.28173828125, "loss": 0.0166, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4522387981414795, "rewards/margins": 14.996801376342773, "rewards/rejected": -18.449039459228516, "step": 4460 }, { "epoch": 2.339089481946625, "grad_norm": 337.3915144921339, "learning_rate": 1.2240744330296567e-07, "logits/chosen": -1.3891953229904175, "logits/rejected": -1.099961280822754, "logps/chosen": -321.42291259765625, "logps/rejected": -464.1163024902344, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.193487644195557, "rewards/margins": 14.755739212036133, "rewards/rejected": -18.949228286743164, "step": 4470 }, { "epoch": 2.3443223443223444, "grad_norm": 5.3911007857464295, "learning_rate": 1.21438263229308e-07, "logits/chosen": -1.1622810363769531, "logits/rejected": -0.7887503504753113, "logps/chosen": -277.24383544921875, "logps/rejected": -419.809814453125, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.472451210021973, "rewards/margins": 13.809962272644043, "rewards/rejected": -19.282413482666016, "step": 4480 }, { "epoch": 2.3495552066980636, "grad_norm": 3.3908674414193882, "learning_rate": 1.204690831556503e-07, "logits/chosen": -1.1291340589523315, "logits/rejected": -0.6053078174591064, "logps/chosen": -319.66583251953125, "logps/rejected": -457.53668212890625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -5.367248058319092, "rewards/margins": 15.670175552368164, "rewards/rejected": -21.03742218017578, "step": 4490 }, { "epoch": 2.3547880690737832, "grad_norm": 1.5336544043680858, "learning_rate": 1.1949990308199263e-07, "logits/chosen": -1.456369161605835, "logits/rejected": -0.859190821647644, "logps/chosen": -372.5581970214844, "logps/rejected": -473.9956970214844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.7550482749938965, "rewards/margins": 14.862614631652832, "rewards/rejected": -19.61766242980957, "step": 4500 }, { "epoch": 2.3547880690737832, "eval_logits/chosen": -1.09574294090271, "eval_logits/rejected": -0.7476086020469666, "eval_logps/chosen": -373.9384765625, "eval_logps/rejected": -412.2026062011719, "eval_loss": 1.1249762773513794, "eval_rewards/accuracies": 0.732421875, "eval_rewards/chosen": -9.90266227722168, "eval_rewards/margins": 4.875840663909912, "eval_rewards/rejected": -14.778504371643066, "eval_runtime": 222.9196, "eval_samples_per_second": 8.972, "eval_steps_per_second": 0.144, "step": 4500 }, { "epoch": 2.360020931449503, "grad_norm": 3.40514125762932, "learning_rate": 1.1853072300833494e-07, "logits/chosen": -1.2517144680023193, "logits/rejected": -0.9176338911056519, "logps/chosen": -327.3069152832031, "logps/rejected": -450.65740966796875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -4.483835697174072, "rewards/margins": 14.02863597869873, "rewards/rejected": -18.512470245361328, "step": 4510 }, { "epoch": 2.3652537938252225, "grad_norm": 14.126972807720827, "learning_rate": 1.1756154293467726e-07, "logits/chosen": -1.467652678489685, "logits/rejected": -0.9547795057296753, "logps/chosen": -395.40447998046875, "logps/rejected": -513.0770263671875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -4.472814083099365, "rewards/margins": 16.538341522216797, "rewards/rejected": -21.01115608215332, "step": 4520 }, { "epoch": 2.370486656200942, "grad_norm": 0.9284438350020452, "learning_rate": 1.1659236286101957e-07, "logits/chosen": -1.1644915342330933, "logits/rejected": -0.6010005474090576, "logps/chosen": -333.05914306640625, "logps/rejected": -431.88531494140625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -5.343001365661621, "rewards/margins": 14.191340446472168, "rewards/rejected": -19.53434181213379, "step": 4530 }, { "epoch": 2.3757195185766613, "grad_norm": 17.77184526527596, "learning_rate": 1.1562318278736189e-07, "logits/chosen": -1.4092177152633667, "logits/rejected": -0.8735829591751099, "logps/chosen": -347.2710876464844, "logps/rejected": -462.8575134277344, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.7680134773254395, "rewards/margins": 15.547823905944824, "rewards/rejected": -20.315837860107422, "step": 4540 }, { "epoch": 2.380952380952381, "grad_norm": 6.201536508815847, "learning_rate": 1.146540027137042e-07, "logits/chosen": -1.5193450450897217, "logits/rejected": -1.075331449508667, "logps/chosen": -343.96124267578125, "logps/rejected": -446.32666015625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.925023078918457, "rewards/margins": 13.596059799194336, "rewards/rejected": -18.521081924438477, "step": 4550 }, { "epoch": 2.3861852433281006, "grad_norm": 3.3636972277644372, "learning_rate": 1.1368482264004651e-07, "logits/chosen": -1.2732129096984863, "logits/rejected": -0.8378338813781738, "logps/chosen": -345.1691589355469, "logps/rejected": -515.385986328125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -5.457891941070557, "rewards/margins": 15.813512802124023, "rewards/rejected": -21.271404266357422, "step": 4560 }, { "epoch": 2.3914181057038197, "grad_norm": 24.194725153318593, "learning_rate": 1.1271564256638883e-07, "logits/chosen": -1.2939653396606445, "logits/rejected": -0.7698228359222412, "logps/chosen": -356.67669677734375, "logps/rejected": -457.9652404785156, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -4.990460395812988, "rewards/margins": 14.007291793823242, "rewards/rejected": -18.997753143310547, "step": 4570 }, { "epoch": 2.3966509680795394, "grad_norm": 7.327635632944823, "learning_rate": 1.1174646249273114e-07, "logits/chosen": -1.0309888124465942, "logits/rejected": -0.6903551816940308, "logps/chosen": -337.888916015625, "logps/rejected": -481.9061584472656, "loss": 0.0248, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.396435737609863, "rewards/margins": 14.344100952148438, "rewards/rejected": -20.740535736083984, "step": 4580 }, { "epoch": 2.401883830455259, "grad_norm": 80.47896093506195, "learning_rate": 1.1077728241907346e-07, "logits/chosen": -1.1125319004058838, "logits/rejected": -0.6521554589271545, "logps/chosen": -339.73699951171875, "logps/rejected": -474.95172119140625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -5.04714298248291, "rewards/margins": 16.05856704711914, "rewards/rejected": -21.105709075927734, "step": 4590 }, { "epoch": 2.4071166928309786, "grad_norm": 16.113469293802325, "learning_rate": 1.0980810234541577e-07, "logits/chosen": -1.073447346687317, "logits/rejected": -0.5534033179283142, "logps/chosen": -306.2637939453125, "logps/rejected": -461.4892578125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -5.526026725769043, "rewards/margins": 15.030405044555664, "rewards/rejected": -20.556434631347656, "step": 4600 }, { "epoch": 2.4071166928309786, "eval_logits/chosen": -1.0019875764846802, "eval_logits/rejected": -0.594019889831543, "eval_logps/chosen": -379.297119140625, "eval_logps/rejected": -420.0611572265625, "eval_loss": 1.1975449323654175, "eval_rewards/accuracies": 0.728515625, "eval_rewards/chosen": -10.438529014587402, "eval_rewards/margins": 5.125831604003906, "eval_rewards/rejected": -15.564359664916992, "eval_runtime": 222.1561, "eval_samples_per_second": 9.003, "eval_steps_per_second": 0.144, "step": 4600 }, { "epoch": 2.4123495552066982, "grad_norm": 0.9678375240111109, "learning_rate": 1.0883892227175809e-07, "logits/chosen": -0.9676529765129089, "logits/rejected": -0.38217049837112427, "logps/chosen": -292.9762268066406, "logps/rejected": -472.7262268066406, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -6.590880393981934, "rewards/margins": 17.07498550415039, "rewards/rejected": -23.66586685180664, "step": 4610 }, { "epoch": 2.4175824175824174, "grad_norm": 0.13588546665161647, "learning_rate": 1.078697421981004e-07, "logits/chosen": -1.2403453588485718, "logits/rejected": -1.0074535608291626, "logps/chosen": -331.1590576171875, "logps/rejected": -504.5526428222656, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -5.336001396179199, "rewards/margins": 14.526225090026855, "rewards/rejected": -19.862226486206055, "step": 4620 }, { "epoch": 2.422815279958137, "grad_norm": 1.091217144199657, "learning_rate": 1.0690056212444272e-07, "logits/chosen": -1.2076736688613892, "logits/rejected": -0.8387308120727539, "logps/chosen": -375.07342529296875, "logps/rejected": -506.6279296875, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.553755760192871, "rewards/margins": 16.366252899169922, "rewards/rejected": -20.92000961303711, "step": 4630 }, { "epoch": 2.4280481423338567, "grad_norm": 1.2973586731217244, "learning_rate": 1.0593138205078503e-07, "logits/chosen": -1.5336076021194458, "logits/rejected": -0.9917329549789429, "logps/chosen": -349.8462219238281, "logps/rejected": -438.5542907714844, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.810014009475708, "rewards/margins": 15.934967041015625, "rewards/rejected": -19.744983673095703, "step": 4640 }, { "epoch": 2.4332810047095763, "grad_norm": 9.384091690119678, "learning_rate": 1.0496220197712735e-07, "logits/chosen": -1.3734732866287231, "logits/rejected": -0.8663279414176941, "logps/chosen": -349.04095458984375, "logps/rejected": -436.67669677734375, "loss": 0.0104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.074618339538574, "rewards/margins": 14.132675170898438, "rewards/rejected": -19.207294464111328, "step": 4650 }, { "epoch": 2.4385138670852955, "grad_norm": 0.8253483084498167, "learning_rate": 1.0399302190346966e-07, "logits/chosen": -1.3484551906585693, "logits/rejected": -0.9189489483833313, "logps/chosen": -368.49078369140625, "logps/rejected": -459.4332580566406, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.258188724517822, "rewards/margins": 14.235844612121582, "rewards/rejected": -19.49403190612793, "step": 4660 }, { "epoch": 2.443746729461015, "grad_norm": 0.5633228315233111, "learning_rate": 1.0302384182981198e-07, "logits/chosen": -1.5077921152114868, "logits/rejected": -1.0789073705673218, "logps/chosen": -393.508544921875, "logps/rejected": -487.1568908691406, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -4.525371551513672, "rewards/margins": 15.420866012573242, "rewards/rejected": -19.946237564086914, "step": 4670 }, { "epoch": 2.4489795918367347, "grad_norm": 7.7350873931947435, "learning_rate": 1.0205466175615429e-07, "logits/chosen": -1.5459680557250977, "logits/rejected": -1.0190424919128418, "logps/chosen": -386.88580322265625, "logps/rejected": -503.068115234375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -4.344311714172363, "rewards/margins": 15.517217636108398, "rewards/rejected": -19.861530303955078, "step": 4680 }, { "epoch": 2.4542124542124544, "grad_norm": 0.4564832215336133, "learning_rate": 1.010854816824966e-07, "logits/chosen": -1.3345346450805664, "logits/rejected": -0.7118848562240601, "logps/chosen": -317.5199890136719, "logps/rejected": -426.77740478515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.414102077484131, "rewards/margins": 14.011309623718262, "rewards/rejected": -19.425413131713867, "step": 4690 }, { "epoch": 2.4594453165881736, "grad_norm": 18.703092780311614, "learning_rate": 1.0011630160883892e-07, "logits/chosen": -1.2031428813934326, "logits/rejected": -0.8067036867141724, "logps/chosen": -341.3122863769531, "logps/rejected": -489.5074157714844, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.267340183258057, "rewards/margins": 14.801874160766602, "rewards/rejected": -21.0692138671875, "step": 4700 }, { "epoch": 2.4594453165881736, "eval_logits/chosen": -1.2412922382354736, "eval_logits/rejected": -0.9036476612091064, "eval_logps/chosen": -377.4186706542969, "eval_logps/rejected": -416.21063232421875, "eval_loss": 1.1442936658859253, "eval_rewards/accuracies": 0.734375, "eval_rewards/chosen": -10.250679969787598, "eval_rewards/margins": 4.928625583648682, "eval_rewards/rejected": -15.17930793762207, "eval_runtime": 220.4109, "eval_samples_per_second": 9.074, "eval_steps_per_second": 0.145, "step": 4700 }, { "epoch": 2.464678178963893, "grad_norm": 1.2234685783215498, "learning_rate": 9.914712153518123e-08, "logits/chosen": -1.373504638671875, "logits/rejected": -0.9374760389328003, "logps/chosen": -350.34521484375, "logps/rejected": -456.72955322265625, "loss": 0.0112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.717162609100342, "rewards/margins": 14.930990219116211, "rewards/rejected": -20.64815330505371, "step": 4710 }, { "epoch": 2.469911041339613, "grad_norm": 65.70710519735921, "learning_rate": 9.817794146152354e-08, "logits/chosen": -1.3174304962158203, "logits/rejected": -0.6792569160461426, "logps/chosen": -313.7614440917969, "logps/rejected": -409.63409423828125, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -6.824581146240234, "rewards/margins": 13.567425727844238, "rewards/rejected": -20.39200782775879, "step": 4720 }, { "epoch": 2.4751439037153324, "grad_norm": 1.9800277085092517, "learning_rate": 9.720876138786586e-08, "logits/chosen": -1.3761876821517944, "logits/rejected": -0.8605688810348511, "logps/chosen": -327.0479431152344, "logps/rejected": -445.4021911621094, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.605675220489502, "rewards/margins": 14.786867141723633, "rewards/rejected": -19.392541885375977, "step": 4730 }, { "epoch": 2.4803767660910516, "grad_norm": 4.321665112606113, "learning_rate": 9.623958131420818e-08, "logits/chosen": -1.3254318237304688, "logits/rejected": -0.8368440866470337, "logps/chosen": -318.51739501953125, "logps/rejected": -434.72039794921875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.741262912750244, "rewards/margins": 14.217828750610352, "rewards/rejected": -18.959091186523438, "step": 4740 }, { "epoch": 2.4856096284667712, "grad_norm": 1.6443224265638914, "learning_rate": 9.527040124055049e-08, "logits/chosen": -1.3288558721542358, "logits/rejected": -0.9851850271224976, "logps/chosen": -296.11822509765625, "logps/rejected": -415.15826416015625, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -5.717015266418457, "rewards/margins": 13.65253734588623, "rewards/rejected": -19.369552612304688, "step": 4750 }, { "epoch": 2.490842490842491, "grad_norm": 7.901684176721351, "learning_rate": 9.430122116689281e-08, "logits/chosen": -1.5002716779708862, "logits/rejected": -1.144884467124939, "logps/chosen": -324.4737243652344, "logps/rejected": -475.47052001953125, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.013172626495361, "rewards/margins": 15.177549362182617, "rewards/rejected": -20.190723419189453, "step": 4760 }, { "epoch": 2.4960753532182105, "grad_norm": 9.872407613684684, "learning_rate": 9.333204109323511e-08, "logits/chosen": -1.3229342699050903, "logits/rejected": -0.8783276677131653, "logps/chosen": -320.592041015625, "logps/rejected": -446.3987731933594, "loss": 0.0146, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.157200813293457, "rewards/margins": 14.721330642700195, "rewards/rejected": -20.878530502319336, "step": 4770 }, { "epoch": 2.50130821559393, "grad_norm": 0.14424843746949972, "learning_rate": 9.236286101957744e-08, "logits/chosen": -1.2552093267440796, "logits/rejected": -0.8691731691360474, "logps/chosen": -318.884033203125, "logps/rejected": -428.4579162597656, "loss": 0.0037, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.551468849182129, "rewards/margins": 15.97840404510498, "rewards/rejected": -20.52987289428711, "step": 4780 }, { "epoch": 2.5065410779696493, "grad_norm": 1.145073446713144, "learning_rate": 9.139368094591975e-08, "logits/chosen": -1.463075876235962, "logits/rejected": -0.9242236018180847, "logps/chosen": -331.6676940917969, "logps/rejected": -451.33056640625, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -5.842660903930664, "rewards/margins": 13.54230785369873, "rewards/rejected": -19.384967803955078, "step": 4790 }, { "epoch": 2.511773940345369, "grad_norm": 53.414049599066296, "learning_rate": 9.042450087226207e-08, "logits/chosen": -1.4032329320907593, "logits/rejected": -0.8539814949035645, "logps/chosen": -374.23919677734375, "logps/rejected": -472.2911682128906, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -5.821663856506348, "rewards/margins": 14.997770309448242, "rewards/rejected": -20.819433212280273, "step": 4800 }, { "epoch": 2.511773940345369, "eval_logits/chosen": -1.217530608177185, "eval_logits/rejected": -0.8424502015113831, "eval_logps/chosen": -378.7332458496094, "eval_logps/rejected": -418.6387634277344, "eval_loss": 1.1421587467193604, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -10.382144927978516, "eval_rewards/margins": 5.039978504180908, "eval_rewards/rejected": -15.422122955322266, "eval_runtime": 220.0139, "eval_samples_per_second": 9.09, "eval_steps_per_second": 0.145, "step": 4800 }, { "epoch": 2.5170068027210886, "grad_norm": 5.6669095929469515, "learning_rate": 8.945532079860438e-08, "logits/chosen": -1.397002935409546, "logits/rejected": -0.9337421655654907, "logps/chosen": -366.8694152832031, "logps/rejected": -484.88226318359375, "loss": 0.0249, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.546351909637451, "rewards/margins": 15.214116096496582, "rewards/rejected": -20.760469436645508, "step": 4810 }, { "epoch": 2.5222396650968077, "grad_norm": 0.16761125106760624, "learning_rate": 8.848614072494668e-08, "logits/chosen": -1.347931146621704, "logits/rejected": -0.8399303555488586, "logps/chosen": -354.5140380859375, "logps/rejected": -451.71343994140625, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.887838363647461, "rewards/margins": 15.01702880859375, "rewards/rejected": -20.904865264892578, "step": 4820 }, { "epoch": 2.5274725274725274, "grad_norm": 19.153292722375543, "learning_rate": 8.751696065128901e-08, "logits/chosen": -1.4510747194290161, "logits/rejected": -0.9698382616043091, "logps/chosen": -349.3749694824219, "logps/rejected": -482.50286865234375, "loss": 0.0106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.78605318069458, "rewards/margins": 14.621673583984375, "rewards/rejected": -20.40772819519043, "step": 4830 }, { "epoch": 2.532705389848247, "grad_norm": 27.16723491879812, "learning_rate": 8.654778057763132e-08, "logits/chosen": -1.284057378768921, "logits/rejected": -0.8538885116577148, "logps/chosen": -307.3272399902344, "logps/rejected": -439.69488525390625, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.001840114593506, "rewards/margins": 14.73029899597168, "rewards/rejected": -19.73213768005371, "step": 4840 }, { "epoch": 2.5379382522239666, "grad_norm": 9.10796127485408, "learning_rate": 8.557860050397363e-08, "logits/chosen": -1.3410872220993042, "logits/rejected": -0.8306550979614258, "logps/chosen": -329.99737548828125, "logps/rejected": -469.88043212890625, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.6462836265563965, "rewards/margins": 14.453458786010742, "rewards/rejected": -21.099742889404297, "step": 4850 }, { "epoch": 2.5431711145996863, "grad_norm": 2.829651219701308, "learning_rate": 8.460942043031595e-08, "logits/chosen": -1.3660612106323242, "logits/rejected": -0.9202496409416199, "logps/chosen": -386.5443115234375, "logps/rejected": -468.712890625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -5.706084251403809, "rewards/margins": 15.134010314941406, "rewards/rejected": -20.8400936126709, "step": 4860 }, { "epoch": 2.5484039769754054, "grad_norm": 1.3993801831023338, "learning_rate": 8.364024035665825e-08, "logits/chosen": -1.2338815927505493, "logits/rejected": -0.60883629322052, "logps/chosen": -366.0127258300781, "logps/rejected": -450.39013671875, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.53316593170166, "rewards/margins": 13.396270751953125, "rewards/rejected": -19.92943572998047, "step": 4870 }, { "epoch": 2.553636839351125, "grad_norm": 8.168083482719267, "learning_rate": 8.267106028300058e-08, "logits/chosen": -1.3194663524627686, "logits/rejected": -0.8190845251083374, "logps/chosen": -309.6647033691406, "logps/rejected": -484.15509033203125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -6.348785400390625, "rewards/margins": 14.336840629577637, "rewards/rejected": -20.685626983642578, "step": 4880 }, { "epoch": 2.5588697017268447, "grad_norm": 1.2829477553872992, "learning_rate": 8.17018802093429e-08, "logits/chosen": -1.4365357160568237, "logits/rejected": -0.7577365636825562, "logps/chosen": -342.51507568359375, "logps/rejected": -450.714111328125, "loss": 0.0116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9164509773254395, "rewards/margins": 14.551651000976562, "rewards/rejected": -19.468103408813477, "step": 4890 }, { "epoch": 2.564102564102564, "grad_norm": 133.84698457971047, "learning_rate": 8.07327001356852e-08, "logits/chosen": -1.4544377326965332, "logits/rejected": -1.3149387836456299, "logps/chosen": -317.47711181640625, "logps/rejected": -459.3959045410156, "loss": 0.0129, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.178954124450684, "rewards/margins": 14.114166259765625, "rewards/rejected": -18.293121337890625, "step": 4900 }, { "epoch": 2.564102564102564, "eval_logits/chosen": -1.2929997444152832, "eval_logits/rejected": -0.9190285801887512, "eval_logps/chosen": -368.421630859375, "eval_logps/rejected": -406.8686828613281, "eval_loss": 1.115505337715149, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -9.350979804992676, "eval_rewards/margins": 4.894130229949951, "eval_rewards/rejected": -14.245111465454102, "eval_runtime": 222.0815, "eval_samples_per_second": 9.006, "eval_steps_per_second": 0.144, "step": 4900 }, { "epoch": 2.5693354264782835, "grad_norm": 2.450535442436061, "learning_rate": 7.976352006202753e-08, "logits/chosen": -1.4991605281829834, "logits/rejected": -1.1685174703598022, "logps/chosen": -358.2704162597656, "logps/rejected": -462.3388671875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -4.822484493255615, "rewards/margins": 13.588537216186523, "rewards/rejected": -18.411022186279297, "step": 4910 }, { "epoch": 2.574568288854003, "grad_norm": 1.2481538539893653, "learning_rate": 7.879433998836983e-08, "logits/chosen": -1.346956729888916, "logits/rejected": -0.7180608510971069, "logps/chosen": -367.8849792480469, "logps/rejected": -461.6809997558594, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.024774074554443, "rewards/margins": 14.506240844726562, "rewards/rejected": -19.531017303466797, "step": 4920 }, { "epoch": 2.5798011512297228, "grad_norm": 3.701132658875072, "learning_rate": 7.782515991471216e-08, "logits/chosen": -1.2875282764434814, "logits/rejected": -0.9736431241035461, "logps/chosen": -294.10260009765625, "logps/rejected": -475.20562744140625, "loss": 0.0202, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.328468322753906, "rewards/margins": 14.62384033203125, "rewards/rejected": -19.952306747436523, "step": 4930 }, { "epoch": 2.5850340136054424, "grad_norm": 0.8033607231763227, "learning_rate": 7.685597984105447e-08, "logits/chosen": -1.0782692432403564, "logits/rejected": -0.7414752244949341, "logps/chosen": -303.42645263671875, "logps/rejected": -452.6419982910156, "loss": 0.0262, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.153589725494385, "rewards/margins": 14.17754077911377, "rewards/rejected": -20.33112907409668, "step": 4940 }, { "epoch": 2.5902668759811616, "grad_norm": 32.89928245726819, "learning_rate": 7.588679976739677e-08, "logits/chosen": -1.0568872690200806, "logits/rejected": -0.7248207330703735, "logps/chosen": -317.9796142578125, "logps/rejected": -508.5437927246094, "loss": 0.0112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.198376178741455, "rewards/margins": 15.357853889465332, "rewards/rejected": -21.556230545043945, "step": 4950 }, { "epoch": 2.595499738356881, "grad_norm": 40.54841822703054, "learning_rate": 7.49176196937391e-08, "logits/chosen": -1.2194268703460693, "logits/rejected": -0.9972572326660156, "logps/chosen": -368.8533020019531, "logps/rejected": -492.67999267578125, "loss": 0.0181, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.049595832824707, "rewards/margins": 14.703588485717773, "rewards/rejected": -19.753183364868164, "step": 4960 }, { "epoch": 2.600732600732601, "grad_norm": 0.3628723278440622, "learning_rate": 7.39484396200814e-08, "logits/chosen": -1.114551305770874, "logits/rejected": -0.7708591222763062, "logps/chosen": -354.394775390625, "logps/rejected": -528.8939819335938, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.651768684387207, "rewards/margins": 15.612066268920898, "rewards/rejected": -21.26383399963379, "step": 4970 }, { "epoch": 2.60596546310832, "grad_norm": 0.9606935826167063, "learning_rate": 7.297925954642373e-08, "logits/chosen": -1.2622501850128174, "logits/rejected": -0.7919420003890991, "logps/chosen": -348.60736083984375, "logps/rejected": -489.42364501953125, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.761450290679932, "rewards/margins": 14.725500106811523, "rewards/rejected": -21.486949920654297, "step": 4980 }, { "epoch": 2.6111983254840396, "grad_norm": 0.7773908502105705, "learning_rate": 7.201007947276603e-08, "logits/chosen": -0.9540790319442749, "logits/rejected": -0.3669336140155792, "logps/chosen": -316.34326171875, "logps/rejected": -479.26129150390625, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.749539852142334, "rewards/margins": 16.936477661132812, "rewards/rejected": -22.686016082763672, "step": 4990 }, { "epoch": 2.6164311878597593, "grad_norm": 1.3935544065854253, "learning_rate": 7.104089939910834e-08, "logits/chosen": -1.167436122894287, "logits/rejected": -0.7113689184188843, "logps/chosen": -316.1573181152344, "logps/rejected": -456.22222900390625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -5.780141830444336, "rewards/margins": 13.970486640930176, "rewards/rejected": -19.750627517700195, "step": 5000 }, { "epoch": 2.6164311878597593, "eval_logits/chosen": -1.0263546705245972, "eval_logits/rejected": -0.6075526475906372, "eval_logps/chosen": -382.1504211425781, "eval_logps/rejected": -424.7771911621094, "eval_loss": 1.1905261278152466, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -10.723856925964355, "eval_rewards/margins": 5.312103748321533, "eval_rewards/rejected": -16.03596305847168, "eval_runtime": 224.7517, "eval_samples_per_second": 8.899, "eval_steps_per_second": 0.142, "step": 5000 }, { "epoch": 2.621664050235479, "grad_norm": 4.027705526746669, "learning_rate": 7.007171932545067e-08, "logits/chosen": -1.22074294090271, "logits/rejected": -0.5824058651924133, "logps/chosen": -385.4154357910156, "logps/rejected": -483.08050537109375, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.230188846588135, "rewards/margins": 14.77534008026123, "rewards/rejected": -21.00552749633789, "step": 5010 }, { "epoch": 2.6268969126111985, "grad_norm": 10.364357971959084, "learning_rate": 6.910253925179297e-08, "logits/chosen": -1.157440185546875, "logits/rejected": -0.6002715826034546, "logps/chosen": -339.4776306152344, "logps/rejected": -456.51202392578125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -5.962937831878662, "rewards/margins": 15.342620849609375, "rewards/rejected": -21.305559158325195, "step": 5020 }, { "epoch": 2.6321297749869177, "grad_norm": 1.4911250605389275, "learning_rate": 6.81333591781353e-08, "logits/chosen": -0.8587439656257629, "logits/rejected": -0.449968159198761, "logps/chosen": -320.25433349609375, "logps/rejected": -494.52105712890625, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.7136430740356445, "rewards/margins": 15.39857292175293, "rewards/rejected": -22.11221694946289, "step": 5030 }, { "epoch": 2.6373626373626373, "grad_norm": 0.18255325428231595, "learning_rate": 6.71641791044776e-08, "logits/chosen": -1.0854859352111816, "logits/rejected": -0.5131527781486511, "logps/chosen": -361.51678466796875, "logps/rejected": -482.86346435546875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.049310207366943, "rewards/margins": 16.329849243164062, "rewards/rejected": -23.37915802001953, "step": 5040 }, { "epoch": 2.642595499738357, "grad_norm": 180.55531941473674, "learning_rate": 6.619499903081992e-08, "logits/chosen": -1.0115007162094116, "logits/rejected": -0.5730847716331482, "logps/chosen": -299.21221923828125, "logps/rejected": -470.7162170410156, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.75676155090332, "rewards/margins": 16.181087493896484, "rewards/rejected": -21.937850952148438, "step": 5050 }, { "epoch": 2.647828362114076, "grad_norm": 1.5055317136951336, "learning_rate": 6.522581895716224e-08, "logits/chosen": -1.2051219940185547, "logits/rejected": -0.774687647819519, "logps/chosen": -335.0210876464844, "logps/rejected": -479.7164001464844, "loss": 0.0251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.115020751953125, "rewards/margins": 16.43503189086914, "rewards/rejected": -21.550052642822266, "step": 5060 }, { "epoch": 2.6530612244897958, "grad_norm": 6.949812024913572, "learning_rate": 6.425663888350455e-08, "logits/chosen": -1.0127816200256348, "logits/rejected": -0.32723063230514526, "logps/chosen": -330.1407165527344, "logps/rejected": -492.898681640625, "loss": 0.0096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.352281093597412, "rewards/margins": 15.266721725463867, "rewards/rejected": -22.618999481201172, "step": 5070 }, { "epoch": 2.6582940868655154, "grad_norm": 6.935041607523833, "learning_rate": 6.328745880984687e-08, "logits/chosen": -1.3064888715744019, "logits/rejected": -0.7615693211555481, "logps/chosen": -366.3725891113281, "logps/rejected": -460.90814208984375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -5.12991189956665, "rewards/margins": 17.188709259033203, "rewards/rejected": -22.318620681762695, "step": 5080 }, { "epoch": 2.663526949241235, "grad_norm": 207.56543292817625, "learning_rate": 6.231827873618918e-08, "logits/chosen": -1.197088599205017, "logits/rejected": -0.6629847884178162, "logps/chosen": -358.68182373046875, "logps/rejected": -490.58819580078125, "loss": 0.0303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.353116989135742, "rewards/margins": 15.651908874511719, "rewards/rejected": -22.005023956298828, "step": 5090 }, { "epoch": 2.6687598116169546, "grad_norm": 0.4814755928406524, "learning_rate": 6.13490986625315e-08, "logits/chosen": -1.3483777046203613, "logits/rejected": -0.8359513282775879, "logps/chosen": -314.63702392578125, "logps/rejected": -448.37042236328125, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.5797624588012695, "rewards/margins": 15.26481819152832, "rewards/rejected": -19.844579696655273, "step": 5100 }, { "epoch": 2.6687598116169546, "eval_logits/chosen": -1.1314553022384644, "eval_logits/rejected": -0.7335901856422424, "eval_logps/chosen": -377.53558349609375, "eval_logps/rejected": -419.5959777832031, "eval_loss": 1.1635292768478394, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -10.26237678527832, "eval_rewards/margins": 5.255462169647217, "eval_rewards/rejected": -15.517838478088379, "eval_runtime": 221.6025, "eval_samples_per_second": 9.025, "eval_steps_per_second": 0.144, "step": 5100 }, { "epoch": 2.6739926739926743, "grad_norm": 2.85591166824044, "learning_rate": 6.037991858887382e-08, "logits/chosen": -1.343016505241394, "logits/rejected": -0.8626836538314819, "logps/chosen": -353.92901611328125, "logps/rejected": -510.2799377441406, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -5.385402679443359, "rewards/margins": 16.151649475097656, "rewards/rejected": -21.537052154541016, "step": 5110 }, { "epoch": 2.6792255363683934, "grad_norm": 18.370622276450902, "learning_rate": 5.941073851521612e-08, "logits/chosen": -1.0809108018875122, "logits/rejected": -0.5481597185134888, "logps/chosen": -325.6885070800781, "logps/rejected": -461.4901428222656, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -6.464535713195801, "rewards/margins": 14.638195991516113, "rewards/rejected": -21.10272979736328, "step": 5120 }, { "epoch": 2.684458398744113, "grad_norm": 62.185314255064156, "learning_rate": 5.844155844155844e-08, "logits/chosen": -1.4143283367156982, "logits/rejected": -0.8491595983505249, "logps/chosen": -397.61834716796875, "logps/rejected": -510.74359130859375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -4.0614190101623535, "rewards/margins": 16.28720474243164, "rewards/rejected": -20.348621368408203, "step": 5130 }, { "epoch": 2.6896912611198327, "grad_norm": 49.4515836072188, "learning_rate": 5.7472378367900755e-08, "logits/chosen": -1.1385517120361328, "logits/rejected": -0.7364552617073059, "logps/chosen": -321.9407653808594, "logps/rejected": -422.78839111328125, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.934482574462891, "rewards/margins": 14.1314697265625, "rewards/rejected": -20.065950393676758, "step": 5140 }, { "epoch": 2.694924123495552, "grad_norm": 13.872061363701286, "learning_rate": 5.650319829424307e-08, "logits/chosen": -1.3325976133346558, "logits/rejected": -0.8460037112236023, "logps/chosen": -328.506103515625, "logps/rejected": -432.9920959472656, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -5.798797130584717, "rewards/margins": 14.454076766967773, "rewards/rejected": -20.252872467041016, "step": 5150 }, { "epoch": 2.7001569858712715, "grad_norm": 1.3727686228568312, "learning_rate": 5.5534018220585384e-08, "logits/chosen": -1.178938627243042, "logits/rejected": -0.6552150845527649, "logps/chosen": -326.5952453613281, "logps/rejected": -449.3445739746094, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.296372413635254, "rewards/margins": 15.000231742858887, "rewards/rejected": -21.296606063842773, "step": 5160 }, { "epoch": 2.705389848246991, "grad_norm": 1.4857367924503033, "learning_rate": 5.456483814692769e-08, "logits/chosen": -0.9873617887496948, "logits/rejected": -0.6433528661727905, "logps/chosen": -298.9508361816406, "logps/rejected": -467.663818359375, "loss": 0.0115, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.651206970214844, "rewards/margins": 16.427021026611328, "rewards/rejected": -22.07822608947754, "step": 5170 }, { "epoch": 2.7106227106227108, "grad_norm": 1.0549387247208362, "learning_rate": 5.359565807327001e-08, "logits/chosen": -1.0802698135375977, "logits/rejected": -0.6398526430130005, "logps/chosen": -331.7131042480469, "logps/rejected": -465.5804138183594, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.969025611877441, "rewards/margins": 16.57152557373047, "rewards/rejected": -21.540552139282227, "step": 5180 }, { "epoch": 2.7158555729984304, "grad_norm": 0.18893972998377886, "learning_rate": 5.262647799961233e-08, "logits/chosen": -1.008361577987671, "logits/rejected": -0.6845455765724182, "logps/chosen": -301.2408752441406, "logps/rejected": -428.3558654785156, "loss": 0.0089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.827343940734863, "rewards/margins": 14.520584106445312, "rewards/rejected": -20.34792709350586, "step": 5190 }, { "epoch": 2.7210884353741496, "grad_norm": 230.56009221283674, "learning_rate": 5.165729792595464e-08, "logits/chosen": -1.1779309511184692, "logits/rejected": -0.754621148109436, "logps/chosen": -371.3536682128906, "logps/rejected": -491.2679748535156, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -4.041421890258789, "rewards/margins": 16.182706832885742, "rewards/rejected": -20.224130630493164, "step": 5200 }, { "epoch": 2.7210884353741496, "eval_logits/chosen": -0.9680299758911133, "eval_logits/rejected": -0.5586550831794739, "eval_logps/chosen": -379.5029296875, "eval_logps/rejected": -421.263427734375, "eval_loss": 1.1696583032608032, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -10.4591064453125, "eval_rewards/margins": 5.225481033325195, "eval_rewards/rejected": -15.684587478637695, "eval_runtime": 221.8086, "eval_samples_per_second": 9.017, "eval_steps_per_second": 0.144, "step": 5200 }, { "epoch": 2.726321297749869, "grad_norm": 9.978396091433561, "learning_rate": 5.068811785229696e-08, "logits/chosen": -0.9712247848510742, "logits/rejected": -0.6279711127281189, "logps/chosen": -278.9203186035156, "logps/rejected": -423.95721435546875, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.465112209320068, "rewards/margins": 13.702920913696289, "rewards/rejected": -19.168033599853516, "step": 5210 }, { "epoch": 2.731554160125589, "grad_norm": 6.044516935799119, "learning_rate": 4.9718937778639265e-08, "logits/chosen": -1.1338233947753906, "logits/rejected": -0.797059953212738, "logps/chosen": -344.9239501953125, "logps/rejected": -466.5174865722656, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -5.865880012512207, "rewards/margins": 13.797937393188477, "rewards/rejected": -19.663818359375, "step": 5220 }, { "epoch": 2.736787022501308, "grad_norm": 10.882288291393834, "learning_rate": 4.874975770498158e-08, "logits/chosen": -1.0527111291885376, "logits/rejected": -0.5531308650970459, "logps/chosen": -350.96026611328125, "logps/rejected": -488.42132568359375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -6.323761940002441, "rewards/margins": 15.138943672180176, "rewards/rejected": -21.462703704833984, "step": 5230 }, { "epoch": 2.7420198848770276, "grad_norm": 0.47358991440290094, "learning_rate": 4.77805776313239e-08, "logits/chosen": -0.8294271230697632, "logits/rejected": -0.15024976432323456, "logps/chosen": -270.75067138671875, "logps/rejected": -425.16192626953125, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -5.996121883392334, "rewards/margins": 15.376423835754395, "rewards/rejected": -21.372541427612305, "step": 5240 }, { "epoch": 2.7472527472527473, "grad_norm": 11.961761748092576, "learning_rate": 4.6811397557666216e-08, "logits/chosen": -1.1459107398986816, "logits/rejected": -0.48857077956199646, "logps/chosen": -340.2027587890625, "logps/rejected": -438.56494140625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -4.592692852020264, "rewards/margins": 15.846899032592773, "rewards/rejected": -20.439594268798828, "step": 5250 }, { "epoch": 2.752485609628467, "grad_norm": 13.355230614489862, "learning_rate": 4.584221748400853e-08, "logits/chosen": -0.9249771237373352, "logits/rejected": -0.7348052263259888, "logps/chosen": -295.2366027832031, "logps/rejected": -488.716552734375, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.635087490081787, "rewards/margins": 15.127801895141602, "rewards/rejected": -21.762889862060547, "step": 5260 }, { "epoch": 2.7577184720041865, "grad_norm": 0.8184563559938807, "learning_rate": 4.487303741035084e-08, "logits/chosen": -1.1059998273849487, "logits/rejected": -0.4299143850803375, "logps/chosen": -316.58184814453125, "logps/rejected": -450.552001953125, "loss": 0.0078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.569085121154785, "rewards/margins": 16.372846603393555, "rewards/rejected": -21.941930770874023, "step": 5270 }, { "epoch": 2.7629513343799057, "grad_norm": 0.9934255918001019, "learning_rate": 4.390385733669315e-08, "logits/chosen": -1.0912392139434814, "logits/rejected": -0.6420986652374268, "logps/chosen": -360.6625061035156, "logps/rejected": -466.68194580078125, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.99826717376709, "rewards/margins": 14.567499160766602, "rewards/rejected": -19.565765380859375, "step": 5280 }, { "epoch": 2.7681841967556253, "grad_norm": 2.176985198989105, "learning_rate": 4.2934677263035474e-08, "logits/chosen": -1.2324541807174683, "logits/rejected": -0.5672933459281921, "logps/chosen": -361.19696044921875, "logps/rejected": -467.16607666015625, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.437089920043945, "rewards/margins": 16.02477264404297, "rewards/rejected": -21.46186065673828, "step": 5290 }, { "epoch": 2.773417059131345, "grad_norm": 11.11315427227516, "learning_rate": 4.196549718937779e-08, "logits/chosen": -1.011289119720459, "logits/rejected": -0.8029254674911499, "logps/chosen": -295.74896240234375, "logps/rejected": -451.13671875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -5.619412422180176, "rewards/margins": 14.679559707641602, "rewards/rejected": -20.298969268798828, "step": 5300 }, { "epoch": 2.773417059131345, "eval_logits/chosen": -1.111677646636963, "eval_logits/rejected": -0.7311839461326599, "eval_logps/chosen": -371.86981201171875, "eval_logps/rejected": -412.99383544921875, "eval_loss": 1.1614402532577515, "eval_rewards/accuracies": 0.724609375, "eval_rewards/chosen": -9.695798873901367, "eval_rewards/margins": 5.161827087402344, "eval_rewards/rejected": -14.857625961303711, "eval_runtime": 217.2304, "eval_samples_per_second": 9.207, "eval_steps_per_second": 0.147, "step": 5300 }, { "epoch": 2.778649921507064, "grad_norm": 39.28169211068417, "learning_rate": 4.0996317115720097e-08, "logits/chosen": -1.1473476886749268, "logits/rejected": -0.6158251762390137, "logps/chosen": -324.9118347167969, "logps/rejected": -432.2850646972656, "loss": 0.0114, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.124190330505371, "rewards/margins": 14.089584350585938, "rewards/rejected": -20.21377182006836, "step": 5310 }, { "epoch": 2.7838827838827838, "grad_norm": 5.0811521861793505, "learning_rate": 4.002713704206241e-08, "logits/chosen": -1.2011510133743286, "logits/rejected": -0.5742926001548767, "logps/chosen": -305.45355224609375, "logps/rejected": -446.84820556640625, "loss": 0.0082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.9639081954956055, "rewards/margins": 14.753039360046387, "rewards/rejected": -20.71694564819336, "step": 5320 }, { "epoch": 2.7891156462585034, "grad_norm": 0.21438276587655913, "learning_rate": 3.9057956968404726e-08, "logits/chosen": -0.903179943561554, "logits/rejected": -0.3030739724636078, "logps/chosen": -288.71441650390625, "logps/rejected": -402.7944641113281, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -5.406940937042236, "rewards/margins": 15.035211563110352, "rewards/rejected": -20.442150115966797, "step": 5330 }, { "epoch": 2.794348508634223, "grad_norm": 0.45402295424033884, "learning_rate": 3.808877689474704e-08, "logits/chosen": -1.2260855436325073, "logits/rejected": -0.4688805937767029, "logps/chosen": -353.9613952636719, "logps/rejected": -465.66741943359375, "loss": 0.0083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.747186183929443, "rewards/margins": 15.82872486114502, "rewards/rejected": -21.575910568237305, "step": 5340 }, { "epoch": 2.7995813710099426, "grad_norm": 24.076805243787895, "learning_rate": 3.711959682108936e-08, "logits/chosen": -1.081113576889038, "logits/rejected": -0.7124945521354675, "logps/chosen": -338.6380310058594, "logps/rejected": -454.1852111816406, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -5.500039577484131, "rewards/margins": 14.252687454223633, "rewards/rejected": -19.752727508544922, "step": 5350 }, { "epoch": 2.804814233385662, "grad_norm": 43.445296810580714, "learning_rate": 3.615041674743167e-08, "logits/chosen": -1.2910771369934082, "logits/rejected": -0.8315303921699524, "logps/chosen": -395.3627624511719, "logps/rejected": -485.40069580078125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -5.363285064697266, "rewards/margins": 13.80860424041748, "rewards/rejected": -19.17188835144043, "step": 5360 }, { "epoch": 2.8100470957613815, "grad_norm": 0.8151385258870243, "learning_rate": 3.5181236673773984e-08, "logits/chosen": -1.1565812826156616, "logits/rejected": -0.7277237772941589, "logps/chosen": -329.97125244140625, "logps/rejected": -445.7373962402344, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.035459995269775, "rewards/margins": 13.54619026184082, "rewards/rejected": -19.581649780273438, "step": 5370 }, { "epoch": 2.815279958137101, "grad_norm": 1.018239878180439, "learning_rate": 3.42120566001163e-08, "logits/chosen": -0.9620019197463989, "logits/rejected": -0.6952940821647644, "logps/chosen": -321.4090576171875, "logps/rejected": -478.28729248046875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -5.453761100769043, "rewards/margins": 16.26519203186035, "rewards/rejected": -21.718952178955078, "step": 5380 }, { "epoch": 2.8205128205128203, "grad_norm": 9.134739341627501, "learning_rate": 3.324287652645861e-08, "logits/chosen": -1.316060185432434, "logits/rejected": -0.6638561487197876, "logps/chosen": -360.3699645996094, "logps/rejected": -505.68927001953125, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.198365688323975, "rewards/margins": 16.934099197387695, "rewards/rejected": -21.132465362548828, "step": 5390 }, { "epoch": 2.82574568288854, "grad_norm": 1.1985671597129286, "learning_rate": 3.2273696452800935e-08, "logits/chosen": -1.2535731792449951, "logits/rejected": -0.6485108137130737, "logps/chosen": -313.00274658203125, "logps/rejected": -427.9405212402344, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -5.446139335632324, "rewards/margins": 14.111053466796875, "rewards/rejected": -19.557193756103516, "step": 5400 }, { "epoch": 2.82574568288854, "eval_logits/chosen": -1.0801527500152588, "eval_logits/rejected": -0.6842809915542603, "eval_logps/chosen": -376.01287841796875, "eval_logps/rejected": -417.0325012207031, "eval_loss": 1.1536669731140137, "eval_rewards/accuracies": 0.716796875, "eval_rewards/chosen": -10.110106468200684, "eval_rewards/margins": 5.151389122009277, "eval_rewards/rejected": -15.261494636535645, "eval_runtime": 222.6489, "eval_samples_per_second": 8.983, "eval_steps_per_second": 0.144, "step": 5400 }, { "epoch": 2.8309785452642595, "grad_norm": 2.38554899235331, "learning_rate": 3.130451637914324e-08, "logits/chosen": -1.263742446899414, "logits/rejected": -0.850864589214325, "logps/chosen": -379.99041748046875, "logps/rejected": -493.340576171875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -5.855295658111572, "rewards/margins": 14.703465461730957, "rewards/rejected": -20.558759689331055, "step": 5410 }, { "epoch": 2.836211407639979, "grad_norm": 0.24442266797356973, "learning_rate": 3.033533630548556e-08, "logits/chosen": -1.1347688436508179, "logits/rejected": -0.7221279740333557, "logps/chosen": -352.7520751953125, "logps/rejected": -445.947509765625, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -4.615082740783691, "rewards/margins": 14.75482177734375, "rewards/rejected": -19.36990737915039, "step": 5420 }, { "epoch": 2.8414442700156988, "grad_norm": 3.100799998305922, "learning_rate": 2.9366156231827872e-08, "logits/chosen": -1.171316385269165, "logits/rejected": -0.7999770641326904, "logps/chosen": -356.74749755859375, "logps/rejected": -473.444091796875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -6.220460891723633, "rewards/margins": 14.733111381530762, "rewards/rejected": -20.953571319580078, "step": 5430 }, { "epoch": 2.846677132391418, "grad_norm": 0.4163521556308838, "learning_rate": 2.839697615817019e-08, "logits/chosen": -1.0721533298492432, "logits/rejected": -0.4800244867801666, "logps/chosen": -319.77752685546875, "logps/rejected": -453.77703857421875, "loss": 0.0149, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.266595363616943, "rewards/margins": 14.671236991882324, "rewards/rejected": -20.937829971313477, "step": 5440 }, { "epoch": 2.8519099947671376, "grad_norm": 9.153420663097643, "learning_rate": 2.74277960845125e-08, "logits/chosen": -0.9526373744010925, "logits/rejected": -0.4476087689399719, "logps/chosen": -313.1230773925781, "logps/rejected": -505.02069091796875, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.246424198150635, "rewards/margins": 15.73095417022705, "rewards/rejected": -22.977378845214844, "step": 5450 }, { "epoch": 2.857142857142857, "grad_norm": 0.9062326724539099, "learning_rate": 2.6458616010854815e-08, "logits/chosen": -1.0871398448944092, "logits/rejected": -0.7295079231262207, "logps/chosen": -347.8956604003906, "logps/rejected": -511.82965087890625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -5.8363847732543945, "rewards/margins": 15.657150268554688, "rewards/rejected": -21.493534088134766, "step": 5460 }, { "epoch": 2.8623757195185764, "grad_norm": 36.86610948705389, "learning_rate": 2.548943593719713e-08, "logits/chosen": -1.1093363761901855, "logits/rejected": -0.5206930637359619, "logps/chosen": -333.8991394042969, "logps/rejected": -456.4771423339844, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -6.237990379333496, "rewards/margins": 15.463560104370117, "rewards/rejected": -21.701549530029297, "step": 5470 }, { "epoch": 2.867608581894296, "grad_norm": 150.34508967404915, "learning_rate": 2.4520255863539445e-08, "logits/chosen": -1.0368270874023438, "logits/rejected": -0.9396475553512573, "logps/chosen": -330.38470458984375, "logps/rejected": -498.42608642578125, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.516279697418213, "rewards/margins": 14.998318672180176, "rewards/rejected": -21.514598846435547, "step": 5480 }, { "epoch": 2.8728414442700156, "grad_norm": 250.0327660412077, "learning_rate": 2.3551075789881756e-08, "logits/chosen": -1.0775210857391357, "logits/rejected": -0.5545331239700317, "logps/chosen": -334.61962890625, "logps/rejected": -463.9176330566406, "loss": 0.0149, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.317910194396973, "rewards/margins": 14.299299240112305, "rewards/rejected": -20.61720848083496, "step": 5490 }, { "epoch": 2.8780743066457353, "grad_norm": 3.302583479290735, "learning_rate": 2.2581895716224074e-08, "logits/chosen": -1.1487029790878296, "logits/rejected": -0.28297287225723267, "logps/chosen": -345.1497802734375, "logps/rejected": -446.97552490234375, "loss": 0.0209, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.723986625671387, "rewards/margins": 15.83373737335205, "rewards/rejected": -21.557722091674805, "step": 5500 }, { "epoch": 2.8780743066457353, "eval_logits/chosen": -0.9493402242660522, "eval_logits/rejected": -0.5315797924995422, "eval_logps/chosen": -382.95819091796875, "eval_logps/rejected": -423.419921875, "eval_loss": 1.142513632774353, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -10.80463695526123, "eval_rewards/margins": 5.09559965133667, "eval_rewards/rejected": -15.900236129760742, "eval_runtime": 220.4448, "eval_samples_per_second": 9.073, "eval_steps_per_second": 0.145, "step": 5500 }, { "epoch": 2.883307169021455, "grad_norm": 0.7588987626798548, "learning_rate": 2.161271564256639e-08, "logits/chosen": -1.0916458368301392, "logits/rejected": -0.5062496662139893, "logps/chosen": -353.8129577636719, "logps/rejected": -499.0782165527344, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -5.778087615966797, "rewards/margins": 15.373003005981445, "rewards/rejected": -21.15108871459961, "step": 5510 }, { "epoch": 2.8885400313971745, "grad_norm": 1.4611536615202891, "learning_rate": 2.0643535568908703e-08, "logits/chosen": -1.0158772468566895, "logits/rejected": -0.24364694952964783, "logps/chosen": -334.0220031738281, "logps/rejected": -436.5145568847656, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.593095302581787, "rewards/margins": 13.62456226348877, "rewards/rejected": -21.217655181884766, "step": 5520 }, { "epoch": 2.8937728937728937, "grad_norm": 30.553968505675854, "learning_rate": 1.9674355495251018e-08, "logits/chosen": -1.1931350231170654, "logits/rejected": -0.5602790117263794, "logps/chosen": -368.9960021972656, "logps/rejected": -478.62005615234375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -5.623655796051025, "rewards/margins": 15.397748947143555, "rewards/rejected": -21.021404266357422, "step": 5530 }, { "epoch": 2.8990057561486133, "grad_norm": 0.3710519461008769, "learning_rate": 1.870517542159333e-08, "logits/chosen": -1.1633684635162354, "logits/rejected": -0.47450727224349976, "logps/chosen": -348.9908752441406, "logps/rejected": -490.67041015625, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.945566177368164, "rewards/margins": 14.709017753601074, "rewards/rejected": -20.654582977294922, "step": 5540 }, { "epoch": 2.904238618524333, "grad_norm": 53.61439898569612, "learning_rate": 1.7735995347935647e-08, "logits/chosen": -1.1472399234771729, "logits/rejected": -0.5255054235458374, "logps/chosen": -383.11260986328125, "logps/rejected": -498.3804626464844, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -6.745779514312744, "rewards/margins": 14.65424633026123, "rewards/rejected": -21.400028228759766, "step": 5550 }, { "epoch": 2.909471480900052, "grad_norm": 0.8861557261458789, "learning_rate": 1.676681527427796e-08, "logits/chosen": -1.084014654159546, "logits/rejected": -0.518815279006958, "logps/chosen": -345.63067626953125, "logps/rejected": -492.04095458984375, "loss": 0.0152, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.484875679016113, "rewards/margins": 14.876008987426758, "rewards/rejected": -20.360885620117188, "step": 5560 }, { "epoch": 2.9147043432757718, "grad_norm": 10.335168696325226, "learning_rate": 1.5797635200620273e-08, "logits/chosen": -1.2691819667816162, "logits/rejected": -0.9174174070358276, "logps/chosen": -361.59075927734375, "logps/rejected": -473.28668212890625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -5.078537940979004, "rewards/margins": 14.6199369430542, "rewards/rejected": -19.698474884033203, "step": 5570 }, { "epoch": 2.9199372056514914, "grad_norm": 3.0393292440457156, "learning_rate": 1.4828455126962587e-08, "logits/chosen": -0.9357242584228516, "logits/rejected": -0.4985620081424713, "logps/chosen": -316.87554931640625, "logps/rejected": -469.52239990234375, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.993531227111816, "rewards/margins": 15.592782974243164, "rewards/rejected": -21.586315155029297, "step": 5580 }, { "epoch": 2.925170068027211, "grad_norm": 27.052135960755805, "learning_rate": 1.3859275053304904e-08, "logits/chosen": -0.8931832313537598, "logits/rejected": -0.47486162185668945, "logps/chosen": -338.5508728027344, "logps/rejected": -446.5806579589844, "loss": 0.0166, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.349546432495117, "rewards/margins": 15.573901176452637, "rewards/rejected": -21.92344856262207, "step": 5590 }, { "epoch": 2.9304029304029307, "grad_norm": 141.9900680465616, "learning_rate": 1.2890094979647218e-08, "logits/chosen": -0.9319826364517212, "logits/rejected": -0.3505799174308777, "logps/chosen": -339.04669189453125, "logps/rejected": -462.7384338378906, "loss": 0.0145, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.5506157875061035, "rewards/margins": 16.263586044311523, "rewards/rejected": -22.814197540283203, "step": 5600 }, { "epoch": 2.9304029304029307, "eval_logits/chosen": -1.005789875984192, "eval_logits/rejected": -0.5878034830093384, "eval_logps/chosen": -380.9951171875, "eval_logps/rejected": -422.498291015625, "eval_loss": 1.1672509908676147, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -10.608327865600586, "eval_rewards/margins": 5.199745178222656, "eval_rewards/rejected": -15.808073043823242, "eval_runtime": 216.3661, "eval_samples_per_second": 9.244, "eval_steps_per_second": 0.148, "step": 5600 }, { "epoch": 2.93563579277865, "grad_norm": 94.47617024579561, "learning_rate": 1.1920914905989533e-08, "logits/chosen": -1.1646987199783325, "logits/rejected": -0.3695438504219055, "logps/chosen": -362.9037170410156, "logps/rejected": -485.9522399902344, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -6.649343013763428, "rewards/margins": 16.6782169342041, "rewards/rejected": -23.327560424804688, "step": 5610 }, { "epoch": 2.9408686551543695, "grad_norm": 42.45844256871833, "learning_rate": 1.0951734832331846e-08, "logits/chosen": -1.122222900390625, "logits/rejected": -0.7718731164932251, "logps/chosen": -334.3581848144531, "logps/rejected": -512.6422119140625, "loss": 0.0197, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.860810279846191, "rewards/margins": 16.237356185913086, "rewards/rejected": -22.098169326782227, "step": 5620 }, { "epoch": 2.946101517530089, "grad_norm": 6.773844238419547, "learning_rate": 9.98255475867416e-09, "logits/chosen": -0.8908674120903015, "logits/rejected": -0.2537611722946167, "logps/chosen": -302.39349365234375, "logps/rejected": -434.7608337402344, "loss": 0.0062, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.5753607749938965, "rewards/margins": 14.723836898803711, "rewards/rejected": -21.299198150634766, "step": 5630 }, { "epoch": 2.9513343799058083, "grad_norm": 2.6257490455286425, "learning_rate": 9.013374685016477e-09, "logits/chosen": -1.272011637687683, "logits/rejected": -0.7793909311294556, "logps/chosen": -360.614013671875, "logps/rejected": -526.3766479492188, "loss": 0.0094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.569066524505615, "rewards/margins": 16.651378631591797, "rewards/rejected": -23.220443725585938, "step": 5640 }, { "epoch": 2.956567242281528, "grad_norm": 3.0314168348937685, "learning_rate": 8.044194611358791e-09, "logits/chosen": -1.207622766494751, "logits/rejected": -0.8317354917526245, "logps/chosen": -336.08441162109375, "logps/rejected": -467.4076232910156, "loss": 0.015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.67581033706665, "rewards/margins": 15.154218673706055, "rewards/rejected": -20.830028533935547, "step": 5650 }, { "epoch": 2.9618001046572475, "grad_norm": 1.2175250784514653, "learning_rate": 7.075014537701105e-09, "logits/chosen": -1.2857615947723389, "logits/rejected": -0.5243777632713318, "logps/chosen": -355.365234375, "logps/rejected": -467.15447998046875, "loss": 0.008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.778724670410156, "rewards/margins": 15.014248847961426, "rewards/rejected": -20.792972564697266, "step": 5660 }, { "epoch": 2.967032967032967, "grad_norm": 0.15423002301342398, "learning_rate": 6.105834464043419e-09, "logits/chosen": -1.1190422773361206, "logits/rejected": -0.41099199652671814, "logps/chosen": -367.7512512207031, "logps/rejected": -483.9335021972656, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.458062171936035, "rewards/margins": 15.84019660949707, "rewards/rejected": -22.298259735107422, "step": 5670 }, { "epoch": 2.9722658294086868, "grad_norm": 0.6080462327117588, "learning_rate": 5.136654390385733e-09, "logits/chosen": -0.9887609481811523, "logits/rejected": -0.19320037961006165, "logps/chosen": -340.48443603515625, "logps/rejected": -473.19622802734375, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -7.573164463043213, "rewards/margins": 15.90434455871582, "rewards/rejected": -23.47750473022461, "step": 5680 }, { "epoch": 2.977498691784406, "grad_norm": 11.881703541980716, "learning_rate": 4.167474316728048e-09, "logits/chosen": -0.9063528180122375, "logits/rejected": -0.7397497892379761, "logps/chosen": -324.1902770996094, "logps/rejected": -495.5985412597656, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -7.018535614013672, "rewards/margins": 15.274724960327148, "rewards/rejected": -22.293262481689453, "step": 5690 }, { "epoch": 2.9827315541601256, "grad_norm": 6.607978074651503, "learning_rate": 3.198294243070362e-09, "logits/chosen": -1.158010482788086, "logits/rejected": -0.8962159156799316, "logps/chosen": -345.50579833984375, "logps/rejected": -476.0052185058594, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.554076194763184, "rewards/margins": 15.098276138305664, "rewards/rejected": -20.652353286743164, "step": 5700 }, { "epoch": 2.9827315541601256, "eval_logits/chosen": -1.0021528005599976, "eval_logits/rejected": -0.5914642810821533, "eval_logps/chosen": -383.5809326171875, "eval_logps/rejected": -424.5231018066406, "eval_loss": 1.1475400924682617, "eval_rewards/accuracies": 0.728515625, "eval_rewards/chosen": -10.866905212402344, "eval_rewards/margins": 5.143650054931641, "eval_rewards/rejected": -16.010557174682617, "eval_runtime": 226.2511, "eval_samples_per_second": 8.84, "eval_steps_per_second": 0.141, "step": 5700 }, { "epoch": 2.987964416535845, "grad_norm": 0.4355411904562426, "learning_rate": 2.2291141694126767e-09, "logits/chosen": -0.8499744534492493, "logits/rejected": -0.5392943620681763, "logps/chosen": -409.978271484375, "logps/rejected": -577.7471923828125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -6.552199363708496, "rewards/margins": 14.820777893066406, "rewards/rejected": -21.37298011779785, "step": 5710 }, { "epoch": 2.9931972789115644, "grad_norm": 0.4540870766397068, "learning_rate": 1.2599340957549913e-09, "logits/chosen": -0.9112657308578491, "logits/rejected": -0.5860828161239624, "logps/chosen": -328.2040710449219, "logps/rejected": -486.84686279296875, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.300119400024414, "rewards/margins": 16.899097442626953, "rewards/rejected": -23.199214935302734, "step": 5720 }, { "epoch": 2.998430141287284, "grad_norm": 0.8520808262978153, "learning_rate": 2.907540220973057e-10, "logits/chosen": -1.2528154850006104, "logits/rejected": -0.6007604598999023, "logps/chosen": -390.5143127441406, "logps/rejected": -443.18450927734375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -5.455491065979004, "rewards/margins": 14.636087417602539, "rewards/rejected": -20.091577529907227, "step": 5730 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.21746732159101023, "train_runtime": 62085.3048, "train_samples_per_second": 2.954, "train_steps_per_second": 0.092 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }