{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 50.34250255302009, "learning_rate": 5.208333333333333e-09, "logits/chosen": -1.8382859230041504, "logits/rejected": -1.788834810256958, "logps/chosen": -119.0692138671875, "logps/rejected": -76.35714721679688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 46.63620247813633, "learning_rate": 5.208333333333333e-08, "logits/chosen": -1.6616814136505127, "logits/rejected": -1.5508421659469604, "logps/chosen": -129.22186279296875, "logps/rejected": -82.81047821044922, "loss": 0.6931, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005498434184119105, "rewards/margins": 0.0009226154070347548, "rewards/rejected": -0.00037277190131135285, "step": 10 }, { "epoch": 0.02, "grad_norm": 48.137796577108965, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.6961771249771118, "logits/rejected": -1.5263831615447998, "logps/chosen": -140.1017608642578, "logps/rejected": -81.02197265625, "loss": 0.6866, "rewards/accuracies": 0.78125, "rewards/chosen": 0.013111919164657593, "rewards/margins": 0.01593630015850067, "rewards/rejected": -0.0028243791311979294, "step": 20 }, { "epoch": 0.03, "grad_norm": 37.06492370153689, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -1.7120641469955444, "logits/rejected": -1.6340694427490234, "logps/chosen": -119.171875, "logps/rejected": -84.84639739990234, "loss": 0.6569, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05298091098666191, "rewards/margins": 0.07273116707801819, "rewards/rejected": -0.019750254228711128, "step": 30 }, { "epoch": 0.04, "grad_norm": 38.277547537833144, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.8135931491851807, "logits/rejected": -1.7179806232452393, "logps/chosen": -130.3672332763672, "logps/rejected": -98.59812927246094, "loss": 0.5913, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.1295873373746872, "rewards/margins": 0.2865225374698639, "rewards/rejected": -0.1569352000951767, "step": 40 }, { "epoch": 0.05, "grad_norm": 25.0126190198897, "learning_rate": 2.604166666666667e-07, "logits/chosen": -1.67790949344635, "logits/rejected": -1.6322219371795654, "logps/chosen": -128.07374572753906, "logps/rejected": -136.76470947265625, "loss": 0.4837, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.012722499668598175, "rewards/margins": 0.6054978370666504, "rewards/rejected": -0.5927752256393433, "step": 50 }, { "epoch": 0.06, "grad_norm": 24.458659174156402, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.6139692068099976, "logits/rejected": -1.611717939376831, "logps/chosen": -173.83407592773438, "logps/rejected": -218.3336944580078, "loss": 0.3944, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.2466244399547577, "rewards/margins": 1.035771369934082, "rewards/rejected": -1.282395839691162, "step": 60 }, { "epoch": 0.07, "grad_norm": 32.37910330977062, "learning_rate": 3.645833333333333e-07, "logits/chosen": -1.4961981773376465, "logits/rejected": -1.4705344438552856, "logps/chosen": -191.92495727539062, "logps/rejected": -297.68841552734375, "loss": 0.3475, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6499562859535217, "rewards/margins": 1.588413953781128, "rewards/rejected": -2.238370418548584, "step": 70 }, { "epoch": 0.08, "grad_norm": 31.121018022045003, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.4610167741775513, "logits/rejected": -1.3438676595687866, "logps/chosen": -249.2657470703125, "logps/rejected": -405.2835693359375, "loss": 0.2961, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1774078607559204, "rewards/margins": 2.065943717956543, "rewards/rejected": -3.243351697921753, "step": 80 }, { "epoch": 0.09, "grad_norm": 34.84421918271163, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.295830488204956, "logits/rejected": -1.2198989391326904, "logps/chosen": -235.9141082763672, "logps/rejected": -426.62005615234375, "loss": 0.2782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0462534427642822, "rewards/margins": 2.420970916748047, "rewards/rejected": -3.467224597930908, "step": 90 }, { "epoch": 0.1, "grad_norm": 38.18824950345103, "learning_rate": 4.999732492681437e-07, "logits/chosen": -1.4374310970306396, "logits/rejected": -1.2723599672317505, "logps/chosen": -237.91232299804688, "logps/rejected": -481.5489807128906, "loss": 0.254, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.0713858604431152, "rewards/margins": 2.9585039615631104, "rewards/rejected": -4.0298895835876465, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -1.520336627960205, "eval_logits/rejected": -1.4872528314590454, "eval_logps/chosen": -814.5384521484375, "eval_logps/rejected": -908.8818359375, "eval_loss": 1.4761102199554443, "eval_rewards/accuracies": 0.58984375, "eval_rewards/chosen": -5.329049110412598, "eval_rewards/margins": 0.8821536302566528, "eval_rewards/rejected": -6.211202144622803, "eval_runtime": 97.4159, "eval_samples_per_second": 20.531, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.12, "grad_norm": 32.08336034758986, "learning_rate": 4.996723692767926e-07, "logits/chosen": -1.320703148841858, "logits/rejected": -1.2061922550201416, "logps/chosen": -245.25057983398438, "logps/rejected": -540.9496459960938, "loss": 0.2321, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.2422926425933838, "rewards/margins": 3.4426703453063965, "rewards/rejected": -4.684963226318359, "step": 110 }, { "epoch": 0.13, "grad_norm": 42.593718204224416, "learning_rate": 4.990375746213598e-07, "logits/chosen": -1.4631527662277222, "logits/rejected": -1.3092104196548462, "logps/chosen": -291.78375244140625, "logps/rejected": -656.6674194335938, "loss": 0.2292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.5867398977279663, "rewards/margins": 4.244009017944336, "rewards/rejected": -5.830749034881592, "step": 120 }, { "epoch": 0.14, "grad_norm": 34.153643966408474, "learning_rate": 4.980697142834314e-07, "logits/chosen": -1.5025428533554077, "logits/rejected": -1.3630611896514893, "logps/chosen": -250.1748046875, "logps/rejected": -576.2555541992188, "loss": 0.2284, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.1989684104919434, "rewards/margins": 3.749941349029541, "rewards/rejected": -4.948909759521484, "step": 130 }, { "epoch": 0.15, "grad_norm": 38.62955098354431, "learning_rate": 4.967700826904229e-07, "logits/chosen": -1.5546451807022095, "logits/rejected": -1.388270616531372, "logps/chosen": -287.029541015625, "logps/rejected": -771.34033203125, "loss": 0.2221, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5651198625564575, "rewards/margins": 5.370504379272461, "rewards/rejected": -6.935624599456787, "step": 140 }, { "epoch": 0.16, "grad_norm": 45.61220950393248, "learning_rate": 4.951404179843962e-07, "logits/chosen": -1.5795482397079468, "logits/rejected": -1.3580058813095093, "logps/chosen": -292.9833068847656, "logps/rejected": -779.7533569335938, "loss": 0.2009, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4337162971496582, "rewards/margins": 5.4721760749816895, "rewards/rejected": -6.905892848968506, "step": 150 }, { "epoch": 0.17, "grad_norm": 40.221682931286466, "learning_rate": 4.931828996974498e-07, "logits/chosen": -1.5406692028045654, "logits/rejected": -1.3174465894699097, "logps/chosen": -287.9991760253906, "logps/rejected": -712.6685791015625, "loss": 0.2152, "rewards/accuracies": 0.875, "rewards/chosen": -1.5834462642669678, "rewards/margins": 4.822080135345459, "rewards/rejected": -6.405526161193848, "step": 160 }, { "epoch": 0.18, "grad_norm": 52.489731314855526, "learning_rate": 4.909001458367866e-07, "logits/chosen": -1.5561904907226562, "logits/rejected": -1.3703653812408447, "logps/chosen": -361.0309753417969, "logps/rejected": -811.6133422851562, "loss": 0.1911, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.442492961883545, "rewards/margins": 4.990392208099365, "rewards/rejected": -7.432885646820068, "step": 170 }, { "epoch": 0.19, "grad_norm": 44.88732370136967, "learning_rate": 4.882952093833627e-07, "logits/chosen": -1.5479228496551514, "logits/rejected": -1.402629017829895, "logps/chosen": -337.51568603515625, "logps/rejected": -855.9022216796875, "loss": 0.2064, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.006166458129883, "rewards/margins": 5.707094192504883, "rewards/rejected": -7.713259696960449, "step": 180 }, { "epoch": 0.2, "grad_norm": 39.308543148166194, "learning_rate": 4.853715742087946e-07, "logits/chosen": -1.4547832012176514, "logits/rejected": -1.2370269298553467, "logps/chosen": -310.5637512207031, "logps/rejected": -798.4898071289062, "loss": 0.1865, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8440923690795898, "rewards/margins": 5.3871355056762695, "rewards/rejected": -7.231228828430176, "step": 190 }, { "epoch": 0.21, "grad_norm": 37.85441526280708, "learning_rate": 4.821331504159906e-07, "logits/chosen": -1.3665611743927002, "logits/rejected": -1.1807641983032227, "logps/chosen": -266.3150939941406, "logps/rejected": -686.4867553710938, "loss": 0.1844, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5169986486434937, "rewards/margins": 4.59818172454834, "rewards/rejected": -6.115180015563965, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -1.4664057493209839, "eval_logits/rejected": -1.4103434085845947, "eval_logps/chosen": -897.182373046875, "eval_logps/rejected": -1032.5726318359375, "eval_loss": 1.7253305912017822, "eval_rewards/accuracies": 0.61328125, "eval_rewards/chosen": -6.15548849105835, "eval_rewards/margins": 1.2926223278045654, "eval_rewards/rejected": -7.448111534118652, "eval_runtime": 97.4439, "eval_samples_per_second": 20.525, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.22, "grad_norm": 35.05458176508214, "learning_rate": 4.785842691097342e-07, "logits/chosen": -1.35039484500885, "logits/rejected": -1.0796902179718018, "logps/chosen": -383.1498107910156, "logps/rejected": -1000.6253662109375, "loss": 0.1702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5290865898132324, "rewards/margins": 6.735090732574463, "rewards/rejected": -9.264177322387695, "step": 210 }, { "epoch": 0.23, "grad_norm": 48.27429845185424, "learning_rate": 4.7472967660421603e-07, "logits/chosen": -1.0133236646652222, "logits/rejected": -0.6695674657821655, "logps/chosen": -290.65142822265625, "logps/rejected": -783.7594604492188, "loss": 0.1754, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5812921524047852, "rewards/margins": 5.510292053222656, "rewards/rejected": -7.0915846824646, "step": 220 }, { "epoch": 0.24, "grad_norm": 41.00379708475208, "learning_rate": 4.705745280752585e-07, "logits/chosen": -0.7725291848182678, "logits/rejected": -0.4129869341850281, "logps/chosen": -351.3033752441406, "logps/rejected": -841.9267578125, "loss": 0.2017, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1709389686584473, "rewards/margins": 5.4972968101501465, "rewards/rejected": -7.668235778808594, "step": 230 }, { "epoch": 0.25, "grad_norm": 38.12279956674338, "learning_rate": 4.6612438066572555e-07, "logits/chosen": -0.4965124726295471, "logits/rejected": -0.0673198476433754, "logps/chosen": -287.70379638671875, "logps/rejected": -758.7793579101562, "loss": 0.1843, "rewards/accuracies": 0.875, "rewards/chosen": -1.6674537658691406, "rewards/margins": 5.193233966827393, "rewards/rejected": -6.860687255859375, "step": 240 }, { "epoch": 0.26, "grad_norm": 41.10699952348945, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -0.6868407726287842, "logits/rejected": -0.13767887651920319, "logps/chosen": -342.7250671386719, "logps/rejected": -987.6883544921875, "loss": 0.1737, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.031644344329834, "rewards/margins": 6.998709678649902, "rewards/rejected": -9.030354499816895, "step": 250 }, { "epoch": 0.27, "grad_norm": 36.89441241984481, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -0.46804434061050415, "logits/rejected": 0.16666679084300995, "logps/chosen": -385.1826171875, "logps/rejected": -1048.572021484375, "loss": 0.1683, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4862778186798096, "rewards/margins": 7.211634635925293, "rewards/rejected": -9.697912216186523, "step": 260 }, { "epoch": 0.28, "grad_norm": 33.10469086445788, "learning_rate": 4.510653863290871e-07, "logits/chosen": -0.6870437860488892, "logits/rejected": -0.09238968789577484, "logps/chosen": -350.16400146484375, "logps/rejected": -1004.6405029296875, "loss": 0.1688, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.0535225868225098, "rewards/margins": 7.145571708679199, "rewards/rejected": -9.199094772338867, "step": 270 }, { "epoch": 0.29, "grad_norm": 36.36855269897335, "learning_rate": 4.4549858303465737e-07, "logits/chosen": -0.6977792382240295, "logits/rejected": -0.09684981405735016, "logps/chosen": -377.00201416015625, "logps/rejected": -1076.9189453125, "loss": 0.1621, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.410522222518921, "rewards/margins": 7.565877437591553, "rewards/rejected": -9.976398468017578, "step": 280 }, { "epoch": 0.3, "grad_norm": 31.447925234202767, "learning_rate": 4.396703177135261e-07, "logits/chosen": -0.9086171984672546, "logits/rejected": -0.4163902699947357, "logps/chosen": -320.157958984375, "logps/rejected": -847.1726684570312, "loss": 0.1843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8222324848175049, "rewards/margins": 5.811877250671387, "rewards/rejected": -7.6341094970703125, "step": 290 }, { "epoch": 0.31, "grad_norm": 29.489990961921738, "learning_rate": 4.335883851539693e-07, "logits/chosen": -0.8754051327705383, "logits/rejected": -0.39787793159484863, "logps/chosen": -303.1324157714844, "logps/rejected": -911.4088745117188, "loss": 0.1635, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.7049236297607422, "rewards/margins": 6.647863864898682, "rewards/rejected": -8.352787017822266, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": -0.7514631748199463, "eval_logits/rejected": -0.6256809234619141, "eval_logps/chosen": -899.3142700195312, "eval_logps/rejected": -1026.9749755859375, "eval_loss": 1.6676901578903198, "eval_rewards/accuracies": 0.59375, "eval_rewards/chosen": -6.176807403564453, "eval_rewards/margins": 1.215327262878418, "eval_rewards/rejected": -7.392133712768555, "eval_runtime": 97.2926, "eval_samples_per_second": 20.557, "eval_steps_per_second": 0.329, "step": 300 }, { "epoch": 0.32, "grad_norm": 36.71426987222159, "learning_rate": 4.272609194017105e-07, "logits/chosen": -0.7960424423217773, "logits/rejected": -0.22463683784008026, "logps/chosen": -326.64764404296875, "logps/rejected": -895.4781494140625, "loss": 0.1774, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8289636373519897, "rewards/margins": 6.345601558685303, "rewards/rejected": -8.174566268920898, "step": 310 }, { "epoch": 0.33, "grad_norm": 29.01401480971697, "learning_rate": 4.2069638288135547e-07, "logits/chosen": -0.2821110785007477, "logits/rejected": 0.2019030600786209, "logps/chosen": -327.86187744140625, "logps/rejected": -967.6357421875, "loss": 0.1667, "rewards/accuracies": 0.90625, "rewards/chosen": -2.0269737243652344, "rewards/margins": 6.851207733154297, "rewards/rejected": -8.878181457519531, "step": 320 }, { "epoch": 0.35, "grad_norm": 27.12838321931205, "learning_rate": 4.139035550786494e-07, "logits/chosen": -0.5025689005851746, "logits/rejected": 0.3045334815979004, "logps/chosen": -339.73321533203125, "logps/rejected": -999.3941650390625, "loss": 0.1479, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.077441453933716, "rewards/margins": 7.147647857666016, "rewards/rejected": -9.225088119506836, "step": 330 }, { "epoch": 0.36, "grad_norm": 38.66513102667531, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -0.48522940278053284, "logits/rejected": 0.2592470049858093, "logps/chosen": -318.987060546875, "logps/rejected": -1009.4090576171875, "loss": 0.1602, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.873308539390564, "rewards/margins": 7.443674564361572, "rewards/rejected": -9.316983222961426, "step": 340 }, { "epoch": 0.37, "grad_norm": 33.031074442796964, "learning_rate": 3.99669658015821e-07, "logits/chosen": -0.2326478660106659, "logits/rejected": 0.3859787583351135, "logps/chosen": -323.79840087890625, "logps/rejected": -1037.3876953125, "loss": 0.1571, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.968605637550354, "rewards/margins": 7.629488468170166, "rewards/rejected": -9.59809398651123, "step": 350 }, { "epoch": 0.38, "grad_norm": 22.066600953398503, "learning_rate": 3.92247625331392e-07, "logits/chosen": -0.21493081748485565, "logits/rejected": 0.5706053972244263, "logps/chosen": -320.01806640625, "logps/rejected": -916.8917846679688, "loss": 0.1613, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.859118103981018, "rewards/margins": 6.541051387786865, "rewards/rejected": -8.40017032623291, "step": 360 }, { "epoch": 0.39, "grad_norm": 22.111157147625544, "learning_rate": 3.846353490562664e-07, "logits/chosen": -0.13797323405742645, "logits/rejected": 0.591605544090271, "logps/chosen": -353.294921875, "logps/rejected": -1085.5618896484375, "loss": 0.1497, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.195202112197876, "rewards/margins": 7.859287261962891, "rewards/rejected": -10.054490089416504, "step": 370 }, { "epoch": 0.4, "grad_norm": 28.957108905206724, "learning_rate": 3.768430099352445e-07, "logits/chosen": -0.25790831446647644, "logits/rejected": 0.3344423174858093, "logps/chosen": -380.2027282714844, "logps/rejected": -1103.0699462890625, "loss": 0.1628, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.5323684215545654, "rewards/margins": 7.7269287109375, "rewards/rejected": -10.259297370910645, "step": 380 }, { "epoch": 0.41, "grad_norm": 28.445219714861263, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -0.47374868392944336, "logits/rejected": 0.09608779847621918, "logps/chosen": -365.1648254394531, "logps/rejected": -1052.339111328125, "loss": 0.1582, "rewards/accuracies": 0.875, "rewards/chosen": -2.427854061126709, "rewards/margins": 7.350833892822266, "rewards/rejected": -9.778688430786133, "step": 390 }, { "epoch": 0.42, "grad_norm": 26.329510138760806, "learning_rate": 3.607600562872785e-07, "logits/chosen": -0.4803605079650879, "logits/rejected": -0.029133472591638565, "logps/chosen": -345.6582336425781, "logps/rejected": -932.9094848632812, "loss": 0.1606, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.222365379333496, "rewards/margins": 6.369531154632568, "rewards/rejected": -8.591897010803223, "step": 400 }, { "epoch": 0.42, "eval_logits/chosen": -0.6142900586128235, "eval_logits/rejected": -0.4797673225402832, "eval_logps/chosen": -989.371826171875, "eval_logps/rejected": -1133.77001953125, "eval_loss": 2.030726194381714, "eval_rewards/accuracies": 0.6015625, "eval_rewards/chosen": -7.077382564544678, "eval_rewards/margins": 1.3827017545700073, "eval_rewards/rejected": -8.460084915161133, "eval_runtime": 97.3894, "eval_samples_per_second": 20.536, "eval_steps_per_second": 0.329, "step": 400 }, { "epoch": 0.43, "grad_norm": 30.32229808234622, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -0.6582576036453247, "logits/rejected": -0.05353846028447151, "logps/chosen": -382.0814514160156, "logps/rejected": -1017.8347778320312, "loss": 0.1701, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4453189373016357, "rewards/margins": 6.919729709625244, "rewards/rejected": -9.365047454833984, "step": 410 }, { "epoch": 0.44, "grad_norm": 26.65710867508508, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -0.6557037830352783, "logits/rejected": -0.18612375855445862, "logps/chosen": -369.58209228515625, "logps/rejected": -1136.3447265625, "loss": 0.154, "rewards/accuracies": 0.90625, "rewards/chosen": -2.332486867904663, "rewards/margins": 8.186752319335938, "rewards/rejected": -10.51923942565918, "step": 420 }, { "epoch": 0.45, "grad_norm": 26.24483385496738, "learning_rate": 3.3555276610977276e-07, "logits/chosen": -0.7492531538009644, "logits/rejected": -0.0845475047826767, "logps/chosen": -396.35797119140625, "logps/rejected": -1169.997802734375, "loss": 0.1583, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.655913829803467, "rewards/margins": 8.302931785583496, "rewards/rejected": -10.958845138549805, "step": 430 }, { "epoch": 0.46, "grad_norm": 30.73000357585953, "learning_rate": 3.269063392575352e-07, "logits/chosen": -0.9397071599960327, "logits/rejected": -0.30021554231643677, "logps/chosen": -316.59466552734375, "logps/rejected": -935.5177001953125, "loss": 0.1863, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8889516592025757, "rewards/margins": 6.711340427398682, "rewards/rejected": -8.600292205810547, "step": 440 }, { "epoch": 0.47, "grad_norm": 37.85434716946158, "learning_rate": 3.1815705699316964e-07, "logits/chosen": -0.8938137292861938, "logits/rejected": -0.3564215898513794, "logps/chosen": -316.93023681640625, "logps/rejected": -954.3201904296875, "loss": 0.1637, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.857377290725708, "rewards/margins": 6.8977460861206055, "rewards/rejected": -8.75512409210205, "step": 450 }, { "epoch": 0.48, "grad_norm": 47.07671687902545, "learning_rate": 3.0931662070620794e-07, "logits/chosen": -0.8714531660079956, "logits/rejected": -0.3571350872516632, "logps/chosen": -362.056396484375, "logps/rejected": -1101.175048828125, "loss": 0.166, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2315657138824463, "rewards/margins": 7.899203300476074, "rewards/rejected": -10.130769729614258, "step": 460 }, { "epoch": 0.49, "grad_norm": 41.83209910846531, "learning_rate": 3.003968536966078e-07, "logits/chosen": -0.987261176109314, "logits/rejected": -0.44738197326660156, "logps/chosen": -340.4277648925781, "logps/rejected": -1015.7706909179688, "loss": 0.1612, "rewards/accuracies": 0.90625, "rewards/chosen": -2.083724021911621, "rewards/margins": 7.344973087310791, "rewards/rejected": -9.428696632385254, "step": 470 }, { "epoch": 0.5, "grad_norm": 70.09766166369198, "learning_rate": 2.9140968536213693e-07, "logits/chosen": -0.9282326698303223, "logits/rejected": -0.4145265519618988, "logps/chosen": -369.92498779296875, "logps/rejected": -1071.494384765625, "loss": 0.1731, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.441403865814209, "rewards/margins": 7.5186967849731445, "rewards/rejected": -9.960100173950195, "step": 480 }, { "epoch": 0.51, "grad_norm": 31.072373563077146, "learning_rate": 2.823671352438608e-07, "logits/chosen": -0.7097938656806946, "logits/rejected": -0.2332497388124466, "logps/chosen": -376.07647705078125, "logps/rejected": -1070.494873046875, "loss": 0.1513, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.5956947803497314, "rewards/margins": 7.397839546203613, "rewards/rejected": -9.993532180786133, "step": 490 }, { "epoch": 0.52, "grad_norm": 39.48800882374922, "learning_rate": 2.73281296951072e-07, "logits/chosen": -1.0332391262054443, "logits/rejected": -0.47291478514671326, "logps/chosen": -354.30419921875, "logps/rejected": -1030.0706787109375, "loss": 0.163, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.254275321960449, "rewards/margins": 7.255748748779297, "rewards/rejected": -9.51002311706543, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": -0.9380449652671814, "eval_logits/rejected": -0.8136107921600342, "eval_logps/chosen": -936.579345703125, "eval_logps/rejected": -1091.4378662109375, "eval_loss": 1.8216179609298706, "eval_rewards/accuracies": 0.58984375, "eval_rewards/chosen": -6.549457550048828, "eval_rewards/margins": 1.487306833267212, "eval_rewards/rejected": -8.036764144897461, "eval_runtime": 97.3225, "eval_samples_per_second": 20.55, "eval_steps_per_second": 0.329, "step": 500 }, { "epoch": 0.53, "grad_norm": 36.00441662885456, "learning_rate": 2.641643219871597e-07, "logits/chosen": -1.147862195968628, "logits/rejected": -0.4513426721096039, "logps/chosen": -346.061767578125, "logps/rejected": -974.8058471679688, "loss": 0.1448, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.9859822988510132, "rewards/margins": 6.95660924911499, "rewards/rejected": -8.942591667175293, "step": 510 }, { "epoch": 0.54, "grad_norm": 32.51878519513302, "learning_rate": 2.550284034980507e-07, "logits/chosen": -1.13883638381958, "logits/rejected": -0.4823761582374573, "logps/chosen": -372.8052062988281, "logps/rejected": -1022.6721801757812, "loss": 0.1752, "rewards/accuracies": 0.875, "rewards/chosen": -2.303035020828247, "rewards/margins": 7.056814670562744, "rewards/rejected": -9.35984992980957, "step": 520 }, { "epoch": 0.55, "grad_norm": 22.31604773564865, "learning_rate": 2.4588575996495794e-07, "logits/chosen": -1.2034275531768799, "logits/rejected": -0.5457441210746765, "logps/chosen": -309.73455810546875, "logps/rejected": -966.7014770507812, "loss": 0.1395, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.8016897439956665, "rewards/margins": 7.100803375244141, "rewards/rejected": -8.902493476867676, "step": 530 }, { "epoch": 0.57, "grad_norm": 25.730552994224812, "learning_rate": 2.367486188632446e-07, "logits/chosen": -0.8954168558120728, "logits/rejected": -0.3190861642360687, "logps/chosen": -355.24444580078125, "logps/rejected": -993.5911865234375, "loss": 0.1404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2296109199523926, "rewards/margins": 6.92122745513916, "rewards/rejected": -9.150837898254395, "step": 540 }, { "epoch": 0.58, "grad_norm": 31.70801335762852, "learning_rate": 2.276292003092593e-07, "logits/chosen": -0.8856201171875, "logits/rejected": -0.2036731243133545, "logps/chosen": -384.66900634765625, "logps/rejected": -1095.9716796875, "loss": 0.1507, "rewards/accuracies": 0.90625, "rewards/chosen": -2.6011099815368652, "rewards/margins": 7.6391730308532715, "rewards/rejected": -10.24028205871582, "step": 550 }, { "epoch": 0.59, "grad_norm": 33.368612321336464, "learning_rate": 2.185397007170141e-07, "logits/chosen": -0.6942281126976013, "logits/rejected": -0.06129706650972366, "logps/chosen": -311.1067810058594, "logps/rejected": -868.57275390625, "loss": 0.1601, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.0446078777313232, "rewards/margins": 6.013201713562012, "rewards/rejected": -8.057809829711914, "step": 560 }, { "epoch": 0.6, "grad_norm": 37.62845344818987, "learning_rate": 2.094922764865619e-07, "logits/chosen": -0.7505870461463928, "logits/rejected": -0.11658618599176407, "logps/chosen": -386.23590087890625, "logps/rejected": -1095.0679931640625, "loss": 0.1543, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.496262311935425, "rewards/margins": 7.5716872215271, "rewards/rejected": -10.067950248718262, "step": 570 }, { "epoch": 0.61, "grad_norm": 26.026316709252203, "learning_rate": 2.0049902774588797e-07, "logits/chosen": -0.7590332627296448, "logits/rejected": -0.031172871589660645, "logps/chosen": -312.64642333984375, "logps/rejected": -921.24560546875, "loss": 0.1337, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7880489826202393, "rewards/margins": 6.684856414794922, "rewards/rejected": -8.472906112670898, "step": 580 }, { "epoch": 0.62, "grad_norm": 25.651974922486566, "learning_rate": 1.9157198216806238e-07, "logits/chosen": -0.49569645524024963, "logits/rejected": 0.2907310724258423, "logps/chosen": -359.0349426269531, "logps/rejected": -1026.7982177734375, "loss": 0.1546, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.2977614402770996, "rewards/margins": 7.176451206207275, "rewards/rejected": -9.474211692810059, "step": 590 }, { "epoch": 0.63, "grad_norm": 29.13172550353607, "learning_rate": 1.8272307888529274e-07, "logits/chosen": -0.5350311398506165, "logits/rejected": 0.32648250460624695, "logps/chosen": -355.0140380859375, "logps/rejected": -1063.1131591796875, "loss": 0.1656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.212515354156494, "rewards/margins": 7.633421897888184, "rewards/rejected": -9.845937728881836, "step": 600 }, { "epoch": 0.63, "eval_logits/chosen": -0.5360411405563354, "eval_logits/rejected": -0.37004101276397705, "eval_logps/chosen": -934.728515625, "eval_logps/rejected": -1083.6920166015625, "eval_loss": 1.8090689182281494, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -6.530948162078857, "eval_rewards/margins": 1.4283562898635864, "eval_rewards/rejected": -7.9593048095703125, "eval_runtime": 97.4659, "eval_samples_per_second": 20.52, "eval_steps_per_second": 0.328, "step": 600 }, { "epoch": 0.64, "grad_norm": 38.21490518689014, "learning_rate": 1.7396415252139288e-07, "logits/chosen": -0.7486557960510254, "logits/rejected": -0.10511207580566406, "logps/chosen": -321.5223693847656, "logps/rejected": -989.8853759765625, "loss": 0.1515, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8860008716583252, "rewards/margins": 7.177679538726807, "rewards/rejected": -9.063680648803711, "step": 610 }, { "epoch": 0.65, "grad_norm": 37.606116637861646, "learning_rate": 1.6530691736402316e-07, "logits/chosen": -0.5896056294441223, "logits/rejected": 0.24608202278614044, "logps/chosen": -332.99127197265625, "logps/rejected": -969.2781982421875, "loss": 0.1476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0645222663879395, "rewards/margins": 6.90951681137085, "rewards/rejected": -8.974038124084473, "step": 620 }, { "epoch": 0.66, "grad_norm": 29.30177416360563, "learning_rate": 1.5676295169786864e-07, "logits/chosen": -0.5280860662460327, "logits/rejected": 0.11714713275432587, "logps/chosen": -344.58740234375, "logps/rejected": -951.2314453125, "loss": 0.1637, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.1917474269866943, "rewards/margins": 6.532890319824219, "rewards/rejected": -8.724637985229492, "step": 630 }, { "epoch": 0.67, "grad_norm": 20.00745545125351, "learning_rate": 1.483436823197092e-07, "logits/chosen": -0.7493909597396851, "logits/rejected": 0.14372903108596802, "logps/chosen": -323.84112548828125, "logps/rejected": -1045.17333984375, "loss": 0.1427, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9064871072769165, "rewards/margins": 7.77541446685791, "rewards/rejected": -9.681901931762695, "step": 640 }, { "epoch": 0.68, "grad_norm": 28.980496182255987, "learning_rate": 1.4006036925609243e-07, "logits/chosen": -0.6482175588607788, "logits/rejected": 0.0625019520521164, "logps/chosen": -341.03228759765625, "logps/rejected": -1090.4324951171875, "loss": 0.1431, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.016282320022583, "rewards/margins": 8.004495620727539, "rewards/rejected": -10.020777702331543, "step": 650 }, { "epoch": 0.69, "grad_norm": 28.837458672914796, "learning_rate": 1.319240907040458e-07, "logits/chosen": -0.7208763360977173, "logits/rejected": 0.032781489193439484, "logps/chosen": -354.48773193359375, "logps/rejected": -1058.132080078125, "loss": 0.135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1903700828552246, "rewards/margins": 7.6033148765563965, "rewards/rejected": -9.793684005737305, "step": 660 }, { "epoch": 0.7, "grad_norm": 32.56630053751295, "learning_rate": 1.239457282149695e-07, "logits/chosen": -0.6535686254501343, "logits/rejected": 0.021148119121789932, "logps/chosen": -366.6623229980469, "logps/rejected": -1171.2142333984375, "loss": 0.1405, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4802515506744385, "rewards/margins": 8.500134468078613, "rewards/rejected": -10.980386734008789, "step": 670 }, { "epoch": 0.71, "grad_norm": 36.66974740741856, "learning_rate": 1.1613595214152711e-07, "logits/chosen": -0.7827145457267761, "logits/rejected": 0.08258621394634247, "logps/chosen": -360.87493896484375, "logps/rejected": -1151.84521484375, "loss": 0.153, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3406710624694824, "rewards/margins": 8.442497253417969, "rewards/rejected": -10.783166885375977, "step": 680 }, { "epoch": 0.72, "grad_norm": 32.97786121900684, "learning_rate": 1.0850520736699362e-07, "logits/chosen": -0.7693961262702942, "logits/rejected": 0.005597646348178387, "logps/chosen": -345.9762268066406, "logps/rejected": -1089.049072265625, "loss": 0.1393, "rewards/accuracies": 0.90625, "rewards/chosen": -2.2439839839935303, "rewards/margins": 7.891903877258301, "rewards/rejected": -10.135889053344727, "step": 690 }, { "epoch": 0.73, "grad_norm": 45.53691199690897, "learning_rate": 1.0106369933615042e-07, "logits/chosen": -0.47497862577438354, "logits/rejected": 0.2753020226955414, "logps/chosen": -350.84698486328125, "logps/rejected": -1033.717041015625, "loss": 0.1552, "rewards/accuracies": 0.875, "rewards/chosen": -2.2757205963134766, "rewards/margins": 7.295037269592285, "rewards/rejected": -9.570757865905762, "step": 700 }, { "epoch": 0.73, "eval_logits/chosen": -0.5351493954658508, "eval_logits/rejected": -0.358839213848114, "eval_logps/chosen": -1044.81787109375, "eval_logps/rejected": -1206.419677734375, "eval_loss": 2.0767157077789307, "eval_rewards/accuracies": 0.59765625, "eval_rewards/chosen": -7.631843090057373, "eval_rewards/margins": 1.5547385215759277, "eval_rewards/rejected": -9.186580657958984, "eval_runtime": 97.3655, "eval_samples_per_second": 20.541, "eval_steps_per_second": 0.329, "step": 700 }, { "epoch": 0.74, "grad_norm": 20.912820446414077, "learning_rate": 9.382138040640714e-08, "logits/chosen": -0.7432624101638794, "logits/rejected": 0.19342878460884094, "logps/chosen": -351.33599853515625, "logps/rejected": -1151.90625, "loss": 0.134, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2001760005950928, "rewards/margins": 8.568601608276367, "rewards/rejected": -10.768776893615723, "step": 710 }, { "epoch": 0.75, "grad_norm": 43.893306649250164, "learning_rate": 8.678793653740632e-08, "logits/chosen": -0.6166023015975952, "logits/rejected": 0.08570433408021927, "logps/chosen": -365.81402587890625, "logps/rejected": -1052.766357421875, "loss": 0.1483, "rewards/accuracies": 0.875, "rewards/chosen": -2.3786137104034424, "rewards/margins": 7.3965888023376465, "rewards/rejected": -9.775201797485352, "step": 720 }, { "epoch": 0.76, "grad_norm": 27.044416498643887, "learning_rate": 7.997277433690983e-08, "logits/chosen": -0.8449182510375977, "logits/rejected": -0.04736803472042084, "logps/chosen": -356.64935302734375, "logps/rejected": -1065.8873291015625, "loss": 0.1552, "rewards/accuracies": 0.90625, "rewards/chosen": -2.2181100845336914, "rewards/margins": 7.574668884277344, "rewards/rejected": -9.792778968811035, "step": 730 }, { "epoch": 0.77, "grad_norm": 44.06234339270125, "learning_rate": 7.338500848029602e-08, "logits/chosen": -0.7003141641616821, "logits/rejected": 0.08363965153694153, "logps/chosen": -341.93157958984375, "logps/rejected": -1166.3671875, "loss": 0.1518, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0371365547180176, "rewards/margins": 8.86846923828125, "rewards/rejected": -10.905606269836426, "step": 740 }, { "epoch": 0.78, "grad_norm": 43.35684912820975, "learning_rate": 6.70334495204884e-08, "logits/chosen": -0.568864643573761, "logits/rejected": 0.3128158748149872, "logps/chosen": -353.63616943359375, "logps/rejected": -1124.0401611328125, "loss": 0.1381, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2875678539276123, "rewards/margins": 8.232528686523438, "rewards/rejected": -10.520097732543945, "step": 750 }, { "epoch": 0.8, "grad_norm": 26.263047158523968, "learning_rate": 6.092659210462231e-08, "logits/chosen": -0.7811049222946167, "logits/rejected": 0.1209399476647377, "logps/chosen": -373.8915710449219, "logps/rejected": -1145.06396484375, "loss": 0.1361, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.237269878387451, "rewards/margins": 8.342939376831055, "rewards/rejected": -10.580209732055664, "step": 760 }, { "epoch": 0.81, "grad_norm": 33.38453699325153, "learning_rate": 5.507260361320737e-08, "logits/chosen": -0.6777874231338501, "logits/rejected": 0.20796041190624237, "logps/chosen": -355.58575439453125, "logps/rejected": -1019.7391357421875, "loss": 0.1459, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.242889642715454, "rewards/margins": 7.201622009277344, "rewards/rejected": -9.444511413574219, "step": 770 }, { "epoch": 0.82, "grad_norm": 27.52737122861175, "learning_rate": 4.947931323697982e-08, "logits/chosen": -0.7127518653869629, "logits/rejected": 0.13608665764331818, "logps/chosen": -345.1900939941406, "logps/rejected": -1019.2976684570312, "loss": 0.1525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1412816047668457, "rewards/margins": 7.251255989074707, "rewards/rejected": -9.392538070678711, "step": 780 }, { "epoch": 0.83, "grad_norm": 28.47492588939245, "learning_rate": 4.415420150605398e-08, "logits/chosen": -0.7804344892501831, "logits/rejected": -0.030652623623609543, "logps/chosen": -339.2191162109375, "logps/rejected": -1011.9390869140625, "loss": 0.1619, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1389918327331543, "rewards/margins": 7.208333492279053, "rewards/rejected": -9.347326278686523, "step": 790 }, { "epoch": 0.84, "grad_norm": 21.149940049259552, "learning_rate": 3.9104390285376374e-08, "logits/chosen": -0.6159350872039795, "logits/rejected": 0.16070613265037537, "logps/chosen": -341.08917236328125, "logps/rejected": -1105.7109375, "loss": 0.1377, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1167824268341064, "rewards/margins": 8.197529792785645, "rewards/rejected": -10.314311027526855, "step": 800 }, { "epoch": 0.84, "eval_logits/chosen": -0.588712751865387, "eval_logits/rejected": -0.41556617617607117, "eval_logps/chosen": -1010.3356323242188, "eval_logps/rejected": -1168.1900634765625, "eval_loss": 2.030700206756592, "eval_rewards/accuracies": 0.60546875, "eval_rewards/chosen": -7.287020683288574, "eval_rewards/margins": 1.5172640085220337, "eval_rewards/rejected": -8.804285049438477, "eval_runtime": 97.2124, "eval_samples_per_second": 20.573, "eval_steps_per_second": 0.329, "step": 800 }, { "epoch": 0.85, "grad_norm": 24.57939572568816, "learning_rate": 3.433663324986208e-08, "logits/chosen": -0.662031352519989, "logits/rejected": 0.13034440577030182, "logps/chosen": -334.8114318847656, "logps/rejected": -1060.559814453125, "loss": 0.158, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.030988931655884, "rewards/margins": 7.7772111892700195, "rewards/rejected": -9.808199882507324, "step": 810 }, { "epoch": 0.86, "grad_norm": 44.23966241882351, "learning_rate": 2.9857306851953897e-08, "logits/chosen": -0.6313251256942749, "logits/rejected": 0.16264604032039642, "logps/chosen": -337.66143798828125, "logps/rejected": -1053.348388671875, "loss": 0.1489, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.0846145153045654, "rewards/margins": 7.6865739822387695, "rewards/rejected": -9.771188735961914, "step": 820 }, { "epoch": 0.87, "grad_norm": 45.06145430483538, "learning_rate": 2.567240179368185e-08, "logits/chosen": -0.6046188473701477, "logits/rejected": 0.2691899836063385, "logps/chosen": -344.50836181640625, "logps/rejected": -1100.190185546875, "loss": 0.1406, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.13590669631958, "rewards/margins": 8.086740493774414, "rewards/rejected": -10.222646713256836, "step": 830 }, { "epoch": 0.88, "grad_norm": 44.72639985506377, "learning_rate": 2.1787515014630357e-08, "logits/chosen": -0.9460281133651733, "logits/rejected": 0.00803391169756651, "logps/chosen": -338.85980224609375, "logps/rejected": -1107.776611328125, "loss": 0.139, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9058221578598022, "rewards/margins": 8.424005508422852, "rewards/rejected": -10.329828262329102, "step": 840 }, { "epoch": 0.89, "grad_norm": 33.16503245142525, "learning_rate": 1.820784220652766e-08, "logits/chosen": -0.46998101472854614, "logits/rejected": 0.2729955017566681, "logps/chosen": -349.9148254394531, "logps/rejected": -1000.7681884765625, "loss": 0.1575, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2772185802459717, "rewards/margins": 6.997946262359619, "rewards/rejected": -9.275164604187012, "step": 850 }, { "epoch": 0.9, "grad_norm": 34.5662684913937, "learning_rate": 1.4938170864468636e-08, "logits/chosen": -0.7951310276985168, "logits/rejected": 0.0224321149289608, "logps/chosen": -330.31744384765625, "logps/rejected": -1079.454345703125, "loss": 0.1483, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.00297212600708, "rewards/margins": 8.02238941192627, "rewards/rejected": -10.025362014770508, "step": 860 }, { "epoch": 0.91, "grad_norm": 39.90580157192396, "learning_rate": 1.1982873884064465e-08, "logits/chosen": -0.7433925867080688, "logits/rejected": 0.03819055110216141, "logps/chosen": -343.10076904296875, "logps/rejected": -1177.3214111328125, "loss": 0.1454, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.095435619354248, "rewards/margins": 8.91866683959961, "rewards/rejected": -11.0141019821167, "step": 870 }, { "epoch": 0.92, "grad_norm": 33.335658583470845, "learning_rate": 9.345903713082304e-09, "logits/chosen": -0.7445138692855835, "logits/rejected": 0.06519349664449692, "logps/chosen": -360.95751953125, "logps/rejected": -1240.344970703125, "loss": 0.1403, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.1640257835388184, "rewards/margins": 9.403306007385254, "rewards/rejected": -11.567331314086914, "step": 880 }, { "epoch": 0.93, "grad_norm": 34.256779785154734, "learning_rate": 7.030787065396865e-09, "logits/chosen": -0.5979295969009399, "logits/rejected": 0.12500345706939697, "logps/chosen": -324.71563720703125, "logps/rejected": -1090.809814453125, "loss": 0.1529, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9876606464385986, "rewards/margins": 8.175052642822266, "rewards/rejected": -10.162714004516602, "step": 890 }, { "epoch": 0.94, "grad_norm": 21.350853836162187, "learning_rate": 5.04062020432286e-09, "logits/chosen": -0.6045975685119629, "logits/rejected": 0.057004231959581375, "logps/chosen": -314.7482604980469, "logps/rejected": -1017.1994018554688, "loss": 0.1462, "rewards/accuracies": 0.90625, "rewards/chosen": -1.934511423110962, "rewards/margins": 7.4669060707092285, "rewards/rejected": -9.401416778564453, "step": 900 }, { "epoch": 0.94, "eval_logits/chosen": -0.5892084240913391, "eval_logits/rejected": -0.4189561903476715, "eval_logps/chosen": -1001.0963134765625, "eval_logps/rejected": -1160.1461181640625, "eval_loss": 2.023190498352051, "eval_rewards/accuracies": 0.61328125, "eval_rewards/chosen": -7.1946282386779785, "eval_rewards/margins": 1.5292174816131592, "eval_rewards/rejected": -8.723845481872559, "eval_runtime": 97.2614, "eval_samples_per_second": 20.563, "eval_steps_per_second": 0.329, "step": 900 }, { "epoch": 0.95, "grad_norm": 23.95340887933029, "learning_rate": 3.3780648016376866e-09, "logits/chosen": -0.6600544452667236, "logits/rejected": 0.006202346179634333, "logps/chosen": -325.9316101074219, "logps/rejected": -1070.564208984375, "loss": 0.1294, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.046983480453491, "rewards/margins": 7.899454593658447, "rewards/rejected": -9.94643783569336, "step": 910 }, { "epoch": 0.96, "grad_norm": 27.425291781296558, "learning_rate": 2.0453443778310766e-09, "logits/chosen": -0.7382737994194031, "logits/rejected": -0.02898242510855198, "logps/chosen": -355.5832824707031, "logps/rejected": -1111.990234375, "loss": 0.159, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.140547275543213, "rewards/margins": 8.157334327697754, "rewards/rejected": -10.297882080078125, "step": 920 }, { "epoch": 0.97, "grad_norm": 34.03050925733607, "learning_rate": 1.0442413283435758e-09, "logits/chosen": -0.6262997388839722, "logits/rejected": 0.14048056304454803, "logps/chosen": -344.0566101074219, "logps/rejected": -1069.548583984375, "loss": 0.1613, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.1957008838653564, "rewards/margins": 7.760766506195068, "rewards/rejected": -9.956467628479004, "step": 930 }, { "epoch": 0.98, "grad_norm": 34.773688515138254, "learning_rate": 3.760945397705828e-10, "logits/chosen": -0.7389376163482666, "logits/rejected": 0.0326564684510231, "logps/chosen": -334.1388854980469, "logps/rejected": -1029.4984130859375, "loss": 0.1497, "rewards/accuracies": 0.875, "rewards/chosen": -2.1344165802001953, "rewards/margins": 7.39968204498291, "rewards/rejected": -9.534098625183105, "step": 940 }, { "epoch": 0.99, "grad_norm": 25.746566103316148, "learning_rate": 4.17975992204056e-11, "logits/chosen": -0.5864154696464539, "logits/rejected": 0.14644786715507507, "logps/chosen": -315.15032958984375, "logps/rejected": -1021.39453125, "loss": 0.1545, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.8772588968276978, "rewards/margins": 7.5670928955078125, "rewards/rejected": -9.444352149963379, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.194384015168195, "train_runtime": 15417.2553, "train_samples_per_second": 7.931, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }