{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6468305304010349, "eval_steps": 500.0, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.6667505502700806, "learning_rate": 6.666666666666668e-08, "logits/chosen": 0.6656537652015686, "logits/rejected": 0.8323326110839844, "logps/chosen": -105.4136962890625, "logps/rejected": -80.00390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.462697744369507, "learning_rate": 6.666666666666667e-07, "logits/chosen": 0.4595692455768585, "logits/rejected": 0.40892449021339417, "logps/chosen": -159.06524658203125, "logps/rejected": -111.41619110107422, "loss": 0.6921, "rewards/accuracies": 0.37037038803100586, "rewards/chosen": -0.005318281706422567, "rewards/margins": 0.002280694665387273, "rewards/rejected": -0.0075989761389791965, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.2759745121002197, "learning_rate": 1.3333333333333334e-06, "logits/chosen": 0.4103795886039734, "logits/rejected": 0.3288261592388153, "logps/chosen": -154.99789428710938, "logps/rejected": -102.23339080810547, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.011106210760772228, "rewards/margins": 0.012265065684914589, "rewards/rejected": -0.0011588542256504297, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.4769150018692017, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 0.42967623472213745, "logits/rejected": 0.3888375759124756, "logps/chosen": -145.51222229003906, "logps/rejected": -99.47395324707031, "loss": 0.6943, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": -0.0015143711352720857, "rewards/margins": -0.001963119488209486, "rewards/rejected": 0.0004487482365220785, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.340982675552368, "learning_rate": 2.666666666666667e-06, "logits/chosen": 0.31144046783447266, "logits/rejected": 0.38159480690956116, "logps/chosen": -221.299072265625, "logps/rejected": -160.48690795898438, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.00796764437109232, "rewards/margins": 0.0012136328732594848, "rewards/rejected": 0.006754010915756226, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.5240986347198486, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 0.5946449041366577, "logits/rejected": 0.44660329818725586, "logps/chosen": -190.49057006835938, "logps/rejected": -139.8750457763672, "loss": 0.6888, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.010610619559884071, "rewards/margins": 0.009138532914221287, "rewards/rejected": 0.0014720851322636008, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.5300071239471436, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.42198482155799866, "logits/rejected": 0.33618858456611633, "logps/chosen": -173.13516235351562, "logps/rejected": -121.2822036743164, "loss": 0.6925, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 0.00521390326321125, "rewards/margins": 0.0015516221756115556, "rewards/rejected": 0.0036622812040150166, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.9583663940429688, "learning_rate": 4.666666666666667e-06, "logits/chosen": 0.33364781737327576, "logits/rejected": 0.2776263952255249, "logps/chosen": -209.07211303710938, "logps/rejected": -158.00839233398438, "loss": 0.6827, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.026117617264389992, "rewards/margins": 0.021206889301538467, "rewards/rejected": 0.004910729825496674, "step": 70 }, { "epoch": 0.03, "grad_norm": 1.7102651596069336, "learning_rate": 5.333333333333334e-06, "logits/chosen": 0.5094404816627502, "logits/rejected": 0.4666718542575836, "logps/chosen": -153.90780639648438, "logps/rejected": -103.93388366699219, "loss": 0.679, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.038611821830272675, "rewards/margins": 0.028847262263298035, "rewards/rejected": 0.00976455770432949, "step": 80 }, { "epoch": 0.03, "grad_norm": 2.9190127849578857, "learning_rate": 6e-06, "logits/chosen": 0.23412127792835236, "logits/rejected": 0.16527244448661804, "logps/chosen": -219.2425079345703, "logps/rejected": -149.35684204101562, "loss": 0.659, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.07324406504631042, "rewards/margins": 0.07025764882564545, "rewards/rejected": 0.0029864185489714146, "step": 90 }, { "epoch": 0.03, "grad_norm": 2.0400824546813965, "learning_rate": 6.666666666666667e-06, "logits/chosen": 0.3398872911930084, "logits/rejected": 0.2985154092311859, "logps/chosen": -185.22146606445312, "logps/rejected": -121.82730865478516, "loss": 0.6488, "rewards/accuracies": 1.0, "rewards/chosen": 0.10361306369304657, "rewards/margins": 0.09121204167604446, "rewards/rejected": 0.012401008978486061, "step": 100 }, { "epoch": 0.04, "grad_norm": 2.192309617996216, "learning_rate": 7.333333333333333e-06, "logits/chosen": 0.4448312222957611, "logits/rejected": 0.32321152091026306, "logps/chosen": -189.84707641601562, "logps/rejected": -124.48201751708984, "loss": 0.6194, "rewards/accuracies": 1.0, "rewards/chosen": 0.19106850028038025, "rewards/margins": 0.1558096706867218, "rewards/rejected": 0.03525884076952934, "step": 110 }, { "epoch": 0.04, "grad_norm": 2.386159896850586, "learning_rate": 8.000000000000001e-06, "logits/chosen": 0.2993674874305725, "logits/rejected": 0.3050278127193451, "logps/chosen": -187.64498901367188, "logps/rejected": -127.5892105102539, "loss": 0.5682, "rewards/accuracies": 1.0, "rewards/chosen": 0.30387812852859497, "rewards/margins": 0.2730325758457184, "rewards/rejected": 0.030845556408166885, "step": 120 }, { "epoch": 0.04, "grad_norm": 2.4128735065460205, "learning_rate": 8.666666666666668e-06, "logits/chosen": 0.470858097076416, "logits/rejected": 0.3819553256034851, "logps/chosen": -147.3748321533203, "logps/rejected": -106.97740173339844, "loss": 0.5524, "rewards/accuracies": 1.0, "rewards/chosen": 0.3712579309940338, "rewards/margins": 0.31013599038124084, "rewards/rejected": 0.06112197786569595, "step": 130 }, { "epoch": 0.05, "grad_norm": 3.5698485374450684, "learning_rate": 9.333333333333334e-06, "logits/chosen": 0.027050381526350975, "logits/rejected": 0.06544498354196548, "logps/chosen": -225.6494140625, "logps/rejected": -170.77578735351562, "loss": 0.4873, "rewards/accuracies": 1.0, "rewards/chosen": 0.5885292291641235, "rewards/margins": 0.4800504744052887, "rewards/rejected": 0.10847876965999603, "step": 140 }, { "epoch": 0.05, "grad_norm": 2.3326575756073, "learning_rate": 1e-05, "logits/chosen": 0.4391583800315857, "logits/rejected": 0.4003582000732422, "logps/chosen": -177.93197631835938, "logps/rejected": -129.4811248779297, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 0.7490917444229126, "rewards/margins": 0.597412645816803, "rewards/rejected": 0.15167909860610962, "step": 150 }, { "epoch": 0.05, "grad_norm": 1.741441011428833, "learning_rate": 9.989042296734605e-06, "logits/chosen": 0.4242062568664551, "logits/rejected": 0.4251991808414459, "logps/chosen": -178.76414489746094, "logps/rejected": -133.4027557373047, "loss": 0.4098, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.8510511517524719, "rewards/margins": 0.7112440466880798, "rewards/rejected": 0.1398070752620697, "step": 160 }, { "epoch": 0.05, "grad_norm": 1.9917546510696411, "learning_rate": 9.97808459346921e-06, "logits/chosen": 0.254020094871521, "logits/rejected": 0.23922643065452576, "logps/chosen": -161.81881713867188, "logps/rejected": -125.13037109375, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 1.1077089309692383, "rewards/margins": 0.9747198224067688, "rewards/rejected": 0.13298924267292023, "step": 170 }, { "epoch": 0.06, "grad_norm": 1.3671507835388184, "learning_rate": 9.967126890203814e-06, "logits/chosen": 0.2504037320613861, "logits/rejected": 0.18875229358673096, "logps/chosen": -147.4672088623047, "logps/rejected": -104.1845474243164, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 1.3119361400604248, "rewards/margins": 1.1814303398132324, "rewards/rejected": 0.13050585985183716, "step": 180 }, { "epoch": 0.06, "grad_norm": 1.9124622344970703, "learning_rate": 9.956169186938418e-06, "logits/chosen": 0.2291211187839508, "logits/rejected": 0.24650339782238007, "logps/chosen": -147.00360107421875, "logps/rejected": -115.02952575683594, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 1.5095247030258179, "rewards/margins": 1.3837789297103882, "rewards/rejected": 0.12574569880962372, "step": 190 }, { "epoch": 0.06, "grad_norm": 1.6257745027542114, "learning_rate": 9.945211483673022e-06, "logits/chosen": 0.18027305603027344, "logits/rejected": 0.17534476518630981, "logps/chosen": -180.81214904785156, "logps/rejected": -138.56167602539062, "loss": 0.2117, "rewards/accuracies": 1.0, "rewards/chosen": 1.7416807413101196, "rewards/margins": 1.576603889465332, "rewards/rejected": 0.16507670283317566, "step": 200 }, { "epoch": 0.07, "grad_norm": 1.6058803796768188, "learning_rate": 9.934253780407628e-06, "logits/chosen": 0.03618524968624115, "logits/rejected": 0.06655623763799667, "logps/chosen": -187.56057739257812, "logps/rejected": -165.72299194335938, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": 1.6607027053833008, "rewards/margins": 1.624441385269165, "rewards/rejected": 0.03626134246587753, "step": 210 }, { "epoch": 0.07, "grad_norm": 2.9326171875, "learning_rate": 9.92329607714223e-06, "logits/chosen": 0.15423543751239777, "logits/rejected": 0.16546325385570526, "logps/chosen": -122.26210021972656, "logps/rejected": -105.48604583740234, "loss": 0.262, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.8262674808502197, "rewards/margins": 1.484720230102539, "rewards/rejected": 0.3415472209453583, "step": 220 }, { "epoch": 0.07, "grad_norm": 1.0585615634918213, "learning_rate": 9.912338373876837e-06, "logits/chosen": 0.0944317951798439, "logits/rejected": 0.0521501824259758, "logps/chosen": -183.9434051513672, "logps/rejected": -139.52586364746094, "loss": 0.1305, "rewards/accuracies": 1.0, "rewards/chosen": 2.2049338817596436, "rewards/margins": 2.331801652908325, "rewards/rejected": -0.12686775624752045, "step": 230 }, { "epoch": 0.08, "grad_norm": 1.2967203855514526, "learning_rate": 9.90138067061144e-06, "logits/chosen": 0.03958363085985184, "logits/rejected": -0.04342944175004959, "logps/chosen": -147.5401611328125, "logps/rejected": -118.40291595458984, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 2.246441602706909, "rewards/margins": 2.4990291595458984, "rewards/rejected": -0.25258734822273254, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.5034691095352173, "learning_rate": 9.890422967346045e-06, "logits/chosen": 0.03393579646945, "logits/rejected": 0.003592267632484436, "logps/chosen": -143.8810272216797, "logps/rejected": -128.96066284179688, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 2.0601272583007812, "rewards/margins": 2.3098137378692627, "rewards/rejected": -0.24968591332435608, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.8256264328956604, "learning_rate": 9.87946526408065e-06, "logits/chosen": -0.09204194694757462, "logits/rejected": -0.1390385925769806, "logps/chosen": -165.94276428222656, "logps/rejected": -140.0150909423828, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": 2.1594629287719727, "rewards/margins": 2.515359401702881, "rewards/rejected": -0.35589665174484253, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.17470860481262207, "learning_rate": 9.868507560815254e-06, "logits/chosen": -0.09904654324054718, "logits/rejected": -0.21556393802165985, "logps/chosen": -138.27383422851562, "logps/rejected": -117.7011489868164, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 2.1653850078582764, "rewards/margins": 2.631995677947998, "rewards/rejected": -0.4666108191013336, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.4645913541316986, "learning_rate": 9.857549857549858e-06, "logits/chosen": -0.014490666799247265, "logits/rejected": -0.06418715417385101, "logps/chosen": -180.47317504882812, "logps/rejected": -161.57061767578125, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 2.279376983642578, "rewards/margins": 3.045623302459717, "rewards/rejected": -0.766246497631073, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.6914668679237366, "learning_rate": 9.846592154284462e-06, "logits/chosen": -0.046889033168554306, "logits/rejected": -0.1608402580022812, "logps/chosen": -173.64353942871094, "logps/rejected": -139.35154724121094, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": 2.0064756870269775, "rewards/margins": 2.9564709663391113, "rewards/rejected": -0.9499956369400024, "step": 290 }, { "epoch": 0.1, "grad_norm": 1.6635288000106812, "learning_rate": 9.835634451019067e-06, "logits/chosen": -0.017042959108948708, "logits/rejected": -0.12096239626407623, "logps/chosen": -159.43319702148438, "logps/rejected": -139.1463623046875, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 1.9454389810562134, "rewards/margins": 2.7836952209472656, "rewards/rejected": -0.8382562398910522, "step": 300 }, { "epoch": 0.1, "grad_norm": 1.5297130346298218, "learning_rate": 9.824676747753673e-06, "logits/chosen": -0.12943556904792786, "logits/rejected": -0.2048647403717041, "logps/chosen": -149.43092346191406, "logps/rejected": -140.9419403076172, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": 1.958494782447815, "rewards/margins": 3.017915964126587, "rewards/rejected": -1.059421420097351, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.18703162670135498, "learning_rate": 9.813719044488275e-06, "logits/chosen": -0.044006917625665665, "logits/rejected": -0.15224145352840424, "logps/chosen": -153.7903594970703, "logps/rejected": -128.21835327148438, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 2.305603265762329, "rewards/margins": 3.5984387397766113, "rewards/rejected": -1.292834997177124, "step": 320 }, { "epoch": 0.11, "grad_norm": 0.7461889982223511, "learning_rate": 9.802761341222881e-06, "logits/chosen": -0.14230886101722717, "logits/rejected": -0.1584036648273468, "logps/chosen": -131.12109375, "logps/rejected": -111.15714263916016, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 1.7661842107772827, "rewards/margins": 3.3135159015655518, "rewards/rejected": -1.5473315715789795, "step": 330 }, { "epoch": 0.11, "grad_norm": 1.7300301790237427, "learning_rate": 9.791803637957486e-06, "logits/chosen": -0.19161322712898254, "logits/rejected": -0.2922630310058594, "logps/chosen": -172.96087646484375, "logps/rejected": -160.4773712158203, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 1.8651368618011475, "rewards/margins": 3.7756409645080566, "rewards/rejected": -1.9105039834976196, "step": 340 }, { "epoch": 0.11, "grad_norm": 2.3742196559906006, "learning_rate": 9.78084593469209e-06, "logits/chosen": -0.3112620711326599, "logits/rejected": -0.43214184045791626, "logps/chosen": -158.68028259277344, "logps/rejected": -136.09814453125, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 2.1260719299316406, "rewards/margins": 3.7196857929229736, "rewards/rejected": -1.5936137437820435, "step": 350 }, { "epoch": 0.12, "grad_norm": 2.4912428855895996, "learning_rate": 9.769888231426694e-06, "logits/chosen": -0.0563586950302124, "logits/rejected": -0.20379754900932312, "logps/chosen": -117.54966735839844, "logps/rejected": -115.3812255859375, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 1.9740089178085327, "rewards/margins": 3.207712173461914, "rewards/rejected": -1.23370361328125, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.3334919512271881, "learning_rate": 9.758930528161298e-06, "logits/chosen": -0.16167142987251282, "logits/rejected": -0.24221567809581757, "logps/chosen": -138.83132934570312, "logps/rejected": -141.4235076904297, "loss": 0.1185, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.8540862798690796, "rewards/margins": 3.4170851707458496, "rewards/rejected": -1.5629991292953491, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.1451215147972107, "learning_rate": 9.747972824895903e-06, "logits/chosen": -0.1488262414932251, "logits/rejected": -0.30983203649520874, "logps/chosen": -146.93450927734375, "logps/rejected": -145.71795654296875, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 1.60548996925354, "rewards/margins": 4.409090995788574, "rewards/rejected": -2.803601026535034, "step": 380 }, { "epoch": 0.13, "grad_norm": 0.11398794502019882, "learning_rate": 9.737015121630507e-06, "logits/chosen": -0.3392196297645569, "logits/rejected": -0.4596717357635498, "logps/chosen": -192.10316467285156, "logps/rejected": -179.76910400390625, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 1.8670743703842163, "rewards/margins": 4.218127250671387, "rewards/rejected": -2.351052761077881, "step": 390 }, { "epoch": 0.13, "grad_norm": 1.373371958732605, "learning_rate": 9.726057418365111e-06, "logits/chosen": -0.2288927286863327, "logits/rejected": -0.4092441499233246, "logps/chosen": -134.75244140625, "logps/rejected": -126.3078384399414, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 1.7074005603790283, "rewards/margins": 4.082337856292725, "rewards/rejected": -2.3749375343322754, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.6404663920402527, "learning_rate": 9.715099715099716e-06, "logits/chosen": -0.21756196022033691, "logits/rejected": -0.3667296767234802, "logps/chosen": -111.0381088256836, "logps/rejected": -113.7396011352539, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 1.8885473012924194, "rewards/margins": 4.248038291931152, "rewards/rejected": -2.3594906330108643, "step": 410 }, { "epoch": 0.14, "grad_norm": 0.2947131395339966, "learning_rate": 9.70414201183432e-06, "logits/chosen": -0.21332868933677673, "logits/rejected": -0.39654839038848877, "logps/chosen": -184.7637939453125, "logps/rejected": -165.0076904296875, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 1.1396678686141968, "rewards/margins": 4.9540605545043945, "rewards/rejected": -3.8143928050994873, "step": 420 }, { "epoch": 0.14, "grad_norm": 0.2587800621986389, "learning_rate": 9.693184308568924e-06, "logits/chosen": -0.15772652626037598, "logits/rejected": -0.2967797815799713, "logps/chosen": -126.998046875, "logps/rejected": -127.1830825805664, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 1.7573896646499634, "rewards/margins": 4.652812957763672, "rewards/rejected": -2.895423412322998, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.18937894701957703, "learning_rate": 9.68222660530353e-06, "logits/chosen": -0.24051830172538757, "logits/rejected": -0.3756439685821533, "logps/chosen": -132.031494140625, "logps/rejected": -143.25439453125, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 1.8294754028320312, "rewards/margins": 4.800570487976074, "rewards/rejected": -2.971095561981201, "step": 440 }, { "epoch": 0.15, "grad_norm": 0.796097457408905, "learning_rate": 9.671268902038133e-06, "logits/chosen": -0.19832219183444977, "logits/rejected": -0.351583868265152, "logps/chosen": -128.74539184570312, "logps/rejected": -126.41670227050781, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 1.6307995319366455, "rewards/margins": 5.1652398109436035, "rewards/rejected": -3.534440517425537, "step": 450 }, { "epoch": 0.15, "grad_norm": 1.0965434312820435, "learning_rate": 9.660311198772739e-06, "logits/chosen": -0.1457364708185196, "logits/rejected": -0.33718162775039673, "logps/chosen": -153.73037719726562, "logps/rejected": -153.25807189941406, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 0.8024961352348328, "rewards/margins": 4.706381797790527, "rewards/rejected": -3.903886079788208, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.4351007342338562, "learning_rate": 9.649353495507341e-06, "logits/chosen": -0.29992926120758057, "logits/rejected": -0.45157748460769653, "logps/chosen": -141.03488159179688, "logps/rejected": -147.01473999023438, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 1.1137398481369019, "rewards/margins": 4.605474948883057, "rewards/rejected": -3.491734743118286, "step": 470 }, { "epoch": 0.16, "grad_norm": 0.21928514540195465, "learning_rate": 9.638395792241947e-06, "logits/chosen": -0.2734539210796356, "logits/rejected": -0.42611390352249146, "logps/chosen": -178.15843200683594, "logps/rejected": -171.8292999267578, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 1.0601214170455933, "rewards/margins": 5.109463214874268, "rewards/rejected": -4.049341678619385, "step": 480 }, { "epoch": 0.16, "grad_norm": 0.37911301851272583, "learning_rate": 9.627438088976552e-06, "logits/chosen": -0.38659173250198364, "logits/rejected": -0.5436467528343201, "logps/chosen": -187.867431640625, "logps/rejected": -188.06741333007812, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 0.8567007780075073, "rewards/margins": 5.4622673988342285, "rewards/rejected": -4.605566501617432, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.09822113811969757, "learning_rate": 9.616480385711156e-06, "logits/chosen": -0.37623029947280884, "logits/rejected": -0.48185300827026367, "logps/chosen": -202.08529663085938, "logps/rejected": -199.71482849121094, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 0.24511973559856415, "rewards/margins": 5.403919219970703, "rewards/rejected": -5.158799171447754, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.3931962251663208, "learning_rate": 9.60552268244576e-06, "logits/chosen": -0.2853025794029236, "logits/rejected": -0.413346529006958, "logps/chosen": -124.1753921508789, "logps/rejected": -140.43246459960938, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 1.2122681140899658, "rewards/margins": 5.2854719161987305, "rewards/rejected": -4.0732035636901855, "step": 510 }, { "epoch": 0.17, "grad_norm": 1.2813736200332642, "learning_rate": 9.594564979180364e-06, "logits/chosen": -0.2814486622810364, "logits/rejected": -0.5304259657859802, "logps/chosen": -143.65679931640625, "logps/rejected": -140.1746368408203, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810171604156494, "rewards/margins": 5.255087375640869, "rewards/rejected": -4.074069976806641, "step": 520 }, { "epoch": 0.17, "grad_norm": 0.6368138790130615, "learning_rate": 9.583607275914969e-06, "logits/chosen": -0.23208048939704895, "logits/rejected": -0.41035908460617065, "logps/chosen": -152.8085174560547, "logps/rejected": -158.786376953125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 1.0337063074111938, "rewards/margins": 5.832049369812012, "rewards/rejected": -4.798343181610107, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.9199265241622925, "learning_rate": 9.572649572649575e-06, "logits/chosen": -0.3010120391845703, "logits/rejected": -0.4592529237270355, "logps/chosen": -217.2087860107422, "logps/rejected": -198.68173217773438, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -0.41687291860580444, "rewards/margins": 5.248563289642334, "rewards/rejected": -5.6654372215271, "step": 540 }, { "epoch": 0.18, "grad_norm": 0.013650404289364815, "learning_rate": 9.561691869384177e-06, "logits/chosen": -0.4347296357154846, "logits/rejected": -0.6051202416419983, "logps/chosen": -190.7089080810547, "logps/rejected": -205.96047973632812, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 0.1618773639202118, "rewards/margins": 6.865809440612793, "rewards/rejected": -6.703932762145996, "step": 550 }, { "epoch": 0.18, "grad_norm": 0.06230132654309273, "learning_rate": 9.550734166118783e-06, "logits/chosen": -0.3882770240306854, "logits/rejected": -0.6216930747032166, "logps/chosen": -175.98199462890625, "logps/rejected": -177.3306427001953, "loss": 0.0488, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.032947491854429245, "rewards/margins": 5.509705066680908, "rewards/rejected": -5.476758003234863, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.16349956393241882, "learning_rate": 9.539776462853386e-06, "logits/chosen": -0.3122726082801819, "logits/rejected": -0.4245499074459076, "logps/chosen": -228.5991668701172, "logps/rejected": -225.5084686279297, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.799796462059021, "rewards/margins": 5.888455390930176, "rewards/rejected": -6.688251495361328, "step": 570 }, { "epoch": 0.19, "grad_norm": 0.12062845379114151, "learning_rate": 9.528818759587992e-06, "logits/chosen": -0.5392414927482605, "logits/rejected": -0.6697270274162292, "logps/chosen": -170.96298217773438, "logps/rejected": -198.06301879882812, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 0.008090650662779808, "rewards/margins": 6.091658592224121, "rewards/rejected": -6.0835676193237305, "step": 580 }, { "epoch": 0.19, "grad_norm": 0.3678707778453827, "learning_rate": 9.517861056322596e-06, "logits/chosen": -0.4352661669254303, "logits/rejected": -0.5481756925582886, "logps/chosen": -174.7131805419922, "logps/rejected": -171.861572265625, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.18627242743968964, "rewards/margins": 6.163498878479004, "rewards/rejected": -5.977226257324219, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.07159756124019623, "learning_rate": 9.5069033530572e-06, "logits/chosen": -0.20721349120140076, "logits/rejected": -0.417985737323761, "logps/chosen": -187.81405639648438, "logps/rejected": -185.33331298828125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 0.07214238494634628, "rewards/margins": 5.932024002075195, "rewards/rejected": -5.85988187789917, "step": 600 }, { "epoch": 0.2, "grad_norm": 0.09394218772649765, "learning_rate": 9.495945649791805e-06, "logits/chosen": -0.24830050766468048, "logits/rejected": -0.36983978748321533, "logps/chosen": -129.41619873046875, "logps/rejected": -145.60447692871094, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 0.3323943018913269, "rewards/margins": 6.081761837005615, "rewards/rejected": -5.749367713928223, "step": 610 }, { "epoch": 0.2, "grad_norm": 0.561514675617218, "learning_rate": 9.484987946526409e-06, "logits/chosen": -0.3996688723564148, "logits/rejected": -0.5800243020057678, "logps/chosen": -202.79257202148438, "logps/rejected": -203.62704467773438, "loss": 0.0457, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -0.36100807785987854, "rewards/margins": 6.192732810974121, "rewards/rejected": -6.553740501403809, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.08309807628393173, "learning_rate": 9.474030243261013e-06, "logits/chosen": -0.3754512667655945, "logits/rejected": -0.5741219520568848, "logps/chosen": -174.9481658935547, "logps/rejected": -182.9730224609375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.4041864275932312, "rewards/margins": 6.596762180328369, "rewards/rejected": -7.000948905944824, "step": 630 }, { "epoch": 0.21, "grad_norm": 0.0921340063214302, "learning_rate": 9.463072539995617e-06, "logits/chosen": -0.30068254470825195, "logits/rejected": -0.45074382424354553, "logps/chosen": -173.031005859375, "logps/rejected": -197.98251342773438, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.10172281414270401, "rewards/margins": 6.632701873779297, "rewards/rejected": -6.734424591064453, "step": 640 }, { "epoch": 0.21, "grad_norm": 0.037897739559412, "learning_rate": 9.452114836730222e-06, "logits/chosen": -0.42777299880981445, "logits/rejected": -0.4529387354850769, "logps/chosen": -238.9905242919922, "logps/rejected": -260.50054931640625, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -1.3973451852798462, "rewards/margins": 6.604907035827637, "rewards/rejected": -8.002251625061035, "step": 650 }, { "epoch": 0.21, "grad_norm": 0.19550499320030212, "learning_rate": 9.441157133464826e-06, "logits/chosen": -0.23329667747020721, "logits/rejected": -0.44183143973350525, "logps/chosen": -139.6894073486328, "logps/rejected": -164.62615966796875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 0.24222126603126526, "rewards/margins": 6.836798667907715, "rewards/rejected": -6.594576835632324, "step": 660 }, { "epoch": 0.22, "grad_norm": 0.3139669895172119, "learning_rate": 9.430199430199432e-06, "logits/chosen": -0.4021090865135193, "logits/rejected": -0.5786430239677429, "logps/chosen": -181.46817016601562, "logps/rejected": -197.1630096435547, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -0.535909116268158, "rewards/margins": 6.705661773681641, "rewards/rejected": -7.241570949554443, "step": 670 }, { "epoch": 0.22, "grad_norm": 0.05340191349387169, "learning_rate": 9.419241726934035e-06, "logits/chosen": -0.42853325605392456, "logits/rejected": -0.6143732070922852, "logps/chosen": -168.3545379638672, "logps/rejected": -188.1068878173828, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.3123434782028198, "rewards/margins": 6.992964267730713, "rewards/rejected": -7.305306911468506, "step": 680 }, { "epoch": 0.22, "grad_norm": 2.9327523708343506, "learning_rate": 9.40828402366864e-06, "logits/chosen": -0.38912615180015564, "logits/rejected": -0.6179540753364563, "logps/chosen": -255.75888061523438, "logps/rejected": -249.5757598876953, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.6745103597640991, "rewards/margins": 7.797115325927734, "rewards/rejected": -9.471625328063965, "step": 690 }, { "epoch": 0.23, "grad_norm": 0.03456411138176918, "learning_rate": 9.397326320403243e-06, "logits/chosen": -0.5495766997337341, "logits/rejected": -0.6684257984161377, "logps/chosen": -168.55702209472656, "logps/rejected": -188.98629760742188, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.858353316783905, "rewards/margins": 6.359633445739746, "rewards/rejected": -7.217986106872559, "step": 700 }, { "epoch": 0.23, "grad_norm": 0.06367018818855286, "learning_rate": 9.386368617137849e-06, "logits/chosen": -0.4233613610267639, "logits/rejected": -0.6099605560302734, "logps/chosen": -175.0663604736328, "logps/rejected": -189.76922607421875, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -1.5615413188934326, "rewards/margins": 6.164674282073975, "rewards/rejected": -7.7262163162231445, "step": 710 }, { "epoch": 0.23, "grad_norm": 2.003807306289673, "learning_rate": 9.375410913872453e-06, "logits/chosen": -0.508701741695404, "logits/rejected": -0.6367843747138977, "logps/chosen": -201.49270629882812, "logps/rejected": -214.9327850341797, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.2017481327056885, "rewards/margins": 7.067420959472656, "rewards/rejected": -8.269168853759766, "step": 720 }, { "epoch": 0.24, "grad_norm": 0.8275896310806274, "learning_rate": 9.364453210607058e-06, "logits/chosen": -0.42019587755203247, "logits/rejected": -0.5408454537391663, "logps/chosen": -204.19387817382812, "logps/rejected": -225.56607055664062, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -2.0568044185638428, "rewards/margins": 7.597814083099365, "rewards/rejected": -9.654618263244629, "step": 730 }, { "epoch": 0.24, "grad_norm": 0.14503268897533417, "learning_rate": 9.353495507341662e-06, "logits/chosen": -0.39080002903938293, "logits/rejected": -0.5268298983573914, "logps/chosen": -184.9230194091797, "logps/rejected": -208.14492797851562, "loss": 0.0479, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -2.121675968170166, "rewards/margins": 7.246522426605225, "rewards/rejected": -9.368197441101074, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.9217634797096252, "learning_rate": 9.342537804076266e-06, "logits/chosen": -0.37332993745803833, "logits/rejected": -0.5701053142547607, "logps/chosen": -175.7810516357422, "logps/rejected": -201.13003540039062, "loss": 0.0408, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -1.581413745880127, "rewards/margins": 6.754090309143066, "rewards/rejected": -8.335504531860352, "step": 750 }, { "epoch": 0.25, "grad_norm": 0.026290835812687874, "learning_rate": 9.33158010081087e-06, "logits/chosen": -0.4100477695465088, "logits/rejected": -0.5200067162513733, "logps/chosen": -233.86337280273438, "logps/rejected": -258.96881103515625, "loss": 0.0723, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -2.9427809715270996, "rewards/margins": 6.980525016784668, "rewards/rejected": -9.923307418823242, "step": 760 }, { "epoch": 0.25, "grad_norm": 0.37062278389930725, "learning_rate": 9.320622397545477e-06, "logits/chosen": -0.3395642936229706, "logits/rejected": -0.4273925721645355, "logps/chosen": -242.663818359375, "logps/rejected": -268.10089111328125, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -4.880597114562988, "rewards/margins": 7.653998374938965, "rewards/rejected": -12.534595489501953, "step": 770 }, { "epoch": 0.25, "grad_norm": 0.0023701719474047422, "learning_rate": 9.309664694280079e-06, "logits/chosen": -0.4241916537284851, "logits/rejected": -0.5316244959831238, "logps/chosen": -182.79238891601562, "logps/rejected": -211.8367156982422, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -1.6987730264663696, "rewards/margins": 7.190474510192871, "rewards/rejected": -8.88924789428711, "step": 780 }, { "epoch": 0.26, "grad_norm": 0.008612029254436493, "learning_rate": 9.298706991014685e-06, "logits/chosen": -0.4729032516479492, "logits/rejected": -0.5640527606010437, "logps/chosen": -238.87527465820312, "logps/rejected": -276.45318603515625, "loss": 0.0358, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -4.509866237640381, "rewards/margins": 8.078734397888184, "rewards/rejected": -12.588600158691406, "step": 790 }, { "epoch": 0.26, "grad_norm": 0.01925979182124138, "learning_rate": 9.287749287749288e-06, "logits/chosen": -0.42346763610839844, "logits/rejected": -0.5876745581626892, "logps/chosen": -217.486572265625, "logps/rejected": -251.65774536132812, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.6943228244781494, "rewards/margins": 9.3502197265625, "rewards/rejected": -12.044544219970703, "step": 800 }, { "epoch": 0.26, "grad_norm": 0.13818100094795227, "learning_rate": 9.276791584483894e-06, "logits/chosen": -0.4230351448059082, "logits/rejected": -0.5674090385437012, "logps/chosen": -217.064453125, "logps/rejected": -245.08786010742188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.9776084423065186, "rewards/margins": 8.626860618591309, "rewards/rejected": -12.604470252990723, "step": 810 }, { "epoch": 0.27, "grad_norm": 0.011727742850780487, "learning_rate": 9.265833881218498e-06, "logits/chosen": -0.4069291055202484, "logits/rejected": -0.5598689913749695, "logps/chosen": -172.05592346191406, "logps/rejected": -200.79318237304688, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -2.5719494819641113, "rewards/margins": 7.285019874572754, "rewards/rejected": -9.856969833374023, "step": 820 }, { "epoch": 0.27, "grad_norm": 1.0111483335494995, "learning_rate": 9.254876177953102e-06, "logits/chosen": -0.4388393759727478, "logits/rejected": -0.5350168347358704, "logps/chosen": -306.37823486328125, "logps/rejected": -342.58935546875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -6.371355056762695, "rewards/margins": 10.12095832824707, "rewards/rejected": -16.492313385009766, "step": 830 }, { "epoch": 0.27, "grad_norm": 0.0097459452226758, "learning_rate": 9.243918474687706e-06, "logits/chosen": -0.3321714401245117, "logits/rejected": -0.5004499554634094, "logps/chosen": -181.58326721191406, "logps/rejected": -213.7935333251953, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.546564817428589, "rewards/margins": 8.579878807067871, "rewards/rejected": -11.126443862915039, "step": 840 }, { "epoch": 0.27, "grad_norm": 0.0109526002779603, "learning_rate": 9.23296077142231e-06, "logits/chosen": -0.4445480704307556, "logits/rejected": -0.5257662534713745, "logps/chosen": -201.4680938720703, "logps/rejected": -272.65118408203125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -3.2386157512664795, "rewards/margins": 10.427217483520508, "rewards/rejected": -13.665834426879883, "step": 850 }, { "epoch": 0.28, "grad_norm": 4.875131607055664, "learning_rate": 9.222003068156915e-06, "logits/chosen": -0.4273023009300232, "logits/rejected": -0.5373457670211792, "logps/chosen": -263.1457824707031, "logps/rejected": -278.6871032714844, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": -4.379282474517822, "rewards/margins": 8.077393531799316, "rewards/rejected": -12.456674575805664, "step": 860 }, { "epoch": 0.28, "grad_norm": 0.012556626461446285, "learning_rate": 9.21104536489152e-06, "logits/chosen": -0.2668747007846832, "logits/rejected": -0.44963914155960083, "logps/chosen": -210.3493194580078, "logps/rejected": -252.1433563232422, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -3.3864264488220215, "rewards/margins": 8.853643417358398, "rewards/rejected": -12.240070343017578, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.024654850363731384, "learning_rate": 9.200087661626124e-06, "logits/chosen": -0.46519985795021057, "logits/rejected": -0.5054847002029419, "logps/chosen": -242.70095825195312, "logps/rejected": -250.40383911132812, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -3.888115406036377, "rewards/margins": 9.060136795043945, "rewards/rejected": -12.948251724243164, "step": 880 }, { "epoch": 0.29, "grad_norm": 0.0028619980439543724, "learning_rate": 9.189129958360728e-06, "logits/chosen": -0.39419493079185486, "logits/rejected": -0.4969407916069031, "logps/chosen": -210.2559814453125, "logps/rejected": -268.48870849609375, "loss": 0.0542, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -4.357173442840576, "rewards/margins": 9.57945442199707, "rewards/rejected": -13.936625480651855, "step": 890 }, { "epoch": 0.29, "grad_norm": 0.5537806153297424, "learning_rate": 9.178172255095332e-06, "logits/chosen": -0.36690598726272583, "logits/rejected": -0.5239487886428833, "logps/chosen": -189.4766082763672, "logps/rejected": -234.23739624023438, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.8814499378204346, "rewards/margins": 8.542583465576172, "rewards/rejected": -12.424032211303711, "step": 900 }, { "epoch": 0.29, "grad_norm": 0.0037552732974290848, "learning_rate": 9.167214551829936e-06, "logits/chosen": -0.2370346486568451, "logits/rejected": -0.35934606194496155, "logps/chosen": -254.87759399414062, "logps/rejected": -300.49700927734375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -6.661134243011475, "rewards/margins": 9.538171768188477, "rewards/rejected": -16.19930648803711, "step": 910 }, { "epoch": 0.3, "grad_norm": 0.016242943704128265, "learning_rate": 9.156256848564542e-06, "logits/chosen": -0.3371526598930359, "logits/rejected": -0.4652339518070221, "logps/chosen": -173.8836212158203, "logps/rejected": -223.115478515625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.5612175464630127, "rewards/margins": 9.151124954223633, "rewards/rejected": -12.712343215942383, "step": 920 }, { "epoch": 0.3, "grad_norm": 0.024046355858445168, "learning_rate": 9.145299145299145e-06, "logits/chosen": -0.3485228419303894, "logits/rejected": -0.3939455449581146, "logps/chosen": -290.3489990234375, "logps/rejected": -343.43817138671875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -7.7552947998046875, "rewards/margins": 10.854909896850586, "rewards/rejected": -18.610204696655273, "step": 930 }, { "epoch": 0.3, "grad_norm": 0.046042028814554214, "learning_rate": 9.134341442033751e-06, "logits/chosen": -0.31552404165267944, "logits/rejected": -0.4237368106842041, "logps/chosen": -241.5800323486328, "logps/rejected": -283.7861328125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.1691718101501465, "rewards/margins": 9.845176696777344, "rewards/rejected": -15.014348030090332, "step": 940 }, { "epoch": 0.31, "grad_norm": 2.655742883682251, "learning_rate": 9.123383738768354e-06, "logits/chosen": -0.28424203395843506, "logits/rejected": -0.40134358406066895, "logps/chosen": -250.8709716796875, "logps/rejected": -309.8855285644531, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -5.845712184906006, "rewards/margins": 10.40029239654541, "rewards/rejected": -16.24600601196289, "step": 950 }, { "epoch": 0.31, "grad_norm": 0.00028849352383986115, "learning_rate": 9.11242603550296e-06, "logits/chosen": -0.2519022524356842, "logits/rejected": -0.3044319748878479, "logps/chosen": -256.7398986816406, "logps/rejected": -308.69329833984375, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -6.2683329582214355, "rewards/margins": 10.402109146118164, "rewards/rejected": -16.670442581176758, "step": 960 }, { "epoch": 0.31, "grad_norm": 0.0053086057305336, "learning_rate": 9.101468332237564e-06, "logits/chosen": -0.2518306076526642, "logits/rejected": -0.3324928879737854, "logps/chosen": -247.19223022460938, "logps/rejected": -299.892822265625, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -7.235478401184082, "rewards/margins": 10.305525779724121, "rewards/rejected": -17.541004180908203, "step": 970 }, { "epoch": 0.32, "grad_norm": 0.054621316492557526, "learning_rate": 9.090510628972168e-06, "logits/chosen": -0.18655958771705627, "logits/rejected": -0.30609598755836487, "logps/chosen": -212.97329711914062, "logps/rejected": -268.6554260253906, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.168915748596191, "rewards/margins": 10.391229629516602, "rewards/rejected": -15.560145378112793, "step": 980 }, { "epoch": 0.32, "grad_norm": 0.023384546861052513, "learning_rate": 9.079552925706772e-06, "logits/chosen": -0.3303782641887665, "logits/rejected": -0.38056522607803345, "logps/chosen": -213.8797149658203, "logps/rejected": -288.4650573730469, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -5.234591484069824, "rewards/margins": 11.60734748840332, "rewards/rejected": -16.84193992614746, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.028016259893774986, "learning_rate": 9.068595222441378e-06, "logits/chosen": -0.3271247446537018, "logits/rejected": -0.3924880623817444, "logps/chosen": -315.4078369140625, "logps/rejected": -360.6003112792969, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -9.352377891540527, "rewards/margins": 10.600927352905273, "rewards/rejected": -19.953306198120117, "step": 1000 }, { "epoch": 0.33, "grad_norm": 0.039689064025878906, "learning_rate": 9.057637519175981e-06, "logits/chosen": -0.3742726743221283, "logits/rejected": -0.5176577568054199, "logps/chosen": -260.1962585449219, "logps/rejected": -283.95538330078125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -7.897824764251709, "rewards/margins": 8.730276107788086, "rewards/rejected": -16.628101348876953, "step": 1010 }, { "epoch": 0.33, "grad_norm": 0.950749933719635, "learning_rate": 9.046679815910587e-06, "logits/chosen": -0.29500845074653625, "logits/rejected": -0.342989444732666, "logps/chosen": -235.64990234375, "logps/rejected": -296.71636962890625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -5.87273645401001, "rewards/margins": 11.365694046020508, "rewards/rejected": -17.23843002319336, "step": 1020 }, { "epoch": 0.33, "grad_norm": 0.7492702603340149, "learning_rate": 9.03572211264519e-06, "logits/chosen": -0.29514080286026, "logits/rejected": -0.3748084008693695, "logps/chosen": -318.6951904296875, "logps/rejected": -366.6182556152344, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -8.613062858581543, "rewards/margins": 12.006413459777832, "rewards/rejected": -20.619478225708008, "step": 1030 }, { "epoch": 0.34, "grad_norm": 0.025907784700393677, "learning_rate": 9.024764409379796e-06, "logits/chosen": -0.2838585674762726, "logits/rejected": -0.290244460105896, "logps/chosen": -312.25347900390625, "logps/rejected": -396.3223571777344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.871122360229492, "rewards/margins": 13.518135070800781, "rewards/rejected": -23.38925552368164, "step": 1040 }, { "epoch": 0.34, "grad_norm": 3.519346864777617e-05, "learning_rate": 9.0138067061144e-06, "logits/chosen": -0.33469122648239136, "logits/rejected": -0.4082818627357483, "logps/chosen": -285.1815185546875, "logps/rejected": -362.09332275390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.829544067382812, "rewards/margins": 12.795137405395508, "rewards/rejected": -21.624683380126953, "step": 1050 }, { "epoch": 0.34, "grad_norm": 4.468189217732288e-05, "learning_rate": 9.002849002849004e-06, "logits/chosen": -0.1854289472103119, "logits/rejected": -0.19475221633911133, "logps/chosen": -247.6388397216797, "logps/rejected": -315.0453186035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.3854804039001465, "rewards/margins": 12.116926193237305, "rewards/rejected": -19.50240707397461, "step": 1060 }, { "epoch": 0.35, "grad_norm": 0.0009876694530248642, "learning_rate": 8.991891299583608e-06, "logits/chosen": -0.24933210015296936, "logits/rejected": -0.28711989521980286, "logps/chosen": -255.6931610107422, "logps/rejected": -328.758056640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.750466346740723, "rewards/margins": 12.743404388427734, "rewards/rejected": -19.493867874145508, "step": 1070 }, { "epoch": 0.35, "grad_norm": 14.395219802856445, "learning_rate": 8.980933596318213e-06, "logits/chosen": -0.28123170137405396, "logits/rejected": -0.31940752267837524, "logps/chosen": -244.03466796875, "logps/rejected": -313.8268737792969, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -7.721076965332031, "rewards/margins": 10.604429244995117, "rewards/rejected": -18.32550811767578, "step": 1080 }, { "epoch": 0.35, "grad_norm": 0.04704693332314491, "learning_rate": 8.969975893052817e-06, "logits/chosen": -0.28245311975479126, "logits/rejected": -0.3764280378818512, "logps/chosen": -248.2042694091797, "logps/rejected": -286.869873046875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -6.773460388183594, "rewards/margins": 10.64108943939209, "rewards/rejected": -17.414548873901367, "step": 1090 }, { "epoch": 0.36, "grad_norm": 0.00043834373354911804, "learning_rate": 8.959018189787421e-06, "logits/chosen": -0.20289742946624756, "logits/rejected": -0.2996065020561218, "logps/chosen": -297.94512939453125, "logps/rejected": -347.0626525878906, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -9.10444164276123, "rewards/margins": 12.04673957824707, "rewards/rejected": -21.151180267333984, "step": 1100 }, { "epoch": 0.36, "grad_norm": 0.12115999311208725, "learning_rate": 8.948060486522026e-06, "logits/chosen": -0.26539546251296997, "logits/rejected": -0.33644038438796997, "logps/chosen": -306.0853576660156, "logps/rejected": -363.36517333984375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.828388214111328, "rewards/margins": 11.639673233032227, "rewards/rejected": -20.468059539794922, "step": 1110 }, { "epoch": 0.36, "grad_norm": 0.01209923718124628, "learning_rate": 8.93710278325663e-06, "logits/chosen": -0.20330910384655, "logits/rejected": -0.2543596625328064, "logps/chosen": -267.58465576171875, "logps/rejected": -339.16552734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.954190254211426, "rewards/margins": 12.7907133102417, "rewards/rejected": -20.744905471801758, "step": 1120 }, { "epoch": 0.37, "grad_norm": 0.017156679183244705, "learning_rate": 8.926145079991234e-06, "logits/chosen": -0.21037821471691132, "logits/rejected": -0.335681676864624, "logps/chosen": -238.2596435546875, "logps/rejected": -287.96832275390625, "loss": 0.1154, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -6.764012813568115, "rewards/margins": 10.85376262664795, "rewards/rejected": -17.617774963378906, "step": 1130 }, { "epoch": 0.37, "grad_norm": 0.01817765086889267, "learning_rate": 8.915187376725838e-06, "logits/chosen": -0.27532559633255005, "logits/rejected": -0.3343648314476013, "logps/chosen": -237.79660034179688, "logps/rejected": -278.28424072265625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -6.450657844543457, "rewards/margins": 9.784764289855957, "rewards/rejected": -16.235422134399414, "step": 1140 }, { "epoch": 0.37, "grad_norm": 0.3464580178260803, "learning_rate": 8.904229673460444e-06, "logits/chosen": -0.20038633048534393, "logits/rejected": -0.34763047099113464, "logps/chosen": -264.88238525390625, "logps/rejected": -303.27056884765625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -6.401070594787598, "rewards/margins": 10.594339370727539, "rewards/rejected": -16.995410919189453, "step": 1150 }, { "epoch": 0.38, "grad_norm": 0.010944131761789322, "learning_rate": 8.893271970195047e-06, "logits/chosen": -0.3168713450431824, "logits/rejected": -0.3542029857635498, "logps/chosen": -245.4883270263672, "logps/rejected": -310.0267639160156, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -6.169398307800293, "rewards/margins": 11.141454696655273, "rewards/rejected": -17.310853958129883, "step": 1160 }, { "epoch": 0.38, "grad_norm": 0.012095646932721138, "learning_rate": 8.882314266929653e-06, "logits/chosen": -0.2291782796382904, "logits/rejected": -0.34653568267822266, "logps/chosen": -222.2202606201172, "logps/rejected": -258.69512939453125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.978119850158691, "rewards/margins": 9.679037094116211, "rewards/rejected": -14.657157897949219, "step": 1170 }, { "epoch": 0.38, "grad_norm": 0.14866305887699127, "learning_rate": 8.871356563664255e-06, "logits/chosen": -0.21821892261505127, "logits/rejected": -0.28071773052215576, "logps/chosen": -199.72018432617188, "logps/rejected": -272.10589599609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.8853073120117188, "rewards/margins": 11.493281364440918, "rewards/rejected": -15.37859058380127, "step": 1180 }, { "epoch": 0.38, "grad_norm": 10.241555213928223, "learning_rate": 8.860398860398861e-06, "logits/chosen": -0.13861140608787537, "logits/rejected": -0.20187318325042725, "logps/chosen": -193.94729614257812, "logps/rejected": -244.1735076904297, "loss": 0.183, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -4.192530632019043, "rewards/margins": 9.562729835510254, "rewards/rejected": -13.755261421203613, "step": 1190 }, { "epoch": 0.39, "grad_norm": 0.001308279111981392, "learning_rate": 8.849441157133466e-06, "logits/chosen": -0.2188553512096405, "logits/rejected": -0.27837082743644714, "logps/chosen": -256.08392333984375, "logps/rejected": -314.3865661621094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.751282691955566, "rewards/margins": 11.724446296691895, "rewards/rejected": -16.47572898864746, "step": 1200 }, { "epoch": 0.39, "grad_norm": 0.0011354751186445355, "learning_rate": 8.83848345386807e-06, "logits/chosen": -0.33862900733947754, "logits/rejected": -0.3313067555427551, "logps/chosen": -259.02044677734375, "logps/rejected": -325.8959655761719, "loss": 0.0244, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -5.828982830047607, "rewards/margins": 12.15399169921875, "rewards/rejected": -17.982975006103516, "step": 1210 }, { "epoch": 0.39, "grad_norm": 0.022255534306168556, "learning_rate": 8.827525750602674e-06, "logits/chosen": -0.1525467336177826, "logits/rejected": -0.30764085054397583, "logps/chosen": -164.30361938476562, "logps/rejected": -220.8531494140625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.6489479541778564, "rewards/margins": 9.622652053833008, "rewards/rejected": -13.271600723266602, "step": 1220 }, { "epoch": 0.4, "grad_norm": 0.00422575231641531, "learning_rate": 8.816568047337279e-06, "logits/chosen": -0.3136499524116516, "logits/rejected": -0.3792189657688141, "logps/chosen": -228.77914428710938, "logps/rejected": -287.58441162109375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.276397705078125, "rewards/margins": 11.693455696105957, "rewards/rejected": -16.969852447509766, "step": 1230 }, { "epoch": 0.4, "grad_norm": 0.0019199317321181297, "learning_rate": 8.805610344071883e-06, "logits/chosen": -0.26560407876968384, "logits/rejected": -0.3271743655204773, "logps/chosen": -268.156982421875, "logps/rejected": -299.9332580566406, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -6.225447654724121, "rewards/margins": 10.738948822021484, "rewards/rejected": -16.964397430419922, "step": 1240 }, { "epoch": 0.4, "grad_norm": 0.006867074873298407, "learning_rate": 8.794652640806489e-06, "logits/chosen": -0.15553151071071625, "logits/rejected": -0.21395400166511536, "logps/chosen": -241.8409423828125, "logps/rejected": -289.00848388671875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -6.778923988342285, "rewards/margins": 9.904958724975586, "rewards/rejected": -16.683881759643555, "step": 1250 }, { "epoch": 0.41, "grad_norm": 0.009473240934312344, "learning_rate": 8.783694937541091e-06, "logits/chosen": -0.23859646916389465, "logits/rejected": -0.27141764760017395, "logps/chosen": -215.42276000976562, "logps/rejected": -278.4677429199219, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -5.839932441711426, "rewards/margins": 11.050471305847168, "rewards/rejected": -16.890405654907227, "step": 1260 }, { "epoch": 0.41, "grad_norm": 0.0002466948644723743, "learning_rate": 8.772737234275697e-06, "logits/chosen": -0.26885563135147095, "logits/rejected": -0.31586629152297974, "logps/chosen": -248.76797485351562, "logps/rejected": -342.48406982421875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.838107109069824, "rewards/margins": 14.081052780151367, "rewards/rejected": -20.919160842895508, "step": 1270 }, { "epoch": 0.41, "grad_norm": 0.009549058973789215, "learning_rate": 8.7617795310103e-06, "logits/chosen": -0.10880441963672638, "logits/rejected": -0.16234132647514343, "logps/chosen": -250.2044677734375, "logps/rejected": -319.41778564453125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.536848545074463, "rewards/margins": 12.7142972946167, "rewards/rejected": -20.251144409179688, "step": 1280 }, { "epoch": 0.42, "grad_norm": 0.006435507442802191, "learning_rate": 8.750821827744906e-06, "logits/chosen": -0.18445457518100739, "logits/rejected": -0.18460145592689514, "logps/chosen": -262.4994201660156, "logps/rejected": -336.7874755859375, "loss": 0.0339, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -7.42992639541626, "rewards/margins": 12.39716911315918, "rewards/rejected": -19.82709503173828, "step": 1290 }, { "epoch": 0.42, "grad_norm": 0.23167112469673157, "learning_rate": 8.73986412447951e-06, "logits/chosen": -0.14932586252689362, "logits/rejected": -0.19626209139823914, "logps/chosen": -247.1484832763672, "logps/rejected": -318.3656921386719, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -7.087119102478027, "rewards/margins": 11.31184196472168, "rewards/rejected": -18.39896011352539, "step": 1300 }, { "epoch": 0.42, "grad_norm": 0.0005600708536803722, "learning_rate": 8.728906421214115e-06, "logits/chosen": -0.1394408792257309, "logits/rejected": -0.14028649032115936, "logps/chosen": -289.9681701660156, "logps/rejected": -364.3168640136719, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -9.336925506591797, "rewards/margins": 12.717875480651855, "rewards/rejected": -22.054800033569336, "step": 1310 }, { "epoch": 0.43, "grad_norm": 0.14742839336395264, "learning_rate": 8.717948717948719e-06, "logits/chosen": -0.06897391378879547, "logits/rejected": -0.10997577756643295, "logps/chosen": -258.9363708496094, "logps/rejected": -328.3492736816406, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -8.73633098602295, "rewards/margins": 11.629599571228027, "rewards/rejected": -20.365930557250977, "step": 1320 }, { "epoch": 0.43, "grad_norm": 0.0032621161080896854, "learning_rate": 8.706991014683323e-06, "logits/chosen": 0.09344655275344849, "logits/rejected": 0.04874902218580246, "logps/chosen": -256.26904296875, "logps/rejected": -334.4007263183594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.622756958007812, "rewards/margins": 12.946261405944824, "rewards/rejected": -21.569019317626953, "step": 1330 }, { "epoch": 0.43, "grad_norm": 0.00016526717809028924, "learning_rate": 8.696033311417927e-06, "logits/chosen": -0.011213278397917747, "logits/rejected": -0.029890483245253563, "logps/chosen": -285.08502197265625, "logps/rejected": -352.0074462890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.995809555053711, "rewards/margins": 12.493630409240723, "rewards/rejected": -22.489439010620117, "step": 1340 }, { "epoch": 0.44, "grad_norm": 0.0005608515930362046, "learning_rate": 8.685075608152532e-06, "logits/chosen": -0.0565456822514534, "logits/rejected": -0.04827792942523956, "logps/chosen": -273.8840637207031, "logps/rejected": -346.35858154296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.661928176879883, "rewards/margins": 12.965700149536133, "rewards/rejected": -21.627628326416016, "step": 1350 }, { "epoch": 0.44, "grad_norm": 0.2561068832874298, "learning_rate": 8.674117904887136e-06, "logits/chosen": -0.14858858287334442, "logits/rejected": -0.1448180228471756, "logps/chosen": -297.3650207519531, "logps/rejected": -404.10540771484375, "loss": 0.0238, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -10.200472831726074, "rewards/margins": 15.200775146484375, "rewards/rejected": -25.401248931884766, "step": 1360 }, { "epoch": 0.44, "grad_norm": 0.6736346483230591, "learning_rate": 8.66316020162174e-06, "logits/chosen": 0.014827290549874306, "logits/rejected": 0.013812633231282234, "logps/chosen": -257.91583251953125, "logps/rejected": -342.5897216796875, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -8.798688888549805, "rewards/margins": 14.296686172485352, "rewards/rejected": -23.095375061035156, "step": 1370 }, { "epoch": 0.45, "grad_norm": 5.739891093980987e-06, "learning_rate": 8.652202498356346e-06, "logits/chosen": -0.004905500914901495, "logits/rejected": 0.008281905204057693, "logps/chosen": -284.5867614746094, "logps/rejected": -372.309326171875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -10.582194328308105, "rewards/margins": 13.63042163848877, "rewards/rejected": -24.212615966796875, "step": 1380 }, { "epoch": 0.45, "grad_norm": 10.869261741638184, "learning_rate": 8.641244795090949e-06, "logits/chosen": -0.06507638841867447, "logits/rejected": -0.0575793981552124, "logps/chosen": -294.3569641113281, "logps/rejected": -353.50250244140625, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -11.367159843444824, "rewards/margins": 11.184629440307617, "rewards/rejected": -22.551788330078125, "step": 1390 }, { "epoch": 0.45, "grad_norm": 0.0022530544083565474, "learning_rate": 8.630287091825555e-06, "logits/chosen": 0.012960417196154594, "logits/rejected": -0.018600907176733017, "logps/chosen": -246.05801391601562, "logps/rejected": -325.35577392578125, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -8.39463996887207, "rewards/margins": 12.46562385559082, "rewards/rejected": -20.86026382446289, "step": 1400 }, { "epoch": 0.46, "grad_norm": 0.0010853647254407406, "learning_rate": 8.619329388560157e-06, "logits/chosen": -0.03639475628733635, "logits/rejected": 0.0061216773465275764, "logps/chosen": -252.0388641357422, "logps/rejected": -349.8416442871094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.54641056060791, "rewards/margins": 13.886810302734375, "rewards/rejected": -22.4332218170166, "step": 1410 }, { "epoch": 0.46, "grad_norm": 0.5238139033317566, "learning_rate": 8.608371685294763e-06, "logits/chosen": -0.11145244538784027, "logits/rejected": -0.12166018784046173, "logps/chosen": -252.606689453125, "logps/rejected": -314.7965087890625, "loss": 0.1077, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -7.48656702041626, "rewards/margins": 11.383016586303711, "rewards/rejected": -18.869583129882812, "step": 1420 }, { "epoch": 0.46, "grad_norm": 0.383899062871933, "learning_rate": 8.597413982029368e-06, "logits/chosen": -0.0349181704223156, "logits/rejected": -0.01999412663280964, "logps/chosen": -250.32009887695312, "logps/rejected": -321.7938537597656, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -8.556001663208008, "rewards/margins": 11.27842903137207, "rewards/rejected": -19.834430694580078, "step": 1430 }, { "epoch": 0.47, "grad_norm": 0.03882277384400368, "learning_rate": 8.586456278763972e-06, "logits/chosen": -0.041800715029239655, "logits/rejected": 0.028328755870461464, "logps/chosen": -264.7074890136719, "logps/rejected": -359.95001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.131043434143066, "rewards/margins": 14.964799880981445, "rewards/rejected": -24.095842361450195, "step": 1440 }, { "epoch": 0.47, "grad_norm": 0.005863294005393982, "learning_rate": 8.575498575498576e-06, "logits/chosen": -0.008469844236969948, "logits/rejected": 0.002064249012619257, "logps/chosen": -189.87020874023438, "logps/rejected": -289.1915588378906, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -5.485939979553223, "rewards/margins": 12.884135246276855, "rewards/rejected": -18.370075225830078, "step": 1450 }, { "epoch": 0.47, "grad_norm": 0.09647411108016968, "learning_rate": 8.56454087223318e-06, "logits/chosen": 0.015825632959604263, "logits/rejected": -0.02976151742041111, "logps/chosen": -366.56439208984375, "logps/rejected": -430.0389709472656, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -13.442001342773438, "rewards/margins": 12.91467571258545, "rewards/rejected": -26.356678009033203, "step": 1460 }, { "epoch": 0.48, "grad_norm": 0.10359703004360199, "learning_rate": 8.553583168967785e-06, "logits/chosen": -0.09159889072179794, "logits/rejected": -0.06940022855997086, "logps/chosen": -315.4118347167969, "logps/rejected": -377.8108215332031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.500893592834473, "rewards/margins": 13.29771614074707, "rewards/rejected": -23.79861068725586, "step": 1470 }, { "epoch": 0.48, "grad_norm": 0.011625263839960098, "learning_rate": 8.54262546570239e-06, "logits/chosen": -0.1008826494216919, "logits/rejected": -0.05810718610882759, "logps/chosen": -264.00909423828125, "logps/rejected": -356.61163330078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.526174545288086, "rewards/margins": 14.310162544250488, "rewards/rejected": -23.836336135864258, "step": 1480 }, { "epoch": 0.48, "grad_norm": 0.024129299446940422, "learning_rate": 8.531667762436993e-06, "logits/chosen": 0.0249390359967947, "logits/rejected": 0.004033858422189951, "logps/chosen": -295.7063903808594, "logps/rejected": -394.6076965332031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.936558723449707, "rewards/margins": 14.934048652648926, "rewards/rejected": -25.87060546875, "step": 1490 }, { "epoch": 0.49, "grad_norm": 0.006154273636639118, "learning_rate": 8.5207100591716e-06, "logits/chosen": 0.04360217973589897, "logits/rejected": 0.03041175566613674, "logps/chosen": -236.54794311523438, "logps/rejected": -317.7098693847656, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.552594184875488, "rewards/margins": 11.904518127441406, "rewards/rejected": -20.457111358642578, "step": 1500 }, { "epoch": 0.49, "grad_norm": 0.3130321502685547, "learning_rate": 8.509752355906202e-06, "logits/chosen": -0.07926555722951889, "logits/rejected": -0.02901688776910305, "logps/chosen": -248.68408203125, "logps/rejected": -326.94940185546875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.911336898803711, "rewards/margins": 13.27922534942627, "rewards/rejected": -22.190561294555664, "step": 1510 }, { "epoch": 0.49, "grad_norm": 0.09558708965778351, "learning_rate": 8.498794652640808e-06, "logits/chosen": -0.0705290287733078, "logits/rejected": -0.06556431949138641, "logps/chosen": -289.7066955566406, "logps/rejected": -359.8416748046875, "loss": 0.0249, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -10.718439102172852, "rewards/margins": 12.26536750793457, "rewards/rejected": -22.983808517456055, "step": 1520 }, { "epoch": 0.49, "grad_norm": 0.02135203592479229, "learning_rate": 8.487836949375412e-06, "logits/chosen": 0.08843693137168884, "logits/rejected": 0.10300026834011078, "logps/chosen": -283.0185546875, "logps/rejected": -395.1629333496094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.282575607299805, "rewards/margins": 15.90794563293457, "rewards/rejected": -26.190521240234375, "step": 1530 }, { "epoch": 0.5, "grad_norm": 0.02689371071755886, "learning_rate": 8.476879246110016e-06, "logits/chosen": -0.054816532880067825, "logits/rejected": -0.028935739770531654, "logps/chosen": -340.44329833984375, "logps/rejected": -424.52105712890625, "loss": 0.0361, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -13.073100090026855, "rewards/margins": 14.94751262664795, "rewards/rejected": -28.020618438720703, "step": 1540 }, { "epoch": 0.5, "grad_norm": 0.005785965360701084, "learning_rate": 8.46592154284462e-06, "logits/chosen": -0.014959866181015968, "logits/rejected": 0.0628860592842102, "logps/chosen": -238.8596954345703, "logps/rejected": -339.1539611816406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.337881088256836, "rewards/margins": 14.204916000366211, "rewards/rejected": -23.542797088623047, "step": 1550 }, { "epoch": 0.5, "grad_norm": 0.1502775102853775, "learning_rate": 8.454963839579225e-06, "logits/chosen": 0.05573273450136185, "logits/rejected": 0.09205415844917297, "logps/chosen": -303.62701416015625, "logps/rejected": -364.28204345703125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -12.095705032348633, "rewards/margins": 12.889692306518555, "rewards/rejected": -24.985401153564453, "step": 1560 }, { "epoch": 0.51, "grad_norm": 0.01375632081180811, "learning_rate": 8.44400613631383e-06, "logits/chosen": -0.0028796226251870394, "logits/rejected": 0.055630385875701904, "logps/chosen": -247.17288208007812, "logps/rejected": -334.44195556640625, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -8.552030563354492, "rewards/margins": 12.745870590209961, "rewards/rejected": -21.297901153564453, "step": 1570 }, { "epoch": 0.51, "grad_norm": 0.015819918364286423, "learning_rate": 8.433048433048434e-06, "logits/chosen": 0.029922613874077797, "logits/rejected": 0.1054656133055687, "logps/chosen": -290.9429626464844, "logps/rejected": -389.000732421875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -11.37781810760498, "rewards/margins": 14.447565078735352, "rewards/rejected": -25.82538414001465, "step": 1580 }, { "epoch": 0.51, "grad_norm": 0.06199351325631142, "learning_rate": 8.422090729783038e-06, "logits/chosen": 0.034867942333221436, "logits/rejected": 0.1408187299966812, "logps/chosen": -304.77081298828125, "logps/rejected": -402.5494079589844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -12.836946487426758, "rewards/margins": 15.10669994354248, "rewards/rejected": -27.943645477294922, "step": 1590 }, { "epoch": 0.52, "grad_norm": 0.011423285119235516, "learning_rate": 8.411133026517642e-06, "logits/chosen": 0.07202502340078354, "logits/rejected": 0.15315476059913635, "logps/chosen": -314.509033203125, "logps/rejected": -408.44647216796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.762028694152832, "rewards/margins": 14.413556098937988, "rewards/rejected": -27.175586700439453, "step": 1600 }, { "epoch": 0.52, "grad_norm": 0.015212434343993664, "learning_rate": 8.400175323252246e-06, "logits/chosen": 0.07158254086971283, "logits/rejected": 0.12116159498691559, "logps/chosen": -301.7251892089844, "logps/rejected": -424.9703674316406, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -14.060445785522461, "rewards/margins": 15.829069137573242, "rewards/rejected": -29.889516830444336, "step": 1610 }, { "epoch": 0.52, "grad_norm": 0.00037714597419835627, "learning_rate": 8.38921761998685e-06, "logits/chosen": 0.07507513463497162, "logits/rejected": 0.1150326281785965, "logps/chosen": -228.2713165283203, "logps/rejected": -332.7132263183594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -8.718751907348633, "rewards/margins": 14.904324531555176, "rewards/rejected": -23.623075485229492, "step": 1620 }, { "epoch": 0.53, "grad_norm": 0.00014221732271835208, "learning_rate": 8.378259916721457e-06, "logits/chosen": 0.19829820096492767, "logits/rejected": 0.1768883615732193, "logps/chosen": -286.7662048339844, "logps/rejected": -363.81463623046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.576507568359375, "rewards/margins": 13.473172187805176, "rewards/rejected": -26.049678802490234, "step": 1630 }, { "epoch": 0.53, "grad_norm": 0.2192688137292862, "learning_rate": 8.36730221345606e-06, "logits/chosen": 0.1942572295665741, "logits/rejected": 0.22028391063213348, "logps/chosen": -301.4276123046875, "logps/rejected": -383.99432373046875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -13.628840446472168, "rewards/margins": 13.160855293273926, "rewards/rejected": -26.789697647094727, "step": 1640 }, { "epoch": 0.53, "grad_norm": 0.00010946116526611149, "learning_rate": 8.356344510190665e-06, "logits/chosen": 0.10659674555063248, "logits/rejected": 0.1302722990512848, "logps/chosen": -402.503173828125, "logps/rejected": -484.22589111328125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -16.998889923095703, "rewards/margins": 16.14960479736328, "rewards/rejected": -33.148494720458984, "step": 1650 }, { "epoch": 0.54, "grad_norm": 0.017199842259287834, "learning_rate": 8.34538680692527e-06, "logits/chosen": 0.07519405335187912, "logits/rejected": 0.15728269517421722, "logps/chosen": -284.95184326171875, "logps/rejected": -384.1084289550781, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -13.825132369995117, "rewards/margins": 12.739798545837402, "rewards/rejected": -26.564931869506836, "step": 1660 }, { "epoch": 0.54, "grad_norm": 0.01719660870730877, "learning_rate": 8.334429103659874e-06, "logits/chosen": 0.07660949975252151, "logits/rejected": 0.10592161118984222, "logps/chosen": -231.68411254882812, "logps/rejected": -322.4700012207031, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -9.59976863861084, "rewards/margins": 12.979411125183105, "rewards/rejected": -22.579181671142578, "step": 1670 }, { "epoch": 0.54, "grad_norm": 0.012668246403336525, "learning_rate": 8.323471400394478e-06, "logits/chosen": 0.11749809980392456, "logits/rejected": 0.1774546504020691, "logps/chosen": -322.8683776855469, "logps/rejected": -432.86553955078125, "loss": 0.0232, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -12.755744934082031, "rewards/margins": 16.210979461669922, "rewards/rejected": -28.966724395751953, "step": 1680 }, { "epoch": 0.55, "grad_norm": 0.007913627661764622, "learning_rate": 8.312513697129082e-06, "logits/chosen": 0.11255357414484024, "logits/rejected": 0.1525614708662033, "logps/chosen": -303.31060791015625, "logps/rejected": -404.16766357421875, "loss": 0.0255, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -12.78972339630127, "rewards/margins": 14.71977424621582, "rewards/rejected": -27.509496688842773, "step": 1690 }, { "epoch": 0.55, "grad_norm": 7.69473408581689e-05, "learning_rate": 8.301555993863687e-06, "logits/chosen": 0.07176389545202255, "logits/rejected": 0.14741338789463043, "logps/chosen": -381.74859619140625, "logps/rejected": -479.43280029296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -15.687698364257812, "rewards/margins": 17.027114868164062, "rewards/rejected": -32.71481704711914, "step": 1700 }, { "epoch": 0.55, "grad_norm": 0.020641475915908813, "learning_rate": 8.290598290598293e-06, "logits/chosen": 0.027815943583846092, "logits/rejected": 0.06404396146535873, "logps/chosen": -326.9677734375, "logps/rejected": -426.5794982910156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -13.755020141601562, "rewards/margins": 15.451559066772461, "rewards/rejected": -29.206579208374023, "step": 1710 }, { "epoch": 0.56, "grad_norm": 1.0741193818830652e-06, "learning_rate": 8.279640587332895e-06, "logits/chosen": -0.04963821545243263, "logits/rejected": 0.060973964631557465, "logps/chosen": -282.719482421875, "logps/rejected": -397.8350830078125, "loss": 0.0378, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -10.77167797088623, "rewards/margins": 16.46950912475586, "rewards/rejected": -27.241186141967773, "step": 1720 }, { "epoch": 0.56, "grad_norm": 0.0007996432832442224, "learning_rate": 8.268682884067501e-06, "logits/chosen": 0.0820159837603569, "logits/rejected": 0.05762636661529541, "logps/chosen": -317.8184509277344, "logps/rejected": -399.0465087890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -12.283085823059082, "rewards/margins": 14.6398286819458, "rewards/rejected": -26.922916412353516, "step": 1730 }, { "epoch": 0.56, "grad_norm": 0.0067008682526648045, "learning_rate": 8.257725180802104e-06, "logits/chosen": 0.03673550486564636, "logits/rejected": 0.06919295340776443, "logps/chosen": -289.0475769042969, "logps/rejected": -387.4378662109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.43751049041748, "rewards/margins": 16.087310791015625, "rewards/rejected": -25.52482032775879, "step": 1740 }, { "epoch": 0.57, "grad_norm": 1.0896865129470825, "learning_rate": 8.24676747753671e-06, "logits/chosen": 0.10127731412649155, "logits/rejected": 0.16299596428871155, "logps/chosen": -288.1521911621094, "logps/rejected": -375.989501953125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -9.477235794067383, "rewards/margins": 15.621920585632324, "rewards/rejected": -25.09915542602539, "step": 1750 }, { "epoch": 0.57, "grad_norm": 0.21109241247177124, "learning_rate": 8.235809774271314e-06, "logits/chosen": 0.008235934190452099, "logits/rejected": 0.028172463178634644, "logps/chosen": -295.41619873046875, "logps/rejected": -404.15533447265625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -11.374763488769531, "rewards/margins": 15.838516235351562, "rewards/rejected": -27.21327781677246, "step": 1760 }, { "epoch": 0.57, "grad_norm": 0.341794490814209, "learning_rate": 8.224852071005918e-06, "logits/chosen": 0.1264527142047882, "logits/rejected": 0.15065120160579681, "logps/chosen": -292.97430419921875, "logps/rejected": -380.7444152832031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -11.181876182556152, "rewards/margins": 14.194729804992676, "rewards/rejected": -25.376605987548828, "step": 1770 }, { "epoch": 0.58, "grad_norm": 0.007328469771891832, "learning_rate": 8.213894367740523e-06, "logits/chosen": 0.023362448439002037, "logits/rejected": 0.03181435540318489, "logps/chosen": -292.8009948730469, "logps/rejected": -382.83343505859375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -11.59870719909668, "rewards/margins": 12.62264347076416, "rewards/rejected": -24.221351623535156, "step": 1780 }, { "epoch": 0.58, "grad_norm": 0.27736029028892517, "learning_rate": 8.202936664475127e-06, "logits/chosen": -0.05786416679620743, "logits/rejected": 0.0343763530254364, "logps/chosen": -281.7183532714844, "logps/rejected": -384.5142822265625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -11.055322647094727, "rewards/margins": 14.507980346679688, "rewards/rejected": -25.56330108642578, "step": 1790 }, { "epoch": 0.58, "grad_norm": 0.00044713946408592165, "learning_rate": 8.191978961209731e-06, "logits/chosen": 0.04160480573773384, "logits/rejected": 0.15656664967536926, "logps/chosen": -255.13864135742188, "logps/rejected": -372.533935546875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -9.455907821655273, "rewards/margins": 16.43738555908203, "rewards/rejected": -25.893295288085938, "step": 1800 }, { "epoch": 0.59, "grad_norm": 0.15391205251216888, "learning_rate": 8.181021257944335e-06, "logits/chosen": 0.13832136988639832, "logits/rejected": 0.21045103669166565, "logps/chosen": -304.4212341308594, "logps/rejected": -408.509033203125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -12.320929527282715, "rewards/margins": 14.877690315246582, "rewards/rejected": -27.198617935180664, "step": 1810 }, { "epoch": 0.59, "grad_norm": 0.1734129786491394, "learning_rate": 8.17006355467894e-06, "logits/chosen": -0.008830100297927856, "logits/rejected": 0.08690972626209259, "logps/chosen": -317.80670166015625, "logps/rejected": -403.6852111816406, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -13.214055061340332, "rewards/margins": 15.352259635925293, "rewards/rejected": -28.566314697265625, "step": 1820 }, { "epoch": 0.59, "grad_norm": 1.4723388064297183e-09, "learning_rate": 8.159105851413544e-06, "logits/chosen": 0.10790036618709564, "logits/rejected": 0.20435258746147156, "logps/chosen": -244.66781616210938, "logps/rejected": -349.64190673828125, "loss": 0.0391, "rewards/accuracies": 0.966666579246521, "rewards/chosen": -9.495941162109375, "rewards/margins": 14.044004440307617, "rewards/rejected": -23.539945602416992, "step": 1830 }, { "epoch": 0.6, "grad_norm": 4.041919601149857e-05, "learning_rate": 8.148148148148148e-06, "logits/chosen": 0.1392899751663208, "logits/rejected": 0.17356742918491364, "logps/chosen": -271.93536376953125, "logps/rejected": -376.3609313964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -11.918790817260742, "rewards/margins": 15.073224067687988, "rewards/rejected": -26.992013931274414, "step": 1840 }, { "epoch": 0.6, "grad_norm": 3.694919769259286e-06, "learning_rate": 8.137190444882753e-06, "logits/chosen": 0.2150353193283081, "logits/rejected": 0.2910372018814087, "logps/chosen": -296.4953918457031, "logps/rejected": -429.4913635253906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.478975296020508, "rewards/margins": 17.906230926513672, "rewards/rejected": -30.385208129882812, "step": 1850 }, { "epoch": 0.6, "grad_norm": 14.997368812561035, "learning_rate": 8.126232741617359e-06, "logits/chosen": 0.13269153237342834, "logits/rejected": 0.1637151837348938, "logps/chosen": -393.65887451171875, "logps/rejected": -488.2777404785156, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -17.513179779052734, "rewards/margins": 16.430784225463867, "rewards/rejected": -33.943965911865234, "step": 1860 }, { "epoch": 0.6, "grad_norm": 0.0013834636192768812, "learning_rate": 8.115275038351961e-06, "logits/chosen": 0.07304046303033829, "logits/rejected": 0.147117480635643, "logps/chosen": -308.97161865234375, "logps/rejected": -416.3505859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -13.166252136230469, "rewards/margins": 15.148821830749512, "rewards/rejected": -28.315073013305664, "step": 1870 }, { "epoch": 0.61, "grad_norm": 0.005938555579632521, "learning_rate": 8.104317335086567e-06, "logits/chosen": 0.02907967008650303, "logits/rejected": 0.11755422502756119, "logps/chosen": -310.50933837890625, "logps/rejected": -403.22247314453125, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -11.069503784179688, "rewards/margins": 15.674043655395508, "rewards/rejected": -26.743549346923828, "step": 1880 }, { "epoch": 0.61, "grad_norm": 0.3666568398475647, "learning_rate": 8.09335963182117e-06, "logits/chosen": -0.004576456733047962, "logits/rejected": 0.07972874492406845, "logps/chosen": -337.0972900390625, "logps/rejected": -455.228759765625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -11.475229263305664, "rewards/margins": 17.417362213134766, "rewards/rejected": -28.892593383789062, "step": 1890 }, { "epoch": 0.61, "grad_norm": 0.050740547478199005, "learning_rate": 8.082401928555776e-06, "logits/chosen": 0.006110090762376785, "logits/rejected": 0.045890793204307556, "logps/chosen": -242.1782684326172, "logps/rejected": -327.7618408203125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -7.432751655578613, "rewards/margins": 15.302406311035156, "rewards/rejected": -22.735158920288086, "step": 1900 }, { "epoch": 0.62, "grad_norm": 0.05140925943851471, "learning_rate": 8.07144422529038e-06, "logits/chosen": -0.109877809882164, "logits/rejected": -0.02618744969367981, "logps/chosen": -255.05599975585938, "logps/rejected": -364.76654052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.201934814453125, "rewards/margins": 16.78182029724121, "rewards/rejected": -24.983755111694336, "step": 1910 }, { "epoch": 0.62, "grad_norm": 3.2723581790924072, "learning_rate": 8.060486522024984e-06, "logits/chosen": -0.030681187286973, "logits/rejected": 0.031235750764608383, "logps/chosen": -223.58474731445312, "logps/rejected": -289.45196533203125, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -7.744879722595215, "rewards/margins": 11.148603439331055, "rewards/rejected": -18.893482208251953, "step": 1920 }, { "epoch": 0.62, "grad_norm": 1.3139744997024536, "learning_rate": 8.049528818759589e-06, "logits/chosen": -0.010395990684628487, "logits/rejected": 0.053827375173568726, "logps/chosen": -260.6311340332031, "logps/rejected": -343.34796142578125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -8.475730895996094, "rewards/margins": 12.925222396850586, "rewards/rejected": -21.400951385498047, "step": 1930 }, { "epoch": 0.63, "grad_norm": 0.0011528899194672704, "learning_rate": 8.038571115494193e-06, "logits/chosen": 0.004040165338665247, "logits/rejected": 0.09620045125484467, "logps/chosen": -265.73260498046875, "logps/rejected": -368.19976806640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.830063819885254, "rewards/margins": 14.222076416015625, "rewards/rejected": -23.052141189575195, "step": 1940 }, { "epoch": 0.63, "grad_norm": 3.633260348578915e-05, "learning_rate": 8.027613412228797e-06, "logits/chosen": 0.08991348743438721, "logits/rejected": 0.13095875084400177, "logps/chosen": -250.6635284423828, "logps/rejected": -348.5271301269531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.241789817810059, "rewards/margins": 15.90589714050293, "rewards/rejected": -24.147687911987305, "step": 1950 }, { "epoch": 0.63, "grad_norm": 0.008139081299304962, "learning_rate": 8.016655708963403e-06, "logits/chosen": -0.05623581260442734, "logits/rejected": 0.008502885699272156, "logps/chosen": -312.2218017578125, "logps/rejected": -426.6249084472656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.09146785736084, "rewards/margins": 17.29364013671875, "rewards/rejected": -28.385107040405273, "step": 1960 }, { "epoch": 0.64, "grad_norm": 0.0004545208648778498, "learning_rate": 8.005698005698006e-06, "logits/chosen": 0.013862645253539085, "logits/rejected": 0.05150808021426201, "logps/chosen": -273.6121826171875, "logps/rejected": -365.71307373046875, "loss": 0.0331, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -8.933341026306152, "rewards/margins": 15.150711059570312, "rewards/rejected": -24.08405113220215, "step": 1970 }, { "epoch": 0.64, "grad_norm": 0.07295417040586472, "learning_rate": 7.994740302432612e-06, "logits/chosen": 0.04458921402692795, "logits/rejected": 0.10884840786457062, "logps/chosen": -224.69967651367188, "logps/rejected": -325.59356689453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.435521125793457, "rewards/margins": 14.62103271484375, "rewards/rejected": -22.05655288696289, "step": 1980 }, { "epoch": 0.64, "grad_norm": 0.00015990910469554365, "learning_rate": 7.983782599167214e-06, "logits/chosen": -0.07026857882738113, "logits/rejected": 0.06809535622596741, "logps/chosen": -344.2391357421875, "logps/rejected": -496.5973205566406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -11.867463111877441, "rewards/margins": 20.219188690185547, "rewards/rejected": -32.08665466308594, "step": 1990 }, { "epoch": 0.65, "grad_norm": 0.023937899619340897, "learning_rate": 7.97282489590182e-06, "logits/chosen": 0.07235778868198395, "logits/rejected": 0.1107616052031517, "logps/chosen": -232.8976287841797, "logps/rejected": -342.6751708984375, "loss": 0.0496, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -8.303656578063965, "rewards/margins": 14.42664909362793, "rewards/rejected": -22.730304718017578, "step": 2000 } ], "logging_steps": 10, "max_steps": 9276, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }