diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3036 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6468305304010349, + "eval_steps": 500.0, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.6667505502700806, + "learning_rate": 6.666666666666668e-08, + "logits/chosen": 0.6656537652015686, + "logits/rejected": 0.8323326110839844, + "logps/chosen": -105.4136962890625, + "logps/rejected": -80.00390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.462697744369507, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": 0.4595692455768585, + "logits/rejected": 0.40892449021339417, + "logps/chosen": -159.06524658203125, + "logps/rejected": -111.41619110107422, + "loss": 0.6921, + "rewards/accuracies": 0.37037038803100586, + "rewards/chosen": -0.005318281706422567, + "rewards/margins": 0.002280694665387273, + "rewards/rejected": -0.0075989761389791965, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 1.2759745121002197, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": 0.4103795886039734, + "logits/rejected": 0.3288261592388153, + "logps/chosen": -154.99789428710938, + "logps/rejected": -102.23339080810547, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011106210760772228, + "rewards/margins": 0.012265065684914589, + "rewards/rejected": -0.0011588542256504297, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 1.4769150018692017, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 0.42967623472213745, + "logits/rejected": 0.3888375759124756, + "logps/chosen": -145.51222229003906, + "logps/rejected": -99.47395324707031, + "loss": 0.6943, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": -0.0015143711352720857, + "rewards/margins": -0.001963119488209486, + "rewards/rejected": 0.0004487482365220785, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.340982675552368, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": 0.31144046783447266, + "logits/rejected": 0.38159480690956116, + "logps/chosen": -221.299072265625, + "logps/rejected": -160.48690795898438, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00796764437109232, + "rewards/margins": 0.0012136328732594848, + "rewards/rejected": 0.006754010915756226, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 1.5240986347198486, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": 0.5946449041366577, + "logits/rejected": 0.44660329818725586, + "logps/chosen": -190.49057006835938, + "logps/rejected": -139.8750457763672, + "loss": 0.6888, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 0.010610619559884071, + "rewards/margins": 0.009138532914221287, + "rewards/rejected": 0.0014720851322636008, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 1.5300071239471436, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 0.42198482155799866, + "logits/rejected": 0.33618858456611633, + "logps/chosen": -173.13516235351562, + "logps/rejected": -121.2822036743164, + "loss": 0.6925, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 0.00521390326321125, + "rewards/margins": 0.0015516221756115556, + "rewards/rejected": 0.0036622812040150166, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 1.9583663940429688, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": 0.33364781737327576, + "logits/rejected": 0.2776263952255249, + "logps/chosen": -209.07211303710938, + "logps/rejected": -158.00839233398438, + "loss": 0.6827, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 0.026117617264389992, + "rewards/margins": 0.021206889301538467, + "rewards/rejected": 0.004910729825496674, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 1.7102651596069336, + "learning_rate": 5.333333333333334e-06, + "logits/chosen": 0.5094404816627502, + "logits/rejected": 0.4666718542575836, + "logps/chosen": -153.90780639648438, + "logps/rejected": -103.93388366699219, + "loss": 0.679, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 0.038611821830272675, + "rewards/margins": 0.028847262263298035, + "rewards/rejected": 0.00976455770432949, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 2.9190127849578857, + "learning_rate": 6e-06, + "logits/chosen": 0.23412127792835236, + "logits/rejected": 0.16527244448661804, + "logps/chosen": -219.2425079345703, + "logps/rejected": -149.35684204101562, + "loss": 0.659, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": 0.07324406504631042, + "rewards/margins": 0.07025764882564545, + "rewards/rejected": 0.0029864185489714146, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 2.0400824546813965, + "learning_rate": 6.666666666666667e-06, + "logits/chosen": 0.3398872911930084, + "logits/rejected": 0.2985154092311859, + "logps/chosen": -185.22146606445312, + "logps/rejected": -121.82730865478516, + "loss": 0.6488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10361306369304657, + "rewards/margins": 0.09121204167604446, + "rewards/rejected": 0.012401008978486061, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 2.192309617996216, + "learning_rate": 7.333333333333333e-06, + "logits/chosen": 0.4448312222957611, + "logits/rejected": 0.32321152091026306, + "logps/chosen": -189.84707641601562, + "logps/rejected": -124.48201751708984, + "loss": 0.6194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19106850028038025, + "rewards/margins": 0.1558096706867218, + "rewards/rejected": 0.03525884076952934, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 2.386159896850586, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 0.2993674874305725, + "logits/rejected": 0.3050278127193451, + "logps/chosen": -187.64498901367188, + "logps/rejected": -127.5892105102539, + "loss": 0.5682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30387812852859497, + "rewards/margins": 0.2730325758457184, + "rewards/rejected": 0.030845556408166885, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 2.4128735065460205, + "learning_rate": 8.666666666666668e-06, + "logits/chosen": 0.470858097076416, + "logits/rejected": 0.3819553256034851, + "logps/chosen": -147.3748321533203, + "logps/rejected": -106.97740173339844, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3712579309940338, + "rewards/margins": 0.31013599038124084, + "rewards/rejected": 0.06112197786569595, + "step": 130 + }, + { + "epoch": 0.05, + "grad_norm": 3.5698485374450684, + "learning_rate": 9.333333333333334e-06, + "logits/chosen": 0.027050381526350975, + "logits/rejected": 0.06544498354196548, + "logps/chosen": -225.6494140625, + "logps/rejected": -170.77578735351562, + "loss": 0.4873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5885292291641235, + "rewards/margins": 0.4800504744052887, + "rewards/rejected": 0.10847876965999603, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 2.3326575756073, + "learning_rate": 1e-05, + "logits/chosen": 0.4391583800315857, + "logits/rejected": 0.4003582000732422, + "logps/chosen": -177.93197631835938, + "logps/rejected": -129.4811248779297, + "loss": 0.4473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7490917444229126, + "rewards/margins": 0.597412645816803, + "rewards/rejected": 0.15167909860610962, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 1.741441011428833, + "learning_rate": 9.989042296734605e-06, + "logits/chosen": 0.4242062568664551, + "logits/rejected": 0.4251991808414459, + "logps/chosen": -178.76414489746094, + "logps/rejected": -133.4027557373047, + "loss": 0.4098, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": 0.8510511517524719, + "rewards/margins": 0.7112440466880798, + "rewards/rejected": 0.1398070752620697, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 1.9917546510696411, + "learning_rate": 9.97808459346921e-06, + "logits/chosen": 0.254020094871521, + "logits/rejected": 0.23922643065452576, + "logps/chosen": -161.81881713867188, + "logps/rejected": -125.13037109375, + "loss": 0.3277, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1077089309692383, + "rewards/margins": 0.9747198224067688, + "rewards/rejected": 0.13298924267292023, + "step": 170 + }, + { + "epoch": 0.06, + "grad_norm": 1.3671507835388184, + "learning_rate": 9.967126890203814e-06, + "logits/chosen": 0.2504037320613861, + "logits/rejected": 0.18875229358673096, + "logps/chosen": -147.4672088623047, + "logps/rejected": -104.1845474243164, + "loss": 0.2885, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3119361400604248, + "rewards/margins": 1.1814303398132324, + "rewards/rejected": 0.13050585985183716, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.9124622344970703, + "learning_rate": 9.956169186938418e-06, + "logits/chosen": 0.2291211187839508, + "logits/rejected": 0.24650339782238007, + "logps/chosen": -147.00360107421875, + "logps/rejected": -115.02952575683594, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5095247030258179, + "rewards/margins": 1.3837789297103882, + "rewards/rejected": 0.12574569880962372, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 1.6257745027542114, + "learning_rate": 9.945211483673022e-06, + "logits/chosen": 0.18027305603027344, + "logits/rejected": 0.17534476518630981, + "logps/chosen": -180.81214904785156, + "logps/rejected": -138.56167602539062, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7416807413101196, + "rewards/margins": 1.576603889465332, + "rewards/rejected": 0.16507670283317566, + "step": 200 + }, + { + "epoch": 0.07, + "grad_norm": 1.6058803796768188, + "learning_rate": 9.934253780407628e-06, + "logits/chosen": 0.03618524968624115, + "logits/rejected": 0.06655623763799667, + "logps/chosen": -187.56057739257812, + "logps/rejected": -165.72299194335938, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6607027053833008, + "rewards/margins": 1.624441385269165, + "rewards/rejected": 0.03626134246587753, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 2.9326171875, + "learning_rate": 9.92329607714223e-06, + "logits/chosen": 0.15423543751239777, + "logits/rejected": 0.16546325385570526, + "logps/chosen": -122.26210021972656, + "logps/rejected": -105.48604583740234, + "loss": 0.262, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": 1.8262674808502197, + "rewards/margins": 1.484720230102539, + "rewards/rejected": 0.3415472209453583, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 1.0585615634918213, + "learning_rate": 9.912338373876837e-06, + "logits/chosen": 0.0944317951798439, + "logits/rejected": 0.0521501824259758, + "logps/chosen": -183.9434051513672, + "logps/rejected": -139.52586364746094, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2049338817596436, + "rewards/margins": 2.331801652908325, + "rewards/rejected": -0.12686775624752045, + "step": 230 + }, + { + "epoch": 0.08, + "grad_norm": 1.2967203855514526, + "learning_rate": 9.90138067061144e-06, + "logits/chosen": 0.03958363085985184, + "logits/rejected": -0.04342944175004959, + "logps/chosen": -147.5401611328125, + "logps/rejected": -118.40291595458984, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.246441602706909, + "rewards/margins": 2.4990291595458984, + "rewards/rejected": -0.25258734822273254, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 0.5034691095352173, + "learning_rate": 9.890422967346045e-06, + "logits/chosen": 0.03393579646945, + "logits/rejected": 0.003592267632484436, + "logps/chosen": -143.8810272216797, + "logps/rejected": -128.96066284179688, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0601272583007812, + "rewards/margins": 2.3098137378692627, + "rewards/rejected": -0.24968591332435608, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 0.8256264328956604, + "learning_rate": 9.87946526408065e-06, + "logits/chosen": -0.09204194694757462, + "logits/rejected": -0.1390385925769806, + "logps/chosen": -165.94276428222656, + "logps/rejected": -140.0150909423828, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1594629287719727, + "rewards/margins": 2.515359401702881, + "rewards/rejected": -0.35589665174484253, + "step": 260 + }, + { + "epoch": 0.09, + "grad_norm": 0.17470860481262207, + "learning_rate": 9.868507560815254e-06, + "logits/chosen": -0.09904654324054718, + "logits/rejected": -0.21556393802165985, + "logps/chosen": -138.27383422851562, + "logps/rejected": -117.7011489868164, + "loss": 0.1108, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1653850078582764, + "rewards/margins": 2.631995677947998, + "rewards/rejected": -0.4666108191013336, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 0.4645913541316986, + "learning_rate": 9.857549857549858e-06, + "logits/chosen": -0.014490666799247265, + "logits/rejected": -0.06418715417385101, + "logps/chosen": -180.47317504882812, + "logps/rejected": -161.57061767578125, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.279376983642578, + "rewards/margins": 3.045623302459717, + "rewards/rejected": -0.766246497631073, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.6914668679237366, + "learning_rate": 9.846592154284462e-06, + "logits/chosen": -0.046889033168554306, + "logits/rejected": -0.1608402580022812, + "logps/chosen": -173.64353942871094, + "logps/rejected": -139.35154724121094, + "loss": 0.0832, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0064756870269775, + "rewards/margins": 2.9564709663391113, + "rewards/rejected": -0.9499956369400024, + "step": 290 + }, + { + "epoch": 0.1, + "grad_norm": 1.6635288000106812, + "learning_rate": 9.835634451019067e-06, + "logits/chosen": -0.017042959108948708, + "logits/rejected": -0.12096239626407623, + "logps/chosen": -159.43319702148438, + "logps/rejected": -139.1463623046875, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9454389810562134, + "rewards/margins": 2.7836952209472656, + "rewards/rejected": -0.8382562398910522, + "step": 300 + }, + { + "epoch": 0.1, + "grad_norm": 1.5297130346298218, + "learning_rate": 9.824676747753673e-06, + "logits/chosen": -0.12943556904792786, + "logits/rejected": -0.2048647403717041, + "logps/chosen": -149.43092346191406, + "logps/rejected": -140.9419403076172, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.958494782447815, + "rewards/margins": 3.017915964126587, + "rewards/rejected": -1.059421420097351, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.18703162670135498, + "learning_rate": 9.813719044488275e-06, + "logits/chosen": -0.044006917625665665, + "logits/rejected": -0.15224145352840424, + "logps/chosen": -153.7903594970703, + "logps/rejected": -128.21835327148438, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.305603265762329, + "rewards/margins": 3.5984387397766113, + "rewards/rejected": -1.292834997177124, + "step": 320 + }, + { + "epoch": 0.11, + "grad_norm": 0.7461889982223511, + "learning_rate": 9.802761341222881e-06, + "logits/chosen": -0.14230886101722717, + "logits/rejected": -0.1584036648273468, + "logps/chosen": -131.12109375, + "logps/rejected": -111.15714263916016, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7661842107772827, + "rewards/margins": 3.3135159015655518, + "rewards/rejected": -1.5473315715789795, + "step": 330 + }, + { + "epoch": 0.11, + "grad_norm": 1.7300301790237427, + "learning_rate": 9.791803637957486e-06, + "logits/chosen": -0.19161322712898254, + "logits/rejected": -0.2922630310058594, + "logps/chosen": -172.96087646484375, + "logps/rejected": -160.4773712158203, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8651368618011475, + "rewards/margins": 3.7756409645080566, + "rewards/rejected": -1.9105039834976196, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 2.3742196559906006, + "learning_rate": 9.78084593469209e-06, + "logits/chosen": -0.3112620711326599, + "logits/rejected": -0.43214184045791626, + "logps/chosen": -158.68028259277344, + "logps/rejected": -136.09814453125, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1260719299316406, + "rewards/margins": 3.7196857929229736, + "rewards/rejected": -1.5936137437820435, + "step": 350 + }, + { + "epoch": 0.12, + "grad_norm": 2.4912428855895996, + "learning_rate": 9.769888231426694e-06, + "logits/chosen": -0.0563586950302124, + "logits/rejected": -0.20379754900932312, + "logps/chosen": -117.54966735839844, + "logps/rejected": -115.3812255859375, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9740089178085327, + "rewards/margins": 3.207712173461914, + "rewards/rejected": -1.23370361328125, + "step": 360 + }, + { + "epoch": 0.12, + "grad_norm": 0.3334919512271881, + "learning_rate": 9.758930528161298e-06, + "logits/chosen": -0.16167142987251282, + "logits/rejected": -0.24221567809581757, + "logps/chosen": -138.83132934570312, + "logps/rejected": -141.4235076904297, + "loss": 0.1185, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": 1.8540862798690796, + "rewards/margins": 3.4170851707458496, + "rewards/rejected": -1.5629991292953491, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 0.1451215147972107, + "learning_rate": 9.747972824895903e-06, + "logits/chosen": -0.1488262414932251, + "logits/rejected": -0.30983203649520874, + "logps/chosen": -146.93450927734375, + "logps/rejected": -145.71795654296875, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.60548996925354, + "rewards/margins": 4.409090995788574, + "rewards/rejected": -2.803601026535034, + "step": 380 + }, + { + "epoch": 0.13, + "grad_norm": 0.11398794502019882, + "learning_rate": 9.737015121630507e-06, + "logits/chosen": -0.3392196297645569, + "logits/rejected": -0.4596717357635498, + "logps/chosen": -192.10316467285156, + "logps/rejected": -179.76910400390625, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8670743703842163, + "rewards/margins": 4.218127250671387, + "rewards/rejected": -2.351052761077881, + "step": 390 + }, + { + "epoch": 0.13, + "grad_norm": 1.373371958732605, + "learning_rate": 9.726057418365111e-06, + "logits/chosen": -0.2288927286863327, + "logits/rejected": -0.4092441499233246, + "logps/chosen": -134.75244140625, + "logps/rejected": -126.3078384399414, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7074005603790283, + "rewards/margins": 4.082337856292725, + "rewards/rejected": -2.3749375343322754, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 0.6404663920402527, + "learning_rate": 9.715099715099716e-06, + "logits/chosen": -0.21756196022033691, + "logits/rejected": -0.3667296767234802, + "logps/chosen": -111.0381088256836, + "logps/rejected": -113.7396011352539, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8885473012924194, + "rewards/margins": 4.248038291931152, + "rewards/rejected": -2.3594906330108643, + "step": 410 + }, + { + "epoch": 0.14, + "grad_norm": 0.2947131395339966, + "learning_rate": 9.70414201183432e-06, + "logits/chosen": -0.21332868933677673, + "logits/rejected": -0.39654839038848877, + "logps/chosen": -184.7637939453125, + "logps/rejected": -165.0076904296875, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1396678686141968, + "rewards/margins": 4.9540605545043945, + "rewards/rejected": -3.8143928050994873, + "step": 420 + }, + { + "epoch": 0.14, + "grad_norm": 0.2587800621986389, + "learning_rate": 9.693184308568924e-06, + "logits/chosen": -0.15772652626037598, + "logits/rejected": -0.2967797815799713, + "logps/chosen": -126.998046875, + "logps/rejected": -127.1830825805664, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7573896646499634, + "rewards/margins": 4.652812957763672, + "rewards/rejected": -2.895423412322998, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.18937894701957703, + "learning_rate": 9.68222660530353e-06, + "logits/chosen": -0.24051830172538757, + "logits/rejected": -0.3756439685821533, + "logps/chosen": -132.031494140625, + "logps/rejected": -143.25439453125, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8294754028320312, + "rewards/margins": 4.800570487976074, + "rewards/rejected": -2.971095561981201, + "step": 440 + }, + { + "epoch": 0.15, + "grad_norm": 0.796097457408905, + "learning_rate": 9.671268902038133e-06, + "logits/chosen": -0.19832219183444977, + "logits/rejected": -0.351583868265152, + "logps/chosen": -128.74539184570312, + "logps/rejected": -126.41670227050781, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6307995319366455, + "rewards/margins": 5.1652398109436035, + "rewards/rejected": -3.534440517425537, + "step": 450 + }, + { + "epoch": 0.15, + "grad_norm": 1.0965434312820435, + "learning_rate": 9.660311198772739e-06, + "logits/chosen": -0.1457364708185196, + "logits/rejected": -0.33718162775039673, + "logps/chosen": -153.73037719726562, + "logps/rejected": -153.25807189941406, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8024961352348328, + "rewards/margins": 4.706381797790527, + "rewards/rejected": -3.903886079788208, + "step": 460 + }, + { + "epoch": 0.15, + "grad_norm": 0.4351007342338562, + "learning_rate": 9.649353495507341e-06, + "logits/chosen": -0.29992926120758057, + "logits/rejected": -0.45157748460769653, + "logps/chosen": -141.03488159179688, + "logps/rejected": -147.01473999023438, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1137398481369019, + "rewards/margins": 4.605474948883057, + "rewards/rejected": -3.491734743118286, + "step": 470 + }, + { + "epoch": 0.16, + "grad_norm": 0.21928514540195465, + "learning_rate": 9.638395792241947e-06, + "logits/chosen": -0.2734539210796356, + "logits/rejected": -0.42611390352249146, + "logps/chosen": -178.15843200683594, + "logps/rejected": -171.8292999267578, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0601214170455933, + "rewards/margins": 5.109463214874268, + "rewards/rejected": -4.049341678619385, + "step": 480 + }, + { + "epoch": 0.16, + "grad_norm": 0.37911301851272583, + "learning_rate": 9.627438088976552e-06, + "logits/chosen": -0.38659173250198364, + "logits/rejected": -0.5436467528343201, + "logps/chosen": -187.867431640625, + "logps/rejected": -188.06741333007812, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8567007780075073, + "rewards/margins": 5.4622673988342285, + "rewards/rejected": -4.605566501617432, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 0.09822113811969757, + "learning_rate": 9.616480385711156e-06, + "logits/chosen": -0.37623029947280884, + "logits/rejected": -0.48185300827026367, + "logps/chosen": -202.08529663085938, + "logps/rejected": -199.71482849121094, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24511973559856415, + "rewards/margins": 5.403919219970703, + "rewards/rejected": -5.158799171447754, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 0.3931962251663208, + "learning_rate": 9.60552268244576e-06, + "logits/chosen": -0.2853025794029236, + "logits/rejected": -0.413346529006958, + "logps/chosen": -124.1753921508789, + "logps/rejected": -140.43246459960938, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2122681140899658, + "rewards/margins": 5.2854719161987305, + "rewards/rejected": -4.0732035636901855, + "step": 510 + }, + { + "epoch": 0.17, + "grad_norm": 1.2813736200332642, + "learning_rate": 9.594564979180364e-06, + "logits/chosen": -0.2814486622810364, + "logits/rejected": -0.5304259657859802, + "logps/chosen": -143.65679931640625, + "logps/rejected": -140.1746368408203, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1810171604156494, + "rewards/margins": 5.255087375640869, + "rewards/rejected": -4.074069976806641, + "step": 520 + }, + { + "epoch": 0.17, + "grad_norm": 0.6368138790130615, + "learning_rate": 9.583607275914969e-06, + "logits/chosen": -0.23208048939704895, + "logits/rejected": -0.41035908460617065, + "logps/chosen": -152.8085174560547, + "logps/rejected": -158.786376953125, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0337063074111938, + "rewards/margins": 5.832049369812012, + "rewards/rejected": -4.798343181610107, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 0.9199265241622925, + "learning_rate": 9.572649572649575e-06, + "logits/chosen": -0.3010120391845703, + "logits/rejected": -0.4592529237270355, + "logps/chosen": -217.2087860107422, + "logps/rejected": -198.68173217773438, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41687291860580444, + "rewards/margins": 5.248563289642334, + "rewards/rejected": -5.6654372215271, + "step": 540 + }, + { + "epoch": 0.18, + "grad_norm": 0.013650404289364815, + "learning_rate": 9.561691869384177e-06, + "logits/chosen": -0.4347296357154846, + "logits/rejected": -0.6051202416419983, + "logps/chosen": -190.7089080810547, + "logps/rejected": -205.96047973632812, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1618773639202118, + "rewards/margins": 6.865809440612793, + "rewards/rejected": -6.703932762145996, + "step": 550 + }, + { + "epoch": 0.18, + "grad_norm": 0.06230132654309273, + "learning_rate": 9.550734166118783e-06, + "logits/chosen": -0.3882770240306854, + "logits/rejected": -0.6216930747032166, + "logps/chosen": -175.98199462890625, + "logps/rejected": -177.3306427001953, + "loss": 0.0488, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": 0.032947491854429245, + "rewards/margins": 5.509705066680908, + "rewards/rejected": -5.476758003234863, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 0.16349956393241882, + "learning_rate": 9.539776462853386e-06, + "logits/chosen": -0.3122726082801819, + "logits/rejected": -0.4245499074459076, + "logps/chosen": -228.5991668701172, + "logps/rejected": -225.5084686279297, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.799796462059021, + "rewards/margins": 5.888455390930176, + "rewards/rejected": -6.688251495361328, + "step": 570 + }, + { + "epoch": 0.19, + "grad_norm": 0.12062845379114151, + "learning_rate": 9.528818759587992e-06, + "logits/chosen": -0.5392414927482605, + "logits/rejected": -0.6697270274162292, + "logps/chosen": -170.96298217773438, + "logps/rejected": -198.06301879882812, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008090650662779808, + "rewards/margins": 6.091658592224121, + "rewards/rejected": -6.0835676193237305, + "step": 580 + }, + { + "epoch": 0.19, + "grad_norm": 0.3678707778453827, + "learning_rate": 9.517861056322596e-06, + "logits/chosen": -0.4352661669254303, + "logits/rejected": -0.5481756925582886, + "logps/chosen": -174.7131805419922, + "logps/rejected": -171.861572265625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18627242743968964, + "rewards/margins": 6.163498878479004, + "rewards/rejected": -5.977226257324219, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 0.07159756124019623, + "learning_rate": 9.5069033530572e-06, + "logits/chosen": -0.20721349120140076, + "logits/rejected": -0.417985737323761, + "logps/chosen": -187.81405639648438, + "logps/rejected": -185.33331298828125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07214238494634628, + "rewards/margins": 5.932024002075195, + "rewards/rejected": -5.85988187789917, + "step": 600 + }, + { + "epoch": 0.2, + "grad_norm": 0.09394218772649765, + "learning_rate": 9.495945649791805e-06, + "logits/chosen": -0.24830050766468048, + "logits/rejected": -0.36983978748321533, + "logps/chosen": -129.41619873046875, + "logps/rejected": -145.60447692871094, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3323943018913269, + "rewards/margins": 6.081761837005615, + "rewards/rejected": -5.749367713928223, + "step": 610 + }, + { + "epoch": 0.2, + "grad_norm": 0.561514675617218, + "learning_rate": 9.484987946526409e-06, + "logits/chosen": -0.3996688723564148, + "logits/rejected": -0.5800243020057678, + "logps/chosen": -202.79257202148438, + "logps/rejected": -203.62704467773438, + "loss": 0.0457, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -0.36100807785987854, + "rewards/margins": 6.192732810974121, + "rewards/rejected": -6.553740501403809, + "step": 620 + }, + { + "epoch": 0.2, + "grad_norm": 0.08309807628393173, + "learning_rate": 9.474030243261013e-06, + "logits/chosen": -0.3754512667655945, + "logits/rejected": -0.5741219520568848, + "logps/chosen": -174.9481658935547, + "logps/rejected": -182.9730224609375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4041864275932312, + "rewards/margins": 6.596762180328369, + "rewards/rejected": -7.000948905944824, + "step": 630 + }, + { + "epoch": 0.21, + "grad_norm": 0.0921340063214302, + "learning_rate": 9.463072539995617e-06, + "logits/chosen": -0.30068254470825195, + "logits/rejected": -0.45074382424354553, + "logps/chosen": -173.031005859375, + "logps/rejected": -197.98251342773438, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10172281414270401, + "rewards/margins": 6.632701873779297, + "rewards/rejected": -6.734424591064453, + "step": 640 + }, + { + "epoch": 0.21, + "grad_norm": 0.037897739559412, + "learning_rate": 9.452114836730222e-06, + "logits/chosen": -0.42777299880981445, + "logits/rejected": -0.4529387354850769, + "logps/chosen": -238.9905242919922, + "logps/rejected": -260.50054931640625, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3973451852798462, + "rewards/margins": 6.604907035827637, + "rewards/rejected": -8.002251625061035, + "step": 650 + }, + { + "epoch": 0.21, + "grad_norm": 0.19550499320030212, + "learning_rate": 9.441157133464826e-06, + "logits/chosen": -0.23329667747020721, + "logits/rejected": -0.44183143973350525, + "logps/chosen": -139.6894073486328, + "logps/rejected": -164.62615966796875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24222126603126526, + "rewards/margins": 6.836798667907715, + "rewards/rejected": -6.594576835632324, + "step": 660 + }, + { + "epoch": 0.22, + "grad_norm": 0.3139669895172119, + "learning_rate": 9.430199430199432e-06, + "logits/chosen": -0.4021090865135193, + "logits/rejected": -0.5786430239677429, + "logps/chosen": -181.46817016601562, + "logps/rejected": -197.1630096435547, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.535909116268158, + "rewards/margins": 6.705661773681641, + "rewards/rejected": -7.241570949554443, + "step": 670 + }, + { + "epoch": 0.22, + "grad_norm": 0.05340191349387169, + "learning_rate": 9.419241726934035e-06, + "logits/chosen": -0.42853325605392456, + "logits/rejected": -0.6143732070922852, + "logps/chosen": -168.3545379638672, + "logps/rejected": -188.1068878173828, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3123434782028198, + "rewards/margins": 6.992964267730713, + "rewards/rejected": -7.305306911468506, + "step": 680 + }, + { + "epoch": 0.22, + "grad_norm": 2.9327523708343506, + "learning_rate": 9.40828402366864e-06, + "logits/chosen": -0.38912615180015564, + "logits/rejected": -0.6179540753364563, + "logps/chosen": -255.75888061523438, + "logps/rejected": -249.5757598876953, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6745103597640991, + "rewards/margins": 7.797115325927734, + "rewards/rejected": -9.471625328063965, + "step": 690 + }, + { + "epoch": 0.23, + "grad_norm": 0.03456411138176918, + "learning_rate": 9.397326320403243e-06, + "logits/chosen": -0.5495766997337341, + "logits/rejected": -0.6684257984161377, + "logps/chosen": -168.55702209472656, + "logps/rejected": -188.98629760742188, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.858353316783905, + "rewards/margins": 6.359633445739746, + "rewards/rejected": -7.217986106872559, + "step": 700 + }, + { + "epoch": 0.23, + "grad_norm": 0.06367018818855286, + "learning_rate": 9.386368617137849e-06, + "logits/chosen": -0.4233613610267639, + "logits/rejected": -0.6099605560302734, + "logps/chosen": -175.0663604736328, + "logps/rejected": -189.76922607421875, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5615413188934326, + "rewards/margins": 6.164674282073975, + "rewards/rejected": -7.7262163162231445, + "step": 710 + }, + { + "epoch": 0.23, + "grad_norm": 2.003807306289673, + "learning_rate": 9.375410913872453e-06, + "logits/chosen": -0.508701741695404, + "logits/rejected": -0.6367843747138977, + "logps/chosen": -201.49270629882812, + "logps/rejected": -214.9327850341797, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2017481327056885, + "rewards/margins": 7.067420959472656, + "rewards/rejected": -8.269168853759766, + "step": 720 + }, + { + "epoch": 0.24, + "grad_norm": 0.8275896310806274, + "learning_rate": 9.364453210607058e-06, + "logits/chosen": -0.42019587755203247, + "logits/rejected": -0.5408454537391663, + "logps/chosen": -204.19387817382812, + "logps/rejected": -225.56607055664062, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0568044185638428, + "rewards/margins": 7.597814083099365, + "rewards/rejected": -9.654618263244629, + "step": 730 + }, + { + "epoch": 0.24, + "grad_norm": 0.14503268897533417, + "learning_rate": 9.353495507341662e-06, + "logits/chosen": -0.39080002903938293, + "logits/rejected": -0.5268298983573914, + "logps/chosen": -184.9230194091797, + "logps/rejected": -208.14492797851562, + "loss": 0.0479, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -2.121675968170166, + "rewards/margins": 7.246522426605225, + "rewards/rejected": -9.368197441101074, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 0.9217634797096252, + "learning_rate": 9.342537804076266e-06, + "logits/chosen": -0.37332993745803833, + "logits/rejected": -0.5701053142547607, + "logps/chosen": -175.7810516357422, + "logps/rejected": -201.13003540039062, + "loss": 0.0408, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -1.581413745880127, + "rewards/margins": 6.754090309143066, + "rewards/rejected": -8.335504531860352, + "step": 750 + }, + { + "epoch": 0.25, + "grad_norm": 0.026290835812687874, + "learning_rate": 9.33158010081087e-06, + "logits/chosen": -0.4100477695465088, + "logits/rejected": -0.5200067162513733, + "logps/chosen": -233.86337280273438, + "logps/rejected": -258.96881103515625, + "loss": 0.0723, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -2.9427809715270996, + "rewards/margins": 6.980525016784668, + "rewards/rejected": -9.923307418823242, + "step": 760 + }, + { + "epoch": 0.25, + "grad_norm": 0.37062278389930725, + "learning_rate": 9.320622397545477e-06, + "logits/chosen": -0.3395642936229706, + "logits/rejected": -0.4273925721645355, + "logps/chosen": -242.663818359375, + "logps/rejected": -268.10089111328125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.880597114562988, + "rewards/margins": 7.653998374938965, + "rewards/rejected": -12.534595489501953, + "step": 770 + }, + { + "epoch": 0.25, + "grad_norm": 0.0023701719474047422, + "learning_rate": 9.309664694280079e-06, + "logits/chosen": -0.4241916537284851, + "logits/rejected": -0.5316244959831238, + "logps/chosen": -182.79238891601562, + "logps/rejected": -211.8367156982422, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6987730264663696, + "rewards/margins": 7.190474510192871, + "rewards/rejected": -8.88924789428711, + "step": 780 + }, + { + "epoch": 0.26, + "grad_norm": 0.008612029254436493, + "learning_rate": 9.298706991014685e-06, + "logits/chosen": -0.4729032516479492, + "logits/rejected": -0.5640527606010437, + "logps/chosen": -238.87527465820312, + "logps/rejected": -276.45318603515625, + "loss": 0.0358, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -4.509866237640381, + "rewards/margins": 8.078734397888184, + "rewards/rejected": -12.588600158691406, + "step": 790 + }, + { + "epoch": 0.26, + "grad_norm": 0.01925979182124138, + "learning_rate": 9.287749287749288e-06, + "logits/chosen": -0.42346763610839844, + "logits/rejected": -0.5876745581626892, + "logps/chosen": -217.486572265625, + "logps/rejected": -251.65774536132812, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6943228244781494, + "rewards/margins": 9.3502197265625, + "rewards/rejected": -12.044544219970703, + "step": 800 + }, + { + "epoch": 0.26, + "grad_norm": 0.13818100094795227, + "learning_rate": 9.276791584483894e-06, + "logits/chosen": -0.4230351448059082, + "logits/rejected": -0.5674090385437012, + "logps/chosen": -217.064453125, + "logps/rejected": -245.08786010742188, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9776084423065186, + "rewards/margins": 8.626860618591309, + "rewards/rejected": -12.604470252990723, + "step": 810 + }, + { + "epoch": 0.27, + "grad_norm": 0.011727742850780487, + "learning_rate": 9.265833881218498e-06, + "logits/chosen": -0.4069291055202484, + "logits/rejected": -0.5598689913749695, + "logps/chosen": -172.05592346191406, + "logps/rejected": -200.79318237304688, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5719494819641113, + "rewards/margins": 7.285019874572754, + "rewards/rejected": -9.856969833374023, + "step": 820 + }, + { + "epoch": 0.27, + "grad_norm": 1.0111483335494995, + "learning_rate": 9.254876177953102e-06, + "logits/chosen": -0.4388393759727478, + "logits/rejected": -0.5350168347358704, + "logps/chosen": -306.37823486328125, + "logps/rejected": -342.58935546875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.371355056762695, + "rewards/margins": 10.12095832824707, + "rewards/rejected": -16.492313385009766, + "step": 830 + }, + { + "epoch": 0.27, + "grad_norm": 0.0097459452226758, + "learning_rate": 9.243918474687706e-06, + "logits/chosen": -0.3321714401245117, + "logits/rejected": -0.5004499554634094, + "logps/chosen": -181.58326721191406, + "logps/rejected": -213.7935333251953, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.546564817428589, + "rewards/margins": 8.579878807067871, + "rewards/rejected": -11.126443862915039, + "step": 840 + }, + { + "epoch": 0.27, + "grad_norm": 0.0109526002779603, + "learning_rate": 9.23296077142231e-06, + "logits/chosen": -0.4445480704307556, + "logits/rejected": -0.5257662534713745, + "logps/chosen": -201.4680938720703, + "logps/rejected": -272.65118408203125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2386157512664795, + "rewards/margins": 10.427217483520508, + "rewards/rejected": -13.665834426879883, + "step": 850 + }, + { + "epoch": 0.28, + "grad_norm": 4.875131607055664, + "learning_rate": 9.222003068156915e-06, + "logits/chosen": -0.4273023009300232, + "logits/rejected": -0.5373457670211792, + "logps/chosen": -263.1457824707031, + "logps/rejected": -278.6871032714844, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.379282474517822, + "rewards/margins": 8.077393531799316, + "rewards/rejected": -12.456674575805664, + "step": 860 + }, + { + "epoch": 0.28, + "grad_norm": 0.012556626461446285, + "learning_rate": 9.21104536489152e-06, + "logits/chosen": -0.2668747007846832, + "logits/rejected": -0.44963914155960083, + "logps/chosen": -210.3493194580078, + "logps/rejected": -252.1433563232422, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3864264488220215, + "rewards/margins": 8.853643417358398, + "rewards/rejected": -12.240070343017578, + "step": 870 + }, + { + "epoch": 0.28, + "grad_norm": 0.024654850363731384, + "learning_rate": 9.200087661626124e-06, + "logits/chosen": -0.46519985795021057, + "logits/rejected": -0.5054847002029419, + "logps/chosen": -242.70095825195312, + "logps/rejected": -250.40383911132812, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.888115406036377, + "rewards/margins": 9.060136795043945, + "rewards/rejected": -12.948251724243164, + "step": 880 + }, + { + "epoch": 0.29, + "grad_norm": 0.0028619980439543724, + "learning_rate": 9.189129958360728e-06, + "logits/chosen": -0.39419493079185486, + "logits/rejected": -0.4969407916069031, + "logps/chosen": -210.2559814453125, + "logps/rejected": -268.48870849609375, + "loss": 0.0542, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -4.357173442840576, + "rewards/margins": 9.57945442199707, + "rewards/rejected": -13.936625480651855, + "step": 890 + }, + { + "epoch": 0.29, + "grad_norm": 0.5537806153297424, + "learning_rate": 9.178172255095332e-06, + "logits/chosen": -0.36690598726272583, + "logits/rejected": -0.5239487886428833, + "logps/chosen": -189.4766082763672, + "logps/rejected": -234.23739624023438, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8814499378204346, + "rewards/margins": 8.542583465576172, + "rewards/rejected": -12.424032211303711, + "step": 900 + }, + { + "epoch": 0.29, + "grad_norm": 0.0037552732974290848, + "learning_rate": 9.167214551829936e-06, + "logits/chosen": -0.2370346486568451, + "logits/rejected": -0.35934606194496155, + "logps/chosen": -254.87759399414062, + "logps/rejected": -300.49700927734375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.661134243011475, + "rewards/margins": 9.538171768188477, + "rewards/rejected": -16.19930648803711, + "step": 910 + }, + { + "epoch": 0.3, + "grad_norm": 0.016242943704128265, + "learning_rate": 9.156256848564542e-06, + "logits/chosen": -0.3371526598930359, + "logits/rejected": -0.4652339518070221, + "logps/chosen": -173.8836212158203, + "logps/rejected": -223.115478515625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5612175464630127, + "rewards/margins": 9.151124954223633, + "rewards/rejected": -12.712343215942383, + "step": 920 + }, + { + "epoch": 0.3, + "grad_norm": 0.024046355858445168, + "learning_rate": 9.145299145299145e-06, + "logits/chosen": -0.3485228419303894, + "logits/rejected": -0.3939455449581146, + "logps/chosen": -290.3489990234375, + "logps/rejected": -343.43817138671875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7552947998046875, + "rewards/margins": 10.854909896850586, + "rewards/rejected": -18.610204696655273, + "step": 930 + }, + { + "epoch": 0.3, + "grad_norm": 0.046042028814554214, + "learning_rate": 9.134341442033751e-06, + "logits/chosen": -0.31552404165267944, + "logits/rejected": -0.4237368106842041, + "logps/chosen": -241.5800323486328, + "logps/rejected": -283.7861328125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1691718101501465, + "rewards/margins": 9.845176696777344, + "rewards/rejected": -15.014348030090332, + "step": 940 + }, + { + "epoch": 0.31, + "grad_norm": 2.655742883682251, + "learning_rate": 9.123383738768354e-06, + "logits/chosen": -0.28424203395843506, + "logits/rejected": -0.40134358406066895, + "logps/chosen": -250.8709716796875, + "logps/rejected": -309.8855285644531, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.845712184906006, + "rewards/margins": 10.40029239654541, + "rewards/rejected": -16.24600601196289, + "step": 950 + }, + { + "epoch": 0.31, + "grad_norm": 0.00028849352383986115, + "learning_rate": 9.11242603550296e-06, + "logits/chosen": -0.2519022524356842, + "logits/rejected": -0.3044319748878479, + "logps/chosen": -256.7398986816406, + "logps/rejected": -308.69329833984375, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2683329582214355, + "rewards/margins": 10.402109146118164, + "rewards/rejected": -16.670442581176758, + "step": 960 + }, + { + "epoch": 0.31, + "grad_norm": 0.0053086057305336, + "learning_rate": 9.101468332237564e-06, + "logits/chosen": -0.2518306076526642, + "logits/rejected": -0.3324928879737854, + "logps/chosen": -247.19223022460938, + "logps/rejected": -299.892822265625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.235478401184082, + "rewards/margins": 10.305525779724121, + "rewards/rejected": -17.541004180908203, + "step": 970 + }, + { + "epoch": 0.32, + "grad_norm": 0.054621316492557526, + "learning_rate": 9.090510628972168e-06, + "logits/chosen": -0.18655958771705627, + "logits/rejected": -0.30609598755836487, + "logps/chosen": -212.97329711914062, + "logps/rejected": -268.6554260253906, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.168915748596191, + "rewards/margins": 10.391229629516602, + "rewards/rejected": -15.560145378112793, + "step": 980 + }, + { + "epoch": 0.32, + "grad_norm": 0.023384546861052513, + "learning_rate": 9.079552925706772e-06, + "logits/chosen": -0.3303782641887665, + "logits/rejected": -0.38056522607803345, + "logps/chosen": -213.8797149658203, + "logps/rejected": -288.4650573730469, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.234591484069824, + "rewards/margins": 11.60734748840332, + "rewards/rejected": -16.84193992614746, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 0.028016259893774986, + "learning_rate": 9.068595222441378e-06, + "logits/chosen": -0.3271247446537018, + "logits/rejected": -0.3924880623817444, + "logps/chosen": -315.4078369140625, + "logps/rejected": -360.6003112792969, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.352377891540527, + "rewards/margins": 10.600927352905273, + "rewards/rejected": -19.953306198120117, + "step": 1000 + }, + { + "epoch": 0.33, + "grad_norm": 0.039689064025878906, + "learning_rate": 9.057637519175981e-06, + "logits/chosen": -0.3742726743221283, + "logits/rejected": -0.5176577568054199, + "logps/chosen": -260.1962585449219, + "logps/rejected": -283.95538330078125, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.897824764251709, + "rewards/margins": 8.730276107788086, + "rewards/rejected": -16.628101348876953, + "step": 1010 + }, + { + "epoch": 0.33, + "grad_norm": 0.950749933719635, + "learning_rate": 9.046679815910587e-06, + "logits/chosen": -0.29500845074653625, + "logits/rejected": -0.342989444732666, + "logps/chosen": -235.64990234375, + "logps/rejected": -296.71636962890625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.87273645401001, + "rewards/margins": 11.365694046020508, + "rewards/rejected": -17.23843002319336, + "step": 1020 + }, + { + "epoch": 0.33, + "grad_norm": 0.7492702603340149, + "learning_rate": 9.03572211264519e-06, + "logits/chosen": -0.29514080286026, + "logits/rejected": -0.3748084008693695, + "logps/chosen": -318.6951904296875, + "logps/rejected": -366.6182556152344, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.613062858581543, + "rewards/margins": 12.006413459777832, + "rewards/rejected": -20.619478225708008, + "step": 1030 + }, + { + "epoch": 0.34, + "grad_norm": 0.025907784700393677, + "learning_rate": 9.024764409379796e-06, + "logits/chosen": -0.2838585674762726, + "logits/rejected": -0.290244460105896, + "logps/chosen": -312.25347900390625, + "logps/rejected": -396.3223571777344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.871122360229492, + "rewards/margins": 13.518135070800781, + "rewards/rejected": -23.38925552368164, + "step": 1040 + }, + { + "epoch": 0.34, + "grad_norm": 3.519346864777617e-05, + "learning_rate": 9.0138067061144e-06, + "logits/chosen": -0.33469122648239136, + "logits/rejected": -0.4082818627357483, + "logps/chosen": -285.1815185546875, + "logps/rejected": -362.09332275390625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.829544067382812, + "rewards/margins": 12.795137405395508, + "rewards/rejected": -21.624683380126953, + "step": 1050 + }, + { + "epoch": 0.34, + "grad_norm": 4.468189217732288e-05, + "learning_rate": 9.002849002849004e-06, + "logits/chosen": -0.1854289472103119, + "logits/rejected": -0.19475221633911133, + "logps/chosen": -247.6388397216797, + "logps/rejected": -315.0453186035156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3854804039001465, + "rewards/margins": 12.116926193237305, + "rewards/rejected": -19.50240707397461, + "step": 1060 + }, + { + "epoch": 0.35, + "grad_norm": 0.0009876694530248642, + "learning_rate": 8.991891299583608e-06, + "logits/chosen": -0.24933210015296936, + "logits/rejected": -0.28711989521980286, + "logps/chosen": -255.6931610107422, + "logps/rejected": -328.758056640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.750466346740723, + "rewards/margins": 12.743404388427734, + "rewards/rejected": -19.493867874145508, + "step": 1070 + }, + { + "epoch": 0.35, + "grad_norm": 14.395219802856445, + "learning_rate": 8.980933596318213e-06, + "logits/chosen": -0.28123170137405396, + "logits/rejected": -0.31940752267837524, + "logps/chosen": -244.03466796875, + "logps/rejected": -313.8268737792969, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.721076965332031, + "rewards/margins": 10.604429244995117, + "rewards/rejected": -18.32550811767578, + "step": 1080 + }, + { + "epoch": 0.35, + "grad_norm": 0.04704693332314491, + "learning_rate": 8.969975893052817e-06, + "logits/chosen": -0.28245311975479126, + "logits/rejected": -0.3764280378818512, + "logps/chosen": -248.2042694091797, + "logps/rejected": -286.869873046875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.773460388183594, + "rewards/margins": 10.64108943939209, + "rewards/rejected": -17.414548873901367, + "step": 1090 + }, + { + "epoch": 0.36, + "grad_norm": 0.00043834373354911804, + "learning_rate": 8.959018189787421e-06, + "logits/chosen": -0.20289742946624756, + "logits/rejected": -0.2996065020561218, + "logps/chosen": -297.94512939453125, + "logps/rejected": -347.0626525878906, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.10444164276123, + "rewards/margins": 12.04673957824707, + "rewards/rejected": -21.151180267333984, + "step": 1100 + }, + { + "epoch": 0.36, + "grad_norm": 0.12115999311208725, + "learning_rate": 8.948060486522026e-06, + "logits/chosen": -0.26539546251296997, + "logits/rejected": -0.33644038438796997, + "logps/chosen": -306.0853576660156, + "logps/rejected": -363.36517333984375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.828388214111328, + "rewards/margins": 11.639673233032227, + "rewards/rejected": -20.468059539794922, + "step": 1110 + }, + { + "epoch": 0.36, + "grad_norm": 0.01209923718124628, + "learning_rate": 8.93710278325663e-06, + "logits/chosen": -0.20330910384655, + "logits/rejected": -0.2543596625328064, + "logps/chosen": -267.58465576171875, + "logps/rejected": -339.16552734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.954190254211426, + "rewards/margins": 12.7907133102417, + "rewards/rejected": -20.744905471801758, + "step": 1120 + }, + { + "epoch": 0.37, + "grad_norm": 0.017156679183244705, + "learning_rate": 8.926145079991234e-06, + "logits/chosen": -0.21037821471691132, + "logits/rejected": -0.335681676864624, + "logps/chosen": -238.2596435546875, + "logps/rejected": -287.96832275390625, + "loss": 0.1154, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -6.764012813568115, + "rewards/margins": 10.85376262664795, + "rewards/rejected": -17.617774963378906, + "step": 1130 + }, + { + "epoch": 0.37, + "grad_norm": 0.01817765086889267, + "learning_rate": 8.915187376725838e-06, + "logits/chosen": -0.27532559633255005, + "logits/rejected": -0.3343648314476013, + "logps/chosen": -237.79660034179688, + "logps/rejected": -278.28424072265625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.450657844543457, + "rewards/margins": 9.784764289855957, + "rewards/rejected": -16.235422134399414, + "step": 1140 + }, + { + "epoch": 0.37, + "grad_norm": 0.3464580178260803, + "learning_rate": 8.904229673460444e-06, + "logits/chosen": -0.20038633048534393, + "logits/rejected": -0.34763047099113464, + "logps/chosen": -264.88238525390625, + "logps/rejected": -303.27056884765625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.401070594787598, + "rewards/margins": 10.594339370727539, + "rewards/rejected": -16.995410919189453, + "step": 1150 + }, + { + "epoch": 0.38, + "grad_norm": 0.010944131761789322, + "learning_rate": 8.893271970195047e-06, + "logits/chosen": -0.3168713450431824, + "logits/rejected": -0.3542029857635498, + "logps/chosen": -245.4883270263672, + "logps/rejected": -310.0267639160156, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.169398307800293, + "rewards/margins": 11.141454696655273, + "rewards/rejected": -17.310853958129883, + "step": 1160 + }, + { + "epoch": 0.38, + "grad_norm": 0.012095646932721138, + "learning_rate": 8.882314266929653e-06, + "logits/chosen": -0.2291782796382904, + "logits/rejected": -0.34653568267822266, + "logps/chosen": -222.2202606201172, + "logps/rejected": -258.69512939453125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.978119850158691, + "rewards/margins": 9.679037094116211, + "rewards/rejected": -14.657157897949219, + "step": 1170 + }, + { + "epoch": 0.38, + "grad_norm": 0.14866305887699127, + "learning_rate": 8.871356563664255e-06, + "logits/chosen": -0.21821892261505127, + "logits/rejected": -0.28071773052215576, + "logps/chosen": -199.72018432617188, + "logps/rejected": -272.10589599609375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8853073120117188, + "rewards/margins": 11.493281364440918, + "rewards/rejected": -15.37859058380127, + "step": 1180 + }, + { + "epoch": 0.38, + "grad_norm": 10.241555213928223, + "learning_rate": 8.860398860398861e-06, + "logits/chosen": -0.13861140608787537, + "logits/rejected": -0.20187318325042725, + "logps/chosen": -193.94729614257812, + "logps/rejected": -244.1735076904297, + "loss": 0.183, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -4.192530632019043, + "rewards/margins": 9.562729835510254, + "rewards/rejected": -13.755261421203613, + "step": 1190 + }, + { + "epoch": 0.39, + "grad_norm": 0.001308279111981392, + "learning_rate": 8.849441157133466e-06, + "logits/chosen": -0.2188553512096405, + "logits/rejected": -0.27837082743644714, + "logps/chosen": -256.08392333984375, + "logps/rejected": -314.3865661621094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.751282691955566, + "rewards/margins": 11.724446296691895, + "rewards/rejected": -16.47572898864746, + "step": 1200 + }, + { + "epoch": 0.39, + "grad_norm": 0.0011354751186445355, + "learning_rate": 8.83848345386807e-06, + "logits/chosen": -0.33862900733947754, + "logits/rejected": -0.3313067555427551, + "logps/chosen": -259.02044677734375, + "logps/rejected": -325.8959655761719, + "loss": 0.0244, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -5.828982830047607, + "rewards/margins": 12.15399169921875, + "rewards/rejected": -17.982975006103516, + "step": 1210 + }, + { + "epoch": 0.39, + "grad_norm": 0.022255534306168556, + "learning_rate": 8.827525750602674e-06, + "logits/chosen": -0.1525467336177826, + "logits/rejected": -0.30764085054397583, + "logps/chosen": -164.30361938476562, + "logps/rejected": -220.8531494140625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6489479541778564, + "rewards/margins": 9.622652053833008, + "rewards/rejected": -13.271600723266602, + "step": 1220 + }, + { + "epoch": 0.4, + "grad_norm": 0.00422575231641531, + "learning_rate": 8.816568047337279e-06, + "logits/chosen": -0.3136499524116516, + "logits/rejected": -0.3792189657688141, + "logps/chosen": -228.77914428710938, + "logps/rejected": -287.58441162109375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.276397705078125, + "rewards/margins": 11.693455696105957, + "rewards/rejected": -16.969852447509766, + "step": 1230 + }, + { + "epoch": 0.4, + "grad_norm": 0.0019199317321181297, + "learning_rate": 8.805610344071883e-06, + "logits/chosen": -0.26560407876968384, + "logits/rejected": -0.3271743655204773, + "logps/chosen": -268.156982421875, + "logps/rejected": -299.9332580566406, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.225447654724121, + "rewards/margins": 10.738948822021484, + "rewards/rejected": -16.964397430419922, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 0.006867074873298407, + "learning_rate": 8.794652640806489e-06, + "logits/chosen": -0.15553151071071625, + "logits/rejected": -0.21395400166511536, + "logps/chosen": -241.8409423828125, + "logps/rejected": -289.00848388671875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.778923988342285, + "rewards/margins": 9.904958724975586, + "rewards/rejected": -16.683881759643555, + "step": 1250 + }, + { + "epoch": 0.41, + "grad_norm": 0.009473240934312344, + "learning_rate": 8.783694937541091e-06, + "logits/chosen": -0.23859646916389465, + "logits/rejected": -0.27141764760017395, + "logps/chosen": -215.42276000976562, + "logps/rejected": -278.4677429199219, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.839932441711426, + "rewards/margins": 11.050471305847168, + "rewards/rejected": -16.890405654907227, + "step": 1260 + }, + { + "epoch": 0.41, + "grad_norm": 0.0002466948644723743, + "learning_rate": 8.772737234275697e-06, + "logits/chosen": -0.26885563135147095, + "logits/rejected": -0.31586629152297974, + "logps/chosen": -248.76797485351562, + "logps/rejected": -342.48406982421875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.838107109069824, + "rewards/margins": 14.081052780151367, + "rewards/rejected": -20.919160842895508, + "step": 1270 + }, + { + "epoch": 0.41, + "grad_norm": 0.009549058973789215, + "learning_rate": 8.7617795310103e-06, + "logits/chosen": -0.10880441963672638, + "logits/rejected": -0.16234132647514343, + "logps/chosen": -250.2044677734375, + "logps/rejected": -319.41778564453125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.536848545074463, + "rewards/margins": 12.7142972946167, + "rewards/rejected": -20.251144409179688, + "step": 1280 + }, + { + "epoch": 0.42, + "grad_norm": 0.006435507442802191, + "learning_rate": 8.750821827744906e-06, + "logits/chosen": -0.18445457518100739, + "logits/rejected": -0.18460145592689514, + "logps/chosen": -262.4994201660156, + "logps/rejected": -336.7874755859375, + "loss": 0.0339, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -7.42992639541626, + "rewards/margins": 12.39716911315918, + "rewards/rejected": -19.82709503173828, + "step": 1290 + }, + { + "epoch": 0.42, + "grad_norm": 0.23167112469673157, + "learning_rate": 8.73986412447951e-06, + "logits/chosen": -0.14932586252689362, + "logits/rejected": -0.19626209139823914, + "logps/chosen": -247.1484832763672, + "logps/rejected": -318.3656921386719, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.087119102478027, + "rewards/margins": 11.31184196472168, + "rewards/rejected": -18.39896011352539, + "step": 1300 + }, + { + "epoch": 0.42, + "grad_norm": 0.0005600708536803722, + "learning_rate": 8.728906421214115e-06, + "logits/chosen": -0.1394408792257309, + "logits/rejected": -0.14028649032115936, + "logps/chosen": -289.9681701660156, + "logps/rejected": -364.3168640136719, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.336925506591797, + "rewards/margins": 12.717875480651855, + "rewards/rejected": -22.054800033569336, + "step": 1310 + }, + { + "epoch": 0.43, + "grad_norm": 0.14742839336395264, + "learning_rate": 8.717948717948719e-06, + "logits/chosen": -0.06897391378879547, + "logits/rejected": -0.10997577756643295, + "logps/chosen": -258.9363708496094, + "logps/rejected": -328.3492736816406, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.73633098602295, + "rewards/margins": 11.629599571228027, + "rewards/rejected": -20.365930557250977, + "step": 1320 + }, + { + "epoch": 0.43, + "grad_norm": 0.0032621161080896854, + "learning_rate": 8.706991014683323e-06, + "logits/chosen": 0.09344655275344849, + "logits/rejected": 0.04874902218580246, + "logps/chosen": -256.26904296875, + "logps/rejected": -334.4007263183594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.622756958007812, + "rewards/margins": 12.946261405944824, + "rewards/rejected": -21.569019317626953, + "step": 1330 + }, + { + "epoch": 0.43, + "grad_norm": 0.00016526717809028924, + "learning_rate": 8.696033311417927e-06, + "logits/chosen": -0.011213278397917747, + "logits/rejected": -0.029890483245253563, + "logps/chosen": -285.08502197265625, + "logps/rejected": -352.0074462890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.995809555053711, + "rewards/margins": 12.493630409240723, + "rewards/rejected": -22.489439010620117, + "step": 1340 + }, + { + "epoch": 0.44, + "grad_norm": 0.0005608515930362046, + "learning_rate": 8.685075608152532e-06, + "logits/chosen": -0.0565456822514534, + "logits/rejected": -0.04827792942523956, + "logps/chosen": -273.8840637207031, + "logps/rejected": -346.35858154296875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.661928176879883, + "rewards/margins": 12.965700149536133, + "rewards/rejected": -21.627628326416016, + "step": 1350 + }, + { + "epoch": 0.44, + "grad_norm": 0.2561068832874298, + "learning_rate": 8.674117904887136e-06, + "logits/chosen": -0.14858858287334442, + "logits/rejected": -0.1448180228471756, + "logps/chosen": -297.3650207519531, + "logps/rejected": -404.10540771484375, + "loss": 0.0238, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -10.200472831726074, + "rewards/margins": 15.200775146484375, + "rewards/rejected": -25.401248931884766, + "step": 1360 + }, + { + "epoch": 0.44, + "grad_norm": 0.6736346483230591, + "learning_rate": 8.66316020162174e-06, + "logits/chosen": 0.014827290549874306, + "logits/rejected": 0.013812633231282234, + "logps/chosen": -257.91583251953125, + "logps/rejected": -342.5897216796875, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.798688888549805, + "rewards/margins": 14.296686172485352, + "rewards/rejected": -23.095375061035156, + "step": 1370 + }, + { + "epoch": 0.45, + "grad_norm": 5.739891093980987e-06, + "learning_rate": 8.652202498356346e-06, + "logits/chosen": -0.004905500914901495, + "logits/rejected": 0.008281905204057693, + "logps/chosen": -284.5867614746094, + "logps/rejected": -372.309326171875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.582194328308105, + "rewards/margins": 13.63042163848877, + "rewards/rejected": -24.212615966796875, + "step": 1380 + }, + { + "epoch": 0.45, + "grad_norm": 10.869261741638184, + "learning_rate": 8.641244795090949e-06, + "logits/chosen": -0.06507638841867447, + "logits/rejected": -0.0575793981552124, + "logps/chosen": -294.3569641113281, + "logps/rejected": -353.50250244140625, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.367159843444824, + "rewards/margins": 11.184629440307617, + "rewards/rejected": -22.551788330078125, + "step": 1390 + }, + { + "epoch": 0.45, + "grad_norm": 0.0022530544083565474, + "learning_rate": 8.630287091825555e-06, + "logits/chosen": 0.012960417196154594, + "logits/rejected": -0.018600907176733017, + "logps/chosen": -246.05801391601562, + "logps/rejected": -325.35577392578125, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.39463996887207, + "rewards/margins": 12.46562385559082, + "rewards/rejected": -20.86026382446289, + "step": 1400 + }, + { + "epoch": 0.46, + "grad_norm": 0.0010853647254407406, + "learning_rate": 8.619329388560157e-06, + "logits/chosen": -0.03639475628733635, + "logits/rejected": 0.0061216773465275764, + "logps/chosen": -252.0388641357422, + "logps/rejected": -349.8416442871094, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.54641056060791, + "rewards/margins": 13.886810302734375, + "rewards/rejected": -22.4332218170166, + "step": 1410 + }, + { + "epoch": 0.46, + "grad_norm": 0.5238139033317566, + "learning_rate": 8.608371685294763e-06, + "logits/chosen": -0.11145244538784027, + "logits/rejected": -0.12166018784046173, + "logps/chosen": -252.606689453125, + "logps/rejected": -314.7965087890625, + "loss": 0.1077, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -7.48656702041626, + "rewards/margins": 11.383016586303711, + "rewards/rejected": -18.869583129882812, + "step": 1420 + }, + { + "epoch": 0.46, + "grad_norm": 0.383899062871933, + "learning_rate": 8.597413982029368e-06, + "logits/chosen": -0.0349181704223156, + "logits/rejected": -0.01999412663280964, + "logps/chosen": -250.32009887695312, + "logps/rejected": -321.7938537597656, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.556001663208008, + "rewards/margins": 11.27842903137207, + "rewards/rejected": -19.834430694580078, + "step": 1430 + }, + { + "epoch": 0.47, + "grad_norm": 0.03882277384400368, + "learning_rate": 8.586456278763972e-06, + "logits/chosen": -0.041800715029239655, + "logits/rejected": 0.028328755870461464, + "logps/chosen": -264.7074890136719, + "logps/rejected": -359.95001220703125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.131043434143066, + "rewards/margins": 14.964799880981445, + "rewards/rejected": -24.095842361450195, + "step": 1440 + }, + { + "epoch": 0.47, + "grad_norm": 0.005863294005393982, + "learning_rate": 8.575498575498576e-06, + "logits/chosen": -0.008469844236969948, + "logits/rejected": 0.002064249012619257, + "logps/chosen": -189.87020874023438, + "logps/rejected": -289.1915588378906, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.485939979553223, + "rewards/margins": 12.884135246276855, + "rewards/rejected": -18.370075225830078, + "step": 1450 + }, + { + "epoch": 0.47, + "grad_norm": 0.09647411108016968, + "learning_rate": 8.56454087223318e-06, + "logits/chosen": 0.015825632959604263, + "logits/rejected": -0.02976151742041111, + "logps/chosen": -366.56439208984375, + "logps/rejected": -430.0389709472656, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.442001342773438, + "rewards/margins": 12.91467571258545, + "rewards/rejected": -26.356678009033203, + "step": 1460 + }, + { + "epoch": 0.48, + "grad_norm": 0.10359703004360199, + "learning_rate": 8.553583168967785e-06, + "logits/chosen": -0.09159889072179794, + "logits/rejected": -0.06940022855997086, + "logps/chosen": -315.4118347167969, + "logps/rejected": -377.8108215332031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.500893592834473, + "rewards/margins": 13.29771614074707, + "rewards/rejected": -23.79861068725586, + "step": 1470 + }, + { + "epoch": 0.48, + "grad_norm": 0.011625263839960098, + "learning_rate": 8.54262546570239e-06, + "logits/chosen": -0.1008826494216919, + "logits/rejected": -0.05810718610882759, + "logps/chosen": -264.00909423828125, + "logps/rejected": -356.61163330078125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.526174545288086, + "rewards/margins": 14.310162544250488, + "rewards/rejected": -23.836336135864258, + "step": 1480 + }, + { + "epoch": 0.48, + "grad_norm": 0.024129299446940422, + "learning_rate": 8.531667762436993e-06, + "logits/chosen": 0.0249390359967947, + "logits/rejected": 0.004033858422189951, + "logps/chosen": -295.7063903808594, + "logps/rejected": -394.6076965332031, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.936558723449707, + "rewards/margins": 14.934048652648926, + "rewards/rejected": -25.87060546875, + "step": 1490 + }, + { + "epoch": 0.49, + "grad_norm": 0.006154273636639118, + "learning_rate": 8.5207100591716e-06, + "logits/chosen": 0.04360217973589897, + "logits/rejected": 0.03041175566613674, + "logps/chosen": -236.54794311523438, + "logps/rejected": -317.7098693847656, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.552594184875488, + "rewards/margins": 11.904518127441406, + "rewards/rejected": -20.457111358642578, + "step": 1500 + }, + { + "epoch": 0.49, + "grad_norm": 0.3130321502685547, + "learning_rate": 8.509752355906202e-06, + "logits/chosen": -0.07926555722951889, + "logits/rejected": -0.02901688776910305, + "logps/chosen": -248.68408203125, + "logps/rejected": -326.94940185546875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.911336898803711, + "rewards/margins": 13.27922534942627, + "rewards/rejected": -22.190561294555664, + "step": 1510 + }, + { + "epoch": 0.49, + "grad_norm": 0.09558708965778351, + "learning_rate": 8.498794652640808e-06, + "logits/chosen": -0.0705290287733078, + "logits/rejected": -0.06556431949138641, + "logps/chosen": -289.7066955566406, + "logps/rejected": -359.8416748046875, + "loss": 0.0249, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -10.718439102172852, + "rewards/margins": 12.26536750793457, + "rewards/rejected": -22.983808517456055, + "step": 1520 + }, + { + "epoch": 0.49, + "grad_norm": 0.02135203592479229, + "learning_rate": 8.487836949375412e-06, + "logits/chosen": 0.08843693137168884, + "logits/rejected": 0.10300026834011078, + "logps/chosen": -283.0185546875, + "logps/rejected": -395.1629333496094, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.282575607299805, + "rewards/margins": 15.90794563293457, + "rewards/rejected": -26.190521240234375, + "step": 1530 + }, + { + "epoch": 0.5, + "grad_norm": 0.02689371071755886, + "learning_rate": 8.476879246110016e-06, + "logits/chosen": -0.054816532880067825, + "logits/rejected": -0.028935739770531654, + "logps/chosen": -340.44329833984375, + "logps/rejected": -424.52105712890625, + "loss": 0.0361, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -13.073100090026855, + "rewards/margins": 14.94751262664795, + "rewards/rejected": -28.020618438720703, + "step": 1540 + }, + { + "epoch": 0.5, + "grad_norm": 0.005785965360701084, + "learning_rate": 8.46592154284462e-06, + "logits/chosen": -0.014959866181015968, + "logits/rejected": 0.0628860592842102, + "logps/chosen": -238.8596954345703, + "logps/rejected": -339.1539611816406, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.337881088256836, + "rewards/margins": 14.204916000366211, + "rewards/rejected": -23.542797088623047, + "step": 1550 + }, + { + "epoch": 0.5, + "grad_norm": 0.1502775102853775, + "learning_rate": 8.454963839579225e-06, + "logits/chosen": 0.05573273450136185, + "logits/rejected": 0.09205415844917297, + "logps/chosen": -303.62701416015625, + "logps/rejected": -364.28204345703125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.095705032348633, + "rewards/margins": 12.889692306518555, + "rewards/rejected": -24.985401153564453, + "step": 1560 + }, + { + "epoch": 0.51, + "grad_norm": 0.01375632081180811, + "learning_rate": 8.44400613631383e-06, + "logits/chosen": -0.0028796226251870394, + "logits/rejected": 0.055630385875701904, + "logps/chosen": -247.17288208007812, + "logps/rejected": -334.44195556640625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.552030563354492, + "rewards/margins": 12.745870590209961, + "rewards/rejected": -21.297901153564453, + "step": 1570 + }, + { + "epoch": 0.51, + "grad_norm": 0.015819918364286423, + "learning_rate": 8.433048433048434e-06, + "logits/chosen": 0.029922613874077797, + "logits/rejected": 0.1054656133055687, + "logps/chosen": -290.9429626464844, + "logps/rejected": -389.000732421875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.37781810760498, + "rewards/margins": 14.447565078735352, + "rewards/rejected": -25.82538414001465, + "step": 1580 + }, + { + "epoch": 0.51, + "grad_norm": 0.06199351325631142, + "learning_rate": 8.422090729783038e-06, + "logits/chosen": 0.034867942333221436, + "logits/rejected": 0.1408187299966812, + "logps/chosen": -304.77081298828125, + "logps/rejected": -402.5494079589844, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.836946487426758, + "rewards/margins": 15.10669994354248, + "rewards/rejected": -27.943645477294922, + "step": 1590 + }, + { + "epoch": 0.52, + "grad_norm": 0.011423285119235516, + "learning_rate": 8.411133026517642e-06, + "logits/chosen": 0.07202502340078354, + "logits/rejected": 0.15315476059913635, + "logps/chosen": -314.509033203125, + "logps/rejected": -408.44647216796875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.762028694152832, + "rewards/margins": 14.413556098937988, + "rewards/rejected": -27.175586700439453, + "step": 1600 + }, + { + "epoch": 0.52, + "grad_norm": 0.015212434343993664, + "learning_rate": 8.400175323252246e-06, + "logits/chosen": 0.07158254086971283, + "logits/rejected": 0.12116159498691559, + "logps/chosen": -301.7251892089844, + "logps/rejected": -424.9703674316406, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.060445785522461, + "rewards/margins": 15.829069137573242, + "rewards/rejected": -29.889516830444336, + "step": 1610 + }, + { + "epoch": 0.52, + "grad_norm": 0.00037714597419835627, + "learning_rate": 8.38921761998685e-06, + "logits/chosen": 0.07507513463497162, + "logits/rejected": 0.1150326281785965, + "logps/chosen": -228.2713165283203, + "logps/rejected": -332.7132263183594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.718751907348633, + "rewards/margins": 14.904324531555176, + "rewards/rejected": -23.623075485229492, + "step": 1620 + }, + { + "epoch": 0.53, + "grad_norm": 0.00014221732271835208, + "learning_rate": 8.378259916721457e-06, + "logits/chosen": 0.19829820096492767, + "logits/rejected": 0.1768883615732193, + "logps/chosen": -286.7662048339844, + "logps/rejected": -363.81463623046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.576507568359375, + "rewards/margins": 13.473172187805176, + "rewards/rejected": -26.049678802490234, + "step": 1630 + }, + { + "epoch": 0.53, + "grad_norm": 0.2192688137292862, + "learning_rate": 8.36730221345606e-06, + "logits/chosen": 0.1942572295665741, + "logits/rejected": 0.22028391063213348, + "logps/chosen": -301.4276123046875, + "logps/rejected": -383.99432373046875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.628840446472168, + "rewards/margins": 13.160855293273926, + "rewards/rejected": -26.789697647094727, + "step": 1640 + }, + { + "epoch": 0.53, + "grad_norm": 0.00010946116526611149, + "learning_rate": 8.356344510190665e-06, + "logits/chosen": 0.10659674555063248, + "logits/rejected": 0.1302722990512848, + "logps/chosen": -402.503173828125, + "logps/rejected": -484.22589111328125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.998889923095703, + "rewards/margins": 16.14960479736328, + "rewards/rejected": -33.148494720458984, + "step": 1650 + }, + { + "epoch": 0.54, + "grad_norm": 0.017199842259287834, + "learning_rate": 8.34538680692527e-06, + "logits/chosen": 0.07519405335187912, + "logits/rejected": 0.15728269517421722, + "logps/chosen": -284.95184326171875, + "logps/rejected": -384.1084289550781, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.825132369995117, + "rewards/margins": 12.739798545837402, + "rewards/rejected": -26.564931869506836, + "step": 1660 + }, + { + "epoch": 0.54, + "grad_norm": 0.01719660870730877, + "learning_rate": 8.334429103659874e-06, + "logits/chosen": 0.07660949975252151, + "logits/rejected": 0.10592161118984222, + "logps/chosen": -231.68411254882812, + "logps/rejected": -322.4700012207031, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.59976863861084, + "rewards/margins": 12.979411125183105, + "rewards/rejected": -22.579181671142578, + "step": 1670 + }, + { + "epoch": 0.54, + "grad_norm": 0.012668246403336525, + "learning_rate": 8.323471400394478e-06, + "logits/chosen": 0.11749809980392456, + "logits/rejected": 0.1774546504020691, + "logps/chosen": -322.8683776855469, + "logps/rejected": -432.86553955078125, + "loss": 0.0232, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -12.755744934082031, + "rewards/margins": 16.210979461669922, + "rewards/rejected": -28.966724395751953, + "step": 1680 + }, + { + "epoch": 0.55, + "grad_norm": 0.007913627661764622, + "learning_rate": 8.312513697129082e-06, + "logits/chosen": 0.11255357414484024, + "logits/rejected": 0.1525614708662033, + "logps/chosen": -303.31060791015625, + "logps/rejected": -404.16766357421875, + "loss": 0.0255, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -12.78972339630127, + "rewards/margins": 14.71977424621582, + "rewards/rejected": -27.509496688842773, + "step": 1690 + }, + { + "epoch": 0.55, + "grad_norm": 7.69473408581689e-05, + "learning_rate": 8.301555993863687e-06, + "logits/chosen": 0.07176389545202255, + "logits/rejected": 0.14741338789463043, + "logps/chosen": -381.74859619140625, + "logps/rejected": -479.43280029296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.687698364257812, + "rewards/margins": 17.027114868164062, + "rewards/rejected": -32.71481704711914, + "step": 1700 + }, + { + "epoch": 0.55, + "grad_norm": 0.020641475915908813, + "learning_rate": 8.290598290598293e-06, + "logits/chosen": 0.027815943583846092, + "logits/rejected": 0.06404396146535873, + "logps/chosen": -326.9677734375, + "logps/rejected": -426.5794982910156, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.755020141601562, + "rewards/margins": 15.451559066772461, + "rewards/rejected": -29.206579208374023, + "step": 1710 + }, + { + "epoch": 0.56, + "grad_norm": 1.0741193818830652e-06, + "learning_rate": 8.279640587332895e-06, + "logits/chosen": -0.04963821545243263, + "logits/rejected": 0.060973964631557465, + "logps/chosen": -282.719482421875, + "logps/rejected": -397.8350830078125, + "loss": 0.0378, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -10.77167797088623, + "rewards/margins": 16.46950912475586, + "rewards/rejected": -27.241186141967773, + "step": 1720 + }, + { + "epoch": 0.56, + "grad_norm": 0.0007996432832442224, + "learning_rate": 8.268682884067501e-06, + "logits/chosen": 0.0820159837603569, + "logits/rejected": 0.05762636661529541, + "logps/chosen": -317.8184509277344, + "logps/rejected": -399.0465087890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.283085823059082, + "rewards/margins": 14.6398286819458, + "rewards/rejected": -26.922916412353516, + "step": 1730 + }, + { + "epoch": 0.56, + "grad_norm": 0.0067008682526648045, + "learning_rate": 8.257725180802104e-06, + "logits/chosen": 0.03673550486564636, + "logits/rejected": 0.06919295340776443, + "logps/chosen": -289.0475769042969, + "logps/rejected": -387.4378662109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.43751049041748, + "rewards/margins": 16.087310791015625, + "rewards/rejected": -25.52482032775879, + "step": 1740 + }, + { + "epoch": 0.57, + "grad_norm": 1.0896865129470825, + "learning_rate": 8.24676747753671e-06, + "logits/chosen": 0.10127731412649155, + "logits/rejected": 0.16299596428871155, + "logps/chosen": -288.1521911621094, + "logps/rejected": -375.989501953125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.477235794067383, + "rewards/margins": 15.621920585632324, + "rewards/rejected": -25.09915542602539, + "step": 1750 + }, + { + "epoch": 0.57, + "grad_norm": 0.21109241247177124, + "learning_rate": 8.235809774271314e-06, + "logits/chosen": 0.008235934190452099, + "logits/rejected": 0.028172463178634644, + "logps/chosen": -295.41619873046875, + "logps/rejected": -404.15533447265625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.374763488769531, + "rewards/margins": 15.838516235351562, + "rewards/rejected": -27.21327781677246, + "step": 1760 + }, + { + "epoch": 0.57, + "grad_norm": 0.341794490814209, + "learning_rate": 8.224852071005918e-06, + "logits/chosen": 0.1264527142047882, + "logits/rejected": 0.15065120160579681, + "logps/chosen": -292.97430419921875, + "logps/rejected": -380.7444152832031, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.181876182556152, + "rewards/margins": 14.194729804992676, + "rewards/rejected": -25.376605987548828, + "step": 1770 + }, + { + "epoch": 0.58, + "grad_norm": 0.007328469771891832, + "learning_rate": 8.213894367740523e-06, + "logits/chosen": 0.023362448439002037, + "logits/rejected": 0.03181435540318489, + "logps/chosen": -292.8009948730469, + "logps/rejected": -382.83343505859375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.59870719909668, + "rewards/margins": 12.62264347076416, + "rewards/rejected": -24.221351623535156, + "step": 1780 + }, + { + "epoch": 0.58, + "grad_norm": 0.27736029028892517, + "learning_rate": 8.202936664475127e-06, + "logits/chosen": -0.05786416679620743, + "logits/rejected": 0.0343763530254364, + "logps/chosen": -281.7183532714844, + "logps/rejected": -384.5142822265625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.055322647094727, + "rewards/margins": 14.507980346679688, + "rewards/rejected": -25.56330108642578, + "step": 1790 + }, + { + "epoch": 0.58, + "grad_norm": 0.00044713946408592165, + "learning_rate": 8.191978961209731e-06, + "logits/chosen": 0.04160480573773384, + "logits/rejected": 0.15656664967536926, + "logps/chosen": -255.13864135742188, + "logps/rejected": -372.533935546875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.455907821655273, + "rewards/margins": 16.43738555908203, + "rewards/rejected": -25.893295288085938, + "step": 1800 + }, + { + "epoch": 0.59, + "grad_norm": 0.15391205251216888, + "learning_rate": 8.181021257944335e-06, + "logits/chosen": 0.13832136988639832, + "logits/rejected": 0.21045103669166565, + "logps/chosen": -304.4212341308594, + "logps/rejected": -408.509033203125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.320929527282715, + "rewards/margins": 14.877690315246582, + "rewards/rejected": -27.198617935180664, + "step": 1810 + }, + { + "epoch": 0.59, + "grad_norm": 0.1734129786491394, + "learning_rate": 8.17006355467894e-06, + "logits/chosen": -0.008830100297927856, + "logits/rejected": 0.08690972626209259, + "logps/chosen": -317.80670166015625, + "logps/rejected": -403.6852111816406, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.214055061340332, + "rewards/margins": 15.352259635925293, + "rewards/rejected": -28.566314697265625, + "step": 1820 + }, + { + "epoch": 0.59, + "grad_norm": 1.4723388064297183e-09, + "learning_rate": 8.159105851413544e-06, + "logits/chosen": 0.10790036618709564, + "logits/rejected": 0.20435258746147156, + "logps/chosen": -244.66781616210938, + "logps/rejected": -349.64190673828125, + "loss": 0.0391, + "rewards/accuracies": 0.966666579246521, + "rewards/chosen": -9.495941162109375, + "rewards/margins": 14.044004440307617, + "rewards/rejected": -23.539945602416992, + "step": 1830 + }, + { + "epoch": 0.6, + "grad_norm": 4.041919601149857e-05, + "learning_rate": 8.148148148148148e-06, + "logits/chosen": 0.1392899751663208, + "logits/rejected": 0.17356742918491364, + "logps/chosen": -271.93536376953125, + "logps/rejected": -376.3609313964844, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.918790817260742, + "rewards/margins": 15.073224067687988, + "rewards/rejected": -26.992013931274414, + "step": 1840 + }, + { + "epoch": 0.6, + "grad_norm": 3.694919769259286e-06, + "learning_rate": 8.137190444882753e-06, + "logits/chosen": 0.2150353193283081, + "logits/rejected": 0.2910372018814087, + "logps/chosen": -296.4953918457031, + "logps/rejected": -429.4913635253906, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.478975296020508, + "rewards/margins": 17.906230926513672, + "rewards/rejected": -30.385208129882812, + "step": 1850 + }, + { + "epoch": 0.6, + "grad_norm": 14.997368812561035, + "learning_rate": 8.126232741617359e-06, + "logits/chosen": 0.13269153237342834, + "logits/rejected": 0.1637151837348938, + "logps/chosen": -393.65887451171875, + "logps/rejected": -488.2777404785156, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.513179779052734, + "rewards/margins": 16.430784225463867, + "rewards/rejected": -33.943965911865234, + "step": 1860 + }, + { + "epoch": 0.6, + "grad_norm": 0.0013834636192768812, + "learning_rate": 8.115275038351961e-06, + "logits/chosen": 0.07304046303033829, + "logits/rejected": 0.147117480635643, + "logps/chosen": -308.97161865234375, + "logps/rejected": -416.3505859375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.166252136230469, + "rewards/margins": 15.148821830749512, + "rewards/rejected": -28.315073013305664, + "step": 1870 + }, + { + "epoch": 0.61, + "grad_norm": 0.005938555579632521, + "learning_rate": 8.104317335086567e-06, + "logits/chosen": 0.02907967008650303, + "logits/rejected": 0.11755422502756119, + "logps/chosen": -310.50933837890625, + "logps/rejected": -403.22247314453125, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.069503784179688, + "rewards/margins": 15.674043655395508, + "rewards/rejected": -26.743549346923828, + "step": 1880 + }, + { + "epoch": 0.61, + "grad_norm": 0.3666568398475647, + "learning_rate": 8.09335963182117e-06, + "logits/chosen": -0.004576456733047962, + "logits/rejected": 0.07972874492406845, + "logps/chosen": -337.0972900390625, + "logps/rejected": -455.228759765625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.475229263305664, + "rewards/margins": 17.417362213134766, + "rewards/rejected": -28.892593383789062, + "step": 1890 + }, + { + "epoch": 0.61, + "grad_norm": 0.050740547478199005, + "learning_rate": 8.082401928555776e-06, + "logits/chosen": 0.006110090762376785, + "logits/rejected": 0.045890793204307556, + "logps/chosen": -242.1782684326172, + "logps/rejected": -327.7618408203125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.432751655578613, + "rewards/margins": 15.302406311035156, + "rewards/rejected": -22.735158920288086, + "step": 1900 + }, + { + "epoch": 0.62, + "grad_norm": 0.05140925943851471, + "learning_rate": 8.07144422529038e-06, + "logits/chosen": -0.109877809882164, + "logits/rejected": -0.02618744969367981, + "logps/chosen": -255.05599975585938, + "logps/rejected": -364.76654052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.201934814453125, + "rewards/margins": 16.78182029724121, + "rewards/rejected": -24.983755111694336, + "step": 1910 + }, + { + "epoch": 0.62, + "grad_norm": 3.2723581790924072, + "learning_rate": 8.060486522024984e-06, + "logits/chosen": -0.030681187286973, + "logits/rejected": 0.031235750764608383, + "logps/chosen": -223.58474731445312, + "logps/rejected": -289.45196533203125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.744879722595215, + "rewards/margins": 11.148603439331055, + "rewards/rejected": -18.893482208251953, + "step": 1920 + }, + { + "epoch": 0.62, + "grad_norm": 1.3139744997024536, + "learning_rate": 8.049528818759589e-06, + "logits/chosen": -0.010395990684628487, + "logits/rejected": 0.053827375173568726, + "logps/chosen": -260.6311340332031, + "logps/rejected": -343.34796142578125, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.475730895996094, + "rewards/margins": 12.925222396850586, + "rewards/rejected": -21.400951385498047, + "step": 1930 + }, + { + "epoch": 0.63, + "grad_norm": 0.0011528899194672704, + "learning_rate": 8.038571115494193e-06, + "logits/chosen": 0.004040165338665247, + "logits/rejected": 0.09620045125484467, + "logps/chosen": -265.73260498046875, + "logps/rejected": -368.19976806640625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.830063819885254, + "rewards/margins": 14.222076416015625, + "rewards/rejected": -23.052141189575195, + "step": 1940 + }, + { + "epoch": 0.63, + "grad_norm": 3.633260348578915e-05, + "learning_rate": 8.027613412228797e-06, + "logits/chosen": 0.08991348743438721, + "logits/rejected": 0.13095875084400177, + "logps/chosen": -250.6635284423828, + "logps/rejected": -348.5271301269531, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.241789817810059, + "rewards/margins": 15.90589714050293, + "rewards/rejected": -24.147687911987305, + "step": 1950 + }, + { + "epoch": 0.63, + "grad_norm": 0.008139081299304962, + "learning_rate": 8.016655708963403e-06, + "logits/chosen": -0.05623581260442734, + "logits/rejected": 0.008502885699272156, + "logps/chosen": -312.2218017578125, + "logps/rejected": -426.6249084472656, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.09146785736084, + "rewards/margins": 17.29364013671875, + "rewards/rejected": -28.385107040405273, + "step": 1960 + }, + { + "epoch": 0.64, + "grad_norm": 0.0004545208648778498, + "learning_rate": 8.005698005698006e-06, + "logits/chosen": 0.013862645253539085, + "logits/rejected": 0.05150808021426201, + "logps/chosen": -273.6121826171875, + "logps/rejected": -365.71307373046875, + "loss": 0.0331, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -8.933341026306152, + "rewards/margins": 15.150711059570312, + "rewards/rejected": -24.08405113220215, + "step": 1970 + }, + { + "epoch": 0.64, + "grad_norm": 0.07295417040586472, + "learning_rate": 7.994740302432612e-06, + "logits/chosen": 0.04458921402692795, + "logits/rejected": 0.10884840786457062, + "logps/chosen": -224.69967651367188, + "logps/rejected": -325.59356689453125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.435521125793457, + "rewards/margins": 14.62103271484375, + "rewards/rejected": -22.05655288696289, + "step": 1980 + }, + { + "epoch": 0.64, + "grad_norm": 0.00015990910469554365, + "learning_rate": 7.983782599167214e-06, + "logits/chosen": -0.07026857882738113, + "logits/rejected": 0.06809535622596741, + "logps/chosen": -344.2391357421875, + "logps/rejected": -496.5973205566406, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.867463111877441, + "rewards/margins": 20.219188690185547, + "rewards/rejected": -32.08665466308594, + "step": 1990 + }, + { + "epoch": 0.65, + "grad_norm": 0.023937899619340897, + "learning_rate": 7.97282489590182e-06, + "logits/chosen": 0.07235778868198395, + "logits/rejected": 0.1107616052031517, + "logps/chosen": -232.8976287841797, + "logps/rejected": -342.6751708984375, + "loss": 0.0496, + "rewards/accuracies": 0.9666666984558105, + "rewards/chosen": -8.303656578063965, + "rewards/margins": 14.42664909362793, + "rewards/rejected": -22.730304718017578, + "step": 2000 + } + ], + "logging_steps": 10, + "max_steps": 9276, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}