{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2062.9417756205603, "learning_rate": 2.702702702702703e-10, "logits/chosen": -1.3332719802856445, "logits/rejected": -1.246394395828247, "logps/chosen": -286.9539794921875, "logps/rejected": -263.3782958984375, "loss": 0.7136, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 2488.3980990852974, "learning_rate": 2.702702702702703e-09, "logits/chosen": -1.6142714023590088, "logits/rejected": -1.3925563097000122, "logps/chosen": -342.4814758300781, "logps/rejected": -294.5446472167969, "loss": 0.8226, "rewards/accuracies": 0.4618055522441864, "rewards/chosen": 0.079922616481781, "rewards/margins": 0.09200635552406311, "rewards/rejected": -0.012083739042282104, "step": 10 }, { "epoch": 0.11, "grad_norm": 2085.30491295085, "learning_rate": 5.405405405405406e-09, "logits/chosen": -1.4863827228546143, "logits/rejected": -1.3085709810256958, "logps/chosen": -314.74273681640625, "logps/rejected": -279.32977294921875, "loss": 0.8217, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.03496693819761276, "rewards/margins": 0.07092654705047607, "rewards/rejected": -0.03595960885286331, "step": 20 }, { "epoch": 0.16, "grad_norm": 2613.9787597915297, "learning_rate": 8.108108108108109e-09, "logits/chosen": -1.5464979410171509, "logits/rejected": -1.3788726329803467, "logps/chosen": -324.9065246582031, "logps/rejected": -286.29925537109375, "loss": 0.8318, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0007322698947973549, "rewards/margins": 0.02973010204732418, "rewards/rejected": -0.030462373048067093, "step": 30 }, { "epoch": 0.22, "grad_norm": 2309.6989479898994, "learning_rate": 9.997973265157192e-09, "logits/chosen": -1.5338213443756104, "logits/rejected": -1.356065034866333, "logps/chosen": -325.39349365234375, "logps/rejected": -285.630859375, "loss": 0.8544, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.00019043684005737305, "rewards/margins": -0.028223956003785133, "rewards/rejected": 0.02803351916372776, "step": 40 }, { "epoch": 0.27, "grad_norm": 2372.8781916000794, "learning_rate": 9.961988113473708e-09, "logits/chosen": -1.540814757347107, "logits/rejected": -1.3939155340194702, "logps/chosen": -337.01385498046875, "logps/rejected": -297.3047790527344, "loss": 0.7925, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.010568022727966309, "rewards/margins": 0.0009421706199645996, "rewards/rejected": 0.009625854901969433, "step": 50 }, { "epoch": 0.33, "grad_norm": 1906.9193219897543, "learning_rate": 9.881337335184878e-09, "logits/chosen": -1.5821880102157593, "logits/rejected": -1.433316707611084, "logps/chosen": -319.8349609375, "logps/rejected": -285.03131103515625, "loss": 0.7444, "rewards/accuracies": 0.59375, "rewards/chosen": 0.011926290579140186, "rewards/margins": 0.23517760634422302, "rewards/rejected": -0.22325129806995392, "step": 60 }, { "epoch": 0.38, "grad_norm": 2229.621479388874, "learning_rate": 9.756746912994832e-09, "logits/chosen": -1.5089519023895264, "logits/rejected": -1.3478004932403564, "logps/chosen": -312.11767578125, "logps/rejected": -275.03704833984375, "loss": 0.7381, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.015234187245368958, "rewards/margins": 
0.07565010339021683, "rewards/rejected": -0.09088429063558578, "step": 70 }, { "epoch": 0.43, "grad_norm": 1970.0426820414286, "learning_rate": 9.589338354885628e-09, "logits/chosen": -1.591552734375, "logits/rejected": -1.4374128580093384, "logps/chosen": -323.3088684082031, "logps/rejected": -288.12445068359375, "loss": 0.7257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1117367148399353, "rewards/margins": 0.34563174843788147, "rewards/rejected": -0.23389501869678497, "step": 80 }, { "epoch": 0.49, "grad_norm": 1647.476042777907, "learning_rate": 9.380618598797472e-09, "logits/chosen": -1.6083869934082031, "logits/rejected": -1.4117141962051392, "logps/chosen": -319.9634094238281, "logps/rejected": -281.79248046875, "loss": 0.6768, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1753208488225937, "rewards/margins": 0.44467267394065857, "rewards/rejected": -0.2693518102169037, "step": 90 }, { "epoch": 0.54, "grad_norm": 1779.591190181612, "learning_rate": 9.132466447838596e-09, "logits/chosen": -1.5439790487289429, "logits/rejected": -1.368858814239502, "logps/chosen": -321.8800964355469, "logps/rejected": -282.66168212890625, "loss": 0.6482, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.34998807311058044, "rewards/margins": 0.6073418855667114, "rewards/rejected": -0.25735384225845337, "step": 100 }, { "epoch": 0.6, "grad_norm": 1799.5128068859713, "learning_rate": 8.847115658129039e-09, "logits/chosen": -1.5068881511688232, "logits/rejected": -1.3783992528915405, "logps/chosen": -318.10797119140625, "logps/rejected": -287.1791076660156, "loss": 0.6577, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.35399100184440613, "rewards/margins": 0.5296486616134644, "rewards/rejected": -0.17565762996673584, "step": 110 }, { "epoch": 0.65, "grad_norm": 1676.764876114058, "learning_rate": 8.527134831514116e-09, "logits/chosen": -1.5781362056732178, "logits/rejected": -1.4229751825332642, "logps/chosen": -331.3733825683594, "logps/rejected": -297.85699462890625, "loss": 0.6575, "rewards/accuracies": 0.609375, "rewards/chosen": 0.3793606460094452, "rewards/margins": 0.4118588864803314, "rewards/rejected": -0.03249818831682205, "step": 120 }, { "epoch": 0.71, "grad_norm": 1566.6901996912077, "learning_rate": 8.175404294144481e-09, "logits/chosen": -1.6145737171173096, "logits/rejected": -1.4269483089447021, "logps/chosen": -317.0880432128906, "logps/rejected": -271.5414123535156, "loss": 0.6044, "rewards/accuracies": 0.671875, "rewards/chosen": 0.6310849189758301, "rewards/margins": 0.7299145460128784, "rewards/rejected": -0.09882961958646774, "step": 130 }, { "epoch": 0.76, "grad_norm": 1706.595775593044, "learning_rate": 7.79509016905158e-09, "logits/chosen": -1.5648548603057861, "logits/rejected": -1.4158308506011963, "logps/chosen": -331.06622314453125, "logps/rejected": -294.2123718261719, "loss": 0.6171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7887445092201233, "rewards/margins": 0.765161395072937, "rewards/rejected": 0.023583168163895607, "step": 140 }, { "epoch": 0.82, "grad_norm": 1648.2049279025357, "learning_rate": 7.389615876105773e-09, "logits/chosen": -1.5560743808746338, "logits/rejected": -1.4283266067504883, "logps/chosen": -314.5069274902344, "logps/rejected": -291.7706298828125, "loss": 0.6127, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.8379846811294556, "rewards/margins": 0.7371869087219238, "rewards/rejected": 0.10079775750637054, "step": 150 }, { "epoch": 0.87, "grad_norm": 
1635.8235385722824, "learning_rate": 6.962631315901861e-09, "logits/chosen": -1.5186518430709839, "logits/rejected": -1.4028724431991577, "logps/chosen": -317.958251953125, "logps/rejected": -291.0096435546875, "loss": 0.6088, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.8378221392631531, "rewards/margins": 0.6740074753761292, "rewards/rejected": 0.16381461918354034, "step": 160 }, { "epoch": 0.92, "grad_norm": 1575.6130834814026, "learning_rate": 6.517980014965139e-09, "logits/chosen": -1.6025912761688232, "logits/rejected": -1.4152277708053589, "logps/chosen": -331.40386962890625, "logps/rejected": -289.4659729003906, "loss": 0.5997, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8780991435050964, "rewards/margins": 0.8349622488021851, "rewards/rejected": 0.04313689470291138, "step": 170 }, { "epoch": 0.98, "grad_norm": 1546.3751249922345, "learning_rate": 6.059664528022266e-09, "logits/chosen": -1.5942988395690918, "logits/rejected": -1.44364333152771, "logps/chosen": -315.07196044921875, "logps/rejected": -276.7376708984375, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.8913241624832153, "rewards/margins": 0.9472495317459106, "rewards/rejected": -0.05592530965805054, "step": 180 }, { "epoch": 1.03, "grad_norm": 1681.3148479750444, "learning_rate": 5.591810408770492e-09, "logits/chosen": -1.5504480600357056, "logits/rejected": -1.3759148120880127, "logps/chosen": -315.5844421386719, "logps/rejected": -278.6695861816406, "loss": 0.5632, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.8848656415939331, "rewards/margins": 0.8844806551933289, "rewards/rejected": 0.00038505197153426707, "step": 190 }, { "epoch": 1.09, "grad_norm": 1651.7882136807318, "learning_rate": 5.118629073464423e-09, "logits/chosen": -1.571003794670105, "logits/rejected": -1.3608561754226685, "logps/chosen": -325.93023681640625, "logps/rejected": -282.7080993652344, "loss": 0.5605, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0313498973846436, "rewards/margins": 0.9450349807739258, "rewards/rejected": 0.08631500601768494, "step": 200 }, { "epoch": 1.14, "grad_norm": 1538.4386313699126, "learning_rate": 4.644379891605983e-09, "logits/chosen": -1.608812689781189, "logits/rejected": -1.4315342903137207, "logps/chosen": -324.66522216796875, "logps/rejected": -291.33428955078125, "loss": 0.5478, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 1.0752595663070679, "rewards/margins": 1.0428497791290283, "rewards/rejected": 0.03240995481610298, "step": 210 }, { "epoch": 1.2, "grad_norm": 1737.3887570467818, "learning_rate": 4.173331844980362e-09, "logits/chosen": -1.5384166240692139, "logits/rejected": -1.4137290716171265, "logps/chosen": -323.9536437988281, "logps/rejected": -293.42535400390625, "loss": 0.563, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.9658479690551758, "rewards/margins": 0.9138795137405396, "rewards/rejected": 0.051968496292829514, "step": 220 }, { "epoch": 1.25, "grad_norm": 1605.3661746462226, "learning_rate": 3.7097251001664824e-09, "logits/chosen": -1.537548542022705, "logits/rejected": -1.3787362575531006, "logps/chosen": -323.85125732421875, "logps/rejected": -286.95379638671875, "loss": 0.526, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.146087408065796, "rewards/margins": 1.0939618349075317, "rewards/rejected": 0.0521254763007164, "step": 230 }, { "epoch": 1.3, "grad_norm": 1689.839854162397, "learning_rate": 3.2577328404292057e-09, "logits/chosen": -1.5391089916229248, 
"logits/rejected": -1.4084638357162476, "logps/chosen": -312.51373291015625, "logps/rejected": -285.9711608886719, "loss": 0.5418, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 1.0901774168014526, "rewards/margins": 1.016390085220337, "rewards/rejected": 0.07378745824098587, "step": 240 }, { "epoch": 1.36, "grad_norm": 1710.94558540331, "learning_rate": 2.821423700565763e-09, "logits/chosen": -1.5968081951141357, "logits/rejected": -1.4188272953033447, "logps/chosen": -350.68487548828125, "logps/rejected": -306.6036071777344, "loss": 0.532, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2678377628326416, "rewards/margins": 1.2405023574829102, "rewards/rejected": 0.027335500344634056, "step": 250 }, { "epoch": 1.41, "grad_norm": 1638.2367115980887, "learning_rate": 2.4047251428513483e-09, "logits/chosen": -1.6129051446914673, "logits/rejected": -1.4581451416015625, "logps/chosen": -325.2450256347656, "logps/rejected": -291.1476745605469, "loss": 0.5289, "rewards/accuracies": 0.734375, "rewards/chosen": 1.2301806211471558, "rewards/margins": 1.2308820486068726, "rewards/rejected": -0.0007013082504272461, "step": 260 }, { "epoch": 1.47, "grad_norm": 1199.4883951774482, "learning_rate": 2.011388103757442e-09, "logits/chosen": -1.5265954732894897, "logits/rejected": -1.3828239440917969, "logps/chosen": -316.2944641113281, "logps/rejected": -285.7884826660156, "loss": 0.5191, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.3710923194885254, "rewards/margins": 1.2594387531280518, "rewards/rejected": 0.11165344715118408, "step": 270 }, { "epoch": 1.52, "grad_norm": 1472.2115597857592, "learning_rate": 1.644953229677474e-09, "logits/chosen": -1.600651502609253, "logits/rejected": -1.4179413318634033, "logps/chosen": -326.00335693359375, "logps/rejected": -284.74188232421875, "loss": 0.5459, "rewards/accuracies": 0.75, "rewards/chosen": 1.3610546588897705, "rewards/margins": 1.2091944217681885, "rewards/rejected": 0.1518600881099701, "step": 280 }, { "epoch": 1.58, "grad_norm": 1566.9737970600454, "learning_rate": 1.308719005590957e-09, "logits/chosen": -1.5032551288604736, "logits/rejected": -1.3876453638076782, "logps/chosen": -318.40948486328125, "logps/rejected": -282.49554443359375, "loss": 0.5407, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.2658413648605347, "rewards/margins": 1.187675952911377, "rewards/rejected": 0.07816555351018906, "step": 290 }, { "epoch": 1.63, "grad_norm": 1348.7257224769698, "learning_rate": 1.005712063557776e-09, "logits/chosen": -1.6333671808242798, "logits/rejected": -1.455556869506836, "logps/chosen": -324.13885498046875, "logps/rejected": -290.60186767578125, "loss": 0.5346, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 1.1175706386566162, "rewards/margins": 1.0337438583374023, "rewards/rejected": 0.08382664620876312, "step": 300 }, { "epoch": 1.68, "grad_norm": 1356.5441208888985, "learning_rate": 7.386599383124321e-10, "logits/chosen": -1.565224051475525, "logits/rejected": -1.3825923204421997, "logps/chosen": -321.80316162109375, "logps/rejected": -285.7908630371094, "loss": 0.5304, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.2159234285354614, "rewards/margins": 1.1465200185775757, "rewards/rejected": 0.06940338760614395, "step": 310 }, { "epoch": 1.74, "grad_norm": 1445.3559110776998, "learning_rate": 5.099665152003929e-10, "logits/chosen": -1.5921494960784912, "logits/rejected": -1.3807857036590576, "logps/chosen": -333.7308654785156, "logps/rejected": -289.9362487792969, "loss": 
0.5241, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": 1.3256893157958984, "rewards/margins": 1.292041540145874, "rewards/rejected": 0.03364778310060501, "step": 320 }, { "epoch": 1.79, "grad_norm": 1681.5042999261696, "learning_rate": 3.216903914633745e-10, "logits/chosen": -1.5627129077911377, "logits/rejected": -1.4408833980560303, "logps/chosen": -325.2505187988281, "logps/rejected": -296.106201171875, "loss": 0.5429, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.165374517440796, "rewards/margins": 1.0651426315307617, "rewards/rejected": 0.1002318263053894, "step": 330 }, { "epoch": 1.85, "grad_norm": 1536.75287567762, "learning_rate": 1.7552634565570324e-10, "logits/chosen": -1.5574743747711182, "logits/rejected": -1.3901411294937134, "logps/chosen": -329.89141845703125, "logps/rejected": -292.8751525878906, "loss": 0.5342, "rewards/accuracies": 0.753125011920929, "rewards/chosen": 1.4129165410995483, "rewards/margins": 1.3112914562225342, "rewards/rejected": 0.10162514448165894, "step": 340 }, { "epoch": 1.9, "grad_norm": 1492.8399510840338, "learning_rate": 7.279008199590543e-11, "logits/chosen": -1.5503973960876465, "logits/rejected": -1.3889100551605225, "logps/chosen": -326.42120361328125, "logps/rejected": -291.9585266113281, "loss": 0.5261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.3398044109344482, "rewards/margins": 1.2421011924743652, "rewards/rejected": 0.09770330041646957, "step": 350 }, { "epoch": 1.96, "grad_norm": 1452.281513333118, "learning_rate": 1.4406386978128017e-11, "logits/chosen": -1.6207876205444336, "logits/rejected": -1.424393653869629, "logps/chosen": -331.06390380859375, "logps/rejected": -291.6929626464844, "loss": 0.5043, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": 1.518845558166504, "rewards/margins": 1.381410837173462, "rewards/rejected": 0.13743488490581512, "step": 360 }, { "epoch": 2.0, "step": 368, "total_flos": 0.0, "train_loss": 0.6161670185949492, "train_runtime": 9955.6802, "train_samples_per_second": 9.461, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }