{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2733.5875966596914, "learning_rate": 2.702702702702703e-10, "logits/chosen": -1.3332719802856445, "logits/rejected": -1.246394395828247, "logps/chosen": -286.9539794921875, "logps/rejected": -263.3782958984375, "loss": 0.7283, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 3426.28374639058, "learning_rate": 2.702702702702703e-09, "logits/chosen": -1.617490530014038, "logits/rejected": -1.3964743614196777, "logps/chosen": -342.53607177734375, "logps/rejected": -294.5452575683594, "loss": 0.9019, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0025859144516289234, "rewards/margins": 0.014665775932371616, "rewards/rejected": -0.017251690849661827, "step": 10 }, { "epoch": 0.11, "grad_norm": 3068.517544136213, "learning_rate": 5.405405405405406e-09, "logits/chosen": -1.4905732870101929, "logits/rejected": -1.3132953643798828, "logps/chosen": -314.7499084472656, "logps/rejected": -279.27752685546875, "loss": 0.9225, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.03222837299108505, "rewards/margins": -0.024351513013243675, "rewards/rejected": 0.05657988786697388, "step": 20 }, { "epoch": 0.16, "grad_norm": 3019.197796031677, "learning_rate": 8.108108108108109e-09, "logits/chosen": -1.5479624271392822, "logits/rejected": -1.3802028894424438, "logps/chosen": -324.89044189453125, "logps/rejected": -286.2395324707031, "loss": 0.9562, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 0.031223665922880173, "rewards/margins": -0.0476018562912941, "rewards/rejected": 0.07882551848888397, "step": 30 }, { "epoch": 0.22, "grad_norm": 2861.837005400921, "learning_rate": 9.997973265157192e-09, "logits/chosen": -1.5354044437408447, "logits/rejected": -1.3576419353485107, "logps/chosen": -325.43408203125, "logps/rejected": -285.6204528808594, "loss": 0.9309, "rewards/accuracies": 0.484375, "rewards/chosen": -0.08140890300273895, "rewards/margins": -0.13951030373573303, "rewards/rejected": 0.05810140445828438, "step": 40 }, { "epoch": 0.27, "grad_norm": 3285.7135249709568, "learning_rate": 9.961988113473708e-09, "logits/chosen": -1.534355640411377, "logits/rejected": -1.3875898122787476, "logps/chosen": -337.02044677734375, "logps/rejected": -297.35101318359375, "loss": 0.88, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0009558796882629395, "rewards/margins": 0.08057532459497452, "rewards/rejected": -0.07961944490671158, "step": 50 }, { "epoch": 0.33, "grad_norm": 2547.1665545835035, "learning_rate": 9.881337335184878e-09, "logits/chosen": -1.5822935104370117, "logits/rejected": -1.4333903789520264, "logps/chosen": -319.79644775390625, "logps/rejected": -285.0381164550781, "loss": 0.8105, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 0.09285839647054672, "rewards/margins": 0.40408092737197876, "rewards/rejected": -0.31122252345085144, "step": 60 }, { "epoch": 0.38, "grad_norm": 3178.8313195657583, "learning_rate": 9.756746912994832e-09, "logits/chosen": -1.5119212865829468, "logits/rejected": -1.350838541984558, "logps/chosen": -312.1349182128906, "logps/rejected": -275.08660888671875, "loss": 0.7993, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05482473224401474, "rewards/margins": 0.165395587682724, "rewards/rejected": -0.22022032737731934, "step": 70 }, { "epoch": 0.43, "grad_norm": 2596.199281277795, "learning_rate": 9.589338354885628e-09, "logits/chosen": -1.5992329120635986, "logits/rejected": -1.4463211297988892, "logps/chosen": -323.2821960449219, "logps/rejected": -288.0993347167969, "loss": 0.7772, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.20231203734874725, "rewards/margins": 0.4638887345790863, "rewards/rejected": -0.26157671213150024, "step": 80 }, { "epoch": 0.49, "grad_norm": 2404.162696635418, "learning_rate": 9.380618598797472e-09, "logits/chosen": -1.6108148097991943, "logits/rejected": -1.4147026538848877, "logps/chosen": -319.95526123046875, "logps/rejected": -281.7666015625, "loss": 0.7649, "rewards/accuracies": 0.640625, "rewards/chosen": 0.2500740885734558, "rewards/margins": 0.5574880838394165, "rewards/rejected": -0.3074139356613159, "step": 90 }, { "epoch": 0.54, "grad_norm": 2500.271614922207, "learning_rate": 9.132466447838596e-09, "logits/chosen": -1.542976975440979, "logits/rejected": -1.3676128387451172, "logps/chosen": -321.9007263183594, "logps/rejected": -282.65899658203125, "loss": 0.7305, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.4254188537597656, "rewards/margins": 0.763160228729248, "rewards/rejected": -0.3377414047718048, "step": 100 }, { "epoch": 0.6, "grad_norm": 2525.572173445101, "learning_rate": 8.847115658129039e-09, "logits/chosen": -1.512939453125, "logits/rejected": -1.3849382400512695, "logps/chosen": -318.14813232421875, "logps/rejected": -287.1947937011719, "loss": 0.7164, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.3916184604167938, "rewards/margins": 0.657262921333313, "rewards/rejected": -0.265644371509552, "step": 110 }, { "epoch": 0.65, "grad_norm": 2498.220455730566, "learning_rate": 8.527134831514116e-09, "logits/chosen": -1.5739517211914062, "logits/rejected": -1.41860032081604, "logps/chosen": -331.3175354003906, "logps/rejected": -297.8718566894531, "loss": 0.7018, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6175383925437927, "rewards/margins": 0.6906081438064575, "rewards/rejected": -0.07306969165802002, "step": 120 }, { "epoch": 0.71, "grad_norm": 2170.854176631694, "learning_rate": 8.175404294144481e-09, "logits/chosen": -1.616276502609253, "logits/rejected": -1.429518699645996, "logps/chosen": -317.1609802246094, "logps/rejected": -271.5557861328125, "loss": 0.6719, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6955646872520447, "rewards/margins": 0.8560658693313599, "rewards/rejected": -0.160501167178154, "step": 130 }, { "epoch": 0.76, "grad_norm": 2147.543587634845, "learning_rate": 7.79509016905158e-09, "logits/chosen": -1.5726101398468018, "logits/rejected": -1.4245671033859253, "logps/chosen": -331.12109375, "logps/rejected": -294.2488098144531, "loss": 0.6686, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.9418653249740601, "rewards/margins": 0.9832620620727539, "rewards/rejected": -0.04139674827456474, "step": 140 }, { "epoch": 0.82, "grad_norm": 2298.0130679506387, "learning_rate": 7.389615876105773e-09, "logits/chosen": -1.5536715984344482, "logits/rejected": -1.4254592657089233, "logps/chosen": -314.55267333984375, "logps/rejected": -291.81536865234375, "loss": 0.6793, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.0258944034576416, "rewards/margins": 0.981005072593689, "rewards/rejected": 0.04488936811685562, "step": 150 }, { "epoch": 0.87, "grad_norm": 2220.286727308493, "learning_rate": 6.962631315901861e-09, "logits/chosen": -1.5181314945220947, "logits/rejected": -1.4019381999969482, "logps/chosen": -318.02752685546875, "logps/rejected": -291.03936767578125, "loss": 0.6742, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.9784830808639526, "rewards/margins": 0.819505512714386, "rewards/rejected": 0.15897764265537262, "step": 160 }, { "epoch": 0.92, "grad_norm": 2125.4983562302123, "learning_rate": 6.517980014965139e-09, "logits/chosen": -1.5958881378173828, "logits/rejected": -1.4071909189224243, "logps/chosen": -331.4378356933594, "logps/rejected": -289.5236511230469, "loss": 0.6456, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.1029136180877686, "rewards/margins": 1.160766363143921, "rewards/rejected": -0.05785265564918518, "step": 170 }, { "epoch": 0.98, "grad_norm": 2116.9259500110184, "learning_rate": 6.059664528022266e-09, "logits/chosen": -1.5962104797363281, "logits/rejected": -1.445967197418213, "logps/chosen": -315.10467529296875, "logps/rejected": -276.73443603515625, "loss": 0.6191, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.123002052307129, "rewards/margins": 1.1910655498504639, "rewards/rejected": -0.0680634081363678, "step": 180 }, { "epoch": 1.03, "grad_norm": 2067.8979230397717, "learning_rate": 5.591810408770492e-09, "logits/chosen": -1.55275559425354, "logits/rejected": -1.3787180185317993, "logps/chosen": -315.572509765625, "logps/rejected": -278.71087646484375, "loss": 0.6052, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2037546634674072, "rewards/margins": 1.2858160734176636, "rewards/rejected": -0.0820615291595459, "step": 190 }, { "epoch": 1.09, "grad_norm": 2203.6189854484824, "learning_rate": 5.118629073464423e-09, "logits/chosen": -1.5673738718032837, "logits/rejected": -1.3565856218338013, "logps/chosen": -325.91680908203125, "logps/rejected": -282.65869140625, "loss": 0.6024, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 1.4020355939865112, "rewards/margins": 1.1880966424942017, "rewards/rejected": 0.21393892168998718, "step": 200 }, { "epoch": 1.14, "grad_norm": 2263.504332795979, "learning_rate": 4.644379891605983e-09, "logits/chosen": -1.611310601234436, "logits/rejected": -1.4343440532684326, "logps/chosen": -324.752197265625, "logps/rejected": -291.36102294921875, "loss": 0.5985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2598803043365479, "rewards/margins": 1.2701908349990845, "rewards/rejected": -0.010310685262084007, "step": 210 }, { "epoch": 1.2, "grad_norm": 2324.309417872748, "learning_rate": 4.173331844980362e-09, "logits/chosen": -1.5291264057159424, "logits/rejected": -1.4033840894699097, "logps/chosen": -323.9982604980469, "logps/rejected": -293.4136047363281, "loss": 0.5948, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 1.1985571384429932, "rewards/margins": 1.105764627456665, "rewards/rejected": 0.09279236942529678, "step": 220 }, { "epoch": 1.25, "grad_norm": 2275.313721629071, "learning_rate": 3.7097251001664824e-09, "logits/chosen": -1.5342741012573242, "logits/rejected": -1.3754017353057861, "logps/chosen": -323.9897766113281, "logps/rejected": -287.0173645019531, "loss": 0.577, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2511075735092163, "rewards/margins": 1.3087403774261475, "rewards/rejected": -0.05763290077447891, "step": 230 }, { "epoch": 1.3, "grad_norm": 2261.889907648677, "learning_rate": 3.2577328404292057e-09, "logits/chosen": -1.5497777462005615, "logits/rejected": -1.4208284616470337, "logps/chosen": -312.53802490234375, "logps/rejected": -285.97076416015625, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4049217700958252, "rewards/margins": 1.3057136535644531, "rewards/rejected": 0.09920807182788849, "step": 240 }, { "epoch": 1.36, "grad_norm": 2520.220972155196, "learning_rate": 2.821423700565763e-09, "logits/chosen": -1.5996572971343994, "logits/rejected": -1.4216984510421753, "logps/chosen": -350.76129150390625, "logps/rejected": -306.58831787109375, "loss": 0.5681, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.537647008895874, "rewards/margins": 1.4706511497497559, "rewards/rejected": 0.06699595600366592, "step": 250 }, { "epoch": 1.41, "grad_norm": 2185.2135747623915, "learning_rate": 2.4047251428513483e-09, "logits/chosen": -1.61586594581604, "logits/rejected": -1.4618706703186035, "logps/chosen": -325.3050537109375, "logps/rejected": -291.10345458984375, "loss": 0.5977, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.5201152563095093, "rewards/margins": 1.4326369762420654, "rewards/rejected": 0.08747831732034683, "step": 260 }, { "epoch": 1.47, "grad_norm": 1735.3140228188304, "learning_rate": 2.011388103757442e-09, "logits/chosen": -1.5243465900421143, "logits/rejected": -1.3802506923675537, "logps/chosen": -316.4330139160156, "logps/rejected": -285.81353759765625, "loss": 0.5429, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 1.550986647605896, "rewards/margins": 1.45218026638031, "rewards/rejected": 0.09880634397268295, "step": 270 }, { "epoch": 1.52, "grad_norm": 2121.5830866823144, "learning_rate": 1.644953229677474e-09, "logits/chosen": -1.6015859842300415, "logits/rejected": -1.4193016290664673, "logps/chosen": -326.1202087402344, "logps/rejected": -284.7384033203125, "loss": 0.5887, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 1.580999732017517, "rewards/margins": 1.3716144561767578, "rewards/rejected": 0.20938535034656525, "step": 280 }, { "epoch": 1.58, "grad_norm": 2103.020456118981, "learning_rate": 1.308719005590957e-09, "logits/chosen": -1.509340524673462, "logits/rejected": -1.3944005966186523, "logps/chosen": -318.451416015625, "logps/rejected": -282.4563293457031, "loss": 0.5721, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.603921890258789, "rewards/margins": 1.4211392402648926, "rewards/rejected": 0.182782843708992, "step": 290 }, { "epoch": 1.63, "grad_norm": 2010.5734100649909, "learning_rate": 1.005712063557776e-09, "logits/chosen": -1.6272541284561157, "logits/rejected": -1.4480578899383545, "logps/chosen": -324.20068359375, "logps/rejected": -290.54803466796875, "loss": 0.5898, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 1.3664501905441284, "rewards/margins": 1.1471518278121948, "rewards/rejected": 0.21929831802845, "step": 300 }, { "epoch": 1.68, "grad_norm": 1862.455491029429, "learning_rate": 7.386599383124321e-10, "logits/chosen": -1.563561201095581, "logits/rejected": -1.3803369998931885, "logps/chosen": -321.889404296875, "logps/rejected": -285.8083190917969, "loss": 0.5879, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.4487113952636719, "rewards/margins": 1.3910987377166748, "rewards/rejected": 0.057612527161836624, "step": 310 }, { "epoch": 1.74, "grad_norm": 1875.0391267528262, "learning_rate": 5.099665152003929e-10, "logits/chosen": -1.5980346202850342, "logits/rejected": -1.3878109455108643, "logps/chosen": -333.843994140625, "logps/rejected": -289.8874816894531, "loss": 0.5634, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 1.5412708520889282, "rewards/margins": 1.3988701105117798, "rewards/rejected": 0.14240065217018127, "step": 320 }, { "epoch": 1.79, "grad_norm": 2232.545936783362, "learning_rate": 3.216903914633745e-10, "logits/chosen": -1.5563807487487793, "logits/rejected": -1.4342092275619507, "logps/chosen": -325.34674072265625, "logps/rejected": -296.1554870605469, "loss": 0.5762, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.3614461421966553, "rewards/margins": 1.3264009952545166, "rewards/rejected": 0.03504505008459091, "step": 330 }, { "epoch": 1.85, "grad_norm": 1895.2431252651277, "learning_rate": 1.7552634565570324e-10, "logits/chosen": -1.5503065586090088, "logits/rejected": -1.381874680519104, "logps/chosen": -329.953857421875, "logps/rejected": -292.89447021484375, "loss": 0.5681, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": 1.7590471506118774, "rewards/margins": 1.6621748208999634, "rewards/rejected": 0.09687252342700958, "step": 340 }, { "epoch": 1.9, "grad_norm": 2070.257116146559, "learning_rate": 7.279008199590543e-11, "logits/chosen": -1.541084885597229, "logits/rejected": -1.378144383430481, "logps/chosen": -326.51507568359375, "logps/rejected": -291.95123291015625, "loss": 0.5797, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 1.5987285375595093, "rewards/margins": 1.4538962841033936, "rewards/rejected": 0.1448323279619217, "step": 350 }, { "epoch": 1.96, "grad_norm": 2166.9724743285583, "learning_rate": 1.4406386978128017e-11, "logits/chosen": -1.6209802627563477, "logits/rejected": -1.42485511302948, "logps/chosen": -331.2142639160156, "logps/rejected": -291.69842529296875, "loss": 0.5582, "rewards/accuracies": 0.778124988079071, "rewards/chosen": 1.724250078201294, "rewards/margins": 1.5519315004348755, "rewards/rejected": 0.1723184883594513, "step": 360 }, { "epoch": 2.0, "step": 368, "total_flos": 0.0, "train_loss": 0.26489045179408527, "train_runtime": 4520.7899, "train_samples_per_second": 20.835, "train_steps_per_second": 0.081 } ], "logging_steps": 10, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }