{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984591679506933, "eval_steps": 100, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 39.87631410320537, "learning_rate": 1.5151515151515152e-08, "logits/chosen": -3.1684141159057617, "logits/rejected": -3.1765036582946777, "logps/chosen": -1262.7908935546875, "logps/rejected": -1304.270263671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 44.090928533988176, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -3.145017385482788, "logits/rejected": -3.17344069480896, "logps/chosen": -1035.2520751953125, "logps/rejected": -1331.3636474609375, "loss": 0.6917, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.003002108307555318, "rewards/margins": 0.0036764023825526237, "rewards/rejected": -0.0006742942496202886, "step": 10 }, { "epoch": 0.06, "grad_norm": 32.37744800941447, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -3.1358423233032227, "logits/rejected": -3.18705415725708, "logps/chosen": -968.2097778320312, "logps/rejected": -1354.069580078125, "loss": 0.6617, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.03913033753633499, "rewards/margins": 0.06312780827283859, "rewards/rejected": -0.023997480049729347, "step": 20 }, { "epoch": 0.09, "grad_norm": 30.93962012271263, "learning_rate": 4.545454545454545e-07, "logits/chosen": -3.2511069774627686, "logits/rejected": -3.244719982147217, "logps/chosen": -1036.7672119140625, "logps/rejected": -1373.1820068359375, "loss": 0.5896, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.06491168588399887, "rewards/margins": 0.31840670108795166, "rewards/rejected": -0.2534949779510498, "step": 30 }, { "epoch": 0.12, "grad_norm": 31.51946350483435, "learning_rate": 4.992864684782648e-07, "logits/chosen": -3.3422675132751465, "logits/rejected": -3.370623826980591, "logps/chosen": -1083.17431640625, "logps/rejected": -1492.5845947265625, "loss": 0.5125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07627250999212265, "rewards/margins": 0.8225336074829102, "rewards/rejected": -0.898806095123291, "step": 40 }, { "epoch": 0.15, "grad_norm": 29.782679812110892, "learning_rate": 4.958014217656854e-07, "logits/chosen": -3.3696506023406982, "logits/rejected": -3.4038467407226562, "logps/chosen": -1081.7869873046875, "logps/rejected": -1461.259033203125, "loss": 0.4171, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1066322773694992, "rewards/margins": 1.0666204690933228, "rewards/rejected": -1.173252820968628, "step": 50 }, { "epoch": 0.18, "grad_norm": 30.594547647279217, "learning_rate": 4.894543310469967e-07, "logits/chosen": -3.352465867996216, "logits/rejected": -3.3652706146240234, "logps/chosen": -1111.260009765625, "logps/rejected": -1504.715087890625, "loss": 0.3969, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.2318076640367508, "rewards/margins": 1.2468664646148682, "rewards/rejected": -1.4786741733551025, "step": 60 }, { "epoch": 0.22, "grad_norm": 31.824817232007625, "learning_rate": 4.803191000971128e-07, "logits/chosen": -3.3312506675720215, "logits/rejected": -3.355130434036255, "logps/chosen": -968.1290893554688, "logps/rejected": -1600.333251953125, "loss": 0.3874, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.22658078372478485, "rewards/margins": 2.3517754077911377, "rewards/rejected": -2.5783562660217285, "step": 70 }, { "epoch": 0.25, "grad_norm": 25.823223858100576, "learning_rate": 4.685020970273189e-07, "logits/chosen": -3.2700467109680176, "logits/rejected": -3.3080413341522217, "logps/chosen": -1002.4366455078125, "logps/rejected": -1593.41796875, "loss": 0.3546, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.07068847864866257, "rewards/margins": 2.2586522102355957, "rewards/rejected": -2.329341173171997, "step": 80 }, { "epoch": 0.28, "grad_norm": 43.928369861559965, "learning_rate": 4.541409157643027e-07, "logits/chosen": -3.235419511795044, "logits/rejected": -3.2496044635772705, "logps/chosen": -956.4049072265625, "logps/rejected": -1599.0389404296875, "loss": 0.3143, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.038097791373729706, "rewards/margins": 2.2678751945495605, "rewards/rejected": -2.2297775745391846, "step": 90 }, { "epoch": 0.31, "grad_norm": 30.820532733997354, "learning_rate": 4.374027739443952e-07, "logits/chosen": -3.204524517059326, "logits/rejected": -3.163343906402588, "logps/chosen": -1068.4237060546875, "logps/rejected": -1704.1986083984375, "loss": 0.2799, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3904297947883606, "rewards/margins": 3.4663283824920654, "rewards/rejected": -3.8567581176757812, "step": 100 }, { "epoch": 0.31, "eval_logits/chosen": -3.0348000526428223, "eval_logits/rejected": -3.0867843627929688, "eval_logps/chosen": -584.1478881835938, "eval_logps/rejected": -794.01025390625, "eval_loss": 0.5261008143424988, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.5201810598373413, "eval_rewards/margins": 0.9906590580940247, "eval_rewards/rejected": -2.5108399391174316, "eval_runtime": 34.7053, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.231, "step": 100 }, { "epoch": 0.34, "grad_norm": 47.845448282397456, "learning_rate": 4.184825658775027e-07, "logits/chosen": -3.128324031829834, "logits/rejected": -3.134152889251709, "logps/chosen": -1042.473388671875, "logps/rejected": -1787.997802734375, "loss": 0.2816, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.6833322644233704, "rewards/margins": 3.4197134971618652, "rewards/rejected": -4.10304594039917, "step": 110 }, { "epoch": 0.37, "grad_norm": 24.2664669682948, "learning_rate": 3.9760059325148063e-07, "logits/chosen": -3.1436760425567627, "logits/rejected": -3.091614246368408, "logps/chosen": -1067.834716796875, "logps/rejected": -1788.0120849609375, "loss": 0.2536, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.610935389995575, "rewards/margins": 4.445748329162598, "rewards/rejected": -5.056683540344238, "step": 120 }, { "epoch": 0.4, "grad_norm": 22.23462347593175, "learning_rate": 3.75e-07, "logits/chosen": -3.1414103507995605, "logits/rejected": -3.0941264629364014, "logps/chosen": -1100.4937744140625, "logps/rejected": -1801.8560791015625, "loss": 0.2298, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.540351152420044, "rewards/margins": 3.7757785320281982, "rewards/rejected": -4.3161301612854, "step": 130 }, { "epoch": 0.43, "grad_norm": 28.613362043744857, "learning_rate": 3.509439412016004e-07, "logits/chosen": -3.0641441345214844, "logits/rejected": -3.0451717376708984, "logps/chosen": -1098.5340576171875, "logps/rejected": -1918.6890869140625, "loss": 0.2135, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.7187612056732178, "rewards/margins": 5.323573112487793, "rewards/rejected": -6.04233455657959, "step": 140 }, { "epoch": 0.46, "grad_norm": 30.54616548038225, "learning_rate": 3.2571251897448763e-07, "logits/chosen": -2.992375135421753, "logits/rejected": -2.95180606842041, "logps/chosen": -1197.9376220703125, "logps/rejected": -2077.058349609375, "loss": 0.1801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5255634784698486, "rewards/margins": 5.869881629943848, "rewards/rejected": -7.395445346832275, "step": 150 }, { "epoch": 0.49, "grad_norm": 36.74589910484145, "learning_rate": 2.9959952104467243e-07, "logits/chosen": -2.9339356422424316, "logits/rejected": -2.85386323928833, "logps/chosen": -1247.737060546875, "logps/rejected": -2310.10205078125, "loss": 0.1778, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0261478424072266, "rewards/margins": 7.8707451820373535, "rewards/rejected": -9.896891593933105, "step": 160 }, { "epoch": 0.52, "grad_norm": 32.132075393104884, "learning_rate": 2.729089999626637e-07, "logits/chosen": -2.980856418609619, "logits/rejected": -2.856822967529297, "logps/chosen": -1185.372802734375, "logps/rejected": -2347.78076171875, "loss": 0.1698, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.51887047290802, "rewards/margins": 8.371113777160645, "rewards/rejected": -9.889985084533691, "step": 170 }, { "epoch": 0.55, "grad_norm": 26.839790210789428, "learning_rate": 2.459517327993746e-07, "logits/chosen": -2.962564468383789, "logits/rejected": -2.8451316356658936, "logps/chosen": -1266.3397216796875, "logps/rejected": -2263.588623046875, "loss": 0.141, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.7179988622665405, "rewards/margins": 7.524572849273682, "rewards/rejected": -9.242570877075195, "step": 180 }, { "epoch": 0.59, "grad_norm": 33.374896332465084, "learning_rate": 2.1904160254356748e-07, "logits/chosen": -2.881953477859497, "logits/rejected": -2.7538435459136963, "logps/chosen": -1169.200927734375, "logps/rejected": -2476.24072265625, "loss": 0.1207, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.6462358236312866, "rewards/margins": 9.677408218383789, "rewards/rejected": -11.323644638061523, "step": 190 }, { "epoch": 0.62, "grad_norm": 44.90567050679125, "learning_rate": 1.9249194333484563e-07, "logits/chosen": -2.8342463970184326, "logits/rejected": -2.730264902114868, "logps/chosen": -1229.9298095703125, "logps/rejected": -2291.61181640625, "loss": 0.154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0603644847869873, "rewards/margins": 7.22509765625, "rewards/rejected": -9.285462379455566, "step": 200 }, { "epoch": 0.62, "eval_logits/chosen": -2.694772720336914, "eval_logits/rejected": -2.554710626602173, "eval_logps/chosen": -742.1358642578125, "eval_logps/rejected": -1446.8753662109375, "eval_loss": 0.09226308017969131, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -3.100059986114502, "eval_rewards/margins": 5.939432144165039, "eval_rewards/rejected": -9.0394926071167, "eval_runtime": 34.5072, "eval_samples_per_second": 7.303, "eval_steps_per_second": 0.232, "step": 200 }, { "epoch": 0.65, "grad_norm": 29.94298857126716, "learning_rate": 1.6661189208729489e-07, "logits/chosen": -2.786771059036255, "logits/rejected": -2.6327857971191406, "logps/chosen": -1283.205810546875, "logps/rejected": -2401.399169921875, "loss": 0.1526, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.189497709274292, "rewards/margins": 8.632684707641602, "rewards/rejected": -10.822182655334473, "step": 210 }, { "epoch": 0.68, "grad_norm": 33.13137270748439, "learning_rate": 1.4170278898446175e-07, "logits/chosen": -2.828369379043579, "logits/rejected": -2.650123119354248, "logps/chosen": -1175.46826171875, "logps/rejected": -2410.84326171875, "loss": 0.1252, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8023840188980103, "rewards/margins": 8.576199531555176, "rewards/rejected": -10.378583908081055, "step": 220 }, { "epoch": 0.71, "grad_norm": 30.216897410019698, "learning_rate": 1.1805466875731276e-07, "logits/chosen": -2.820298671722412, "logits/rejected": -2.622697591781616, "logps/chosen": -1135.1295166015625, "logps/rejected": -2719.731201171875, "loss": 0.1353, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7641971111297607, "rewards/margins": 11.304890632629395, "rewards/rejected": -13.06908893585205, "step": 230 }, { "epoch": 0.74, "grad_norm": 28.800219923929006, "learning_rate": 9.594288359976815e-08, "logits/chosen": -2.815680742263794, "logits/rejected": -2.6530845165252686, "logps/chosen": -1304.4205322265625, "logps/rejected": -2253.841064453125, "loss": 0.1093, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.9788525104522705, "rewards/margins": 7.066276550292969, "rewards/rejected": -9.045129776000977, "step": 240 }, { "epoch": 0.77, "grad_norm": 28.53520064286844, "learning_rate": 7.56248970436493e-08, "logits/chosen": -2.7308030128479004, "logits/rejected": -2.56375789642334, "logps/chosen": -1186.9593505859375, "logps/rejected": -2636.3701171875, "loss": 0.0913, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.045689105987549, "rewards/margins": 10.186556816101074, "rewards/rejected": -12.232245445251465, "step": 250 }, { "epoch": 0.8, "grad_norm": 21.7882636792373, "learning_rate": 5.733728612427771e-08, "logits/chosen": -2.7645225524902344, "logits/rejected": -2.5233638286590576, "logps/chosen": -1321.7745361328125, "logps/rejected": -2658.189453125, "loss": 0.1155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5851030349731445, "rewards/margins": 10.245210647583008, "rewards/rejected": -12.830312728881836, "step": 260 }, { "epoch": 0.83, "grad_norm": 34.32906854711248, "learning_rate": 4.1292986742682254e-08, "logits/chosen": -2.668457508087158, "logits/rejected": -2.500288963317871, "logps/chosen": -1262.6650390625, "logps/rejected": -2782.10009765625, "loss": 0.0989, "rewards/accuracies": 0.96875, "rewards/chosen": -2.7102155685424805, "rewards/margins": 11.686820983886719, "rewards/rejected": -14.3970365524292, "step": 270 }, { "epoch": 0.86, "grad_norm": 42.763438453906815, "learning_rate": 2.7678814298657732e-08, "logits/chosen": -2.6972427368164062, "logits/rejected": -2.4791617393493652, "logps/chosen": -1353.071533203125, "logps/rejected": -2790.54052734375, "loss": 0.1022, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.827423095703125, "rewards/margins": 11.204734802246094, "rewards/rejected": -14.032157897949219, "step": 280 }, { "epoch": 0.89, "grad_norm": 26.418126829556318, "learning_rate": 1.6653288463741062e-08, "logits/chosen": -2.689786672592163, "logits/rejected": -2.518730401992798, "logps/chosen": -1242.576416015625, "logps/rejected": -2554.541748046875, "loss": 0.1067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.544224262237549, "rewards/margins": 9.17101001739502, "rewards/rejected": -11.715234756469727, "step": 290 }, { "epoch": 0.92, "grad_norm": 40.4422866987403, "learning_rate": 8.344787421847216e-09, "logits/chosen": -2.65974497795105, "logits/rejected": -2.4722535610198975, "logps/chosen": -1281.4610595703125, "logps/rejected": -2672.197265625, "loss": 0.0948, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5506279468536377, "rewards/margins": 10.449880599975586, "rewards/rejected": -13.000508308410645, "step": 300 }, { "epoch": 0.92, "eval_logits/chosen": -2.5877139568328857, "eval_logits/rejected": -2.3930397033691406, "eval_logps/chosen": -803.1033325195312, "eval_logps/rejected": -1661.4266357421875, "eval_loss": 0.07533077150583267, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -3.70973539352417, "eval_rewards/margins": 7.4752678871154785, "eval_rewards/rejected": -11.185002326965332, "eval_runtime": 34.126, "eval_samples_per_second": 7.384, "eval_steps_per_second": 0.234, "step": 300 }, { "epoch": 0.96, "grad_norm": 38.70439473653127, "learning_rate": 2.850053069080344e-09, "logits/chosen": -2.730034351348877, "logits/rejected": -2.4978787899017334, "logps/chosen": -1227.954345703125, "logps/rejected": -2714.13623046875, "loss": 0.1114, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.477766990661621, "rewards/margins": 11.021059036254883, "rewards/rejected": -13.498825073242188, "step": 310 }, { "epoch": 0.99, "grad_norm": 25.381541507877866, "learning_rate": 2.3306457775981727e-10, "logits/chosen": -2.6724932193756104, "logits/rejected": -2.4461209774017334, "logps/chosen": -1288.38623046875, "logps/rejected": -2817.382568359375, "loss": 0.0963, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.566399097442627, "rewards/margins": 11.983835220336914, "rewards/rejected": -14.550233840942383, "step": 320 }, { "epoch": 1.0, "step": 324, "total_flos": 0.0, "train_loss": 0.007740737387427577, "train_runtime": 396.513, "train_samples_per_second": 52.306, "train_steps_per_second": 0.817 } ], "logging_steps": 10, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }