{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 27.37984871419997, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.8783892393112183, "logits/rejected": -1.8756425380706787, "logps/chosen": -298.4870300292969, "logps/rejected": -398.0157165527344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 25.334426597070937, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.750243067741394, "logits/rejected": -1.7067593336105347, "logps/chosen": -280.5216369628906, "logps/rejected": -271.8791809082031, "loss": 0.6932, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": -0.00042370916344225407, "rewards/margins": -0.0002716032031457871, "rewards/rejected": -0.00015210600395221263, "step": 10 }, { "epoch": 0.04, "grad_norm": 23.205563002993117, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.8309519290924072, "logits/rejected": -1.7239341735839844, "logps/chosen": -298.9266662597656, "logps/rejected": -320.81036376953125, "loss": 0.6919, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00880073755979538, "rewards/margins": 0.0003546981024555862, "rewards/rejected": 0.008446039631962776, "step": 20 }, { "epoch": 0.06, "grad_norm": 22.833130746886702, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.8621642589569092, "logits/rejected": -1.811255693435669, "logps/chosen": -315.0081481933594, "logps/rejected": -281.7824401855469, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.06436704099178314, "rewards/margins": 0.02108323760330677, "rewards/rejected": 0.04328380152583122, "step": 30 }, { "epoch": 0.08, "grad_norm": 20.296209907433, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.7256094217300415, "logits/rejected": -1.6898906230926514, "logps/chosen": -269.07220458984375, "logps/rejected": -258.07366943359375, "loss": 0.6708, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.15135471522808075, "rewards/margins": 0.05834723263978958, "rewards/rejected": 0.09300748258829117, "step": 40 }, { "epoch": 0.1, "grad_norm": 18.992519669533575, "learning_rate": 4.999733114418725e-07, "logits/chosen": -1.7586348056793213, "logits/rejected": -1.7471107244491577, "logps/chosen": -274.77728271484375, "logps/rejected": -298.24298095703125, "loss": 0.6568, "rewards/accuracies": 0.625, "rewards/chosen": 0.2238006889820099, "rewards/margins": 0.05361497402191162, "rewards/rejected": 0.17018567025661469, "step": 50 }, { "epoch": 0.13, "grad_norm": 22.488749510223712, "learning_rate": 4.990398100856366e-07, "logits/chosen": -1.8446115255355835, "logits/rejected": -1.8052647113800049, "logps/chosen": -268.59100341796875, "logps/rejected": -318.24041748046875, "loss": 0.642, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22674357891082764, "rewards/margins": 0.11847379058599472, "rewards/rejected": 0.10826978832483292, "step": 60 }, { "epoch": 0.15, "grad_norm": 24.241452630651324, "learning_rate": 4.967775735898179e-07, "logits/chosen": -1.6720319986343384, "logits/rejected": -1.6877762079238892, "logps/chosen": -274.5986022949219, "logps/rejected": -289.9263610839844, "loss": 0.6123, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.11401952803134918, "rewards/margins": 0.22531266510486603, "rewards/rejected": -0.11129315197467804, "step": 70 }, { "epoch": 0.17, "grad_norm": 32.48718302712838, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.910599946975708, "logits/rejected": -1.7989906072616577, "logps/chosen": -356.32135009765625, "logps/rejected": -325.3817443847656, "loss": 0.5878, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07416001707315445, "rewards/margins": 0.30830469727516174, "rewards/rejected": -0.3824646770954132, "step": 80 }, { "epoch": 0.19, "grad_norm": 32.46521048247274, "learning_rate": 4.883222001996351e-07, "logits/chosen": -1.781141996383667, "logits/rejected": -1.773406982421875, "logps/chosen": -326.0487365722656, "logps/rejected": -370.7205505371094, "loss": 0.5637, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.45380252599716187, "rewards/margins": 0.5182568430900574, "rewards/rejected": -0.9720592498779297, "step": 90 }, { "epoch": 0.21, "grad_norm": 33.51530497027872, "learning_rate": 4.821741763807186e-07, "logits/chosen": -1.795566201210022, "logits/rejected": -1.7746385335922241, "logps/chosen": -341.0810241699219, "logps/rejected": -391.9131774902344, "loss": 0.5671, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6200565695762634, "rewards/margins": 0.5509090423583984, "rewards/rejected": -1.1709656715393066, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -1.8679490089416504, "eval_logits/rejected": -1.8570616245269775, "eval_logps/chosen": -316.96636962890625, "eval_logps/rejected": -376.7557373046875, "eval_loss": 0.5698967576026917, "eval_rewards/accuracies": 0.73046875, "eval_rewards/chosen": -0.3533283472061157, "eval_rewards/margins": 0.5366135239601135, "eval_rewards/rejected": -0.8899418115615845, "eval_runtime": 97.6563, "eval_samples_per_second": 20.48, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.23, "grad_norm": 34.820943984944364, "learning_rate": 4.747874028753375e-07, "logits/chosen": -1.9302442073822021, "logits/rejected": -1.8041632175445557, "logps/chosen": -364.3658142089844, "logps/rejected": -368.28619384765625, "loss": 0.5779, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3326733410358429, "rewards/margins": 0.5019634962081909, "rewards/rejected": -0.8346366882324219, "step": 110 }, { "epoch": 0.25, "grad_norm": 35.52031238722188, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.8828121423721313, "logits/rejected": -1.8731359243392944, "logps/chosen": -346.777099609375, "logps/rejected": -378.0817565917969, "loss": 0.544, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.42460617423057556, "rewards/margins": 0.5200009942054749, "rewards/rejected": -0.9446069598197937, "step": 120 }, { "epoch": 0.27, "grad_norm": 40.83171596073763, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -1.9067420959472656, "logits/rejected": -1.848259687423706, "logps/chosen": -353.1668395996094, "logps/rejected": -412.601806640625, "loss": 0.5319, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5689653158187866, "rewards/margins": 0.6179059147834778, "rewards/rejected": -1.1868712902069092, "step": 130 }, { "epoch": 0.29, "grad_norm": 39.57816446283388, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.79110848903656, "logits/rejected": -1.710828423500061, "logps/chosen": -390.3045959472656, "logps/rejected": -453.116943359375, "loss": 0.537, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5443016290664673, "rewards/margins": 0.7209955453872681, "rewards/rejected": -1.2652971744537354, "step": 140 }, { "epoch": 0.31, "grad_norm": 45.241736858623206, "learning_rate": 4.337355301007335e-07, "logits/chosen": -1.8114426136016846, "logits/rejected": -1.7426559925079346, "logps/chosen": -352.48992919921875, "logps/rejected": -402.91943359375, "loss": 0.5462, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5064585208892822, "rewards/margins": 0.5219663381576538, "rewards/rejected": -1.028424859046936, "step": 150 }, { "epoch": 0.33, "grad_norm": 73.25214998863763, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -1.8390640020370483, "logits/rejected": -1.7504537105560303, "logps/chosen": -339.1869812011719, "logps/rejected": -387.9916076660156, "loss": 0.5442, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.4455109238624573, "rewards/margins": 0.7391675710678101, "rewards/rejected": -1.1846784353256226, "step": 160 }, { "epoch": 0.36, "grad_norm": 48.08778532882697, "learning_rate": 4.070934040463998e-07, "logits/chosen": -1.7452236413955688, "logits/rejected": -1.6487846374511719, "logps/chosen": -335.72528076171875, "logps/rejected": -377.5245361328125, "loss": 0.5304, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6902536749839783, "rewards/margins": 0.5506319999694824, "rewards/rejected": -1.2408854961395264, "step": 170 }, { "epoch": 0.38, "grad_norm": 45.566526901622865, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.5920779705047607, "logits/rejected": -1.5328117609024048, "logps/chosen": -352.29937744140625, "logps/rejected": -390.72100830078125, "loss": 0.5011, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8087765574455261, "rewards/margins": 0.6529080867767334, "rewards/rejected": -1.4616845846176147, "step": 180 }, { "epoch": 0.4, "grad_norm": 48.30624199959232, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -1.7276074886322021, "logits/rejected": -1.6613149642944336, "logps/chosen": -347.87579345703125, "logps/rejected": -405.24237060546875, "loss": 0.5308, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5396801829338074, "rewards/margins": 0.6805658936500549, "rewards/rejected": -1.2202460765838623, "step": 190 }, { "epoch": 0.42, "grad_norm": 99.6040419345467, "learning_rate": 3.610497133404795e-07, "logits/chosen": -1.7740137577056885, "logits/rejected": -1.7177015542984009, "logps/chosen": -344.6033020019531, "logps/rejected": -404.29229736328125, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": -0.6725525856018066, "rewards/margins": 0.6624492406845093, "rewards/rejected": -1.3350017070770264, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -1.901658296585083, "eval_logits/rejected": -1.8679291009902954, "eval_logps/chosen": -349.037841796875, "eval_logps/rejected": -432.6194152832031, "eval_loss": 0.5253521800041199, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -0.6740425825119019, "eval_rewards/margins": 0.7745361328125, "eval_rewards/rejected": -1.4485788345336914, "eval_runtime": 97.5006, "eval_samples_per_second": 20.513, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.44, "grad_norm": 46.68866608504909, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -1.734480619430542, "logits/rejected": -1.6646308898925781, "logps/chosen": -384.4491882324219, "logps/rejected": -421.3724670410156, "loss": 0.5373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7106617093086243, "rewards/margins": 0.7391539812088013, "rewards/rejected": -1.4498156309127808, "step": 210 }, { "epoch": 0.46, "grad_norm": 44.67370083595421, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.7060235738754272, "logits/rejected": -1.621319055557251, "logps/chosen": -333.583740234375, "logps/rejected": -387.3582458496094, "loss": 0.5233, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5107825994491577, "rewards/margins": 0.7757614850997925, "rewards/rejected": -1.2865440845489502, "step": 220 }, { "epoch": 0.48, "grad_norm": 44.602377758622936, "learning_rate": 3.096924887558854e-07, "logits/chosen": -1.672357201576233, "logits/rejected": -1.677425742149353, "logps/chosen": -334.2008361816406, "logps/rejected": -428.0926208496094, "loss": 0.5239, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6795379519462585, "rewards/margins": 0.8884698152542114, "rewards/rejected": -1.5680078268051147, "step": 230 }, { "epoch": 0.5, "grad_norm": 43.82303533573589, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -1.6859185695648193, "logits/rejected": -1.6255781650543213, "logps/chosen": -357.11773681640625, "logps/rejected": -421.4244079589844, "loss": 0.4902, "rewards/accuracies": 0.75, "rewards/chosen": -0.6822614073753357, "rewards/margins": 0.8219982385635376, "rewards/rejected": -1.5042595863342285, "step": 240 }, { "epoch": 0.52, "grad_norm": 46.68066851465082, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -1.5676295757293701, "logits/rejected": -1.4538037776947021, "logps/chosen": -401.59979248046875, "logps/rejected": -471.2294006347656, "loss": 0.5154, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7957779765129089, "rewards/margins": 1.0585238933563232, "rewards/rejected": -1.8543018102645874, "step": 250 }, { "epoch": 0.54, "grad_norm": 45.74080164598797, "learning_rate": 2.55479083351317e-07, "logits/chosen": -1.5453943014144897, "logits/rejected": -1.3946092128753662, "logps/chosen": -411.67681884765625, "logps/rejected": -464.185791015625, "loss": 0.5124, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9510448575019836, "rewards/margins": 0.9077135324478149, "rewards/rejected": -1.8587583303451538, "step": 260 }, { "epoch": 0.56, "grad_norm": 35.67215071482242, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -1.4410674571990967, "logits/rejected": -1.4173917770385742, "logps/chosen": -389.84442138671875, "logps/rejected": -446.63946533203125, "loss": 0.5176, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.882165253162384, "rewards/margins": 0.7578494548797607, "rewards/rejected": -1.6400146484375, "step": 270 }, { "epoch": 0.59, "grad_norm": 45.13283149696396, "learning_rate": 2.19029145890313e-07, "logits/chosen": -1.3330192565917969, "logits/rejected": -1.2097164392471313, "logps/chosen": -361.392578125, "logps/rejected": -428.5855407714844, "loss": 0.5182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7854728102684021, "rewards/margins": 0.8376423716545105, "rewards/rejected": -1.6231151819229126, "step": 280 }, { "epoch": 0.61, "grad_norm": 42.58221661061121, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -1.3783751726150513, "logits/rejected": -1.3098156452178955, "logps/chosen": -341.5384521484375, "logps/rejected": -416.83050537109375, "loss": 0.5035, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7412964105606079, "rewards/margins": 0.8173438906669617, "rewards/rejected": -1.5586402416229248, "step": 290 }, { "epoch": 0.63, "grad_norm": 44.44135138330837, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -1.4361331462860107, "logits/rejected": -1.2948487997055054, "logps/chosen": -415.39324951171875, "logps/rejected": -463.1932067871094, "loss": 0.4829, "rewards/accuracies": 0.75, "rewards/chosen": -0.9276901483535767, "rewards/margins": 0.8659119606018066, "rewards/rejected": -1.7936019897460938, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -1.256626844406128, "eval_logits/rejected": -1.199381709098816, "eval_logps/chosen": -368.2399597167969, "eval_logps/rejected": -477.2876892089844, "eval_loss": 0.49556368589401245, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -0.8660640716552734, "eval_rewards/margins": 1.0291972160339355, "eval_rewards/rejected": -1.895261287689209, "eval_runtime": 97.5907, "eval_samples_per_second": 20.494, "eval_steps_per_second": 0.328, "step": 300 }, { "epoch": 0.65, "grad_norm": 49.20598478293576, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -1.070894479751587, "logits/rejected": -0.999220073223114, "logps/chosen": -422.22509765625, "logps/rejected": -478.1600646972656, "loss": 0.4882, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0501785278320312, "rewards/margins": 0.9182316660881042, "rewards/rejected": -1.9684101343154907, "step": 310 }, { "epoch": 0.67, "grad_norm": 49.52786644109137, "learning_rate": 1.488723393865766e-07, "logits/chosen": -0.9732829332351685, "logits/rejected": -0.8598931431770325, "logps/chosen": -423.58465576171875, "logps/rejected": -463.65087890625, "loss": 0.482, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0610870122909546, "rewards/margins": 0.8992059826850891, "rewards/rejected": -1.960293173789978, "step": 320 }, { "epoch": 0.69, "grad_norm": 44.16474950280745, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -1.1072012186050415, "logits/rejected": -0.9854669570922852, "logps/chosen": -383.697509765625, "logps/rejected": -467.64630126953125, "loss": 0.4809, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.015866756439209, "rewards/margins": 0.7990261316299438, "rewards/rejected": -1.8148927688598633, "step": 330 }, { "epoch": 0.71, "grad_norm": 49.790170416193874, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -1.0710186958312988, "logits/rejected": -0.9443724751472473, "logps/chosen": -406.4459228515625, "logps/rejected": -490.1005859375, "loss": 0.4931, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9062315821647644, "rewards/margins": 0.9880655407905579, "rewards/rejected": -1.8942972421646118, "step": 340 }, { "epoch": 0.73, "grad_norm": 45.78788884909769, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -0.964527428150177, "logits/rejected": -0.8877021670341492, "logps/chosen": -366.8417053222656, "logps/rejected": -448.39239501953125, "loss": 0.4708, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8918215036392212, "rewards/margins": 0.9880329966545105, "rewards/rejected": -1.879854440689087, "step": 350 }, { "epoch": 0.75, "grad_norm": 47.523486254775236, "learning_rate": 8.729103716819111e-08, "logits/chosen": -1.099103331565857, "logits/rejected": -0.9152529835700989, "logps/chosen": -418.78790283203125, "logps/rejected": -472.5894470214844, "loss": 0.5139, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9209517240524292, "rewards/margins": 0.9369718432426453, "rewards/rejected": -1.8579237461090088, "step": 360 }, { "epoch": 0.77, "grad_norm": 40.46200259764798, "learning_rate": 7.387025063449081e-08, "logits/chosen": -1.0258140563964844, "logits/rejected": -0.9037224054336548, "logps/chosen": -389.7789611816406, "logps/rejected": -427.80902099609375, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -0.9432722926139832, "rewards/margins": 0.7514128684997559, "rewards/rejected": -1.6946852207183838, "step": 370 }, { "epoch": 0.79, "grad_norm": 44.99044596346264, "learning_rate": 6.138919252022435e-08, "logits/chosen": -0.9202815294265747, "logits/rejected": -0.9092128872871399, "logps/chosen": -369.4691467285156, "logps/rejected": -499.80047607421875, "loss": 0.4856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1355737447738647, "rewards/margins": 1.0553382635116577, "rewards/rejected": -2.1909122467041016, "step": 380 }, { "epoch": 0.82, "grad_norm": 46.73184407203235, "learning_rate": 4.991445467064689e-08, "logits/chosen": -1.017165184020996, "logits/rejected": -0.9522297978401184, "logps/chosen": -429.59124755859375, "logps/rejected": -499.8984375, "loss": 0.4844, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0497267246246338, "rewards/margins": 0.8575556874275208, "rewards/rejected": -1.9072824716567993, "step": 390 }, { "epoch": 0.84, "grad_norm": 45.88759783660656, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -0.8822342753410339, "logits/rejected": -0.7616764307022095, "logps/chosen": -393.79986572265625, "logps/rejected": -488.7137145996094, "loss": 0.4981, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0166757106781006, "rewards/margins": 0.9999138116836548, "rewards/rejected": -2.016589403152466, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -1.0730373859405518, "eval_logits/rejected": -0.9850106239318848, "eval_logps/chosen": -365.3529357910156, "eval_logps/rejected": -476.14508056640625, "eval_loss": 0.49130114912986755, "eval_rewards/accuracies": 0.78515625, "eval_rewards/chosen": -0.8371938467025757, "eval_rewards/margins": 1.0466417074203491, "eval_rewards/rejected": -1.8838355541229248, "eval_runtime": 97.6225, "eval_samples_per_second": 20.487, "eval_steps_per_second": 0.328, "step": 400 }, { "epoch": 0.86, "grad_norm": 44.331882429947925, "learning_rate": 3.022313472693447e-08, "logits/chosen": -1.0954724550247192, "logits/rejected": -0.854290783405304, "logps/chosen": -407.5718078613281, "logps/rejected": -482.7383728027344, "loss": 0.4921, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8705843687057495, "rewards/margins": 1.0231659412384033, "rewards/rejected": -1.8937501907348633, "step": 410 }, { "epoch": 0.88, "grad_norm": 48.31749590006741, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -1.0192204713821411, "logits/rejected": -0.973158061504364, "logps/chosen": -416.341552734375, "logps/rejected": -486.69232177734375, "loss": 0.4856, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9502483606338501, "rewards/margins": 0.9265721440315247, "rewards/rejected": -1.8768205642700195, "step": 420 }, { "epoch": 0.9, "grad_norm": 40.281913550333705, "learning_rate": 1.521597710086439e-08, "logits/chosen": -0.9044865369796753, "logits/rejected": -0.8032494783401489, "logps/chosen": -405.5864562988281, "logps/rejected": -480.13201904296875, "loss": 0.4776, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.037095546722412, "rewards/margins": 0.9771502614021301, "rewards/rejected": -2.0142457485198975, "step": 430 }, { "epoch": 0.92, "grad_norm": 43.058313272164526, "learning_rate": 9.57301420397924e-09, "logits/chosen": -0.9727311134338379, "logits/rejected": -0.8283950090408325, "logps/chosen": -393.12823486328125, "logps/rejected": -472.7400817871094, "loss": 0.4944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8835296630859375, "rewards/margins": 0.8925860524177551, "rewards/rejected": -1.7761156558990479, "step": 440 }, { "epoch": 0.94, "grad_norm": 45.17569103872668, "learning_rate": 5.212833302556258e-09, "logits/chosen": -0.8834640383720398, "logits/rejected": -0.8035561442375183, "logps/chosen": -416.1705627441406, "logps/rejected": -517.479248046875, "loss": 0.4973, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0344994068145752, "rewards/margins": 0.8305438756942749, "rewards/rejected": -1.8650434017181396, "step": 450 }, { "epoch": 0.96, "grad_norm": 57.18112420515564, "learning_rate": 2.158697848236607e-09, "logits/chosen": -0.9279729723930359, "logits/rejected": -0.8204873204231262, "logps/chosen": -390.6974182128906, "logps/rejected": -443.84051513671875, "loss": 0.4944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9189409017562866, "rewards/margins": 0.8423686027526855, "rewards/rejected": -1.7613098621368408, "step": 460 }, { "epoch": 0.98, "grad_norm": 40.005457130126345, "learning_rate": 4.269029751107489e-10, "logits/chosen": -0.9693315625190735, "logits/rejected": -0.8152003288269043, "logps/chosen": -384.0590515136719, "logps/rejected": -482.89630126953125, "loss": 0.4792, "rewards/accuracies": 0.75, "rewards/chosen": -0.8925178647041321, "rewards/margins": 0.9355740547180176, "rewards/rejected": -1.8280918598175049, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5347933170685708, "train_runtime": 7634.2165, "train_samples_per_second": 8.008, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }