{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998089050257978, "eval_steps": 100, "global_step": 2616, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.8359375, "learning_rate": 1.908396946564885e-09, "logits/chosen": -3.3302907943725586, "logits/rejected": -3.269564628601074, "logps/chosen": -81.06770324707031, "logps/rejected": -120.27629089355469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.4296875, "learning_rate": 1.9083969465648856e-08, "logits/chosen": -3.2870726585388184, "logits/rejected": -3.03167986869812, "logps/chosen": -196.69744873046875, "logps/rejected": -238.91517639160156, "loss": 0.6933, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.00020526793377939612, "rewards/margins": -0.0005111552309244871, "rewards/margins_max": 0.0015908111818134785, "rewards/margins_min": -0.0026131218764930964, "rewards/margins_std": 0.002972629852592945, "rewards/rejected": 0.0003058873408008367, "step": 10 }, { "epoch": 0.01, "grad_norm": 2.21875, "learning_rate": 3.816793893129771e-08, "logits/chosen": -3.2914886474609375, "logits/rejected": -2.96828293800354, "logps/chosen": -205.34304809570312, "logps/rejected": -226.73660278320312, "loss": 0.6932, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00018589093815535307, "rewards/margins": 0.0002819823566824198, "rewards/margins_max": 0.0020934194326400757, "rewards/margins_min": -0.001529454835690558, "rewards/margins_std": 0.0025617589708417654, "rewards/rejected": -9.609150583855808e-05, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.953125, "learning_rate": 5.725190839694656e-08, "logits/chosen": -3.212151288986206, "logits/rejected": -2.9446871280670166, "logps/chosen": -218.0032196044922, "logps/rejected": -250.72683715820312, "loss": 0.6926, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 4.363178595667705e-05, "rewards/margins": 0.0007587060681544244, "rewards/margins_max": 0.002750576240941882, "rewards/margins_min": -0.001233164221048355, "rewards/margins_std": 0.002816930180415511, "rewards/rejected": -0.0007150743040256202, "step": 30 }, { "epoch": 0.02, "grad_norm": 2.0, "learning_rate": 7.633587786259542e-08, "logits/chosen": -3.209655284881592, "logits/rejected": -2.982962131500244, "logps/chosen": -203.66883850097656, "logps/rejected": -245.7393341064453, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00010408814705442637, "rewards/margins": 0.0006946413777768612, "rewards/margins_max": 0.0033363462425768375, "rewards/margins_min": -0.0019470632541924715, "rewards/margins_std": 0.003735934616997838, "rewards/rejected": -0.000798729422967881, "step": 40 }, { "epoch": 0.02, "grad_norm": 2.296875, "learning_rate": 9.541984732824428e-08, "logits/chosen": -3.225496768951416, "logits/rejected": -2.956040620803833, "logps/chosen": -225.0539093017578, "logps/rejected": -225.1595458984375, "loss": 0.6923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.000378791824914515, "rewards/margins": 0.0016400141175836325, "rewards/margins_max": 0.004391551483422518, "rewards/margins_min": -0.0011115235975012183, "rewards/margins_std": 0.0038912619929760695, "rewards/rejected": -0.0020188060589134693, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.25, "learning_rate": 1.1450381679389312e-07, "logits/chosen": -3.2494418621063232, "logits/rejected": -2.886786460876465, "logps/chosen": -215.2377166748047, "logps/rejected": -227.4031219482422, "loss": 0.6915, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0002904659486375749, "rewards/margins": 0.003191638272255659, "rewards/margins_max": 0.005964426789432764, "rewards/margins_min": 0.0004188496677670628, "rewards/margins_std": 0.00392131507396698, "rewards/rejected": -0.0029011727310717106, "step": 60 }, { "epoch": 0.03, "grad_norm": 1.59375, "learning_rate": 1.3358778625954197e-07, "logits/chosen": -3.2788949012756348, "logits/rejected": -2.9262871742248535, "logps/chosen": -179.19461059570312, "logps/rejected": -200.50384521484375, "loss": 0.6907, "rewards/accuracies": 0.8125, "rewards/chosen": 0.000643759616650641, "rewards/margins": 0.005182401277124882, "rewards/margins_max": 0.008886445313692093, "rewards/margins_min": 0.0014783585211262107, "rewards/margins_std": 0.005238307174295187, "rewards/rejected": -0.004538641776889563, "step": 70 }, { "epoch": 0.03, "grad_norm": 2.375, "learning_rate": 1.5267175572519085e-07, "logits/chosen": -3.2208304405212402, "logits/rejected": -2.935650110244751, "logps/chosen": -209.3942413330078, "logps/rejected": -238.1333465576172, "loss": 0.6903, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.0011481496039777994, "rewards/margins": 0.004855319391936064, "rewards/margins_max": 0.007420788519084454, "rewards/margins_min": 0.0022898507304489613, "rewards/margins_std": 0.0036281212233006954, "rewards/rejected": -0.006003469228744507, "step": 80 }, { "epoch": 0.03, "grad_norm": 2.625, "learning_rate": 1.717557251908397e-07, "logits/chosen": -3.2265000343322754, "logits/rejected": -2.9264445304870605, "logps/chosen": -220.7260284423828, "logps/rejected": -239.3951873779297, "loss": 0.6887, "rewards/accuracies": 0.875, "rewards/chosen": -6.91156237735413e-05, "rewards/margins": 0.008305264636874199, "rewards/margins_max": 0.013177113607525826, "rewards/margins_min": 0.003433419391512871, "rewards/margins_std": 0.0068898312747478485, "rewards/rejected": -0.008374381810426712, "step": 90 }, { "epoch": 0.04, "grad_norm": 2.015625, "learning_rate": 1.9083969465648855e-07, "logits/chosen": -3.179041862487793, "logits/rejected": -2.956526041030884, "logps/chosen": -163.98475646972656, "logps/rejected": -195.57383728027344, "loss": 0.6883, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0005407828139141202, "rewards/margins": 0.010149060748517513, "rewards/margins_max": 0.01537814736366272, "rewards/margins_min": 0.004919976461678743, "rewards/margins_std": 0.007395043037831783, "rewards/rejected": -0.010689844377338886, "step": 100 }, { "epoch": 0.04, "grad_norm": 1.96875, "learning_rate": 2.0992366412213738e-07, "logits/chosen": -3.272021532058716, "logits/rejected": -3.0388495922088623, "logps/chosen": -198.17446899414062, "logps/rejected": -233.8720703125, "loss": 0.687, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.00015886728942859918, "rewards/margins": 0.01238742470741272, "rewards/margins_max": 0.017614034935832024, "rewards/margins_min": 0.007160813547670841, "rewards/margins_std": 0.00739154452458024, "rewards/rejected": -0.012546291574835777, "step": 110 }, { "epoch": 0.05, "grad_norm": 1.8203125, "learning_rate": 2.2900763358778623e-07, "logits/chosen": -3.2624504566192627, "logits/rejected": -3.0127363204956055, "logps/chosen": -185.32650756835938, "logps/rejected": -233.5771026611328, "loss": 0.6853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0006447458872571588, "rewards/margins": 0.015380805358290672, "rewards/margins_max": 0.022997872903943062, "rewards/margins_min": 0.007763735018670559, "rewards/margins_std": 0.01077216025441885, "rewards/rejected": -0.016025548800826073, "step": 120 }, { "epoch": 0.05, "grad_norm": 1.96875, "learning_rate": 2.480916030534351e-07, "logits/chosen": -3.210339069366455, "logits/rejected": -2.8807480335235596, "logps/chosen": -192.64651489257812, "logps/rejected": -240.62496948242188, "loss": 0.6835, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0016148982103914022, "rewards/margins": 0.021930258721113205, "rewards/margins_max": 0.03173477575182915, "rewards/margins_min": 0.012125745415687561, "rewards/margins_std": 0.013865679502487183, "rewards/rejected": -0.02354515716433525, "step": 130 }, { "epoch": 0.05, "grad_norm": 2.265625, "learning_rate": 2.6717557251908394e-07, "logits/chosen": -3.1735782623291016, "logits/rejected": -2.892324447631836, "logps/chosen": -202.73878479003906, "logps/rejected": -245.81185913085938, "loss": 0.6828, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00102281104773283, "rewards/margins": 0.022799000144004822, "rewards/margins_max": 0.0319000706076622, "rewards/margins_min": 0.013697926886379719, "rewards/margins_std": 0.012870860286056995, "rewards/rejected": -0.023821810260415077, "step": 140 }, { "epoch": 0.06, "grad_norm": 2.046875, "learning_rate": 2.8625954198473276e-07, "logits/chosen": -3.2441039085388184, "logits/rejected": -2.972184896469116, "logps/chosen": -170.27938842773438, "logps/rejected": -226.83462524414062, "loss": 0.6806, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.00395964365452528, "rewards/margins": 0.026034507900476456, "rewards/margins_max": 0.036855850368738174, "rewards/margins_min": 0.015213166363537312, "rewards/margins_std": 0.015303686261177063, "rewards/rejected": -0.02999415434896946, "step": 150 }, { "epoch": 0.06, "grad_norm": 2.328125, "learning_rate": 3.053435114503817e-07, "logits/chosen": -3.2375476360321045, "logits/rejected": -2.889326333999634, "logps/chosen": -189.0028533935547, "logps/rejected": -238.2917022705078, "loss": 0.6786, "rewards/accuracies": 0.9375, "rewards/chosen": -0.003632964100688696, "rewards/margins": 0.03097444772720337, "rewards/margins_max": 0.03957698494195938, "rewards/margins_min": 0.02237190678715706, "rewards/margins_std": 0.012165828607976437, "rewards/rejected": -0.0346074104309082, "step": 160 }, { "epoch": 0.06, "grad_norm": 1.796875, "learning_rate": 3.244274809160305e-07, "logits/chosen": -3.205308437347412, "logits/rejected": -2.902846097946167, "logps/chosen": -213.3687744140625, "logps/rejected": -253.6230010986328, "loss": 0.6758, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0031909930985420942, "rewards/margins": 0.03212432190775871, "rewards/margins_max": 0.04667423292994499, "rewards/margins_min": 0.017574409022927284, "rewards/margins_std": 0.020576683804392815, "rewards/rejected": -0.03531531244516373, "step": 170 }, { "epoch": 0.07, "grad_norm": 2.015625, "learning_rate": 3.435114503816794e-07, "logits/chosen": -3.233248233795166, "logits/rejected": -2.934068202972412, "logps/chosen": -163.8170623779297, "logps/rejected": -220.64205932617188, "loss": 0.6762, "rewards/accuracies": 0.9375, "rewards/chosen": -0.006008402444422245, "rewards/margins": 0.034125737845897675, "rewards/margins_max": 0.050690434873104095, "rewards/margins_min": 0.017561035230755806, "rewards/margins_std": 0.02342602238059044, "rewards/rejected": -0.040134135633707047, "step": 180 }, { "epoch": 0.07, "grad_norm": 1.8828125, "learning_rate": 3.6259541984732823e-07, "logits/chosen": -3.282362699508667, "logits/rejected": -3.0158865451812744, "logps/chosen": -192.17164611816406, "logps/rejected": -220.742431640625, "loss": 0.6731, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007757545914500952, "rewards/margins": 0.04202093929052353, "rewards/margins_max": 0.060148369520902634, "rewards/margins_min": 0.023893514648079872, "rewards/margins_std": 0.02563605271279812, "rewards/rejected": -0.04977848380804062, "step": 190 }, { "epoch": 0.08, "grad_norm": 1.9765625, "learning_rate": 3.816793893129771e-07, "logits/chosen": -3.174314022064209, "logits/rejected": -2.9219601154327393, "logps/chosen": -194.1986846923828, "logps/rejected": -267.26055908203125, "loss": 0.6686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.010592457838356495, "rewards/margins": 0.05324589088559151, "rewards/margins_max": 0.07371693104505539, "rewards/margins_min": 0.03277484327554703, "rewards/margins_std": 0.028950434178113937, "rewards/rejected": -0.06383834034204483, "step": 200 }, { "epoch": 0.08, "grad_norm": 2.21875, "learning_rate": 4.0076335877862593e-07, "logits/chosen": -3.2590651512145996, "logits/rejected": -3.009448528289795, "logps/chosen": -193.40025329589844, "logps/rejected": -269.24017333984375, "loss": 0.666, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.006985983345657587, "rewards/margins": 0.05964884161949158, "rewards/margins_max": 0.08800722658634186, "rewards/margins_min": 0.03129046410322189, "rewards/margins_std": 0.04010480269789696, "rewards/rejected": -0.06663481891155243, "step": 210 }, { "epoch": 0.08, "grad_norm": 2.234375, "learning_rate": 4.1984732824427476e-07, "logits/chosen": -3.212986469268799, "logits/rejected": -2.872310161590576, "logps/chosen": -245.77197265625, "logps/rejected": -249.1974639892578, "loss": 0.6624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.010857370682060719, "rewards/margins": 0.07341400533914566, "rewards/margins_max": 0.10470987856388092, "rewards/margins_min": 0.0421181321144104, "rewards/margins_std": 0.04425904154777527, "rewards/rejected": -0.08427136391401291, "step": 220 }, { "epoch": 0.09, "grad_norm": 2.09375, "learning_rate": 4.3893129770992364e-07, "logits/chosen": -3.1967101097106934, "logits/rejected": -2.923083782196045, "logps/chosen": -201.79908752441406, "logps/rejected": -248.243896484375, "loss": 0.6585, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0139171052724123, "rewards/margins": 0.06787364184856415, "rewards/margins_max": 0.09401793777942657, "rewards/margins_min": 0.04172936826944351, "rewards/margins_std": 0.03697359561920166, "rewards/rejected": -0.08179076015949249, "step": 230 }, { "epoch": 0.09, "grad_norm": 1.9453125, "learning_rate": 4.5801526717557246e-07, "logits/chosen": -3.285114288330078, "logits/rejected": -2.8961422443389893, "logps/chosen": -205.2576446533203, "logps/rejected": -215.2158203125, "loss": 0.6531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.011311140842735767, "rewards/margins": 0.08977559953927994, "rewards/margins_max": 0.12556931376457214, "rewards/margins_min": 0.05398188903927803, "rewards/margins_std": 0.05061995983123779, "rewards/rejected": -0.10108675062656403, "step": 240 }, { "epoch": 0.1, "grad_norm": 2.140625, "learning_rate": 4.770992366412213e-07, "logits/chosen": -3.273923397064209, "logits/rejected": -2.972449779510498, "logps/chosen": -190.66929626464844, "logps/rejected": -236.36181640625, "loss": 0.65, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.021324489265680313, "rewards/margins": 0.09128403663635254, "rewards/margins_max": 0.1358390599489212, "rewards/margins_min": 0.04672900587320328, "rewards/margins_std": 0.0630103200674057, "rewards/rejected": -0.11260852962732315, "step": 250 }, { "epoch": 0.1, "grad_norm": 2.046875, "learning_rate": 4.961832061068702e-07, "logits/chosen": -3.2471776008605957, "logits/rejected": -2.966447353363037, "logps/chosen": -203.96543884277344, "logps/rejected": -235.2838897705078, "loss": 0.647, "rewards/accuracies": 0.9375, "rewards/chosen": -0.020830674096941948, "rewards/margins": 0.09641597419977188, "rewards/margins_max": 0.1450330764055252, "rewards/margins_min": 0.04779886454343796, "rewards/margins_std": 0.06875498592853546, "rewards/rejected": -0.11724665015935898, "step": 260 }, { "epoch": 0.1, "grad_norm": 2.078125, "learning_rate": 4.99985751383753e-07, "logits/chosen": -3.2874855995178223, "logits/rejected": -3.0439696311950684, "logps/chosen": -198.27664184570312, "logps/rejected": -249.9978485107422, "loss": 0.6469, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.017464281991124153, "rewards/margins": 0.10535471141338348, "rewards/margins_max": 0.1504170447587967, "rewards/margins_min": 0.06029237434267998, "rewards/margins_std": 0.06372777372598648, "rewards/rejected": -0.12281899154186249, "step": 270 }, { "epoch": 0.11, "grad_norm": 2.046875, "learning_rate": 4.999278691638749e-07, "logits/chosen": -3.2457098960876465, "logits/rejected": -3.017002582550049, "logps/chosen": -213.6870880126953, "logps/rejected": -219.8416290283203, "loss": 0.6445, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03185909986495972, "rewards/margins": 0.09040321409702301, "rewards/margins_max": 0.12558069825172424, "rewards/margins_min": 0.055225737392902374, "rewards/margins_std": 0.049748457968235016, "rewards/rejected": -0.12226231396198273, "step": 280 }, { "epoch": 0.11, "grad_norm": 2.28125, "learning_rate": 4.998254731031337e-07, "logits/chosen": -3.2504220008850098, "logits/rejected": -2.9975688457489014, "logps/chosen": -195.15164184570312, "logps/rejected": -227.1262664794922, "loss": 0.637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02987954393029213, "rewards/margins": 0.1159171611070633, "rewards/margins_max": 0.17162121832370758, "rewards/margins_min": 0.060213081538677216, "rewards/margins_std": 0.07877745479345322, "rewards/rejected": -0.14579668641090393, "step": 290 }, { "epoch": 0.11, "grad_norm": 1.953125, "learning_rate": 4.996785814389591e-07, "logits/chosen": -3.309065580368042, "logits/rejected": -2.962916135787964, "logps/chosen": -166.18507385253906, "logps/rejected": -179.7928466796875, "loss": 0.6393, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.03008580207824707, "rewards/margins": 0.10214884579181671, "rewards/margins_max": 0.15787146985530853, "rewards/margins_min": 0.04642622545361519, "rewards/margins_std": 0.07880368083715439, "rewards/rejected": -0.13223466277122498, "step": 300 }, { "epoch": 0.12, "grad_norm": 2.4375, "learning_rate": 4.994872203337482e-07, "logits/chosen": -3.257145643234253, "logits/rejected": -2.9680373668670654, "logps/chosen": -202.3374786376953, "logps/rejected": -219.5332489013672, "loss": 0.6225, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04237338528037071, "rewards/margins": 0.12682045996189117, "rewards/margins_max": 0.18470799922943115, "rewards/margins_min": 0.0689328983426094, "rewards/margins_std": 0.0818653553724289, "rewards/rejected": -0.169193834066391, "step": 310 }, { "epoch": 0.12, "grad_norm": 1.9140625, "learning_rate": 4.992514238702059e-07, "logits/chosen": -3.314955234527588, "logits/rejected": -3.1039557456970215, "logps/chosen": -199.2044219970703, "logps/rejected": -245.0098876953125, "loss": 0.6227, "rewards/accuracies": 0.9375, "rewards/chosen": -0.042611610144376755, "rewards/margins": 0.15529310703277588, "rewards/margins_max": 0.21862797439098358, "rewards/margins_min": 0.09195823222398758, "rewards/margins_std": 0.08956903219223022, "rewards/rejected": -0.19790470600128174, "step": 320 }, { "epoch": 0.13, "grad_norm": 2.203125, "learning_rate": 4.989712340452743e-07, "logits/chosen": -3.231292247772217, "logits/rejected": -2.928297519683838, "logps/chosen": -197.5637969970703, "logps/rejected": -262.58245849609375, "loss": 0.6165, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06006884574890137, "rewards/margins": 0.16975189745426178, "rewards/margins_max": 0.24120891094207764, "rewards/margins_min": 0.09829487651586533, "rewards/margins_std": 0.10105548053979874, "rewards/rejected": -0.22982072830200195, "step": 330 }, { "epoch": 0.13, "grad_norm": 2.140625, "learning_rate": 4.986467007626528e-07, "logits/chosen": -3.212827682495117, "logits/rejected": -2.941976547241211, "logps/chosen": -223.7288818359375, "logps/rejected": -274.39337158203125, "loss": 0.6163, "rewards/accuracies": 0.875, "rewards/chosen": -0.06747810542583466, "rewards/margins": 0.15132711827754974, "rewards/margins_max": 0.23900623619556427, "rewards/margins_min": 0.06364797800779343, "rewards/margins_std": 0.12399701774120331, "rewards/rejected": -0.2188052237033844, "step": 340 }, { "epoch": 0.13, "grad_norm": 2.234375, "learning_rate": 4.982778818239101e-07, "logits/chosen": -3.303295612335205, "logits/rejected": -3.0548179149627686, "logps/chosen": -181.9226837158203, "logps/rejected": -212.4131622314453, "loss": 0.6097, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06988344341516495, "rewards/margins": 0.1707289218902588, "rewards/margins_max": 0.2409042865037918, "rewards/margins_min": 0.10055355727672577, "rewards/margins_std": 0.09924294799566269, "rewards/rejected": -0.24061235785484314, "step": 350 }, { "epoch": 0.14, "grad_norm": 2.578125, "learning_rate": 4.978648429181893e-07, "logits/chosen": -3.270679473876953, "logits/rejected": -3.0009212493896484, "logps/chosen": -186.18716430664062, "logps/rejected": -220.5108642578125, "loss": 0.6079, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.07348515838384628, "rewards/margins": 0.1932184398174286, "rewards/margins_max": 0.274120956659317, "rewards/margins_min": 0.11231593787670135, "rewards/margins_std": 0.11441340297460556, "rewards/rejected": -0.26670360565185547, "step": 360 }, { "epoch": 0.14, "grad_norm": 2.46875, "learning_rate": 4.97407657610508e-07, "logits/chosen": -3.271900177001953, "logits/rejected": -3.0213537216186523, "logps/chosen": -203.10650634765625, "logps/rejected": -261.69158935546875, "loss": 0.5911, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07118692994117737, "rewards/margins": 0.21831946074962616, "rewards/margins_max": 0.3098284602165222, "rewards/margins_min": 0.1268104463815689, "rewards/margins_std": 0.12941327691078186, "rewards/rejected": -0.28950637578964233, "step": 370 }, { "epoch": 0.15, "grad_norm": 2.265625, "learning_rate": 4.969064073286563e-07, "logits/chosen": -3.357651472091675, "logits/rejected": -3.0608596801757812, "logps/chosen": -208.51754760742188, "logps/rejected": -262.3056640625, "loss": 0.5993, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09235924482345581, "rewards/margins": 0.19000664353370667, "rewards/margins_max": 0.29324302077293396, "rewards/margins_min": 0.08677025139331818, "rewards/margins_std": 0.14599831402301788, "rewards/rejected": -0.2823658585548401, "step": 380 }, { "epoch": 0.15, "grad_norm": 2.109375, "learning_rate": 4.963611813486935e-07, "logits/chosen": -3.2650883197784424, "logits/rejected": -2.899033308029175, "logps/chosen": -235.99942016601562, "logps/rejected": -312.6029357910156, "loss": 0.5716, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.08987380564212799, "rewards/margins": 0.28126034140586853, "rewards/margins_max": 0.40632572770118713, "rewards/margins_min": 0.15619491040706635, "rewards/margins_std": 0.1768692135810852, "rewards/rejected": -0.3711341321468353, "step": 390 }, { "epoch": 0.15, "grad_norm": 2.375, "learning_rate": 4.957720767790477e-07, "logits/chosen": -3.2625536918640137, "logits/rejected": -3.0297203063964844, "logps/chosen": -189.33514404296875, "logps/rejected": -244.502685546875, "loss": 0.5818, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09811891615390778, "rewards/margins": 0.2365761548280716, "rewards/margins_max": 0.33593007922172546, "rewards/margins_min": 0.13722223043441772, "rewards/margins_std": 0.14050766825675964, "rewards/rejected": -0.334695041179657, "step": 400 }, { "epoch": 0.16, "grad_norm": 2.28125, "learning_rate": 4.951391985432198e-07, "logits/chosen": -3.3009209632873535, "logits/rejected": -3.0335850715637207, "logps/chosen": -176.28749084472656, "logps/rejected": -214.1027374267578, "loss": 0.574, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1052820086479187, "rewards/margins": 0.2626669108867645, "rewards/margins_max": 0.3927062153816223, "rewards/margins_min": 0.13262765109539032, "rewards/margins_std": 0.1839032918214798, "rewards/rejected": -0.3679489493370056, "step": 410 }, { "epoch": 0.16, "grad_norm": 2.46875, "learning_rate": 4.944626593610968e-07, "logits/chosen": -3.2991390228271484, "logits/rejected": -2.928713321685791, "logps/chosen": -231.21533203125, "logps/rejected": -282.78961181640625, "loss": 0.5844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12326918542385101, "rewards/margins": 0.2481236755847931, "rewards/margins_max": 0.35814613103866577, "rewards/margins_min": 0.13810119032859802, "rewards/margins_std": 0.15559527277946472, "rewards/rejected": -0.3713928461074829, "step": 420 }, { "epoch": 0.16, "grad_norm": 2.484375, "learning_rate": 4.937425797288742e-07, "logits/chosen": -3.2530465126037598, "logits/rejected": -3.0559334754943848, "logps/chosen": -190.10829162597656, "logps/rejected": -273.42584228515625, "loss": 0.549, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13329292833805084, "rewards/margins": 0.29958730936050415, "rewards/margins_max": 0.4274858832359314, "rewards/margins_min": 0.17168866097927094, "rewards/margins_std": 0.1808759868144989, "rewards/rejected": -0.4328802227973938, "step": 430 }, { "epoch": 0.17, "grad_norm": 2.609375, "learning_rate": 4.929790878975965e-07, "logits/chosen": -3.194209098815918, "logits/rejected": -2.93556547164917, "logps/chosen": -225.93896484375, "logps/rejected": -259.15179443359375, "loss": 0.5657, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13958829641342163, "rewards/margins": 0.27957916259765625, "rewards/margins_max": 0.4229784905910492, "rewards/margins_min": 0.13617977499961853, "rewards/margins_std": 0.2027973234653473, "rewards/rejected": -0.4191674590110779, "step": 440 }, { "epoch": 0.17, "grad_norm": 2.390625, "learning_rate": 4.921723198503132e-07, "logits/chosen": -3.26872181892395, "logits/rejected": -3.0192275047302246, "logps/chosen": -201.65443420410156, "logps/rejected": -289.94171142578125, "loss": 0.5472, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15182039141654968, "rewards/margins": 0.33517175912857056, "rewards/margins_max": 0.4883649945259094, "rewards/margins_min": 0.18197841942310333, "rewards/margins_std": 0.21664805710315704, "rewards/rejected": -0.4869921803474426, "step": 450 }, { "epoch": 0.18, "grad_norm": 2.953125, "learning_rate": 4.913224192778603e-07, "logits/chosen": -3.334212064743042, "logits/rejected": -3.1082167625427246, "logps/chosen": -228.478759765625, "logps/rejected": -311.8726501464844, "loss": 0.5477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.15663278102874756, "rewards/margins": 0.3697594106197357, "rewards/margins_max": 0.536509096622467, "rewards/margins_min": 0.20300979912281036, "rewards/margins_std": 0.23581957817077637, "rewards/rejected": -0.5263921618461609, "step": 460 }, { "epoch": 0.18, "grad_norm": 2.6875, "learning_rate": 4.90429537553268e-07, "logits/chosen": -3.2931313514709473, "logits/rejected": -3.0807833671569824, "logps/chosen": -217.96719360351562, "logps/rejected": -276.76202392578125, "loss": 0.5404, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.19370004534721375, "rewards/margins": 0.32050028443336487, "rewards/margins_max": 0.4694735109806061, "rewards/margins_min": 0.17152710258960724, "rewards/margins_std": 0.21067988872528076, "rewards/rejected": -0.5142003297805786, "step": 470 }, { "epoch": 0.18, "grad_norm": 2.921875, "learning_rate": 4.894938337047995e-07, "logits/chosen": -3.20732045173645, "logits/rejected": -2.931908369064331, "logps/chosen": -264.7499694824219, "logps/rejected": -318.031005859375, "loss": 0.526, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20364347100257874, "rewards/margins": 0.42259034514427185, "rewards/margins_max": 0.5794626474380493, "rewards/margins_min": 0.2657180726528168, "rewards/margins_std": 0.2218509465456009, "rewards/rejected": -0.6262338757514954, "step": 480 }, { "epoch": 0.19, "grad_norm": 2.8125, "learning_rate": 4.885154743876277e-07, "logits/chosen": -3.3099350929260254, "logits/rejected": -3.021787643432617, "logps/chosen": -200.65884399414062, "logps/rejected": -255.55300903320312, "loss": 0.5332, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19876748323440552, "rewards/margins": 0.349515825510025, "rewards/margins_max": 0.5269657373428345, "rewards/margins_min": 0.17206597328186035, "rewards/margins_std": 0.2509520649909973, "rewards/rejected": -0.5482833385467529, "step": 490 }, { "epoch": 0.19, "grad_norm": 2.734375, "learning_rate": 4.87494633854152e-07, "logits/chosen": -3.3137099742889404, "logits/rejected": -2.9903388023376465, "logps/chosen": -208.6374969482422, "logps/rejected": -282.22052001953125, "loss": 0.5294, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.23697516322135925, "rewards/margins": 0.4222859740257263, "rewards/margins_max": 0.6355811357498169, "rewards/margins_min": 0.20899085700511932, "rewards/margins_std": 0.30164486169815063, "rewards/rejected": -0.6592611074447632, "step": 500 }, { "epoch": 0.19, "grad_norm": 2.15625, "learning_rate": 4.864314939229637e-07, "logits/chosen": -3.306776523590088, "logits/rejected": -3.001708507537842, "logps/chosen": -255.39126586914062, "logps/rejected": -296.8736572265625, "loss": 0.5311, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.2626154124736786, "rewards/margins": 0.394510954618454, "rewards/margins_max": 0.6293004751205444, "rewards/margins_min": 0.1597214937210083, "rewards/margins_std": 0.332042396068573, "rewards/rejected": -0.6571264266967773, "step": 510 }, { "epoch": 0.2, "grad_norm": 2.734375, "learning_rate": 4.853262439464624e-07, "logits/chosen": -3.235895872116089, "logits/rejected": -2.9632811546325684, "logps/chosen": -203.52279663085938, "logps/rejected": -269.3291320800781, "loss": 0.5338, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.228928804397583, "rewards/margins": 0.3607742190361023, "rewards/margins_max": 0.5433238744735718, "rewards/margins_min": 0.17822448909282684, "rewards/margins_std": 0.25816428661346436, "rewards/rejected": -0.5897030234336853, "step": 520 }, { "epoch": 0.2, "grad_norm": 2.96875, "learning_rate": 4.841790807771307e-07, "logits/chosen": -3.166623592376709, "logits/rejected": -2.9125328063964844, "logps/chosen": -236.2999267578125, "logps/rejected": -336.2409362792969, "loss": 0.5043, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2680327296257019, "rewards/margins": 0.46621331572532654, "rewards/margins_max": 0.655846357345581, "rewards/margins_min": 0.2765803337097168, "rewards/margins_std": 0.26818156242370605, "rewards/rejected": -0.7342461347579956, "step": 530 }, { "epoch": 0.21, "grad_norm": 3.015625, "learning_rate": 4.82990208732474e-07, "logits/chosen": -3.3195443153381348, "logits/rejected": -3.0613291263580322, "logps/chosen": -174.39599609375, "logps/rejected": -257.5628967285156, "loss": 0.5302, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.25635436177253723, "rewards/margins": 0.4157690107822418, "rewards/margins_max": 0.6029486060142517, "rewards/margins_min": 0.22858937084674835, "rewards/margins_std": 0.26471197605133057, "rewards/rejected": -0.6721233129501343, "step": 540 }, { "epoch": 0.21, "grad_norm": 2.921875, "learning_rate": 4.817598395586301e-07, "logits/chosen": -3.2022743225097656, "logits/rejected": -2.887551784515381, "logps/chosen": -214.01513671875, "logps/rejected": -309.3605651855469, "loss": 0.4893, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24757170677185059, "rewards/margins": 0.5083514451980591, "rewards/margins_max": 0.7038620114326477, "rewards/margins_min": 0.31284087896347046, "rewards/margins_std": 0.27649372816085815, "rewards/rejected": -0.7559231519699097, "step": 550 }, { "epoch": 0.21, "grad_norm": 2.90625, "learning_rate": 4.804881923926556e-07, "logits/chosen": -3.2066566944122314, "logits/rejected": -2.9500646591186523, "logps/chosen": -216.474609375, "logps/rejected": -268.8821716308594, "loss": 0.5012, "rewards/accuracies": 0.875, "rewards/chosen": -0.30655044317245483, "rewards/margins": 0.4422832429409027, "rewards/margins_max": 0.6758220195770264, "rewards/margins_min": 0.20874443650245667, "rewards/margins_std": 0.33027374744415283, "rewards/rejected": -0.7488336563110352, "step": 560 }, { "epoch": 0.22, "grad_norm": 3.046875, "learning_rate": 4.791754937234961e-07, "logits/chosen": -3.171912670135498, "logits/rejected": -2.918295383453369, "logps/chosen": -221.5971221923828, "logps/rejected": -293.3380432128906, "loss": 0.4949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.29368987679481506, "rewards/margins": 0.5147393941879272, "rewards/margins_max": 0.7441326379776001, "rewards/margins_min": 0.28534621000289917, "rewards/margins_std": 0.32441097497940063, "rewards/rejected": -0.8084293603897095, "step": 570 }, { "epoch": 0.22, "grad_norm": 2.9375, "learning_rate": 4.778219773516472e-07, "logits/chosen": -3.1883535385131836, "logits/rejected": -2.9556570053100586, "logps/chosen": -247.1591796875, "logps/rejected": -354.550048828125, "loss": 0.4733, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3137117326259613, "rewards/margins": 0.5875142812728882, "rewards/margins_max": 0.8420342206954956, "rewards/margins_min": 0.3329945206642151, "rewards/margins_std": 0.3599454462528229, "rewards/rejected": -0.9012260437011719, "step": 580 }, { "epoch": 0.23, "grad_norm": 3.5, "learning_rate": 4.764278843475128e-07, "logits/chosen": -3.222123622894287, "logits/rejected": -2.993802547454834, "logps/chosen": -205.1719512939453, "logps/rejected": -280.248291015625, "loss": 0.4764, "rewards/accuracies": 0.9375, "rewards/chosen": -0.36355796456336975, "rewards/margins": 0.5085680484771729, "rewards/margins_max": 0.7664892673492432, "rewards/margins_min": 0.2506466507911682, "rewards/margins_std": 0.3647558093070984, "rewards/rejected": -0.8721259832382202, "step": 590 }, { "epoch": 0.23, "grad_norm": 2.65625, "learning_rate": 4.749934630084691e-07, "logits/chosen": -3.3156044483184814, "logits/rejected": -3.007734775543213, "logps/chosen": -259.8424377441406, "logps/rejected": -326.71185302734375, "loss": 0.5174, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.30121082067489624, "rewards/margins": 0.45573800802230835, "rewards/margins_max": 0.6853656768798828, "rewards/margins_min": 0.22611038386821747, "rewards/margins_std": 0.32474246621131897, "rewards/rejected": -0.7569488286972046, "step": 600 }, { "epoch": 0.23, "grad_norm": 2.859375, "learning_rate": 4.735189688146409e-07, "logits/chosen": -3.2011077404022217, "logits/rejected": -2.964134931564331, "logps/chosen": -244.4047088623047, "logps/rejected": -343.80706787109375, "loss": 0.4885, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.37781041860580444, "rewards/margins": 0.52849942445755, "rewards/margins_max": 0.8547603487968445, "rewards/margins_min": 0.20223841071128845, "rewards/margins_std": 0.46140265464782715, "rewards/rejected": -0.9063097834587097, "step": 610 }, { "epoch": 0.24, "grad_norm": 2.9375, "learning_rate": 4.7200466438339916e-07, "logits/chosen": -3.2470192909240723, "logits/rejected": -2.94683575630188, "logps/chosen": -260.84661865234375, "logps/rejected": -325.08734130859375, "loss": 0.484, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3126043975353241, "rewards/margins": 0.5272418260574341, "rewards/margins_max": 0.8143749237060547, "rewards/margins_min": 0.24010880291461945, "rewards/margins_std": 0.40606746077537537, "rewards/rejected": -0.8398463129997253, "step": 620 }, { "epoch": 0.24, "grad_norm": 3.0, "learning_rate": 4.704508194225866e-07, "logits/chosen": -3.2237815856933594, "logits/rejected": -2.926332950592041, "logps/chosen": -244.0590362548828, "logps/rejected": -343.0704345703125, "loss": 0.4808, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.39641642570495605, "rewards/margins": 0.609396755695343, "rewards/margins_max": 0.9228858947753906, "rewards/margins_min": 0.295907586812973, "rewards/margins_std": 0.44334059953689575, "rewards/rejected": -1.0058131217956543, "step": 630 }, { "epoch": 0.24, "grad_norm": 3.09375, "learning_rate": 4.688577106824814e-07, "logits/chosen": -3.252931594848633, "logits/rejected": -2.984818935394287, "logps/chosen": -207.40176391601562, "logps/rejected": -314.35943603515625, "loss": 0.4617, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.32950547337532043, "rewards/margins": 0.6272681951522827, "rewards/margins_max": 0.8804882168769836, "rewards/margins_min": 0.3740481734275818, "rewards/margins_std": 0.35810714960098267, "rewards/rejected": -0.956773579120636, "step": 640 }, { "epoch": 0.25, "grad_norm": 3.40625, "learning_rate": 4.672256219065059e-07, "logits/chosen": -3.2750720977783203, "logits/rejected": -2.9872028827667236, "logps/chosen": -234.55569458007812, "logps/rejected": -319.52337646484375, "loss": 0.4489, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4240082800388336, "rewards/margins": 0.6940449476242065, "rewards/margins_max": 1.0679466724395752, "rewards/margins_min": 0.3201431334018707, "rewards/margins_std": 0.5287769436836243, "rewards/rejected": -1.1180531978607178, "step": 650 }, { "epoch": 0.25, "grad_norm": 3.03125, "learning_rate": 4.655548437806902e-07, "logits/chosen": -3.146660327911377, "logits/rejected": -2.780895471572876, "logps/chosen": -292.53155517578125, "logps/rejected": -379.9189453125, "loss": 0.444, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.43200021982192993, "rewards/margins": 0.7211217880249023, "rewards/margins_max": 1.0315678119659424, "rewards/margins_min": 0.41067585349082947, "rewards/margins_std": 0.4390367865562439, "rewards/rejected": -1.153122067451477, "step": 660 }, { "epoch": 0.26, "grad_norm": 3.578125, "learning_rate": 4.6384567388189835e-07, "logits/chosen": -3.1456127166748047, "logits/rejected": -2.8448398113250732, "logps/chosen": -241.07363891601562, "logps/rejected": -331.47509765625, "loss": 0.4397, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4291381239891052, "rewards/margins": 0.6892618536949158, "rewards/margins_max": 0.9590433835983276, "rewards/margins_min": 0.41948023438453674, "rewards/margins_std": 0.3815288245677948, "rewards/rejected": -1.118399977684021, "step": 670 }, { "epoch": 0.26, "grad_norm": 3.046875, "learning_rate": 4.6209841662482874e-07, "logits/chosen": -3.130969524383545, "logits/rejected": -2.9060287475585938, "logps/chosen": -216.61447143554688, "logps/rejected": -346.3977355957031, "loss": 0.4242, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42712074518203735, "rewards/margins": 0.7977662682533264, "rewards/margins_max": 1.103691816329956, "rewards/margins_min": 0.4918406009674072, "rewards/margins_std": 0.4326442778110504, "rewards/rejected": -1.2248871326446533, "step": 680 }, { "epoch": 0.26, "grad_norm": 3.46875, "learning_rate": 4.603133832077953e-07, "logits/chosen": -3.1673851013183594, "logits/rejected": -2.961228609085083, "logps/chosen": -228.9781036376953, "logps/rejected": -322.8971252441406, "loss": 0.4451, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.49834170937538147, "rewards/margins": 0.6934456825256348, "rewards/margins_max": 1.0356100797653198, "rewards/margins_min": 0.3512812554836273, "rewards/margins_std": 0.4838935434818268, "rewards/rejected": -1.1917873620986938, "step": 690 }, { "epoch": 0.27, "grad_norm": 3.265625, "learning_rate": 4.58490891557301e-07, "logits/chosen": -3.1827971935272217, "logits/rejected": -2.9217400550842285, "logps/chosen": -214.40975952148438, "logps/rejected": -327.9997253417969, "loss": 0.4295, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.47098636627197266, "rewards/margins": 0.7713015675544739, "rewards/margins_max": 1.1291908025741577, "rewards/margins_min": 0.4134122431278229, "rewards/margins_std": 0.5061318874359131, "rewards/rejected": -1.2422878742218018, "step": 700 }, { "epoch": 0.27, "grad_norm": 5.03125, "learning_rate": 4.5663126627141346e-07, "logits/chosen": -3.233410596847534, "logits/rejected": -2.9121780395507812, "logps/chosen": -264.9963684082031, "logps/rejected": -351.5663757324219, "loss": 0.4268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5588346123695374, "rewards/margins": 0.7766939997673035, "rewards/margins_max": 1.2045036554336548, "rewards/margins_min": 0.3488844037055969, "rewards/margins_std": 0.6050141453742981, "rewards/rejected": -1.3355286121368408, "step": 710 }, { "epoch": 0.28, "grad_norm": 3.453125, "learning_rate": 4.5473483856195085e-07, "logits/chosen": -3.1042134761810303, "logits/rejected": -2.884307384490967, "logps/chosen": -278.4771423339844, "logps/rejected": -435.12518310546875, "loss": 0.411, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5594338774681091, "rewards/margins": 0.87383633852005, "rewards/margins_max": 1.3376729488372803, "rewards/margins_min": 0.4099995493888855, "rewards/margins_std": 0.6559640765190125, "rewards/rejected": -1.4332702159881592, "step": 720 }, { "epoch": 0.28, "grad_norm": 3.515625, "learning_rate": 4.5280194619549197e-07, "logits/chosen": -3.1154184341430664, "logits/rejected": -2.8020920753479004, "logps/chosen": -263.41326904296875, "logps/rejected": -353.222900390625, "loss": 0.4186, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5418119430541992, "rewards/margins": 0.9471070170402527, "rewards/margins_max": 1.409268856048584, "rewards/margins_min": 0.48494523763656616, "rewards/margins_std": 0.653595507144928, "rewards/rejected": -1.4889190196990967, "step": 730 }, { "epoch": 0.28, "grad_norm": 4.375, "learning_rate": 4.50832933433217e-07, "logits/chosen": -3.1423096656799316, "logits/rejected": -2.93670654296875, "logps/chosen": -230.8040008544922, "logps/rejected": -336.51263427734375, "loss": 0.4317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.539901077747345, "rewards/margins": 0.7871753573417664, "rewards/margins_max": 1.2703603506088257, "rewards/margins_min": 0.3039904236793518, "rewards/margins_std": 0.6833267211914062, "rewards/rejected": -1.3270765542984009, "step": 740 }, { "epoch": 0.29, "grad_norm": 6.28125, "learning_rate": 4.4882815096959246e-07, "logits/chosen": -3.125920534133911, "logits/rejected": -2.9035699367523193, "logps/chosen": -220.22531127929688, "logps/rejected": -358.2559509277344, "loss": 0.4116, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5155752897262573, "rewards/margins": 0.72476726770401, "rewards/margins_max": 1.1075283288955688, "rewards/margins_min": 0.3420061469078064, "rewards/margins_std": 0.5413058996200562, "rewards/rejected": -1.2403424978256226, "step": 750 }, { "epoch": 0.29, "grad_norm": 3.5, "learning_rate": 4.4678795586991023e-07, "logits/chosen": -3.1662585735321045, "logits/rejected": -2.9570212364196777, "logps/chosen": -251.75430297851562, "logps/rejected": -413.55145263671875, "loss": 0.415, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.561039924621582, "rewards/margins": 0.8251296877861023, "rewards/margins_max": 1.2652571201324463, "rewards/margins_min": 0.38500216603279114, "rewards/margins_std": 0.6224343180656433, "rewards/rejected": -1.3861695528030396, "step": 760 }, { "epoch": 0.29, "grad_norm": 3.53125, "learning_rate": 4.447127115066919e-07, "logits/chosen": -3.15718412399292, "logits/rejected": -2.9018607139587402, "logps/chosen": -254.60348510742188, "logps/rejected": -410.89581298828125, "loss": 0.4225, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5274133682250977, "rewards/margins": 0.9205818176269531, "rewards/margins_max": 1.3906073570251465, "rewards/margins_min": 0.45055636763572693, "rewards/margins_std": 0.664716362953186, "rewards/rejected": -1.4479950666427612, "step": 770 }, { "epoch": 0.3, "grad_norm": 4.375, "learning_rate": 4.4260278749496916e-07, "logits/chosen": -3.1671698093414307, "logits/rejected": -2.9417810440063477, "logps/chosen": -236.66024780273438, "logps/rejected": -383.62213134765625, "loss": 0.4109, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6313245296478271, "rewards/margins": 1.054483413696289, "rewards/margins_max": 1.598443865776062, "rewards/margins_min": 0.5105229616165161, "rewards/margins_std": 0.7692762613296509, "rewards/rejected": -1.6858079433441162, "step": 780 }, { "epoch": 0.3, "grad_norm": 4.40625, "learning_rate": 4.4045855962645363e-07, "logits/chosen": -3.073002576828003, "logits/rejected": -2.8750250339508057, "logps/chosen": -236.89315795898438, "logps/rejected": -391.52423095703125, "loss": 0.3795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.54277503490448, "rewards/margins": 0.9837581515312195, "rewards/margins_max": 1.4403795003890991, "rewards/margins_min": 0.5271369814872742, "rewards/margins_std": 0.6457599997520447, "rewards/rejected": -1.5265332460403442, "step": 790 }, { "epoch": 0.31, "grad_norm": 4.09375, "learning_rate": 4.3828040980260504e-07, "logits/chosen": -3.178225517272949, "logits/rejected": -2.885007381439209, "logps/chosen": -285.74554443359375, "logps/rejected": -402.0855407714844, "loss": 0.3933, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6648572683334351, "rewards/margins": 1.0061523914337158, "rewards/margins_max": 1.521410346031189, "rewards/margins_min": 0.4908943176269531, "rewards/margins_std": 0.7286848425865173, "rewards/rejected": -1.6710094213485718, "step": 800 }, { "epoch": 0.31, "grad_norm": 4.15625, "learning_rate": 4.360687259666129e-07, "logits/chosen": -3.152057647705078, "logits/rejected": -2.9315738677978516, "logps/chosen": -284.7326965332031, "logps/rejected": -381.3499450683594, "loss": 0.4157, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.646824061870575, "rewards/margins": 0.8092839121818542, "rewards/margins_max": 1.2926756143569946, "rewards/margins_min": 0.32589206099510193, "rewards/margins_std": 0.6836191415786743, "rewards/rejected": -1.4561078548431396, "step": 810 }, { "epoch": 0.31, "grad_norm": 5.59375, "learning_rate": 4.3382390203430015e-07, "logits/chosen": -3.1908488273620605, "logits/rejected": -2.944361925125122, "logps/chosen": -296.958251953125, "logps/rejected": -394.0612487792969, "loss": 0.3889, "rewards/accuracies": 0.875, "rewards/chosen": -0.818184494972229, "rewards/margins": 0.8906744122505188, "rewards/margins_max": 1.3960387706756592, "rewards/margins_min": 0.3853098750114441, "rewards/margins_std": 0.7146932482719421, "rewards/rejected": -1.7088590860366821, "step": 820 }, { "epoch": 0.32, "grad_norm": 5.125, "learning_rate": 4.3154633782396493e-07, "logits/chosen": -3.162999153137207, "logits/rejected": -2.876415729522705, "logps/chosen": -304.06121826171875, "logps/rejected": -402.8589172363281, "loss": 0.4076, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.8082460165023804, "rewards/margins": 1.0327093601226807, "rewards/margins_max": 1.5921701192855835, "rewards/margins_min": 0.47324857115745544, "rewards/margins_std": 0.7911970615386963, "rewards/rejected": -1.840955376625061, "step": 830 }, { "epoch": 0.32, "grad_norm": 3.28125, "learning_rate": 4.2923643898516983e-07, "logits/chosen": -3.105457067489624, "logits/rejected": -2.8327999114990234, "logps/chosen": -313.3583068847656, "logps/rejected": -489.62060546875, "loss": 0.3631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.668003499507904, "rewards/margins": 1.2182451486587524, "rewards/margins_max": 1.824080228805542, "rewards/margins_min": 0.6124096512794495, "rewards/margins_std": 0.8567806482315063, "rewards/rejected": -1.8862483501434326, "step": 840 }, { "epoch": 0.32, "grad_norm": 3.640625, "learning_rate": 4.268946169264932e-07, "logits/chosen": -3.1345181465148926, "logits/rejected": -2.9487783908843994, "logps/chosen": -230.07131958007812, "logps/rejected": -395.2250671386719, "loss": 0.3687, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7640656232833862, "rewards/margins": 1.161650538444519, "rewards/margins_max": 1.6811326742172241, "rewards/margins_min": 0.6421682238578796, "rewards/margins_std": 0.7346588969230652, "rewards/rejected": -1.9257161617279053, "step": 850 }, { "epoch": 0.33, "grad_norm": 4.8125, "learning_rate": 4.245212887422542e-07, "logits/chosen": -3.1086442470550537, "logits/rejected": -2.9294607639312744, "logps/chosen": -273.58416748046875, "logps/rejected": -438.7652282714844, "loss": 0.3613, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8308951258659363, "rewards/margins": 1.2336242198944092, "rewards/margins_max": 1.882441520690918, "rewards/margins_min": 0.5848069190979004, "rewards/margins_std": 0.917566180229187, "rewards/rejected": -2.0645194053649902, "step": 860 }, { "epoch": 0.33, "grad_norm": 4.1875, "learning_rate": 4.2211687713822574e-07, "logits/chosen": -3.089200019836426, "logits/rejected": -2.8396849632263184, "logps/chosen": -317.65618896484375, "logps/rejected": -468.9850158691406, "loss": 0.3686, "rewards/accuracies": 1.0, "rewards/chosen": -0.7857069373130798, "rewards/margins": 1.1800651550292969, "rewards/margins_max": 1.7438383102416992, "rewards/margins_min": 0.6162917017936707, "rewards/margins_std": 0.797295868396759, "rewards/rejected": -1.965772032737732, "step": 870 }, { "epoch": 0.34, "grad_norm": 5.28125, "learning_rate": 4.196818103563477e-07, "logits/chosen": -3.1307501792907715, "logits/rejected": -2.890303134918213, "logps/chosen": -284.9822692871094, "logps/rejected": -456.78582763671875, "loss": 0.3578, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8763874769210815, "rewards/margins": 1.411645531654358, "rewards/margins_max": 2.090603828430176, "rewards/margins_min": 0.7326871156692505, "rewards/margins_std": 0.960192084312439, "rewards/rejected": -2.2880330085754395, "step": 880 }, { "epoch": 0.34, "grad_norm": 4.625, "learning_rate": 4.172165220984541e-07, "logits/chosen": -3.075063467025757, "logits/rejected": -2.7540695667266846, "logps/chosen": -315.30389404296875, "logps/rejected": -419.2861328125, "loss": 0.3873, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8819743394851685, "rewards/margins": 0.9528709650039673, "rewards/margins_max": 1.3681405782699585, "rewards/margins_min": 0.5376013517379761, "rewards/margins_std": 0.5872799754142761, "rewards/rejected": -1.8348453044891357, "step": 890 }, { "epoch": 0.34, "grad_norm": 3.625, "learning_rate": 4.1472145144902775e-07, "logits/chosen": -3.0279951095581055, "logits/rejected": -2.79876708984375, "logps/chosen": -313.17828369140625, "logps/rejected": -494.333984375, "loss": 0.3517, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9009196162223816, "rewards/margins": 1.3655102252960205, "rewards/margins_max": 2.058493137359619, "rewards/margins_min": 0.6725271344184875, "rewards/margins_std": 0.9800260663032532, "rewards/rejected": -2.2664296627044678, "step": 900 }, { "epoch": 0.35, "grad_norm": 4.5, "learning_rate": 4.121970427969966e-07, "logits/chosen": -3.0823280811309814, "logits/rejected": -2.7526888847351074, "logps/chosen": -274.48516845703125, "logps/rejected": -379.95843505859375, "loss": 0.3596, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8261201977729797, "rewards/margins": 1.091605544090271, "rewards/margins_max": 1.6673996448516846, "rewards/margins_min": 0.5158115029335022, "rewards/margins_std": 0.8142956495285034, "rewards/rejected": -1.9177258014678955, "step": 910 }, { "epoch": 0.35, "grad_norm": 5.125, "learning_rate": 4.0964374575658496e-07, "logits/chosen": -3.0822136402130127, "logits/rejected": -2.8262829780578613, "logps/chosen": -312.8172912597656, "logps/rejected": -474.4310607910156, "loss": 0.3671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8950836062431335, "rewards/margins": 1.219533085823059, "rewards/margins_max": 1.8111814260482788, "rewards/margins_min": 0.6278846859931946, "rewards/margins_std": 0.8367172479629517, "rewards/rejected": -2.114616870880127, "step": 920 }, { "epoch": 0.36, "grad_norm": 5.21875, "learning_rate": 4.070620150872339e-07, "logits/chosen": -3.046312093734741, "logits/rejected": -2.8204238414764404, "logps/chosen": -294.24627685546875, "logps/rejected": -452.1871032714844, "loss": 0.3853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.867325484752655, "rewards/margins": 1.1414000988006592, "rewards/margins_max": 1.6486103534698486, "rewards/margins_min": 0.6341898441314697, "rewards/margins_std": 0.717303454875946, "rewards/rejected": -2.008725643157959, "step": 930 }, { "epoch": 0.36, "grad_norm": 5.71875, "learning_rate": 4.044523106126061e-07, "logits/chosen": -3.0358595848083496, "logits/rejected": -2.808770179748535, "logps/chosen": -295.25604248046875, "logps/rejected": -459.0888671875, "loss": 0.3634, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7942778468132019, "rewards/margins": 1.0517923831939697, "rewards/margins_max": 1.6069154739379883, "rewards/margins_min": 0.49666935205459595, "rewards/margins_std": 0.7850624322891235, "rewards/rejected": -1.8460700511932373, "step": 940 }, { "epoch": 0.36, "grad_norm": 7.875, "learning_rate": 4.0181509713868765e-07, "logits/chosen": -3.1421244144439697, "logits/rejected": -2.8746113777160645, "logps/chosen": -295.7052917480469, "logps/rejected": -496.74700927734375, "loss": 0.3753, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8820911645889282, "rewards/margins": 1.6108951568603516, "rewards/margins_max": 2.3748936653137207, "rewards/margins_min": 0.8468970060348511, "rewards/margins_std": 1.0804564952850342, "rewards/rejected": -2.4929862022399902, "step": 950 }, { "epoch": 0.37, "grad_norm": 8.125, "learning_rate": 3.991508443710031e-07, "logits/chosen": -3.188358783721924, "logits/rejected": -2.9022974967956543, "logps/chosen": -326.6969299316406, "logps/rejected": -470.706787109375, "loss": 0.3591, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9822729229927063, "rewards/margins": 1.193819522857666, "rewards/margins_max": 1.7589439153671265, "rewards/margins_min": 0.628695011138916, "rewards/margins_std": 0.7992067337036133, "rewards/rejected": -2.1760921478271484, "step": 960 }, { "epoch": 0.37, "grad_norm": 7.78125, "learning_rate": 3.9646002683095794e-07, "logits/chosen": -3.1077704429626465, "logits/rejected": -2.8467211723327637, "logps/chosen": -280.2427673339844, "logps/rejected": -415.1084899902344, "loss": 0.3603, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8817678689956665, "rewards/margins": 1.1791865825653076, "rewards/margins_max": 1.6983654499053955, "rewards/margins_min": 0.660007655620575, "rewards/margins_std": 0.7342298030853271, "rewards/rejected": -2.0609545707702637, "step": 970 }, { "epoch": 0.37, "grad_norm": 6.59375, "learning_rate": 3.937431237713227e-07, "logits/chosen": -3.09631609916687, "logits/rejected": -2.7876319885253906, "logps/chosen": -343.01019287109375, "logps/rejected": -494.8714904785156, "loss": 0.3643, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8703140020370483, "rewards/margins": 1.3103339672088623, "rewards/margins_max": 2.1210694313049316, "rewards/margins_min": 0.4995986819267273, "rewards/margins_std": 1.1465529203414917, "rewards/rejected": -2.1806480884552, "step": 980 }, { "epoch": 0.38, "grad_norm": 3.640625, "learning_rate": 3.910006190908753e-07, "logits/chosen": -3.1266021728515625, "logits/rejected": -2.803832530975342, "logps/chosen": -313.9851989746094, "logps/rejected": -454.4408264160156, "loss": 0.3745, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8938585519790649, "rewards/margins": 1.1841809749603271, "rewards/margins_max": 1.9790232181549072, "rewards/margins_min": 0.38933879137039185, "rewards/margins_std": 1.1240766048431396, "rewards/rejected": -2.0780396461486816, "step": 990 }, { "epoch": 0.38, "grad_norm": 4.125, "learning_rate": 3.882330012482152e-07, "logits/chosen": -3.1263976097106934, "logits/rejected": -2.940556287765503, "logps/chosen": -265.8106689453125, "logps/rejected": -446.62603759765625, "loss": 0.3468, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8905905485153198, "rewards/margins": 1.3784196376800537, "rewards/margins_max": 2.0906035900115967, "rewards/margins_min": 0.6662355065345764, "rewards/margins_std": 1.0071804523468018, "rewards/rejected": -2.269010066986084, "step": 1000 }, { "epoch": 0.39, "grad_norm": 3.984375, "learning_rate": 3.854407631747653e-07, "logits/chosen": -3.0423550605773926, "logits/rejected": -2.7694637775421143, "logps/chosen": -325.1182556152344, "logps/rejected": -545.2157592773438, "loss": 0.3244, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9049133062362671, "rewards/margins": 1.3824645280838013, "rewards/margins_max": 2.044529438018799, "rewards/margins_min": 0.7203994989395142, "rewards/margins_std": 0.9363012313842773, "rewards/rejected": -2.2873778343200684, "step": 1010 }, { "epoch": 0.39, "grad_norm": 4.625, "learning_rate": 3.826244021869782e-07, "logits/chosen": -3.1056835651397705, "logits/rejected": -2.8130106925964355, "logps/chosen": -321.215576171875, "logps/rejected": -474.0379943847656, "loss": 0.3367, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.938174843788147, "rewards/margins": 1.3489874601364136, "rewards/margins_max": 2.1338260173797607, "rewards/margins_min": 0.564149022102356, "rewards/margins_std": 1.1099293231964111, "rewards/rejected": -2.2871623039245605, "step": 1020 }, { "epoch": 0.39, "grad_norm": 7.78125, "learning_rate": 3.797844198977601e-07, "logits/chosen": -3.0277373790740967, "logits/rejected": -2.816030740737915, "logps/chosen": -305.11749267578125, "logps/rejected": -515.0469970703125, "loss": 0.3415, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8409072160720825, "rewards/margins": 1.5637253522872925, "rewards/margins_max": 2.281038761138916, "rewards/margins_min": 0.8464117050170898, "rewards/margins_std": 1.014434576034546, "rewards/rejected": -2.404632568359375, "step": 1030 }, { "epoch": 0.4, "grad_norm": 3.453125, "learning_rate": 3.769213221271306e-07, "logits/chosen": -3.0825905799865723, "logits/rejected": -2.811018466949463, "logps/chosen": -302.92364501953125, "logps/rejected": -513.6621704101562, "loss": 0.3001, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9827852249145508, "rewards/margins": 1.5373833179473877, "rewards/margins_max": 2.370154857635498, "rewards/margins_min": 0.704612135887146, "rewards/margins_std": 1.1777164936065674, "rewards/rejected": -2.5201685428619385, "step": 1040 }, { "epoch": 0.4, "grad_norm": 5.9375, "learning_rate": 3.740356188121326e-07, "logits/chosen": -3.0749309062957764, "logits/rejected": -2.808995008468628, "logps/chosen": -313.23779296875, "logps/rejected": -494.24658203125, "loss": 0.325, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0359914302825928, "rewards/margins": 1.6361278295516968, "rewards/margins_max": 2.5817818641662598, "rewards/margins_min": 0.6904740929603577, "rewards/margins_std": 1.3373563289642334, "rewards/rejected": -2.672119140625, "step": 1050 }, { "epoch": 0.41, "grad_norm": 3.96875, "learning_rate": 3.711278239160092e-07, "logits/chosen": -3.0516886711120605, "logits/rejected": -2.8046505451202393, "logps/chosen": -290.2547912597656, "logps/rejected": -450.64215087890625, "loss": 0.337, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8965566754341125, "rewards/margins": 1.2011537551879883, "rewards/margins_max": 1.8123470544815063, "rewards/margins_min": 0.5899609327316284, "rewards/margins_std": 0.8643573522567749, "rewards/rejected": -2.097710371017456, "step": 1060 }, { "epoch": 0.41, "grad_norm": 5.625, "learning_rate": 3.681984553366629e-07, "logits/chosen": -3.067981719970703, "logits/rejected": -2.813534736633301, "logps/chosen": -317.41302490234375, "logps/rejected": -559.5974731445312, "loss": 0.3242, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.159551739692688, "rewards/margins": 1.695622205734253, "rewards/margins_max": 2.6865200996398926, "rewards/margins_min": 0.7047241926193237, "rewards/margins_std": 1.401341199874878, "rewards/rejected": -2.8551738262176514, "step": 1070 }, { "epoch": 0.41, "grad_norm": 5.28125, "learning_rate": 3.652480348144152e-07, "logits/chosen": -3.0792620182037354, "logits/rejected": -2.8351120948791504, "logps/chosen": -315.0097351074219, "logps/rejected": -499.28741455078125, "loss": 0.349, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2088465690612793, "rewards/margins": 1.4478918313980103, "rewards/margins_max": 2.187783718109131, "rewards/margins_min": 0.7079996466636658, "rewards/margins_std": 1.0463653802871704, "rewards/rejected": -2.656738042831421, "step": 1080 }, { "epoch": 0.42, "grad_norm": 4.46875, "learning_rate": 3.6227708783908053e-07, "logits/chosen": -3.0654194355010986, "logits/rejected": -2.8562426567077637, "logps/chosen": -303.43280029296875, "logps/rejected": -504.5311584472656, "loss": 0.3367, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1927417516708374, "rewards/margins": 1.5436158180236816, "rewards/margins_max": 2.4939346313476562, "rewards/margins_min": 0.5932968854904175, "rewards/margins_std": 1.3439538478851318, "rewards/rejected": -2.7363574504852295, "step": 1090 }, { "epoch": 0.42, "grad_norm": 4.59375, "learning_rate": 3.5928614355637324e-07, "logits/chosen": -3.0500872135162354, "logits/rejected": -2.8141045570373535, "logps/chosen": -288.33636474609375, "logps/rejected": -500.2225036621094, "loss": 0.3218, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.9479445219039917, "rewards/margins": 1.6601346731185913, "rewards/margins_max": 2.4742271900177, "rewards/margins_min": 0.8460421562194824, "rewards/margins_std": 1.1513007879257202, "rewards/rejected": -2.608079433441162, "step": 1100 }, { "epoch": 0.42, "grad_norm": 3.921875, "learning_rate": 3.562757346736633e-07, "logits/chosen": -3.1348893642425537, "logits/rejected": -2.8808882236480713, "logps/chosen": -299.9723205566406, "logps/rejected": -512.7084350585938, "loss": 0.3175, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1462712287902832, "rewards/margins": 1.8254330158233643, "rewards/margins_max": 2.717803716659546, "rewards/margins_min": 0.9330623745918274, "rewards/margins_std": 1.26200270652771, "rewards/rejected": -2.9717042446136475, "step": 1110 }, { "epoch": 0.43, "grad_norm": 3.265625, "learning_rate": 3.532463973650971e-07, "logits/chosen": -3.0953421592712402, "logits/rejected": -2.8315436840057373, "logps/chosen": -320.49896240234375, "logps/rejected": -559.3016357421875, "loss": 0.3058, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0349481105804443, "rewards/margins": 1.8946539163589478, "rewards/margins_max": 3.176825761795044, "rewards/margins_min": 0.612481951713562, "rewards/margins_std": 1.813265085220337, "rewards/rejected": -2.9296019077301025, "step": 1120 }, { "epoch": 0.43, "grad_norm": 6.3125, "learning_rate": 3.501986711761016e-07, "logits/chosen": -3.0640907287597656, "logits/rejected": -2.8440439701080322, "logps/chosen": -262.0741882324219, "logps/rejected": -465.90155029296875, "loss": 0.3491, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0486122369766235, "rewards/margins": 1.6883703470230103, "rewards/margins_max": 2.6328392028808594, "rewards/margins_min": 0.7439014911651611, "rewards/margins_std": 1.3356807231903076, "rewards/rejected": -2.736982822418213, "step": 1130 }, { "epoch": 0.44, "grad_norm": 5.0, "learning_rate": 3.4713309892728755e-07, "logits/chosen": -3.0397841930389404, "logits/rejected": -2.7506637573242188, "logps/chosen": -288.4527282714844, "logps/rejected": -446.4369201660156, "loss": 0.3477, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0533605813980103, "rewards/margins": 1.4881670475006104, "rewards/margins_max": 2.2428665161132812, "rewards/margins_min": 0.7334678769111633, "rewards/margins_std": 1.0673058032989502, "rewards/rejected": -2.54152774810791, "step": 1140 }, { "epoch": 0.44, "grad_norm": 3.28125, "learning_rate": 3.4405022661776933e-07, "logits/chosen": -3.0119423866271973, "logits/rejected": -2.75697660446167, "logps/chosen": -313.69891357421875, "logps/rejected": -527.7509765625, "loss": 0.3095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0210492610931396, "rewards/margins": 1.6547205448150635, "rewards/margins_max": 2.5389270782470703, "rewards/margins_min": 0.770513653755188, "rewards/margins_std": 1.2504572868347168, "rewards/rejected": -2.675769567489624, "step": 1150 }, { "epoch": 0.44, "grad_norm": 4.53125, "learning_rate": 3.40950603327919e-07, "logits/chosen": -3.034954309463501, "logits/rejected": -2.717050790786743, "logps/chosen": -311.44989013671875, "logps/rejected": -536.5928955078125, "loss": 0.2818, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1659419536590576, "rewards/margins": 2.0434906482696533, "rewards/margins_max": 2.969123363494873, "rewards/margins_min": 1.1178580522537231, "rewards/margins_std": 1.30904221534729, "rewards/rejected": -3.209432601928711, "step": 1160 }, { "epoch": 0.45, "grad_norm": 4.21875, "learning_rate": 3.3783478112157144e-07, "logits/chosen": -3.0681750774383545, "logits/rejected": -2.842912197113037, "logps/chosen": -298.2220764160156, "logps/rejected": -496.28509521484375, "loss": 0.3477, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1258448362350464, "rewards/margins": 1.5719826221466064, "rewards/margins_max": 2.495079278945923, "rewards/margins_min": 0.64888596534729, "rewards/margins_std": 1.3054558038711548, "rewards/rejected": -2.6978273391723633, "step": 1170 }, { "epoch": 0.45, "grad_norm": 3.0, "learning_rate": 3.347033149476976e-07, "logits/chosen": -3.000627279281616, "logits/rejected": -2.82975172996521, "logps/chosen": -327.8063049316406, "logps/rejected": -512.77587890625, "loss": 0.3452, "rewards/accuracies": 0.875, "rewards/chosen": -1.0002752542495728, "rewards/margins": 1.4279507398605347, "rewards/margins_max": 2.1153764724731445, "rewards/margins_min": 0.7405253648757935, "rewards/margins_std": 0.9721664190292358, "rewards/rejected": -2.4282259941101074, "step": 1180 }, { "epoch": 0.45, "grad_norm": 4.75, "learning_rate": 3.3155676254156496e-07, "logits/chosen": -3.07631254196167, "logits/rejected": -2.837388038635254, "logps/chosen": -340.0731506347656, "logps/rejected": -575.410888671875, "loss": 0.3498, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.210822343826294, "rewards/margins": 1.6817712783813477, "rewards/margins_max": 2.6699368953704834, "rewards/margins_min": 0.6936053037643433, "rewards/margins_std": 1.3974777460098267, "rewards/rejected": -2.8925936222076416, "step": 1190 }, { "epoch": 0.46, "grad_norm": 4.5625, "learning_rate": 3.2839568432540064e-07, "logits/chosen": -3.0555381774902344, "logits/rejected": -2.8638923168182373, "logps/chosen": -336.87213134765625, "logps/rejected": -524.3484497070312, "loss": 0.2813, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0288445949554443, "rewards/margins": 1.6101176738739014, "rewards/margins_max": 2.4245457649230957, "rewards/margins_min": 0.7956899404525757, "rewards/margins_std": 1.1517750024795532, "rewards/rejected": -2.6389622688293457, "step": 1200 }, { "epoch": 0.46, "grad_norm": 5.375, "learning_rate": 3.252206433085768e-07, "logits/chosen": -3.0896949768066406, "logits/rejected": -2.7695508003234863, "logps/chosen": -315.62530517578125, "logps/rejected": -450.58856201171875, "loss": 0.3317, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0867998600006104, "rewards/margins": 1.556259274482727, "rewards/margins_max": 2.347869396209717, "rewards/margins_min": 0.7646490335464478, "rewards/margins_std": 1.1195060014724731, "rewards/rejected": -2.643059253692627, "step": 1210 }, { "epoch": 0.47, "grad_norm": 8.375, "learning_rate": 3.220322049873344e-07, "logits/chosen": -3.053899049758911, "logits/rejected": -2.7996201515197754, "logps/chosen": -293.49700927734375, "logps/rejected": -517.9727172851562, "loss": 0.3098, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0438956022262573, "rewards/margins": 1.7273838520050049, "rewards/margins_max": 2.7014057636260986, "rewards/margins_min": 0.7533617615699768, "rewards/margins_std": 1.3774750232696533, "rewards/rejected": -2.7712790966033936, "step": 1220 }, { "epoch": 0.47, "grad_norm": 8.875, "learning_rate": 3.1883093724406493e-07, "logits/chosen": -3.0701441764831543, "logits/rejected": -2.776665687561035, "logps/chosen": -336.69366455078125, "logps/rejected": -524.4509887695312, "loss": 0.3133, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1884243488311768, "rewards/margins": 1.7032394409179688, "rewards/margins_max": 2.4934375286102295, "rewards/margins_min": 0.9130409359931946, "rewards/margins_std": 1.117509126663208, "rewards/rejected": -2.8916633129119873, "step": 1230 }, { "epoch": 0.47, "grad_norm": 5.9375, "learning_rate": 3.156174102461666e-07, "logits/chosen": -3.0736892223358154, "logits/rejected": -2.805081605911255, "logps/chosen": -311.7049255371094, "logps/rejected": -526.9119873046875, "loss": 0.3061, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4257994890213013, "rewards/margins": 1.9017452001571655, "rewards/margins_max": 3.0200836658477783, "rewards/margins_min": 0.7834072113037109, "rewards/margins_std": 1.581568956375122, "rewards/rejected": -3.327544689178467, "step": 1240 }, { "epoch": 0.48, "grad_norm": 3.8125, "learning_rate": 3.1239219634449347e-07, "logits/chosen": -2.9780619144439697, "logits/rejected": -2.771231174468994, "logps/chosen": -298.0756530761719, "logps/rejected": -519.0932006835938, "loss": 0.3565, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2179266214370728, "rewards/margins": 1.8608818054199219, "rewards/margins_max": 2.9755165576934814, "rewards/margins_min": 0.7462473511695862, "rewards/margins_std": 1.5763311386108398, "rewards/rejected": -3.078808307647705, "step": 1250 }, { "epoch": 0.48, "grad_norm": 15.3125, "learning_rate": 3.0915586997141624e-07, "logits/chosen": -3.0769824981689453, "logits/rejected": -2.8244097232818604, "logps/chosen": -355.56427001953125, "logps/rejected": -530.8713989257812, "loss": 0.3188, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1896952390670776, "rewards/margins": 1.6044542789459229, "rewards/margins_max": 2.3940157890319824, "rewards/margins_min": 0.8148924708366394, "rewards/margins_std": 1.1166088581085205, "rewards/rejected": -2.794149875640869, "step": 1260 }, { "epoch": 0.49, "grad_norm": 5.28125, "learning_rate": 3.059090075385117e-07, "logits/chosen": -3.030339002609253, "logits/rejected": -2.784923791885376, "logps/chosen": -345.70062255859375, "logps/rejected": -536.1658325195312, "loss": 0.3522, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.2129261493682861, "rewards/margins": 1.6397039890289307, "rewards/margins_max": 2.4216091632843018, "rewards/margins_min": 0.8577985763549805, "rewards/margins_std": 1.1057811975479126, "rewards/rejected": -2.852630138397217, "step": 1270 }, { "epoch": 0.49, "grad_norm": 4.59375, "learning_rate": 3.0265218733390004e-07, "logits/chosen": -3.020380973815918, "logits/rejected": -2.6990604400634766, "logps/chosen": -326.10516357421875, "logps/rejected": -501.0379943847656, "loss": 0.323, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0968921184539795, "rewards/margins": 1.6457980871200562, "rewards/margins_max": 2.706159830093384, "rewards/margins_min": 0.5854364633560181, "rewards/margins_std": 1.4995777606964111, "rewards/rejected": -2.742690324783325, "step": 1280 }, { "epoch": 0.49, "grad_norm": 9.25, "learning_rate": 2.993859894192477e-07, "logits/chosen": -3.0308547019958496, "logits/rejected": -2.7866368293762207, "logps/chosen": -305.8663635253906, "logps/rejected": -504.67413330078125, "loss": 0.2811, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2033522129058838, "rewards/margins": 1.8916361331939697, "rewards/margins_max": 2.847817897796631, "rewards/margins_min": 0.9354543685913086, "rewards/margins_std": 1.3522452116012573, "rewards/rejected": -3.0949883460998535, "step": 1290 }, { "epoch": 0.5, "grad_norm": 5.0625, "learning_rate": 2.961109955264549e-07, "logits/chosen": -3.040210723876953, "logits/rejected": -2.740476369857788, "logps/chosen": -336.0604553222656, "logps/rejected": -490.28961181640625, "loss": 0.3035, "rewards/accuracies": 0.9375, "rewards/chosen": -1.12978994846344, "rewards/margins": 1.6222766637802124, "rewards/margins_max": 2.5636367797851562, "rewards/margins_min": 0.6809166669845581, "rewards/margins_std": 1.3312841653823853, "rewards/rejected": -2.7520663738250732, "step": 1300 }, { "epoch": 0.5, "grad_norm": 4.6875, "learning_rate": 2.9282778895404474e-07, "logits/chosen": -2.9875807762145996, "logits/rejected": -2.7361321449279785, "logps/chosen": -326.562744140625, "logps/rejected": -542.1422729492188, "loss": 0.2952, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2226388454437256, "rewards/margins": 1.9603245258331299, "rewards/margins_max": 2.9473373889923096, "rewards/margins_min": 0.9733120203018188, "rewards/margins_std": 1.3958467245101929, "rewards/rejected": -3.1829633712768555, "step": 1310 }, { "epoch": 0.5, "grad_norm": 2.84375, "learning_rate": 2.895369544632739e-07, "logits/chosen": -3.1332552433013916, "logits/rejected": -2.908846855163574, "logps/chosen": -306.4833068847656, "logps/rejected": -578.1776123046875, "loss": 0.3291, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3200147151947021, "rewards/margins": 2.0552730560302734, "rewards/margins_max": 3.0561721324920654, "rewards/margins_min": 1.0543737411499023, "rewards/margins_std": 1.415485143661499, "rewards/rejected": -3.3752880096435547, "step": 1320 }, { "epoch": 0.51, "grad_norm": 4.71875, "learning_rate": 2.8623907817398305e-07, "logits/chosen": -3.0694479942321777, "logits/rejected": -2.8231074810028076, "logps/chosen": -339.3045959472656, "logps/rejected": -579.858154296875, "loss": 0.2679, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2447212934494019, "rewards/margins": 2.1124045848846436, "rewards/margins_max": 3.3244433403015137, "rewards/margins_min": 0.9003661274909973, "rewards/margins_std": 1.7140815258026123, "rewards/rejected": -3.357126235961914, "step": 1330 }, { "epoch": 0.51, "grad_norm": 4.0, "learning_rate": 2.8293474746020467e-07, "logits/chosen": -3.0237174034118652, "logits/rejected": -2.7316677570343018, "logps/chosen": -341.8821716308594, "logps/rejected": -525.6448974609375, "loss": 0.3374, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3774925470352173, "rewards/margins": 1.6952784061431885, "rewards/margins_max": 2.7387375831604004, "rewards/margins_min": 0.6518189311027527, "rewards/margins_std": 1.4756742715835571, "rewards/rejected": -3.072770595550537, "step": 1340 }, { "epoch": 0.52, "grad_norm": 5.21875, "learning_rate": 2.796245508455478e-07, "logits/chosen": -3.1177384853363037, "logits/rejected": -2.8181514739990234, "logps/chosen": -315.0109558105469, "logps/rejected": -522.7584228515625, "loss": 0.2943, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.494227647781372, "rewards/margins": 1.7711334228515625, "rewards/margins_max": 2.687786340713501, "rewards/margins_min": 0.8544808626174927, "rewards/margins_std": 1.2963426113128662, "rewards/rejected": -3.2653610706329346, "step": 1350 }, { "epoch": 0.52, "grad_norm": 9.25, "learning_rate": 2.7630907789837765e-07, "logits/chosen": -2.9989418983459473, "logits/rejected": -2.716702938079834, "logps/chosen": -306.7658996582031, "logps/rejected": -521.6512451171875, "loss": 0.3498, "rewards/accuracies": 0.875, "rewards/chosen": -1.203908920288086, "rewards/margins": 1.7978990077972412, "rewards/margins_max": 2.8368608951568604, "rewards/margins_min": 0.7589374780654907, "rewards/margins_std": 1.469313621520996, "rewards/rejected": -3.0018081665039062, "step": 1360 }, { "epoch": 0.52, "grad_norm": 9.375, "learning_rate": 2.7298891912681063e-07, "logits/chosen": -3.0549111366271973, "logits/rejected": -2.8135063648223877, "logps/chosen": -332.3044128417969, "logps/rejected": -516.0485229492188, "loss": 0.3097, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4526578187942505, "rewards/margins": 1.7673333883285522, "rewards/margins_max": 2.8053770065307617, "rewards/margins_min": 0.7292898893356323, "rewards/margins_std": 1.4680153131484985, "rewards/rejected": -3.2199912071228027, "step": 1370 }, { "epoch": 0.53, "grad_norm": 4.875, "learning_rate": 2.696646658735396e-07, "logits/chosen": -3.087620973587036, "logits/rejected": -2.817905902862549, "logps/chosen": -336.32794189453125, "logps/rejected": -506.7635192871094, "loss": 0.315, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2476387023925781, "rewards/margins": 1.67379891872406, "rewards/margins_max": 2.6297619342803955, "rewards/margins_min": 0.717836320400238, "rewards/margins_std": 1.3519353866577148, "rewards/rejected": -2.9214377403259277, "step": 1380 }, { "epoch": 0.53, "grad_norm": 5.25, "learning_rate": 2.6633691021051225e-07, "logits/chosen": -3.058051347732544, "logits/rejected": -2.823256254196167, "logps/chosen": -315.0464172363281, "logps/rejected": -477.55255126953125, "loss": 0.3293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.428307294845581, "rewards/margins": 1.4829260110855103, "rewards/margins_max": 2.2689406871795654, "rewards/margins_min": 0.6969112753868103, "rewards/margins_std": 1.1115926504135132, "rewards/rejected": -2.911233425140381, "step": 1390 }, { "epoch": 0.54, "grad_norm": 8.75, "learning_rate": 2.630062448334792e-07, "logits/chosen": -3.0223097801208496, "logits/rejected": -2.8338611125946045, "logps/chosen": -287.4037170410156, "logps/rejected": -502.56329345703125, "loss": 0.3243, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0643078088760376, "rewards/margins": 1.7524343729019165, "rewards/margins_max": 2.540440797805786, "rewards/margins_min": 0.964427649974823, "rewards/margins_std": 1.1144096851348877, "rewards/rejected": -2.816742420196533, "step": 1400 }, { "epoch": 0.54, "grad_norm": 6.9375, "learning_rate": 2.596732629564309e-07, "logits/chosen": -3.0276882648468018, "logits/rejected": -2.8791420459747314, "logps/chosen": -303.8673095703125, "logps/rejected": -552.97314453125, "loss": 0.3188, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2054811716079712, "rewards/margins": 1.6991316080093384, "rewards/margins_max": 2.568361759185791, "rewards/margins_min": 0.8299016952514648, "rewards/margins_std": 1.2292768955230713, "rewards/rejected": -2.9046127796173096, "step": 1410 }, { "epoch": 0.54, "grad_norm": 3.71875, "learning_rate": 2.5633855820594197e-07, "logits/chosen": -3.15714430809021, "logits/rejected": -2.9314417839050293, "logps/chosen": -320.03668212890625, "logps/rejected": -644.4346923828125, "loss": 0.3033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3862671852111816, "rewards/margins": 2.538801431655884, "rewards/margins_max": 3.948146343231201, "rewards/margins_min": 1.1294561624526978, "rewards/margins_std": 1.9931151866912842, "rewards/rejected": -3.9250686168670654, "step": 1420 }, { "epoch": 0.55, "grad_norm": 4.09375, "learning_rate": 2.530027245154423e-07, "logits/chosen": -2.962214708328247, "logits/rejected": -2.746046543121338, "logps/chosen": -328.22784423828125, "logps/rejected": -617.6292724609375, "loss": 0.2839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3232418298721313, "rewards/margins": 2.3371334075927734, "rewards/margins_max": 3.821005344390869, "rewards/margins_min": 0.8532617688179016, "rewards/margins_std": 2.0985114574432373, "rewards/rejected": -3.6603751182556152, "step": 1430 }, { "epoch": 0.55, "grad_norm": 4.6875, "learning_rate": 2.496663560194338e-07, "logits/chosen": -3.0740249156951904, "logits/rejected": -2.795609951019287, "logps/chosen": -326.8231506347656, "logps/rejected": -546.2271728515625, "loss": 0.2853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.206667184829712, "rewards/margins": 1.9110145568847656, "rewards/margins_max": 3.0812644958496094, "rewards/margins_min": 0.7407640814781189, "rewards/margins_std": 1.6549837589263916, "rewards/rejected": -3.1176817417144775, "step": 1440 }, { "epoch": 0.55, "grad_norm": 8.4375, "learning_rate": 2.4633004694767086e-07, "logits/chosen": -3.035614252090454, "logits/rejected": -2.816070556640625, "logps/chosen": -287.0525207519531, "logps/rejected": -511.42010498046875, "loss": 0.3196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2523744106292725, "rewards/margins": 1.8659451007843018, "rewards/margins_max": 2.831995725631714, "rewards/margins_min": 0.8998948931694031, "rewards/margins_std": 1.3662011623382568, "rewards/rejected": -3.1183197498321533, "step": 1450 }, { "epoch": 0.56, "grad_norm": 7.90625, "learning_rate": 2.4299439151932387e-07, "logits/chosen": -2.9972755908966064, "logits/rejected": -2.7236037254333496, "logps/chosen": -341.40789794921875, "logps/rejected": -611.5470581054688, "loss": 0.2963, "rewards/accuracies": 0.875, "rewards/chosen": -1.2511876821517944, "rewards/margins": 2.2317798137664795, "rewards/margins_max": 3.6496143341064453, "rewards/margins_min": 0.8139451146125793, "rewards/margins_std": 2.0051209926605225, "rewards/rejected": -3.4829673767089844, "step": 1460 }, { "epoch": 0.56, "grad_norm": 4.3125, "learning_rate": 2.3965998383714496e-07, "logits/chosen": -3.0932700634002686, "logits/rejected": -2.840926170349121, "logps/chosen": -316.71820068359375, "logps/rejected": -547.8084716796875, "loss": 0.3287, "rewards/accuracies": 0.875, "rewards/chosen": -1.2523170709609985, "rewards/margins": 1.8998005390167236, "rewards/margins_max": 3.1817667484283447, "rewards/margins_min": 0.617834210395813, "rewards/margins_std": 1.812973976135254, "rewards/rejected": -3.1521174907684326, "step": 1470 }, { "epoch": 0.57, "grad_norm": 4.34375, "learning_rate": 2.3632741778165442e-07, "logits/chosen": -3.0418949127197266, "logits/rejected": -2.756995916366577, "logps/chosen": -330.74176025390625, "logps/rejected": -586.962646484375, "loss": 0.2811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4056506156921387, "rewards/margins": 2.22354793548584, "rewards/margins_max": 3.1083569526672363, "rewards/margins_min": 1.3387386798858643, "rewards/margins_std": 1.2513091564178467, "rewards/rejected": -3.6291985511779785, "step": 1480 }, { "epoch": 0.57, "grad_norm": 5.0, "learning_rate": 2.3299728690536608e-07, "logits/chosen": -3.123486280441284, "logits/rejected": -2.8105220794677734, "logps/chosen": -334.51751708984375, "logps/rejected": -510.5174865722656, "loss": 0.3437, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.432910680770874, "rewards/margins": 1.7944892644882202, "rewards/margins_max": 2.8090806007385254, "rewards/margins_min": 0.7798979878425598, "rewards/margins_std": 1.434848666191101, "rewards/rejected": -3.227400302886963, "step": 1490 }, { "epoch": 0.57, "grad_norm": 7.125, "learning_rate": 2.296701843270721e-07, "logits/chosen": -3.0233614444732666, "logits/rejected": -2.722904920578003, "logps/chosen": -383.2572021484375, "logps/rejected": -575.3277587890625, "loss": 0.306, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3304874897003174, "rewards/margins": 1.631151556968689, "rewards/margins_max": 2.6261086463928223, "rewards/margins_min": 0.6361947655677795, "rewards/margins_std": 1.4070814847946167, "rewards/rejected": -2.961639165878296, "step": 1500 }, { "epoch": 0.58, "grad_norm": 5.90625, "learning_rate": 2.2634670262620448e-07, "logits/chosen": -3.0541083812713623, "logits/rejected": -2.8443028926849365, "logps/chosen": -283.5533142089844, "logps/rejected": -542.0906982421875, "loss": 0.294, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.050548791885376, "rewards/margins": 2.099508285522461, "rewards/margins_max": 3.110419273376465, "rewards/margins_min": 1.0885975360870361, "rewards/margins_std": 1.429643988609314, "rewards/rejected": -3.150057077407837, "step": 1510 }, { "epoch": 0.58, "grad_norm": 6.34375, "learning_rate": 2.2302743373729206e-07, "logits/chosen": -3.053959369659424, "logits/rejected": -2.795989751815796, "logps/chosen": -303.9230651855469, "logps/rejected": -550.1535034179688, "loss": 0.295, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2478278875350952, "rewards/margins": 2.0318145751953125, "rewards/margins_max": 3.167066812515259, "rewards/margins_min": 0.8965622782707214, "rewards/margins_std": 1.6054890155792236, "rewards/rejected": -3.2796425819396973, "step": 1520 }, { "epoch": 0.58, "grad_norm": 3.59375, "learning_rate": 2.1971296884453388e-07, "logits/chosen": -3.122481346130371, "logits/rejected": -2.825242519378662, "logps/chosen": -341.49603271484375, "logps/rejected": -550.0460815429688, "loss": 0.3166, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.2628682851791382, "rewards/margins": 1.9104182720184326, "rewards/margins_max": 3.124661684036255, "rewards/margins_min": 0.6961749792098999, "rewards/margins_std": 1.7171993255615234, "rewards/rejected": -3.1732866764068604, "step": 1530 }, { "epoch": 0.59, "grad_norm": 4.84375, "learning_rate": 2.1640389827650468e-07, "logits/chosen": -3.086146116256714, "logits/rejected": -2.824626922607422, "logps/chosen": -328.03009033203125, "logps/rejected": -552.62060546875, "loss": 0.287, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9867761731147766, "rewards/margins": 2.0814549922943115, "rewards/margins_max": 3.2015552520751953, "rewards/margins_min": 0.9613545536994934, "rewards/margins_std": 1.5840612649917603, "rewards/rejected": -3.0682311058044434, "step": 1540 }, { "epoch": 0.59, "grad_norm": 11.0625, "learning_rate": 2.1310081140101327e-07, "logits/chosen": -3.09912371635437, "logits/rejected": -2.8447043895721436, "logps/chosen": -317.84014892578125, "logps/rejected": -500.9691467285156, "loss": 0.3094, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4269239902496338, "rewards/margins": 1.7367973327636719, "rewards/margins_max": 2.898564577102661, "rewards/margins_min": 0.5750298500061035, "rewards/margins_std": 1.6429872512817383, "rewards/rejected": -3.1637210845947266, "step": 1550 }, { "epoch": 0.6, "grad_norm": 14.0, "learning_rate": 2.0980429652013295e-07, "logits/chosen": -3.072404623031616, "logits/rejected": -2.8281359672546387, "logps/chosen": -310.8489685058594, "logps/rejected": -544.1529541015625, "loss": 0.3317, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.312307596206665, "rewards/margins": 2.0402112007141113, "rewards/margins_max": 3.448382616043091, "rewards/margins_min": 0.6320397853851318, "rewards/margins_std": 1.991454839706421, "rewards/rejected": -3.3525187969207764, "step": 1560 }, { "epoch": 0.6, "grad_norm": 3.484375, "learning_rate": 2.0651494076541996e-07, "logits/chosen": -3.017042636871338, "logits/rejected": -2.806481122970581, "logps/chosen": -300.30584716796875, "logps/rejected": -586.01904296875, "loss": 0.3159, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2244617938995361, "rewards/margins": 2.292558431625366, "rewards/margins_max": 3.585287094116211, "rewards/margins_min": 0.999829888343811, "rewards/margins_std": 1.8281943798065186, "rewards/rejected": -3.5170199871063232, "step": 1570 }, { "epoch": 0.6, "grad_norm": 5.0, "learning_rate": 2.0323332999334193e-07, "logits/chosen": -3.004047155380249, "logits/rejected": -2.7635300159454346, "logps/chosen": -297.9124450683594, "logps/rejected": -565.5846557617188, "loss": 0.322, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4263161420822144, "rewards/margins": 2.243640184402466, "rewards/margins_max": 3.4020352363586426, "rewards/margins_min": 1.0852453708648682, "rewards/margins_std": 1.6382176876068115, "rewards/rejected": -3.6699562072753906, "step": 1580 }, { "epoch": 0.61, "grad_norm": 5.03125, "learning_rate": 1.999600486809331e-07, "logits/chosen": -3.0038676261901855, "logits/rejected": -2.782977819442749, "logps/chosen": -319.99542236328125, "logps/rejected": -548.05615234375, "loss": 0.3235, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2841845750808716, "rewards/margins": 2.042858600616455, "rewards/margins_max": 3.4500343799591064, "rewards/margins_min": 0.6356827020645142, "rewards/margins_std": 1.9900470972061157, "rewards/rejected": -3.327043056488037, "step": 1590 }, { "epoch": 0.61, "grad_norm": 4.40625, "learning_rate": 1.9669567982169428e-07, "logits/chosen": -3.0793793201446533, "logits/rejected": -2.839048147201538, "logps/chosen": -321.1004333496094, "logps/rejected": -508.93914794921875, "loss": 0.3593, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6104228496551514, "rewards/margins": 1.6521637439727783, "rewards/margins_max": 3.04083251953125, "rewards/margins_min": 0.26349449157714844, "rewards/margins_std": 1.9638748168945312, "rewards/rejected": -3.2625865936279297, "step": 1600 }, { "epoch": 0.62, "grad_norm": 7.5, "learning_rate": 1.9344080482175835e-07, "logits/chosen": -2.9939608573913574, "logits/rejected": -2.805124521255493, "logps/chosen": -302.0801696777344, "logps/rejected": -551.068359375, "loss": 0.2995, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3268810510635376, "rewards/margins": 2.172919988632202, "rewards/margins_max": 3.472337245941162, "rewards/margins_min": 0.8735028505325317, "rewards/margins_std": 1.8376535177230835, "rewards/rejected": -3.4998011589050293, "step": 1610 }, { "epoch": 0.62, "grad_norm": 6.5, "learning_rate": 1.9019600339633797e-07, "logits/chosen": -3.0695903301239014, "logits/rejected": -2.8530142307281494, "logps/chosen": -364.30255126953125, "logps/rejected": -550.4779052734375, "loss": 0.329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4837919473648071, "rewards/margins": 1.7356408834457397, "rewards/margins_max": 2.772507905960083, "rewards/margins_min": 0.6987739205360413, "rewards/margins_std": 1.4663512706756592, "rewards/rejected": -3.2194323539733887, "step": 1620 }, { "epoch": 0.62, "grad_norm": 6.6875, "learning_rate": 1.8696185346647386e-07, "logits/chosen": -3.0316128730773926, "logits/rejected": -2.7795474529266357, "logps/chosen": -301.7175598144531, "logps/rejected": -547.9525146484375, "loss": 0.3055, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2730237245559692, "rewards/margins": 2.1233882904052734, "rewards/margins_max": 3.315976619720459, "rewards/margins_min": 0.9308001399040222, "rewards/margins_std": 1.6865743398666382, "rewards/rejected": -3.3964123725891113, "step": 1630 }, { "epoch": 0.63, "grad_norm": 14.375, "learning_rate": 1.8373893105610356e-07, "logits/chosen": -3.0410008430480957, "logits/rejected": -2.821869373321533, "logps/chosen": -300.0240783691406, "logps/rejected": -579.7957153320312, "loss": 0.2773, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1710954904556274, "rewards/margins": 2.312669038772583, "rewards/margins_max": 3.4646263122558594, "rewards/margins_min": 1.1607115268707275, "rewards/margins_std": 1.629113793373108, "rewards/rejected": -3.4837639331817627, "step": 1640 }, { "epoch": 0.63, "grad_norm": 4.96875, "learning_rate": 1.8052781018946776e-07, "logits/chosen": -3.058983325958252, "logits/rejected": -2.825737476348877, "logps/chosen": -318.34185791015625, "logps/rejected": -551.1431274414062, "loss": 0.3039, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0352818965911865, "rewards/margins": 1.9767980575561523, "rewards/margins_max": 3.301335573196411, "rewards/margins_min": 0.6522601842880249, "rewards/margins_std": 1.8731791973114014, "rewards/rejected": -3.012079954147339, "step": 1650 }, { "epoch": 0.63, "grad_norm": 4.5, "learning_rate": 1.7732906278887222e-07, "logits/chosen": -2.973245620727539, "logits/rejected": -2.6782615184783936, "logps/chosen": -339.115234375, "logps/rejected": -555.8124389648438, "loss": 0.303, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1952590942382812, "rewards/margins": 1.9027913808822632, "rewards/margins_max": 2.9387829303741455, "rewards/margins_min": 0.8667998313903809, "rewards/margins_std": 1.4651134014129639, "rewards/rejected": -3.098050594329834, "step": 1660 }, { "epoch": 0.64, "grad_norm": 3.765625, "learning_rate": 1.7414325857282526e-07, "logits/chosen": -3.071760892868042, "logits/rejected": -2.8689608573913574, "logps/chosen": -285.7099304199219, "logps/rejected": -561.7498779296875, "loss": 0.3187, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2145211696624756, "rewards/margins": 1.9574220180511475, "rewards/margins_max": 2.9473042488098145, "rewards/margins_min": 0.9675399661064148, "rewards/margins_std": 1.399904489517212, "rewards/rejected": -3.171943426132202, "step": 1670 }, { "epoch": 0.64, "grad_norm": 2.453125, "learning_rate": 1.709709649545662e-07, "logits/chosen": -3.034092426300049, "logits/rejected": -2.7237911224365234, "logps/chosen": -315.25665283203125, "logps/rejected": -538.0850830078125, "loss": 0.2661, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0960243940353394, "rewards/margins": 2.064706325531006, "rewards/margins_max": 3.1389946937561035, "rewards/margins_min": 0.9904179573059082, "rewards/margins_std": 1.519273281097412, "rewards/rejected": -3.1607306003570557, "step": 1680 }, { "epoch": 0.65, "grad_norm": 4.71875, "learning_rate": 1.6781274694100599e-07, "logits/chosen": -3.057701587677002, "logits/rejected": -2.8561203479766846, "logps/chosen": -312.094970703125, "logps/rejected": -571.929443359375, "loss": 0.2961, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2997673749923706, "rewards/margins": 2.410301685333252, "rewards/margins_max": 3.7557570934295654, "rewards/margins_min": 1.064846396446228, "rewards/margins_std": 1.9027611017227173, "rewards/rejected": -3.7100696563720703, "step": 1690 }, { "epoch": 0.65, "grad_norm": 3.8125, "learning_rate": 1.6466916703209532e-07, "logits/chosen": -3.1039514541625977, "logits/rejected": -2.8413565158843994, "logps/chosen": -323.77264404296875, "logps/rejected": -546.0987548828125, "loss": 0.2862, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2360670566558838, "rewards/margins": 1.9988797903060913, "rewards/margins_max": 3.3218770027160645, "rewards/margins_min": 0.6758825182914734, "rewards/margins_std": 1.87100088596344, "rewards/rejected": -3.2349467277526855, "step": 1700 }, { "epoch": 0.65, "grad_norm": 6.4375, "learning_rate": 1.6154078512063948e-07, "logits/chosen": -3.0772576332092285, "logits/rejected": -2.8048458099365234, "logps/chosen": -354.092529296875, "logps/rejected": -560.3134765625, "loss": 0.3128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3545913696289062, "rewards/margins": 1.8672946691513062, "rewards/margins_max": 2.8722145557403564, "rewards/margins_min": 0.8623749613761902, "rewards/margins_std": 1.4211711883544922, "rewards/rejected": -3.221886157989502, "step": 1710 }, { "epoch": 0.66, "grad_norm": 3.953125, "learning_rate": 1.5842815839257787e-07, "logits/chosen": -3.0298233032226562, "logits/rejected": -2.7669081687927246, "logps/chosen": -308.47601318359375, "logps/rejected": -582.238525390625, "loss": 0.2733, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0452951192855835, "rewards/margins": 2.085793972015381, "rewards/margins_max": 3.37977933883667, "rewards/margins_min": 0.7918087244033813, "rewards/margins_std": 1.8299715518951416, "rewards/rejected": -3.131088972091675, "step": 1720 }, { "epoch": 0.66, "grad_norm": 3.90625, "learning_rate": 1.553318412277455e-07, "logits/chosen": -2.9976773262023926, "logits/rejected": -2.753671884536743, "logps/chosen": -310.14483642578125, "logps/rejected": -524.3933715820312, "loss": 0.3032, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0656635761260986, "rewards/margins": 1.9153623580932617, "rewards/margins_max": 2.951770305633545, "rewards/margins_min": 0.8789544105529785, "rewards/margins_std": 1.4657022953033447, "rewards/rejected": -2.9810261726379395, "step": 1730 }, { "epoch": 0.67, "grad_norm": 25.0, "learning_rate": 1.5225238510113375e-07, "logits/chosen": -3.018756628036499, "logits/rejected": -2.8047399520874023, "logps/chosen": -296.329345703125, "logps/rejected": -502.345703125, "loss": 0.329, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2520067691802979, "rewards/margins": 1.5649439096450806, "rewards/margins_max": 2.421596050262451, "rewards/margins_min": 0.7082915306091309, "rewards/margins_std": 1.211489200592041, "rewards/rejected": -2.816950559616089, "step": 1740 }, { "epoch": 0.67, "grad_norm": 4.875, "learning_rate": 1.4919033848466962e-07, "logits/chosen": -3.09132981300354, "logits/rejected": -2.871804714202881, "logps/chosen": -319.3221130371094, "logps/rejected": -574.5970458984375, "loss": 0.2568, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2768352031707764, "rewards/margins": 2.041638135910034, "rewards/margins_max": 2.9990642070770264, "rewards/margins_min": 1.0842119455337524, "rewards/margins_std": 1.3540050983428955, "rewards/rejected": -3.3184731006622314, "step": 1750 }, { "epoch": 0.67, "grad_norm": 4.9375, "learning_rate": 1.461462467495284e-07, "logits/chosen": -3.0827224254608154, "logits/rejected": -2.813368082046509, "logps/chosen": -358.135009765625, "logps/rejected": -571.1021728515625, "loss": 0.3097, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.214004635810852, "rewards/margins": 1.886165976524353, "rewards/margins_max": 2.9341206550598145, "rewards/margins_min": 0.8382118344306946, "rewards/margins_std": 1.4820313453674316, "rewards/rejected": -3.100170612335205, "step": 1760 }, { "epoch": 0.68, "grad_norm": 6.34375, "learning_rate": 1.4312065206900021e-07, "logits/chosen": -3.048474073410034, "logits/rejected": -2.8194785118103027, "logps/chosen": -378.2091369628906, "logps/rejected": -616.70703125, "loss": 0.3034, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3116062879562378, "rewards/margins": 2.1770853996276855, "rewards/margins_max": 3.711287021636963, "rewards/margins_min": 0.6428841948509216, "rewards/margins_std": 2.1696884632110596, "rewards/rejected": -3.488692045211792, "step": 1770 }, { "epoch": 0.68, "grad_norm": 6.0, "learning_rate": 1.401140933219247e-07, "logits/chosen": -3.1598076820373535, "logits/rejected": -2.8843746185302734, "logps/chosen": -340.0314636230469, "logps/rejected": -537.6961669921875, "loss": 0.3382, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.213836908340454, "rewards/margins": 1.8678576946258545, "rewards/margins_max": 2.903883934020996, "rewards/margins_min": 0.831831157207489, "rewards/margins_std": 1.4651625156402588, "rewards/rejected": -3.0816946029663086, "step": 1780 }, { "epoch": 0.68, "grad_norm": 4.09375, "learning_rate": 1.37127105996713e-07, "logits/chosen": -3.06451153755188, "logits/rejected": -2.7919728755950928, "logps/chosen": -302.01983642578125, "logps/rejected": -489.68426513671875, "loss": 0.3351, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1221381425857544, "rewards/margins": 1.8237155675888062, "rewards/margins_max": 2.9347827434539795, "rewards/margins_min": 0.7126487493515015, "rewards/margins_std": 1.5712860822677612, "rewards/rejected": -2.9458537101745605, "step": 1790 }, { "epoch": 0.69, "grad_norm": 3.40625, "learning_rate": 1.3416022209597428e-07, "logits/chosen": -3.0701396465301514, "logits/rejected": -2.8056159019470215, "logps/chosen": -283.6884765625, "logps/rejected": -581.525146484375, "loss": 0.2636, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2768995761871338, "rewards/margins": 2.5604019165039062, "rewards/margins_max": 3.7317397594451904, "rewards/margins_min": 1.389063835144043, "rewards/margins_std": 1.6565221548080444, "rewards/rejected": -3.837301254272461, "step": 1800 }, { "epoch": 0.69, "grad_norm": 5.34375, "learning_rate": 1.3121397004176192e-07, "logits/chosen": -3.0139946937561035, "logits/rejected": -2.8204457759857178, "logps/chosen": -328.19464111328125, "logps/rejected": -496.7694396972656, "loss": 0.4052, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3268804550170898, "rewards/margins": 1.4515395164489746, "rewards/margins_max": 2.597702741622925, "rewards/margins_min": 0.3053762912750244, "rewards/margins_std": 1.6209194660186768, "rewards/rejected": -2.7784199714660645, "step": 1810 }, { "epoch": 0.7, "grad_norm": 4.71875, "learning_rate": 1.2828887458145803e-07, "logits/chosen": -3.0096335411071777, "logits/rejected": -2.7300803661346436, "logps/chosen": -338.5462646484375, "logps/rejected": -565.7578125, "loss": 0.3193, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3147917985916138, "rewards/margins": 1.8738778829574585, "rewards/margins_max": 2.926962375640869, "rewards/margins_min": 0.8207935094833374, "rewards/margins_std": 1.489286184310913, "rewards/rejected": -3.1886696815490723, "step": 1820 }, { "epoch": 0.7, "grad_norm": 6.09375, "learning_rate": 1.2538545669431277e-07, "logits/chosen": -2.9853451251983643, "logits/rejected": -2.771872043609619, "logps/chosen": -289.3203125, "logps/rejected": -526.6580810546875, "loss": 0.2986, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1213651895523071, "rewards/margins": 1.9687013626098633, "rewards/margins_max": 3.2991204261779785, "rewards/margins_min": 0.638282299041748, "rewards/margins_std": 1.8814964294433594, "rewards/rejected": -3.090066432952881, "step": 1830 }, { "epoch": 0.7, "grad_norm": 2.859375, "learning_rate": 1.2250423349865385e-07, "logits/chosen": -3.014766216278076, "logits/rejected": -2.7629036903381348, "logps/chosen": -315.1452941894531, "logps/rejected": -545.957275390625, "loss": 0.326, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2841541767120361, "rewards/margins": 2.023402214050293, "rewards/margins_max": 3.26432466506958, "rewards/margins_min": 0.7824802398681641, "rewards/margins_std": 1.7549289464950562, "rewards/rejected": -3.307556629180908, "step": 1840 }, { "epoch": 0.71, "grad_norm": 3.0625, "learning_rate": 1.1964571815978466e-07, "logits/chosen": -3.092644214630127, "logits/rejected": -2.8553617000579834, "logps/chosen": -312.3218688964844, "logps/rejected": -571.085693359375, "loss": 0.2972, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4290850162506104, "rewards/margins": 2.0302395820617676, "rewards/margins_max": 3.186135768890381, "rewards/margins_min": 0.874343752861023, "rewards/margins_std": 1.63468337059021, "rewards/rejected": -3.459324598312378, "step": 1850 }, { "epoch": 0.71, "grad_norm": 3.625, "learning_rate": 1.1681041979858625e-07, "logits/chosen": -3.033277988433838, "logits/rejected": -2.7657084465026855, "logps/chosen": -306.395751953125, "logps/rejected": -543.9789428710938, "loss": 0.3153, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2107644081115723, "rewards/margins": 2.087048292160034, "rewards/margins_max": 3.434495210647583, "rewards/margins_min": 0.7396020889282227, "rewards/margins_std": 1.9055770635604858, "rewards/rejected": -3.2978129386901855, "step": 1860 }, { "epoch": 0.71, "grad_norm": 9.8125, "learning_rate": 1.1399884340083885e-07, "logits/chosen": -3.0568225383758545, "logits/rejected": -2.797673225402832, "logps/chosen": -353.53240966796875, "logps/rejected": -581.5899047851562, "loss": 0.3496, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.299001932144165, "rewards/margins": 1.858944296836853, "rewards/margins_max": 3.037067413330078, "rewards/margins_min": 0.6808212399482727, "rewards/margins_std": 1.6661179065704346, "rewards/rejected": -3.1579461097717285, "step": 1870 }, { "epoch": 0.72, "grad_norm": 3.09375, "learning_rate": 1.1121148972728103e-07, "logits/chosen": -2.9481570720672607, "logits/rejected": -2.7904300689697266, "logps/chosen": -265.8861389160156, "logps/rejected": -570.0164794921875, "loss": 0.2655, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1066445112228394, "rewards/margins": 2.189269781112671, "rewards/margins_max": 3.4255566596984863, "rewards/margins_min": 0.9529832005500793, "rewards/margins_std": 1.74837327003479, "rewards/rejected": -3.2959141731262207, "step": 1880 }, { "epoch": 0.72, "grad_norm": 3.953125, "learning_rate": 1.0844885522442074e-07, "logits/chosen": -3.0224673748016357, "logits/rejected": -2.7885611057281494, "logps/chosen": -345.76611328125, "logps/rejected": -586.8031005859375, "loss": 0.3028, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2756171226501465, "rewards/margins": 2.0080113410949707, "rewards/margins_max": 3.22710919380188, "rewards/margins_min": 0.7889140844345093, "rewards/margins_std": 1.7240642309188843, "rewards/rejected": -3.2836289405822754, "step": 1890 }, { "epoch": 0.73, "grad_norm": 5.25, "learning_rate": 1.0571143193611442e-07, "logits/chosen": -3.0753421783447266, "logits/rejected": -2.75911283493042, "logps/chosen": -365.8919372558594, "logps/rejected": -536.22119140625, "loss": 0.3328, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.240403175354004, "rewards/margins": 1.6046421527862549, "rewards/margins_max": 2.751216173171997, "rewards/margins_min": 0.4580683708190918, "rewards/margins_std": 1.6215002536773682, "rewards/rejected": -2.845045566558838, "step": 1900 }, { "epoch": 0.73, "grad_norm": 8.625, "learning_rate": 1.0299970741593139e-07, "logits/chosen": -3.0553393363952637, "logits/rejected": -2.8270599842071533, "logps/chosen": -332.8170471191406, "logps/rejected": -568.9782104492188, "loss": 0.3059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.547603964805603, "rewards/margins": 2.207674741744995, "rewards/margins_max": 3.555354356765747, "rewards/margins_min": 0.8599950075149536, "rewards/margins_std": 1.9059069156646729, "rewards/rejected": -3.7552788257598877, "step": 1910 }, { "epoch": 0.73, "grad_norm": 3.265625, "learning_rate": 1.0031416464031654e-07, "logits/chosen": -3.004044532775879, "logits/rejected": -2.7544479370117188, "logps/chosen": -296.4781494140625, "logps/rejected": -561.3325805664062, "loss": 0.2683, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.03928542137146, "rewards/margins": 2.1187376976013184, "rewards/margins_max": 3.1847622394561768, "rewards/margins_min": 1.052713394165039, "rewards/margins_std": 1.507586121559143, "rewards/rejected": -3.1580233573913574, "step": 1920 }, { "epoch": 0.74, "grad_norm": 3.546875, "learning_rate": 9.765528192256928e-08, "logits/chosen": -3.0507009029388428, "logits/rejected": -2.7500503063201904, "logps/chosen": -355.6528625488281, "logps/rejected": -617.4149169921875, "loss": 0.2526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4352195262908936, "rewards/margins": 2.4360108375549316, "rewards/margins_max": 3.6944077014923096, "rewards/margins_min": 1.1776138544082642, "rewards/margins_std": 1.779642105102539, "rewards/rejected": -3.871230363845825, "step": 1930 }, { "epoch": 0.74, "grad_norm": 31.5, "learning_rate": 9.502353282765305e-08, "logits/chosen": -3.0021920204162598, "logits/rejected": -2.794358491897583, "logps/chosen": -335.790283203125, "logps/rejected": -586.9259643554688, "loss": 0.3317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3478296995162964, "rewards/margins": 2.267702341079712, "rewards/margins_max": 3.787633180618286, "rewards/margins_min": 0.7477713823318481, "rewards/margins_std": 2.1495070457458496, "rewards/rejected": -3.6155319213867188, "step": 1940 }, { "epoch": 0.75, "grad_norm": 2.921875, "learning_rate": 9.241938608784952e-08, "logits/chosen": -3.0923657417297363, "logits/rejected": -2.8275530338287354, "logps/chosen": -343.7580871582031, "logps/rejected": -642.3468627929688, "loss": 0.2858, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.290956735610962, "rewards/margins": 2.2989776134490967, "rewards/margins_max": 3.6432526111602783, "rewards/margins_min": 0.9547020196914673, "rewards/margins_std": 1.901092529296875, "rewards/rejected": -3.5899341106414795, "step": 1950 }, { "epoch": 0.75, "grad_norm": 11.8125, "learning_rate": 8.984330551927474e-08, "logits/chosen": -3.041471242904663, "logits/rejected": -2.7196826934814453, "logps/chosen": -335.8681335449219, "logps/rejected": -525.2930908203125, "loss": 0.296, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2617841958999634, "rewards/margins": 1.64345383644104, "rewards/margins_max": 2.7398273944854736, "rewards/margins_min": 0.5470799207687378, "rewards/margins_std": 1.550506591796875, "rewards/rejected": -2.905237913131714, "step": 1960 }, { "epoch": 0.75, "grad_norm": 4.21875, "learning_rate": 8.729574993927027e-08, "logits/chosen": -3.0498995780944824, "logits/rejected": -2.7774124145507812, "logps/chosen": -298.8482666015625, "logps/rejected": -548.7689819335938, "loss": 0.2758, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2929372787475586, "rewards/margins": 2.247105121612549, "rewards/margins_max": 3.341545581817627, "rewards/margins_min": 1.1526648998260498, "rewards/margins_std": 1.5477720499038696, "rewards/rejected": -3.5400421619415283, "step": 1970 }, { "epoch": 0.76, "grad_norm": 4.5, "learning_rate": 8.47771730846844e-08, "logits/chosen": -3.0463712215423584, "logits/rejected": -2.820709705352783, "logps/chosen": -355.2076416015625, "logps/rejected": -596.3829345703125, "loss": 0.3158, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3873268365859985, "rewards/margins": 2.0964386463165283, "rewards/margins_max": 3.318743944168091, "rewards/margins_min": 0.8741332292556763, "rewards/margins_std": 1.728601098060608, "rewards/rejected": -3.4837653636932373, "step": 1980 }, { "epoch": 0.76, "grad_norm": 6.53125, "learning_rate": 8.228802353105879e-08, "logits/chosen": -3.0121498107910156, "logits/rejected": -2.75282621383667, "logps/chosen": -333.64752197265625, "logps/rejected": -581.3692626953125, "loss": 0.2918, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.227485179901123, "rewards/margins": 2.29580020904541, "rewards/margins_max": 3.5629870891571045, "rewards/margins_min": 1.0286139249801636, "rewards/margins_std": 1.7920722961425781, "rewards/rejected": -3.5232856273651123, "step": 1990 }, { "epoch": 0.76, "grad_norm": 3.625, "learning_rate": 7.982874461273436e-08, "logits/chosen": -3.079096794128418, "logits/rejected": -2.8346662521362305, "logps/chosen": -356.8946838378906, "logps/rejected": -570.3480224609375, "loss": 0.3284, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3698757886886597, "rewards/margins": 1.7264187335968018, "rewards/margins_max": 2.5157878398895264, "rewards/margins_min": 0.9370495080947876, "rewards/margins_std": 1.1163365840911865, "rewards/rejected": -3.096294403076172, "step": 2000 }, { "epoch": 0.77, "grad_norm": 5.71875, "learning_rate": 7.739977434388989e-08, "logits/chosen": -3.095477819442749, "logits/rejected": -2.766036033630371, "logps/chosen": -303.59136962890625, "logps/rejected": -491.6221618652344, "loss": 0.3677, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1862236261367798, "rewards/margins": 1.5905078649520874, "rewards/margins_max": 2.6899428367614746, "rewards/margins_min": 0.4910725951194763, "rewards/margins_std": 1.5548362731933594, "rewards/rejected": -2.7767317295074463, "step": 2010 }, { "epoch": 0.77, "grad_norm": 3.515625, "learning_rate": 7.500154534052932e-08, "logits/chosen": -3.0353145599365234, "logits/rejected": -2.828632116317749, "logps/chosen": -309.5120849609375, "logps/rejected": -628.2648315429688, "loss": 0.2883, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3138376474380493, "rewards/margins": 2.359034299850464, "rewards/margins_max": 3.667515516281128, "rewards/margins_min": 1.0505527257919312, "rewards/margins_std": 1.850472092628479, "rewards/rejected": -3.6728718280792236, "step": 2020 }, { "epoch": 0.78, "grad_norm": 5.21875, "learning_rate": 7.263448474342967e-08, "logits/chosen": -3.044409990310669, "logits/rejected": -2.806182384490967, "logps/chosen": -288.3754577636719, "logps/rejected": -628.6566162109375, "loss": 0.2556, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1923936605453491, "rewards/margins": 2.8368632793426514, "rewards/margins_max": 4.438173770904541, "rewards/margins_min": 1.2355536222457886, "rewards/margins_std": 2.264594316482544, "rewards/rejected": -4.029257297515869, "step": 2030 }, { "epoch": 0.78, "grad_norm": 3.875, "learning_rate": 7.02990141420641e-08, "logits/chosen": -3.081254720687866, "logits/rejected": -2.7363178730010986, "logps/chosen": -341.04254150390625, "logps/rejected": -610.2572021484375, "loss": 0.2851, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.430784821510315, "rewards/margins": 2.504645824432373, "rewards/margins_max": 3.509565830230713, "rewards/margins_min": 1.4997262954711914, "rewards/margins_std": 1.4211710691452026, "rewards/rejected": -3.9354305267333984, "step": 2040 }, { "epoch": 0.78, "grad_norm": 2.8125, "learning_rate": 6.799554949951459e-08, "logits/chosen": -3.0202066898345947, "logits/rejected": -2.839040994644165, "logps/chosen": -313.3542175292969, "logps/rejected": -556.8174438476562, "loss": 0.3178, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3130320310592651, "rewards/margins": 2.1237895488739014, "rewards/margins_max": 3.231538772583008, "rewards/margins_min": 1.0160400867462158, "rewards/margins_std": 1.566594123840332, "rewards/rejected": -3.436821699142456, "step": 2050 }, { "epoch": 0.79, "grad_norm": 4.21875, "learning_rate": 6.57245010783855e-08, "logits/chosen": -3.0388119220733643, "logits/rejected": -2.771134376525879, "logps/chosen": -348.8595275878906, "logps/rejected": -569.1920776367188, "loss": 0.2867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1526178121566772, "rewards/margins": 2.0269393920898438, "rewards/margins_max": 3.0691983699798584, "rewards/margins_min": 0.98468017578125, "rewards/margins_std": 1.4739770889282227, "rewards/rejected": -3.1795573234558105, "step": 2060 }, { "epoch": 0.79, "grad_norm": 4.65625, "learning_rate": 6.348627336773337e-08, "logits/chosen": -3.071326732635498, "logits/rejected": -2.8534488677978516, "logps/chosen": -312.7285461425781, "logps/rejected": -518.8443603515625, "loss": 0.3679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1652886867523193, "rewards/margins": 1.7322543859481812, "rewards/margins_max": 2.9441657066345215, "rewards/margins_min": 0.5203433036804199, "rewards/margins_std": 1.7139012813568115, "rewards/rejected": -2.89754319190979, "step": 2070 }, { "epoch": 0.79, "grad_norm": 3.1875, "learning_rate": 6.12812650110248e-08, "logits/chosen": -2.9665591716766357, "logits/rejected": -2.7093448638916016, "logps/chosen": -285.0913391113281, "logps/rejected": -539.43603515625, "loss": 0.3377, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1528418064117432, "rewards/margins": 2.0406405925750732, "rewards/margins_max": 3.2592735290527344, "rewards/margins_min": 0.8220078349113464, "rewards/margins_std": 1.7234073877334595, "rewards/rejected": -3.1934823989868164, "step": 2080 }, { "epoch": 0.8, "grad_norm": 4.8125, "learning_rate": 5.910986873513485e-08, "logits/chosen": -3.0143747329711914, "logits/rejected": -2.7879300117492676, "logps/chosen": -306.3103942871094, "logps/rejected": -582.0455932617188, "loss": 0.2541, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3029712438583374, "rewards/margins": 2.4551165103912354, "rewards/margins_max": 3.8277435302734375, "rewards/margins_min": 1.0824897289276123, "rewards/margins_std": 1.9411876201629639, "rewards/rejected": -3.758087635040283, "step": 2090 }, { "epoch": 0.8, "grad_norm": 8.6875, "learning_rate": 5.697247128040036e-08, "logits/chosen": -3.0154166221618652, "logits/rejected": -2.8093299865722656, "logps/chosen": -337.4362487792969, "logps/rejected": -536.3162841796875, "loss": 0.3107, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3548002243041992, "rewards/margins": 1.746351957321167, "rewards/margins_max": 2.8274214267730713, "rewards/margins_min": 0.6652824878692627, "rewards/margins_std": 1.5288629531860352, "rewards/rejected": -3.101152181625366, "step": 2100 }, { "epoch": 0.81, "grad_norm": 7.0625, "learning_rate": 5.486945333173851e-08, "logits/chosen": -3.050468683242798, "logits/rejected": -2.7848165035247803, "logps/chosen": -342.97528076171875, "logps/rejected": -538.8195190429688, "loss": 0.3164, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2223647832870483, "rewards/margins": 1.7037765979766846, "rewards/margins_max": 2.781301975250244, "rewards/margins_min": 0.6262511014938354, "rewards/margins_std": 1.523850917816162, "rewards/rejected": -2.9261412620544434, "step": 2110 }, { "epoch": 0.81, "grad_norm": 6.53125, "learning_rate": 5.280118945084422e-08, "logits/chosen": -2.931457757949829, "logits/rejected": -2.6956043243408203, "logps/chosen": -311.7135925292969, "logps/rejected": -522.5653076171875, "loss": 0.2989, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1440449953079224, "rewards/margins": 1.8880630731582642, "rewards/margins_max": 2.7849342823028564, "rewards/margins_min": 0.9911916851997375, "rewards/margins_std": 1.2683675289154053, "rewards/rejected": -3.0321078300476074, "step": 2120 }, { "epoch": 0.81, "grad_norm": 5.75, "learning_rate": 5.076804800947834e-08, "logits/chosen": -3.0498359203338623, "logits/rejected": -2.8847270011901855, "logps/chosen": -293.4126892089844, "logps/rejected": -525.2155151367188, "loss": 0.3278, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3807709217071533, "rewards/margins": 1.8494899272918701, "rewards/margins_max": 2.753748655319214, "rewards/margins_min": 0.9452314376831055, "rewards/margins_std": 1.2788149118423462, "rewards/rejected": -3.2302603721618652, "step": 2130 }, { "epoch": 0.82, "grad_norm": 3.578125, "learning_rate": 4.877039112385814e-08, "logits/chosen": -3.111642360687256, "logits/rejected": -2.8808951377868652, "logps/chosen": -282.8914489746094, "logps/rejected": -561.9642333984375, "loss": 0.3362, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1109817028045654, "rewards/margins": 2.2935783863067627, "rewards/margins_max": 3.600482940673828, "rewards/margins_min": 0.9866735339164734, "rewards/margins_std": 1.8482424020767212, "rewards/rejected": -3.404560089111328, "step": 2140 }, { "epoch": 0.82, "grad_norm": 6.21875, "learning_rate": 4.680857459016196e-08, "logits/chosen": -3.049572467803955, "logits/rejected": -2.7968525886535645, "logps/chosen": -312.0675354003906, "logps/rejected": -581.7535400390625, "loss": 0.2966, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1987165212631226, "rewards/margins": 2.0752511024475098, "rewards/margins_max": 3.3707432746887207, "rewards/margins_min": 0.7797588109970093, "rewards/margins_std": 1.8321025371551514, "rewards/rejected": -3.2739672660827637, "step": 2150 }, { "epoch": 0.83, "grad_norm": 4.8125, "learning_rate": 4.4882947821159563e-08, "logits/chosen": -3.0233592987060547, "logits/rejected": -2.7910993099212646, "logps/chosen": -279.7276306152344, "logps/rejected": -569.35791015625, "loss": 0.3003, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1314738988876343, "rewards/margins": 2.3299572467803955, "rewards/margins_max": 3.502735137939453, "rewards/margins_min": 1.1571792364120483, "rewards/margins_std": 1.6585586071014404, "rewards/rejected": -3.4614310264587402, "step": 2160 }, { "epoch": 0.83, "grad_norm": 2.9375, "learning_rate": 4.299385378397907e-08, "logits/chosen": -2.9819421768188477, "logits/rejected": -2.7031192779541016, "logps/chosen": -361.3184509277344, "logps/rejected": -575.5309448242188, "loss": 0.2952, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4940073490142822, "rewards/margins": 2.0843570232391357, "rewards/margins_max": 3.1069493293762207, "rewards/margins_min": 1.0617649555206299, "rewards/margins_std": 1.4461634159088135, "rewards/rejected": -3.578364133834839, "step": 2170 }, { "epoch": 0.83, "grad_norm": 3.78125, "learning_rate": 4.114162893902259e-08, "logits/chosen": -2.9869210720062256, "logits/rejected": -2.736611843109131, "logps/chosen": -311.1821594238281, "logps/rejected": -537.497314453125, "loss": 0.3177, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.308483362197876, "rewards/margins": 1.9217637777328491, "rewards/margins_max": 3.026961326599121, "rewards/margins_min": 0.8165658712387085, "rewards/margins_std": 1.5629857778549194, "rewards/rejected": -3.2302470207214355, "step": 2180 }, { "epoch": 0.84, "grad_norm": 4.59375, "learning_rate": 3.9326603180040216e-08, "logits/chosen": -2.9793362617492676, "logits/rejected": -2.692769765853882, "logps/chosen": -323.01751708984375, "logps/rejected": -538.4032592773438, "loss": 0.3301, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.254874587059021, "rewards/margins": 1.8814289569854736, "rewards/margins_max": 2.7431275844573975, "rewards/margins_min": 1.0197299718856812, "rewards/margins_std": 1.2186262607574463, "rewards/rejected": -3.136303663253784, "step": 2190 }, { "epoch": 0.84, "grad_norm": 3.59375, "learning_rate": 3.754909977537357e-08, "logits/chosen": -3.031088352203369, "logits/rejected": -2.7731916904449463, "logps/chosen": -350.8569641113281, "logps/rejected": -553.83642578125, "loss": 0.33, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.4683470726013184, "rewards/margins": 1.888584852218628, "rewards/margins_max": 3.0695700645446777, "rewards/margins_min": 0.707599937915802, "rewards/margins_std": 1.6701648235321045, "rewards/rejected": -3.356931686401367, "step": 2200 }, { "epoch": 0.84, "grad_norm": 5.15625, "learning_rate": 3.5809435310379556e-08, "logits/chosen": -3.0824389457702637, "logits/rejected": -2.8151016235351562, "logps/chosen": -291.5655212402344, "logps/rejected": -544.6790161132812, "loss": 0.306, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1818848848342896, "rewards/margins": 2.224213123321533, "rewards/margins_max": 3.5027458667755127, "rewards/margins_min": 0.9456807971000671, "rewards/margins_std": 1.8081178665161133, "rewards/rejected": -3.4060981273651123, "step": 2210 }, { "epoch": 0.85, "grad_norm": 5.96875, "learning_rate": 3.410791963104473e-08, "logits/chosen": -3.0624046325683594, "logits/rejected": -2.817383289337158, "logps/chosen": -316.928466796875, "logps/rejected": -554.0638427734375, "loss": 0.2644, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2545480728149414, "rewards/margins": 2.1589207649230957, "rewards/margins_max": 3.1708950996398926, "rewards/margins_min": 1.1469463109970093, "rewards/margins_std": 1.431147813796997, "rewards/rejected": -3.413468837738037, "step": 2220 }, { "epoch": 0.85, "grad_norm": 3.625, "learning_rate": 3.2444855788799075e-08, "logits/chosen": -3.0546398162841797, "logits/rejected": -2.768601179122925, "logps/chosen": -311.37213134765625, "logps/rejected": -548.6424560546875, "loss": 0.2899, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1787984371185303, "rewards/margins": 1.9840619564056396, "rewards/margins_max": 2.971385955810547, "rewards/margins_min": 0.9967382550239563, "rewards/margins_std": 1.3962868452072144, "rewards/rejected": -3.16286039352417, "step": 2230 }, { "epoch": 0.86, "grad_norm": 5.71875, "learning_rate": 3.082053998654105e-08, "logits/chosen": -3.0698394775390625, "logits/rejected": -2.863145112991333, "logps/chosen": -305.7530212402344, "logps/rejected": -512.8255004882812, "loss": 0.3497, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2612425088882446, "rewards/margins": 1.7084944248199463, "rewards/margins_max": 2.653332233428955, "rewards/margins_min": 0.7636561393737793, "rewards/margins_std": 1.33620285987854, "rewards/rejected": -2.9697365760803223, "step": 2240 }, { "epoch": 0.86, "grad_norm": 18.0, "learning_rate": 2.9235261525881322e-08, "logits/chosen": -3.0396475791931152, "logits/rejected": -2.8047540187835693, "logps/chosen": -313.1606140136719, "logps/rejected": -493.97412109375, "loss": 0.369, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.431262731552124, "rewards/margins": 1.6049553155899048, "rewards/margins_max": 2.717376232147217, "rewards/margins_min": 0.49253416061401367, "rewards/margins_std": 1.5732009410858154, "rewards/rejected": -3.03621768951416, "step": 2250 }, { "epoch": 0.86, "grad_norm": 5.21875, "learning_rate": 2.7689302755616732e-08, "logits/chosen": -3.084151268005371, "logits/rejected": -2.7894577980041504, "logps/chosen": -317.45355224609375, "logps/rejected": -485.2964782714844, "loss": 0.3091, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1630436182022095, "rewards/margins": 1.6040722131729126, "rewards/margins_max": 2.4594357013702393, "rewards/margins_min": 0.7487087845802307, "rewards/margins_std": 1.20966637134552, "rewards/rejected": -2.767115831375122, "step": 2260 }, { "epoch": 0.87, "grad_norm": 15.9375, "learning_rate": 2.6182939021441584e-08, "logits/chosen": -3.061344861984253, "logits/rejected": -2.835780620574951, "logps/chosen": -323.6014404296875, "logps/rejected": -565.6847534179688, "loss": 0.3159, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2828537225723267, "rewards/margins": 2.0452182292938232, "rewards/margins_max": 3.042397975921631, "rewards/margins_min": 1.0480389595031738, "rewards/margins_std": 1.4102246761322021, "rewards/rejected": -3.3280720710754395, "step": 2270 }, { "epoch": 0.87, "grad_norm": 5.21875, "learning_rate": 2.4716438616906975e-08, "logits/chosen": -2.9988512992858887, "logits/rejected": -2.801060199737549, "logps/chosen": -342.7635498046875, "logps/rejected": -568.91552734375, "loss": 0.2964, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.5359113216400146, "rewards/margins": 1.9777557849884033, "rewards/margins_max": 3.3029723167419434, "rewards/margins_min": 0.6525388360023499, "rewards/margins_std": 1.8741397857666016, "rewards/rejected": -3.513666868209839, "step": 2280 }, { "epoch": 0.88, "grad_norm": 3.96875, "learning_rate": 2.3290062735635914e-08, "logits/chosen": -3.061750888824463, "logits/rejected": -2.8491766452789307, "logps/chosen": -309.3940734863281, "logps/rejected": -563.3863525390625, "loss": 0.2996, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4320241212844849, "rewards/margins": 2.2175357341766357, "rewards/margins_max": 3.681055784225464, "rewards/margins_min": 0.7540156841278076, "rewards/margins_std": 2.069730043411255, "rewards/rejected": -3.649559736251831, "step": 2290 }, { "epoch": 0.88, "grad_norm": 9.0, "learning_rate": 2.1904065424802997e-08, "logits/chosen": -3.0209171772003174, "logits/rejected": -2.817324638366699, "logps/chosen": -368.29156494140625, "logps/rejected": -561.48876953125, "loss": 0.3503, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4189422130584717, "rewards/margins": 1.4640214443206787, "rewards/margins_max": 2.460878849029541, "rewards/margins_min": 0.4671642780303955, "rewards/margins_std": 1.40976881980896, "rewards/rejected": -2.8829636573791504, "step": 2300 }, { "epoch": 0.88, "grad_norm": 3.84375, "learning_rate": 2.0558693539886595e-08, "logits/chosen": -3.0322585105895996, "logits/rejected": -2.836747884750366, "logps/chosen": -325.99859619140625, "logps/rejected": -588.0489501953125, "loss": 0.292, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.230724573135376, "rewards/margins": 2.264963150024414, "rewards/margins_max": 3.455662965774536, "rewards/margins_min": 1.0742634534835815, "rewards/margins_std": 1.683903694152832, "rewards/rejected": -3.495687961578369, "step": 2310 }, { "epoch": 0.89, "grad_norm": 4.875, "learning_rate": 1.9254186700702667e-08, "logits/chosen": -3.0456230640411377, "logits/rejected": -2.819836139678955, "logps/chosen": -326.4022521972656, "logps/rejected": -572.4381713867188, "loss": 0.299, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2172520160675049, "rewards/margins": 2.1527469158172607, "rewards/margins_max": 3.443035125732422, "rewards/margins_min": 0.8624590039253235, "rewards/margins_std": 1.8247426748275757, "rewards/rejected": -3.3699989318847656, "step": 2320 }, { "epoch": 0.89, "grad_norm": 4.90625, "learning_rate": 1.799077724872644e-08, "logits/chosen": -3.0328400135040283, "logits/rejected": -2.8203063011169434, "logps/chosen": -283.4230041503906, "logps/rejected": -519.71826171875, "loss": 0.3396, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2383689880371094, "rewards/margins": 1.9863897562026978, "rewards/margins_max": 3.1531052589416504, "rewards/margins_min": 0.8196744918823242, "rewards/margins_std": 1.64998459815979, "rewards/rejected": -3.2247588634490967, "step": 2330 }, { "epoch": 0.89, "grad_norm": 3.765625, "learning_rate": 1.6768690205711173e-08, "logits/chosen": -3.133364200592041, "logits/rejected": -2.901482343673706, "logps/chosen": -367.28460693359375, "logps/rejected": -628.627197265625, "loss": 0.2691, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2051244974136353, "rewards/margins": 1.9070392847061157, "rewards/margins_max": 3.0293052196502686, "rewards/margins_min": 0.784773051738739, "rewards/margins_std": 1.587123990058899, "rewards/rejected": -3.112163782119751, "step": 2340 }, { "epoch": 0.9, "grad_norm": 10.3125, "learning_rate": 1.558814323361002e-08, "logits/chosen": -3.0593042373657227, "logits/rejected": -2.857165813446045, "logps/chosen": -307.4436950683594, "logps/rejected": -567.9011840820312, "loss": 0.2897, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1894042491912842, "rewards/margins": 1.9394479990005493, "rewards/margins_max": 2.996490716934204, "rewards/margins_min": 0.8824055790901184, "rewards/margins_std": 1.4948837757110596, "rewards/rejected": -3.128852128982544, "step": 2350 }, { "epoch": 0.9, "grad_norm": 5.3125, "learning_rate": 1.4449346595809014e-08, "logits/chosen": -2.9993538856506348, "logits/rejected": -2.744136095046997, "logps/chosen": -317.97711181640625, "logps/rejected": -568.7042236328125, "loss": 0.2759, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3569293022155762, "rewards/margins": 2.2924506664276123, "rewards/margins_max": 3.323498249053955, "rewards/margins_min": 1.2614028453826904, "rewards/margins_std": 1.4581215381622314, "rewards/rejected": -3.6493797302246094, "step": 2360 }, { "epoch": 0.91, "grad_norm": 4.0625, "learning_rate": 1.3352503119677867e-08, "logits/chosen": -3.0968692302703857, "logits/rejected": -2.8480188846588135, "logps/chosen": -323.5044860839844, "logps/rejected": -632.51318359375, "loss": 0.2821, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2613385915756226, "rewards/margins": 2.256674289703369, "rewards/margins_max": 3.341503858566284, "rewards/margins_min": 1.1718448400497437, "rewards/margins_std": 1.5341806411743164, "rewards/rejected": -3.5180130004882812, "step": 2370 }, { "epoch": 0.91, "grad_norm": 6.03125, "learning_rate": 1.2297808160444928e-08, "logits/chosen": -3.090576648712158, "logits/rejected": -2.854712963104248, "logps/chosen": -310.79571533203125, "logps/rejected": -578.5720825195312, "loss": 0.3438, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1912732124328613, "rewards/margins": 2.3021795749664307, "rewards/margins_max": 3.6481099128723145, "rewards/margins_min": 0.9562493562698364, "rewards/margins_std": 1.903433084487915, "rewards/rejected": -3.493452787399292, "step": 2380 }, { "epoch": 0.91, "grad_norm": 5.53125, "learning_rate": 1.1285449566403094e-08, "logits/chosen": -3.020697832107544, "logits/rejected": -2.7931606769561768, "logps/chosen": -302.6512451171875, "logps/rejected": -517.8263549804688, "loss": 0.3479, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2877894639968872, "rewards/margins": 1.8629472255706787, "rewards/margins_max": 3.250032901763916, "rewards/margins_min": 0.47586172819137573, "rewards/margins_std": 1.9616352319717407, "rewards/rejected": -3.1507368087768555, "step": 2390 }, { "epoch": 0.92, "grad_norm": 5.78125, "learning_rate": 1.0315607645452834e-08, "logits/chosen": -3.067622661590576, "logits/rejected": -2.768596649169922, "logps/chosen": -316.2894592285156, "logps/rejected": -593.660888671875, "loss": 0.2706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1791785955429077, "rewards/margins": 2.433260679244995, "rewards/margins_max": 3.671154022216797, "rewards/margins_min": 1.1953675746917725, "rewards/margins_std": 1.7506450414657593, "rewards/rejected": -3.6124393939971924, "step": 2400 }, { "epoch": 0.92, "grad_norm": 7.25, "learning_rate": 9.388455132988054e-09, "logits/chosen": -3.003413677215576, "logits/rejected": -2.767271041870117, "logps/chosen": -302.7046203613281, "logps/rejected": -537.4854736328125, "loss": 0.3177, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.296728253364563, "rewards/margins": 2.105752468109131, "rewards/margins_max": 3.5697181224823, "rewards/margins_min": 0.6417877078056335, "rewards/margins_std": 2.070359230041504, "rewards/rejected": -3.4024810791015625, "step": 2410 }, { "epoch": 0.92, "grad_norm": 13.9375, "learning_rate": 8.504157161130786e-09, "logits/chosen": -2.9792139530181885, "logits/rejected": -2.774827480316162, "logps/chosen": -265.3929748535156, "logps/rejected": -534.5281982421875, "loss": 0.3145, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.139758586883545, "rewards/margins": 2.1166577339172363, "rewards/margins_max": 3.568591594696045, "rewards/margins_min": 0.6647233366966248, "rewards/margins_std": 2.053344964981079, "rewards/rejected": -3.256415843963623, "step": 2420 }, { "epoch": 0.93, "grad_norm": 4.3125, "learning_rate": 7.662871229320106e-09, "logits/chosen": -3.083374500274658, "logits/rejected": -2.891068935394287, "logps/chosen": -320.02984619140625, "logps/rejected": -579.4918823242188, "loss": 0.2939, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4123804569244385, "rewards/margins": 2.1614882946014404, "rewards/margins_max": 3.478391647338867, "rewards/margins_min": 0.8445852994918823, "rewards/margins_std": 1.862382173538208, "rewards/rejected": -3.5738685131073, "step": 2430 }, { "epoch": 0.93, "grad_norm": 3.015625, "learning_rate": 6.864747176260288e-09, "logits/chosen": -3.0258588790893555, "logits/rejected": -2.75772762298584, "logps/chosen": -327.40850830078125, "logps/rejected": -539.2699584960938, "loss": 0.3086, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2346729040145874, "rewards/margins": 1.9701659679412842, "rewards/margins_max": 2.9854886531829834, "rewards/margins_min": 0.9548432230949402, "rewards/margins_std": 1.4358831644058228, "rewards/rejected": -3.204838991165161, "step": 2440 }, { "epoch": 0.94, "grad_norm": 2.40625, "learning_rate": 6.10992715323369e-09, "logits/chosen": -3.0179896354675293, "logits/rejected": -2.7822508811950684, "logps/chosen": -303.41876220703125, "logps/rejected": -566.8040771484375, "loss": 0.289, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1414506435394287, "rewards/margins": 2.2253711223602295, "rewards/margins_max": 3.476818561553955, "rewards/margins_min": 0.9739240407943726, "rewards/margins_std": 1.7698135375976562, "rewards/rejected": -3.366821765899658, "step": 2450 }, { "epoch": 0.94, "grad_norm": 5.9375, "learning_rate": 5.398545598782528e-09, "logits/chosen": -2.946030855178833, "logits/rejected": -2.7149243354797363, "logps/chosen": -321.11444091796875, "logps/rejected": -576.3018188476562, "loss": 0.3173, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2470299005508423, "rewards/margins": 2.265151023864746, "rewards/margins_max": 3.5846290588378906, "rewards/margins_min": 0.9456728100776672, "rewards/margins_std": 1.8660240173339844, "rewards/rejected": -3.512180805206299, "step": 2460 }, { "epoch": 0.94, "grad_norm": 7.0, "learning_rate": 4.730729214764417e-09, "logits/chosen": -3.089811325073242, "logits/rejected": -2.8148770332336426, "logps/chosen": -333.5905456542969, "logps/rejected": -606.7553100585938, "loss": 0.316, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.306650161743164, "rewards/margins": 2.504451274871826, "rewards/margins_max": 3.8686652183532715, "rewards/margins_min": 1.1402372121810913, "rewards/margins_std": 1.9292898178100586, "rewards/rejected": -3.8111014366149902, "step": 2470 }, { "epoch": 0.95, "grad_norm": 9.6875, "learning_rate": 4.106596943786095e-09, "logits/chosen": -3.0792603492736816, "logits/rejected": -2.848381519317627, "logps/chosen": -339.4683837890625, "logps/rejected": -617.6273803710938, "loss": 0.301, "rewards/accuracies": 0.9375, "rewards/chosen": -1.349266529083252, "rewards/margins": 2.4606006145477295, "rewards/margins_max": 3.7330939769744873, "rewards/margins_min": 1.1881073713302612, "rewards/margins_std": 1.7995771169662476, "rewards/rejected": -3.8098673820495605, "step": 2480 }, { "epoch": 0.95, "grad_norm": 3.640625, "learning_rate": 3.526259948018778e-09, "logits/chosen": -3.0362095832824707, "logits/rejected": -2.8663055896759033, "logps/chosen": -365.0773620605469, "logps/rejected": -618.965087890625, "loss": 0.2862, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4282352924346924, "rewards/margins": 1.976050615310669, "rewards/margins_max": 3.0453240871429443, "rewards/margins_min": 0.9067766070365906, "rewards/margins_std": 1.5121815204620361, "rewards/rejected": -3.4042859077453613, "step": 2490 }, { "epoch": 0.96, "grad_norm": 13.0625, "learning_rate": 2.989821589399505e-09, "logits/chosen": -3.0085415840148926, "logits/rejected": -2.8075146675109863, "logps/chosen": -322.4816589355469, "logps/rejected": -571.5615844726562, "loss": 0.2951, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4327504634857178, "rewards/margins": 2.106593132019043, "rewards/margins_max": 3.44807767868042, "rewards/margins_min": 0.7651088833808899, "rewards/margins_std": 1.8971455097198486, "rewards/rejected": -3.539344072341919, "step": 2500 }, { "epoch": 0.96, "grad_norm": 4.3125, "learning_rate": 2.4973774112216628e-09, "logits/chosen": -3.05271577835083, "logits/rejected": -2.8218674659729004, "logps/chosen": -337.2112121582031, "logps/rejected": -588.983642578125, "loss": 0.3041, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.3858591318130493, "rewards/margins": 1.8815782070159912, "rewards/margins_max": 3.0813260078430176, "rewards/margins_min": 0.6818308234214783, "rewards/margins_std": 1.6966991424560547, "rewards/rejected": -3.267437696456909, "step": 2510 }, { "epoch": 0.96, "grad_norm": 4.21875, "learning_rate": 2.049015121118075e-09, "logits/chosen": -3.0388782024383545, "logits/rejected": -2.7133755683898926, "logps/chosen": -308.1634216308594, "logps/rejected": -517.82763671875, "loss": 0.3344, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2324225902557373, "rewards/margins": 1.8060786724090576, "rewards/margins_max": 2.9905166625976562, "rewards/margins_min": 0.6216403841972351, "rewards/margins_std": 1.675048589706421, "rewards/rejected": -3.038501262664795, "step": 2520 }, { "epoch": 0.97, "grad_norm": 6.53125, "learning_rate": 1.6448145754396625e-09, "logits/chosen": -3.0789260864257812, "logits/rejected": -2.7612602710723877, "logps/chosen": -335.08795166015625, "logps/rejected": -567.6199340820312, "loss": 0.3478, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3611897230148315, "rewards/margins": 1.9877793788909912, "rewards/margins_max": 3.0852162837982178, "rewards/margins_min": 0.8903425931930542, "rewards/margins_std": 1.5520099401474, "rewards/rejected": -3.3489692211151123, "step": 2530 }, { "epoch": 0.97, "grad_norm": 5.0625, "learning_rate": 1.2848477650325984e-09, "logits/chosen": -3.0007452964782715, "logits/rejected": -2.734440326690674, "logps/chosen": -272.87933349609375, "logps/rejected": -529.8055419921875, "loss": 0.3078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1611636877059937, "rewards/margins": 2.15301775932312, "rewards/margins_max": 3.293184757232666, "rewards/margins_min": 1.0128505229949951, "rewards/margins_std": 1.612439751625061, "rewards/rejected": -3.3141815662384033, "step": 2540 }, { "epoch": 0.97, "grad_norm": 10.125, "learning_rate": 9.691788024160374e-10, "logits/chosen": -3.0972485542297363, "logits/rejected": -2.8439688682556152, "logps/chosen": -263.6483154296875, "logps/rejected": -510.2359313964844, "loss": 0.287, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1465216875076294, "rewards/margins": 2.213108539581299, "rewards/margins_max": 3.255052089691162, "rewards/margins_min": 1.1711642742156982, "rewards/margins_std": 1.4735311269760132, "rewards/rejected": -3.3596298694610596, "step": 2550 }, { "epoch": 0.98, "grad_norm": 5.9375, "learning_rate": 6.978639103634443e-10, "logits/chosen": -3.022329807281494, "logits/rejected": -2.8326973915100098, "logps/chosen": -286.43682861328125, "logps/rejected": -538.6163940429688, "loss": 0.2913, "rewards/accuracies": 1.0, "rewards/chosen": -1.083613395690918, "rewards/margins": 2.006159543991089, "rewards/margins_max": 2.973505735397339, "rewards/margins_min": 1.0388134717941284, "rewards/margins_std": 1.3680341243743896, "rewards/rejected": -3.089773178100586, "step": 2560 }, { "epoch": 0.98, "grad_norm": 5.625, "learning_rate": 4.709514118888813e-10, "logits/chosen": -3.088514566421509, "logits/rejected": -2.8289153575897217, "logps/chosen": -352.9057922363281, "logps/rejected": -557.4052734375, "loss": 0.2997, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.546465516090393, "rewards/margins": 1.8814094066619873, "rewards/margins_max": 2.9587039947509766, "rewards/margins_min": 0.8041146993637085, "rewards/margins_std": 1.5235246419906616, "rewards/rejected": -3.427874803543091, "step": 2570 }, { "epoch": 0.99, "grad_norm": 5.40625, "learning_rate": 2.884817216402546e-10, "logits/chosen": -3.038875102996826, "logits/rejected": -2.7465806007385254, "logps/chosen": -348.75567626953125, "logps/rejected": -502.0702209472656, "loss": 0.3639, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3894237279891968, "rewards/margins": 1.6732631921768188, "rewards/margins_max": 2.5952847003936768, "rewards/margins_min": 0.7512421011924744, "rewards/margins_std": 1.303934931755066, "rewards/rejected": -3.0626869201660156, "step": 2580 }, { "epoch": 0.99, "grad_norm": 4.625, "learning_rate": 1.5048733870137719e-10, "logits/chosen": -3.0684123039245605, "logits/rejected": -2.8573265075683594, "logps/chosen": -363.2174377441406, "logps/rejected": -667.3125610351562, "loss": 0.3333, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3807947635650635, "rewards/margins": 2.112975597381592, "rewards/margins_max": 3.1048035621643066, "rewards/margins_min": 1.121147632598877, "rewards/margins_std": 1.4026567935943604, "rewards/rejected": -3.493770122528076, "step": 2590 }, { "epoch": 0.99, "grad_norm": 17.375, "learning_rate": 5.699284080346034e-11, "logits/chosen": -3.046823263168335, "logits/rejected": -2.718289613723755, "logps/chosen": -332.06915283203125, "logps/rejected": -508.7821350097656, "loss": 0.3244, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.235876441001892, "rewards/margins": 1.7919772863388062, "rewards/margins_max": 2.906499147415161, "rewards/margins_min": 0.677455484867096, "rewards/margins_std": 1.5761719942092896, "rewards/rejected": -3.0278537273406982, "step": 2600 }, { "epoch": 1.0, "grad_norm": 7.03125, "learning_rate": 8.014879947837449e-12, "logits/chosen": -3.0880868434906006, "logits/rejected": -2.723057985305786, "logps/chosen": -331.1163024902344, "logps/rejected": -531.2738037109375, "loss": 0.296, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.2712851762771606, "rewards/margins": 1.8950446844100952, "rewards/margins_max": 3.0397720336914062, "rewards/margins_min": 0.7503169775009155, "rewards/margins_std": 1.6188892126083374, "rewards/rejected": -3.1663296222686768, "step": 2610 }, { "epoch": 1.0, "eval_logits/chosen": -2.2173049449920654, "eval_logits/rejected": -2.112638235092163, "eval_logps/chosen": -363.9157409667969, "eval_logps/rejected": -358.1680908203125, "eval_loss": 0.7011914253234863, "eval_rewards/accuracies": 0.5506666898727417, "eval_rewards/chosen": -0.8454986810684204, "eval_rewards/margins": 0.07637320458889008, "eval_rewards/margins_max": 0.9500231146812439, "eval_rewards/margins_min": -0.7111339569091797, "eval_rewards/margins_std": 0.5453019738197327, "eval_rewards/rejected": -0.9218719005584717, "eval_runtime": 1325.5647, "eval_samples_per_second": 4.526, "eval_steps_per_second": 0.283, "step": 2616 }, { "epoch": 1.0, "step": 2616, "total_flos": 0.0, "train_loss": 0.39473982852533324, "train_runtime": 26215.8871, "train_samples_per_second": 1.597, "train_steps_per_second": 0.1 } ], "logging_steps": 10, "max_steps": 2616, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }