{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.482095207381906, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.715719223022461, "logits/rejected": -2.648977279663086, "logps/chosen": -280.43304443359375, "logps/rejected": -269.5838623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 8.16883844563723, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.732839345932007, "logits/rejected": -2.667829990386963, "logps/chosen": -252.18836975097656, "logps/rejected": -247.36721801757812, "loss": 0.6932, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.00020123104332014918, "rewards/margins": -0.000496760243549943, "rewards/rejected": 0.00029552922933362424, "step": 10 }, { "epoch": 0.04, "grad_norm": 14.177063698718772, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.69161057472229, "logits/rejected": -2.6473488807678223, "logps/chosen": -281.9129333496094, "logps/rejected": -240.79598999023438, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0002729900588747114, "rewards/margins": 0.0022440399043262005, "rewards/rejected": -0.002517030341550708, "step": 20 }, { "epoch": 0.06, "grad_norm": 9.049842796547402, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.7057762145996094, "logits/rejected": -2.6666598320007324, "logps/chosen": -263.8976135253906, "logps/rejected": -272.29376220703125, "loss": 0.688, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.005987819284200668, "rewards/margins": 0.008191236294806004, "rewards/rejected": -0.0022034167777746916, "step": 30 }, { "epoch": 0.08, "grad_norm": 8.064536836305289, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.644676446914673, "logits/rejected": -2.6523287296295166, "logps/chosen": -250.64651489257812, "logps/rejected": -240.74032592773438, "loss": 0.6775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03351370617747307, "rewards/margins": 0.024272698909044266, "rewards/rejected": 0.009241009131073952, "step": 40 }, { "epoch": 0.1, "grad_norm": 10.48307069571783, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.61690616607666, "logits/rejected": -2.606698989868164, "logps/chosen": -280.32537841796875, "logps/rejected": -285.14044189453125, "loss": 0.6572, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.027328694239258766, "rewards/margins": 0.08270208537578583, "rewards/rejected": -0.05537338927388191, "step": 50 }, { "epoch": 0.13, "grad_norm": 11.217495552902317, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.6028363704681396, "logits/rejected": -2.567634344100952, "logps/chosen": -257.1751403808594, "logps/rejected": -286.7103271484375, "loss": 0.6382, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.04173869267106056, "rewards/margins": 0.14793893694877625, "rewards/rejected": -0.1896776258945465, "step": 60 }, { "epoch": 0.15, "grad_norm": 17.002512653941917, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.5477585792541504, "logits/rejected": -2.5538854598999023, "logps/chosen": -292.3646240234375, "logps/rejected": -295.1927795410156, "loss": 0.6089, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17977146804332733, "rewards/margins": 0.22043642401695251, "rewards/rejected": -0.40020790696144104, "step": 70 }, { "epoch": 0.17, "grad_norm": 20.83722381714638, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.428433895111084, "logits/rejected": -2.4765429496765137, "logps/chosen": -269.8500671386719, "logps/rejected": -280.10003662109375, "loss": 0.6115, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11873555183410645, "rewards/margins": 0.21846923232078552, "rewards/rejected": -0.3372047543525696, "step": 80 }, { "epoch": 0.19, "grad_norm": 20.582635256945927, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.3149566650390625, "logits/rejected": -2.3007912635803223, "logps/chosen": -331.8517761230469, "logps/rejected": -353.4700012207031, "loss": 0.6084, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4723566174507141, "rewards/margins": 0.2896661162376404, "rewards/rejected": -0.7620226740837097, "step": 90 }, { "epoch": 0.21, "grad_norm": 15.474211826471516, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.2471911907196045, "logits/rejected": -2.2106168270111084, "logps/chosen": -346.5975646972656, "logps/rejected": -342.70501708984375, "loss": 0.5875, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5387172698974609, "rewards/margins": 0.4208316206932068, "rewards/rejected": -0.9595489501953125, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -1.839242935180664, "eval_logits/rejected": -1.8929309844970703, "eval_logps/chosen": -327.45477294921875, "eval_logps/rejected": -373.7126159667969, "eval_loss": 0.5814013481140137, "eval_rewards/accuracies": 0.6953125, "eval_rewards/chosen": -0.6485257148742676, "eval_rewards/margins": 0.46177732944488525, "eval_rewards/rejected": -1.1103030443191528, "eval_runtime": 42.9635, "eval_samples_per_second": 46.551, "eval_steps_per_second": 0.745, "step": 100 }, { "epoch": 0.23, "grad_norm": 20.231938142939548, "learning_rate": 4.747874028753375e-07, "logits/chosen": -1.7017923593521118, "logits/rejected": -1.6744320392608643, "logps/chosen": -330.83038330078125, "logps/rejected": -384.0965576171875, "loss": 0.559, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6345968246459961, "rewards/margins": 0.5422910451889038, "rewards/rejected": -1.1768878698349, "step": 110 }, { "epoch": 0.25, "grad_norm": 33.96451578064435, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.5577061176300049, "logits/rejected": -1.454756498336792, "logps/chosen": -354.79632568359375, "logps/rejected": -374.60076904296875, "loss": 0.5508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6455836892127991, "rewards/margins": 0.6170965433120728, "rewards/rejected": -1.2626802921295166, "step": 120 }, { "epoch": 0.27, "grad_norm": 22.138057395490627, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -1.985395073890686, "logits/rejected": -1.7330490350723267, "logps/chosen": -381.68524169921875, "logps/rejected": -393.6003723144531, "loss": 0.5691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6896110773086548, "rewards/margins": 0.46295279264450073, "rewards/rejected": -1.1525638103485107, "step": 130 }, { "epoch": 0.29, "grad_norm": 22.948235111766778, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.7121162414550781, "logits/rejected": -1.4992659091949463, "logps/chosen": -352.444580078125, "logps/rejected": -359.44293212890625, "loss": 0.5444, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7159131765365601, "rewards/margins": 0.5610858798027039, "rewards/rejected": -1.2769991159439087, "step": 140 }, { "epoch": 0.31, "grad_norm": 21.94614729668817, "learning_rate": 4.337355301007335e-07, "logits/chosen": -1.6999915838241577, "logits/rejected": -1.5280932188034058, "logps/chosen": -363.3777770996094, "logps/rejected": -409.53472900390625, "loss": 0.5354, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6176407933235168, "rewards/margins": 0.7246734499931335, "rewards/rejected": -1.3423142433166504, "step": 150 }, { "epoch": 0.33, "grad_norm": 25.951614311748312, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -0.8817731142044067, "logits/rejected": -0.6137579679489136, "logps/chosen": -382.1938171386719, "logps/rejected": -432.11004638671875, "loss": 0.5293, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0363513231277466, "rewards/margins": 0.7439759373664856, "rewards/rejected": -1.7803272008895874, "step": 160 }, { "epoch": 0.36, "grad_norm": 22.62183660087882, "learning_rate": 4.070934040463998e-07, "logits/chosen": -0.8362399339675903, "logits/rejected": -0.5129006505012512, "logps/chosen": -383.85943603515625, "logps/rejected": -450.67376708984375, "loss": 0.5396, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1981004476547241, "rewards/margins": 0.5252794623374939, "rewards/rejected": -1.7233800888061523, "step": 170 }, { "epoch": 0.38, "grad_norm": 20.471718314655647, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.1186776161193848, "logits/rejected": -0.807415783405304, "logps/chosen": -367.24517822265625, "logps/rejected": -383.65325927734375, "loss": 0.5423, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7441693544387817, "rewards/margins": 0.7058154344558716, "rewards/rejected": -1.4499847888946533, "step": 180 }, { "epoch": 0.4, "grad_norm": 27.806972171143425, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -0.8124464750289917, "logits/rejected": -0.3878273069858551, "logps/chosen": -388.7113342285156, "logps/rejected": -429.1011657714844, "loss": 0.5411, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1005661487579346, "rewards/margins": 0.6401538252830505, "rewards/rejected": -1.7407200336456299, "step": 190 }, { "epoch": 0.42, "grad_norm": 26.53253409564454, "learning_rate": 3.610497133404795e-07, "logits/chosen": -0.11316045373678207, "logits/rejected": 0.337258517742157, "logps/chosen": -417.2351989746094, "logps/rejected": -437.7659606933594, "loss": 0.5306, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1853481531143188, "rewards/margins": 0.7383456826210022, "rewards/rejected": -1.9236938953399658, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -0.4834875464439392, "eval_logits/rejected": -0.16467474400997162, "eval_logps/chosen": -377.3648681640625, "eval_logps/rejected": -458.6296691894531, "eval_loss": 0.5258087515830994, "eval_rewards/accuracies": 0.7578125, "eval_rewards/chosen": -1.1476268768310547, "eval_rewards/margins": 0.8118469715118408, "eval_rewards/rejected": -1.959473729133606, "eval_runtime": 42.8446, "eval_samples_per_second": 46.68, "eval_steps_per_second": 0.747, "step": 200 }, { "epoch": 0.44, "grad_norm": 32.607007304168285, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -0.4726165235042572, "logits/rejected": 0.2889423966407776, "logps/chosen": -445.2005310058594, "logps/rejected": -516.150390625, "loss": 0.5157, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5719295740127563, "rewards/margins": 0.899932861328125, "rewards/rejected": -2.471862316131592, "step": 210 }, { "epoch": 0.46, "grad_norm": 26.982376912169133, "learning_rate": 3.272542485937368e-07, "logits/chosen": 0.02971530519425869, "logits/rejected": 0.6972896456718445, "logps/chosen": -488.6690979003906, "logps/rejected": -517.7215576171875, "loss": 0.5045, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9397350549697876, "rewards/margins": 0.67905592918396, "rewards/rejected": -2.618790626525879, "step": 220 }, { "epoch": 0.48, "grad_norm": 28.31686307704009, "learning_rate": 3.096924887558854e-07, "logits/chosen": 0.3274112641811371, "logits/rejected": 1.6159236431121826, "logps/chosen": -472.36590576171875, "logps/rejected": -560.3911743164062, "loss": 0.5157, "rewards/accuracies": 0.6875, "rewards/chosen": -2.139970302581787, "rewards/margins": 0.9002918004989624, "rewards/rejected": -3.040262460708618, "step": 230 }, { "epoch": 0.5, "grad_norm": 26.93455267413187, "learning_rate": 2.9181224366319943e-07, "logits/chosen": 0.34222739934921265, "logits/rejected": 1.275580644607544, "logps/chosen": -493.95916748046875, "logps/rejected": -570.4260864257812, "loss": 0.4844, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.126845598220825, "rewards/margins": 0.9677503705024719, "rewards/rejected": -3.0945961475372314, "step": 240 }, { "epoch": 0.52, "grad_norm": 34.97761081158377, "learning_rate": 2.7370891215954565e-07, "logits/chosen": 0.15051239728927612, "logits/rejected": 1.3550792932510376, "logps/chosen": -447.21490478515625, "logps/rejected": -547.626708984375, "loss": 0.5157, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.229414463043213, "rewards/margins": 1.004492998123169, "rewards/rejected": -3.2339072227478027, "step": 250 }, { "epoch": 0.54, "grad_norm": 22.98186057582904, "learning_rate": 2.55479083351317e-07, "logits/chosen": -0.6737252473831177, "logits/rejected": -0.22596630454063416, "logps/chosen": -423.21124267578125, "logps/rejected": -537.1641845703125, "loss": 0.4931, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5507694482803345, "rewards/margins": 1.1678037643432617, "rewards/rejected": -2.7185730934143066, "step": 260 }, { "epoch": 0.56, "grad_norm": 24.568187901601522, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -0.7251144647598267, "logits/rejected": 0.055858515202999115, "logps/chosen": -460.47369384765625, "logps/rejected": -515.4779663085938, "loss": 0.5056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7928097248077393, "rewards/margins": 0.9766537547111511, "rewards/rejected": -2.769463300704956, "step": 270 }, { "epoch": 0.59, "grad_norm": 26.18745823449182, "learning_rate": 2.19029145890313e-07, "logits/chosen": -0.2248018980026245, "logits/rejected": 0.7499777674674988, "logps/chosen": -570.017822265625, "logps/rejected": -637.1007690429688, "loss": 0.4807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.686659097671509, "rewards/margins": 1.049829125404358, "rewards/rejected": -3.736487865447998, "step": 280 }, { "epoch": 0.61, "grad_norm": 27.843254345626296, "learning_rate": 2.0100351342479216e-07, "logits/chosen": 0.017834633588790894, "logits/rejected": 0.5087012052536011, "logps/chosen": -523.2457275390625, "logps/rejected": -614.5767211914062, "loss": 0.4986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.559013605117798, "rewards/margins": 0.9906851053237915, "rewards/rejected": -3.5496985912323, "step": 290 }, { "epoch": 0.63, "grad_norm": 28.812343531542734, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -0.09833024442195892, "logits/rejected": 0.13667245209217072, "logps/chosen": -530.9118041992188, "logps/rejected": -594.4251098632812, "loss": 0.5097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.618612766265869, "rewards/margins": 0.6055675745010376, "rewards/rejected": -3.224180221557617, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -0.46580713987350464, "eval_logits/rejected": -0.05736924335360527, "eval_logps/chosen": -498.608642578125, "eval_logps/rejected": -600.8517456054688, "eval_loss": 0.5078553557395935, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -2.3600645065307617, "eval_rewards/margins": 1.021630048751831, "eval_rewards/rejected": -3.3816945552825928, "eval_runtime": 42.9133, "eval_samples_per_second": 46.606, "eval_steps_per_second": 0.746, "step": 300 }, { "epoch": 0.65, "grad_norm": 24.395908084810543, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -0.20998772978782654, "logits/rejected": 0.597333550453186, "logps/chosen": -553.8638305664062, "logps/rejected": -622.6854248046875, "loss": 0.4601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6587719917297363, "rewards/margins": 1.019250512123108, "rewards/rejected": -3.678022861480713, "step": 310 }, { "epoch": 0.67, "grad_norm": 34.66768420911215, "learning_rate": 1.488723393865766e-07, "logits/chosen": 0.16718712449073792, "logits/rejected": 0.9329349398612976, "logps/chosen": -593.7220458984375, "logps/rejected": -669.1134033203125, "loss": 0.4733, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0446815490722656, "rewards/margins": 1.103857159614563, "rewards/rejected": -4.1485395431518555, "step": 320 }, { "epoch": 0.69, "grad_norm": 28.471803594507218, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -0.11941705644130707, "logits/rejected": 0.6031097173690796, "logps/chosen": -576.390869140625, "logps/rejected": -662.9435424804688, "loss": 0.4956, "rewards/accuracies": 0.75, "rewards/chosen": -2.895800828933716, "rewards/margins": 1.0525661706924438, "rewards/rejected": -3.9483673572540283, "step": 330 }, { "epoch": 0.71, "grad_norm": 29.920730024726208, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -0.6586230397224426, "logits/rejected": -0.01618196628987789, "logps/chosen": -558.61328125, "logps/rejected": -645.0929565429688, "loss": 0.4901, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6631388664245605, "rewards/margins": 0.9750139117240906, "rewards/rejected": -3.638153076171875, "step": 340 }, { "epoch": 0.73, "grad_norm": 28.451943555198234, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -0.27327781915664673, "logits/rejected": 0.1500699818134308, "logps/chosen": -551.64501953125, "logps/rejected": -615.7404174804688, "loss": 0.5079, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8532228469848633, "rewards/margins": 0.7952502369880676, "rewards/rejected": -3.6484732627868652, "step": 350 }, { "epoch": 0.75, "grad_norm": 25.677922944710783, "learning_rate": 8.729103716819111e-08, "logits/chosen": -0.08269649744033813, "logits/rejected": 0.27684250473976135, "logps/chosen": -541.1990966796875, "logps/rejected": -651.2036743164062, "loss": 0.4674, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9707937240600586, "rewards/margins": 0.9196038246154785, "rewards/rejected": -3.8903980255126953, "step": 360 }, { "epoch": 0.77, "grad_norm": 36.170517312989006, "learning_rate": 7.387025063449081e-08, "logits/chosen": -0.2130926102399826, "logits/rejected": 0.6245878338813782, "logps/chosen": -593.176513671875, "logps/rejected": -702.429443359375, "loss": 0.4914, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0834755897521973, "rewards/margins": 1.1826813220977783, "rewards/rejected": -4.266157150268555, "step": 370 }, { "epoch": 0.79, "grad_norm": 29.948938424368286, "learning_rate": 6.138919252022435e-08, "logits/chosen": -0.31227773427963257, "logits/rejected": 0.1758570820093155, "logps/chosen": -569.8441162109375, "logps/rejected": -653.1160278320312, "loss": 0.4834, "rewards/accuracies": 0.78125, "rewards/chosen": -2.861605644226074, "rewards/margins": 1.1251481771469116, "rewards/rejected": -3.9867539405822754, "step": 380 }, { "epoch": 0.82, "grad_norm": 28.643050867948876, "learning_rate": 4.991445467064689e-08, "logits/chosen": -0.4378163814544678, "logits/rejected": 0.1975017786026001, "logps/chosen": -487.02447509765625, "logps/rejected": -627.4256591796875, "loss": 0.4863, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4755725860595703, "rewards/margins": 1.2951563596725464, "rewards/rejected": -3.770728588104248, "step": 390 }, { "epoch": 0.84, "grad_norm": 28.740512536833982, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -0.464282363653183, "logits/rejected": 0.3086758852005005, "logps/chosen": -510.72564697265625, "logps/rejected": -584.2462158203125, "loss": 0.4906, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4956374168395996, "rewards/margins": 0.9018915891647339, "rewards/rejected": -3.397528886795044, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -0.5081287026405334, "eval_logits/rejected": -0.03897371515631676, "eval_logps/chosen": -499.4172058105469, "eval_logps/rejected": -610.7911376953125, "eval_loss": 0.49998462200164795, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": -2.368149995803833, "eval_rewards/margins": 1.1129380464553833, "eval_rewards/rejected": -3.4810879230499268, "eval_runtime": 43.5613, "eval_samples_per_second": 45.912, "eval_steps_per_second": 0.735, "step": 400 }, { "epoch": 0.86, "grad_norm": 29.137185743173024, "learning_rate": 3.022313472693447e-08, "logits/chosen": -0.427814781665802, "logits/rejected": 0.47378939390182495, "logps/chosen": -502.87939453125, "logps/rejected": -579.4691162109375, "loss": 0.5048, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4262542724609375, "rewards/margins": 0.9697957038879395, "rewards/rejected": -3.396049976348877, "step": 410 }, { "epoch": 0.88, "grad_norm": 28.72794205317694, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -0.4604805111885071, "logits/rejected": 0.19487139582633972, "logps/chosen": -517.6533813476562, "logps/rejected": -557.2385864257812, "loss": 0.4911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.416501760482788, "rewards/margins": 0.9597095251083374, "rewards/rejected": -3.376211643218994, "step": 420 }, { "epoch": 0.9, "grad_norm": 21.80245746988776, "learning_rate": 1.521597710086439e-08, "logits/chosen": -0.3804924488067627, "logits/rejected": 0.39526861906051636, "logps/chosen": -535.6771240234375, "logps/rejected": -618.59326171875, "loss": 0.4719, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5517053604125977, "rewards/margins": 1.1444313526153564, "rewards/rejected": -3.696136474609375, "step": 430 }, { "epoch": 0.92, "grad_norm": 25.533460874638475, "learning_rate": 9.57301420397924e-09, "logits/chosen": -0.2722262442111969, "logits/rejected": 0.6270692944526672, "logps/chosen": -534.4539184570312, "logps/rejected": -636.7295532226562, "loss": 0.4921, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.657174587249756, "rewards/margins": 1.1696617603302002, "rewards/rejected": -3.826836347579956, "step": 440 }, { "epoch": 0.94, "grad_norm": 29.41617255057514, "learning_rate": 5.212833302556258e-09, "logits/chosen": -0.4902656078338623, "logits/rejected": 0.2050538957118988, "logps/chosen": -552.1812744140625, "logps/rejected": -615.8868408203125, "loss": 0.4784, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5670862197875977, "rewards/margins": 1.0947918891906738, "rewards/rejected": -3.6618781089782715, "step": 450 }, { "epoch": 0.96, "grad_norm": 26.40697406008373, "learning_rate": 2.158697848236607e-09, "logits/chosen": -0.5525180697441101, "logits/rejected": 0.4095240533351898, "logps/chosen": -521.8342895507812, "logps/rejected": -599.0489501953125, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -2.4641835689544678, "rewards/margins": 1.049481987953186, "rewards/rejected": -3.5136656761169434, "step": 460 }, { "epoch": 0.98, "grad_norm": 40.87484210645662, "learning_rate": 4.269029751107489e-10, "logits/chosen": -0.38540196418762207, "logits/rejected": 0.25467342138290405, "logps/chosen": -550.1776123046875, "logps/rejected": -633.5750732421875, "loss": 0.4908, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6054952144622803, "rewards/margins": 0.973294734954834, "rewards/rejected": -3.5787901878356934, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.535196884905444, "train_runtime": 3527.4214, "train_samples_per_second": 17.331, "train_steps_per_second": 0.136 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }