{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 400, "global_step": 17412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.0903713703155518, "learning_rate": 2.870264064293915e-10, "logits/chosen": -2.8080272674560547, "logits/rejected": -2.785019874572754, "logps/chosen": -44.8405876159668, "logps/rejected": -39.36625671386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.0972132682800293, "learning_rate": 2.870264064293915e-09, "logits/chosen": -2.9041833877563477, "logits/rejected": -2.8814697265625, "logps/chosen": -51.8206672668457, "logps/rejected": -49.242462158203125, "loss": 0.6932, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": -0.0001266792678507045, "rewards/margins": -6.697729259030893e-05, "rewards/rejected": -5.9702000726247206e-05, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.0939364433288574, "learning_rate": 5.74052812858783e-09, "logits/chosen": -2.9464728832244873, "logits/rejected": -2.941704273223877, "logps/chosen": -53.830238342285156, "logps/rejected": -52.87778854370117, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -8.338769839610904e-05, "rewards/margins": -6.040342850610614e-05, "rewards/rejected": -2.2984275346971117e-05, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.2386536598205566, "learning_rate": 8.610792192881744e-09, "logits/chosen": -2.9099199771881104, "logits/rejected": -2.8916428089141846, "logps/chosen": -57.69584274291992, "logps/rejected": -57.812767028808594, "loss": 0.6933, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0001644211879465729, "rewards/margins": -0.0003975186846219003, "rewards/rejected": 0.00023309746757149696, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 1.8409967422485352, "learning_rate": 1.148105625717566e-08, "logits/chosen": -2.9275033473968506, "logits/rejected": -2.90380597114563, "logps/chosen": -56.05250930786133, "logps/rejected": -50.15835189819336, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00019465530931483954, "rewards/margins": 0.0001988656586036086, "rewards/rejected": -4.210349288769066e-06, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 1.9817814826965332, "learning_rate": 1.4351320321469575e-08, "logits/chosen": -2.9312682151794434, "logits/rejected": -2.92026948928833, "logps/chosen": -53.17545700073242, "logps/rejected": -50.43696975708008, "loss": 0.6934, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -7.185345748439431e-05, "rewards/margins": -0.0004073601739946753, "rewards/rejected": 0.00033550671651028097, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.351980209350586, "learning_rate": 1.7221584385763487e-08, "logits/chosen": -2.950517177581787, "logits/rejected": -2.9276511669158936, "logps/chosen": -58.39799118041992, "logps/rejected": -53.92292404174805, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.628993800295575e-06, "rewards/margins": 0.00013930079876445234, "rewards/rejected": -0.00014092979836277664, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.0369927883148193, "learning_rate": 2.0091848450057407e-08, "logits/chosen": -2.907961368560791, "logits/rejected": -2.895718812942505, "logps/chosen": -54.824188232421875, "logps/rejected": -52.39093017578125, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 3.3644850191194564e-05, "rewards/margins": -7.855640433263034e-05, "rewards/rejected": 0.00011220127635169774, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.256385326385498, "learning_rate": 2.296211251435132e-08, "logits/chosen": -2.9645910263061523, "logits/rejected": -2.9431025981903076, "logps/chosen": -60.18164825439453, "logps/rejected": -53.25905227661133, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0001644838775973767, "rewards/margins": 0.00026324234204366803, "rewards/rejected": -9.875847899820656e-05, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.140350580215454, "learning_rate": 2.5832376578645234e-08, "logits/chosen": -2.868856906890869, "logits/rejected": -2.861722469329834, "logps/chosen": -54.91559982299805, "logps/rejected": -51.80546951293945, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00016598291404079646, "rewards/margins": 0.00022237647499423474, "rewards/rejected": -5.6393568229395896e-05, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.204599380493164, "learning_rate": 2.870264064293915e-08, "logits/chosen": -2.9691081047058105, "logits/rejected": -2.9209182262420654, "logps/chosen": -57.41423797607422, "logps/rejected": -48.773643493652344, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -4.708242340711877e-05, "rewards/margins": 2.3137796233640984e-05, "rewards/rejected": -7.022023783065379e-05, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.250347852706909, "learning_rate": 3.157290470723306e-08, "logits/chosen": -2.94686222076416, "logits/rejected": -2.927717924118042, "logps/chosen": -56.643592834472656, "logps/rejected": -51.976280212402344, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00028140738140791655, "rewards/margins": 0.0003524109488353133, "rewards/rejected": -7.100355287548155e-05, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.315481185913086, "learning_rate": 3.4443168771526975e-08, "logits/chosen": -2.884237766265869, "logits/rejected": -2.8725972175598145, "logps/chosen": -53.704261779785156, "logps/rejected": -54.928077697753906, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -4.277153129805811e-05, "rewards/margins": 4.7175493818940595e-05, "rewards/rejected": -8.994706149678677e-05, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 1.970468282699585, "learning_rate": 3.731343283582089e-08, "logits/chosen": -2.9317197799682617, "logits/rejected": -2.926629066467285, "logps/chosen": -56.63734817504883, "logps/rejected": -53.116554260253906, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.00013711750216316432, "rewards/margins": 0.00013836535799782723, "rewards/rejected": -0.000275482889264822, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.384549856185913, "learning_rate": 4.0183696900114815e-08, "logits/chosen": -2.941270589828491, "logits/rejected": -2.931300163269043, "logps/chosen": -54.492347717285156, "logps/rejected": -52.616127014160156, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0001752917014528066, "rewards/margins": 1.1596118383749854e-05, "rewards/rejected": -0.00018688786076381803, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.067789316177368, "learning_rate": 4.305396096440873e-08, "logits/chosen": -2.887172222137451, "logits/rejected": -2.8742759227752686, "logps/chosen": -53.08788299560547, "logps/rejected": -51.228477478027344, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0002487800666131079, "rewards/margins": 3.332551614221302e-06, "rewards/rejected": -0.0002521126007195562, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 1.888665795326233, "learning_rate": 4.592422502870264e-08, "logits/chosen": -2.9302210807800293, "logits/rejected": -2.9166245460510254, "logps/chosen": -54.6097297668457, "logps/rejected": -54.28050994873047, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0001351906539639458, "rewards/margins": 3.80988021788653e-05, "rewards/rejected": -0.00017328947433270514, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.0785109996795654, "learning_rate": 4.8794489092996555e-08, "logits/chosen": -2.915494441986084, "logits/rejected": -2.9011707305908203, "logps/chosen": -56.4631233215332, "logps/rejected": -50.787933349609375, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.93574469955638e-05, "rewards/margins": 0.0002879555686376989, "rewards/rejected": -0.0003773130592890084, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.2689266204833984, "learning_rate": 5.166475315729047e-08, "logits/chosen": -2.914288282394409, "logits/rejected": -2.8982763290405273, "logps/chosen": -57.08890914916992, "logps/rejected": -52.475975036621094, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0004877672472503036, "rewards/margins": 0.00010756841948023066, "rewards/rejected": -0.000595335615798831, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.552917957305908, "learning_rate": 5.453501722158439e-08, "logits/chosen": -2.9489758014678955, "logits/rejected": -2.9161152839660645, "logps/chosen": -59.655799865722656, "logps/rejected": -51.72637939453125, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00032277655554935336, "rewards/margins": 0.0004504809621721506, "rewards/rejected": -0.0007732574595138431, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.2386064529418945, "learning_rate": 5.74052812858783e-08, "logits/chosen": -2.907075881958008, "logits/rejected": -2.8978381156921387, "logps/chosen": -54.8097038269043, "logps/rejected": -53.757843017578125, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00047458295011892915, "rewards/margins": 0.00024578528245911, "rewards/rejected": -0.0007203682325780392, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.1775388717651367, "learning_rate": 6.027554535017222e-08, "logits/chosen": -2.865586519241333, "logits/rejected": -2.8630738258361816, "logps/chosen": -54.1197624206543, "logps/rejected": -56.469505310058594, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00030365376733243465, "rewards/margins": 0.00024198104802053422, "rewards/rejected": -0.000545634888112545, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.0510928630828857, "learning_rate": 6.314580941446612e-08, "logits/chosen": -2.9047980308532715, "logits/rejected": -2.882296085357666, "logps/chosen": -53.47319412231445, "logps/rejected": -50.114261627197266, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00045462168054655194, "rewards/margins": 0.0005546043394133449, "rewards/rejected": -0.0010092259617522359, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.2624847888946533, "learning_rate": 6.601607347876004e-08, "logits/chosen": -2.9001288414001465, "logits/rejected": -2.889115333557129, "logps/chosen": -49.89358139038086, "logps/rejected": -49.509525299072266, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0009472787496633828, "rewards/margins": 0.0006674075848422945, "rewards/rejected": -0.0016146863345056772, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 1.9900765419006348, "learning_rate": 6.888633754305395e-08, "logits/chosen": -2.875915765762329, "logits/rejected": -2.846496343612671, "logps/chosen": -56.79510498046875, "logps/rejected": -51.82697677612305, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0007951630395837128, "rewards/margins": 0.001057974761351943, "rewards/rejected": -0.0018531378591433167, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.0720698833465576, "learning_rate": 7.175660160734787e-08, "logits/chosen": -2.949646472930908, "logits/rejected": -2.932356357574463, "logps/chosen": -53.51816940307617, "logps/rejected": -50.2498893737793, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0007559804944321513, "rewards/margins": 0.0008698742603883147, "rewards/rejected": -0.0016258548712357879, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 2.004641532897949, "learning_rate": 7.462686567164178e-08, "logits/chosen": -2.93061900138855, "logits/rejected": -2.9231138229370117, "logps/chosen": -55.85028076171875, "logps/rejected": -55.425811767578125, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0011615584371611476, "rewards/margins": 0.0014426795532926917, "rewards/rejected": -0.0026042379904538393, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.1566646099090576, "learning_rate": 7.74971297359357e-08, "logits/chosen": -2.8958191871643066, "logits/rejected": -2.8884072303771973, "logps/chosen": -53.99376678466797, "logps/rejected": -53.75768280029297, "loss": 0.6925, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.002173061715438962, "rewards/margins": 0.0012080974411219358, "rewards/rejected": -0.003381159156560898, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.050694227218628, "learning_rate": 8.036739380022963e-08, "logits/chosen": -2.9549427032470703, "logits/rejected": -2.9329402446746826, "logps/chosen": -58.91938018798828, "logps/rejected": -52.90944290161133, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0015352768823504448, "rewards/margins": 0.0019018324092030525, "rewards/rejected": -0.0034371097572147846, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 2.0070717334747314, "learning_rate": 8.323765786452354e-08, "logits/chosen": -2.903834819793701, "logits/rejected": -2.8948657512664795, "logps/chosen": -57.140472412109375, "logps/rejected": -53.616294860839844, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0020656147971749306, "rewards/margins": 0.001217393553815782, "rewards/rejected": -0.0032830084674060345, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.071979284286499, "learning_rate": 8.610792192881746e-08, "logits/chosen": -2.851034164428711, "logits/rejected": -2.8530735969543457, "logps/chosen": -55.19122314453125, "logps/rejected": -53.63108444213867, "loss": 0.6923, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0020001870580017567, "rewards/margins": 0.0016876447480171919, "rewards/rejected": -0.003687832038849592, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.0738368034362793, "learning_rate": 8.897818599311136e-08, "logits/chosen": -2.900289535522461, "logits/rejected": -2.9008705615997314, "logps/chosen": -55.09581756591797, "logps/rejected": -52.99066925048828, "loss": 0.692, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0026262167375534773, "rewards/margins": 0.002256598323583603, "rewards/rejected": -0.004882815293967724, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.287440538406372, "learning_rate": 9.184845005740528e-08, "logits/chosen": -2.8849425315856934, "logits/rejected": -2.866859197616577, "logps/chosen": -56.91963577270508, "logps/rejected": -49.55434799194336, "loss": 0.6916, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0035529057495296, "rewards/margins": 0.0031561902724206448, "rewards/rejected": -0.006709096021950245, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.2119367122650146, "learning_rate": 9.47187141216992e-08, "logits/chosen": -2.913548707962036, "logits/rejected": -2.8961174488067627, "logps/chosen": -56.565879821777344, "logps/rejected": -51.76605224609375, "loss": 0.6913, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0037055802531540394, "rewards/margins": 0.003760724561288953, "rewards/rejected": -0.007466305047273636, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.02966570854187, "learning_rate": 9.758897818599311e-08, "logits/chosen": -2.8804399967193604, "logits/rejected": -2.8679065704345703, "logps/chosen": -53.284034729003906, "logps/rejected": -52.49842071533203, "loss": 0.6921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004340114537626505, "rewards/margins": 0.0022195247001945972, "rewards/rejected": -0.006559638772159815, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.2227911949157715, "learning_rate": 1.0045924225028703e-07, "logits/chosen": -2.8534722328186035, "logits/rejected": -2.8211724758148193, "logps/chosen": -57.56907272338867, "logps/rejected": -54.702880859375, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004618941806256771, "rewards/margins": 0.004073272459208965, "rewards/rejected": -0.008692214265465736, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.202179431915283, "learning_rate": 1.0332950631458094e-07, "logits/chosen": -2.9505436420440674, "logits/rejected": -2.9322307109832764, "logps/chosen": -56.734352111816406, "logps/rejected": -50.521728515625, "loss": 0.6908, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007336387876421213, "rewards/margins": 0.0047567421570420265, "rewards/rejected": -0.012093128636479378, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.0590083599090576, "learning_rate": 1.0619977037887486e-07, "logits/chosen": -2.935725688934326, "logits/rejected": -2.9110045433044434, "logps/chosen": -54.897926330566406, "logps/rejected": -52.303985595703125, "loss": 0.6892, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005601981189101934, "rewards/margins": 0.00793964508920908, "rewards/rejected": -0.013541625812649727, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 1.9834247827529907, "learning_rate": 1.0907003444316878e-07, "logits/chosen": -2.9965431690216064, "logits/rejected": -2.977295160293579, "logps/chosen": -55.92670822143555, "logps/rejected": -52.464569091796875, "loss": 0.6904, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00846773386001587, "rewards/margins": 0.0055896444246172905, "rewards/rejected": -0.014057380147278309, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.2894082069396973, "learning_rate": 1.1194029850746268e-07, "logits/chosen": -2.9287617206573486, "logits/rejected": -2.9139797687530518, "logps/chosen": -58.235557556152344, "logps/rejected": -55.66755294799805, "loss": 0.6897, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00911947526037693, "rewards/margins": 0.00718895485624671, "rewards/rejected": -0.016308428719639778, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 1.8951120376586914, "learning_rate": 1.148105625717566e-07, "logits/chosen": -2.915398597717285, "logits/rejected": -2.903876781463623, "logps/chosen": -55.58208084106445, "logps/rejected": -53.35950469970703, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": -0.010802007280290127, "rewards/margins": 0.008632650598883629, "rewards/rejected": -0.01943465694785118, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -2.9668679237365723, "eval_logits/rejected": -2.9633283615112305, "eval_logps/chosen": -58.91602325439453, "eval_logps/rejected": -62.86379623413086, "eval_loss": 0.6921194791793823, "eval_rewards/accuracies": 0.5615706443786621, "eval_rewards/chosen": 0.000994483008980751, "eval_rewards/margins": 0.002136482624337077, "eval_rewards/rejected": -0.0011419994989410043, "eval_runtime": 361.4134, "eval_samples_per_second": 11.909, "eval_steps_per_second": 1.489, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 1.98946213722229, "learning_rate": 1.1768082663605051e-07, "logits/chosen": -2.907158374786377, "logits/rejected": -2.908372402191162, "logps/chosen": -52.62499237060547, "logps/rejected": -56.15748977661133, "loss": 0.6917, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.015026355162262917, "rewards/margins": 0.003188853384926915, "rewards/rejected": -0.018215205520391464, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.406496524810791, "learning_rate": 1.2055109070034443e-07, "logits/chosen": -2.8934340476989746, "logits/rejected": -2.8902764320373535, "logps/chosen": -56.8239860534668, "logps/rejected": -55.338157653808594, "loss": 0.6913, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014624091796576977, "rewards/margins": 0.0038867860566824675, "rewards/rejected": -0.018510878086090088, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.2292561531066895, "learning_rate": 1.2342135476463834e-07, "logits/chosen": -2.9318697452545166, "logits/rejected": -2.9210400581359863, "logps/chosen": -56.003822326660156, "logps/rejected": -54.83512496948242, "loss": 0.6903, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.012456988915801048, "rewards/margins": 0.006052222102880478, "rewards/rejected": -0.018509209156036377, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.427631378173828, "learning_rate": 1.2629161882893224e-07, "logits/chosen": -2.9725725650787354, "logits/rejected": -2.9464778900146484, "logps/chosen": -56.38798141479492, "logps/rejected": -55.255462646484375, "loss": 0.6872, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013722503557801247, "rewards/margins": 0.01236006710678339, "rewards/rejected": -0.026082569733262062, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.2296125888824463, "learning_rate": 1.2916188289322615e-07, "logits/chosen": -2.9100751876831055, "logits/rejected": -2.888523817062378, "logps/chosen": -58.57600021362305, "logps/rejected": -56.778778076171875, "loss": 0.6877, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.012910036370158195, "rewards/margins": 0.011217395775020123, "rewards/rejected": -0.024127434939146042, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.1595005989074707, "learning_rate": 1.3203214695752008e-07, "logits/chosen": -2.8796286582946777, "logits/rejected": -2.8681280612945557, "logps/chosen": -58.94380569458008, "logps/rejected": -54.874755859375, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019626779481768608, "rewards/margins": 0.008413241244852543, "rewards/rejected": -0.028040017932653427, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.187743663787842, "learning_rate": 1.34902411021814e-07, "logits/chosen": -2.8922088146209717, "logits/rejected": -2.8674492835998535, "logps/chosen": -56.465782165527344, "logps/rejected": -53.37446212768555, "loss": 0.688, "rewards/accuracies": 0.59375, "rewards/chosen": -0.020162660628557205, "rewards/margins": 0.010726598091423512, "rewards/rejected": -0.03088926151394844, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.302581548690796, "learning_rate": 1.377726750861079e-07, "logits/chosen": -2.9019789695739746, "logits/rejected": -2.8941612243652344, "logps/chosen": -57.38104248046875, "logps/rejected": -61.480247497558594, "loss": 0.6897, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.02589554153382778, "rewards/margins": 0.007494648452848196, "rewards/rejected": -0.03339018672704697, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.3632287979125977, "learning_rate": 1.406429391504018e-07, "logits/chosen": -2.848597764968872, "logits/rejected": -2.813154697418213, "logps/chosen": -63.735321044921875, "logps/rejected": -54.053192138671875, "loss": 0.6874, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.026205237954854965, "rewards/margins": 0.012445995584130287, "rewards/rejected": -0.0386512354016304, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 2.103153944015503, "learning_rate": 1.4351320321469574e-07, "logits/chosen": -2.8737709522247314, "logits/rejected": -2.856574535369873, "logps/chosen": -59.20196533203125, "logps/rejected": -55.20966339111328, "loss": 0.689, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029176678508520126, "rewards/margins": 0.008984221145510674, "rewards/rejected": -0.03816089779138565, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 2.163799285888672, "learning_rate": 1.4638346727898965e-07, "logits/chosen": -2.8468947410583496, "logits/rejected": -2.8360259532928467, "logps/chosen": -61.5234489440918, "logps/rejected": -55.301490783691406, "loss": 0.6901, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.02848530374467373, "rewards/margins": 0.006644343491643667, "rewards/rejected": -0.035129647701978683, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 2.273447036743164, "learning_rate": 1.4925373134328355e-07, "logits/chosen": -2.8971662521362305, "logits/rejected": -2.8807578086853027, "logps/chosen": -60.303489685058594, "logps/rejected": -56.05200958251953, "loss": 0.6867, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03529467061161995, "rewards/margins": 0.013707530684769154, "rewards/rejected": -0.04900220036506653, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 2.012242555618286, "learning_rate": 1.5212399540757749e-07, "logits/chosen": -2.8951823711395264, "logits/rejected": -2.864811658859253, "logps/chosen": -59.95064163208008, "logps/rejected": -55.23982620239258, "loss": 0.6835, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.032894477248191833, "rewards/margins": 0.020433872938156128, "rewards/rejected": -0.05332835391163826, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 2.126307725906372, "learning_rate": 1.549942594718714e-07, "logits/chosen": -2.886043071746826, "logits/rejected": -2.873652696609497, "logps/chosen": -58.64757537841797, "logps/rejected": -57.500885009765625, "loss": 0.6909, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0470709502696991, "rewards/margins": 0.005540185607969761, "rewards/rejected": -0.05261113494634628, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.0772576332092285, "learning_rate": 1.5786452353616533e-07, "logits/chosen": -2.91432785987854, "logits/rejected": -2.898002862930298, "logps/chosen": -59.36701583862305, "logps/rejected": -56.921630859375, "loss": 0.687, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04622073844075203, "rewards/margins": 0.013325628824532032, "rewards/rejected": -0.059546370059251785, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.507021188735962, "learning_rate": 1.6073478760045926e-07, "logits/chosen": -2.900132656097412, "logits/rejected": -2.8904690742492676, "logps/chosen": -58.27146530151367, "logps/rejected": -59.824424743652344, "loss": 0.6887, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04870317503809929, "rewards/margins": 0.010025747120380402, "rewards/rejected": -0.05872892215847969, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.172370195388794, "learning_rate": 1.6360505166475317e-07, "logits/chosen": -2.872685670852661, "logits/rejected": -2.868788242340088, "logps/chosen": -57.04808807373047, "logps/rejected": -59.1138801574707, "loss": 0.6892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04688990116119385, "rewards/margins": 0.008995410054922104, "rewards/rejected": -0.05588530749082565, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 2.109297275543213, "learning_rate": 1.6647531572904707e-07, "logits/chosen": -2.885718584060669, "logits/rejected": -2.880315065383911, "logps/chosen": -56.78486251831055, "logps/rejected": -56.82386016845703, "loss": 0.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04839697107672691, "rewards/margins": 0.002040210645645857, "rewards/rejected": -0.05043717473745346, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.156661033630371, "learning_rate": 1.6934557979334098e-07, "logits/chosen": -2.8819355964660645, "logits/rejected": -2.8681695461273193, "logps/chosen": -58.881500244140625, "logps/rejected": -59.273109436035156, "loss": 0.6892, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.039070673286914825, "rewards/margins": 0.00895953830331564, "rewards/rejected": -0.04803021252155304, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.46242618560791, "learning_rate": 1.722158438576349e-07, "logits/chosen": -2.8836355209350586, "logits/rejected": -2.8641319274902344, "logps/chosen": -59.34708786010742, "logps/rejected": -61.18199920654297, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": -0.03741302713751793, "rewards/margins": 0.021672798320651054, "rewards/rejected": -0.05908583477139473, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.2858612537384033, "learning_rate": 1.7508610792192882e-07, "logits/chosen": -2.8395636081695557, "logits/rejected": -2.8389573097229004, "logps/chosen": -59.175819396972656, "logps/rejected": -58.0677604675293, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.048518843948841095, "rewards/margins": 0.0016841506585478783, "rewards/rejected": -0.050203002989292145, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.372987747192383, "learning_rate": 1.7795637198622273e-07, "logits/chosen": -2.9277400970458984, "logits/rejected": -2.906395196914673, "logps/chosen": -60.30255889892578, "logps/rejected": -58.70441818237305, "loss": 0.6823, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0395224392414093, "rewards/margins": 0.022816382348537445, "rewards/rejected": -0.06233882158994675, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.3703997135162354, "learning_rate": 1.8082663605051666e-07, "logits/chosen": -2.921839475631714, "logits/rejected": -2.8985724449157715, "logps/chosen": -60.25489044189453, "logps/rejected": -55.992027282714844, "loss": 0.6881, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.05049477890133858, "rewards/margins": 0.011371849104762077, "rewards/rejected": -0.06186662241816521, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.6197826862335205, "learning_rate": 1.8369690011481057e-07, "logits/chosen": -2.903247356414795, "logits/rejected": -2.900747776031494, "logps/chosen": -59.17439651489258, "logps/rejected": -60.142822265625, "loss": 0.689, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05740165710449219, "rewards/margins": 0.009622467681765556, "rewards/rejected": -0.0670241266489029, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.6450724601745605, "learning_rate": 1.8656716417910447e-07, "logits/chosen": -2.9058008193969727, "logits/rejected": -2.9096693992614746, "logps/chosen": -59.77911376953125, "logps/rejected": -60.92927932739258, "loss": 0.6937, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0681704506278038, "rewards/margins": 0.00016152606985997409, "rewards/rejected": -0.06833197176456451, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.3745062351226807, "learning_rate": 1.894374282433984e-07, "logits/chosen": -2.848968029022217, "logits/rejected": -2.8496081829071045, "logps/chosen": -62.2055549621582, "logps/rejected": -59.9939079284668, "loss": 0.6862, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0651950091123581, "rewards/margins": 0.015136671252548695, "rewards/rejected": -0.0803316980600357, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.419529676437378, "learning_rate": 1.9230769230769231e-07, "logits/chosen": -2.8650903701782227, "logits/rejected": -2.860945701599121, "logps/chosen": -61.01726150512695, "logps/rejected": -64.31108856201172, "loss": 0.6911, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06766843795776367, "rewards/margins": 0.005870690569281578, "rewards/rejected": -0.0735391229391098, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.495730400085449, "learning_rate": 1.9517795637198622e-07, "logits/chosen": -2.8328330516815186, "logits/rejected": -2.8099312782287598, "logps/chosen": -61.77775192260742, "logps/rejected": -59.171119689941406, "loss": 0.6833, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0667979046702385, "rewards/margins": 0.021314959973096848, "rewards/rejected": -0.08811286091804504, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.725309133529663, "learning_rate": 1.9804822043628013e-07, "logits/chosen": -2.9220008850097656, "logits/rejected": -2.8979132175445557, "logps/chosen": -66.50996398925781, "logps/rejected": -58.5330924987793, "loss": 0.6826, "rewards/accuracies": 0.59375, "rewards/chosen": -0.060177695006132126, "rewards/margins": 0.022599829360842705, "rewards/rejected": -0.08277751505374908, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.3490757942199707, "learning_rate": 2.0091848450057406e-07, "logits/chosen": -2.8857009410858154, "logits/rejected": -2.861926555633545, "logps/chosen": -63.034881591796875, "logps/rejected": -60.65327072143555, "loss": 0.6836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06567408889532089, "rewards/margins": 0.020616449415683746, "rewards/rejected": -0.08629053831100464, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.6334235668182373, "learning_rate": 2.0378874856486797e-07, "logits/chosen": -2.877946376800537, "logits/rejected": -2.8614790439605713, "logps/chosen": -62.624969482421875, "logps/rejected": -63.417335510253906, "loss": 0.6832, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07368381321430206, "rewards/margins": 0.02137075923383236, "rewards/rejected": -0.09505458176136017, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.7815544605255127, "learning_rate": 2.0665901262916187e-07, "logits/chosen": -2.85099458694458, "logits/rejected": -2.8473141193389893, "logps/chosen": -62.34055709838867, "logps/rejected": -62.96751022338867, "loss": 0.6871, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08011172711849213, "rewards/margins": 0.013986065983772278, "rewards/rejected": -0.094097800552845, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.759892702102661, "learning_rate": 2.095292766934558e-07, "logits/chosen": -2.9181156158447266, "logits/rejected": -2.8994133472442627, "logps/chosen": -65.53150939941406, "logps/rejected": -62.81523513793945, "loss": 0.6801, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08038532733917236, "rewards/margins": 0.02786167338490486, "rewards/rejected": -0.10824700444936752, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 3.3210110664367676, "learning_rate": 2.1239954075774971e-07, "logits/chosen": -2.8546454906463623, "logits/rejected": -2.834545612335205, "logps/chosen": -63.27663040161133, "logps/rejected": -63.44011688232422, "loss": 0.6815, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07576959580183029, "rewards/margins": 0.02525268867611885, "rewards/rejected": -0.10102228075265884, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.628596067428589, "learning_rate": 2.1526980482204362e-07, "logits/chosen": -2.9643959999084473, "logits/rejected": -2.9449915885925293, "logps/chosen": -64.82920837402344, "logps/rejected": -63.93852996826172, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07628447562456131, "rewards/margins": 0.03370644524693489, "rewards/rejected": -0.1099909320473671, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 3.2479381561279297, "learning_rate": 2.1814006888633755e-07, "logits/chosen": -2.8714213371276855, "logits/rejected": -2.8436214923858643, "logps/chosen": -64.50923156738281, "logps/rejected": -59.529205322265625, "loss": 0.6835, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08983375877141953, "rewards/margins": 0.021566372364759445, "rewards/rejected": -0.11140014231204987, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 4.023756504058838, "learning_rate": 2.2101033295063146e-07, "logits/chosen": -2.895245313644409, "logits/rejected": -2.8813509941101074, "logps/chosen": -62.97990036010742, "logps/rejected": -62.46956253051758, "loss": 0.6839, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0862732082605362, "rewards/margins": 0.02098711207509041, "rewards/rejected": -0.1072603091597557, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 3.13224196434021, "learning_rate": 2.2388059701492537e-07, "logits/chosen": -2.8914451599121094, "logits/rejected": -2.867403745651245, "logps/chosen": -63.27967071533203, "logps/rejected": -61.3089599609375, "loss": 0.6828, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08697815239429474, "rewards/margins": 0.022902244701981544, "rewards/rejected": -0.10988038778305054, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.790550470352173, "learning_rate": 2.2675086107921927e-07, "logits/chosen": -2.889011859893799, "logits/rejected": -2.863284111022949, "logps/chosen": -62.410133361816406, "logps/rejected": -63.09989547729492, "loss": 0.6804, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07453693449497223, "rewards/margins": 0.027675826102495193, "rewards/rejected": -0.10221276432275772, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.736335515975952, "learning_rate": 2.296211251435132e-07, "logits/chosen": -2.826007843017578, "logits/rejected": -2.807002067565918, "logps/chosen": -64.5998764038086, "logps/rejected": -65.90070343017578, "loss": 0.6822, "rewards/accuracies": 0.625, "rewards/chosen": -0.0889119952917099, "rewards/margins": 0.024376126006245613, "rewards/rejected": -0.11328812688589096, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -2.9291374683380127, "eval_logits/rejected": -2.925549030303955, "eval_logps/chosen": -64.04643249511719, "eval_logps/rejected": -69.37918090820312, "eval_loss": 0.6860670447349548, "eval_rewards/accuracies": 0.5745818018913269, "eval_rewards/chosen": -0.05030960217118263, "eval_rewards/margins": 0.015986217185854912, "eval_rewards/rejected": -0.06629582494497299, "eval_runtime": 361.4001, "eval_samples_per_second": 11.909, "eval_steps_per_second": 1.489, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 3.887179136276245, "learning_rate": 2.3249138920780711e-07, "logits/chosen": -2.8539488315582275, "logits/rejected": -2.8323490619659424, "logps/chosen": -66.29832458496094, "logps/rejected": -65.9931869506836, "loss": 0.6838, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09360718727111816, "rewards/margins": 0.02118730917572975, "rewards/rejected": -0.11479450762271881, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.8187315464019775, "learning_rate": 2.3536165327210102e-07, "logits/chosen": -2.885472297668457, "logits/rejected": -2.8682382106781006, "logps/chosen": -62.4206657409668, "logps/rejected": -62.85497283935547, "loss": 0.6803, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09593228250741959, "rewards/margins": 0.028071347624063492, "rewards/rejected": -0.12400363385677338, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 3.3670856952667236, "learning_rate": 2.3823191733639495e-07, "logits/chosen": -2.87396240234375, "logits/rejected": -2.856924057006836, "logps/chosen": -65.39492797851562, "logps/rejected": -65.48418426513672, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09522920101881027, "rewards/margins": 0.028466004878282547, "rewards/rejected": -0.12369520962238312, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.8142333030700684, "learning_rate": 2.4110218140068886e-07, "logits/chosen": -2.9053337574005127, "logits/rejected": -2.8831725120544434, "logps/chosen": -64.02351379394531, "logps/rejected": -61.65770721435547, "loss": 0.6782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08901962637901306, "rewards/margins": 0.032698072493076324, "rewards/rejected": -0.12171770632266998, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.8712363243103027, "learning_rate": 2.4397244546498277e-07, "logits/chosen": -2.8432557582855225, "logits/rejected": -2.834526300430298, "logps/chosen": -61.71845626831055, "logps/rejected": -66.37455749511719, "loss": 0.6854, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.10202014446258545, "rewards/margins": 0.017942581325769424, "rewards/rejected": -0.11996271461248398, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.912658214569092, "learning_rate": 2.468427095292767e-07, "logits/chosen": -2.866929531097412, "logits/rejected": -2.8523142337799072, "logps/chosen": -63.9709587097168, "logps/rejected": -64.18878936767578, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10068394988775253, "rewards/margins": 0.03187858313322067, "rewards/rejected": -0.1325625330209732, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 3.5958614349365234, "learning_rate": 2.497129735935706e-07, "logits/chosen": -2.890873908996582, "logits/rejected": -2.8933827877044678, "logps/chosen": -61.8720703125, "logps/rejected": -65.64959716796875, "loss": 0.6799, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0999109223484993, "rewards/margins": 0.029013585299253464, "rewards/rejected": -0.12892451882362366, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 3.718127489089966, "learning_rate": 2.525832376578645e-07, "logits/chosen": -2.830348491668701, "logits/rejected": -2.809882879257202, "logps/chosen": -63.20650100708008, "logps/rejected": -63.93225860595703, "loss": 0.6748, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09787528216838837, "rewards/margins": 0.03956557810306549, "rewards/rejected": -0.13744086027145386, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.6806609630584717, "learning_rate": 2.554535017221584e-07, "logits/chosen": -2.855287790298462, "logits/rejected": -2.829123020172119, "logps/chosen": -68.56311798095703, "logps/rejected": -65.0200424194336, "loss": 0.6805, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0874931737780571, "rewards/margins": 0.027916138991713524, "rewards/rejected": -0.11540931463241577, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 4.033020496368408, "learning_rate": 2.583237657864523e-07, "logits/chosen": -2.873629331588745, "logits/rejected": -2.868142604827881, "logps/chosen": -65.75654602050781, "logps/rejected": -64.44902801513672, "loss": 0.6815, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10713325440883636, "rewards/margins": 0.02599569596350193, "rewards/rejected": -0.13312894105911255, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 3.472957134246826, "learning_rate": 2.611940298507462e-07, "logits/chosen": -2.8482868671417236, "logits/rejected": -2.8463685512542725, "logps/chosen": -65.55746459960938, "logps/rejected": -68.69539642333984, "loss": 0.6782, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13495570421218872, "rewards/margins": 0.03325473517179489, "rewards/rejected": -0.16821043193340302, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 3.7193310260772705, "learning_rate": 2.6406429391504017e-07, "logits/chosen": -2.8696815967559814, "logits/rejected": -2.8379764556884766, "logps/chosen": -70.16816711425781, "logps/rejected": -66.00575256347656, "loss": 0.679, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.14669588208198547, "rewards/margins": 0.03142491728067398, "rewards/rejected": -0.17812080681324005, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 4.217350482940674, "learning_rate": 2.669345579793341e-07, "logits/chosen": -2.894317626953125, "logits/rejected": -2.8823533058166504, "logps/chosen": -68.23013305664062, "logps/rejected": -71.0595703125, "loss": 0.6782, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.14622612297534943, "rewards/margins": 0.03370025008916855, "rewards/rejected": -0.17992636561393738, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 3.810546636581421, "learning_rate": 2.69804822043628e-07, "logits/chosen": -2.9013657569885254, "logits/rejected": -2.874518871307373, "logps/chosen": -75.99950408935547, "logps/rejected": -73.35377502441406, "loss": 0.6746, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1469353437423706, "rewards/margins": 0.04149908572435379, "rewards/rejected": -0.1884344518184662, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 4.176051616668701, "learning_rate": 2.726750861079219e-07, "logits/chosen": -2.7648746967315674, "logits/rejected": -2.748953342437744, "logps/chosen": -73.96687316894531, "logps/rejected": -74.79029846191406, "loss": 0.6835, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18032407760620117, "rewards/margins": 0.023691440001130104, "rewards/rejected": -0.20401552319526672, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 3.564423084259033, "learning_rate": 2.755453501722158e-07, "logits/chosen": -2.756643772125244, "logits/rejected": -2.7593448162078857, "logps/chosen": -70.12141418457031, "logps/rejected": -74.08448028564453, "loss": 0.6905, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.18573260307312012, "rewards/margins": 0.008934564888477325, "rewards/rejected": -0.19466714560985565, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 4.809911251068115, "learning_rate": 2.784156142365097e-07, "logits/chosen": -2.8420376777648926, "logits/rejected": -2.8150031566619873, "logps/chosen": -78.68153381347656, "logps/rejected": -70.57462310791016, "loss": 0.6829, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1740989089012146, "rewards/margins": 0.024326497688889503, "rewards/rejected": -0.19842541217803955, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 5.355932235717773, "learning_rate": 2.812858783008036e-07, "logits/chosen": -2.9094066619873047, "logits/rejected": -2.890482187271118, "logps/chosen": -74.46675109863281, "logps/rejected": -74.08334350585938, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": -0.17054997384548187, "rewards/margins": 0.03467658534646034, "rewards/rejected": -0.2052265703678131, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 5.172062873840332, "learning_rate": 2.8415614236509757e-07, "logits/chosen": -2.828876256942749, "logits/rejected": -2.810253143310547, "logps/chosen": -71.98927307128906, "logps/rejected": -72.3040542602539, "loss": 0.6758, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17168062925338745, "rewards/margins": 0.0381183996796608, "rewards/rejected": -0.20979902148246765, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 4.146362781524658, "learning_rate": 2.870264064293915e-07, "logits/chosen": -2.7948880195617676, "logits/rejected": -2.772251844406128, "logps/chosen": -77.09089660644531, "logps/rejected": -72.52245330810547, "loss": 0.6821, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1894773691892624, "rewards/margins": 0.026282016187906265, "rewards/rejected": -0.21575936675071716, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 5.479541778564453, "learning_rate": 2.898966704936854e-07, "logits/chosen": -2.7532236576080322, "logits/rejected": -2.757920742034912, "logps/chosen": -72.29302978515625, "logps/rejected": -78.77204132080078, "loss": 0.6735, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.17986199259757996, "rewards/margins": 0.04465629905462265, "rewards/rejected": -0.22451826930046082, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 4.487902641296387, "learning_rate": 2.927669345579793e-07, "logits/chosen": -2.8538780212402344, "logits/rejected": -2.8298897743225098, "logps/chosen": -76.56214904785156, "logps/rejected": -76.95623779296875, "loss": 0.6684, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.18401969969272614, "rewards/margins": 0.05350281670689583, "rewards/rejected": -0.23752252757549286, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 4.088461875915527, "learning_rate": 2.956371986222732e-07, "logits/chosen": -2.878474235534668, "logits/rejected": -2.85931134223938, "logps/chosen": -73.5353012084961, "logps/rejected": -72.88682556152344, "loss": 0.6766, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.18161997199058533, "rewards/margins": 0.037842702120542526, "rewards/rejected": -0.21946267783641815, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 5.068690776824951, "learning_rate": 2.985074626865671e-07, "logits/chosen": -2.8367671966552734, "logits/rejected": -2.827674388885498, "logps/chosen": -73.47721862792969, "logps/rejected": -76.98851013183594, "loss": 0.6863, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20157821476459503, "rewards/margins": 0.018608151003718376, "rewards/rejected": -0.22018635272979736, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 4.511013507843018, "learning_rate": 3.0137772675086106e-07, "logits/chosen": -2.7761824131011963, "logits/rejected": -2.744597911834717, "logps/chosen": -77.72283935546875, "logps/rejected": -73.64256286621094, "loss": 0.6747, "rewards/accuracies": 0.625, "rewards/chosen": -0.19585371017456055, "rewards/margins": 0.042059190571308136, "rewards/rejected": -0.23791289329528809, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 4.592730522155762, "learning_rate": 3.0424799081515497e-07, "logits/chosen": -2.793379306793213, "logits/rejected": -2.775411367416382, "logps/chosen": -80.44139099121094, "logps/rejected": -79.88837432861328, "loss": 0.681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2278936207294464, "rewards/margins": 0.029009496793150902, "rewards/rejected": -0.25690311193466187, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 5.228631973266602, "learning_rate": 3.071182548794489e-07, "logits/chosen": -2.8702175617218018, "logits/rejected": -2.8414740562438965, "logps/chosen": -81.30256652832031, "logps/rejected": -79.92613220214844, "loss": 0.6736, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2350526601076126, "rewards/margins": 0.04512694478034973, "rewards/rejected": -0.28017958998680115, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 4.461070537567139, "learning_rate": 3.099885189437428e-07, "logits/chosen": -2.8689026832580566, "logits/rejected": -2.851099729537964, "logps/chosen": -83.89669036865234, "logps/rejected": -80.62748718261719, "loss": 0.6778, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.26774320006370544, "rewards/margins": 0.035492606461048126, "rewards/rejected": -0.30323582887649536, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 5.683162689208984, "learning_rate": 3.1285878300803674e-07, "logits/chosen": -2.779700517654419, "logits/rejected": -2.7773754596710205, "logps/chosen": -80.9176025390625, "logps/rejected": -82.39884948730469, "loss": 0.681, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.27540233731269836, "rewards/margins": 0.029661575332283974, "rewards/rejected": -0.3050638735294342, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 5.814540863037109, "learning_rate": 3.1572904707233065e-07, "logits/chosen": -2.799462080001831, "logits/rejected": -2.800605535507202, "logps/chosen": -80.4347915649414, "logps/rejected": -86.2599105834961, "loss": 0.6776, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.28192073106765747, "rewards/margins": 0.038120221346616745, "rewards/rejected": -0.3200409710407257, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 8.544889450073242, "learning_rate": 3.1859931113662456e-07, "logits/chosen": -2.8129661083221436, "logits/rejected": -2.822056531906128, "logps/chosen": -80.9301528930664, "logps/rejected": -86.2048568725586, "loss": 0.6748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2709517478942871, "rewards/margins": 0.042826101183891296, "rewards/rejected": -0.3137778341770172, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 5.1092753410339355, "learning_rate": 3.214695752009185e-07, "logits/chosen": -2.833594560623169, "logits/rejected": -2.816227436065674, "logps/chosen": -83.26692199707031, "logps/rejected": -82.28617858886719, "loss": 0.6781, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2621954679489136, "rewards/margins": 0.035831719636917114, "rewards/rejected": -0.2980271875858307, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 5.326091766357422, "learning_rate": 3.243398392652124e-07, "logits/chosen": -2.8480701446533203, "logits/rejected": -2.832531213760376, "logps/chosen": -80.08226776123047, "logps/rejected": -86.4864730834961, "loss": 0.6658, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.27589139342308044, "rewards/margins": 0.06190886348485947, "rewards/rejected": -0.3378002643585205, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 6.402179718017578, "learning_rate": 3.2721010332950633e-07, "logits/chosen": -2.800346612930298, "logits/rejected": -2.7789957523345947, "logps/chosen": -86.55638885498047, "logps/rejected": -84.81587219238281, "loss": 0.6778, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2876639664173126, "rewards/margins": 0.03826269879937172, "rewards/rejected": -0.32592669129371643, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 6.185295104980469, "learning_rate": 3.3008036739380024e-07, "logits/chosen": -2.763390064239502, "logits/rejected": -2.753084182739258, "logps/chosen": -81.66207122802734, "logps/rejected": -85.63570404052734, "loss": 0.6686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.270382285118103, "rewards/margins": 0.056910622864961624, "rewards/rejected": -0.32729288935661316, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 5.151527404785156, "learning_rate": 3.3295063145809414e-07, "logits/chosen": -2.8289105892181396, "logits/rejected": -2.821500062942505, "logps/chosen": -79.19139862060547, "logps/rejected": -82.31703186035156, "loss": 0.6699, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.23462215065956116, "rewards/margins": 0.053680647164583206, "rewards/rejected": -0.28830283880233765, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 5.562000751495361, "learning_rate": 3.3582089552238805e-07, "logits/chosen": -2.755552053451538, "logits/rejected": -2.746124505996704, "logps/chosen": -81.64362335205078, "logps/rejected": -86.65480041503906, "loss": 0.6736, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.287183552980423, "rewards/margins": 0.045087821781635284, "rewards/rejected": -0.33227136731147766, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 6.965643405914307, "learning_rate": 3.3869115958668196e-07, "logits/chosen": -2.7298243045806885, "logits/rejected": -2.706173896789551, "logps/chosen": -88.53440856933594, "logps/rejected": -88.83980560302734, "loss": 0.6752, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3485092520713806, "rewards/margins": 0.04484327509999275, "rewards/rejected": -0.39335256814956665, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 7.193821430206299, "learning_rate": 3.415614236509759e-07, "logits/chosen": -2.8387513160705566, "logits/rejected": -2.817599296569824, "logps/chosen": -92.8900375366211, "logps/rejected": -89.29020690917969, "loss": 0.6668, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.338334858417511, "rewards/margins": 0.06232946366071701, "rewards/rejected": -0.4006643295288086, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 5.700401306152344, "learning_rate": 3.444316877152698e-07, "logits/chosen": -2.739269733428955, "logits/rejected": -2.7290844917297363, "logps/chosen": -87.00050354003906, "logps/rejected": -88.2465591430664, "loss": 0.6737, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.329051673412323, "rewards/margins": 0.0502270832657814, "rewards/rejected": -0.3792787790298462, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -2.8561644554138184, "eval_logits/rejected": -2.852672815322876, "eval_logps/chosen": -86.91645050048828, "eval_logps/rejected": -94.43667602539062, "eval_loss": 0.6780018210411072, "eval_rewards/accuracies": 0.5762081742286682, "eval_rewards/chosen": -0.27900978922843933, "eval_rewards/margins": 0.0378609225153923, "eval_rewards/rejected": -0.31687071919441223, "eval_runtime": 361.1927, "eval_samples_per_second": 11.916, "eval_steps_per_second": 1.49, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 11.93420696258545, "learning_rate": 3.4730195177956373e-07, "logits/chosen": -2.805880069732666, "logits/rejected": -2.781040668487549, "logps/chosen": -94.92154693603516, "logps/rejected": -98.58137512207031, "loss": 0.669, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.39941346645355225, "rewards/margins": 0.06010304018855095, "rewards/rejected": -0.4595165252685547, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 7.634225845336914, "learning_rate": 3.5017221584385764e-07, "logits/chosen": -2.8049941062927246, "logits/rejected": -2.7828567028045654, "logps/chosen": -93.53218078613281, "logps/rejected": -100.98593139648438, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": -0.3984236717224121, "rewards/margins": 0.09305372089147568, "rewards/rejected": -0.4914774000644684, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 10.751758575439453, "learning_rate": 3.5304247990815155e-07, "logits/chosen": -2.867164134979248, "logits/rejected": -2.836141586303711, "logps/chosen": -99.47651672363281, "logps/rejected": -99.87283325195312, "loss": 0.6687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.42474478483200073, "rewards/margins": 0.06045510619878769, "rewards/rejected": -0.4851999282836914, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 8.225252151489258, "learning_rate": 3.5591274397244545e-07, "logits/chosen": -2.765495777130127, "logits/rejected": -2.7551586627960205, "logps/chosen": -93.8685302734375, "logps/rejected": -98.10967254638672, "loss": 0.6746, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4086647927761078, "rewards/margins": 0.05014032870531082, "rewards/rejected": -0.4588051438331604, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 9.079387664794922, "learning_rate": 3.587830080367394e-07, "logits/chosen": -2.8410396575927734, "logits/rejected": -2.8106024265289307, "logps/chosen": -92.39698791503906, "logps/rejected": -95.01252746582031, "loss": 0.6638, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3722931146621704, "rewards/margins": 0.07202822715044022, "rewards/rejected": -0.44432133436203003, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 8.725356101989746, "learning_rate": 3.616532721010333e-07, "logits/chosen": -2.7265498638153076, "logits/rejected": -2.7055981159210205, "logps/chosen": -100.69922637939453, "logps/rejected": -104.2999038696289, "loss": 0.6583, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.462491899728775, "rewards/margins": 0.08412086218595505, "rewards/rejected": -0.5466128587722778, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 9.932343482971191, "learning_rate": 3.645235361653272e-07, "logits/chosen": -2.760315179824829, "logits/rejected": -2.7526631355285645, "logps/chosen": -101.89413452148438, "logps/rejected": -108.7292251586914, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": -0.48711615800857544, "rewards/margins": 0.058868370950222015, "rewards/rejected": -0.5459845662117004, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 9.49893569946289, "learning_rate": 3.6739380022962113e-07, "logits/chosen": -2.788695812225342, "logits/rejected": -2.7951552867889404, "logps/chosen": -92.65678405761719, "logps/rejected": -108.2343978881836, "loss": 0.6576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.40768080949783325, "rewards/margins": 0.08443986624479294, "rewards/rejected": -0.4921206533908844, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 10.11652946472168, "learning_rate": 3.7026406429391504e-07, "logits/chosen": -2.737595796585083, "logits/rejected": -2.7155940532684326, "logps/chosen": -99.75877380371094, "logps/rejected": -100.16853332519531, "loss": 0.6709, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4402269423007965, "rewards/margins": 0.06059195473790169, "rewards/rejected": -0.5008189082145691, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 7.431065082550049, "learning_rate": 3.7313432835820895e-07, "logits/chosen": -2.7869224548339844, "logits/rejected": -2.778533458709717, "logps/chosen": -93.90223693847656, "logps/rejected": -102.38191986083984, "loss": 0.6745, "rewards/accuracies": 0.5625, "rewards/chosen": -0.43852171301841736, "rewards/margins": 0.052222102880477905, "rewards/rejected": -0.4907437860965729, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 10.433018684387207, "learning_rate": 3.7600459242250285e-07, "logits/chosen": -2.7792112827301025, "logits/rejected": -2.755417585372925, "logps/chosen": -97.74311065673828, "logps/rejected": -100.9432373046875, "loss": 0.6592, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.43741893768310547, "rewards/margins": 0.0848575159907341, "rewards/rejected": -0.5222764611244202, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 7.756955146789551, "learning_rate": 3.788748564867968e-07, "logits/chosen": -2.7987451553344727, "logits/rejected": -2.7830893993377686, "logps/chosen": -97.71382904052734, "logps/rejected": -95.74200439453125, "loss": 0.6791, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.4117484986782074, "rewards/margins": 0.0427439846098423, "rewards/rejected": -0.4544924795627594, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 5.824132442474365, "learning_rate": 3.817451205510907e-07, "logits/chosen": -2.8279478549957275, "logits/rejected": -2.804975986480713, "logps/chosen": -90.00537109375, "logps/rejected": -89.6543197631836, "loss": 0.6684, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3362139165401459, "rewards/margins": 0.06075562909245491, "rewards/rejected": -0.3969695568084717, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 6.094809055328369, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -2.7152292728424072, "logits/rejected": -2.6990323066711426, "logps/chosen": -84.92386627197266, "logps/rejected": -87.4384536743164, "loss": 0.6654, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.28962719440460205, "rewards/margins": 0.06613529473543167, "rewards/rejected": -0.3557624816894531, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 9.321885108947754, "learning_rate": 3.8748564867967853e-07, "logits/chosen": -2.7584316730499268, "logits/rejected": -2.7400782108306885, "logps/chosen": -95.60079956054688, "logps/rejected": -96.64454650878906, "loss": 0.6877, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41059422492980957, "rewards/margins": 0.026401972398161888, "rewards/rejected": -0.4369961619377136, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 8.124109268188477, "learning_rate": 3.9035591274397244e-07, "logits/chosen": -2.717226028442383, "logits/rejected": -2.7062318325042725, "logps/chosen": -91.82489013671875, "logps/rejected": -94.97454833984375, "loss": 0.677, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.35411137342453003, "rewards/margins": 0.04675086960196495, "rewards/rejected": -0.40086227655410767, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 6.5040283203125, "learning_rate": 3.9322617680826635e-07, "logits/chosen": -2.800179958343506, "logits/rejected": -2.783581256866455, "logps/chosen": -87.88378143310547, "logps/rejected": -94.44990539550781, "loss": 0.6599, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3401029706001282, "rewards/margins": 0.08058915287256241, "rewards/rejected": -0.42069211602211, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 9.241910934448242, "learning_rate": 3.9609644087256025e-07, "logits/chosen": -2.7376112937927246, "logits/rejected": -2.728394031524658, "logps/chosen": -96.5356674194336, "logps/rejected": -99.0477523803711, "loss": 0.6657, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4025992453098297, "rewards/margins": 0.07375938445329666, "rewards/rejected": -0.4763585925102234, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 10.927423477172852, "learning_rate": 3.989667049368542e-07, "logits/chosen": -2.827388286590576, "logits/rejected": -2.807051181793213, "logps/chosen": -111.54438781738281, "logps/rejected": -111.20584869384766, "loss": 0.6775, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.535297155380249, "rewards/margins": 0.0524635836482048, "rewards/rejected": -0.5877608060836792, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 8.385926246643066, "learning_rate": 4.018369690011481e-07, "logits/chosen": -2.7086968421936035, "logits/rejected": -2.6919989585876465, "logps/chosen": -100.72987365722656, "logps/rejected": -104.24874114990234, "loss": 0.6587, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.438565194606781, "rewards/margins": 0.08525042235851288, "rewards/rejected": -0.5238156318664551, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 9.942192077636719, "learning_rate": 4.0470723306544203e-07, "logits/chosen": -2.7430672645568848, "logits/rejected": -2.735506057739258, "logps/chosen": -96.6695327758789, "logps/rejected": -100.8282241821289, "loss": 0.6703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4228385090827942, "rewards/margins": 0.05999479442834854, "rewards/rejected": -0.48283329606056213, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 8.175232887268066, "learning_rate": 4.0757749712973593e-07, "logits/chosen": -2.683074474334717, "logits/rejected": -2.6863036155700684, "logps/chosen": -94.46149444580078, "logps/rejected": -104.10331726074219, "loss": 0.6689, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.43330973386764526, "rewards/margins": 0.06626768410205841, "rewards/rejected": -0.4995773732662201, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 9.212152481079102, "learning_rate": 4.1044776119402984e-07, "logits/chosen": -2.756502866744995, "logits/rejected": -2.7466683387756348, "logps/chosen": -99.7936782836914, "logps/rejected": -108.74727630615234, "loss": 0.6577, "rewards/accuracies": 0.65625, "rewards/chosen": -0.45254722237586975, "rewards/margins": 0.0870833694934845, "rewards/rejected": -0.539630651473999, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 12.781633377075195, "learning_rate": 4.1331802525832375e-07, "logits/chosen": -2.6940224170684814, "logits/rejected": -2.6727001667022705, "logps/chosen": -100.91461944580078, "logps/rejected": -103.84224700927734, "loss": 0.6681, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4932823181152344, "rewards/margins": 0.0696868747472763, "rewards/rejected": -0.5629692077636719, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 13.068676948547363, "learning_rate": 4.1618828932261765e-07, "logits/chosen": -2.695457935333252, "logits/rejected": -2.6661224365234375, "logps/chosen": -118.35257720947266, "logps/rejected": -123.06526184082031, "loss": 0.6647, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6264103055000305, "rewards/margins": 0.08263033628463745, "rewards/rejected": -0.709040641784668, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 10.450855255126953, "learning_rate": 4.190585533869116e-07, "logits/chosen": -2.7157628536224365, "logits/rejected": -2.6888949871063232, "logps/chosen": -126.8835220336914, "logps/rejected": -131.21517944335938, "loss": 0.6679, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7365719079971313, "rewards/margins": 0.07880139350891113, "rewards/rejected": -0.8153733015060425, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 21.110004425048828, "learning_rate": 4.219288174512055e-07, "logits/chosen": -2.8386118412017822, "logits/rejected": -2.8074774742126465, "logps/chosen": -131.73805236816406, "logps/rejected": -135.1602783203125, "loss": 0.6651, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.733563244342804, "rewards/margins": 0.07851357758045197, "rewards/rejected": -0.8120768666267395, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 8.21864128112793, "learning_rate": 4.2479908151549943e-07, "logits/chosen": -2.7962803840637207, "logits/rejected": -2.7671940326690674, "logps/chosen": -117.83184814453125, "logps/rejected": -125.13349914550781, "loss": 0.6615, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6565744280815125, "rewards/margins": 0.08087681233882904, "rewards/rejected": -0.7374511957168579, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 8.276408195495605, "learning_rate": 4.2766934557979333e-07, "logits/chosen": -2.7768349647521973, "logits/rejected": -2.755868911743164, "logps/chosen": -110.86151123046875, "logps/rejected": -114.18798828125, "loss": 0.6546, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5235632061958313, "rewards/margins": 0.09388275444507599, "rewards/rejected": -0.6174460649490356, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 10.690779685974121, "learning_rate": 4.3053960964408724e-07, "logits/chosen": -2.696868658065796, "logits/rejected": -2.694939374923706, "logps/chosen": -105.4218978881836, "logps/rejected": -111.50212097167969, "loss": 0.6778, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5433076620101929, "rewards/margins": 0.05183683708310127, "rewards/rejected": -0.5951444506645203, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 8.066518783569336, "learning_rate": 4.3340987370838115e-07, "logits/chosen": -2.69407320022583, "logits/rejected": -2.6686859130859375, "logps/chosen": -100.59495544433594, "logps/rejected": -105.2588119506836, "loss": 0.6471, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4159814715385437, "rewards/margins": 0.10820995271205902, "rewards/rejected": -0.5241914391517639, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 7.057396411895752, "learning_rate": 4.362801377726751e-07, "logits/chosen": -2.750384569168091, "logits/rejected": -2.748133420944214, "logps/chosen": -102.165283203125, "logps/rejected": -104.2789077758789, "loss": 0.6792, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4892779290676117, "rewards/margins": 0.047772832214832306, "rewards/rejected": -0.537050724029541, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 8.002290725708008, "learning_rate": 4.39150401836969e-07, "logits/chosen": -2.6707193851470947, "logits/rejected": -2.658749580383301, "logps/chosen": -100.72066497802734, "logps/rejected": -103.98619079589844, "loss": 0.6707, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4789508879184723, "rewards/margins": 0.0677775889635086, "rewards/rejected": -0.5467284917831421, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 10.716758728027344, "learning_rate": 4.420206659012629e-07, "logits/chosen": -2.7498934268951416, "logits/rejected": -2.7239015102386475, "logps/chosen": -98.62110900878906, "logps/rejected": -102.955810546875, "loss": 0.6512, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4214154779911041, "rewards/margins": 0.10634119808673859, "rewards/rejected": -0.5277566313743591, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 11.615742683410645, "learning_rate": 4.4489092996555683e-07, "logits/chosen": -2.678488254547119, "logits/rejected": -2.6662161350250244, "logps/chosen": -103.6252670288086, "logps/rejected": -107.395751953125, "loss": 0.6577, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4881840646266937, "rewards/margins": 0.09163917601108551, "rewards/rejected": -0.5798231959342957, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 14.48892593383789, "learning_rate": 4.4776119402985074e-07, "logits/chosen": -2.680042028427124, "logits/rejected": -2.6629881858825684, "logps/chosen": -122.0106430053711, "logps/rejected": -128.95391845703125, "loss": 0.6634, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7039979696273804, "rewards/margins": 0.08618190139532089, "rewards/rejected": -0.7901797294616699, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 10.497743606567383, "learning_rate": 4.5063145809414464e-07, "logits/chosen": -2.71398663520813, "logits/rejected": -2.7192444801330566, "logps/chosen": -130.80361938476562, "logps/rejected": -144.6383056640625, "loss": 0.6559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.775855541229248, "rewards/margins": 0.11350611597299576, "rewards/rejected": -0.8893616795539856, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 13.277445793151855, "learning_rate": 4.5350172215843855e-07, "logits/chosen": -2.6595773696899414, "logits/rejected": -2.653949499130249, "logps/chosen": -130.13494873046875, "logps/rejected": -139.91482543945312, "loss": 0.6658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7724030613899231, "rewards/margins": 0.09441746026277542, "rewards/rejected": -0.8668205142021179, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 11.512948036193848, "learning_rate": 4.563719862227325e-07, "logits/chosen": -2.694927215576172, "logits/rejected": -2.6844635009765625, "logps/chosen": -118.55953216552734, "logps/rejected": -128.92166137695312, "loss": 0.6541, "rewards/accuracies": 0.625, "rewards/chosen": -0.6396364569664001, "rewards/margins": 0.10439294576644897, "rewards/rejected": -0.7440294027328491, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 9.366582870483398, "learning_rate": 4.592422502870264e-07, "logits/chosen": -2.678959369659424, "logits/rejected": -2.6533610820770264, "logps/chosen": -108.41175842285156, "logps/rejected": -113.81941223144531, "loss": 0.6648, "rewards/accuracies": 0.5625, "rewards/chosen": -0.550684928894043, "rewards/margins": 0.07856275141239166, "rewards/rejected": -0.629247784614563, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -2.76123309135437, "eval_logits/rejected": -2.7578325271606445, "eval_logps/chosen": -104.01419067382812, "eval_logps/rejected": -114.58289337158203, "eval_loss": 0.6676661968231201, "eval_rewards/accuracies": 0.6029275059700012, "eval_rewards/chosen": -0.44998711347579956, "eval_rewards/margins": 0.06834576278924942, "eval_rewards/rejected": -0.5183328986167908, "eval_runtime": 361.1688, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 9.210315704345703, "learning_rate": 4.621125143513203e-07, "logits/chosen": -2.7067387104034424, "logits/rejected": -2.6871132850646973, "logps/chosen": -115.17427825927734, "logps/rejected": -127.82564544677734, "loss": 0.6484, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5927554368972778, "rewards/margins": 0.12254355102777481, "rewards/rejected": -0.7152990102767944, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 17.557851791381836, "learning_rate": 4.6498277841561423e-07, "logits/chosen": -2.636976718902588, "logits/rejected": -2.6198878288269043, "logps/chosen": -123.63352966308594, "logps/rejected": -137.7049102783203, "loss": 0.648, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7105950713157654, "rewards/margins": 0.13303378224372864, "rewards/rejected": -0.8436288833618164, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 13.887923240661621, "learning_rate": 4.6785304247990814e-07, "logits/chosen": -2.6831889152526855, "logits/rejected": -2.6683363914489746, "logps/chosen": -124.29011535644531, "logps/rejected": -129.17848205566406, "loss": 0.6547, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6721479296684265, "rewards/margins": 0.10923449695110321, "rewards/rejected": -0.7813824415206909, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 11.93607234954834, "learning_rate": 4.7072330654420204e-07, "logits/chosen": -2.7547574043273926, "logits/rejected": -2.727531671524048, "logps/chosen": -123.5032958984375, "logps/rejected": -124.08573150634766, "loss": 0.6673, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6534172892570496, "rewards/margins": 0.08224787563085556, "rewards/rejected": -0.7356651425361633, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 13.57874870300293, "learning_rate": 4.7359357060849595e-07, "logits/chosen": -2.7263035774230957, "logits/rejected": -2.7082901000976562, "logps/chosen": -121.01668548583984, "logps/rejected": -132.3745880126953, "loss": 0.6387, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6287935376167297, "rewards/margins": 0.14556141197681427, "rewards/rejected": -0.774354875087738, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 11.956063270568848, "learning_rate": 4.764638346727899e-07, "logits/chosen": -2.7257180213928223, "logits/rejected": -2.696934700012207, "logps/chosen": -130.76295471191406, "logps/rejected": -131.2085723876953, "loss": 0.6709, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.7444382309913635, "rewards/margins": 0.080128513276577, "rewards/rejected": -0.824566662311554, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 13.178788185119629, "learning_rate": 4.793340987370838e-07, "logits/chosen": -2.723220109939575, "logits/rejected": -2.7056446075439453, "logps/chosen": -126.40181732177734, "logps/rejected": -133.3967742919922, "loss": 0.6639, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7011595368385315, "rewards/margins": 0.09002326428890228, "rewards/rejected": -0.7911826968193054, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 17.14413833618164, "learning_rate": 4.822043628013777e-07, "logits/chosen": -2.6886258125305176, "logits/rejected": -2.671329975128174, "logps/chosen": -126.47139739990234, "logps/rejected": -136.08221435546875, "loss": 0.6492, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6947437524795532, "rewards/margins": 0.13168177008628845, "rewards/rejected": -0.8264254331588745, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 19.760637283325195, "learning_rate": 4.850746268656717e-07, "logits/chosen": -2.6687569618225098, "logits/rejected": -2.6578474044799805, "logps/chosen": -135.00634765625, "logps/rejected": -142.72479248046875, "loss": 0.6519, "rewards/accuracies": 0.625, "rewards/chosen": -0.7900819182395935, "rewards/margins": 0.12264666706323624, "rewards/rejected": -0.9127286076545715, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 24.47527503967285, "learning_rate": 4.879448909299655e-07, "logits/chosen": -2.600966691970825, "logits/rejected": -2.5976383686065674, "logps/chosen": -178.060546875, "logps/rejected": -188.95413208007812, "loss": 0.6819, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2723419666290283, "rewards/margins": 0.06976453959941864, "rewards/rejected": -1.3421064615249634, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 35.903533935546875, "learning_rate": 4.908151549942595e-07, "logits/chosen": -2.6738171577453613, "logits/rejected": -2.641862630844116, "logps/chosen": -183.44834899902344, "logps/rejected": -188.65505981445312, "loss": 0.6629, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2418150901794434, "rewards/margins": 0.10853724181652069, "rewards/rejected": -1.3503522872924805, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 26.870349884033203, "learning_rate": 4.936854190585534e-07, "logits/chosen": -2.655912160873413, "logits/rejected": -2.6322195529937744, "logps/chosen": -180.99163818359375, "logps/rejected": -182.69912719726562, "loss": 0.6723, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2384740114212036, "rewards/margins": 0.08691282570362091, "rewards/rejected": -1.325386881828308, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 26.99607276916504, "learning_rate": 4.965556831228473e-07, "logits/chosen": -2.686877965927124, "logits/rejected": -2.67513370513916, "logps/chosen": -184.79806518554688, "logps/rejected": -186.98841857910156, "loss": 0.6921, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.294754147529602, "rewards/margins": 0.05460459738969803, "rewards/rejected": -1.3493586778640747, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 15.668267250061035, "learning_rate": 4.994259471871412e-07, "logits/chosen": -2.6696135997772217, "logits/rejected": -2.6621170043945312, "logps/chosen": -156.1314239501953, "logps/rejected": -167.2497100830078, "loss": 0.6654, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0346574783325195, "rewards/margins": 0.10083886235952377, "rewards/rejected": -1.1354963779449463, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 17.95091438293457, "learning_rate": 4.999996784476807e-07, "logits/chosen": -2.661165475845337, "logits/rejected": -2.654550790786743, "logps/chosen": -154.8639373779297, "logps/rejected": -165.72445678710938, "loss": 0.6588, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9890151023864746, "rewards/margins": 0.1082865372300148, "rewards/rejected": -1.0973016023635864, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 16.63321304321289, "learning_rate": 4.999983721428015e-07, "logits/chosen": -2.681396245956421, "logits/rejected": -2.6542375087738037, "logps/chosen": -140.18325805664062, "logps/rejected": -148.6129913330078, "loss": 0.6453, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8316587209701538, "rewards/margins": 0.13463722169399261, "rewards/rejected": -0.9662960171699524, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 14.12116527557373, "learning_rate": 4.999960609935887e-07, "logits/chosen": -2.674481153488159, "logits/rejected": -2.643754482269287, "logps/chosen": -137.46963500976562, "logps/rejected": -141.88929748535156, "loss": 0.649, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8109871745109558, "rewards/margins": 0.12851256132125854, "rewards/rejected": -0.9394997358322144, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 14.201094627380371, "learning_rate": 4.99992745009332e-07, "logits/chosen": -2.640939474105835, "logits/rejected": -2.639155149459839, "logps/chosen": -136.6545867919922, "logps/rejected": -158.16629028320312, "loss": 0.6268, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.817334771156311, "rewards/margins": 0.1898796558380127, "rewards/rejected": -1.0072143077850342, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 14.31953239440918, "learning_rate": 4.999884242033595e-07, "logits/chosen": -2.699934244155884, "logits/rejected": -2.685863971710205, "logps/chosen": -136.34207153320312, "logps/rejected": -150.13742065429688, "loss": 0.643, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8333311080932617, "rewards/margins": 0.1485327184200287, "rewards/rejected": -0.981863796710968, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 16.573509216308594, "learning_rate": 4.999830985930383e-07, "logits/chosen": -2.6949079036712646, "logits/rejected": -2.6760029792785645, "logps/chosen": -138.4075927734375, "logps/rejected": -148.45355224609375, "loss": 0.6608, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8612344861030579, "rewards/margins": 0.1088922992348671, "rewards/rejected": -0.9701266288757324, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 17.915237426757812, "learning_rate": 4.999767681997743e-07, "logits/chosen": -2.671882390975952, "logits/rejected": -2.6458749771118164, "logps/chosen": -145.06224060058594, "logps/rejected": -151.92984008789062, "loss": 0.6671, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8746577501296997, "rewards/margins": 0.11453260481357574, "rewards/rejected": -0.989190399646759, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 31.947986602783203, "learning_rate": 4.999694330490117e-07, "logits/chosen": -2.657968759536743, "logits/rejected": -2.635417938232422, "logps/chosen": -142.74301147460938, "logps/rejected": -145.25271606445312, "loss": 0.6749, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8701076507568359, "rewards/margins": 0.07651756703853607, "rewards/rejected": -0.9466251134872437, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 8.617423057556152, "learning_rate": 4.999610931702336e-07, "logits/chosen": -2.604358673095703, "logits/rejected": -2.5983524322509766, "logps/chosen": -118.5447006225586, "logps/rejected": -129.49539184570312, "loss": 0.6509, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.622494637966156, "rewards/margins": 0.12726853787899017, "rewards/rejected": -0.7497631311416626, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 12.534258842468262, "learning_rate": 4.999517485969614e-07, "logits/chosen": -2.7332472801208496, "logits/rejected": -2.727543592453003, "logps/chosen": -113.6526107788086, "logps/rejected": -123.16817474365234, "loss": 0.6558, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5773425102233887, "rewards/margins": 0.11520341783761978, "rewards/rejected": -0.692546010017395, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 13.039063453674316, "learning_rate": 4.999413993667544e-07, "logits/chosen": -2.6505751609802246, "logits/rejected": -2.650954484939575, "logps/chosen": -120.35369873046875, "logps/rejected": -131.4852294921875, "loss": 0.6571, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6617897748947144, "rewards/margins": 0.11535720527172089, "rewards/rejected": -0.777147114276886, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 13.398212432861328, "learning_rate": 4.999300455212105e-07, "logits/chosen": -2.600046396255493, "logits/rejected": -2.577758550643921, "logps/chosen": -134.55966186523438, "logps/rejected": -137.8440399169922, "loss": 0.6601, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7518646121025085, "rewards/margins": 0.10340027511119843, "rewards/rejected": -0.8552649617195129, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 19.999652862548828, "learning_rate": 4.999176871059655e-07, "logits/chosen": -2.601722002029419, "logits/rejected": -2.5823540687561035, "logps/chosen": -135.49420166015625, "logps/rejected": -146.9949188232422, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": -0.7866042852401733, "rewards/margins": 0.1571255922317505, "rewards/rejected": -0.9437299966812134, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 13.094282150268555, "learning_rate": 4.999043241706928e-07, "logits/chosen": -2.6528284549713135, "logits/rejected": -2.6293511390686035, "logps/chosen": -131.6373291015625, "logps/rejected": -140.1243133544922, "loss": 0.6582, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7725619077682495, "rewards/margins": 0.12212643772363663, "rewards/rejected": -0.8946884274482727, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 15.415971755981445, "learning_rate": 4.998899567691033e-07, "logits/chosen": -2.6632533073425293, "logits/rejected": -2.6573128700256348, "logps/chosen": -112.79826354980469, "logps/rejected": -126.64198303222656, "loss": 0.6393, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5907924175262451, "rewards/margins": 0.14132273197174072, "rewards/rejected": -0.7321151494979858, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 13.935209274291992, "learning_rate": 4.998745849589455e-07, "logits/chosen": -2.651942014694214, "logits/rejected": -2.631425619125366, "logps/chosen": -118.67203521728516, "logps/rejected": -118.9170913696289, "loss": 0.6699, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6040962934494019, "rewards/margins": 0.07356070727109909, "rewards/rejected": -0.6776569485664368, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 13.166512489318848, "learning_rate": 4.998582088020049e-07, "logits/chosen": -2.5637130737304688, "logits/rejected": -2.5380656719207764, "logps/chosen": -124.6986083984375, "logps/rejected": -139.21798706054688, "loss": 0.6304, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6991332173347473, "rewards/margins": 0.16562217473983765, "rewards/rejected": -0.8647553324699402, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 14.246166229248047, "learning_rate": 4.998408283641039e-07, "logits/chosen": -2.608985424041748, "logits/rejected": -2.5885133743286133, "logps/chosen": -144.44247436523438, "logps/rejected": -152.5337677001953, "loss": 0.66, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8797372579574585, "rewards/margins": 0.1202826276421547, "rewards/rejected": -1.0000197887420654, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 12.540620803833008, "learning_rate": 4.998224437151014e-07, "logits/chosen": -2.5735909938812256, "logits/rejected": -2.5492091178894043, "logps/chosen": -133.1687774658203, "logps/rejected": -144.11866760253906, "loss": 0.6478, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7766335606575012, "rewards/margins": 0.13882730901241302, "rewards/rejected": -0.9154609441757202, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 11.98005199432373, "learning_rate": 4.998030549288929e-07, "logits/chosen": -2.604550838470459, "logits/rejected": -2.5947508811950684, "logps/chosen": -113.2973861694336, "logps/rejected": -124.0053939819336, "loss": 0.6442, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6130251884460449, "rewards/margins": 0.13096332550048828, "rewards/rejected": -0.7439886331558228, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 14.77886962890625, "learning_rate": 4.997826620834095e-07, "logits/chosen": -2.625196933746338, "logits/rejected": -2.621878147125244, "logps/chosen": -129.58670043945312, "logps/rejected": -142.3292236328125, "loss": 0.6623, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7655792832374573, "rewards/margins": 0.11051289737224579, "rewards/rejected": -0.8760923147201538, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 14.128043174743652, "learning_rate": 4.997612652606184e-07, "logits/chosen": -2.6094412803649902, "logits/rejected": -2.5943543910980225, "logps/chosen": -149.7881317138672, "logps/rejected": -163.10986328125, "loss": 0.6459, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9467031359672546, "rewards/margins": 0.14725585281848907, "rewards/rejected": -1.0939589738845825, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 13.105121612548828, "learning_rate": 4.997388645465222e-07, "logits/chosen": -2.6148409843444824, "logits/rejected": -2.5972740650177, "logps/chosen": -144.23641967773438, "logps/rejected": -149.72213745117188, "loss": 0.6636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8702799677848816, "rewards/margins": 0.1133175864815712, "rewards/rejected": -0.9835975766181946, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 16.13191795349121, "learning_rate": 4.997154600311581e-07, "logits/chosen": -2.5553300380706787, "logits/rejected": -2.5461409091949463, "logps/chosen": -147.89500427246094, "logps/rejected": -159.90122985839844, "loss": 0.6493, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9154181480407715, "rewards/margins": 0.15090778470039368, "rewards/rejected": -1.0663259029388428, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 11.924009323120117, "learning_rate": 4.996910518085987e-07, "logits/chosen": -2.6293768882751465, "logits/rejected": -2.602466344833374, "logps/chosen": -151.89749145507812, "logps/rejected": -166.92367553710938, "loss": 0.6237, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9988789558410645, "rewards/margins": 0.19908463954925537, "rewards/rejected": -1.1979637145996094, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 12.42876148223877, "learning_rate": 4.996656399769503e-07, "logits/chosen": -2.586879014968872, "logits/rejected": -2.593677043914795, "logps/chosen": -149.94015502929688, "logps/rejected": -166.60580444335938, "loss": 0.6678, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9948354959487915, "rewards/margins": 0.11195556074380875, "rewards/rejected": -1.1067911386489868, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -2.6651065349578857, "eval_logits/rejected": -2.6611075401306152, "eval_logps/chosen": -129.95816040039062, "eval_logps/rejected": -144.4978790283203, "eval_loss": 0.6575666666030884, "eval_rewards/accuracies": 0.6217471957206726, "eval_rewards/chosen": -0.7094268798828125, "eval_rewards/margins": 0.10805582255125046, "eval_rewards/rejected": -0.8174827098846436, "eval_runtime": 361.1244, "eval_samples_per_second": 11.918, "eval_steps_per_second": 1.49, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 12.081204414367676, "learning_rate": 4.996392246383531e-07, "logits/chosen": -2.622715711593628, "logits/rejected": -2.5998973846435547, "logps/chosen": -143.4109649658203, "logps/rejected": -150.24758911132812, "loss": 0.6582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8668712377548218, "rewards/margins": 0.12826725840568542, "rewards/rejected": -0.9951385259628296, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 14.753251075744629, "learning_rate": 4.996118058989812e-07, "logits/chosen": -2.588796615600586, "logits/rejected": -2.574514150619507, "logps/chosen": -134.34652709960938, "logps/rejected": -153.8282928466797, "loss": 0.64, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8343431353569031, "rewards/margins": 0.16698385775089264, "rewards/rejected": -1.0013270378112793, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 10.726822853088379, "learning_rate": 4.995833838690415e-07, "logits/chosen": -2.6623213291168213, "logits/rejected": -2.643949031829834, "logps/chosen": -138.64453125, "logps/rejected": -149.1639404296875, "loss": 0.6534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8112980723381042, "rewards/margins": 0.12683862447738647, "rewards/rejected": -0.9381367564201355, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 10.432881355285645, "learning_rate": 4.995539586627739e-07, "logits/chosen": -2.5564632415771484, "logits/rejected": -2.550128936767578, "logps/chosen": -130.7455596923828, "logps/rejected": -145.64593505859375, "loss": 0.6385, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7647576928138733, "rewards/margins": 0.16195419430732727, "rewards/rejected": -0.926711916923523, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 12.2381591796875, "learning_rate": 4.995235303984498e-07, "logits/chosen": -2.612844228744507, "logits/rejected": -2.59574556350708, "logps/chosen": -128.8781280517578, "logps/rejected": -147.67213439941406, "loss": 0.6327, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7679967880249023, "rewards/margins": 0.17120540142059326, "rewards/rejected": -0.9392021894454956, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 9.396986961364746, "learning_rate": 4.994920991983728e-07, "logits/chosen": -2.6187431812286377, "logits/rejected": -2.591163158416748, "logps/chosen": -126.61274719238281, "logps/rejected": -138.3388214111328, "loss": 0.6381, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7087310552597046, "rewards/margins": 0.1614312380552292, "rewards/rejected": -0.8701623678207397, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 12.76343822479248, "learning_rate": 4.994596651888776e-07, "logits/chosen": -2.623337745666504, "logits/rejected": -2.605090618133545, "logps/chosen": -139.77703857421875, "logps/rejected": -158.06289672851562, "loss": 0.6229, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8338707089424133, "rewards/margins": 0.20421893894672394, "rewards/rejected": -1.038089632987976, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 18.8853702545166, "learning_rate": 4.994262285003295e-07, "logits/chosen": -2.5183849334716797, "logits/rejected": -2.4982750415802, "logps/chosen": -169.9882049560547, "logps/rejected": -187.1335906982422, "loss": 0.6337, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1717898845672607, "rewards/margins": 0.20283909142017365, "rewards/rejected": -1.374629020690918, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 18.604270935058594, "learning_rate": 4.993917892671242e-07, "logits/chosen": -2.5403242111206055, "logits/rejected": -2.538454532623291, "logps/chosen": -188.715087890625, "logps/rejected": -204.15806579589844, "loss": 0.662, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3316481113433838, "rewards/margins": 0.17142681777477264, "rewards/rejected": -1.5030750036239624, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 15.117061614990234, "learning_rate": 4.993563476276868e-07, "logits/chosen": -2.5182087421417236, "logits/rejected": -2.499211311340332, "logps/chosen": -182.68942260742188, "logps/rejected": -198.25323486328125, "loss": 0.6322, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2642195224761963, "rewards/margins": 0.21435031294822693, "rewards/rejected": -1.4785699844360352, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 15.694002151489258, "learning_rate": 4.993199037244714e-07, "logits/chosen": -2.539485216140747, "logits/rejected": -2.5362348556518555, "logps/chosen": -175.72463989257812, "logps/rejected": -200.694580078125, "loss": 0.6325, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2525984048843384, "rewards/margins": 0.2126205712556839, "rewards/rejected": -1.4652191400527954, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 17.295534133911133, "learning_rate": 4.992824577039611e-07, "logits/chosen": -2.5031023025512695, "logits/rejected": -2.4806714057922363, "logps/chosen": -161.68716430664062, "logps/rejected": -185.3408660888672, "loss": 0.6012, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.071370244026184, "rewards/margins": 0.278527170419693, "rewards/rejected": -1.3498971462249756, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 17.566556930541992, "learning_rate": 4.992440097166661e-07, "logits/chosen": -2.499347686767578, "logits/rejected": -2.479637622833252, "logps/chosen": -171.9679412841797, "logps/rejected": -195.6393585205078, "loss": 0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1703380346298218, "rewards/margins": 0.2702018618583679, "rewards/rejected": -1.440540075302124, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 18.932222366333008, "learning_rate": 4.992045599171248e-07, "logits/chosen": -2.4745125770568848, "logits/rejected": -2.466383934020996, "logps/chosen": -173.25491333007812, "logps/rejected": -198.31932067871094, "loss": 0.6265, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2386658191680908, "rewards/margins": 0.21843025088310242, "rewards/rejected": -1.457095980644226, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 21.17801284790039, "learning_rate": 4.991641084639016e-07, "logits/chosen": -2.4825165271759033, "logits/rejected": -2.4485490322113037, "logps/chosen": -183.6627197265625, "logps/rejected": -202.3411865234375, "loss": 0.604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2694909572601318, "rewards/margins": 0.27605897188186646, "rewards/rejected": -1.545549750328064, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 26.66555404663086, "learning_rate": 4.991226555195873e-07, "logits/chosen": -2.446937084197998, "logits/rejected": -2.4302046298980713, "logps/chosen": -219.9103240966797, "logps/rejected": -238.98471069335938, "loss": 0.624, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6342747211456299, "rewards/margins": 0.26687201857566833, "rewards/rejected": -1.901146650314331, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 23.64926528930664, "learning_rate": 4.990802012507981e-07, "logits/chosen": -2.5328054428100586, "logits/rejected": -2.5089778900146484, "logps/chosen": -220.40914916992188, "logps/rejected": -250.80282592773438, "loss": 0.5978, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6542152166366577, "rewards/margins": 0.34678736329078674, "rewards/rejected": -2.001002788543701, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 18.5404052734375, "learning_rate": 4.990367458281747e-07, "logits/chosen": -2.5687835216522217, "logits/rejected": -2.5436742305755615, "logps/chosen": -202.6275634765625, "logps/rejected": -214.3360137939453, "loss": 0.6682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4308843612670898, "rewards/margins": 0.19051192700862885, "rewards/rejected": -1.6213963031768799, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 10.299328804016113, "learning_rate": 4.98992289426382e-07, "logits/chosen": -2.655515670776367, "logits/rejected": -2.6492748260498047, "logps/chosen": -140.56468200683594, "logps/rejected": -157.1565399169922, "loss": 0.6409, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8734159469604492, "rewards/margins": 0.16740913689136505, "rewards/rejected": -1.0408251285552979, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 13.931880950927734, "learning_rate": 4.989468322241082e-07, "logits/chosen": -2.5287442207336426, "logits/rejected": -2.515350341796875, "logps/chosen": -128.5990447998047, "logps/rejected": -150.06759643554688, "loss": 0.6252, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7572832703590393, "rewards/margins": 0.20894217491149902, "rewards/rejected": -0.9662254452705383, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 16.680767059326172, "learning_rate": 4.989003744040643e-07, "logits/chosen": -2.550046443939209, "logits/rejected": -2.5360865592956543, "logps/chosen": -162.45230102539062, "logps/rejected": -179.36317443847656, "loss": 0.6329, "rewards/accuracies": 0.625, "rewards/chosen": -1.0659806728363037, "rewards/margins": 0.20338082313537598, "rewards/rejected": -1.2693614959716797, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 16.553239822387695, "learning_rate": 4.988529161529828e-07, "logits/chosen": -2.400988817214966, "logits/rejected": -2.3895211219787598, "logps/chosen": -179.7524871826172, "logps/rejected": -198.7301483154297, "loss": 0.6401, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2496070861816406, "rewards/margins": 0.2034221887588501, "rewards/rejected": -1.4530293941497803, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 18.305532455444336, "learning_rate": 4.988044576616177e-07, "logits/chosen": -2.508699417114258, "logits/rejected": -2.487192153930664, "logps/chosen": -183.61419677734375, "logps/rejected": -195.9385986328125, "loss": 0.6539, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2917561531066895, "rewards/margins": 0.16890095174312592, "rewards/rejected": -1.4606572389602661, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 12.873188018798828, "learning_rate": 4.987549991247432e-07, "logits/chosen": -2.5305745601654053, "logits/rejected": -2.5025839805603027, "logps/chosen": -157.09034729003906, "logps/rejected": -167.4355926513672, "loss": 0.6605, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9968762397766113, "rewards/margins": 0.14799538254737854, "rewards/rejected": -1.1448715925216675, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 24.835359573364258, "learning_rate": 4.987045407411531e-07, "logits/chosen": -2.474675416946411, "logits/rejected": -2.447321653366089, "logps/chosen": -153.1079559326172, "logps/rejected": -163.66799926757812, "loss": 0.647, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9653034210205078, "rewards/margins": 0.14642596244812012, "rewards/rejected": -1.111729383468628, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 13.058670997619629, "learning_rate": 4.9865308271366e-07, "logits/chosen": -2.505406618118286, "logits/rejected": -2.4751663208007812, "logps/chosen": -145.5873565673828, "logps/rejected": -152.96646118164062, "loss": 0.6666, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8449538946151733, "rewards/margins": 0.11964478343725204, "rewards/rejected": -0.964598536491394, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 13.146913528442383, "learning_rate": 4.986006252490946e-07, "logits/chosen": -2.495392322540283, "logits/rejected": -2.466439723968506, "logps/chosen": -142.65966796875, "logps/rejected": -156.15109252929688, "loss": 0.6388, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8774528503417969, "rewards/margins": 0.17820842564105988, "rewards/rejected": -1.0556614398956299, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 12.109760284423828, "learning_rate": 4.985471685583044e-07, "logits/chosen": -2.4972500801086426, "logits/rejected": -2.488992214202881, "logps/chosen": -160.86257934570312, "logps/rejected": -183.5970001220703, "loss": 0.6188, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.061710238456726, "rewards/margins": 0.220829039812088, "rewards/rejected": -1.2825391292572021, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 20.802764892578125, "learning_rate": 4.984927128561536e-07, "logits/chosen": -2.4000649452209473, "logits/rejected": -2.3780412673950195, "logps/chosen": -194.49526977539062, "logps/rejected": -223.74038696289062, "loss": 0.5946, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4108482599258423, "rewards/margins": 0.30530619621276855, "rewards/rejected": -1.7161544561386108, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 25.00774383544922, "learning_rate": 4.984372583615214e-07, "logits/chosen": -2.480699062347412, "logits/rejected": -2.450462818145752, "logps/chosen": -224.07742309570312, "logps/rejected": -253.26211547851562, "loss": 0.5972, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6604259014129639, "rewards/margins": 0.34256020188331604, "rewards/rejected": -2.002986431121826, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 26.16922378540039, "learning_rate": 4.983808052973021e-07, "logits/chosen": -2.4037115573883057, "logits/rejected": -2.387246608734131, "logps/chosen": -224.7569122314453, "logps/rejected": -248.262451171875, "loss": 0.632, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7277648448944092, "rewards/margins": 0.2531067132949829, "rewards/rejected": -1.980871558189392, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 17.589717864990234, "learning_rate": 4.983233538904031e-07, "logits/chosen": -2.4658432006835938, "logits/rejected": -2.4311699867248535, "logps/chosen": -215.7435760498047, "logps/rejected": -239.8856964111328, "loss": 0.611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5715186595916748, "rewards/margins": 0.31844526529312134, "rewards/rejected": -1.8899637460708618, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 17.14701271057129, "learning_rate": 4.98264904371745e-07, "logits/chosen": -2.454108715057373, "logits/rejected": -2.4426796436309814, "logps/chosen": -181.88339233398438, "logps/rejected": -211.3361358642578, "loss": 0.579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.245214819908142, "rewards/margins": 0.35555753111839294, "rewards/rejected": -1.6007722616195679, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 25.447280883789062, "learning_rate": 4.982054569762597e-07, "logits/chosen": -2.471432685852051, "logits/rejected": -2.459500789642334, "logps/chosen": -185.2336883544922, "logps/rejected": -207.1048126220703, "loss": 0.6453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3201853036880493, "rewards/margins": 0.20477263629436493, "rewards/rejected": -1.5249578952789307, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 22.150114059448242, "learning_rate": 4.981450119428906e-07, "logits/chosen": -2.4325966835021973, "logits/rejected": -2.416619300842285, "logps/chosen": -189.86355590820312, "logps/rejected": -218.9242401123047, "loss": 0.5977, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.31093430519104, "rewards/margins": 0.3363367021083832, "rewards/rejected": -1.647270917892456, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 23.8380069732666, "learning_rate": 4.980835695145906e-07, "logits/chosen": -2.4336209297180176, "logits/rejected": -2.4131217002868652, "logps/chosen": -210.425048828125, "logps/rejected": -243.5559844970703, "loss": 0.6073, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5858418941497803, "rewards/margins": 0.34787601232528687, "rewards/rejected": -1.9337177276611328, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 20.9385929107666, "learning_rate": 4.980211299383213e-07, "logits/chosen": -2.4194045066833496, "logits/rejected": -2.403456449508667, "logps/chosen": -234.6072235107422, "logps/rejected": -268.6842346191406, "loss": 0.6145, "rewards/accuracies": 0.625, "rewards/chosen": -1.797498345375061, "rewards/margins": 0.35873493552207947, "rewards/rejected": -2.156233310699463, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 26.916467666625977, "learning_rate": 4.979576934650529e-07, "logits/chosen": -2.375267744064331, "logits/rejected": -2.3532843589782715, "logps/chosen": -236.09671020507812, "logps/rejected": -261.98760986328125, "loss": 0.6505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8136438131332397, "rewards/margins": 0.2862181067466736, "rewards/rejected": -2.0998618602752686, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 24.42380142211914, "learning_rate": 4.978932603497622e-07, "logits/chosen": -2.4193663597106934, "logits/rejected": -2.3957924842834473, "logps/chosen": -239.5980987548828, "logps/rejected": -269.45208740234375, "loss": 0.596, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.825730323791504, "rewards/margins": 0.3546573519706726, "rewards/rejected": -2.1803877353668213, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 14.231871604919434, "learning_rate": 4.978278308514316e-07, "logits/chosen": -2.4884352684020996, "logits/rejected": -2.4744627475738525, "logps/chosen": -193.97128295898438, "logps/rejected": -220.0875701904297, "loss": 0.6253, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4105595350265503, "rewards/margins": 0.2634104788303375, "rewards/rejected": -1.6739702224731445, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -2.503838062286377, "eval_logits/rejected": -2.496575355529785, "eval_logps/chosen": -168.8844451904297, "eval_logps/rejected": -188.3249053955078, "eval_loss": 0.6468085646629333, "eval_rewards/accuracies": 0.6236059665679932, "eval_rewards/chosen": -1.0986895561218262, "eval_rewards/margins": 0.15706345438957214, "eval_rewards/rejected": -1.2557529211044312, "eval_runtime": 361.4451, "eval_samples_per_second": 11.908, "eval_steps_per_second": 1.488, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 24.890613555908203, "learning_rate": 4.977614052330489e-07, "logits/chosen": -2.451368808746338, "logits/rejected": -2.425685167312622, "logps/chosen": -188.91796875, "logps/rejected": -203.52093505859375, "loss": 0.6329, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3318657875061035, "rewards/margins": 0.21818795800209045, "rewards/rejected": -1.5500537157058716, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 17.065593719482422, "learning_rate": 4.976939837616053e-07, "logits/chosen": -2.4561610221862793, "logits/rejected": -2.430938482284546, "logps/chosen": -192.31578063964844, "logps/rejected": -218.99703979492188, "loss": 0.6077, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3533475399017334, "rewards/margins": 0.2949267029762268, "rewards/rejected": -1.6482741832733154, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 13.427143096923828, "learning_rate": 4.976255667080951e-07, "logits/chosen": -2.413860559463501, "logits/rejected": -2.3959386348724365, "logps/chosen": -159.7108612060547, "logps/rejected": -171.32122802734375, "loss": 0.6608, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0394437313079834, "rewards/margins": 0.15998996794223785, "rewards/rejected": -1.1994339227676392, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 13.65481185913086, "learning_rate": 4.975561543475139e-07, "logits/chosen": -2.4448227882385254, "logits/rejected": -2.4286270141601562, "logps/chosen": -155.6831512451172, "logps/rejected": -177.15257263183594, "loss": 0.6192, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9906308054924011, "rewards/margins": 0.2443792074918747, "rewards/rejected": -1.235010027885437, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 16.86613655090332, "learning_rate": 4.974857469588579e-07, "logits/chosen": -2.4048614501953125, "logits/rejected": -2.3870797157287598, "logps/chosen": -161.153564453125, "logps/rejected": -176.39842224121094, "loss": 0.6442, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0660475492477417, "rewards/margins": 0.18088501691818237, "rewards/rejected": -1.2469327449798584, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 14.639715194702148, "learning_rate": 4.97414344825123e-07, "logits/chosen": -2.3903567790985107, "logits/rejected": -2.3600940704345703, "logps/chosen": -172.84315490722656, "logps/rejected": -194.74884033203125, "loss": 0.6248, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1549216508865356, "rewards/margins": 0.2764430344104767, "rewards/rejected": -1.4313645362854004, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 24.885929107666016, "learning_rate": 4.973419482333032e-07, "logits/chosen": -2.423205852508545, "logits/rejected": -2.400514841079712, "logps/chosen": -207.8368377685547, "logps/rejected": -228.2991943359375, "loss": 0.6344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5136899948120117, "rewards/margins": 0.24480590224266052, "rewards/rejected": -1.7584959268569946, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 17.576356887817383, "learning_rate": 4.972685574743893e-07, "logits/chosen": -2.3530068397521973, "logits/rejected": -2.3250749111175537, "logps/chosen": -208.4572296142578, "logps/rejected": -228.11386108398438, "loss": 0.6431, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5363104343414307, "rewards/margins": 0.24977517127990723, "rewards/rejected": -1.7860854864120483, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 16.877614974975586, "learning_rate": 4.971941728433687e-07, "logits/chosen": -2.3844947814941406, "logits/rejected": -2.3725411891937256, "logps/chosen": -183.4873809814453, "logps/rejected": -214.20358276367188, "loss": 0.6047, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3321155309677124, "rewards/margins": 0.29924309253692627, "rewards/rejected": -1.6313585042953491, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 18.2493953704834, "learning_rate": 4.971187946392232e-07, "logits/chosen": -2.416098117828369, "logits/rejected": -2.3896546363830566, "logps/chosen": -198.02102661132812, "logps/rejected": -223.6732635498047, "loss": 0.6283, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4123165607452393, "rewards/margins": 0.2674259543418884, "rewards/rejected": -1.679742455482483, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 16.896526336669922, "learning_rate": 4.970424231649281e-07, "logits/chosen": -2.3382649421691895, "logits/rejected": -2.323734998703003, "logps/chosen": -179.71163940429688, "logps/rejected": -205.0246124267578, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2651691436767578, "rewards/margins": 0.2776244282722473, "rewards/rejected": -1.5427935123443604, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 18.621265411376953, "learning_rate": 4.969650587274512e-07, "logits/chosen": -2.3268797397613525, "logits/rejected": -2.303870677947998, "logps/chosen": -175.41378784179688, "logps/rejected": -195.4097137451172, "loss": 0.6276, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2218420505523682, "rewards/margins": 0.24985983967781067, "rewards/rejected": -1.471701741218567, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 17.27171516418457, "learning_rate": 4.968867016377514e-07, "logits/chosen": -2.396489381790161, "logits/rejected": -2.3934218883514404, "logps/chosen": -171.95578002929688, "logps/rejected": -191.56161499023438, "loss": 0.6468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1982240676879883, "rewards/margins": 0.18646053969860077, "rewards/rejected": -1.384684681892395, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 21.131698608398438, "learning_rate": 4.968073522107776e-07, "logits/chosen": -2.3872904777526855, "logits/rejected": -2.368335247039795, "logps/chosen": -181.82164001464844, "logps/rejected": -206.3303985595703, "loss": 0.6412, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2794595956802368, "rewards/margins": 0.25313353538513184, "rewards/rejected": -1.532593011856079, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 17.541488647460938, "learning_rate": 4.96727010765467e-07, "logits/chosen": -2.432960271835327, "logits/rejected": -2.402984619140625, "logps/chosen": -160.93421936035156, "logps/rejected": -181.20828247070312, "loss": 0.6181, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0086137056350708, "rewards/margins": 0.2705707252025604, "rewards/rejected": -1.2791844606399536, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 13.708597183227539, "learning_rate": 4.966456776247443e-07, "logits/chosen": -2.4958736896514893, "logits/rejected": -2.475297451019287, "logps/chosen": -143.11212158203125, "logps/rejected": -162.2141876220703, "loss": 0.6275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.869605541229248, "rewards/margins": 0.21452376246452332, "rewards/rejected": -1.0841293334960938, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 12.59628677368164, "learning_rate": 4.965633531155203e-07, "logits/chosen": -2.429553747177124, "logits/rejected": -2.4171500205993652, "logps/chosen": -153.55223083496094, "logps/rejected": -172.22024536132812, "loss": 0.6171, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9363746643066406, "rewards/margins": 0.22742292284965515, "rewards/rejected": -1.1637976169586182, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 15.672533988952637, "learning_rate": 4.964800375686903e-07, "logits/chosen": -2.4386448860168457, "logits/rejected": -2.424298048019409, "logps/chosen": -152.78854370117188, "logps/rejected": -167.5104522705078, "loss": 0.649, "rewards/accuracies": 0.625, "rewards/chosen": -0.9870938062667847, "rewards/margins": 0.16412413120269775, "rewards/rejected": -1.151218056678772, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 13.439681053161621, "learning_rate": 4.963957313191332e-07, "logits/chosen": -2.4154040813446045, "logits/rejected": -2.387998342514038, "logps/chosen": -135.6188201904297, "logps/rejected": -143.81808471679688, "loss": 0.6398, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7850553393363953, "rewards/margins": 0.15120729804039001, "rewards/rejected": -0.9362626075744629, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 17.686616897583008, "learning_rate": 4.963104347057098e-07, "logits/chosen": -2.433934211730957, "logits/rejected": -2.404848575592041, "logps/chosen": -153.14370727539062, "logps/rejected": -165.4264678955078, "loss": 0.6196, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9457501173019409, "rewards/margins": 0.22615385055541992, "rewards/rejected": -1.1719040870666504, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 17.013931274414062, "learning_rate": 4.962241480712617e-07, "logits/chosen": -2.404201030731201, "logits/rejected": -2.374253034591675, "logps/chosen": -162.6221923828125, "logps/rejected": -173.8114013671875, "loss": 0.6446, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.039057731628418, "rewards/margins": 0.18781016767024994, "rewards/rejected": -1.226867914199829, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 15.482500076293945, "learning_rate": 4.961368717626094e-07, "logits/chosen": -2.4310014247894287, "logits/rejected": -2.4053142070770264, "logps/chosen": -151.65878295898438, "logps/rejected": -174.03086853027344, "loss": 0.6122, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9968489408493042, "rewards/margins": 0.24560029804706573, "rewards/rejected": -1.242449402809143, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 12.22416877746582, "learning_rate": 4.960486061305519e-07, "logits/chosen": -2.375060796737671, "logits/rejected": -2.3530540466308594, "logps/chosen": -153.27517700195312, "logps/rejected": -178.20663452148438, "loss": 0.6036, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9763904809951782, "rewards/margins": 0.2762894332408905, "rewards/rejected": -1.2526799440383911, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 18.672534942626953, "learning_rate": 4.959593515298643e-07, "logits/chosen": -2.376676082611084, "logits/rejected": -2.3608272075653076, "logps/chosen": -167.3198699951172, "logps/rejected": -189.13951110839844, "loss": 0.6185, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1600196361541748, "rewards/margins": 0.24958789348602295, "rewards/rejected": -1.4096072912216187, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 15.9100341796875, "learning_rate": 4.95869108319297e-07, "logits/chosen": -2.326450824737549, "logits/rejected": -2.310497999191284, "logps/chosen": -178.01046752929688, "logps/rejected": -204.0076141357422, "loss": 0.6313, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2428455352783203, "rewards/margins": 0.2660408616065979, "rewards/rejected": -1.508886456489563, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 26.8997859954834, "learning_rate": 4.957778768615736e-07, "logits/chosen": -2.3683345317840576, "logits/rejected": -2.336277723312378, "logps/chosen": -182.6915740966797, "logps/rejected": -200.2922821044922, "loss": 0.6458, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2424503564834595, "rewards/margins": 0.24122914671897888, "rewards/rejected": -1.4836794137954712, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 14.404425621032715, "learning_rate": 4.956856575233903e-07, "logits/chosen": -2.351348400115967, "logits/rejected": -2.326855182647705, "logps/chosen": -173.9849395751953, "logps/rejected": -194.16741943359375, "loss": 0.6401, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1892454624176025, "rewards/margins": 0.23701095581054688, "rewards/rejected": -1.4262564182281494, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 11.633365631103516, "learning_rate": 4.955924506754137e-07, "logits/chosen": -2.45227313041687, "logits/rejected": -2.4269490242004395, "logps/chosen": -151.9998016357422, "logps/rejected": -161.83682250976562, "loss": 0.6499, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9237052798271179, "rewards/margins": 0.1673552542924881, "rewards/rejected": -1.0910605192184448, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 12.507375717163086, "learning_rate": 4.9549825669228e-07, "logits/chosen": -2.4198460578918457, "logits/rejected": -2.4037673473358154, "logps/chosen": -133.57400512695312, "logps/rejected": -155.14573669433594, "loss": 0.6079, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8238075375556946, "rewards/margins": 0.23528620600700378, "rewards/rejected": -1.059093713760376, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 18.42345428466797, "learning_rate": 4.954030759525926e-07, "logits/chosen": -2.3795695304870605, "logits/rejected": -2.3539459705352783, "logps/chosen": -158.07931518554688, "logps/rejected": -182.7133026123047, "loss": 0.621, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0432260036468506, "rewards/margins": 0.2598930597305298, "rewards/rejected": -1.3031190633773804, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 18.570087432861328, "learning_rate": 4.953069088389212e-07, "logits/chosen": -2.391944408416748, "logits/rejected": -2.3680570125579834, "logps/chosen": -180.21099853515625, "logps/rejected": -200.0708770751953, "loss": 0.6203, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.193274974822998, "rewards/margins": 0.26030537486076355, "rewards/rejected": -1.4535801410675049, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 20.145286560058594, "learning_rate": 4.952097557378007e-07, "logits/chosen": -2.346468210220337, "logits/rejected": -2.348935604095459, "logps/chosen": -177.99276733398438, "logps/rejected": -204.42294311523438, "loss": 0.6294, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2465171813964844, "rewards/margins": 0.2638143002986908, "rewards/rejected": -1.510331630706787, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 26.07516098022461, "learning_rate": 4.95111617039728e-07, "logits/chosen": -2.375075578689575, "logits/rejected": -2.3608245849609375, "logps/chosen": -179.36578369140625, "logps/rejected": -200.58523559570312, "loss": 0.6385, "rewards/accuracies": 0.625, "rewards/chosen": -1.1957206726074219, "rewards/margins": 0.26566845178604126, "rewards/rejected": -1.4613893032073975, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 11.392072677612305, "learning_rate": 4.950124931391627e-07, "logits/chosen": -2.3920974731445312, "logits/rejected": -2.378690242767334, "logps/chosen": -145.80947875976562, "logps/rejected": -156.5276336669922, "loss": 0.6588, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.916241466999054, "rewards/margins": 0.1627216637134552, "rewards/rejected": -1.0789631605148315, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 17.176795959472656, "learning_rate": 4.949123844345233e-07, "logits/chosen": -2.351637363433838, "logits/rejected": -2.3455610275268555, "logps/chosen": -134.7650146484375, "logps/rejected": -164.57571411132812, "loss": 0.6025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8470023274421692, "rewards/margins": 0.27665042877197266, "rewards/rejected": -1.1236528158187866, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 15.060934066772461, "learning_rate": 4.948112913281874e-07, "logits/chosen": -2.369976282119751, "logits/rejected": -2.359400749206543, "logps/chosen": -162.0025634765625, "logps/rejected": -189.9093017578125, "loss": 0.6101, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0648624897003174, "rewards/margins": 0.2908114790916443, "rewards/rejected": -1.3556737899780273, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 19.979862213134766, "learning_rate": 4.947092142264888e-07, "logits/chosen": -2.415330648422241, "logits/rejected": -2.3838698863983154, "logps/chosen": -192.2305145263672, "logps/rejected": -213.5076904296875, "loss": 0.6274, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3599928617477417, "rewards/margins": 0.2542092204093933, "rewards/rejected": -1.6142021417617798, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 15.92596435546875, "learning_rate": 4.946061535397166e-07, "logits/chosen": -2.284851312637329, "logits/rejected": -2.2607173919677734, "logps/chosen": -198.20108032226562, "logps/rejected": -222.64962768554688, "loss": 0.6231, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4504778385162354, "rewards/margins": 0.2777005434036255, "rewards/rejected": -1.72817862033844, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 13.30046558380127, "learning_rate": 4.945021096821133e-07, "logits/chosen": -2.361159324645996, "logits/rejected": -2.338351011276245, "logps/chosen": -177.97535705566406, "logps/rejected": -197.57366943359375, "loss": 0.6289, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2466442584991455, "rewards/margins": 0.23334665596485138, "rewards/rejected": -1.4799909591674805, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 15.571688652038574, "learning_rate": 4.943970830718733e-07, "logits/chosen": -2.4174747467041016, "logits/rejected": -2.3927206993103027, "logps/chosen": -181.1056671142578, "logps/rejected": -194.77957153320312, "loss": 0.6616, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.210784673690796, "rewards/margins": 0.1808403581380844, "rewards/rejected": -1.391625165939331, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -2.47371506690979, "eval_logits/rejected": -2.4667787551879883, "eval_logps/chosen": -137.4051055908203, "eval_logps/rejected": -155.18771362304688, "eval_loss": 0.6472736597061157, "eval_rewards/accuracies": 0.6303438544273376, "eval_rewards/chosen": -0.7838963866233826, "eval_rewards/margins": 0.14048486948013306, "eval_rewards/rejected": -0.9243812561035156, "eval_runtime": 361.0112, "eval_samples_per_second": 11.922, "eval_steps_per_second": 1.49, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 18.38011932373047, "learning_rate": 4.942910741311406e-07, "logits/chosen": -2.3694589138031006, "logits/rejected": -2.3449339866638184, "logps/chosen": -145.30213928222656, "logps/rejected": -169.84364318847656, "loss": 0.6158, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8849374651908875, "rewards/margins": 0.25815480947494507, "rewards/rejected": -1.143092393875122, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 12.251883506774902, "learning_rate": 4.941840832860081e-07, "logits/chosen": -2.4300949573516846, "logits/rejected": -2.4172868728637695, "logps/chosen": -154.06777954101562, "logps/rejected": -165.54075622558594, "loss": 0.6604, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.9826035499572754, "rewards/margins": 0.15538419783115387, "rewards/rejected": -1.1379878520965576, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 13.790352821350098, "learning_rate": 4.940761109665151e-07, "logits/chosen": -2.3257176876068115, "logits/rejected": -2.30985426902771, "logps/chosen": -139.08047485351562, "logps/rejected": -166.15426635742188, "loss": 0.5967, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8232904672622681, "rewards/margins": 0.2932248115539551, "rewards/rejected": -1.1165152788162231, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 15.159543991088867, "learning_rate": 4.939671576066461e-07, "logits/chosen": -2.3804128170013428, "logits/rejected": -2.3669979572296143, "logps/chosen": -163.49893188476562, "logps/rejected": -184.92532348632812, "loss": 0.6222, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0932546854019165, "rewards/margins": 0.251810222864151, "rewards/rejected": -1.3450647592544556, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 20.24427032470703, "learning_rate": 4.938572236443284e-07, "logits/chosen": -2.2991671562194824, "logits/rejected": -2.280132293701172, "logps/chosen": -193.11656188964844, "logps/rejected": -227.9095001220703, "loss": 0.5959, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3988648653030396, "rewards/margins": 0.33976760506629944, "rewards/rejected": -1.7386324405670166, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 22.15233039855957, "learning_rate": 4.937463095214311e-07, "logits/chosen": -2.2852959632873535, "logits/rejected": -2.2603890895843506, "logps/chosen": -198.12644958496094, "logps/rejected": -225.90185546875, "loss": 0.6083, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4299789667129517, "rewards/margins": 0.31678932905197144, "rewards/rejected": -1.7467683553695679, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 19.36538314819336, "learning_rate": 4.936344156837628e-07, "logits/chosen": -2.3628876209259033, "logits/rejected": -2.3462116718292236, "logps/chosen": -202.44029235839844, "logps/rejected": -227.3742218017578, "loss": 0.6243, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4570209980010986, "rewards/margins": 0.29711562395095825, "rewards/rejected": -1.7541366815567017, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 19.28398323059082, "learning_rate": 4.935215425810699e-07, "logits/chosen": -2.380265235900879, "logits/rejected": -2.3476052284240723, "logps/chosen": -193.31112670898438, "logps/rejected": -222.06802368164062, "loss": 0.5895, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3700450658798218, "rewards/margins": 0.35239681601524353, "rewards/rejected": -1.7224420309066772, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 27.148590087890625, "learning_rate": 4.93407690667035e-07, "logits/chosen": -2.3664560317993164, "logits/rejected": -2.350916624069214, "logps/chosen": -186.02096557617188, "logps/rejected": -211.7760009765625, "loss": 0.6284, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3212529420852661, "rewards/margins": 0.2776423692703247, "rewards/rejected": -1.5988953113555908, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 24.74915313720703, "learning_rate": 4.93292860399275e-07, "logits/chosen": -2.307837724685669, "logits/rejected": -2.2958502769470215, "logps/chosen": -174.7818603515625, "logps/rejected": -202.28302001953125, "loss": 0.6065, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2114994525909424, "rewards/margins": 0.2999010384082794, "rewards/rejected": -1.5114004611968994, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 34.53604507446289, "learning_rate": 4.931770522393388e-07, "logits/chosen": -2.3079707622528076, "logits/rejected": -2.2857425212860107, "logps/chosen": -206.4672088623047, "logps/rejected": -244.2429962158203, "loss": 0.5876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.491794228553772, "rewards/margins": 0.3996439576148987, "rewards/rejected": -1.8914382457733154, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 26.741483688354492, "learning_rate": 4.930602666527063e-07, "logits/chosen": -2.2209506034851074, "logits/rejected": -2.2009482383728027, "logps/chosen": -222.34085083007812, "logps/rejected": -267.97027587890625, "loss": 0.5931, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6801646947860718, "rewards/margins": 0.4615021347999573, "rewards/rejected": -2.141667127609253, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 14.45638656616211, "learning_rate": 4.929425041087859e-07, "logits/chosen": -2.20749831199646, "logits/rejected": -2.176126718521118, "logps/chosen": -204.36605834960938, "logps/rejected": -239.574951171875, "loss": 0.5804, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5025591850280762, "rewards/margins": 0.3914002776145935, "rewards/rejected": -1.893959641456604, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 15.108778953552246, "learning_rate": 4.928237650809127e-07, "logits/chosen": -2.2956180572509766, "logits/rejected": -2.265584945678711, "logps/chosen": -193.35580444335938, "logps/rejected": -215.659423828125, "loss": 0.6479, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4088243246078491, "rewards/margins": 0.24615474045276642, "rewards/rejected": -1.6549791097640991, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 18.875507354736328, "learning_rate": 4.927040500463468e-07, "logits/chosen": -2.283984661102295, "logits/rejected": -2.262730121612549, "logps/chosen": -188.10000610351562, "logps/rejected": -218.161376953125, "loss": 0.6201, "rewards/accuracies": 0.65625, "rewards/chosen": -1.328522801399231, "rewards/margins": 0.31565409898757935, "rewards/rejected": -1.6441768407821655, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 14.578801155090332, "learning_rate": 4.925833594862714e-07, "logits/chosen": -2.285252809524536, "logits/rejected": -2.256265640258789, "logps/chosen": -176.9915008544922, "logps/rejected": -215.2601776123047, "loss": 0.5781, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2218300104141235, "rewards/margins": 0.41677817702293396, "rewards/rejected": -1.6386082172393799, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 22.361921310424805, "learning_rate": 4.924616938857903e-07, "logits/chosen": -2.255094051361084, "logits/rejected": -2.2333807945251465, "logps/chosen": -194.3338165283203, "logps/rejected": -219.2552490234375, "loss": 0.6254, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3872995376586914, "rewards/margins": 0.29174166917800903, "rewards/rejected": -1.6790411472320557, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 26.852266311645508, "learning_rate": 4.923390537339268e-07, "logits/chosen": -2.2011523246765137, "logits/rejected": -2.1737256050109863, "logps/chosen": -191.21688842773438, "logps/rejected": -223.82461547851562, "loss": 0.5996, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3598167896270752, "rewards/margins": 0.3606993556022644, "rewards/rejected": -1.7205158472061157, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 16.266706466674805, "learning_rate": 4.922154395236211e-07, "logits/chosen": -2.2656571865081787, "logits/rejected": -2.2512364387512207, "logps/chosen": -197.9558868408203, "logps/rejected": -223.987548828125, "loss": 0.6164, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4030365943908691, "rewards/margins": 0.30091115832328796, "rewards/rejected": -1.7039477825164795, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 12.130602836608887, "learning_rate": 4.920908517517286e-07, "logits/chosen": -2.3124916553497314, "logits/rejected": -2.29948091506958, "logps/chosen": -173.8235626220703, "logps/rejected": -202.1744842529297, "loss": 0.639, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2107598781585693, "rewards/margins": 0.2734646797180176, "rewards/rejected": -1.484224557876587, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 12.290034294128418, "learning_rate": 4.919652909190178e-07, "logits/chosen": -2.3610310554504395, "logits/rejected": -2.340604305267334, "logps/chosen": -149.28158569335938, "logps/rejected": -167.35586547851562, "loss": 0.6329, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9376060366630554, "rewards/margins": 0.21691469848155975, "rewards/rejected": -1.1545206308364868, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 13.899296760559082, "learning_rate": 4.918387575301684e-07, "logits/chosen": -2.431248426437378, "logits/rejected": -2.4067440032958984, "logps/chosen": -138.94432067871094, "logps/rejected": -162.56423950195312, "loss": 0.603, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8373797535896301, "rewards/margins": 0.2983129918575287, "rewards/rejected": -1.1356927156448364, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 14.007013320922852, "learning_rate": 4.91711252093769e-07, "logits/chosen": -2.4040751457214355, "logits/rejected": -2.3848490715026855, "logps/chosen": -140.6970977783203, "logps/rejected": -159.55972290039062, "loss": 0.6295, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8438035249710083, "rewards/margins": 0.2084636688232422, "rewards/rejected": -1.05226731300354, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 13.476959228515625, "learning_rate": 4.915827751223158e-07, "logits/chosen": -2.367610454559326, "logits/rejected": -2.360579013824463, "logps/chosen": -141.59710693359375, "logps/rejected": -167.885009765625, "loss": 0.6215, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8983238935470581, "rewards/margins": 0.2517377436161041, "rewards/rejected": -1.1500616073608398, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 28.89926528930664, "learning_rate": 4.914533271322091e-07, "logits/chosen": -2.2987053394317627, "logits/rejected": -2.2912533283233643, "logps/chosen": -154.363525390625, "logps/rejected": -182.75950622558594, "loss": 0.6162, "rewards/accuracies": 0.6875, "rewards/chosen": -1.002435564994812, "rewards/margins": 0.27345195412635803, "rewards/rejected": -1.2758877277374268, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 20.796573638916016, "learning_rate": 4.913229086437528e-07, "logits/chosen": -2.2584993839263916, "logits/rejected": -2.239915132522583, "logps/chosen": -173.90652465820312, "logps/rejected": -203.5644073486328, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1754299402236938, "rewards/margins": 0.32074540853500366, "rewards/rejected": -1.4961752891540527, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 24.239168167114258, "learning_rate": 4.911915201811515e-07, "logits/chosen": -2.2567455768585205, "logits/rejected": -2.2298760414123535, "logps/chosen": -184.6838836669922, "logps/rejected": -213.2223663330078, "loss": 0.6006, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2551294565200806, "rewards/margins": 0.3479345440864563, "rewards/rejected": -1.6030641794204712, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 21.80423927307129, "learning_rate": 4.910591622725084e-07, "logits/chosen": -2.2212891578674316, "logits/rejected": -2.20196533203125, "logps/chosen": -180.50128173828125, "logps/rejected": -218.0332794189453, "loss": 0.6132, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2961989641189575, "rewards/margins": 0.3487749397754669, "rewards/rejected": -1.6449737548828125, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 16.022611618041992, "learning_rate": 4.909258354498235e-07, "logits/chosen": -2.2686338424682617, "logits/rejected": -2.2473368644714355, "logps/chosen": -161.1438446044922, "logps/rejected": -190.21652221679688, "loss": 0.5953, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0593734979629517, "rewards/margins": 0.31629690527915955, "rewards/rejected": -1.3756701946258545, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 24.18372917175293, "learning_rate": 4.907915402489907e-07, "logits/chosen": -2.2207179069519043, "logits/rejected": -2.192347526550293, "logps/chosen": -179.25706481933594, "logps/rejected": -212.54580688476562, "loss": 0.5964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2334949970245361, "rewards/margins": 0.3810732364654541, "rewards/rejected": -1.6145683526992798, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 20.850187301635742, "learning_rate": 4.90656277209797e-07, "logits/chosen": -2.213014602661133, "logits/rejected": -2.2100424766540527, "logps/chosen": -188.37265014648438, "logps/rejected": -221.6591796875, "loss": 0.6286, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.368798017501831, "rewards/margins": 0.30511146783828735, "rewards/rejected": -1.6739095449447632, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 11.811249732971191, "learning_rate": 4.905200468759188e-07, "logits/chosen": -2.2730698585510254, "logits/rejected": -2.249375820159912, "logps/chosen": -170.12158203125, "logps/rejected": -206.99087524414062, "loss": 0.5684, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1288915872573853, "rewards/margins": 0.3948993980884552, "rewards/rejected": -1.523790955543518, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 23.460145950317383, "learning_rate": 4.903828497949211e-07, "logits/chosen": -2.178682804107666, "logits/rejected": -2.165452480316162, "logps/chosen": -175.53173828125, "logps/rejected": -213.2287139892578, "loss": 0.6033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2262389659881592, "rewards/margins": 0.3743092119693756, "rewards/rejected": -1.6005481481552124, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 19.67573356628418, "learning_rate": 4.90244686518254e-07, "logits/chosen": -2.1993725299835205, "logits/rejected": -2.169201135635376, "logps/chosen": -198.3735809326172, "logps/rejected": -246.44351196289062, "loss": 0.5797, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4586102962493896, "rewards/margins": 0.47803306579589844, "rewards/rejected": -1.9366432428359985, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 22.29399299621582, "learning_rate": 4.901055576012518e-07, "logits/chosen": -2.1717159748077393, "logits/rejected": -2.1441261768341064, "logps/chosen": -223.3928985595703, "logps/rejected": -258.8575744628906, "loss": 0.5987, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6766141653060913, "rewards/margins": 0.3859788775444031, "rewards/rejected": -2.0625932216644287, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 18.63553810119629, "learning_rate": 4.899654636031295e-07, "logits/chosen": -2.114305257797241, "logits/rejected": -2.0909764766693115, "logps/chosen": -223.34683227539062, "logps/rejected": -255.32778930664062, "loss": 0.628, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6803725957870483, "rewards/margins": 0.36115798354148865, "rewards/rejected": -2.0415308475494385, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 15.236860275268555, "learning_rate": 4.898244050869817e-07, "logits/chosen": -2.231354236602783, "logits/rejected": -2.2046492099761963, "logps/chosen": -191.9114227294922, "logps/rejected": -218.0845184326172, "loss": 0.6064, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3479373455047607, "rewards/margins": 0.35387980937957764, "rewards/rejected": -1.7018171548843384, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 21.483970642089844, "learning_rate": 4.896823826197791e-07, "logits/chosen": -2.3086674213409424, "logits/rejected": -2.2663798332214355, "logps/chosen": -171.53570556640625, "logps/rejected": -203.14710998535156, "loss": 0.5987, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1747710704803467, "rewards/margins": 0.34922733902931213, "rewards/rejected": -1.5239986181259155, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 20.17723274230957, "learning_rate": 4.895393967723675e-07, "logits/chosen": -2.27504563331604, "logits/rejected": -2.2569966316223145, "logps/chosen": -172.12185668945312, "logps/rejected": -224.7152099609375, "loss": 0.5484, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1867334842681885, "rewards/margins": 0.49948158860206604, "rewards/rejected": -1.686214804649353, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 18.862043380737305, "learning_rate": 4.893954481194647e-07, "logits/chosen": -2.160454034805298, "logits/rejected": -2.137086868286133, "logps/chosen": -207.37405395507812, "logps/rejected": -232.21044921875, "loss": 0.6282, "rewards/accuracies": 0.625, "rewards/chosen": -1.5000948905944824, "rewards/margins": 0.30478715896606445, "rewards/rejected": -1.8048820495605469, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -2.2572572231292725, "eval_logits/rejected": -2.244072198867798, "eval_logps/chosen": -196.64373779296875, "eval_logps/rejected": -222.1839599609375, "eval_loss": 0.6395382881164551, "eval_rewards/accuracies": 0.6331319808959961, "eval_rewards/chosen": -1.3762826919555664, "eval_rewards/margins": 0.21806086599826813, "eval_rewards/rejected": -1.5943433046340942, "eval_runtime": 361.1551, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 19.522754669189453, "learning_rate": 4.892505372396586e-07, "logits/chosen": -2.1047682762145996, "logits/rejected": -2.072514295578003, "logps/chosen": -204.9078826904297, "logps/rejected": -240.2461700439453, "loss": 0.5809, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5026588439941406, "rewards/margins": 0.38025617599487305, "rewards/rejected": -1.8829149007797241, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 15.139505386352539, "learning_rate": 4.891046647154042e-07, "logits/chosen": -2.159850835800171, "logits/rejected": -2.144533634185791, "logps/chosen": -174.6404266357422, "logps/rejected": -217.5984649658203, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2361440658569336, "rewards/margins": 0.4201563000679016, "rewards/rejected": -1.6563003063201904, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 32.36344909667969, "learning_rate": 4.889578311330222e-07, "logits/chosen": -2.195793628692627, "logits/rejected": -2.1746668815612793, "logps/chosen": -170.8546142578125, "logps/rejected": -203.3587646484375, "loss": 0.6136, "rewards/accuracies": 0.625, "rewards/chosen": -1.1639937162399292, "rewards/margins": 0.3140779137611389, "rewards/rejected": -1.4780715703964233, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 17.12906837463379, "learning_rate": 4.88810037082696e-07, "logits/chosen": -2.2963457107543945, "logits/rejected": -2.2741098403930664, "logps/chosen": -166.2859344482422, "logps/rejected": -190.31089782714844, "loss": 0.6198, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0893151760101318, "rewards/margins": 0.3024675250053406, "rewards/rejected": -1.3917827606201172, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 22.398632049560547, "learning_rate": 4.886612831584695e-07, "logits/chosen": -2.224777936935425, "logits/rejected": -2.206634521484375, "logps/chosen": -147.0729522705078, "logps/rejected": -176.0797882080078, "loss": 0.607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9327560663223267, "rewards/margins": 0.2998958230018616, "rewards/rejected": -1.232651948928833, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 20.484375, "learning_rate": 4.885115699582447e-07, "logits/chosen": -2.2385010719299316, "logits/rejected": -2.2157609462738037, "logps/chosen": -150.0866241455078, "logps/rejected": -186.73904418945312, "loss": 0.5937, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9772035479545593, "rewards/margins": 0.3763968050479889, "rewards/rejected": -1.353600263595581, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 18.28183364868164, "learning_rate": 4.883608980837795e-07, "logits/chosen": -2.2444045543670654, "logits/rejected": -2.2267048358917236, "logps/chosen": -185.88681030273438, "logps/rejected": -222.32772827148438, "loss": 0.6101, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.29819917678833, "rewards/margins": 0.36155635118484497, "rewards/rejected": -1.6597554683685303, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 21.511371612548828, "learning_rate": 4.882092681406849e-07, "logits/chosen": -2.238133668899536, "logits/rejected": -2.213491439819336, "logps/chosen": -182.04161071777344, "logps/rejected": -219.9537811279297, "loss": 0.5955, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3175451755523682, "rewards/margins": 0.36441078782081604, "rewards/rejected": -1.6819560527801514, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 22.683391571044922, "learning_rate": 4.880566807384227e-07, "logits/chosen": -2.188809394836426, "logits/rejected": -2.1437454223632812, "logps/chosen": -226.0194549560547, "logps/rejected": -278.88140869140625, "loss": 0.5833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7210540771484375, "rewards/margins": 0.5367581248283386, "rewards/rejected": -2.257812261581421, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 52.026302337646484, "learning_rate": 4.879031364903034e-07, "logits/chosen": -2.0331356525421143, "logits/rejected": -2.0007588863372803, "logps/chosen": -296.7420959472656, "logps/rejected": -345.68072509765625, "loss": 0.6259, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.411536693572998, "rewards/margins": 0.5197795629501343, "rewards/rejected": -2.931316614151001, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 20.97651481628418, "learning_rate": 4.877486360134832e-07, "logits/chosen": -2.025834560394287, "logits/rejected": -1.9820560216903687, "logps/chosen": -266.26531982421875, "logps/rejected": -314.23394775390625, "loss": 0.5828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1163125038146973, "rewards/margins": 0.5228889584541321, "rewards/rejected": -2.6392009258270264, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 16.926071166992188, "learning_rate": 4.875931799289619e-07, "logits/chosen": -2.148395538330078, "logits/rejected": -2.1126646995544434, "logps/chosen": -198.46603393554688, "logps/rejected": -232.729736328125, "loss": 0.6, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.440104365348816, "rewards/margins": 0.40426725149154663, "rewards/rejected": -1.8443717956542969, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 15.929891586303711, "learning_rate": 4.874367688615803e-07, "logits/chosen": -2.296051263809204, "logits/rejected": -2.2659592628479004, "logps/chosen": -154.8794403076172, "logps/rejected": -193.5670928955078, "loss": 0.572, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9966483116149902, "rewards/margins": 0.40629035234451294, "rewards/rejected": -1.4029386043548584, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 21.51392936706543, "learning_rate": 4.872794034400174e-07, "logits/chosen": -2.2137632369995117, "logits/rejected": -2.1765713691711426, "logps/chosen": -184.21322631835938, "logps/rejected": -228.24124145507812, "loss": 0.573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2646104097366333, "rewards/margins": 0.4884043335914612, "rewards/rejected": -1.7530148029327393, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 18.116823196411133, "learning_rate": 4.871210842967885e-07, "logits/chosen": -2.0938210487365723, "logits/rejected": -2.066746473312378, "logps/chosen": -196.8106689453125, "logps/rejected": -243.6275177001953, "loss": 0.5657, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3989002704620361, "rewards/margins": 0.4810122847557068, "rewards/rejected": -1.8799126148223877, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 17.3485107421875, "learning_rate": 4.86961812068242e-07, "logits/chosen": -2.12273907661438, "logits/rejected": -2.092031240463257, "logps/chosen": -195.5784912109375, "logps/rejected": -230.8212890625, "loss": 0.6057, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.39964759349823, "rewards/margins": 0.39662671089172363, "rewards/rejected": -1.796274185180664, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 19.851686477661133, "learning_rate": 4.868015873945572e-07, "logits/chosen": -2.1575028896331787, "logits/rejected": -2.143172264099121, "logps/chosen": -171.63800048828125, "logps/rejected": -214.4031219482422, "loss": 0.5772, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1917240619659424, "rewards/margins": 0.4194125235080719, "rewards/rejected": -1.611136794090271, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 33.63004684448242, "learning_rate": 4.86640410919742e-07, "logits/chosen": -2.223252296447754, "logits/rejected": -2.192837715148926, "logps/chosen": -169.27816772460938, "logps/rejected": -198.42666625976562, "loss": 0.5858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1220273971557617, "rewards/margins": 0.3612268567085266, "rewards/rejected": -1.483254313468933, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 14.943294525146484, "learning_rate": 4.864782832916295e-07, "logits/chosen": -2.1843461990356445, "logits/rejected": -2.1562962532043457, "logps/chosen": -166.73507690429688, "logps/rejected": -196.1730194091797, "loss": 0.614, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1232101917266846, "rewards/margins": 0.3320769667625427, "rewards/rejected": -1.4552870988845825, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 14.006731986999512, "learning_rate": 4.86315205161876e-07, "logits/chosen": -2.2225568294525146, "logits/rejected": -2.18027663230896, "logps/chosen": -158.7477569580078, "logps/rejected": -195.69419860839844, "loss": 0.5931, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0513432025909424, "rewards/margins": 0.40023574233055115, "rewards/rejected": -1.451578974723816, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 18.841947555541992, "learning_rate": 4.861511771859586e-07, "logits/chosen": -2.1404201984405518, "logits/rejected": -2.109682559967041, "logps/chosen": -155.5089569091797, "logps/rejected": -185.97349548339844, "loss": 0.5831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0530818700790405, "rewards/margins": 0.3437369465827942, "rewards/rejected": -1.39681875705719, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 30.081941604614258, "learning_rate": 4.859862000231714e-07, "logits/chosen": -2.1383605003356934, "logits/rejected": -2.1121487617492676, "logps/chosen": -173.26690673828125, "logps/rejected": -215.2777862548828, "loss": 0.5707, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1941272020339966, "rewards/margins": 0.4264501929283142, "rewards/rejected": -1.6205774545669556, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 18.759355545043945, "learning_rate": 4.858202743366247e-07, "logits/chosen": -2.1610615253448486, "logits/rejected": -2.130058765411377, "logps/chosen": -209.3471221923828, "logps/rejected": -253.21231079101562, "loss": 0.6117, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5709012746810913, "rewards/margins": 0.4228098392486572, "rewards/rejected": -1.9937111139297485, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 37.444095611572266, "learning_rate": 4.856534007932404e-07, "logits/chosen": -2.0540525913238525, "logits/rejected": -2.0197017192840576, "logps/chosen": -241.2897491455078, "logps/rejected": -293.1056213378906, "loss": 0.5848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8855911493301392, "rewards/margins": 0.5275468826293945, "rewards/rejected": -2.4131383895874023, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 22.572738647460938, "learning_rate": 4.854855800637509e-07, "logits/chosen": -2.192974090576172, "logits/rejected": -2.154181957244873, "logps/chosen": -263.01385498046875, "logps/rejected": -305.43609619140625, "loss": 0.6104, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.023679733276367, "rewards/margins": 0.4952165186405182, "rewards/rejected": -2.5188965797424316, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 18.213420867919922, "learning_rate": 4.853168128226953e-07, "logits/chosen": -2.114217758178711, "logits/rejected": -2.0940046310424805, "logps/chosen": -184.91647338867188, "logps/rejected": -224.1553497314453, "loss": 0.5927, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.319911241531372, "rewards/margins": 0.3920961022377014, "rewards/rejected": -1.7120075225830078, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 16.568256378173828, "learning_rate": 4.851470997484172e-07, "logits/chosen": -2.203477382659912, "logits/rejected": -2.184323787689209, "logps/chosen": -181.62326049804688, "logps/rejected": -217.5211639404297, "loss": 0.6112, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2437589168548584, "rewards/margins": 0.39844948053359985, "rewards/rejected": -1.642208456993103, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 27.65972328186035, "learning_rate": 4.84976441523062e-07, "logits/chosen": -2.1619374752044678, "logits/rejected": -2.1237118244171143, "logps/chosen": -187.3358917236328, "logps/rejected": -231.9523162841797, "loss": 0.5642, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.315253496170044, "rewards/margins": 0.4921639561653137, "rewards/rejected": -1.8074172735214233, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 29.402324676513672, "learning_rate": 4.848048388325741e-07, "logits/chosen": -2.1421029567718506, "logits/rejected": -2.108595371246338, "logps/chosen": -201.7513427734375, "logps/rejected": -242.60134887695312, "loss": 0.5918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4710626602172852, "rewards/margins": 0.41561394929885864, "rewards/rejected": -1.8866764307022095, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 21.794660568237305, "learning_rate": 4.846322923666936e-07, "logits/chosen": -2.117928981781006, "logits/rejected": -2.100590944290161, "logps/chosen": -191.99649047851562, "logps/rejected": -228.5430145263672, "loss": 0.595, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3924285173416138, "rewards/margins": 0.35744625329971313, "rewards/rejected": -1.7498747110366821, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 14.772516250610352, "learning_rate": 4.844588028189546e-07, "logits/chosen": -2.102381467819214, "logits/rejected": -2.068084716796875, "logps/chosen": -185.61502075195312, "logps/rejected": -215.8332977294922, "loss": 0.6099, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3156392574310303, "rewards/margins": 0.33836859464645386, "rewards/rejected": -1.6540076732635498, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 18.655094146728516, "learning_rate": 4.842843708866815e-07, "logits/chosen": -2.1331818103790283, "logits/rejected": -2.090283155441284, "logps/chosen": -172.63299560546875, "logps/rejected": -209.20645141601562, "loss": 0.5801, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1688952445983887, "rewards/margins": 0.4096272587776184, "rewards/rejected": -1.5785224437713623, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 35.46128463745117, "learning_rate": 4.841089972709868e-07, "logits/chosen": -2.1921870708465576, "logits/rejected": -2.1422884464263916, "logps/chosen": -201.27774047851562, "logps/rejected": -237.84603881835938, "loss": 0.6031, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4475243091583252, "rewards/margins": 0.4292474687099457, "rewards/rejected": -1.8767716884613037, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 22.01081657409668, "learning_rate": 4.839326826767677e-07, "logits/chosen": -2.0616462230682373, "logits/rejected": -2.0317611694335938, "logps/chosen": -209.13870239257812, "logps/rejected": -243.5667266845703, "loss": 0.6427, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5373376607894897, "rewards/margins": 0.3787280023097992, "rewards/rejected": -1.9160658121109009, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 19.502050399780273, "learning_rate": 4.837554278127036e-07, "logits/chosen": -2.1562960147857666, "logits/rejected": -2.1431212425231934, "logps/chosen": -186.15951538085938, "logps/rejected": -210.40780639648438, "loss": 0.6361, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3113396167755127, "rewards/margins": 0.27536725997924805, "rewards/rejected": -1.5867068767547607, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 16.305097579956055, "learning_rate": 4.835772333912535e-07, "logits/chosen": -2.1457653045654297, "logits/rejected": -2.1218769550323486, "logps/chosen": -162.876220703125, "logps/rejected": -192.2865447998047, "loss": 0.6051, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0821752548217773, "rewards/margins": 0.3265151381492615, "rewards/rejected": -1.408690333366394, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 19.12994384765625, "learning_rate": 4.833981001286526e-07, "logits/chosen": -2.2191901206970215, "logits/rejected": -2.1843180656433105, "logps/chosen": -183.60513305664062, "logps/rejected": -203.54257202148438, "loss": 0.6627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2583025693893433, "rewards/margins": 0.2502022683620453, "rewards/rejected": -1.5085046291351318, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 14.844572067260742, "learning_rate": 4.832180287449098e-07, "logits/chosen": -2.187638759613037, "logits/rejected": -2.1604015827178955, "logps/chosen": -178.97811889648438, "logps/rejected": -217.069091796875, "loss": 0.577, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1986453533172607, "rewards/margins": 0.4069907069206238, "rewards/rejected": -1.6056360006332397, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 24.435321807861328, "learning_rate": 4.830370199638046e-07, "logits/chosen": -2.1201322078704834, "logits/rejected": -2.093515157699585, "logps/chosen": -199.69039916992188, "logps/rejected": -242.7078399658203, "loss": 0.5897, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4402469396591187, "rewards/margins": 0.4578830301761627, "rewards/rejected": -1.8981298208236694, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 30.424062728881836, "learning_rate": 4.828550745128844e-07, "logits/chosen": -2.1129040718078613, "logits/rejected": -2.0745091438293457, "logps/chosen": -200.34878540039062, "logps/rejected": -240.11331176757812, "loss": 0.5886, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.446361780166626, "rewards/margins": 0.44327202439308167, "rewards/rejected": -1.8896337747573853, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.163433074951172, "eval_logits/rejected": -2.148742914199829, "eval_logps/chosen": -186.64744567871094, "eval_logps/rejected": -211.47340393066406, "eval_loss": 0.6382359862327576, "eval_rewards/accuracies": 0.6354553699493408, "eval_rewards/chosen": -1.2763198614120483, "eval_rewards/margins": 0.21091830730438232, "eval_rewards/rejected": -1.4872381687164307, "eval_runtime": 360.7962, "eval_samples_per_second": 11.929, "eval_steps_per_second": 1.491, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 18.377422332763672, "learning_rate": 4.826721931234613e-07, "logits/chosen": -2.0545711517333984, "logits/rejected": -2.031045913696289, "logps/chosen": -215.98068237304688, "logps/rejected": -255.8895263671875, "loss": 0.5794, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6339839696884155, "rewards/margins": 0.4386228621006012, "rewards/rejected": -2.0726068019866943, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 22.3870792388916, "learning_rate": 4.824883765306095e-07, "logits/chosen": -1.9651950597763062, "logits/rejected": -1.930354356765747, "logps/chosen": -215.5059814453125, "logps/rejected": -266.27532958984375, "loss": 0.5384, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5873385667800903, "rewards/margins": 0.5435501933097839, "rewards/rejected": -2.1308884620666504, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 23.889678955078125, "learning_rate": 4.82303625473162e-07, "logits/chosen": -2.0317368507385254, "logits/rejected": -2.004117488861084, "logps/chosen": -244.93923950195312, "logps/rejected": -314.2686767578125, "loss": 0.5556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.925708532333374, "rewards/margins": 0.6688078045845032, "rewards/rejected": -2.5945162773132324, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 28.27094078063965, "learning_rate": 4.821179406937077e-07, "logits/chosen": -2.006577968597412, "logits/rejected": -1.9828994274139404, "logps/chosen": -255.15847778320312, "logps/rejected": -303.88653564453125, "loss": 0.5869, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9872589111328125, "rewards/margins": 0.4996883273124695, "rewards/rejected": -2.4869472980499268, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 26.817228317260742, "learning_rate": 4.819313229385889e-07, "logits/chosen": -2.0243873596191406, "logits/rejected": -2.008803367614746, "logps/chosen": -224.79824829101562, "logps/rejected": -282.2015075683594, "loss": 0.5573, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7291301488876343, "rewards/margins": 0.5527663826942444, "rewards/rejected": -2.2818961143493652, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 26.336957931518555, "learning_rate": 4.817437729578975e-07, "logits/chosen": -2.0267727375030518, "logits/rejected": -1.9827531576156616, "logps/chosen": -237.3345184326172, "logps/rejected": -290.75372314453125, "loss": 0.5696, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8172929286956787, "rewards/margins": 0.5689106583595276, "rewards/rejected": -2.3862035274505615, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 18.02985191345215, "learning_rate": 4.815552915054727e-07, "logits/chosen": -2.087648630142212, "logits/rejected": -2.0519766807556152, "logps/chosen": -196.21951293945312, "logps/rejected": -235.7421875, "loss": 0.609, "rewards/accuracies": 0.65625, "rewards/chosen": -1.422478199005127, "rewards/margins": 0.41819635033607483, "rewards/rejected": -1.840674638748169, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 15.94927978515625, "learning_rate": 4.813658793388974e-07, "logits/chosen": -2.163003444671631, "logits/rejected": -2.1265764236450195, "logps/chosen": -192.26455688476562, "logps/rejected": -229.267822265625, "loss": 0.5855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3595788478851318, "rewards/margins": 0.44124117493629456, "rewards/rejected": -1.8008201122283936, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 28.644386291503906, "learning_rate": 4.811755372194955e-07, "logits/chosen": -2.1186342239379883, "logits/rejected": -2.0941452980041504, "logps/chosen": -179.84536743164062, "logps/rejected": -238.16116333007812, "loss": 0.5634, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2687454223632812, "rewards/margins": 0.5312276482582092, "rewards/rejected": -1.7999728918075562, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 18.490943908691406, "learning_rate": 4.809842659123287e-07, "logits/chosen": -2.0873656272888184, "logits/rejected": -2.039702892303467, "logps/chosen": -214.9015655517578, "logps/rejected": -271.69390869140625, "loss": 0.5666, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5629366636276245, "rewards/margins": 0.5744749903678894, "rewards/rejected": -2.1374118328094482, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 24.37276268005371, "learning_rate": 4.80792066186194e-07, "logits/chosen": -2.0921623706817627, "logits/rejected": -2.063568592071533, "logps/chosen": -227.4940948486328, "logps/rejected": -269.02099609375, "loss": 0.6005, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7243483066558838, "rewards/margins": 0.4640493392944336, "rewards/rejected": -2.1883976459503174, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 18.740873336791992, "learning_rate": 4.80598938813619e-07, "logits/chosen": -2.092731237411499, "logits/rejected": -2.0568840503692627, "logps/chosen": -199.36892700195312, "logps/rejected": -257.3482971191406, "loss": 0.5358, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4646618366241455, "rewards/margins": 0.5761027336120605, "rewards/rejected": -2.040764570236206, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 19.2831974029541, "learning_rate": 4.804048845708612e-07, "logits/chosen": -2.0760316848754883, "logits/rejected": -2.0255093574523926, "logps/chosen": -226.48281860351562, "logps/rejected": -281.45660400390625, "loss": 0.5591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7033042907714844, "rewards/margins": 0.6070685982704163, "rewards/rejected": -2.310372829437256, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 18.635801315307617, "learning_rate": 4.802099042379023e-07, "logits/chosen": -2.0803098678588867, "logits/rejected": -2.0478389263153076, "logps/chosen": -203.64378356933594, "logps/rejected": -255.9177703857422, "loss": 0.5829, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.489525318145752, "rewards/margins": 0.5065483450889587, "rewards/rejected": -1.9960737228393555, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 18.290414810180664, "learning_rate": 4.800139985984474e-07, "logits/chosen": -2.1877551078796387, "logits/rejected": -2.1602888107299805, "logps/chosen": -170.83364868164062, "logps/rejected": -199.7840118408203, "loss": 0.6315, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1604950428009033, "rewards/margins": 0.3101738393306732, "rewards/rejected": -1.4706690311431885, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 24.337427139282227, "learning_rate": 4.798171684399201e-07, "logits/chosen": -2.165510654449463, "logits/rejected": -2.143298387527466, "logps/chosen": -172.0293731689453, "logps/rejected": -201.77560424804688, "loss": 0.6059, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.151803970336914, "rewards/margins": 0.320174902677536, "rewards/rejected": -1.4719789028167725, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 26.545852661132812, "learning_rate": 4.796194145534603e-07, "logits/chosen": -2.1026763916015625, "logits/rejected": -2.0653865337371826, "logps/chosen": -211.08804321289062, "logps/rejected": -257.2968444824219, "loss": 0.565, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5634223222732544, "rewards/margins": 0.4742763638496399, "rewards/rejected": -2.03769850730896, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 23.558462142944336, "learning_rate": 4.794207377339204e-07, "logits/chosen": -1.97832453250885, "logits/rejected": -1.9379304647445679, "logps/chosen": -298.8166809082031, "logps/rejected": -340.4345397949219, "loss": 0.612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.439976930618286, "rewards/margins": 0.4786749482154846, "rewards/rejected": -2.918652057647705, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 23.203041076660156, "learning_rate": 4.792211387798632e-07, "logits/chosen": -1.9999135732650757, "logits/rejected": -1.9647667407989502, "logps/chosen": -266.22418212890625, "logps/rejected": -308.0155334472656, "loss": 0.6225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.112579822540283, "rewards/margins": 0.46802085638046265, "rewards/rejected": -2.5806007385253906, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 39.15723419189453, "learning_rate": 4.79020618493557e-07, "logits/chosen": -2.085218906402588, "logits/rejected": -2.051792621612549, "logps/chosen": -216.2850341796875, "logps/rejected": -256.6274108886719, "loss": 0.5749, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5779272317886353, "rewards/margins": 0.46570149064064026, "rewards/rejected": -2.0436289310455322, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 38.20709991455078, "learning_rate": 4.788191776809739e-07, "logits/chosen": -2.1185243129730225, "logits/rejected": -2.078909397125244, "logps/chosen": -216.7564239501953, "logps/rejected": -255.44570922851562, "loss": 0.5804, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5775924921035767, "rewards/margins": 0.46415385603904724, "rewards/rejected": -2.0417463779449463, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 23.264312744140625, "learning_rate": 4.78616817151786e-07, "logits/chosen": -2.0765395164489746, "logits/rejected": -2.033416986465454, "logps/chosen": -201.68533325195312, "logps/rejected": -256.2938232421875, "loss": 0.5339, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.464045763015747, "rewards/margins": 0.5756982564926147, "rewards/rejected": -2.0397439002990723, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 21.760732650756836, "learning_rate": 4.784135377193615e-07, "logits/chosen": -2.0924277305603027, "logits/rejected": -2.0375728607177734, "logps/chosen": -217.6341094970703, "logps/rejected": -256.01416015625, "loss": 0.5711, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5802688598632812, "rewards/margins": 0.4732998013496399, "rewards/rejected": -2.0535686016082764, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 20.805545806884766, "learning_rate": 4.782093402007628e-07, "logits/chosen": -2.0534451007843018, "logits/rejected": -2.0274665355682373, "logps/chosen": -211.24807739257812, "logps/rejected": -272.18072509765625, "loss": 0.5685, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5590877532958984, "rewards/margins": 0.5817896723747253, "rewards/rejected": -2.1408774852752686, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 22.729835510253906, "learning_rate": 4.780042254167421e-07, "logits/chosen": -2.1388187408447266, "logits/rejected": -2.105391263961792, "logps/chosen": -201.5885772705078, "logps/rejected": -247.5126953125, "loss": 0.5705, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4321880340576172, "rewards/margins": 0.5137678980827332, "rewards/rejected": -1.9459559917449951, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 24.899465560913086, "learning_rate": 4.777981941917383e-07, "logits/chosen": -2.1241769790649414, "logits/rejected": -2.080303192138672, "logps/chosen": -224.6488037109375, "logps/rejected": -267.0788879394531, "loss": 0.603, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6878725290298462, "rewards/margins": 0.4614773392677307, "rewards/rejected": -2.1493499279022217, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 23.346389770507812, "learning_rate": 4.775912473538742e-07, "logits/chosen": -2.0845589637756348, "logits/rejected": -2.0392050743103027, "logps/chosen": -208.6995086669922, "logps/rejected": -268.3965759277344, "loss": 0.5356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4873734712600708, "rewards/margins": 0.6505120992660522, "rewards/rejected": -2.137885570526123, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 25.57661247253418, "learning_rate": 4.773833857349525e-07, "logits/chosen": -2.0941011905670166, "logits/rejected": -2.0803701877593994, "logps/chosen": -205.35806274414062, "logps/rejected": -248.0812530517578, "loss": 0.6143, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5345416069030762, "rewards/margins": 0.4184637665748596, "rewards/rejected": -1.9530051946640015, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 21.064476013183594, "learning_rate": 4.771746101704531e-07, "logits/chosen": -2.163846731185913, "logits/rejected": -2.131821393966675, "logps/chosen": -195.33494567871094, "logps/rejected": -234.24307250976562, "loss": 0.6183, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4035874605178833, "rewards/margins": 0.427592933177948, "rewards/rejected": -1.8311803340911865, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 18.766502380371094, "learning_rate": 4.769649214995291e-07, "logits/chosen": -2.1936285495758057, "logits/rejected": -2.1510443687438965, "logps/chosen": -169.03466796875, "logps/rejected": -200.029052734375, "loss": 0.5827, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1086899042129517, "rewards/margins": 0.39425966143608093, "rewards/rejected": -1.5029494762420654, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 14.846955299377441, "learning_rate": 4.7675432056500383e-07, "logits/chosen": -2.2091522216796875, "logits/rejected": -2.183218002319336, "logps/chosen": -171.3837127685547, "logps/rejected": -212.4227752685547, "loss": 0.5652, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1485109329223633, "rewards/margins": 0.44453829526901245, "rewards/rejected": -1.5930492877960205, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 20.422460556030273, "learning_rate": 4.765428082133675e-07, "logits/chosen": -2.203390598297119, "logits/rejected": -2.1545228958129883, "logps/chosen": -175.43124389648438, "logps/rejected": -218.783935546875, "loss": 0.5466, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1507736444473267, "rewards/margins": 0.5376171469688416, "rewards/rejected": -1.6883907318115234, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 34.621334075927734, "learning_rate": 4.7633038529477366e-07, "logits/chosen": -2.098792314529419, "logits/rejected": -2.075239658355713, "logps/chosen": -223.42971801757812, "logps/rejected": -275.72698974609375, "loss": 0.5941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7072811126708984, "rewards/margins": 0.5297926664352417, "rewards/rejected": -2.2370738983154297, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 28.183530807495117, "learning_rate": 4.761170526630357e-07, "logits/chosen": -2.0133912563323975, "logits/rejected": -1.9819608926773071, "logps/chosen": -218.24734497070312, "logps/rejected": -270.80633544921875, "loss": 0.5831, "rewards/accuracies": 0.71875, "rewards/chosen": -1.634020447731018, "rewards/margins": 0.5462647676467896, "rewards/rejected": -2.1802852153778076, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 21.1882381439209, "learning_rate": 4.759028111756235e-07, "logits/chosen": -2.0428051948547363, "logits/rejected": -1.988119125366211, "logps/chosen": -205.39639282226562, "logps/rejected": -265.3125915527344, "loss": 0.5209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.493817687034607, "rewards/margins": 0.6602969169616699, "rewards/rejected": -2.1541144847869873, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 38.1711311340332, "learning_rate": 4.756876616936601e-07, "logits/chosen": -2.08381986618042, "logits/rejected": -2.056079149246216, "logps/chosen": -237.8214111328125, "logps/rejected": -308.1978454589844, "loss": 0.5514, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8356602191925049, "rewards/margins": 0.6658327579498291, "rewards/rejected": -2.501492977142334, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 33.71718215942383, "learning_rate": 4.7547160508191805e-07, "logits/chosen": -2.00370454788208, "logits/rejected": -1.972821831703186, "logps/chosen": -267.02410888671875, "logps/rejected": -322.6805114746094, "loss": 0.5907, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1182658672332764, "rewards/margins": 0.57957923412323, "rewards/rejected": -2.697844982147217, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 22.376996994018555, "learning_rate": 4.7525464220881604e-07, "logits/chosen": -2.0071051120758057, "logits/rejected": -1.9739021062850952, "logps/chosen": -224.4896697998047, "logps/rejected": -292.85809326171875, "loss": 0.5085, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7044188976287842, "rewards/margins": 0.6982835531234741, "rewards/rejected": -2.402702569961548, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 17.49305534362793, "learning_rate": 4.7503677394641537e-07, "logits/chosen": -2.080984592437744, "logits/rejected": -2.052885055541992, "logps/chosen": -201.67666625976562, "logps/rejected": -245.2376708984375, "loss": 0.6024, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4744858741760254, "rewards/margins": 0.4377506375312805, "rewards/rejected": -1.9122365713119507, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 26.856908798217773, "learning_rate": 4.748180011704166e-07, "logits/chosen": -2.189699649810791, "logits/rejected": -2.1511547565460205, "logps/chosen": -185.24520874023438, "logps/rejected": -220.28189086914062, "loss": 0.5903, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.275210976600647, "rewards/margins": 0.4114285409450531, "rewards/rejected": -1.686639428138733, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.203526496887207, "eval_logits/rejected": -2.1888158321380615, "eval_logps/chosen": -160.05337524414062, "eval_logps/rejected": -184.0546417236328, "eval_loss": 0.6398167014122009, "eval_rewards/accuracies": 0.636617124080658, "eval_rewards/chosen": -1.0103789567947388, "eval_rewards/margins": 0.20267143845558167, "eval_rewards/rejected": -1.2130504846572876, "eval_runtime": 361.1127, "eval_samples_per_second": 11.919, "eval_steps_per_second": 1.49, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 24.459688186645508, "learning_rate": 4.745983247601557e-07, "logits/chosen": -2.091068744659424, "logits/rejected": -2.061098098754883, "logps/chosen": -176.71707153320312, "logps/rejected": -237.23373413085938, "loss": 0.5295, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2744981050491333, "rewards/margins": 0.5773097276687622, "rewards/rejected": -1.8518078327178955, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 22.180160522460938, "learning_rate": 4.7437774559860095e-07, "logits/chosen": -2.0425899028778076, "logits/rejected": -2.0301616191864014, "logps/chosen": -231.13900756835938, "logps/rejected": -276.4510498046875, "loss": 0.6297, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.784000039100647, "rewards/margins": 0.45418062806129456, "rewards/rejected": -2.238180637359619, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 18.078332901000977, "learning_rate": 4.741562645723488e-07, "logits/chosen": -2.0929934978485107, "logits/rejected": -2.0571064949035645, "logps/chosen": -204.25234985351562, "logps/rejected": -253.4650421142578, "loss": 0.5611, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.470801830291748, "rewards/margins": 0.5107512474060059, "rewards/rejected": -1.9815528392791748, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 28.691661834716797, "learning_rate": 4.7393388257162104e-07, "logits/chosen": -2.135240077972412, "logits/rejected": -2.0994486808776855, "logps/chosen": -204.0321044921875, "logps/rejected": -262.3167419433594, "loss": 0.5422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4717985391616821, "rewards/margins": 0.6114691495895386, "rewards/rejected": -2.0832676887512207, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 20.770586013793945, "learning_rate": 4.737106004902605e-07, "logits/chosen": -2.084413528442383, "logits/rejected": -2.0501129627227783, "logps/chosen": -203.73574829101562, "logps/rejected": -260.212890625, "loss": 0.5556, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5126979351043701, "rewards/margins": 0.5731831789016724, "rewards/rejected": -2.085881233215332, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 18.81822395324707, "learning_rate": 4.7348641922572805e-07, "logits/chosen": -2.0613832473754883, "logits/rejected": -2.026353359222412, "logps/chosen": -225.5125732421875, "logps/rejected": -272.64666748046875, "loss": 0.6014, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.683030366897583, "rewards/margins": 0.5311646461486816, "rewards/rejected": -2.2141947746276855, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 14.052509307861328, "learning_rate": 4.732613396790987e-07, "logits/chosen": -2.0514869689941406, "logits/rejected": -2.0147604942321777, "logps/chosen": -189.19113159179688, "logps/rejected": -247.239990234375, "loss": 0.5451, "rewards/accuracies": 0.71875, "rewards/chosen": -1.354153037071228, "rewards/margins": 0.5994340777397156, "rewards/rejected": -1.9535871744155884, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 20.1846866607666, "learning_rate": 4.730353627550579e-07, "logits/chosen": -2.052722454071045, "logits/rejected": -2.0262348651885986, "logps/chosen": -192.2119903564453, "logps/rejected": -245.517333984375, "loss": 0.5704, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3890540599822998, "rewards/margins": 0.5318419337272644, "rewards/rejected": -1.9208959341049194, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 20.500852584838867, "learning_rate": 4.728084893618981e-07, "logits/chosen": -2.0883076190948486, "logits/rejected": -2.041938304901123, "logps/chosen": -192.36859130859375, "logps/rejected": -250.56906127929688, "loss": 0.5452, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3556301593780518, "rewards/margins": 0.6143723726272583, "rewards/rejected": -1.9700024127960205, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 36.07284164428711, "learning_rate": 4.72580720411515e-07, "logits/chosen": -2.039346218109131, "logits/rejected": -2.0028650760650635, "logps/chosen": -228.094482421875, "logps/rejected": -285.2467956542969, "loss": 0.5489, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.740915060043335, "rewards/margins": 0.6266389489173889, "rewards/rejected": -2.367554187774658, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 20.588911056518555, "learning_rate": 4.723520568194039e-07, "logits/chosen": -1.9775018692016602, "logits/rejected": -1.9522132873535156, "logps/chosen": -256.61480712890625, "logps/rejected": -297.35394287109375, "loss": 0.6332, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.9994217157363892, "rewards/margins": 0.4667152464389801, "rewards/rejected": -2.466136932373047, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 24.14111328125, "learning_rate": 4.7212249950465623e-07, "logits/chosen": -2.0453383922576904, "logits/rejected": -2.037416458129883, "logps/chosen": -200.84133911132812, "logps/rejected": -235.08303833007812, "loss": 0.6323, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4962066411972046, "rewards/margins": 0.31947365403175354, "rewards/rejected": -1.8156802654266357, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 14.83882999420166, "learning_rate": 4.7189204938995517e-07, "logits/chosen": -2.159144878387451, "logits/rejected": -2.1300342082977295, "logps/chosen": -160.5081787109375, "logps/rejected": -203.0730743408203, "loss": 0.5582, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0484564304351807, "rewards/margins": 0.46635621786117554, "rewards/rejected": -1.514812707901001, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 23.90931510925293, "learning_rate": 4.716607074015729e-07, "logits/chosen": -2.1309354305267334, "logits/rejected": -2.107447624206543, "logps/chosen": -184.16400146484375, "logps/rejected": -209.5248565673828, "loss": 0.6221, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.269916296005249, "rewards/margins": 0.3217626214027405, "rewards/rejected": -1.5916789770126343, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 14.905668258666992, "learning_rate": 4.7142847446936616e-07, "logits/chosen": -2.1701815128326416, "logits/rejected": -2.141645669937134, "logps/chosen": -167.40696716308594, "logps/rejected": -203.14218139648438, "loss": 0.5669, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.086830496788025, "rewards/margins": 0.44008636474609375, "rewards/rejected": -1.526916742324829, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 17.2357234954834, "learning_rate": 4.711953515267729e-07, "logits/chosen": -2.1029295921325684, "logits/rejected": -2.05981707572937, "logps/chosen": -171.5275115966797, "logps/rejected": -221.8201141357422, "loss": 0.5613, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1827718019485474, "rewards/margins": 0.5532180070877075, "rewards/rejected": -1.7359898090362549, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 18.144237518310547, "learning_rate": 4.709613395108082e-07, "logits/chosen": -2.1172525882720947, "logits/rejected": -2.0912692546844482, "logps/chosen": -192.63381958007812, "logps/rejected": -238.55593872070312, "loss": 0.6094, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4013437032699585, "rewards/margins": 0.4558753967285156, "rewards/rejected": -1.8572193384170532, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 31.00777816772461, "learning_rate": 4.707264393620608e-07, "logits/chosen": -2.039618968963623, "logits/rejected": -1.996636986732483, "logps/chosen": -214.5855712890625, "logps/rejected": -272.11907958984375, "loss": 0.566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5773744583129883, "rewards/margins": 0.5817798972129822, "rewards/rejected": -2.1591544151306152, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 31.242525100708008, "learning_rate": 4.7049065202468917e-07, "logits/chosen": -2.0613465309143066, "logits/rejected": -2.033451795578003, "logps/chosen": -209.00247192382812, "logps/rejected": -260.03814697265625, "loss": 0.5924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.570135235786438, "rewards/margins": 0.5099143981933594, "rewards/rejected": -2.080049514770508, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 21.106891632080078, "learning_rate": 4.702539784464178e-07, "logits/chosen": -2.1120667457580566, "logits/rejected": -2.0834715366363525, "logps/chosen": -198.08238220214844, "logps/rejected": -234.7488555908203, "loss": 0.6127, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.410007119178772, "rewards/margins": 0.4233945906162262, "rewards/rejected": -1.8334014415740967, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 30.25786590576172, "learning_rate": 4.700164195785333e-07, "logits/chosen": -2.1003994941711426, "logits/rejected": -2.0700809955596924, "logps/chosen": -205.91104125976562, "logps/rejected": -237.6742706298828, "loss": 0.6474, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5018773078918457, "rewards/margins": 0.358981192111969, "rewards/rejected": -1.8608585596084595, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 16.47760772705078, "learning_rate": 4.697779763758806e-07, "logits/chosen": -2.0359458923339844, "logits/rejected": -2.0083885192871094, "logps/chosen": -183.71450805664062, "logps/rejected": -229.0781707763672, "loss": 0.5891, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3065073490142822, "rewards/margins": 0.46343597769737244, "rewards/rejected": -1.7699432373046875, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 25.033123016357422, "learning_rate": 4.6953864979685903e-07, "logits/chosen": -2.190072536468506, "logits/rejected": -2.1575088500976562, "logps/chosen": -183.2091522216797, "logps/rejected": -227.3050994873047, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2629189491271973, "rewards/margins": 0.4795466363430023, "rewards/rejected": -1.7424653768539429, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 21.957361221313477, "learning_rate": 4.6929844080341886e-07, "logits/chosen": -2.1083076000213623, "logits/rejected": -2.074075222015381, "logps/chosen": -170.28692626953125, "logps/rejected": -226.2017822265625, "loss": 0.5448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1472822427749634, "rewards/margins": 0.5683554410934448, "rewards/rejected": -1.7156378030776978, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 21.567333221435547, "learning_rate": 4.6905735036105686e-07, "logits/chosen": -2.0879921913146973, "logits/rejected": -2.054762363433838, "logps/chosen": -179.27525329589844, "logps/rejected": -222.5855712890625, "loss": 0.5709, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.228659749031067, "rewards/margins": 0.4789217412471771, "rewards/rejected": -1.7075812816619873, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 24.15262222290039, "learning_rate": 4.688153794388129e-07, "logits/chosen": -2.0704314708709717, "logits/rejected": -2.0310099124908447, "logps/chosen": -192.286376953125, "logps/rejected": -251.5869140625, "loss": 0.5766, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.363967776298523, "rewards/margins": 0.6139348149299622, "rewards/rejected": -1.9779026508331299, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 38.45740509033203, "learning_rate": 4.685725290092657e-07, "logits/chosen": -2.007741928100586, "logits/rejected": -1.9698295593261719, "logps/chosen": -230.7338104248047, "logps/rejected": -288.38262939453125, "loss": 0.5929, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7784141302108765, "rewards/margins": 0.6014623641967773, "rewards/rejected": -2.3798763751983643, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 28.856172561645508, "learning_rate": 4.6832880004852906e-07, "logits/chosen": -2.050381898880005, "logits/rejected": -2.016599178314209, "logps/chosen": -203.6682586669922, "logps/rejected": -268.5137634277344, "loss": 0.5599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5079454183578491, "rewards/margins": 0.669056236743927, "rewards/rejected": -2.177001476287842, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 31.12295150756836, "learning_rate": 4.68084193536248e-07, "logits/chosen": -2.07197642326355, "logits/rejected": -2.03266978263855, "logps/chosen": -185.06918334960938, "logps/rejected": -243.5443115234375, "loss": 0.5611, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2924091815948486, "rewards/margins": 0.6399238705635071, "rewards/rejected": -1.932332992553711, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 19.193397521972656, "learning_rate": 4.678387104555949e-07, "logits/chosen": -2.144469738006592, "logits/rejected": -2.115739583969116, "logps/chosen": -168.5285186767578, "logps/rejected": -214.89163208007812, "loss": 0.5821, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1478251218795776, "rewards/margins": 0.4843457341194153, "rewards/rejected": -1.6321709156036377, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 26.5443058013916, "learning_rate": 4.6759235179326533e-07, "logits/chosen": -2.0550007820129395, "logits/rejected": -2.0242109298706055, "logps/chosen": -182.1680450439453, "logps/rejected": -222.0409698486328, "loss": 0.5926, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2681941986083984, "rewards/margins": 0.44236668944358826, "rewards/rejected": -1.7105610370635986, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 17.787080764770508, "learning_rate": 4.673451185394741e-07, "logits/chosen": -2.1228199005126953, "logits/rejected": -2.0787336826324463, "logps/chosen": -192.5994873046875, "logps/rejected": -250.6980743408203, "loss": 0.5343, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.328160047531128, "rewards/margins": 0.6373846530914307, "rewards/rejected": -1.9655447006225586, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 27.061187744140625, "learning_rate": 4.6709701168795143e-07, "logits/chosen": -2.0905137062072754, "logits/rejected": -2.051339626312256, "logps/chosen": -201.50439453125, "logps/rejected": -251.25942993164062, "loss": 0.563, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.458300232887268, "rewards/margins": 0.563452959060669, "rewards/rejected": -2.0217530727386475, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 19.062034606933594, "learning_rate": 4.6684803223593884e-07, "logits/chosen": -2.044057846069336, "logits/rejected": -2.0012805461883545, "logps/chosen": -198.71922302246094, "logps/rejected": -262.88330078125, "loss": 0.552, "rewards/accuracies": 0.71875, "rewards/chosen": -1.480617642402649, "rewards/margins": 0.6513062715530396, "rewards/rejected": -2.1319241523742676, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 23.250179290771484, "learning_rate": 4.665981811841852e-07, "logits/chosen": -2.0866827964782715, "logits/rejected": -2.0565686225891113, "logps/chosen": -183.35971069335938, "logps/rejected": -239.50076293945312, "loss": 0.547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.310355305671692, "rewards/margins": 0.5533686876296997, "rewards/rejected": -1.8637237548828125, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 15.809772491455078, "learning_rate": 4.6634745953694275e-07, "logits/chosen": -2.1166555881500244, "logits/rejected": -2.076220989227295, "logps/chosen": -186.1615447998047, "logps/rejected": -231.997802734375, "loss": 0.5922, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3146768808364868, "rewards/margins": 0.49725937843322754, "rewards/rejected": -1.811936378479004, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 29.0755615234375, "learning_rate": 4.6609586830196303e-07, "logits/chosen": -2.1236376762390137, "logits/rejected": -2.0855138301849365, "logps/chosen": -188.16433715820312, "logps/rejected": -233.6986083984375, "loss": 0.5861, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3227546215057373, "rewards/margins": 0.47268590331077576, "rewards/rejected": -1.795440435409546, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 23.789920806884766, "learning_rate": 4.658434084904925e-07, "logits/chosen": -2.134676456451416, "logits/rejected": -2.0958380699157715, "logps/chosen": -193.49124145507812, "logps/rejected": -244.9150848388672, "loss": 0.5758, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3772523403167725, "rewards/margins": 0.5550267696380615, "rewards/rejected": -1.9322795867919922, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 25.28292465209961, "learning_rate": 4.6559008111726933e-07, "logits/chosen": -2.094460964202881, "logits/rejected": -2.0625646114349365, "logps/chosen": -173.20095825195312, "logps/rejected": -225.84683227539062, "loss": 0.5471, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1926004886627197, "rewards/margins": 0.5353989601135254, "rewards/rejected": -1.7279994487762451, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 31.825584411621094, "learning_rate": 4.653358872005182e-07, "logits/chosen": -2.079986095428467, "logits/rejected": -2.0458712577819824, "logps/chosen": -200.60797119140625, "logps/rejected": -249.6495361328125, "loss": 0.5886, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4168068170547485, "rewards/margins": 0.5300611257553101, "rewards/rejected": -1.9468679428100586, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.1110775470733643, "eval_logits/rejected": -2.089820146560669, "eval_logps/chosen": -187.45079040527344, "eval_logps/rejected": -220.06756591796875, "eval_loss": 0.6348621249198914, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -1.2843528985977173, "eval_rewards/margins": 0.2888266146183014, "eval_rewards/rejected": -1.5731797218322754, "eval_runtime": 361.8147, "eval_samples_per_second": 11.896, "eval_steps_per_second": 1.487, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 29.133798599243164, "learning_rate": 4.650808277619471e-07, "logits/chosen": -1.993761658668518, "logits/rejected": -1.9603182077407837, "logps/chosen": -209.12503051757812, "logps/rejected": -257.6119079589844, "loss": 0.61, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.525017261505127, "rewards/margins": 0.5080575942993164, "rewards/rejected": -2.0330748558044434, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 37.29832077026367, "learning_rate": 4.648249038267429e-07, "logits/chosen": -2.144231081008911, "logits/rejected": -2.097996950149536, "logps/chosen": -185.92796325683594, "logps/rejected": -219.46731567382812, "loss": 0.6137, "rewards/accuracies": 0.625, "rewards/chosen": -1.2986061573028564, "rewards/margins": 0.3996659815311432, "rewards/rejected": -1.6982723474502563, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 26.491430282592773, "learning_rate": 4.6456811642356706e-07, "logits/chosen": -2.1009583473205566, "logits/rejected": -2.0602753162384033, "logps/chosen": -161.1925048828125, "logps/rejected": -199.30960083007812, "loss": 0.5989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0676313638687134, "rewards/margins": 0.4227350354194641, "rewards/rejected": -1.4903662204742432, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 24.634960174560547, "learning_rate": 4.6431046658455184e-07, "logits/chosen": -2.122077465057373, "logits/rejected": -2.0819251537323, "logps/chosen": -181.2198028564453, "logps/rejected": -225.71194458007812, "loss": 0.568, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2117745876312256, "rewards/margins": 0.5051149129867554, "rewards/rejected": -1.7168896198272705, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 17.893177032470703, "learning_rate": 4.6405195534529587e-07, "logits/chosen": -2.0605499744415283, "logits/rejected": -2.0298588275909424, "logps/chosen": -165.44662475585938, "logps/rejected": -213.81552124023438, "loss": 0.5544, "rewards/accuracies": 0.71875, "rewards/chosen": -1.083539366722107, "rewards/margins": 0.5248002409934998, "rewards/rejected": -1.6083396673202515, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 23.43973159790039, "learning_rate": 4.637925837448601e-07, "logits/chosen": -2.065504550933838, "logits/rejected": -2.0319535732269287, "logps/chosen": -207.7255859375, "logps/rejected": -272.2740173339844, "loss": 0.5671, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5451226234436035, "rewards/margins": 0.6585391759872437, "rewards/rejected": -2.203662157058716, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 29.553508758544922, "learning_rate": 4.6353235282576377e-07, "logits/chosen": -2.0801141262054443, "logits/rejected": -2.035614490509033, "logps/chosen": -224.8473358154297, "logps/rejected": -278.14422607421875, "loss": 0.5721, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6836763620376587, "rewards/margins": 0.6017245054244995, "rewards/rejected": -2.285400867462158, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 49.11103820800781, "learning_rate": 4.6327126363397983e-07, "logits/chosen": -2.079634189605713, "logits/rejected": -2.039485216140747, "logps/chosen": -232.9774169921875, "logps/rejected": -293.3952941894531, "loss": 0.611, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8091109991073608, "rewards/margins": 0.6142225861549377, "rewards/rejected": -2.4233336448669434, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 26.819843292236328, "learning_rate": 4.6300931721893115e-07, "logits/chosen": -2.015071392059326, "logits/rejected": -1.9706470966339111, "logps/chosen": -220.6169891357422, "logps/rejected": -271.7673645019531, "loss": 0.5764, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6831495761871338, "rewards/margins": 0.5396355390548706, "rewards/rejected": -2.222785234451294, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 15.393174171447754, "learning_rate": 4.6274651463348614e-07, "logits/chosen": -2.063544511795044, "logits/rejected": -2.0253031253814697, "logps/chosen": -197.45291137695312, "logps/rejected": -244.4704132080078, "loss": 0.5975, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3964500427246094, "rewards/margins": 0.5273274779319763, "rewards/rejected": -1.9237775802612305, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 16.522377014160156, "learning_rate": 4.624828569339542e-07, "logits/chosen": -2.184967279434204, "logits/rejected": -2.144160032272339, "logps/chosen": -163.47604370117188, "logps/rejected": -216.01449584960938, "loss": 0.5428, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0812361240386963, "rewards/margins": 0.528548002243042, "rewards/rejected": -1.6097841262817383, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 21.438154220581055, "learning_rate": 4.622183451800822e-07, "logits/chosen": -2.150951862335205, "logits/rejected": -2.1152586936950684, "logps/chosen": -171.11355590820312, "logps/rejected": -217.55422973632812, "loss": 0.5677, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1760194301605225, "rewards/margins": 0.5032806396484375, "rewards/rejected": -1.6792999505996704, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 40.94443893432617, "learning_rate": 4.619529804350496e-07, "logits/chosen": -2.0781443119049072, "logits/rejected": -2.0500683784484863, "logps/chosen": -172.21592712402344, "logps/rejected": -211.99575805664062, "loss": 0.5967, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1749165058135986, "rewards/margins": 0.4194861352443695, "rewards/rejected": -1.5944026708602905, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 38.200279235839844, "learning_rate": 4.616867637654643e-07, "logits/chosen": -1.967995285987854, "logits/rejected": -1.9232499599456787, "logps/chosen": -209.7811279296875, "logps/rejected": -269.48504638671875, "loss": 0.5681, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5585715770721436, "rewards/margins": 0.636776864528656, "rewards/rejected": -2.1953485012054443, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 26.998828887939453, "learning_rate": 4.6141969624135867e-07, "logits/chosen": -2.0069527626037598, "logits/rejected": -1.963327169418335, "logps/chosen": -228.33132934570312, "logps/rejected": -293.47454833984375, "loss": 0.5607, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7035646438598633, "rewards/margins": 0.6979004144668579, "rewards/rejected": -2.4014651775360107, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 22.13633155822754, "learning_rate": 4.611517789361847e-07, "logits/chosen": -1.9552606344223022, "logits/rejected": -1.9036716222763062, "logps/chosen": -235.66116333007812, "logps/rejected": -299.6728820800781, "loss": 0.5582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7863060235977173, "rewards/margins": 0.6884779930114746, "rewards/rejected": -2.4747838973999023, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 28.82307243347168, "learning_rate": 4.608830129268102e-07, "logits/chosen": -2.0014357566833496, "logits/rejected": -1.9593117237091064, "logps/chosen": -223.325927734375, "logps/rejected": -288.80859375, "loss": 0.5903, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7067028284072876, "rewards/margins": 0.6705763339996338, "rewards/rejected": -2.377279043197632, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 18.439023971557617, "learning_rate": 4.606133992935142e-07, "logits/chosen": -2.0619914531707764, "logits/rejected": -2.0284805297851562, "logps/chosen": -162.1114044189453, "logps/rejected": -200.59962463378906, "loss": 0.5809, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0640976428985596, "rewards/margins": 0.4258154034614563, "rewards/rejected": -1.4899132251739502, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 17.036758422851562, "learning_rate": 4.603429391199827e-07, "logits/chosen": -2.151122808456421, "logits/rejected": -2.11444091796875, "logps/chosen": -167.10597229003906, "logps/rejected": -199.73013305664062, "loss": 0.6347, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1154625415802002, "rewards/margins": 0.3650432527065277, "rewards/rejected": -1.4805057048797607, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 23.528215408325195, "learning_rate": 4.600716334933043e-07, "logits/chosen": -2.1029391288757324, "logits/rejected": -2.0599780082702637, "logps/chosen": -166.78787231445312, "logps/rejected": -222.00003051757812, "loss": 0.5284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1256091594696045, "rewards/margins": 0.5571141242980957, "rewards/rejected": -1.6827232837677002, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 28.61471176147461, "learning_rate": 4.597994835039657e-07, "logits/chosen": -2.01672625541687, "logits/rejected": -1.9960731267929077, "logps/chosen": -228.39688110351562, "logps/rejected": -265.24932861328125, "loss": 0.6374, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.7630393505096436, "rewards/margins": 0.39058494567871094, "rewards/rejected": -2.1536242961883545, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 29.602712631225586, "learning_rate": 4.595264902458476e-07, "logits/chosen": -2.0758678913116455, "logits/rejected": -2.029189348220825, "logps/chosen": -222.35690307617188, "logps/rejected": -293.4488220214844, "loss": 0.516, "rewards/accuracies": 0.78125, "rewards/chosen": -1.656280279159546, "rewards/margins": 0.7265432476997375, "rewards/rejected": -2.3828234672546387, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 37.52836608886719, "learning_rate": 4.5925265481622e-07, "logits/chosen": -2.029547691345215, "logits/rejected": -1.9858028888702393, "logps/chosen": -264.10479736328125, "logps/rejected": -319.6578063964844, "loss": 0.5979, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.060103178024292, "rewards/margins": 0.5817283391952515, "rewards/rejected": -2.641831636428833, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 24.00440216064453, "learning_rate": 4.58977978315738e-07, "logits/chosen": -2.067925453186035, "logits/rejected": -2.041778087615967, "logps/chosen": -225.9872589111328, "logps/rejected": -279.8884582519531, "loss": 0.5754, "rewards/accuracies": 0.71875, "rewards/chosen": -1.709956169128418, "rewards/margins": 0.5415796637535095, "rewards/rejected": -2.2515358924865723, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 27.420583724975586, "learning_rate": 4.587024618484374e-07, "logits/chosen": -2.077469825744629, "logits/rejected": -2.0368990898132324, "logps/chosen": -213.94699096679688, "logps/rejected": -260.081787109375, "loss": 0.6022, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.586656093597412, "rewards/margins": 0.5107787251472473, "rewards/rejected": -2.0974347591400146, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 17.519289016723633, "learning_rate": 4.5842610652172986e-07, "logits/chosen": -2.090916156768799, "logits/rejected": -2.0595808029174805, "logps/chosen": -180.28671264648438, "logps/rejected": -218.8395538330078, "loss": 0.5995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.249285101890564, "rewards/margins": 0.4089018404483795, "rewards/rejected": -1.658186912536621, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 24.84550666809082, "learning_rate": 4.581489134463991e-07, "logits/chosen": -2.0917375087738037, "logits/rejected": -2.050793170928955, "logps/chosen": -180.09719848632812, "logps/rejected": -239.96701049804688, "loss": 0.5157, "rewards/accuracies": 0.75, "rewards/chosen": -1.2716337442398071, "rewards/margins": 0.6463640332221985, "rewards/rejected": -1.9179977178573608, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 31.604997634887695, "learning_rate": 4.578708837365959e-07, "logits/chosen": -2.0107340812683105, "logits/rejected": -1.9718272686004639, "logps/chosen": -249.32247924804688, "logps/rejected": -303.2679138183594, "loss": 0.6106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.954459547996521, "rewards/margins": 0.5629742741584778, "rewards/rejected": -2.5174336433410645, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 23.1247501373291, "learning_rate": 4.575920185098338e-07, "logits/chosen": -1.9877821207046509, "logits/rejected": -1.9434763193130493, "logps/chosen": -289.8751525878906, "logps/rejected": -370.42926025390625, "loss": 0.5545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3339085578918457, "rewards/margins": 0.8350251913070679, "rewards/rejected": -3.168933629989624, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 24.924081802368164, "learning_rate": 4.5731231888698474e-07, "logits/chosen": -2.0037014484405518, "logits/rejected": -1.9652973413467407, "logps/chosen": -256.3509826660156, "logps/rejected": -330.2041320800781, "loss": 0.5493, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0343449115753174, "rewards/margins": 0.7227517366409302, "rewards/rejected": -2.757096767425537, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 35.95729446411133, "learning_rate": 4.570317859922743e-07, "logits/chosen": -2.0424256324768066, "logits/rejected": -2.003058671951294, "logps/chosen": -256.46026611328125, "logps/rejected": -315.011962890625, "loss": 0.5735, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.975829839706421, "rewards/margins": 0.6578549742698669, "rewards/rejected": -2.6336848735809326, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 22.36492156982422, "learning_rate": 4.567504209532774e-07, "logits/chosen": -2.0171761512756348, "logits/rejected": -1.9732568264007568, "logps/chosen": -219.6728515625, "logps/rejected": -281.9639892578125, "loss": 0.5342, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6190826892852783, "rewards/margins": 0.6796184778213501, "rewards/rejected": -2.2987008094787598, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 29.95681381225586, "learning_rate": 4.5646822490091375e-07, "logits/chosen": -2.0352606773376465, "logits/rejected": -1.9968599081039429, "logps/chosen": -202.66268920898438, "logps/rejected": -247.3496856689453, "loss": 0.6141, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4769208431243896, "rewards/margins": 0.49215084314346313, "rewards/rejected": -1.9690717458724976, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 16.657939910888672, "learning_rate": 4.5618519896944303e-07, "logits/chosen": -2.0957555770874023, "logits/rejected": -2.0625622272491455, "logps/chosen": -178.37844848632812, "logps/rejected": -246.50326538085938, "loss": 0.5106, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2259323596954346, "rewards/margins": 0.7048084735870361, "rewards/rejected": -1.9307407140731812, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 29.667522430419922, "learning_rate": 4.559013442964607e-07, "logits/chosen": -2.0111019611358643, "logits/rejected": -1.9912898540496826, "logps/chosen": -230.82424926757812, "logps/rejected": -279.6896057128906, "loss": 0.5621, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7427289485931396, "rewards/margins": 0.5153648257255554, "rewards/rejected": -2.25809383392334, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 39.808433532714844, "learning_rate": 4.556166620228933e-07, "logits/chosen": -1.9434912204742432, "logits/rejected": -1.8982303142547607, "logps/chosen": -276.0238037109375, "logps/rejected": -351.0096435546875, "loss": 0.5305, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2328338623046875, "rewards/margins": 0.7706564664840698, "rewards/rejected": -3.003490447998047, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 18.926952362060547, "learning_rate": 4.5533115329299366e-07, "logits/chosen": -2.0191707611083984, "logits/rejected": -1.988918662071228, "logps/chosen": -271.0950927734375, "logps/rejected": -319.5252380371094, "loss": 0.6132, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.10599946975708, "rewards/margins": 0.5503307580947876, "rewards/rejected": -2.6563303470611572, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 20.014331817626953, "learning_rate": 4.5504481925433656e-07, "logits/chosen": -2.0038554668426514, "logits/rejected": -1.9677921533584595, "logps/chosen": -215.09152221679688, "logps/rejected": -283.9213562011719, "loss": 0.4968, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6241964101791382, "rewards/margins": 0.6891189813613892, "rewards/rejected": -2.3133153915405273, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 19.218746185302734, "learning_rate": 4.547576610578141e-07, "logits/chosen": -2.042541027069092, "logits/rejected": -2.0085904598236084, "logps/chosen": -221.81741333007812, "logps/rejected": -260.5118408203125, "loss": 0.5968, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.632112741470337, "rewards/margins": 0.45219525694847107, "rewards/rejected": -2.08430814743042, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 19.792543411254883, "learning_rate": 4.5446967985763094e-07, "logits/chosen": -2.0283398628234863, "logits/rejected": -1.9994605779647827, "logps/chosen": -209.61532592773438, "logps/rejected": -256.652099609375, "loss": 0.5907, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5691382884979248, "rewards/margins": 0.49877309799194336, "rewards/rejected": -2.0679116249084473, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.1136837005615234, "eval_logits/rejected": -2.0942389965057373, "eval_logps/chosen": -193.4448699951172, "eval_logps/rejected": -224.09593200683594, "eval_loss": 0.6306005716323853, "eval_rewards/accuracies": 0.6477695107460022, "eval_rewards/chosen": -1.3442940711975098, "eval_rewards/margins": 0.26916930079460144, "eval_rewards/rejected": -1.6134634017944336, "eval_runtime": 361.3503, "eval_samples_per_second": 11.911, "eval_steps_per_second": 1.489, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 20.022554397583008, "learning_rate": 4.5418087681129976e-07, "logits/chosen": -2.027597188949585, "logits/rejected": -1.9831022024154663, "logps/chosen": -205.69595336914062, "logps/rejected": -258.70233154296875, "loss": 0.5463, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4737117290496826, "rewards/margins": 0.6040796041488647, "rewards/rejected": -2.077791213989258, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 21.67654037475586, "learning_rate": 4.5389125307963644e-07, "logits/chosen": -2.056954860687256, "logits/rejected": -2.0273966789245605, "logps/chosen": -204.76792907714844, "logps/rejected": -260.35498046875, "loss": 0.5549, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5181210041046143, "rewards/margins": 0.574622631072998, "rewards/rejected": -2.0927438735961914, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 17.18356704711914, "learning_rate": 4.536008098267556e-07, "logits/chosen": -2.029345750808716, "logits/rejected": -1.999261498451233, "logps/chosen": -205.8705596923828, "logps/rejected": -256.86639404296875, "loss": 0.5935, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5080629587173462, "rewards/margins": 0.5383862257003784, "rewards/rejected": -2.0464491844177246, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 24.565731048583984, "learning_rate": 4.5330954822006607e-07, "logits/chosen": -2.0305984020233154, "logits/rejected": -1.9849293231964111, "logps/chosen": -206.6695556640625, "logps/rejected": -260.18609619140625, "loss": 0.5656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4914562702178955, "rewards/margins": 0.5923083424568176, "rewards/rejected": -2.0837645530700684, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 21.961519241333008, "learning_rate": 4.530174694302656e-07, "logits/chosen": -2.0665595531463623, "logits/rejected": -2.0297117233276367, "logps/chosen": -200.59591674804688, "logps/rejected": -260.6346130371094, "loss": 0.5293, "rewards/accuracies": 0.75, "rewards/chosen": -1.4524840116500854, "rewards/margins": 0.6254092454910278, "rewards/rejected": -2.0778932571411133, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 33.070308685302734, "learning_rate": 4.5272457463133676e-07, "logits/chosen": -2.021787643432617, "logits/rejected": -1.9725964069366455, "logps/chosen": -229.82498168945312, "logps/rejected": -280.1785888671875, "loss": 0.5687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7038345336914062, "rewards/margins": 0.5985569953918457, "rewards/rejected": -2.302391529083252, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 28.93623161315918, "learning_rate": 4.5243086500054194e-07, "logits/chosen": -2.0471935272216797, "logits/rejected": -2.004492998123169, "logps/chosen": -234.60592651367188, "logps/rejected": -289.7828063964844, "loss": 0.5829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.757394552230835, "rewards/margins": 0.6308781504631042, "rewards/rejected": -2.388273000717163, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 20.8734130859375, "learning_rate": 4.5213634171841866e-07, "logits/chosen": -2.0354058742523193, "logits/rejected": -1.9888004064559937, "logps/chosen": -190.033203125, "logps/rejected": -246.0937042236328, "loss": 0.5459, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3628569841384888, "rewards/margins": 0.6046766638755798, "rewards/rejected": -1.9675334692001343, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 20.91156005859375, "learning_rate": 4.518410059687747e-07, "logits/chosen": -2.1184709072113037, "logits/rejected": -2.076908588409424, "logps/chosen": -185.0583953857422, "logps/rejected": -244.900146484375, "loss": 0.5299, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2911940813064575, "rewards/margins": 0.6303024888038635, "rewards/rejected": -1.9214966297149658, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 20.507543563842773, "learning_rate": 4.515448589386838e-07, "logits/chosen": -2.0396854877471924, "logits/rejected": -1.9937269687652588, "logps/chosen": -195.95950317382812, "logps/rejected": -245.6066436767578, "loss": 0.5872, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4216787815093994, "rewards/margins": 0.540787398815155, "rewards/rejected": -1.9624662399291992, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 36.45354080200195, "learning_rate": 4.5124790181848024e-07, "logits/chosen": -2.0997748374938965, "logits/rejected": -2.043391466140747, "logps/chosen": -234.44973754882812, "logps/rejected": -304.2508239746094, "loss": 0.5514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7719612121582031, "rewards/margins": 0.7539782524108887, "rewards/rejected": -2.525939464569092, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 26.842365264892578, "learning_rate": 4.5095013580175455e-07, "logits/chosen": -1.953744888305664, "logits/rejected": -1.91195547580719, "logps/chosen": -259.47552490234375, "logps/rejected": -333.6404724121094, "loss": 0.5534, "rewards/accuracies": 0.6875, "rewards/chosen": -2.045459270477295, "rewards/margins": 0.7692426443099976, "rewards/rejected": -2.814702033996582, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 22.462615966796875, "learning_rate": 4.5065156208534855e-07, "logits/chosen": -2.07277774810791, "logits/rejected": -2.032794237136841, "logps/chosen": -257.4947814941406, "logps/rejected": -320.9485778808594, "loss": 0.5975, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.032346725463867, "rewards/margins": 0.6085593700408936, "rewards/rejected": -2.6409060955047607, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 25.397506713867188, "learning_rate": 4.5035218186935044e-07, "logits/chosen": -2.0871431827545166, "logits/rejected": -2.041184663772583, "logps/chosen": -237.12026977539062, "logps/rejected": -303.4742736816406, "loss": 0.5388, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8053655624389648, "rewards/margins": 0.715019702911377, "rewards/rejected": -2.520385503768921, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 20.48045539855957, "learning_rate": 4.500519963570901e-07, "logits/chosen": -1.9613748788833618, "logits/rejected": -1.9228299856185913, "logps/chosen": -226.21548461914062, "logps/rejected": -280.1295471191406, "loss": 0.5836, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7012525796890259, "rewards/margins": 0.5617281198501587, "rewards/rejected": -2.2629806995391846, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 27.782730102539062, "learning_rate": 4.497510067551342e-07, "logits/chosen": -2.024528980255127, "logits/rejected": -1.9988200664520264, "logps/chosen": -210.90072631835938, "logps/rejected": -276.5225524902344, "loss": 0.5306, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.580496907234192, "rewards/margins": 0.6341825127601624, "rewards/rejected": -2.21467924118042, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 24.19654655456543, "learning_rate": 4.494492142732815e-07, "logits/chosen": -1.986158013343811, "logits/rejected": -1.9527645111083984, "logps/chosen": -214.08493041992188, "logps/rejected": -287.34979248046875, "loss": 0.5212, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.613403558731079, "rewards/margins": 0.7499521374702454, "rewards/rejected": -2.3633556365966797, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 23.096698760986328, "learning_rate": 4.491466201245577e-07, "logits/chosen": -1.9755605459213257, "logits/rejected": -1.9360367059707642, "logps/chosen": -232.80691528320312, "logps/rejected": -311.80572509765625, "loss": 0.4896, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.782967209815979, "rewards/margins": 0.7969039678573608, "rewards/rejected": -2.57987117767334, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 33.58396530151367, "learning_rate": 4.488432255252108e-07, "logits/chosen": -1.9477779865264893, "logits/rejected": -1.9163627624511719, "logps/chosen": -258.144775390625, "logps/rejected": -322.1652526855469, "loss": 0.6286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.055413007736206, "rewards/margins": 0.6222153306007385, "rewards/rejected": -2.677628517150879, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 23.230907440185547, "learning_rate": 4.485390316947061e-07, "logits/chosen": -1.9799747467041016, "logits/rejected": -1.937718152999878, "logps/chosen": -210.01394653320312, "logps/rejected": -273.9417419433594, "loss": 0.5374, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5297095775604248, "rewards/margins": 0.6865459680557251, "rewards/rejected": -2.2162556648254395, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 20.354990005493164, "learning_rate": 4.482340398557213e-07, "logits/chosen": -2.001593589782715, "logits/rejected": -1.9596580266952515, "logps/chosen": -202.83631896972656, "logps/rejected": -269.1433410644531, "loss": 0.516, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4915802478790283, "rewards/margins": 0.7033747434616089, "rewards/rejected": -2.1949551105499268, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 27.68824005126953, "learning_rate": 4.479282512341418e-07, "logits/chosen": -1.9780470132827759, "logits/rejected": -1.928483247756958, "logps/chosen": -208.64810180664062, "logps/rejected": -275.57171630859375, "loss": 0.5416, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5427706241607666, "rewards/margins": 0.6966052055358887, "rewards/rejected": -2.2393758296966553, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 39.72317123413086, "learning_rate": 4.476216670590553e-07, "logits/chosen": -1.9476467370986938, "logits/rejected": -1.9107706546783447, "logps/chosen": -234.83425903320312, "logps/rejected": -296.7217712402344, "loss": 0.5841, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8102229833602905, "rewards/margins": 0.6275158524513245, "rewards/rejected": -2.4377388954162598, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 21.903242111206055, "learning_rate": 4.4731428856274745e-07, "logits/chosen": -2.0723166465759277, "logits/rejected": -2.0274131298065186, "logps/chosen": -224.57333374023438, "logps/rejected": -275.7362060546875, "loss": 0.5598, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6640199422836304, "rewards/margins": 0.592391848564148, "rewards/rejected": -2.2564117908477783, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 16.352096557617188, "learning_rate": 4.4700611698069636e-07, "logits/chosen": -2.002920627593994, "logits/rejected": -1.9675649404525757, "logps/chosen": -225.5820770263672, "logps/rejected": -281.49505615234375, "loss": 0.5821, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6847527027130127, "rewards/margins": 0.5841774344444275, "rewards/rejected": -2.268929958343506, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 18.560102462768555, "learning_rate": 4.4669715355156794e-07, "logits/chosen": -1.9634443521499634, "logits/rejected": -1.9424867630004883, "logps/chosen": -218.47726440429688, "logps/rejected": -266.7620544433594, "loss": 0.6381, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6775621175765991, "rewards/margins": 0.47239774465560913, "rewards/rejected": -2.1499600410461426, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 25.141725540161133, "learning_rate": 4.46387399517211e-07, "logits/chosen": -2.0127017498016357, "logits/rejected": -1.9779157638549805, "logps/chosen": -194.95367431640625, "logps/rejected": -239.8086395263672, "loss": 0.5867, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4085301160812378, "rewards/margins": 0.516280472278595, "rewards/rejected": -1.9248106479644775, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 16.204591751098633, "learning_rate": 4.4607685612265186e-07, "logits/chosen": -2.2413554191589355, "logits/rejected": -2.1903884410858154, "logps/chosen": -167.66220092773438, "logps/rejected": -216.31350708007812, "loss": 0.5423, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0963835716247559, "rewards/margins": 0.5614444613456726, "rewards/rejected": -1.6578279733657837, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 22.40886878967285, "learning_rate": 4.457655246160899e-07, "logits/chosen": -2.0810563564300537, "logits/rejected": -2.038487672805786, "logps/chosen": -186.6328125, "logps/rejected": -242.01724243164062, "loss": 0.5428, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3152021169662476, "rewards/margins": 0.597240149974823, "rewards/rejected": -1.9124422073364258, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 29.630615234375, "learning_rate": 4.454534062488919e-07, "logits/chosen": -1.976575493812561, "logits/rejected": -1.9305028915405273, "logps/chosen": -218.09609985351562, "logps/rejected": -264.9560546875, "loss": 0.6023, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6200100183486938, "rewards/margins": 0.5091668367385864, "rewards/rejected": -2.129176616668701, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 23.184295654296875, "learning_rate": 4.451405022755876e-07, "logits/chosen": -2.0997207164764404, "logits/rejected": -2.059025764465332, "logps/chosen": -205.8336181640625, "logps/rejected": -244.8787841796875, "loss": 0.6094, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4718234539031982, "rewards/margins": 0.4441297948360443, "rewards/rejected": -1.9159530401229858, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 25.002307891845703, "learning_rate": 4.4482681395386437e-07, "logits/chosen": -2.0949456691741943, "logits/rejected": -2.0567798614501953, "logps/chosen": -183.1117706298828, "logps/rejected": -222.38412475585938, "loss": 0.5952, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2867579460144043, "rewards/margins": 0.43893274664878845, "rewards/rejected": -1.7256906032562256, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 23.23900604248047, "learning_rate": 4.4451234254456206e-07, "logits/chosen": -2.1370229721069336, "logits/rejected": -2.087592363357544, "logps/chosen": -195.84664916992188, "logps/rejected": -248.58169555664062, "loss": 0.5581, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4125874042510986, "rewards/margins": 0.5822240114212036, "rewards/rejected": -1.9948116540908813, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 27.39185905456543, "learning_rate": 4.441970893116681e-07, "logits/chosen": -2.1077470779418945, "logits/rejected": -2.0724475383758545, "logps/chosen": -186.17869567871094, "logps/rejected": -215.886962890625, "loss": 0.6192, "rewards/accuracies": 0.625, "rewards/chosen": -1.271774411201477, "rewards/margins": 0.3687826991081238, "rewards/rejected": -1.640557050704956, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 21.129880905151367, "learning_rate": 4.4388105552231264e-07, "logits/chosen": -2.101102828979492, "logits/rejected": -2.068150043487549, "logps/chosen": -176.59967041015625, "logps/rejected": -221.4196319580078, "loss": 0.5771, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2250016927719116, "rewards/margins": 0.45095372200012207, "rewards/rejected": -1.6759554147720337, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 31.25546646118164, "learning_rate": 4.4356424244676283e-07, "logits/chosen": -2.112565517425537, "logits/rejected": -2.072075366973877, "logps/chosen": -188.5159149169922, "logps/rejected": -227.3189239501953, "loss": 0.6029, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3593846559524536, "rewards/margins": 0.4357883036136627, "rewards/rejected": -1.7951726913452148, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 20.901613235473633, "learning_rate": 4.432466513584183e-07, "logits/chosen": -2.0666048526763916, "logits/rejected": -2.0185022354125977, "logps/chosen": -199.58738708496094, "logps/rejected": -272.3442687988281, "loss": 0.5017, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4221034049987793, "rewards/margins": 0.7882121801376343, "rewards/rejected": -2.2103159427642822, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 35.356990814208984, "learning_rate": 4.4292828353380586e-07, "logits/chosen": -2.1241612434387207, "logits/rejected": -2.098829746246338, "logps/chosen": -239.27700805664062, "logps/rejected": -277.2031555175781, "loss": 0.6428, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8310835361480713, "rewards/margins": 0.398749977350235, "rewards/rejected": -2.2298333644866943, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 17.450794219970703, "learning_rate": 4.4260914025257423e-07, "logits/chosen": -2.0571634769439697, "logits/rejected": -2.0331737995147705, "logps/chosen": -202.2477264404297, "logps/rejected": -245.2686767578125, "loss": 0.6153, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4918283224105835, "rewards/margins": 0.4285239279270172, "rewards/rejected": -1.9203522205352783, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 20.656455993652344, "learning_rate": 4.4228922279748894e-07, "logits/chosen": -2.1796975135803223, "logits/rejected": -2.139998435974121, "logps/chosen": -179.96275329589844, "logps/rejected": -231.1851806640625, "loss": 0.5456, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2708046436309814, "rewards/margins": 0.5161277651786804, "rewards/rejected": -1.786932349205017, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.1393842697143555, "eval_logits/rejected": -2.12139892578125, "eval_logps/chosen": -176.54408264160156, "eval_logps/rejected": -204.74227905273438, "eval_loss": 0.6326830387115479, "eval_rewards/accuracies": 0.6407992839813232, "eval_rewards/chosen": -1.1752861738204956, "eval_rewards/margins": 0.2446404993534088, "eval_rewards/rejected": -1.419926643371582, "eval_runtime": 361.3757, "eval_samples_per_second": 11.91, "eval_steps_per_second": 1.489, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 20.31707191467285, "learning_rate": 4.4196853245442735e-07, "logits/chosen": -2.063828706741333, "logits/rejected": -2.0261330604553223, "logps/chosen": -199.20712280273438, "logps/rejected": -246.3192901611328, "loss": 0.573, "rewards/accuracies": 0.6875, "rewards/chosen": -1.466806411743164, "rewards/margins": 0.5038591623306274, "rewards/rejected": -1.970665693283081, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 20.915828704833984, "learning_rate": 4.416470705123735e-07, "logits/chosen": -2.042283773422241, "logits/rejected": -2.0166451930999756, "logps/chosen": -211.0543975830078, "logps/rejected": -252.4989013671875, "loss": 0.6208, "rewards/accuracies": 0.6875, "rewards/chosen": -1.578370213508606, "rewards/margins": 0.3995417654514313, "rewards/rejected": -1.9779119491577148, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 22.140911102294922, "learning_rate": 4.4132483826341243e-07, "logits/chosen": -2.057380199432373, "logits/rejected": -2.02563214302063, "logps/chosen": -199.10812377929688, "logps/rejected": -263.26824951171875, "loss": 0.5398, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4180949926376343, "rewards/margins": 0.6641402244567871, "rewards/rejected": -2.082235336303711, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 20.671411514282227, "learning_rate": 4.4100183700272574e-07, "logits/chosen": -2.0643410682678223, "logits/rejected": -2.014376401901245, "logps/chosen": -180.79989624023438, "logps/rejected": -252.0329132080078, "loss": 0.5092, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2713226079940796, "rewards/margins": 0.7234185338020325, "rewards/rejected": -1.9947410821914673, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 18.627723693847656, "learning_rate": 4.4067806802858575e-07, "logits/chosen": -1.994053602218628, "logits/rejected": -1.961503028869629, "logps/chosen": -196.02743530273438, "logps/rejected": -259.9338073730469, "loss": 0.5248, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4385006427764893, "rewards/margins": 0.6568225622177124, "rewards/rejected": -2.095323085784912, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 27.890745162963867, "learning_rate": 4.403535326423507e-07, "logits/chosen": -1.976142168045044, "logits/rejected": -1.9262492656707764, "logps/chosen": -213.0786895751953, "logps/rejected": -269.7364501953125, "loss": 0.5812, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5587716102600098, "rewards/margins": 0.6320794224739075, "rewards/rejected": -2.1908509731292725, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 17.79314613342285, "learning_rate": 4.400282321484591e-07, "logits/chosen": -2.1180338859558105, "logits/rejected": -2.0819621086120605, "logps/chosen": -193.53594970703125, "logps/rejected": -248.2975616455078, "loss": 0.5829, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4066722393035889, "rewards/margins": 0.5712646245956421, "rewards/rejected": -1.97793710231781, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 24.169647216796875, "learning_rate": 4.3970216785442503e-07, "logits/chosen": -2.0658493041992188, "logits/rejected": -2.0197129249572754, "logps/chosen": -201.77073669433594, "logps/rejected": -250.7774658203125, "loss": 0.575, "rewards/accuracies": 0.6875, "rewards/chosen": -1.467287302017212, "rewards/margins": 0.5436084866523743, "rewards/rejected": -2.0108959674835205, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 23.597225189208984, "learning_rate": 4.393753410708324e-07, "logits/chosen": -2.0580217838287354, "logits/rejected": -2.0074574947357178, "logps/chosen": -182.91940307617188, "logps/rejected": -236.07595825195312, "loss": 0.5594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2588675022125244, "rewards/margins": 0.5918609499931335, "rewards/rejected": -1.8507282733917236, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 24.949676513671875, "learning_rate": 4.390477531113299e-07, "logits/chosen": -2.0552399158477783, "logits/rejected": -2.019660472869873, "logps/chosen": -192.0709991455078, "logps/rejected": -250.5839385986328, "loss": 0.5423, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3520668745040894, "rewards/margins": 0.6095935702323914, "rewards/rejected": -1.961660623550415, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 23.743989944458008, "learning_rate": 4.3871940529262586e-07, "logits/chosen": -2.096210241317749, "logits/rejected": -2.058560848236084, "logps/chosen": -214.4279327392578, "logps/rejected": -268.00299072265625, "loss": 0.5619, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.552255630493164, "rewards/margins": 0.6108680963516235, "rewards/rejected": -2.163123607635498, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 29.332298278808594, "learning_rate": 4.3839029893448255e-07, "logits/chosen": -2.0367541313171387, "logits/rejected": -1.974304437637329, "logps/chosen": -232.62893676757812, "logps/rejected": -285.0311279296875, "loss": 0.5797, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7374168634414673, "rewards/margins": 0.6230419874191284, "rewards/rejected": -2.3604588508605957, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 21.769271850585938, "learning_rate": 4.3806043535971116e-07, "logits/chosen": -2.001335382461548, "logits/rejected": -1.968015432357788, "logps/chosen": -211.955810546875, "logps/rejected": -298.8793029785156, "loss": 0.5062, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6066347360610962, "rewards/margins": 0.8615700602531433, "rewards/rejected": -2.4682047367095947, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 26.31940460205078, "learning_rate": 4.377298158941666e-07, "logits/chosen": -2.0301687717437744, "logits/rejected": -1.9788501262664795, "logps/chosen": -217.60464477539062, "logps/rejected": -272.79046630859375, "loss": 0.5414, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5948331356048584, "rewards/margins": 0.6235058903694153, "rewards/rejected": -2.218338966369629, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 16.695066452026367, "learning_rate": 4.3739844186674186e-07, "logits/chosen": -2.0444159507751465, "logits/rejected": -2.0111351013183594, "logps/chosen": -202.3340301513672, "logps/rejected": -249.8513946533203, "loss": 0.5996, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4879268407821655, "rewards/margins": 0.5023617148399353, "rewards/rejected": -1.990288496017456, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 20.44490623474121, "learning_rate": 4.370663146093628e-07, "logits/chosen": -2.114773988723755, "logits/rejected": -2.0791077613830566, "logps/chosen": -185.92677307128906, "logps/rejected": -251.1392822265625, "loss": 0.5587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3128293752670288, "rewards/margins": 0.6386434435844421, "rewards/rejected": -1.9514728784561157, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 31.61473846435547, "learning_rate": 4.3673343545698316e-07, "logits/chosen": -2.0283827781677246, "logits/rejected": -1.9892237186431885, "logps/chosen": -205.3874969482422, "logps/rejected": -269.06243896484375, "loss": 0.5403, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5163631439208984, "rewards/margins": 0.6468100547790527, "rewards/rejected": -2.163173198699951, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 21.59777069091797, "learning_rate": 4.363998057475783e-07, "logits/chosen": -2.0812506675720215, "logits/rejected": -2.042513132095337, "logps/chosen": -215.59799194335938, "logps/rejected": -265.55987548828125, "loss": 0.5563, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5649205446243286, "rewards/margins": 0.5848304033279419, "rewards/rejected": -2.1497511863708496, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 25.48356819152832, "learning_rate": 4.3606542682214065e-07, "logits/chosen": -1.9818401336669922, "logits/rejected": -1.9325300455093384, "logps/chosen": -233.2616424560547, "logps/rejected": -285.22955322265625, "loss": 0.5791, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7448337078094482, "rewards/margins": 0.6011096239089966, "rewards/rejected": -2.345942974090576, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 28.81311798095703, "learning_rate": 4.3573030002467415e-07, "logits/chosen": -1.999366044998169, "logits/rejected": -1.958042860031128, "logps/chosen": -210.5235137939453, "logps/rejected": -284.1165771484375, "loss": 0.5264, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5865437984466553, "rewards/margins": 0.7212013006210327, "rewards/rejected": -2.3077452182769775, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 39.472286224365234, "learning_rate": 4.353944267021886e-07, "logits/chosen": -2.0250706672668457, "logits/rejected": -2.002026081085205, "logps/chosen": -226.7211151123047, "logps/rejected": -265.8958740234375, "loss": 0.6087, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6864761114120483, "rewards/margins": 0.43023666739463806, "rewards/rejected": -2.116712808609009, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 26.32221221923828, "learning_rate": 4.350578082046944e-07, "logits/chosen": -2.0823471546173096, "logits/rejected": -2.0464425086975098, "logps/chosen": -208.64453125, "logps/rejected": -265.73846435546875, "loss": 0.5723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.53423273563385, "rewards/margins": 0.5952261686325073, "rewards/rejected": -2.1294589042663574, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 23.36039161682129, "learning_rate": 4.3472044588519707e-07, "logits/chosen": -2.0640034675598145, "logits/rejected": -2.033115863800049, "logps/chosen": -203.04269409179688, "logps/rejected": -256.3262939453125, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4885218143463135, "rewards/margins": 0.5672802925109863, "rewards/rejected": -2.0558021068573, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 21.861549377441406, "learning_rate": 4.3438234109969194e-07, "logits/chosen": -2.050621271133423, "logits/rejected": -2.0160679817199707, "logps/chosen": -194.39541625976562, "logps/rejected": -255.40084838867188, "loss": 0.5434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3782681226730347, "rewards/margins": 0.6317645907402039, "rewards/rejected": -2.010032892227173, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 39.127403259277344, "learning_rate": 4.340434952071586e-07, "logits/chosen": -2.001739263534546, "logits/rejected": -1.9711672067642212, "logps/chosen": -209.41732788085938, "logps/rejected": -263.2030029296875, "loss": 0.575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5258095264434814, "rewards/margins": 0.5785327553749084, "rewards/rejected": -2.104342222213745, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 26.060443878173828, "learning_rate": 4.337039095695554e-07, "logits/chosen": -1.992112398147583, "logits/rejected": -1.962459921836853, "logps/chosen": -204.56369018554688, "logps/rejected": -246.32034301757812, "loss": 0.6136, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5006986856460571, "rewards/margins": 0.44974803924560547, "rewards/rejected": -1.9504464864730835, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 16.457931518554688, "learning_rate": 4.3336358555181395e-07, "logits/chosen": -2.041752576828003, "logits/rejected": -2.018385648727417, "logps/chosen": -182.25868225097656, "logps/rejected": -225.34017944335938, "loss": 0.5971, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3010756969451904, "rewards/margins": 0.45102375745773315, "rewards/rejected": -1.752099633216858, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 23.21503257751465, "learning_rate": 4.3302252452183396e-07, "logits/chosen": -2.0474092960357666, "logits/rejected": -2.005073070526123, "logps/chosen": -182.69290161132812, "logps/rejected": -245.06167602539062, "loss": 0.5286, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2817127704620361, "rewards/margins": 0.659805178642273, "rewards/rejected": -1.9415180683135986, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 17.282848358154297, "learning_rate": 4.3268072785047727e-07, "logits/chosen": -2.0035064220428467, "logits/rejected": -1.9655128717422485, "logps/chosen": -202.6716766357422, "logps/rejected": -244.6951446533203, "loss": 0.5892, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.458282232284546, "rewards/margins": 0.47887173295021057, "rewards/rejected": -1.9371538162231445, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 27.68363380432129, "learning_rate": 4.323381969115626e-07, "logits/chosen": -2.0546698570251465, "logits/rejected": -2.0241079330444336, "logps/chosen": -216.05313110351562, "logps/rejected": -280.0194396972656, "loss": 0.5358, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6138684749603271, "rewards/margins": 0.6450786590576172, "rewards/rejected": -2.2589471340179443, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 31.15981674194336, "learning_rate": 4.3199493308185996e-07, "logits/chosen": -1.9570415019989014, "logits/rejected": -1.928706407546997, "logps/chosen": -258.3089904785156, "logps/rejected": -313.845947265625, "loss": 0.6018, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0466341972351074, "rewards/margins": 0.5670737624168396, "rewards/rejected": -2.6137077808380127, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 30.720592498779297, "learning_rate": 4.3165093774108526e-07, "logits/chosen": -1.9494514465332031, "logits/rejected": -1.920331597328186, "logps/chosen": -251.87924194335938, "logps/rejected": -308.6024169921875, "loss": 0.5759, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9401447772979736, "rewards/margins": 0.637392520904541, "rewards/rejected": -2.5775375366210938, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 23.633731842041016, "learning_rate": 4.313062122718945e-07, "logits/chosen": -2.0031421184539795, "logits/rejected": -1.9585685729980469, "logps/chosen": -238.923583984375, "logps/rejected": -292.52691650390625, "loss": 0.548, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7777411937713623, "rewards/margins": 0.6342819333076477, "rewards/rejected": -2.4120230674743652, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 25.11859703063965, "learning_rate": 4.3096075805987854e-07, "logits/chosen": -2.0395348072052, "logits/rejected": -2.0156760215759277, "logps/chosen": -206.6995086669922, "logps/rejected": -264.9061584472656, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5293447971343994, "rewards/margins": 0.5622383952140808, "rewards/rejected": -2.091583013534546, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 22.497875213623047, "learning_rate": 4.3061457649355725e-07, "logits/chosen": -1.9762252569198608, "logits/rejected": -1.9511468410491943, "logps/chosen": -192.49325561523438, "logps/rejected": -226.85629272460938, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": -1.395403504371643, "rewards/margins": 0.33508509397506714, "rewards/rejected": -1.7304887771606445, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 21.76469612121582, "learning_rate": 4.3026766896437397e-07, "logits/chosen": -2.052302122116089, "logits/rejected": -2.0147347450256348, "logps/chosen": -163.8451385498047, "logps/rejected": -208.0031280517578, "loss": 0.5685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1079504489898682, "rewards/margins": 0.4840013384819031, "rewards/rejected": -1.591951847076416, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 22.236814498901367, "learning_rate": 4.2992003686669e-07, "logits/chosen": -2.1132795810699463, "logits/rejected": -2.0932562351226807, "logps/chosen": -164.45407104492188, "logps/rejected": -208.7455596923828, "loss": 0.5941, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1252763271331787, "rewards/margins": 0.40155482292175293, "rewards/rejected": -1.5268311500549316, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 20.76388931274414, "learning_rate": 4.2957168159777906e-07, "logits/chosen": -2.0340957641601562, "logits/rejected": -1.9989385604858398, "logps/chosen": -173.22402954101562, "logps/rejected": -231.7960662841797, "loss": 0.5281, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.190632939338684, "rewards/margins": 0.5938134789466858, "rewards/rejected": -1.7844464778900146, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 42.83650588989258, "learning_rate": 4.292226045578216e-07, "logits/chosen": -2.0228583812713623, "logits/rejected": -1.9758615493774414, "logps/chosen": -223.142578125, "logps/rejected": -279.0724182128906, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6596895456314087, "rewards/margins": 0.5846013426780701, "rewards/rejected": -2.244290828704834, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 26.451879501342773, "learning_rate": 4.28872807149899e-07, "logits/chosen": -2.041311264038086, "logits/rejected": -2.0140676498413086, "logps/chosen": -207.6523895263672, "logps/rejected": -275.0325012207031, "loss": 0.5465, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5448793172836304, "rewards/margins": 0.6417932510375977, "rewards/rejected": -2.1866726875305176, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.087179183959961, "eval_logits/rejected": -2.066899538040161, "eval_logps/chosen": -186.70709228515625, "eval_logps/rejected": -217.74668884277344, "eval_loss": 0.6324964761734009, "eval_rewards/accuracies": 0.6370818018913269, "eval_rewards/chosen": -1.2769159078598022, "eval_rewards/margins": 0.27305492758750916, "eval_rewards/rejected": -1.5499709844589233, "eval_runtime": 361.3676, "eval_samples_per_second": 11.91, "eval_steps_per_second": 1.489, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 25.95531463623047, "learning_rate": 4.285222907799883e-07, "logits/chosen": -2.0479652881622314, "logits/rejected": -2.013390064239502, "logps/chosen": -203.67628479003906, "logps/rejected": -271.48565673828125, "loss": 0.5373, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5083404779434204, "rewards/margins": 0.652506947517395, "rewards/rejected": -2.1608471870422363, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 35.70238494873047, "learning_rate": 4.2817105685695617e-07, "logits/chosen": -1.9710603952407837, "logits/rejected": -1.938661813735962, "logps/chosen": -214.04348754882812, "logps/rejected": -261.2383728027344, "loss": 0.5831, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5861331224441528, "rewards/margins": 0.5239273309707642, "rewards/rejected": -2.110060453414917, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 23.401948928833008, "learning_rate": 4.278191067925534e-07, "logits/chosen": -2.069118022918701, "logits/rejected": -2.030264377593994, "logps/chosen": -195.80950927734375, "logps/rejected": -235.580078125, "loss": 0.6037, "rewards/accuracies": 0.6875, "rewards/chosen": -1.390516996383667, "rewards/margins": 0.45987266302108765, "rewards/rejected": -1.8503894805908203, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 20.85120964050293, "learning_rate": 4.2746644200140937e-07, "logits/chosen": -2.0919713973999023, "logits/rejected": -2.062824249267578, "logps/chosen": -156.51181030273438, "logps/rejected": -208.86196899414062, "loss": 0.5627, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0121644735336304, "rewards/margins": 0.5226696729660034, "rewards/rejected": -1.5348341464996338, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 13.499677658081055, "learning_rate": 4.271130639010262e-07, "logits/chosen": -2.0077853202819824, "logits/rejected": -1.9664016962051392, "logps/chosen": -177.82154846191406, "logps/rejected": -233.50296020507812, "loss": 0.5642, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2604433298110962, "rewards/margins": 0.5619374513626099, "rewards/rejected": -1.822380781173706, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 21.704248428344727, "learning_rate": 4.267589739117731e-07, "logits/chosen": -1.9905760288238525, "logits/rejected": -1.954175591468811, "logps/chosen": -202.77235412597656, "logps/rejected": -268.557373046875, "loss": 0.5332, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5065020322799683, "rewards/margins": 0.6797186136245728, "rewards/rejected": -2.186220645904541, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 21.946508407592773, "learning_rate": 4.264041734568805e-07, "logits/chosen": -1.944946527481079, "logits/rejected": -1.9078090190887451, "logps/chosen": -222.34872436523438, "logps/rejected": -287.39019775390625, "loss": 0.5225, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6697494983673096, "rewards/margins": 0.6833610534667969, "rewards/rejected": -2.3531107902526855, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 39.278507232666016, "learning_rate": 4.260486639624347e-07, "logits/chosen": -1.919394850730896, "logits/rejected": -1.8745434284210205, "logps/chosen": -259.0929870605469, "logps/rejected": -321.0891418457031, "loss": 0.5696, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0399715900421143, "rewards/margins": 0.6615440249443054, "rewards/rejected": -2.7015154361724854, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 18.62718391418457, "learning_rate": 4.256924468573717e-07, "logits/chosen": -2.0100674629211426, "logits/rejected": -1.9515002965927124, "logps/chosen": -218.5240936279297, "logps/rejected": -285.1241149902344, "loss": 0.5104, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5850409269332886, "rewards/margins": 0.7582043409347534, "rewards/rejected": -2.343245267868042, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 34.81496810913086, "learning_rate": 4.253355235734719e-07, "logits/chosen": -1.9763332605361938, "logits/rejected": -1.943891167640686, "logps/chosen": -190.7355499267578, "logps/rejected": -246.2216339111328, "loss": 0.5644, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3267838954925537, "rewards/margins": 0.6045454740524292, "rewards/rejected": -1.931329369544983, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 18.815074920654297, "learning_rate": 4.2497789554535393e-07, "logits/chosen": -2.103501081466675, "logits/rejected": -2.0706424713134766, "logps/chosen": -178.41119384765625, "logps/rejected": -231.51028442382812, "loss": 0.5652, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2224504947662354, "rewards/margins": 0.5746386051177979, "rewards/rejected": -1.7970892190933228, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 21.395015716552734, "learning_rate": 4.2461956421046917e-07, "logits/chosen": -2.0665366649627686, "logits/rejected": -2.0214967727661133, "logps/chosen": -160.29750061035156, "logps/rejected": -218.61416625976562, "loss": 0.5264, "rewards/accuracies": 0.75, "rewards/chosen": -1.0673502683639526, "rewards/margins": 0.634096086025238, "rewards/rejected": -1.701446533203125, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 32.33345413208008, "learning_rate": 4.2426053100909574e-07, "logits/chosen": -2.093170642852783, "logits/rejected": -2.064652442932129, "logps/chosen": -189.04232788085938, "logps/rejected": -240.9623260498047, "loss": 0.5973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3615580797195435, "rewards/margins": 0.5365954637527466, "rewards/rejected": -1.8981536626815796, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 20.025728225708008, "learning_rate": 4.2390079738433316e-07, "logits/chosen": -2.0255069732666016, "logits/rejected": -1.9607988595962524, "logps/chosen": -203.14671325683594, "logps/rejected": -293.372802734375, "loss": 0.4694, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4350515604019165, "rewards/margins": 0.9654629826545715, "rewards/rejected": -2.4005141258239746, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 23.066410064697266, "learning_rate": 4.235403647820958e-07, "logits/chosen": -2.0593209266662598, "logits/rejected": -2.033235549926758, "logps/chosen": -209.4295196533203, "logps/rejected": -240.8780059814453, "loss": 0.642, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5121194124221802, "rewards/margins": 0.39781758189201355, "rewards/rejected": -1.9099371433258057, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 17.3566837310791, "learning_rate": 4.2317923465110786e-07, "logits/chosen": -2.0864696502685547, "logits/rejected": -2.038264513015747, "logps/chosen": -184.2631072998047, "logps/rejected": -243.1116485595703, "loss": 0.5412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2722245454788208, "rewards/margins": 0.611478865146637, "rewards/rejected": -1.8837032318115234, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 21.723268508911133, "learning_rate": 4.2281740844289706e-07, "logits/chosen": -1.9943828582763672, "logits/rejected": -1.9467099905014038, "logps/chosen": -189.71707153320312, "logps/rejected": -254.4718475341797, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3700774908065796, "rewards/margins": 0.6413481831550598, "rewards/rejected": -2.011425495147705, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 38.848876953125, "learning_rate": 4.2245488761178884e-07, "logits/chosen": -1.9788990020751953, "logits/rejected": -1.9352576732635498, "logps/chosen": -216.1654815673828, "logps/rejected": -284.24554443359375, "loss": 0.5554, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6271826028823853, "rewards/margins": 0.6832169890403748, "rewards/rejected": -2.3103995323181152, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 20.532377243041992, "learning_rate": 4.2209167361490073e-07, "logits/chosen": -1.8998435735702515, "logits/rejected": -1.8462488651275635, "logps/chosen": -232.8983917236328, "logps/rejected": -304.81396484375, "loss": 0.5442, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8157646656036377, "rewards/margins": 0.6938422918319702, "rewards/rejected": -2.5096068382263184, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 19.26974868774414, "learning_rate": 4.217277679121364e-07, "logits/chosen": -2.020625114440918, "logits/rejected": -1.968883752822876, "logps/chosen": -232.93106079101562, "logps/rejected": -311.3260803222656, "loss": 0.5352, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8115390539169312, "rewards/margins": 0.7903367280960083, "rewards/rejected": -2.6018757820129395, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 16.623146057128906, "learning_rate": 4.2136317196617964e-07, "logits/chosen": -2.0192911624908447, "logits/rejected": -1.971780776977539, "logps/chosen": -217.9697723388672, "logps/rejected": -298.9036560058594, "loss": 0.5051, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6251914501190186, "rewards/margins": 0.8331044316291809, "rewards/rejected": -2.458296060562134, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 23.782194137573242, "learning_rate": 4.2099788724248863e-07, "logits/chosen": -2.0052778720855713, "logits/rejected": -1.937267541885376, "logps/chosen": -221.1989288330078, "logps/rejected": -316.5149230957031, "loss": 0.4327, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6724674701690674, "rewards/margins": 1.006546139717102, "rewards/rejected": -2.67901349067688, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 35.32204818725586, "learning_rate": 4.2063191520929023e-07, "logits/chosen": -1.8941318988800049, "logits/rejected": -1.8446038961410522, "logps/chosen": -249.76663208007812, "logps/rejected": -338.6812438964844, "loss": 0.5195, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.950754165649414, "rewards/margins": 0.9204131960868835, "rewards/rejected": -2.8711674213409424, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 14.736563682556152, "learning_rate": 4.2026525733757364e-07, "logits/chosen": -1.9705533981323242, "logits/rejected": -1.926038384437561, "logps/chosen": -216.8916473388672, "logps/rejected": -300.7476501464844, "loss": 0.5103, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6354926824569702, "rewards/margins": 0.8343148231506348, "rewards/rejected": -2.4698073863983154, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 28.668010711669922, "learning_rate": 4.1989791510108475e-07, "logits/chosen": -2.0323872566223145, "logits/rejected": -1.9819841384887695, "logps/chosen": -180.29989624023438, "logps/rejected": -270.27593994140625, "loss": 0.4634, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.2870458364486694, "rewards/margins": 0.8800476789474487, "rewards/rejected": -2.167093515396118, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 25.484766006469727, "learning_rate": 4.195298899763202e-07, "logits/chosen": -1.9165560007095337, "logits/rejected": -1.865731954574585, "logps/chosen": -200.59585571289062, "logps/rejected": -297.9922790527344, "loss": 0.4796, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4746431112289429, "rewards/margins": 0.9892441034317017, "rewards/rejected": -2.4638872146606445, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 27.089738845825195, "learning_rate": 4.191611834425216e-07, "logits/chosen": -1.9268524646759033, "logits/rejected": -1.8713953495025635, "logps/chosen": -232.2283172607422, "logps/rejected": -324.1127624511719, "loss": 0.4791, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7567579746246338, "rewards/margins": 0.9463886022567749, "rewards/rejected": -2.7031466960906982, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 27.151538848876953, "learning_rate": 4.187917969816692e-07, "logits/chosen": -1.9690574407577515, "logits/rejected": -1.9225845336914062, "logps/chosen": -205.36972045898438, "logps/rejected": -288.64508056640625, "loss": 0.4976, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4972679615020752, "rewards/margins": 0.8655202984809875, "rewards/rejected": -2.362787961959839, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 27.642696380615234, "learning_rate": 4.184217320784762e-07, "logits/chosen": -2.002453327178955, "logits/rejected": -1.9604202508926392, "logps/chosen": -198.0019989013672, "logps/rejected": -275.18670654296875, "loss": 0.5315, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4161829948425293, "rewards/margins": 0.7640682458877563, "rewards/rejected": -2.180251359939575, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 25.530738830566406, "learning_rate": 4.180509902203829e-07, "logits/chosen": -1.9071022272109985, "logits/rejected": -1.8538618087768555, "logps/chosen": -206.09268188476562, "logps/rejected": -300.14508056640625, "loss": 0.4484, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4945026636123657, "rewards/margins": 0.9738343954086304, "rewards/rejected": -2.468336820602417, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 29.373737335205078, "learning_rate": 4.1767957289755026e-07, "logits/chosen": -1.8833202123641968, "logits/rejected": -1.8462626934051514, "logps/chosen": -222.30679321289062, "logps/rejected": -312.62261962890625, "loss": 0.5243, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6928943395614624, "rewards/margins": 0.8663195371627808, "rewards/rejected": -2.5592141151428223, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 34.13895034790039, "learning_rate": 4.1730748160285446e-07, "logits/chosen": -1.8065992593765259, "logits/rejected": -1.759703278541565, "logps/chosen": -259.6125183105469, "logps/rejected": -339.39849853515625, "loss": 0.564, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0781290531158447, "rewards/margins": 0.7895197868347168, "rewards/rejected": -2.8676493167877197, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 25.93062400817871, "learning_rate": 4.169347178318806e-07, "logits/chosen": -1.9100964069366455, "logits/rejected": -1.872892141342163, "logps/chosen": -237.0955047607422, "logps/rejected": -315.92730712890625, "loss": 0.5785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8648557662963867, "rewards/margins": 0.790349006652832, "rewards/rejected": -2.655205249786377, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 33.706565856933594, "learning_rate": 4.165612830829166e-07, "logits/chosen": -1.9038887023925781, "logits/rejected": -1.8542802333831787, "logps/chosen": -199.47853088378906, "logps/rejected": -277.88446044921875, "loss": 0.5171, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.444139003753662, "rewards/margins": 0.8106091618537903, "rewards/rejected": -2.2547481060028076, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 26.05793571472168, "learning_rate": 4.161871788569474e-07, "logits/chosen": -1.8957666158676147, "logits/rejected": -1.858064889907837, "logps/chosen": -200.43679809570312, "logps/rejected": -283.1724548339844, "loss": 0.4862, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.4524383544921875, "rewards/margins": 0.8502519726753235, "rewards/rejected": -2.3026905059814453, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 19.349016189575195, "learning_rate": 4.1581240665764894e-07, "logits/chosen": -1.885359764099121, "logits/rejected": -1.830887794494629, "logps/chosen": -229.05892944335938, "logps/rejected": -286.2938232421875, "loss": 0.5594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7035707235336304, "rewards/margins": 0.6521943807601929, "rewards/rejected": -2.3557651042938232, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 23.10702896118164, "learning_rate": 4.154369679913818e-07, "logits/chosen": -1.8941190242767334, "logits/rejected": -1.8452653884887695, "logps/chosen": -235.5084228515625, "logps/rejected": -319.9417419433594, "loss": 0.4943, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.824506402015686, "rewards/margins": 0.8811212778091431, "rewards/rejected": -2.70562744140625, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 35.37862777709961, "learning_rate": 4.1506086436718547e-07, "logits/chosen": -1.875645637512207, "logits/rejected": -1.8346000909805298, "logps/chosen": -235.96090698242188, "logps/rejected": -333.5318908691406, "loss": 0.4972, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8220031261444092, "rewards/margins": 0.954565703868866, "rewards/rejected": -2.77656888961792, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 35.280948638916016, "learning_rate": 4.146840972967723e-07, "logits/chosen": -1.9312973022460938, "logits/rejected": -1.8685839176177979, "logps/chosen": -244.7843017578125, "logps/rejected": -330.3288879394531, "loss": 0.4898, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8921781778335571, "rewards/margins": 0.899877667427063, "rewards/rejected": -2.792055606842041, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 40.39186477661133, "learning_rate": 4.14306668294521e-07, "logits/chosen": -1.8580873012542725, "logits/rejected": -1.8043105602264404, "logps/chosen": -276.5140380859375, "logps/rejected": -390.03179931640625, "loss": 0.4632, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2415695190429688, "rewards/margins": 1.1483030319213867, "rewards/rejected": -3.3898727893829346, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -1.9125378131866455, "eval_logits/rejected": -1.8835746049880981, "eval_logps/chosen": -277.2339172363281, "eval_logps/rejected": -326.78759765625, "eval_loss": 0.6484220623970032, "eval_rewards/accuracies": 0.6496282815933228, "eval_rewards/chosen": -2.1821844577789307, "eval_rewards/margins": 0.45819583535194397, "eval_rewards/rejected": -2.640380382537842, "eval_runtime": 360.932, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 34.88210678100586, "learning_rate": 4.139285788774712e-07, "logits/chosen": -1.881900429725647, "logits/rejected": -1.8421787023544312, "logps/chosen": -255.9016876220703, "logps/rejected": -341.322265625, "loss": 0.5587, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.020561933517456, "rewards/margins": 0.8898151516914368, "rewards/rejected": -2.910377264022827, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 21.118213653564453, "learning_rate": 4.1354983056531674e-07, "logits/chosen": -2.041095018386841, "logits/rejected": -1.9816436767578125, "logps/chosen": -204.9003448486328, "logps/rejected": -285.4656066894531, "loss": 0.4943, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4904783964157104, "rewards/margins": 0.8757421374320984, "rewards/rejected": -2.366220474243164, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 20.85739517211914, "learning_rate": 4.131704248803999e-07, "logits/chosen": -1.8389513492584229, "logits/rejected": -1.7913305759429932, "logps/chosen": -201.35513305664062, "logps/rejected": -277.56787109375, "loss": 0.536, "rewards/accuracies": 0.75, "rewards/chosen": -1.5110610723495483, "rewards/margins": 0.7651222944259644, "rewards/rejected": -2.276183605194092, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 28.334020614624023, "learning_rate": 4.1279036334770525e-07, "logits/chosen": -1.9902660846710205, "logits/rejected": -1.9484964609146118, "logps/chosen": -186.19017028808594, "logps/rejected": -252.3775177001953, "loss": 0.5252, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.320743203163147, "rewards/margins": 0.6969123482704163, "rewards/rejected": -2.017655849456787, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 23.768043518066406, "learning_rate": 4.124096474948534e-07, "logits/chosen": -1.910634994506836, "logits/rejected": -1.8590805530548096, "logps/chosen": -229.1404571533203, "logps/rejected": -309.28155517578125, "loss": 0.4828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.687110185623169, "rewards/margins": 0.8855695724487305, "rewards/rejected": -2.5726795196533203, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 48.89802932739258, "learning_rate": 4.12028278852095e-07, "logits/chosen": -1.8629939556121826, "logits/rejected": -1.8155031204223633, "logps/chosen": -267.139404296875, "logps/rejected": -354.20782470703125, "loss": 0.5426, "rewards/accuracies": 0.78125, "rewards/chosen": -2.130652904510498, "rewards/margins": 0.8984590768814087, "rewards/rejected": -3.0291123390197754, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 50.099334716796875, "learning_rate": 4.1164625895230457e-07, "logits/chosen": -1.8457444906234741, "logits/rejected": -1.78597891330719, "logps/chosen": -296.71112060546875, "logps/rejected": -400.134765625, "loss": 0.5085, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.4173965454101562, "rewards/margins": 1.0742360353469849, "rewards/rejected": -3.4916324615478516, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 26.530786514282227, "learning_rate": 4.1126358933097425e-07, "logits/chosen": -1.8958507776260376, "logits/rejected": -1.8499376773834229, "logps/chosen": -282.85675048828125, "logps/rejected": -368.38714599609375, "loss": 0.5211, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2249388694763184, "rewards/margins": 0.9374469518661499, "rewards/rejected": -3.162385940551758, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 22.645334243774414, "learning_rate": 4.1088027152620753e-07, "logits/chosen": -1.9771652221679688, "logits/rejected": -1.9213783740997314, "logps/chosen": -226.96694946289062, "logps/rejected": -323.4267272949219, "loss": 0.4675, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7469186782836914, "rewards/margins": 0.9808128476142883, "rewards/rejected": -2.727731227874756, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 21.984912872314453, "learning_rate": 4.104963070787134e-07, "logits/chosen": -1.9915088415145874, "logits/rejected": -1.9385402202606201, "logps/chosen": -211.1168212890625, "logps/rejected": -314.689697265625, "loss": 0.449, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5292876958847046, "rewards/margins": 1.0903791189193726, "rewards/rejected": -2.619666576385498, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 25.840604782104492, "learning_rate": 4.101116975318e-07, "logits/chosen": -1.9235185384750366, "logits/rejected": -1.8710438013076782, "logps/chosen": -243.2189178466797, "logps/rejected": -341.24774169921875, "loss": 0.4824, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9068129062652588, "rewards/margins": 0.9739784002304077, "rewards/rejected": -2.880791187286377, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 42.950923919677734, "learning_rate": 4.0972644443136807e-07, "logits/chosen": -1.9765655994415283, "logits/rejected": -1.8978198766708374, "logps/chosen": -262.1171875, "logps/rejected": -389.03509521484375, "loss": 0.4465, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.066864252090454, "rewards/margins": 1.3010509014129639, "rewards/rejected": -3.3679146766662598, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 17.42161750793457, "learning_rate": 4.093405493259056e-07, "logits/chosen": -1.9397681951522827, "logits/rejected": -1.8816808462142944, "logps/chosen": -238.67080688476562, "logps/rejected": -342.2619934082031, "loss": 0.444, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8331947326660156, "rewards/margins": 1.0562341213226318, "rewards/rejected": -2.8894283771514893, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 30.309040069580078, "learning_rate": 4.089540137664803e-07, "logits/chosen": -1.984863519668579, "logits/rejected": -1.9349193572998047, "logps/chosen": -209.4086456298828, "logps/rejected": -290.91253662109375, "loss": 0.5023, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5570614337921143, "rewards/margins": 0.8362207412719727, "rewards/rejected": -2.393281936645508, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 24.917699813842773, "learning_rate": 4.0856683930673496e-07, "logits/chosen": -1.945174217224121, "logits/rejected": -1.8966201543807983, "logps/chosen": -218.4195556640625, "logps/rejected": -304.27972412109375, "loss": 0.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6490360498428345, "rewards/margins": 0.9044731259346008, "rewards/rejected": -2.55350923538208, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 33.783241271972656, "learning_rate": 4.0817902750287977e-07, "logits/chosen": -1.8973767757415771, "logits/rejected": -1.8591804504394531, "logps/chosen": -227.42361450195312, "logps/rejected": -300.1002502441406, "loss": 0.5577, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7499326467514038, "rewards/margins": 0.7326033711433411, "rewards/rejected": -2.4825360774993896, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 23.68409538269043, "learning_rate": 4.0779057991368683e-07, "logits/chosen": -1.992837905883789, "logits/rejected": -1.9425159692764282, "logps/chosen": -237.0723114013672, "logps/rejected": -336.44036865234375, "loss": 0.4804, "rewards/accuracies": 0.75, "rewards/chosen": -1.7719281911849976, "rewards/margins": 1.0313085317611694, "rewards/rejected": -2.803236484527588, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 22.278350830078125, "learning_rate": 4.074014981004839e-07, "logits/chosen": -1.9917291402816772, "logits/rejected": -1.9516489505767822, "logps/chosen": -227.7581329345703, "logps/rejected": -292.923583984375, "loss": 0.5421, "rewards/accuracies": 0.75, "rewards/chosen": -1.6847959756851196, "rewards/margins": 0.7221981287002563, "rewards/rejected": -2.406994104385376, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 24.879804611206055, "learning_rate": 4.0701178362714753e-07, "logits/chosen": -1.989490270614624, "logits/rejected": -1.93972647190094, "logps/chosen": -232.4105682373047, "logps/rejected": -316.6881408691406, "loss": 0.4705, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7252111434936523, "rewards/margins": 0.9199700355529785, "rewards/rejected": -2.645181179046631, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 29.970762252807617, "learning_rate": 4.066214380600976e-07, "logits/chosen": -1.873096227645874, "logits/rejected": -1.8117682933807373, "logps/chosen": -244.5754852294922, "logps/rejected": -345.7333984375, "loss": 0.4937, "rewards/accuracies": 0.78125, "rewards/chosen": -1.891631841659546, "rewards/margins": 1.0226860046386719, "rewards/rejected": -2.9143178462982178, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 42.00307846069336, "learning_rate": 4.0623046296829057e-07, "logits/chosen": -1.939247488975525, "logits/rejected": -1.8960247039794922, "logps/chosen": -245.89212036132812, "logps/rejected": -323.60064697265625, "loss": 0.5127, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.895512342453003, "rewards/margins": 0.8126155138015747, "rewards/rejected": -2.708127975463867, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 33.78615188598633, "learning_rate": 4.058388599232129e-07, "logits/chosen": -1.9505269527435303, "logits/rejected": -1.9098577499389648, "logps/chosen": -228.78500366210938, "logps/rejected": -304.4464111328125, "loss": 0.5507, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.756371259689331, "rewards/margins": 0.7719415426254272, "rewards/rejected": -2.528313159942627, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 35.60861587524414, "learning_rate": 4.0544663049887517e-07, "logits/chosen": -2.024252414703369, "logits/rejected": -1.9508203268051147, "logps/chosen": -230.6692352294922, "logps/rejected": -352.4142761230469, "loss": 0.4317, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7003017663955688, "rewards/margins": 1.268700361251831, "rewards/rejected": -2.9690022468566895, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 44.08332061767578, "learning_rate": 4.05053776271806e-07, "logits/chosen": -1.8323113918304443, "logits/rejected": -1.7822949886322021, "logps/chosen": -283.6968994140625, "logps/rejected": -374.8121032714844, "loss": 0.5501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2674195766448975, "rewards/margins": 0.9504927396774292, "rewards/rejected": -3.217912197113037, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 19.4212589263916, "learning_rate": 4.046602988210448e-07, "logits/chosen": -1.9309933185577393, "logits/rejected": -1.877638816833496, "logps/chosen": -251.753173828125, "logps/rejected": -329.91107177734375, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9484363794326782, "rewards/margins": 0.8187299966812134, "rewards/rejected": -2.7671661376953125, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 40.873374938964844, "learning_rate": 4.0426619972813634e-07, "logits/chosen": -1.8882240056991577, "logits/rejected": -1.8520675897598267, "logps/chosen": -213.1721954345703, "logps/rejected": -264.79302978515625, "loss": 0.5734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5789695978164673, "rewards/margins": 0.5627680420875549, "rewards/rejected": -2.141737461090088, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 18.4406795501709, "learning_rate": 4.0387148057712383e-07, "logits/chosen": -2.098450183868408, "logits/rejected": -2.0737686157226562, "logps/chosen": -197.15431213378906, "logps/rejected": -274.11212158203125, "loss": 0.5356, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.434789776802063, "rewards/margins": 0.7662370800971985, "rewards/rejected": -2.2010269165039062, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 25.81861686706543, "learning_rate": 4.034761429545428e-07, "logits/chosen": -1.8749046325683594, "logits/rejected": -1.837730050086975, "logps/chosen": -202.4122314453125, "logps/rejected": -265.70361328125, "loss": 0.5277, "rewards/accuracies": 0.75, "rewards/chosen": -1.4752110242843628, "rewards/margins": 0.6643983125686646, "rewards/rejected": -2.1396093368530273, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 24.857681274414062, "learning_rate": 4.030801884494147e-07, "logits/chosen": -1.9082505702972412, "logits/rejected": -1.8685214519500732, "logps/chosen": -217.2801513671875, "logps/rejected": -290.1829528808594, "loss": 0.5048, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6506789922714233, "rewards/margins": 0.7392069697380066, "rewards/rejected": -2.3898861408233643, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 26.95937156677246, "learning_rate": 4.0268361865324054e-07, "logits/chosen": -1.9121443033218384, "logits/rejected": -1.8716628551483154, "logps/chosen": -233.95718383789062, "logps/rejected": -308.82855224609375, "loss": 0.5175, "rewards/accuracies": 0.71875, "rewards/chosen": -1.802476167678833, "rewards/margins": 0.7767321467399597, "rewards/rejected": -2.5792081356048584, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 27.7519474029541, "learning_rate": 4.022864351599943e-07, "logits/chosen": -1.9166914224624634, "logits/rejected": -1.8662357330322266, "logps/chosen": -215.0142364501953, "logps/rejected": -290.4009094238281, "loss": 0.5198, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6318359375, "rewards/margins": 0.7610210180282593, "rewards/rejected": -2.3928568363189697, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 20.872501373291016, "learning_rate": 4.018886395661166e-07, "logits/chosen": -1.8938758373260498, "logits/rejected": -1.8550622463226318, "logps/chosen": -215.02865600585938, "logps/rejected": -285.084716796875, "loss": 0.5424, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.595429539680481, "rewards/margins": 0.729217529296875, "rewards/rejected": -2.3246471881866455, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 20.782012939453125, "learning_rate": 4.014902334705085e-07, "logits/chosen": -1.930921196937561, "logits/rejected": -1.8778480291366577, "logps/chosen": -207.6751708984375, "logps/rejected": -301.84259033203125, "loss": 0.4573, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5282132625579834, "rewards/margins": 0.9717317819595337, "rewards/rejected": -2.4999451637268066, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 30.38066864013672, "learning_rate": 4.01091218474525e-07, "logits/chosen": -1.967708945274353, "logits/rejected": -1.9164838790893555, "logps/chosen": -225.5016632080078, "logps/rejected": -313.8369140625, "loss": 0.5028, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6835486888885498, "rewards/margins": 0.9299084544181824, "rewards/rejected": -2.613456964492798, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 35.5791130065918, "learning_rate": 4.0069159618196813e-07, "logits/chosen": -1.884433388710022, "logits/rejected": -1.845542311668396, "logps/chosen": -245.2696990966797, "logps/rejected": -335.843994140625, "loss": 0.4828, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8855760097503662, "rewards/margins": 0.9182368516921997, "rewards/rejected": -2.8038127422332764, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 32.387840270996094, "learning_rate": 4.002913681990813e-07, "logits/chosen": -1.8037582635879517, "logits/rejected": -1.7552263736724854, "logps/chosen": -263.691162109375, "logps/rejected": -383.96771240234375, "loss": 0.4592, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0901196002960205, "rewards/margins": 1.2108937501907349, "rewards/rejected": -3.301013469696045, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 31.696401596069336, "learning_rate": 3.998905361345423e-07, "logits/chosen": -1.8160336017608643, "logits/rejected": -1.7841603755950928, "logps/chosen": -270.5869140625, "logps/rejected": -349.1259765625, "loss": 0.5647, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1539430618286133, "rewards/margins": 0.7999936938285828, "rewards/rejected": -2.953936815261841, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 31.24211311340332, "learning_rate": 3.9948910159945676e-07, "logits/chosen": -1.896767020225525, "logits/rejected": -1.8412729501724243, "logps/chosen": -235.407470703125, "logps/rejected": -341.68109130859375, "loss": 0.4566, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8163881301879883, "rewards/margins": 1.0446827411651611, "rewards/rejected": -2.8610711097717285, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 23.071456909179688, "learning_rate": 3.9908706620735214e-07, "logits/chosen": -1.9219896793365479, "logits/rejected": -1.8575019836425781, "logps/chosen": -228.6627197265625, "logps/rejected": -328.0730285644531, "loss": 0.4506, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.71219801902771, "rewards/margins": 1.0696029663085938, "rewards/rejected": -2.781801223754883, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 27.987018585205078, "learning_rate": 3.98684431574171e-07, "logits/chosen": -1.8819494247436523, "logits/rejected": -1.8256406784057617, "logps/chosen": -244.5769500732422, "logps/rejected": -343.0653076171875, "loss": 0.4736, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8576946258544922, "rewards/margins": 1.0152769088745117, "rewards/rejected": -2.872971534729004, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -1.8794265985488892, "eval_logits/rejected": -1.8530522584915161, "eval_logps/chosen": -274.69427490234375, "eval_logps/rejected": -322.3578796386719, "eval_loss": 0.6454460024833679, "eval_rewards/accuracies": 0.6547397971153259, "eval_rewards/chosen": -2.156787633895874, "eval_rewards/margins": 0.43929487466812134, "eval_rewards/rejected": -2.596082925796509, "eval_runtime": 360.9816, "eval_samples_per_second": 11.923, "eval_steps_per_second": 1.49, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 50.71770095825195, "learning_rate": 3.982811993182643e-07, "logits/chosen": -1.8697437047958374, "logits/rejected": -1.8276088237762451, "logps/chosen": -311.2469482421875, "logps/rejected": -403.5909729003906, "loss": 0.563, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.552518129348755, "rewards/margins": 0.9764571189880371, "rewards/rejected": -3.528975009918213, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 35.5622444152832, "learning_rate": 3.978773710603852e-07, "logits/chosen": -1.86460280418396, "logits/rejected": -1.8166091442108154, "logps/chosen": -296.0201416015625, "logps/rejected": -383.09112548828125, "loss": 0.5507, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.4030678272247314, "rewards/margins": 0.8959752321243286, "rewards/rejected": -3.2990429401397705, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 23.604068756103516, "learning_rate": 3.9747294842368246e-07, "logits/chosen": -1.8714535236358643, "logits/rejected": -1.834882378578186, "logps/chosen": -228.5640106201172, "logps/rejected": -301.71826171875, "loss": 0.5335, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7759802341461182, "rewards/margins": 0.7603236436843872, "rewards/rejected": -2.536303758621216, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 26.628463745117188, "learning_rate": 3.9706793303369377e-07, "logits/chosen": -1.9284706115722656, "logits/rejected": -1.875792145729065, "logps/chosen": -225.0537109375, "logps/rejected": -316.73468017578125, "loss": 0.4726, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.695712685585022, "rewards/margins": 0.9403985738754272, "rewards/rejected": -2.6361114978790283, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 23.13311004638672, "learning_rate": 3.966623265183394e-07, "logits/chosen": -1.895464539527893, "logits/rejected": -1.8404392004013062, "logps/chosen": -238.4036865234375, "logps/rejected": -319.94097900390625, "loss": 0.5142, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8104534149169922, "rewards/margins": 0.8702430725097656, "rewards/rejected": -2.680696487426758, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 33.32912826538086, "learning_rate": 3.9625613050791576e-07, "logits/chosen": -1.8273884057998657, "logits/rejected": -1.7905113697052002, "logps/chosen": -240.16012573242188, "logps/rejected": -358.64910888671875, "loss": 0.4365, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8739084005355835, "rewards/margins": 1.1462388038635254, "rewards/rejected": -3.0201470851898193, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 31.69497299194336, "learning_rate": 3.958493466350883e-07, "logits/chosen": -1.764683485031128, "logits/rejected": -1.722357153892517, "logps/chosen": -254.2992401123047, "logps/rejected": -336.9817199707031, "loss": 0.5197, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.00130295753479, "rewards/margins": 0.859014630317688, "rewards/rejected": -2.8603177070617676, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 34.380428314208984, "learning_rate": 3.9544197653488566e-07, "logits/chosen": -1.8660881519317627, "logits/rejected": -1.8116614818572998, "logps/chosen": -253.64492797851562, "logps/rejected": -333.5671691894531, "loss": 0.5038, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9637492895126343, "rewards/margins": 0.8683713674545288, "rewards/rejected": -2.832120895385742, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 22.399675369262695, "learning_rate": 3.950340218446926e-07, "logits/chosen": -1.995276689529419, "logits/rejected": -1.948129415512085, "logps/chosen": -253.04519653320312, "logps/rejected": -352.1228332519531, "loss": 0.5118, "rewards/accuracies": 0.75, "rewards/chosen": -1.9833208322525024, "rewards/margins": 1.0025694370269775, "rewards/rejected": -2.9858901500701904, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 35.35326385498047, "learning_rate": 3.946254842042437e-07, "logits/chosen": -1.9476611614227295, "logits/rejected": -1.9087482690811157, "logps/chosen": -219.40164184570312, "logps/rejected": -289.535400390625, "loss": 0.4985, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6201190948486328, "rewards/margins": 0.7548078894615173, "rewards/rejected": -2.374927043914795, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 33.930023193359375, "learning_rate": 3.942163652556166e-07, "logits/chosen": -1.9032150506973267, "logits/rejected": -1.8367655277252197, "logps/chosen": -215.4200897216797, "logps/rejected": -296.2190856933594, "loss": 0.4948, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5807517766952515, "rewards/margins": 0.8594449758529663, "rewards/rejected": -2.4401967525482178, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 23.532869338989258, "learning_rate": 3.9380666664322526e-07, "logits/chosen": -1.9229843616485596, "logits/rejected": -1.8704620599746704, "logps/chosen": -211.8603973388672, "logps/rejected": -302.26434326171875, "loss": 0.4496, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5720702409744263, "rewards/margins": 0.9419166445732117, "rewards/rejected": -2.5139870643615723, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 22.186826705932617, "learning_rate": 3.93396390013814e-07, "logits/chosen": -1.8291927576065063, "logits/rejected": -1.764974594116211, "logps/chosen": -250.465576171875, "logps/rejected": -368.4342346191406, "loss": 0.4155, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9597097635269165, "rewards/margins": 1.220273733139038, "rewards/rejected": -3.179983139038086, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 40.1002197265625, "learning_rate": 3.929855370164499e-07, "logits/chosen": -1.7839500904083252, "logits/rejected": -1.7287908792495728, "logps/chosen": -303.1529541015625, "logps/rejected": -393.82476806640625, "loss": 0.5693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.45111083984375, "rewards/margins": 0.9267457723617554, "rewards/rejected": -3.377856492996216, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 40.53666687011719, "learning_rate": 3.92574109302517e-07, "logits/chosen": -1.7984297275543213, "logits/rejected": -1.751060128211975, "logps/chosen": -259.16839599609375, "logps/rejected": -349.862060546875, "loss": 0.5458, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0755727291107178, "rewards/margins": 0.9119545221328735, "rewards/rejected": -2.987527370452881, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 26.17641258239746, "learning_rate": 3.9216210852570937e-07, "logits/chosen": -1.924118995666504, "logits/rejected": -1.87078857421875, "logps/chosen": -211.19155883789062, "logps/rejected": -298.49468994140625, "loss": 0.4804, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5441941022872925, "rewards/margins": 0.9326621890068054, "rewards/rejected": -2.476856231689453, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 29.83054542541504, "learning_rate": 3.9174953634202424e-07, "logits/chosen": -1.8989213705062866, "logits/rejected": -1.8443619012832642, "logps/chosen": -227.7430419921875, "logps/rejected": -315.738037109375, "loss": 0.4828, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7227426767349243, "rewards/margins": 0.9160329699516296, "rewards/rejected": -2.63877534866333, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 25.417308807373047, "learning_rate": 3.913363944097559e-07, "logits/chosen": -1.9100420475006104, "logits/rejected": -1.853197693824768, "logps/chosen": -218.9949188232422, "logps/rejected": -295.11944580078125, "loss": 0.512, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6264736652374268, "rewards/margins": 0.8301679491996765, "rewards/rejected": -2.456641674041748, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 29.641244888305664, "learning_rate": 3.9092268438948825e-07, "logits/chosen": -1.8723993301391602, "logits/rejected": -1.8143097162246704, "logps/chosen": -213.529541015625, "logps/rejected": -316.8847961425781, "loss": 0.4665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.600020408630371, "rewards/margins": 1.0391137599945068, "rewards/rejected": -2.639133930206299, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 49.26777648925781, "learning_rate": 3.90508407944089e-07, "logits/chosen": -1.8910129070281982, "logits/rejected": -1.8443208932876587, "logps/chosen": -241.1716766357422, "logps/rejected": -334.2567138671875, "loss": 0.5109, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8437801599502563, "rewards/margins": 0.9672563672065735, "rewards/rejected": -2.8110365867614746, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 19.679494857788086, "learning_rate": 3.9009356673870224e-07, "logits/chosen": -1.8602310419082642, "logits/rejected": -1.8133646249771118, "logps/chosen": -268.1920471191406, "logps/rejected": -372.3370666503906, "loss": 0.4658, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.1512560844421387, "rewards/margins": 1.0625637769699097, "rewards/rejected": -3.213819980621338, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 40.432918548583984, "learning_rate": 3.8967816244074214e-07, "logits/chosen": -1.805352807044983, "logits/rejected": -1.7569506168365479, "logps/chosen": -273.6300354003906, "logps/rejected": -359.02691650390625, "loss": 0.5504, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2042930126190186, "rewards/margins": 0.8730916976928711, "rewards/rejected": -3.0773849487304688, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 25.013216018676758, "learning_rate": 3.8926219671988627e-07, "logits/chosen": -1.8702272176742554, "logits/rejected": -1.8223224878311157, "logps/chosen": -208.30245971679688, "logps/rejected": -283.2719421386719, "loss": 0.4888, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5366510152816772, "rewards/margins": 0.810300350189209, "rewards/rejected": -2.346951484680176, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 27.49932289123535, "learning_rate": 3.8884567124806864e-07, "logits/chosen": -1.9217408895492554, "logits/rejected": -1.8718513250350952, "logps/chosen": -224.3609161376953, "logps/rejected": -300.01165771484375, "loss": 0.5224, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.697998046875, "rewards/margins": 0.8161190748214722, "rewards/rejected": -2.5141172409057617, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 35.373470306396484, "learning_rate": 3.8842858769947324e-07, "logits/chosen": -1.923402190208435, "logits/rejected": -1.8658084869384766, "logps/chosen": -243.9268798828125, "logps/rejected": -356.40460205078125, "loss": 0.4439, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.908514380455017, "rewards/margins": 1.1278340816497803, "rewards/rejected": -3.036348819732666, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 31.49143409729004, "learning_rate": 3.8801094775052713e-07, "logits/chosen": -1.8889760971069336, "logits/rejected": -1.8402891159057617, "logps/chosen": -263.2322082519531, "logps/rejected": -377.49114990234375, "loss": 0.4658, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1033730506896973, "rewards/margins": 1.1411552429199219, "rewards/rejected": -3.2445285320281982, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 23.95795440673828, "learning_rate": 3.8759275307989376e-07, "logits/chosen": -1.8000015020370483, "logits/rejected": -1.7430862188339233, "logps/chosen": -282.15753173828125, "logps/rejected": -423.968017578125, "loss": 0.4097, "rewards/accuracies": 0.84375, "rewards/chosen": -2.314126491546631, "rewards/margins": 1.3824223279953003, "rewards/rejected": -3.6965489387512207, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 35.03030776977539, "learning_rate": 3.8717400536846623e-07, "logits/chosen": -1.7605645656585693, "logits/rejected": -1.712803840637207, "logps/chosen": -302.34228515625, "logps/rejected": -400.19378662109375, "loss": 0.5291, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.4841458797454834, "rewards/margins": 1.0123988389968872, "rewards/rejected": -3.496544361114502, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 25.53325653076172, "learning_rate": 3.867547062993607e-07, "logits/chosen": -1.8648484945297241, "logits/rejected": -1.810886025428772, "logps/chosen": -263.6770935058594, "logps/rejected": -369.0923767089844, "loss": 0.4758, "rewards/accuracies": 0.75, "rewards/chosen": -2.05566143989563, "rewards/margins": 1.089951992034912, "rewards/rejected": -3.145613193511963, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 41.431800842285156, "learning_rate": 3.8633485755790914e-07, "logits/chosen": -1.8342593908309937, "logits/rejected": -1.7964439392089844, "logps/chosen": -266.6868896484375, "logps/rejected": -360.70220947265625, "loss": 0.5109, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0866353511810303, "rewards/margins": 0.9687382578849792, "rewards/rejected": -3.0553736686706543, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 23.521198272705078, "learning_rate": 3.859144608316532e-07, "logits/chosen": -1.8509660959243774, "logits/rejected": -1.7801856994628906, "logps/chosen": -254.5431365966797, "logps/rejected": -384.8039855957031, "loss": 0.4083, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9652729034423828, "rewards/margins": 1.3591676950454712, "rewards/rejected": -3.3244407176971436, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 30.949420928955078, "learning_rate": 3.854935178103368e-07, "logits/chosen": -1.8148887157440186, "logits/rejected": -1.757820725440979, "logps/chosen": -263.23175048828125, "logps/rejected": -381.0891418457031, "loss": 0.445, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1056952476501465, "rewards/margins": 1.1920859813690186, "rewards/rejected": -3.2977817058563232, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 39.427085876464844, "learning_rate": 3.850720301859e-07, "logits/chosen": -1.7516651153564453, "logits/rejected": -1.7123782634735107, "logps/chosen": -301.04779052734375, "logps/rejected": -381.38201904296875, "loss": 0.5852, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.434793472290039, "rewards/margins": 0.8372718691825867, "rewards/rejected": -3.2720654010772705, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 29.0310115814209, "learning_rate": 3.846499996524715e-07, "logits/chosen": -1.825322151184082, "logits/rejected": -1.7790626287460327, "logps/chosen": -255.5957794189453, "logps/rejected": -360.89459228515625, "loss": 0.4691, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0024280548095703, "rewards/margins": 1.0590091943740845, "rewards/rejected": -3.0614371299743652, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 26.128395080566406, "learning_rate": 3.842274279063623e-07, "logits/chosen": -1.9014613628387451, "logits/rejected": -1.8570184707641602, "logps/chosen": -242.1820526123047, "logps/rejected": -339.1670837402344, "loss": 0.5002, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.88741135597229, "rewards/margins": 0.9738866090774536, "rewards/rejected": -2.861298084259033, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 42.60655975341797, "learning_rate": 3.838043166460588e-07, "logits/chosen": -1.8509057760238647, "logits/rejected": -1.7928546667099, "logps/chosen": -229.1250762939453, "logps/rejected": -319.4176330566406, "loss": 0.5036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7573251724243164, "rewards/margins": 0.9542375802993774, "rewards/rejected": -2.7115628719329834, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 46.68840789794922, "learning_rate": 3.833806675722159e-07, "logits/chosen": -1.8707281351089478, "logits/rejected": -1.8292791843414307, "logps/chosen": -243.39370727539062, "logps/rejected": -343.2301940917969, "loss": 0.4985, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8810571432113647, "rewards/margins": 1.0001057386398315, "rewards/rejected": -2.8811628818511963, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 35.05253982543945, "learning_rate": 3.829564823876501e-07, "logits/chosen": -1.8461239337921143, "logits/rejected": -1.788248062133789, "logps/chosen": -253.2092742919922, "logps/rejected": -348.6705017089844, "loss": 0.5125, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.959573745727539, "rewards/margins": 0.9957149624824524, "rewards/rejected": -2.955288887023926, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 44.82268524169922, "learning_rate": 3.825317627973328e-07, "logits/chosen": -1.8655624389648438, "logits/rejected": -1.8160375356674194, "logps/chosen": -254.75436401367188, "logps/rejected": -354.41815185546875, "loss": 0.496, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0266332626342773, "rewards/margins": 0.9998002052307129, "rewards/rejected": -3.0264334678649902, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 19.807069778442383, "learning_rate": 3.821065105083836e-07, "logits/chosen": -1.7918307781219482, "logits/rejected": -1.7324936389923096, "logps/chosen": -242.369140625, "logps/rejected": -344.6247863769531, "loss": 0.4665, "rewards/accuracies": 0.75, "rewards/chosen": -1.8909105062484741, "rewards/margins": 1.0596225261688232, "rewards/rejected": -2.950532913208008, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -1.904203176498413, "eval_logits/rejected": -1.8821338415145874, "eval_logps/chosen": -248.59921264648438, "eval_logps/rejected": -290.02947998046875, "eval_loss": 0.6385961174964905, "eval_rewards/accuracies": 0.6442843675613403, "eval_rewards/chosen": -1.8958371877670288, "eval_rewards/margins": 0.376961350440979, "eval_rewards/rejected": -2.272798776626587, "eval_runtime": 360.8399, "eval_samples_per_second": 11.928, "eval_steps_per_second": 1.491, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 37.92570495605469, "learning_rate": 3.8168072723006283e-07, "logits/chosen": -1.838097333908081, "logits/rejected": -1.783511757850647, "logps/chosen": -269.75286865234375, "logps/rejected": -352.02020263671875, "loss": 0.531, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1296119689941406, "rewards/margins": 0.8818729519844055, "rewards/rejected": -3.0114848613739014, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 35.662715911865234, "learning_rate": 3.812544146737654e-07, "logits/chosen": -1.8938138484954834, "logits/rejected": -1.8443641662597656, "logps/chosen": -285.08258056640625, "logps/rejected": -385.55389404296875, "loss": 0.5058, "rewards/accuracies": 0.75, "rewards/chosen": -2.286356210708618, "rewards/margins": 1.051060676574707, "rewards/rejected": -3.3374171257019043, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 29.76297950744629, "learning_rate": 3.8082757455301346e-07, "logits/chosen": -1.9038909673690796, "logits/rejected": -1.872214913368225, "logps/chosen": -263.4322814941406, "logps/rejected": -359.9519348144531, "loss": 0.4949, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0882058143615723, "rewards/margins": 0.9883928298950195, "rewards/rejected": -3.076598644256592, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 25.704219818115234, "learning_rate": 3.804002085834497e-07, "logits/chosen": -1.9186826944351196, "logits/rejected": -1.8656005859375, "logps/chosen": -238.841064453125, "logps/rejected": -330.4883728027344, "loss": 0.5127, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.808995008468628, "rewards/margins": 0.9483944773674011, "rewards/rejected": -2.757389545440674, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 25.4129581451416, "learning_rate": 3.799723184828304e-07, "logits/chosen": -1.8957765102386475, "logits/rejected": -1.8405253887176514, "logps/chosen": -202.45123291015625, "logps/rejected": -286.8634338378906, "loss": 0.4903, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4711071252822876, "rewards/margins": 0.870842456817627, "rewards/rejected": -2.341949462890625, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 29.786457061767578, "learning_rate": 3.795439059710185e-07, "logits/chosen": -1.934286117553711, "logits/rejected": -1.877376914024353, "logps/chosen": -248.01809692382812, "logps/rejected": -321.09619140625, "loss": 0.5205, "rewards/accuracies": 0.75, "rewards/chosen": -1.8807119131088257, "rewards/margins": 0.8119600415229797, "rewards/rejected": -2.692671775817871, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 23.92767333984375, "learning_rate": 3.7911497276997677e-07, "logits/chosen": -1.809736967086792, "logits/rejected": -1.7767900228500366, "logps/chosen": -271.0516662597656, "logps/rejected": -362.0204772949219, "loss": 0.5016, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1902968883514404, "rewards/margins": 0.9184913635253906, "rewards/rejected": -3.1087887287139893, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 31.839534759521484, "learning_rate": 3.7868552060376086e-07, "logits/chosen": -1.8469183444976807, "logits/rejected": -1.7904822826385498, "logps/chosen": -270.2738342285156, "logps/rejected": -373.61773681640625, "loss": 0.4666, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1428287029266357, "rewards/margins": 1.0873041152954102, "rewards/rejected": -3.2301323413848877, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 22.6026668548584, "learning_rate": 3.782555511985123e-07, "logits/chosen": -1.825683832168579, "logits/rejected": -1.774274468421936, "logps/chosen": -264.87347412109375, "logps/rejected": -379.3821716308594, "loss": 0.4424, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.116506814956665, "rewards/margins": 1.1573073863983154, "rewards/rejected": -3.2738139629364014, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 52.010128021240234, "learning_rate": 3.7782506628245154e-07, "logits/chosen": -1.866090178489685, "logits/rejected": -1.8177963495254517, "logps/chosen": -274.6611328125, "logps/rejected": -370.4534606933594, "loss": 0.5161, "rewards/accuracies": 0.78125, "rewards/chosen": -2.20335054397583, "rewards/margins": 0.9748086929321289, "rewards/rejected": -3.178159236907959, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 41.44845962524414, "learning_rate": 3.773940675858713e-07, "logits/chosen": -1.912523865699768, "logits/rejected": -1.8593909740447998, "logps/chosen": -263.13043212890625, "logps/rejected": -363.60589599609375, "loss": 0.4613, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0532422065734863, "rewards/margins": 1.0346832275390625, "rewards/rejected": -3.0879251956939697, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 30.008424758911133, "learning_rate": 3.769625568411291e-07, "logits/chosen": -1.987203598022461, "logits/rejected": -1.932976484298706, "logps/chosen": -251.5031280517578, "logps/rejected": -331.0704650878906, "loss": 0.5412, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9486143589019775, "rewards/margins": 0.8264883756637573, "rewards/rejected": -2.7751028537750244, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 40.39155960083008, "learning_rate": 3.7653053578264085e-07, "logits/chosen": -1.9288034439086914, "logits/rejected": -1.8601503372192383, "logps/chosen": -220.29812622070312, "logps/rejected": -327.15179443359375, "loss": 0.4556, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.614402413368225, "rewards/margins": 1.1242268085479736, "rewards/rejected": -2.73862886428833, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 41.36498260498047, "learning_rate": 3.760980061468734e-07, "logits/chosen": -1.836321473121643, "logits/rejected": -1.7562634944915771, "logps/chosen": -259.41802978515625, "logps/rejected": -384.7509765625, "loss": 0.4643, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9935392141342163, "rewards/margins": 1.3565499782562256, "rewards/rejected": -3.3500893115997314, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 29.255107879638672, "learning_rate": 3.7566496967233806e-07, "logits/chosen": -1.865804672241211, "logits/rejected": -1.8096462488174438, "logps/chosen": -281.8961181640625, "logps/rejected": -393.8701171875, "loss": 0.4786, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.276465654373169, "rewards/margins": 1.1354783773422241, "rewards/rejected": -3.4119441509246826, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 26.68109130859375, "learning_rate": 3.7523142809958307e-07, "logits/chosen": -2.0168564319610596, "logits/rejected": -1.9593250751495361, "logps/chosen": -249.63229370117188, "logps/rejected": -325.8633728027344, "loss": 0.5289, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9357798099517822, "rewards/margins": 0.851833164691925, "rewards/rejected": -2.7876129150390625, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 35.96856689453125, "learning_rate": 3.747973831711868e-07, "logits/chosen": -1.9925317764282227, "logits/rejected": -1.9339529275894165, "logps/chosen": -209.8934783935547, "logps/rejected": -312.00433349609375, "loss": 0.4146, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5449930429458618, "rewards/margins": 1.074834942817688, "rewards/rejected": -2.6198277473449707, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 35.7033805847168, "learning_rate": 3.743628366317512e-07, "logits/chosen": -1.905755639076233, "logits/rejected": -1.8550455570220947, "logps/chosen": -252.19308471679688, "logps/rejected": -353.1059875488281, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -1.9683558940887451, "rewards/margins": 1.0429661273956299, "rewards/rejected": -3.011322021484375, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 24.847332000732422, "learning_rate": 3.73927790227894e-07, "logits/chosen": -1.8924840688705444, "logits/rejected": -1.8414844274520874, "logps/chosen": -248.7648162841797, "logps/rejected": -340.7574768066406, "loss": 0.5123, "rewards/accuracies": 0.75, "rewards/chosen": -1.9070727825164795, "rewards/margins": 0.9624916315078735, "rewards/rejected": -2.8695645332336426, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 22.887718200683594, "learning_rate": 3.7349224570824235e-07, "logits/chosen": -2.0844712257385254, "logits/rejected": -2.0205156803131104, "logps/chosen": -249.1080780029297, "logps/rejected": -365.4342041015625, "loss": 0.4466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.919012427330017, "rewards/margins": 1.194676160812378, "rewards/rejected": -3.1136887073516846, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 22.691221237182617, "learning_rate": 3.7305620482342527e-07, "logits/chosen": -1.9934028387069702, "logits/rejected": -1.9432646036148071, "logps/chosen": -226.4757843017578, "logps/rejected": -309.2968444824219, "loss": 0.5261, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6933708190917969, "rewards/margins": 0.8783981204032898, "rewards/rejected": -2.5717687606811523, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 32.02613067626953, "learning_rate": 3.7261966932606705e-07, "logits/chosen": -1.9594752788543701, "logits/rejected": -1.908998727798462, "logps/chosen": -208.65951538085938, "logps/rejected": -289.6645202636719, "loss": 0.5241, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.566380500793457, "rewards/margins": 0.8149579167366028, "rewards/rejected": -2.381338357925415, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 35.86697006225586, "learning_rate": 3.7218264097078016e-07, "logits/chosen": -2.006108045578003, "logits/rejected": -1.9455482959747314, "logps/chosen": -225.1941680908203, "logps/rejected": -327.09906005859375, "loss": 0.4929, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7094199657440186, "rewards/margins": 1.0167547464370728, "rewards/rejected": -2.726174831390381, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 34.738033294677734, "learning_rate": 3.717451215141577e-07, "logits/chosen": -1.963739037513733, "logits/rejected": -1.9076731204986572, "logps/chosen": -230.6356964111328, "logps/rejected": -332.78814697265625, "loss": 0.465, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.73089599609375, "rewards/margins": 1.0377737283706665, "rewards/rejected": -2.768669605255127, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 27.5352725982666, "learning_rate": 3.713071127147671e-07, "logits/chosen": -1.909221887588501, "logits/rejected": -1.8646854162216187, "logps/chosen": -235.1972198486328, "logps/rejected": -333.7093505859375, "loss": 0.5173, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8259246349334717, "rewards/margins": 0.9867849349975586, "rewards/rejected": -2.812709331512451, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 39.16017150878906, "learning_rate": 3.7086861633314223e-07, "logits/chosen": -1.9553331136703491, "logits/rejected": -1.9213979244232178, "logps/chosen": -241.71377563476562, "logps/rejected": -313.6133117675781, "loss": 0.5543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8799902200698853, "rewards/margins": 0.7380637526512146, "rewards/rejected": -2.618053913116455, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 54.01152420043945, "learning_rate": 3.704296341317773e-07, "logits/chosen": -1.9407684803009033, "logits/rejected": -1.8998291492462158, "logps/chosen": -217.6569061279297, "logps/rejected": -285.63299560546875, "loss": 0.5962, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6233813762664795, "rewards/margins": 0.7062555551528931, "rewards/rejected": -2.329637050628662, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 24.258563995361328, "learning_rate": 3.699901678751186e-07, "logits/chosen": -2.0587058067321777, "logits/rejected": -2.0046958923339844, "logps/chosen": -184.54164123535156, "logps/rejected": -260.68505859375, "loss": 0.4942, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3013032674789429, "rewards/margins": 0.7683302760124207, "rewards/rejected": -2.0696334838867188, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 22.55738067626953, "learning_rate": 3.695502193295585e-07, "logits/chosen": -2.0340194702148438, "logits/rejected": -1.9887676239013672, "logps/chosen": -214.79196166992188, "logps/rejected": -318.87237548828125, "loss": 0.4582, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6107103824615479, "rewards/margins": 1.0711817741394043, "rewards/rejected": -2.681892156600952, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 33.675025939941406, "learning_rate": 3.6910979026342776e-07, "logits/chosen": -1.9022763967514038, "logits/rejected": -1.8534395694732666, "logps/chosen": -254.4623565673828, "logps/rejected": -347.77313232421875, "loss": 0.5272, "rewards/accuracies": 0.75, "rewards/chosen": -1.9707844257354736, "rewards/margins": 0.949144721031189, "rewards/rejected": -2.919929027557373, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 23.56369972229004, "learning_rate": 3.6866888244698834e-07, "logits/chosen": -1.9148534536361694, "logits/rejected": -1.8702976703643799, "logps/chosen": -231.2939453125, "logps/rejected": -319.962646484375, "loss": 0.4877, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.770056128501892, "rewards/margins": 0.926967442035675, "rewards/rejected": -2.697023630142212, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 46.12360763549805, "learning_rate": 3.682274976524269e-07, "logits/chosen": -1.949755311012268, "logits/rejected": -1.8993345499038696, "logps/chosen": -226.44100952148438, "logps/rejected": -285.6453857421875, "loss": 0.5739, "rewards/accuracies": 0.71875, "rewards/chosen": -1.632348656654358, "rewards/margins": 0.7060927152633667, "rewards/rejected": -2.3384413719177246, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 34.92463684082031, "learning_rate": 3.677856376538468e-07, "logits/chosen": -1.9399125576019287, "logits/rejected": -1.8942511081695557, "logps/chosen": -195.03221130371094, "logps/rejected": -280.861083984375, "loss": 0.4816, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3795123100280762, "rewards/margins": 0.9033554196357727, "rewards/rejected": -2.282867908477783, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 32.912010192871094, "learning_rate": 3.6734330422726177e-07, "logits/chosen": -2.0039687156677246, "logits/rejected": -1.9703712463378906, "logps/chosen": -197.0970001220703, "logps/rejected": -269.23431396484375, "loss": 0.5476, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4478214979171753, "rewards/margins": 0.7311514019966125, "rewards/rejected": -2.1789727210998535, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 33.11442184448242, "learning_rate": 3.669004991505884e-07, "logits/chosen": -1.9776252508163452, "logits/rejected": -1.9367477893829346, "logps/chosen": -252.473388671875, "logps/rejected": -333.32403564453125, "loss": 0.5337, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9565120935440063, "rewards/margins": 0.8333422541618347, "rewards/rejected": -2.7898545265197754, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 29.88750648498535, "learning_rate": 3.664572242036389e-07, "logits/chosen": -1.9587013721466064, "logits/rejected": -1.9006990194320679, "logps/chosen": -223.1503448486328, "logps/rejected": -323.29827880859375, "loss": 0.489, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6898117065429688, "rewards/margins": 1.0159175395965576, "rewards/rejected": -2.7057290077209473, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 53.25121307373047, "learning_rate": 3.660134811681141e-07, "logits/chosen": -1.9398210048675537, "logits/rejected": -1.8802130222320557, "logps/chosen": -228.8319549560547, "logps/rejected": -327.91937255859375, "loss": 0.5011, "rewards/accuracies": 0.75, "rewards/chosen": -1.7278600931167603, "rewards/margins": 1.0210916996002197, "rewards/rejected": -2.7489514350891113, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 34.254642486572266, "learning_rate": 3.655692718275964e-07, "logits/chosen": -2.011734962463379, "logits/rejected": -1.9630460739135742, "logps/chosen": -241.0047607421875, "logps/rejected": -339.90728759765625, "loss": 0.4446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.846017599105835, "rewards/margins": 1.0159300565719604, "rewards/rejected": -2.861948013305664, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 28.785646438598633, "learning_rate": 3.651245979675423e-07, "logits/chosen": -1.8020470142364502, "logits/rejected": -1.7512038946151733, "logps/chosen": -289.11004638671875, "logps/rejected": -387.1331787109375, "loss": 0.5309, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3605945110321045, "rewards/margins": 0.984226405620575, "rewards/rejected": -3.3448212146759033, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 41.82642364501953, "learning_rate": 3.646794613752756e-07, "logits/chosen": -1.871299386024475, "logits/rejected": -1.814131736755371, "logps/chosen": -278.1118469238281, "logps/rejected": -400.67181396484375, "loss": 0.4789, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2423312664031982, "rewards/margins": 1.2147181034088135, "rewards/rejected": -3.4570491313934326, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -1.965865969657898, "eval_logits/rejected": -1.9443191289901733, "eval_logps/chosen": -250.99407958984375, "eval_logps/rejected": -292.0611267089844, "eval_loss": 0.6483320593833923, "eval_rewards/accuracies": 0.6403345465660095, "eval_rewards/chosen": -1.919785737991333, "eval_rewards/margins": 0.3733295202255249, "eval_rewards/rejected": -2.2931151390075684, "eval_runtime": 361.8875, "eval_samples_per_second": 11.893, "eval_steps_per_second": 1.487, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 39.62995529174805, "learning_rate": 3.6423386383997966e-07, "logits/chosen": -1.9800291061401367, "logits/rejected": -1.9402778148651123, "logps/chosen": -247.8915557861328, "logps/rejected": -321.3523864746094, "loss": 0.5535, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8989824056625366, "rewards/margins": 0.788342297077179, "rewards/rejected": -2.6873250007629395, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 34.938621520996094, "learning_rate": 3.63787807152691e-07, "logits/chosen": -1.9155391454696655, "logits/rejected": -1.8525406122207642, "logps/chosen": -219.7184600830078, "logps/rejected": -316.4938049316406, "loss": 0.4816, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6461101770401, "rewards/margins": 1.0182768106460571, "rewards/rejected": -2.6643869876861572, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 30.212488174438477, "learning_rate": 3.6334129310629136e-07, "logits/chosen": -1.9223697185516357, "logits/rejected": -1.8762083053588867, "logps/chosen": -212.2480010986328, "logps/rejected": -305.87530517578125, "loss": 0.4734, "rewards/accuracies": 0.75, "rewards/chosen": -1.5704636573791504, "rewards/margins": 0.9719744920730591, "rewards/rejected": -2.542438507080078, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 20.030900955200195, "learning_rate": 3.628943234955009e-07, "logits/chosen": -1.9466670751571655, "logits/rejected": -1.9089453220367432, "logps/chosen": -234.3196258544922, "logps/rejected": -325.2353210449219, "loss": 0.4855, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7620437145233154, "rewards/margins": 0.9446337819099426, "rewards/rejected": -2.706677198410034, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 29.988601684570312, "learning_rate": 3.6244690011687064e-07, "logits/chosen": -1.9492202997207642, "logits/rejected": -1.8867504596710205, "logps/chosen": -240.9092559814453, "logps/rejected": -342.6973571777344, "loss": 0.4544, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8518192768096924, "rewards/margins": 1.063063383102417, "rewards/rejected": -2.9148824214935303, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 39.43049621582031, "learning_rate": 3.6199902476877594e-07, "logits/chosen": -1.9371681213378906, "logits/rejected": -1.888795256614685, "logps/chosen": -258.21075439453125, "logps/rejected": -352.481201171875, "loss": 0.5325, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.031129837036133, "rewards/margins": 0.9846760034561157, "rewards/rejected": -3.015805721282959, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 32.04049301147461, "learning_rate": 3.6155069925140843e-07, "logits/chosen": -1.9975197315216064, "logits/rejected": -1.9397770166397095, "logps/chosen": -230.236328125, "logps/rejected": -315.9816589355469, "loss": 0.522, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.74825119972229, "rewards/margins": 0.8915823101997375, "rewards/rejected": -2.639833450317383, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 29.002805709838867, "learning_rate": 3.6110192536676915e-07, "logits/chosen": -2.048396587371826, "logits/rejected": -1.9838192462921143, "logps/chosen": -226.6609649658203, "logps/rejected": -324.8330078125, "loss": 0.4786, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6872451305389404, "rewards/margins": 1.0430591106414795, "rewards/rejected": -2.73030424118042, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 39.33851623535156, "learning_rate": 3.606527049186616e-07, "logits/chosen": -1.900702714920044, "logits/rejected": -1.8558051586151123, "logps/chosen": -244.43527221679688, "logps/rejected": -341.25445556640625, "loss": 0.4763, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8861032724380493, "rewards/margins": 0.9750259518623352, "rewards/rejected": -2.8611292839050293, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 30.13620948791504, "learning_rate": 3.6020303971268396e-07, "logits/chosen": -1.8521835803985596, "logits/rejected": -1.809308648109436, "logps/chosen": -251.0287628173828, "logps/rejected": -341.36798095703125, "loss": 0.5507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0039429664611816, "rewards/margins": 0.8881893157958984, "rewards/rejected": -2.892131805419922, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 27.07564926147461, "learning_rate": 3.5975293155622215e-07, "logits/chosen": -1.9788624048233032, "logits/rejected": -1.9252811670303345, "logps/chosen": -249.314208984375, "logps/rejected": -346.1986389160156, "loss": 0.5193, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9367738962173462, "rewards/margins": 1.016249418258667, "rewards/rejected": -2.9530234336853027, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 26.56252098083496, "learning_rate": 3.5930238225844246e-07, "logits/chosen": -1.9911493062973022, "logits/rejected": -1.9445139169692993, "logps/chosen": -220.41909790039062, "logps/rejected": -303.0832214355469, "loss": 0.5442, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.64716374874115, "rewards/margins": 0.8795566558837891, "rewards/rejected": -2.5267205238342285, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 52.00545120239258, "learning_rate": 3.5885139363028435e-07, "logits/chosen": -1.8765075206756592, "logits/rejected": -1.8277864456176758, "logps/chosen": -231.78695678710938, "logps/rejected": -336.43292236328125, "loss": 0.4593, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7761586904525757, "rewards/margins": 1.0868165493011475, "rewards/rejected": -2.8629753589630127, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 28.644384384155273, "learning_rate": 3.5839996748445305e-07, "logits/chosen": -1.9700886011123657, "logits/rejected": -1.9171686172485352, "logps/chosen": -239.4293670654297, "logps/rejected": -325.0167236328125, "loss": 0.5026, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8341861963272095, "rewards/margins": 0.8852967023849487, "rewards/rejected": -2.719482898712158, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 38.03202438354492, "learning_rate": 3.5794810563541236e-07, "logits/chosen": -2.0383639335632324, "logits/rejected": -1.9784564971923828, "logps/chosen": -204.90823364257812, "logps/rejected": -307.02825927734375, "loss": 0.4563, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4649776220321655, "rewards/margins": 1.0982885360717773, "rewards/rejected": -2.5632662773132324, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 44.70645523071289, "learning_rate": 3.574958098993775e-07, "logits/chosen": -1.8994029760360718, "logits/rejected": -1.8556798696517944, "logps/chosen": -240.5580291748047, "logps/rejected": -343.79302978515625, "loss": 0.4751, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8504226207733154, "rewards/margins": 1.0769256353378296, "rewards/rejected": -2.9273481369018555, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 37.326576232910156, "learning_rate": 3.570430820943074e-07, "logits/chosen": -1.8463459014892578, "logits/rejected": -1.7994228601455688, "logps/chosen": -255.2736358642578, "logps/rejected": -367.541259765625, "loss": 0.459, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0111775398254395, "rewards/margins": 1.1598385572433472, "rewards/rejected": -3.171015977859497, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 30.31681251525879, "learning_rate": 3.5658992403989783e-07, "logits/chosen": -1.8821303844451904, "logits/rejected": -1.8388748168945312, "logps/chosen": -254.57266235351562, "logps/rejected": -354.007080078125, "loss": 0.5331, "rewards/accuracies": 0.75, "rewards/chosen": -2.010704517364502, "rewards/margins": 0.998013973236084, "rewards/rejected": -3.008718490600586, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 59.481361389160156, "learning_rate": 3.56136337557574e-07, "logits/chosen": -1.874807357788086, "logits/rejected": -1.8256614208221436, "logps/chosen": -247.7091522216797, "logps/rejected": -353.0526123046875, "loss": 0.4818, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9475284814834595, "rewards/margins": 1.063948392868042, "rewards/rejected": -3.011476993560791, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 23.180805206298828, "learning_rate": 3.5568232447048274e-07, "logits/chosen": -1.8431650400161743, "logits/rejected": -1.7780349254608154, "logps/chosen": -281.7533264160156, "logps/rejected": -398.482666015625, "loss": 0.4474, "rewards/accuracies": 0.78125, "rewards/chosen": -2.253941059112549, "rewards/margins": 1.2331979274749756, "rewards/rejected": -3.4871387481689453, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 51.275333404541016, "learning_rate": 3.5522788660348614e-07, "logits/chosen": -1.857966661453247, "logits/rejected": -1.7905441522598267, "logps/chosen": -279.89166259765625, "logps/rejected": -391.64892578125, "loss": 0.4966, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2538554668426514, "rewards/margins": 1.1435785293579102, "rewards/rejected": -3.3974337577819824, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 39.47146987915039, "learning_rate": 3.5477302578315307e-07, "logits/chosen": -1.8272358179092407, "logits/rejected": -1.7850310802459717, "logps/chosen": -275.1412658691406, "logps/rejected": -370.8865051269531, "loss": 0.5471, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.17646861076355, "rewards/margins": 1.0049974918365479, "rewards/rejected": -3.181466579437256, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 33.89100646972656, "learning_rate": 3.5431774383775294e-07, "logits/chosen": -1.8256886005401611, "logits/rejected": -1.7710965871810913, "logps/chosen": -281.10467529296875, "logps/rejected": -391.57781982421875, "loss": 0.513, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2769622802734375, "rewards/margins": 1.1270339488983154, "rewards/rejected": -3.403996229171753, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 37.3873291015625, "learning_rate": 3.538620425972475e-07, "logits/chosen": -1.9672698974609375, "logits/rejected": -1.9214227199554443, "logps/chosen": -235.86325073242188, "logps/rejected": -307.61578369140625, "loss": 0.5646, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7938079833984375, "rewards/margins": 0.7590411305427551, "rewards/rejected": -2.552849054336548, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 22.498027801513672, "learning_rate": 3.534059238932838e-07, "logits/chosen": -2.1226348876953125, "logits/rejected": -2.069653272628784, "logps/chosen": -182.15524291992188, "logps/rejected": -247.0476531982422, "loss": 0.5273, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.246428370475769, "rewards/margins": 0.7088695764541626, "rewards/rejected": -1.9552980661392212, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 35.78093719482422, "learning_rate": 3.5294938955918717e-07, "logits/chosen": -1.9734151363372803, "logits/rejected": -1.9356145858764648, "logps/chosen": -190.55734252929688, "logps/rejected": -281.5165100097656, "loss": 0.4602, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3951637744903564, "rewards/margins": 0.8983510136604309, "rewards/rejected": -2.2935144901275635, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 60.992706298828125, "learning_rate": 3.524924414299532e-07, "logits/chosen": -1.9368385076522827, "logits/rejected": -1.9018291234970093, "logps/chosen": -259.17657470703125, "logps/rejected": -335.03497314453125, "loss": 0.5673, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.040667772293091, "rewards/margins": 0.8032005429267883, "rewards/rejected": -2.8438680171966553, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 39.57422637939453, "learning_rate": 3.5203508134224093e-07, "logits/chosen": -1.8494319915771484, "logits/rejected": -1.8010066747665405, "logps/chosen": -273.61102294921875, "logps/rejected": -354.962158203125, "loss": 0.5669, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1814329624176025, "rewards/margins": 0.8817019462585449, "rewards/rejected": -3.0631349086761475, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 35.95067596435547, "learning_rate": 3.515773111343648e-07, "logits/chosen": -1.8543411493301392, "logits/rejected": -1.8178415298461914, "logps/chosen": -265.13397216796875, "logps/rejected": -352.121337890625, "loss": 0.5331, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.119323492050171, "rewards/margins": 0.8739126920700073, "rewards/rejected": -2.9932363033294678, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 26.60555648803711, "learning_rate": 3.5111913264628827e-07, "logits/chosen": -1.9228330850601196, "logits/rejected": -1.873252511024475, "logps/chosen": -216.4825439453125, "logps/rejected": -305.30609130859375, "loss": 0.5091, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6483036279678345, "rewards/margins": 0.8926553726196289, "rewards/rejected": -2.540959119796753, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 32.708290100097656, "learning_rate": 3.506605477196155e-07, "logits/chosen": -1.926487922668457, "logits/rejected": -1.8907474279403687, "logps/chosen": -230.7793426513672, "logps/rejected": -305.3385314941406, "loss": 0.4933, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.723146677017212, "rewards/margins": 0.8209812045097351, "rewards/rejected": -2.544127941131592, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 27.165733337402344, "learning_rate": 3.502015581975843e-07, "logits/chosen": -1.9401960372924805, "logits/rejected": -1.8948246240615845, "logps/chosen": -212.61502075195312, "logps/rejected": -304.7122497558594, "loss": 0.4547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5809810161590576, "rewards/margins": 0.9408208131790161, "rewards/rejected": -2.5218019485473633, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 24.90611457824707, "learning_rate": 3.4974216592505874e-07, "logits/chosen": -1.8633019924163818, "logits/rejected": -1.8168882131576538, "logps/chosen": -232.188720703125, "logps/rejected": -344.02813720703125, "loss": 0.4572, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7948267459869385, "rewards/margins": 1.1224312782287598, "rewards/rejected": -2.917257785797119, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 30.08808708190918, "learning_rate": 3.492823727485218e-07, "logits/chosen": -1.9120346307754517, "logits/rejected": -1.8595492839813232, "logps/chosen": -269.2940979003906, "logps/rejected": -401.46636962890625, "loss": 0.4085, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1674752235412598, "rewards/margins": 1.3249281644821167, "rewards/rejected": -3.492403507232666, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 33.55939865112305, "learning_rate": 3.488221805160678e-07, "logits/chosen": -1.835585355758667, "logits/rejected": -1.7946319580078125, "logps/chosen": -302.1973876953125, "logps/rejected": -409.4505615234375, "loss": 0.5796, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.490602970123291, "rewards/margins": 1.0954583883285522, "rewards/rejected": -3.586061477661133, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 42.52745819091797, "learning_rate": 3.483615910773949e-07, "logits/chosen": -1.9464540481567383, "logits/rejected": -1.8895442485809326, "logps/chosen": -249.20156860351562, "logps/rejected": -365.8326721191406, "loss": 0.4837, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9567756652832031, "rewards/margins": 1.1515578031539917, "rewards/rejected": -3.1083335876464844, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 35.630279541015625, "learning_rate": 3.4790060628379803e-07, "logits/chosen": -1.980413794517517, "logits/rejected": -1.9263309240341187, "logps/chosen": -236.85830688476562, "logps/rejected": -333.9463195800781, "loss": 0.504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7986764907836914, "rewards/margins": 1.0236331224441528, "rewards/rejected": -2.8223094940185547, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 40.41703414916992, "learning_rate": 3.4743922798816107e-07, "logits/chosen": -1.9418426752090454, "logits/rejected": -1.894049882888794, "logps/chosen": -221.6752471923828, "logps/rejected": -317.5902099609375, "loss": 0.5033, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.644579529762268, "rewards/margins": 0.9925468564033508, "rewards/rejected": -2.6371264457702637, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 19.41632080078125, "learning_rate": 3.469774580449495e-07, "logits/chosen": -1.9536221027374268, "logits/rejected": -1.882176160812378, "logps/chosen": -236.449951171875, "logps/rejected": -355.5335693359375, "loss": 0.4375, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7750402688980103, "rewards/margins": 1.284414529800415, "rewards/rejected": -3.059454917907715, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 45.97262191772461, "learning_rate": 3.46515298310203e-07, "logits/chosen": -1.9079723358154297, "logits/rejected": -1.858176589012146, "logps/chosen": -255.7857208251953, "logps/rejected": -343.4252014160156, "loss": 0.5477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0255866050720215, "rewards/margins": 0.9314759373664856, "rewards/rejected": -2.9570624828338623, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -1.9845290184020996, "eval_logits/rejected": -1.9621840715408325, "eval_logps/chosen": -237.4424591064453, "eval_logps/rejected": -279.5165100097656, "eval_loss": 0.641257643699646, "eval_rewards/accuracies": 0.6498606204986572, "eval_rewards/chosen": -1.7842696905136108, "eval_rewards/margins": 0.383399099111557, "eval_rewards/rejected": -2.1676690578460693, "eval_runtime": 361.983, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 23.437707901000977, "learning_rate": 3.4605275064152817e-07, "logits/chosen": -1.9151580333709717, "logits/rejected": -1.8621551990509033, "logps/chosen": -222.98532104492188, "logps/rejected": -313.58416748046875, "loss": 0.5102, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.686981439590454, "rewards/margins": 0.9570782780647278, "rewards/rejected": -2.644059658050537, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 39.289344787597656, "learning_rate": 3.4558981689809064e-07, "logits/chosen": -1.9646356105804443, "logits/rejected": -1.9170808792114258, "logps/chosen": -224.3332061767578, "logps/rejected": -304.8197937011719, "loss": 0.5237, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6550371646881104, "rewards/margins": 0.8572853207588196, "rewards/rejected": -2.512322187423706, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 20.694198608398438, "learning_rate": 3.45126498940608e-07, "logits/chosen": -1.9502004384994507, "logits/rejected": -1.919576644897461, "logps/chosen": -203.83580017089844, "logps/rejected": -291.90203857421875, "loss": 0.4953, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5308338403701782, "rewards/margins": 0.8527175784111023, "rewards/rejected": -2.383551597595215, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 42.61960983276367, "learning_rate": 3.446627986313419e-07, "logits/chosen": -1.9235782623291016, "logits/rejected": -1.865512490272522, "logps/chosen": -235.48159790039062, "logps/rejected": -333.7663269042969, "loss": 0.4866, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7704296112060547, "rewards/margins": 1.050910472869873, "rewards/rejected": -2.8213400840759277, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 29.04054832458496, "learning_rate": 3.4419871783409116e-07, "logits/chosen": -1.9340184926986694, "logits/rejected": -1.8840440511703491, "logps/chosen": -267.86932373046875, "logps/rejected": -364.5704345703125, "loss": 0.5152, "rewards/accuracies": 0.75, "rewards/chosen": -2.0856947898864746, "rewards/margins": 1.013533592224121, "rewards/rejected": -3.0992283821105957, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 22.07731819152832, "learning_rate": 3.4373425841418377e-07, "logits/chosen": -1.9651868343353271, "logits/rejected": -1.9032859802246094, "logps/chosen": -233.3546905517578, "logps/rejected": -352.4500427246094, "loss": 0.4074, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7605863809585571, "rewards/margins": 1.2233213186264038, "rewards/rejected": -2.98390793800354, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 35.345577239990234, "learning_rate": 3.4326942223846936e-07, "logits/chosen": -1.8982417583465576, "logits/rejected": -1.854189157485962, "logps/chosen": -243.8641815185547, "logps/rejected": -327.7332458496094, "loss": 0.5707, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8560680150985718, "rewards/margins": 0.8962557911872864, "rewards/rejected": -2.752324104309082, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 24.718090057373047, "learning_rate": 3.428042111753123e-07, "logits/chosen": -1.9406830072402954, "logits/rejected": -1.8926671743392944, "logps/chosen": -240.29312133789062, "logps/rejected": -332.96453857421875, "loss": 0.4779, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8838748931884766, "rewards/margins": 0.9562339782714844, "rewards/rejected": -2.840108871459961, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 23.8499698638916, "learning_rate": 3.423386270945835e-07, "logits/chosen": -2.038442850112915, "logits/rejected": -1.9699786901474, "logps/chosen": -234.3078155517578, "logps/rejected": -325.94305419921875, "loss": 0.4803, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7793443202972412, "rewards/margins": 0.9939638376235962, "rewards/rejected": -2.773308277130127, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 25.235780715942383, "learning_rate": 3.4187267186765325e-07, "logits/chosen": -1.9371519088745117, "logits/rejected": -1.8713867664337158, "logps/chosen": -247.2796630859375, "logps/rejected": -349.75408935546875, "loss": 0.4949, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9263479709625244, "rewards/margins": 1.0337719917297363, "rewards/rejected": -2.9601199626922607, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 31.90622901916504, "learning_rate": 3.414063473673835e-07, "logits/chosen": -1.9395166635513306, "logits/rejected": -1.8920812606811523, "logps/chosen": -255.0199432373047, "logps/rejected": -349.48663330078125, "loss": 0.4686, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0186798572540283, "rewards/margins": 0.9681509733200073, "rewards/rejected": -2.986830949783325, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 38.197166442871094, "learning_rate": 3.4093965546812075e-07, "logits/chosen": -1.9624826908111572, "logits/rejected": -1.9116675853729248, "logps/chosen": -258.188720703125, "logps/rejected": -343.18511962890625, "loss": 0.5343, "rewards/accuracies": 0.71875, "rewards/chosen": -2.00760817527771, "rewards/margins": 0.929057776927948, "rewards/rejected": -2.9366660118103027, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 40.80598068237305, "learning_rate": 3.4047259804568793e-07, "logits/chosen": -1.8829591274261475, "logits/rejected": -1.8316304683685303, "logps/chosen": -240.71890258789062, "logps/rejected": -337.19049072265625, "loss": 0.484, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8570854663848877, "rewards/margins": 1.0008538961410522, "rewards/rejected": -2.8579392433166504, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 27.297386169433594, "learning_rate": 3.400051769773774e-07, "logits/chosen": -1.9413639307022095, "logits/rejected": -1.8864524364471436, "logps/chosen": -248.7015380859375, "logps/rejected": -365.2894592285156, "loss": 0.4531, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9548965692520142, "rewards/margins": 1.1702516078948975, "rewards/rejected": -3.125148296356201, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 28.473569869995117, "learning_rate": 3.3953739414194293e-07, "logits/chosen": -1.9811633825302124, "logits/rejected": -1.9134953022003174, "logps/chosen": -251.4621124267578, "logps/rejected": -369.58154296875, "loss": 0.4535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.96244215965271, "rewards/margins": 1.2237776517868042, "rewards/rejected": -3.1862196922302246, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 34.905517578125, "learning_rate": 3.390692514195925e-07, "logits/chosen": -1.8766844272613525, "logits/rejected": -1.8258984088897705, "logps/chosen": -222.68734741210938, "logps/rejected": -327.47283935546875, "loss": 0.4632, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7118583917617798, "rewards/margins": 1.0548206567764282, "rewards/rejected": -2.766679048538208, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 30.522104263305664, "learning_rate": 3.386007506919808e-07, "logits/chosen": -1.9382438659667969, "logits/rejected": -1.8887494802474976, "logps/chosen": -245.9361114501953, "logps/rejected": -338.2505798339844, "loss": 0.5059, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.926809549331665, "rewards/margins": 0.937475323677063, "rewards/rejected": -2.8642847537994385, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 30.46518325805664, "learning_rate": 3.3813189384220106e-07, "logits/chosen": -1.982755422592163, "logits/rejected": -1.9379222393035889, "logps/chosen": -267.4217834472656, "logps/rejected": -381.255615234375, "loss": 0.4795, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.149242401123047, "rewards/margins": 1.1368591785430908, "rewards/rejected": -3.2861015796661377, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 35.398704528808594, "learning_rate": 3.376626827547782e-07, "logits/chosen": -1.9689769744873047, "logits/rejected": -1.916944146156311, "logps/chosen": -278.5525817871094, "logps/rejected": -407.31390380859375, "loss": 0.4193, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.264665126800537, "rewards/margins": 1.2984492778778076, "rewards/rejected": -3.563114881515503, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 34.90265655517578, "learning_rate": 3.3719311931566096e-07, "logits/chosen": -1.95291006565094, "logits/rejected": -1.9006128311157227, "logps/chosen": -283.9674072265625, "logps/rejected": -404.7523498535156, "loss": 0.4589, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2859137058258057, "rewards/margins": 1.2199745178222656, "rewards/rejected": -3.505887985229492, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 48.02193069458008, "learning_rate": 3.367232054122143e-07, "logits/chosen": -1.9438936710357666, "logits/rejected": -1.8844982385635376, "logps/chosen": -295.0854187011719, "logps/rejected": -394.09918212890625, "loss": 0.5302, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.355963706970215, "rewards/margins": 1.043371558189392, "rewards/rejected": -3.3993351459503174, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 36.68013000488281, "learning_rate": 3.362529429332117e-07, "logits/chosen": -1.9684326648712158, "logits/rejected": -1.90561842918396, "logps/chosen": -262.14691162109375, "logps/rejected": -383.6622009277344, "loss": 0.4751, "rewards/accuracies": 0.75, "rewards/chosen": -2.045532703399658, "rewards/margins": 1.2499134540557861, "rewards/rejected": -3.2954459190368652, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 43.31465148925781, "learning_rate": 3.357823337688279e-07, "logits/chosen": -1.9468940496444702, "logits/rejected": -1.8974952697753906, "logps/chosen": -234.67745971679688, "logps/rejected": -336.5506591796875, "loss": 0.5109, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8388497829437256, "rewards/margins": 1.0021735429763794, "rewards/rejected": -2.8410234451293945, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 25.465286254882812, "learning_rate": 3.35311379810631e-07, "logits/chosen": -1.9444602727890015, "logits/rejected": -1.9013820886611938, "logps/chosen": -215.7208251953125, "logps/rejected": -296.7525634765625, "loss": 0.5184, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.611196517944336, "rewards/margins": 0.8247619867324829, "rewards/rejected": -2.4359583854675293, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 33.77199935913086, "learning_rate": 3.3484008295157495e-07, "logits/chosen": -1.9006173610687256, "logits/rejected": -1.8271697759628296, "logps/chosen": -240.51025390625, "logps/rejected": -338.54010009765625, "loss": 0.4634, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.842721700668335, "rewards/margins": 1.0674257278442383, "rewards/rejected": -2.9101476669311523, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 33.142913818359375, "learning_rate": 3.343684450859922e-07, "logits/chosen": -1.9356529712677002, "logits/rejected": -1.8675906658172607, "logps/chosen": -249.0338134765625, "logps/rejected": -362.5653076171875, "loss": 0.4542, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9576822519302368, "rewards/margins": 1.1520602703094482, "rewards/rejected": -3.1097424030303955, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 42.13118362426758, "learning_rate": 3.338964681095854e-07, "logits/chosen": -1.9101474285125732, "logits/rejected": -1.8635610342025757, "logps/chosen": -268.474365234375, "logps/rejected": -371.6224060058594, "loss": 0.5157, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1169955730438232, "rewards/margins": 1.065665602684021, "rewards/rejected": -3.182661533355713, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 50.455841064453125, "learning_rate": 3.3342415391942055e-07, "logits/chosen": -1.9599437713623047, "logits/rejected": -1.919441819190979, "logps/chosen": -259.6284484863281, "logps/rejected": -341.6492614746094, "loss": 0.5595, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.056394338607788, "rewards/margins": 0.8438320159912109, "rewards/rejected": -2.90022611618042, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 27.43259048461914, "learning_rate": 3.329515044139189e-07, "logits/chosen": -1.9637091159820557, "logits/rejected": -1.9139865636825562, "logps/chosen": -229.21438598632812, "logps/rejected": -317.72772216796875, "loss": 0.5163, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7436039447784424, "rewards/margins": 0.8854995965957642, "rewards/rejected": -2.629103183746338, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 25.757503509521484, "learning_rate": 3.324785214928496e-07, "logits/chosen": -2.0457122325897217, "logits/rejected": -2.005627155303955, "logps/chosen": -197.79066467285156, "logps/rejected": -280.9956970214844, "loss": 0.5046, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4466524124145508, "rewards/margins": 0.840844452381134, "rewards/rejected": -2.287497043609619, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 41.52971649169922, "learning_rate": 3.3200520705732195e-07, "logits/chosen": -2.0292277336120605, "logits/rejected": -1.9812195301055908, "logps/chosen": -203.8176727294922, "logps/rejected": -274.96234130859375, "loss": 0.5363, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.465406060218811, "rewards/margins": 0.7549365758895874, "rewards/rejected": -2.2203423976898193, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 28.424823760986328, "learning_rate": 3.315315630097774e-07, "logits/chosen": -1.994763970375061, "logits/rejected": -1.955718755722046, "logps/chosen": -205.61148071289062, "logps/rejected": -275.96575927734375, "loss": 0.528, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4938069581985474, "rewards/margins": 0.7569162845611572, "rewards/rejected": -2.250723123550415, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 32.62846374511719, "learning_rate": 3.3105759125398283e-07, "logits/chosen": -2.0215632915496826, "logits/rejected": -1.96671462059021, "logps/chosen": -212.2326202392578, "logps/rejected": -303.41668701171875, "loss": 0.446, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5316672325134277, "rewards/margins": 0.9840360879898071, "rewards/rejected": -2.5157036781311035, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 46.17294692993164, "learning_rate": 3.305832936950217e-07, "logits/chosen": -1.9514837265014648, "logits/rejected": -1.90081787109375, "logps/chosen": -229.9663848876953, "logps/rejected": -326.1025085449219, "loss": 0.5005, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7080886363983154, "rewards/margins": 1.0084588527679443, "rewards/rejected": -2.7165474891662598, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 26.849809646606445, "learning_rate": 3.301086722392873e-07, "logits/chosen": -1.9272445440292358, "logits/rejected": -1.8954582214355469, "logps/chosen": -208.8096160888672, "logps/rejected": -298.6540222167969, "loss": 0.5193, "rewards/accuracies": 0.75, "rewards/chosen": -1.5671939849853516, "rewards/margins": 0.8979303240776062, "rewards/rejected": -2.4651246070861816, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 30.55245590209961, "learning_rate": 3.2963372879447497e-07, "logits/chosen": -1.946054458618164, "logits/rejected": -1.8961801528930664, "logps/chosen": -209.4328155517578, "logps/rejected": -299.6525573730469, "loss": 0.4808, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5855239629745483, "rewards/margins": 0.8956725001335144, "rewards/rejected": -2.481196641921997, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 24.744123458862305, "learning_rate": 3.291584652695739e-07, "logits/chosen": -1.9824409484863281, "logits/rejected": -1.9361375570297241, "logps/chosen": -226.3142852783203, "logps/rejected": -316.39898681640625, "loss": 0.4787, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6986534595489502, "rewards/margins": 0.9493204951286316, "rewards/rejected": -2.6479740142822266, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 55.92538070678711, "learning_rate": 3.2868288357486e-07, "logits/chosen": -1.971534013748169, "logits/rejected": -1.928073525428772, "logps/chosen": -249.99526977539062, "logps/rejected": -350.9532775878906, "loss": 0.4781, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9198601245880127, "rewards/margins": 1.036308765411377, "rewards/rejected": -2.9561691284179688, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 36.98202133178711, "learning_rate": 3.28206985621888e-07, "logits/chosen": -1.9411964416503906, "logits/rejected": -1.8911597728729248, "logps/chosen": -261.72589111328125, "logps/rejected": -366.69390869140625, "loss": 0.4816, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.103363037109375, "rewards/margins": 1.0494035482406616, "rewards/rejected": -3.152766704559326, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 34.65818786621094, "learning_rate": 3.277307733234839e-07, "logits/chosen": -1.8554356098175049, "logits/rejected": -1.8017200231552124, "logps/chosen": -245.2360382080078, "logps/rejected": -360.3771667480469, "loss": 0.4423, "rewards/accuracies": 0.8125, "rewards/chosen": -1.892396330833435, "rewards/margins": 1.1603657007217407, "rewards/rejected": -3.0527617931365967, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -1.9468685388565063, "eval_logits/rejected": -1.926641821861267, "eval_logps/chosen": -259.041748046875, "eval_logps/rejected": -298.94793701171875, "eval_loss": 0.6528458595275879, "eval_rewards/accuracies": 0.6414963006973267, "eval_rewards/chosen": -2.00026273727417, "eval_rewards/margins": 0.3617205023765564, "eval_rewards/rejected": -2.361983060836792, "eval_runtime": 361.8074, "eval_samples_per_second": 11.896, "eval_steps_per_second": 1.487, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 51.32448196411133, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.8827146291732788, "logits/rejected": -1.8168216943740845, "logps/chosen": -264.81005859375, "logps/rejected": -365.46795654296875, "loss": 0.5471, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.067554235458374, "rewards/margins": 1.075744390487671, "rewards/rejected": -3.143298625946045, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 38.838199615478516, "learning_rate": 3.2677741334799227e-07, "logits/chosen": -1.9274343252182007, "logits/rejected": -1.8637018203735352, "logps/chosen": -257.0386047363281, "logps/rejected": -358.05413818359375, "loss": 0.4669, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9630534648895264, "rewards/margins": 1.0785964727401733, "rewards/rejected": -3.0416502952575684, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 32.4897575378418, "learning_rate": 3.2630026950284315e-07, "logits/chosen": -1.9393823146820068, "logits/rejected": -1.8899962902069092, "logps/chosen": -213.46646118164062, "logps/rejected": -325.04205322265625, "loss": 0.4747, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6184823513031006, "rewards/margins": 1.105190634727478, "rewards/rejected": -2.7236733436584473, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 28.223188400268555, "learning_rate": 3.258228189761234e-07, "logits/chosen": -1.9332107305526733, "logits/rejected": -1.8654142618179321, "logps/chosen": -223.5410614013672, "logps/rejected": -345.335205078125, "loss": 0.4074, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6788692474365234, "rewards/margins": 1.2674894332885742, "rewards/rejected": -2.9463586807250977, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 25.61373519897461, "learning_rate": 3.253450636868992e-07, "logits/chosen": -1.938897728919983, "logits/rejected": -1.8898561000823975, "logps/chosen": -223.5132598876953, "logps/rejected": -332.8970642089844, "loss": 0.4794, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6774787902832031, "rewards/margins": 1.0993845462799072, "rewards/rejected": -2.7768633365631104, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 31.50178337097168, "learning_rate": 3.2486700555546193e-07, "logits/chosen": -1.9820287227630615, "logits/rejected": -1.9291832447052002, "logps/chosen": -238.9545440673828, "logps/rejected": -325.25689697265625, "loss": 0.5465, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8248071670532227, "rewards/margins": 0.9315603971481323, "rewards/rejected": -2.7563679218292236, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 36.30656433105469, "learning_rate": 3.2438864650331997e-07, "logits/chosen": -1.9192146062850952, "logits/rejected": -1.8664567470550537, "logps/chosen": -231.16305541992188, "logps/rejected": -312.89520263671875, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7699248790740967, "rewards/margins": 0.8697635531425476, "rewards/rejected": -2.63968825340271, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 38.666160583496094, "learning_rate": 3.2390998845319164e-07, "logits/chosen": -1.938584566116333, "logits/rejected": -1.8892152309417725, "logps/chosen": -223.812744140625, "logps/rejected": -338.17620849609375, "loss": 0.4581, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6995760202407837, "rewards/margins": 1.1107227802276611, "rewards/rejected": -2.810298442840576, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 24.277746200561523, "learning_rate": 3.2343103332899635e-07, "logits/chosen": -1.8694385290145874, "logits/rejected": -1.8096065521240234, "logps/chosen": -257.20196533203125, "logps/rejected": -372.5042419433594, "loss": 0.4719, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.029611110687256, "rewards/margins": 1.1915353536605835, "rewards/rejected": -3.22114634513855, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 44.1191291809082, "learning_rate": 3.2295178305584835e-07, "logits/chosen": -1.8937492370605469, "logits/rejected": -1.8587541580200195, "logps/chosen": -291.1720886230469, "logps/rejected": -380.21514892578125, "loss": 0.5423, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.356750011444092, "rewards/margins": 0.8717220425605774, "rewards/rejected": -3.2284717559814453, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 47.089210510253906, "learning_rate": 3.2247223956004783e-07, "logits/chosen": -1.8573442697525024, "logits/rejected": -1.800794243812561, "logps/chosen": -279.50762939453125, "logps/rejected": -376.6075744628906, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": -2.227139949798584, "rewards/margins": 1.0390504598617554, "rewards/rejected": -3.26619029045105, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 44.62866973876953, "learning_rate": 3.2199240476907354e-07, "logits/chosen": -1.9026877880096436, "logits/rejected": -1.84988272190094, "logps/chosen": -251.6893768310547, "logps/rejected": -340.9601745605469, "loss": 0.5036, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.958630919456482, "rewards/margins": 0.9312818646430969, "rewards/rejected": -2.8899126052856445, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 39.25453186035156, "learning_rate": 3.215122806115751e-07, "logits/chosen": -1.9493757486343384, "logits/rejected": -1.9110513925552368, "logps/chosen": -230.50894165039062, "logps/rejected": -309.252685546875, "loss": 0.5066, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7483036518096924, "rewards/margins": 0.8471187353134155, "rewards/rejected": -2.5954222679138184, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 21.498291015625, "learning_rate": 3.210318690173652e-07, "logits/chosen": -1.9303830862045288, "logits/rejected": -1.8804740905761719, "logps/chosen": -226.16000366210938, "logps/rejected": -326.58172607421875, "loss": 0.4846, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7239042520523071, "rewards/margins": 1.0192549228668213, "rewards/rejected": -2.743159294128418, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 21.030715942382812, "learning_rate": 3.2055117191741197e-07, "logits/chosen": -1.9703865051269531, "logits/rejected": -1.92462956905365, "logps/chosen": -235.7099151611328, "logps/rejected": -318.791748046875, "loss": 0.557, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8229669332504272, "rewards/margins": 0.8424266576766968, "rewards/rejected": -2.665393352508545, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 28.582143783569336, "learning_rate": 3.20070191243831e-07, "logits/chosen": -1.941114068031311, "logits/rejected": -1.884535551071167, "logps/chosen": -208.457275390625, "logps/rejected": -326.6148376464844, "loss": 0.4058, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5205273628234863, "rewards/margins": 1.2360708713531494, "rewards/rejected": -2.7565979957580566, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 38.59527587890625, "learning_rate": 3.1958892892987774e-07, "logits/chosen": -1.909999132156372, "logits/rejected": -1.8523762226104736, "logps/chosen": -257.6002197265625, "logps/rejected": -342.0556335449219, "loss": 0.4986, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9753692150115967, "rewards/margins": 0.9335805177688599, "rewards/rejected": -2.908949613571167, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 42.686344146728516, "learning_rate": 3.191073869099395e-07, "logits/chosen": -1.8773012161254883, "logits/rejected": -1.824954628944397, "logps/chosen": -297.3863525390625, "logps/rejected": -407.413330078125, "loss": 0.4771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3980631828308105, "rewards/margins": 1.1302452087402344, "rewards/rejected": -3.528308868408203, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 48.320560455322266, "learning_rate": 3.1862556711952805e-07, "logits/chosen": -1.797096848487854, "logits/rejected": -1.7423614263534546, "logps/chosen": -303.3244934082031, "logps/rejected": -412.1568298339844, "loss": 0.4977, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.4393374919891357, "rewards/margins": 1.1471078395843506, "rewards/rejected": -3.5864453315734863, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 49.76549530029297, "learning_rate": 3.1814347149527155e-07, "logits/chosen": -1.8736263513565063, "logits/rejected": -1.8312642574310303, "logps/chosen": -291.61724853515625, "logps/rejected": -382.6716003417969, "loss": 0.5022, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.357804775238037, "rewards/margins": 0.9562328457832336, "rewards/rejected": -3.314037322998047, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 26.661985397338867, "learning_rate": 3.1766110197490673e-07, "logits/chosen": -1.8359260559082031, "logits/rejected": -1.7866836786270142, "logps/chosen": -243.22848510742188, "logps/rejected": -342.04290771484375, "loss": 0.4951, "rewards/accuracies": 0.75, "rewards/chosen": -1.8702657222747803, "rewards/margins": 0.9947149157524109, "rewards/rejected": -2.864980459213257, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 39.83286666870117, "learning_rate": 3.171784604972716e-07, "logits/chosen": -1.9040403366088867, "logits/rejected": -1.8404487371444702, "logps/chosen": -241.09646606445312, "logps/rejected": -345.05181884765625, "loss": 0.4826, "rewards/accuracies": 0.75, "rewards/chosen": -1.879390001296997, "rewards/margins": 1.048660397529602, "rewards/rejected": -2.9280500411987305, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 25.753490447998047, "learning_rate": 3.166955490022966e-07, "logits/chosen": -1.8975028991699219, "logits/rejected": -1.8578437566757202, "logps/chosen": -263.56256103515625, "logps/rejected": -360.471923828125, "loss": 0.4844, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.068711996078491, "rewards/margins": 1.0323107242584229, "rewards/rejected": -3.101022720336914, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 40.954612731933594, "learning_rate": 3.1621236943099833e-07, "logits/chosen": -1.9528751373291016, "logits/rejected": -1.8952739238739014, "logps/chosen": -279.40692138671875, "logps/rejected": -376.708984375, "loss": 0.4795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.224416494369507, "rewards/margins": 1.0324606895446777, "rewards/rejected": -3.2568774223327637, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 37.160858154296875, "learning_rate": 3.157289237254701e-07, "logits/chosen": -1.942457914352417, "logits/rejected": -1.8984220027923584, "logps/chosen": -254.0677490234375, "logps/rejected": -364.9516296386719, "loss": 0.4463, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9464086294174194, "rewards/margins": 1.161780595779419, "rewards/rejected": -3.108189344406128, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 33.47797775268555, "learning_rate": 3.152452138288755e-07, "logits/chosen": -2.006679058074951, "logits/rejected": -1.952954888343811, "logps/chosen": -234.88174438476562, "logps/rejected": -341.0386962890625, "loss": 0.4519, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7679574489593506, "rewards/margins": 1.0905119180679321, "rewards/rejected": -2.8584694862365723, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 32.248172760009766, "learning_rate": 3.147612416854396e-07, "logits/chosen": -1.997107744216919, "logits/rejected": -1.9523258209228516, "logps/chosen": -243.0508270263672, "logps/rejected": -343.28216552734375, "loss": 0.4869, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8901599645614624, "rewards/margins": 1.0022904872894287, "rewards/rejected": -2.8924500942230225, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 37.839481353759766, "learning_rate": 3.142770092404418e-07, "logits/chosen": -1.9262062311172485, "logits/rejected": -1.883358359336853, "logps/chosen": -269.64410400390625, "logps/rejected": -363.852294921875, "loss": 0.5168, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.166213274002075, "rewards/margins": 0.9534326791763306, "rewards/rejected": -3.119645833969116, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 28.519514083862305, "learning_rate": 3.137925184402078e-07, "logits/chosen": -1.8984733819961548, "logits/rejected": -1.8326194286346436, "logps/chosen": -261.3946533203125, "logps/rejected": -347.3902893066406, "loss": 0.493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0072038173675537, "rewards/margins": 0.9321564435958862, "rewards/rejected": -2.9393606185913086, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 22.495311737060547, "learning_rate": 3.133077712321015e-07, "logits/chosen": -2.011319637298584, "logits/rejected": -1.9586061239242554, "logps/chosen": -280.8663024902344, "logps/rejected": -390.22979736328125, "loss": 0.5145, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2154147624969482, "rewards/margins": 1.153434157371521, "rewards/rejected": -3.368849277496338, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 41.687740325927734, "learning_rate": 3.128227695645176e-07, "logits/chosen": -1.9000619649887085, "logits/rejected": -1.8470157384872437, "logps/chosen": -265.81842041015625, "logps/rejected": -355.65008544921875, "loss": 0.511, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0835185050964355, "rewards/margins": 0.9701333045959473, "rewards/rejected": -3.0536513328552246, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 32.66810607910156, "learning_rate": 3.123375153868734e-07, "logits/chosen": -1.9251701831817627, "logits/rejected": -1.8821756839752197, "logps/chosen": -235.2640838623047, "logps/rejected": -346.80523681640625, "loss": 0.4553, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8160892724990845, "rewards/margins": 1.1172221899032593, "rewards/rejected": -2.9333114624023438, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 27.470369338989258, "learning_rate": 3.118520106496014e-07, "logits/chosen": -1.9838006496429443, "logits/rejected": -1.944800615310669, "logps/chosen": -235.54721069335938, "logps/rejected": -327.46575927734375, "loss": 0.5109, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8601970672607422, "rewards/margins": 0.8868915438652039, "rewards/rejected": -2.747088670730591, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 30.947973251342773, "learning_rate": 3.1136625730414083e-07, "logits/chosen": -2.0482122898101807, "logits/rejected": -2.0036213397979736, "logps/chosen": -223.91909790039062, "logps/rejected": -313.14044189453125, "loss": 0.5088, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6807992458343506, "rewards/margins": 0.9355287551879883, "rewards/rejected": -2.616328239440918, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 28.19830322265625, "learning_rate": 3.1088025730293055e-07, "logits/chosen": -1.9159135818481445, "logits/rejected": -1.8705981969833374, "logps/chosen": -227.56298828125, "logps/rejected": -309.39422607421875, "loss": 0.5067, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7284634113311768, "rewards/margins": 0.8446298837661743, "rewards/rejected": -2.5730934143066406, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 23.838239669799805, "learning_rate": 3.103940125994007e-07, "logits/chosen": -1.8981670141220093, "logits/rejected": -1.83480966091156, "logps/chosen": -220.01296997070312, "logps/rejected": -323.8902587890625, "loss": 0.451, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6390295028686523, "rewards/margins": 1.0848989486694336, "rewards/rejected": -2.723928451538086, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 39.21136474609375, "learning_rate": 3.099075251479648e-07, "logits/chosen": -1.8907877206802368, "logits/rejected": -1.8405523300170898, "logps/chosen": -266.4438171386719, "logps/rejected": -387.4299621582031, "loss": 0.4701, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1438050270080566, "rewards/margins": 1.2050453424453735, "rewards/rejected": -3.3488502502441406, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 29.429744720458984, "learning_rate": 3.094207969040123e-07, "logits/chosen": -1.9305824041366577, "logits/rejected": -1.8763229846954346, "logps/chosen": -272.27606201171875, "logps/rejected": -385.3312683105469, "loss": 0.4912, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.195239305496216, "rewards/margins": 1.1247823238372803, "rewards/rejected": -3.320021867752075, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 36.40107727050781, "learning_rate": 3.089338298239004e-07, "logits/chosen": -1.9637582302093506, "logits/rejected": -1.8982025384902954, "logps/chosen": -266.0154724121094, "logps/rejected": -401.3577575683594, "loss": 0.4135, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0868000984191895, "rewards/margins": 1.3735166788101196, "rewards/rejected": -3.4603168964385986, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 26.601699829101562, "learning_rate": 3.084466258649463e-07, "logits/chosen": -1.963487982749939, "logits/rejected": -1.9118248224258423, "logps/chosen": -236.89199829101562, "logps/rejected": -347.1706848144531, "loss": 0.4668, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8595539331436157, "rewards/margins": 1.109204888343811, "rewards/rejected": -2.968759059906006, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -2.0027456283569336, "eval_logits/rejected": -1.9825429916381836, "eval_logps/chosen": -243.06837463378906, "eval_logps/rejected": -280.9324951171875, "eval_loss": 0.6515441536903381, "eval_rewards/accuracies": 0.6403345465660095, "eval_rewards/chosen": -1.8405290842056274, "eval_rewards/margins": 0.3412999212741852, "eval_rewards/rejected": -2.1818289756774902, "eval_runtime": 362.0254, "eval_samples_per_second": 11.889, "eval_steps_per_second": 1.486, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 33.29566955566406, "learning_rate": 3.079591869854193e-07, "logits/chosen": -1.958396553993225, "logits/rejected": -1.9015048742294312, "logps/chosen": -238.5832977294922, "logps/rejected": -347.30206298828125, "loss": 0.4723, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.797162413597107, "rewards/margins": 1.1736029386520386, "rewards/rejected": -2.9707655906677246, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 36.68778610229492, "learning_rate": 3.074715151445329e-07, "logits/chosen": -1.8942447900772095, "logits/rejected": -1.8410489559173584, "logps/chosen": -213.4732666015625, "logps/rejected": -318.49102783203125, "loss": 0.4469, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.610137701034546, "rewards/margins": 1.0688587427139282, "rewards/rejected": -2.6789963245391846, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 33.62598419189453, "learning_rate": 3.0698361230243707e-07, "logits/chosen": -1.9546492099761963, "logits/rejected": -1.9132808446884155, "logps/chosen": -225.9335479736328, "logps/rejected": -317.28619384765625, "loss": 0.5336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7155460119247437, "rewards/margins": 0.8944891691207886, "rewards/rejected": -2.6100354194641113, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 28.892169952392578, "learning_rate": 3.0649548042021015e-07, "logits/chosen": -1.9583556652069092, "logits/rejected": -1.9160188436508179, "logps/chosen": -231.806884765625, "logps/rejected": -323.4978942871094, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -1.7564489841461182, "rewards/margins": 0.9758768081665039, "rewards/rejected": -2.732325792312622, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 38.77963638305664, "learning_rate": 3.060071214598512e-07, "logits/chosen": -1.8522148132324219, "logits/rejected": -1.8074191808700562, "logps/chosen": -252.95999145507812, "logps/rejected": -369.5479736328125, "loss": 0.4727, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0423831939697266, "rewards/margins": 1.1307865381240845, "rewards/rejected": -3.1731698513031006, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 23.918777465820312, "learning_rate": 3.0551853738427183e-07, "logits/chosen": -1.9128358364105225, "logits/rejected": -1.8620100021362305, "logps/chosen": -273.38916015625, "logps/rejected": -378.34210205078125, "loss": 0.4576, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1986680030822754, "rewards/margins": 1.0549832582473755, "rewards/rejected": -3.2536513805389404, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 45.33697509765625, "learning_rate": 3.050297301572887e-07, "logits/chosen": -1.9258848428726196, "logits/rejected": -1.8762986660003662, "logps/chosen": -277.52752685546875, "logps/rejected": -378.4075622558594, "loss": 0.5364, "rewards/accuracies": 0.75, "rewards/chosen": -2.21083402633667, "rewards/margins": 1.0525271892547607, "rewards/rejected": -3.2633614540100098, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 38.8912239074707, "learning_rate": 3.045407017436153e-07, "logits/chosen": -1.9580576419830322, "logits/rejected": -1.9021764993667603, "logps/chosen": -223.31008911132812, "logps/rejected": -322.2271728515625, "loss": 0.5106, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6913293600082397, "rewards/margins": 1.0228065252304077, "rewards/rejected": -2.7141356468200684, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 33.16299057006836, "learning_rate": 3.04051454108854e-07, "logits/chosen": -2.060778856277466, "logits/rejected": -2.0190467834472656, "logps/chosen": -195.30868530273438, "logps/rejected": -294.2635498046875, "loss": 0.4677, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4011390209197998, "rewards/margins": 0.9798418879508972, "rewards/rejected": -2.380980968475342, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 35.084632873535156, "learning_rate": 3.035619892194886e-07, "logits/chosen": -1.9555803537368774, "logits/rejected": -1.9094107151031494, "logps/chosen": -214.4794921875, "logps/rejected": -313.01776123046875, "loss": 0.4746, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5911809206008911, "rewards/margins": 1.014812707901001, "rewards/rejected": -2.6059937477111816, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 37.75786209106445, "learning_rate": 3.0307230904287605e-07, "logits/chosen": -1.9321568012237549, "logits/rejected": -1.884227991104126, "logps/chosen": -246.20437622070312, "logps/rejected": -343.88433837890625, "loss": 0.4838, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8712425231933594, "rewards/margins": 1.0584646463394165, "rewards/rejected": -2.9297070503234863, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 33.38050079345703, "learning_rate": 3.025824155472383e-07, "logits/chosen": -1.908262848854065, "logits/rejected": -1.8664461374282837, "logps/chosen": -248.241455078125, "logps/rejected": -337.4832763671875, "loss": 0.497, "rewards/accuracies": 0.75, "rewards/chosen": -1.9352061748504639, "rewards/margins": 0.9604357481002808, "rewards/rejected": -2.895641803741455, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 21.825809478759766, "learning_rate": 3.020923107016552e-07, "logits/chosen": -1.9126112461090088, "logits/rejected": -1.865881323814392, "logps/chosen": -228.7753143310547, "logps/rejected": -316.41473388671875, "loss": 0.4978, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7342201471328735, "rewards/margins": 0.927161693572998, "rewards/rejected": -2.661381483078003, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 26.49464988708496, "learning_rate": 3.016019964760559e-07, "logits/chosen": -2.0309338569641113, "logits/rejected": -1.991786241531372, "logps/chosen": -219.93154907226562, "logps/rejected": -301.6743469238281, "loss": 0.539, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.642358422279358, "rewards/margins": 0.8332676887512207, "rewards/rejected": -2.475625991821289, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 28.88072967529297, "learning_rate": 3.01111474841211e-07, "logits/chosen": -1.980002999305725, "logits/rejected": -1.9248607158660889, "logps/chosen": -221.1356201171875, "logps/rejected": -328.1407470703125, "loss": 0.4482, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6576719284057617, "rewards/margins": 1.0753799676895142, "rewards/rejected": -2.7330517768859863, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 29.78912925720215, "learning_rate": 3.00620747768725e-07, "logits/chosen": -2.0209805965423584, "logits/rejected": -1.9600439071655273, "logps/chosen": -245.6587371826172, "logps/rejected": -358.68951416015625, "loss": 0.453, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9083433151245117, "rewards/margins": 1.1612635850906372, "rewards/rejected": -3.0696067810058594, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 35.94548034667969, "learning_rate": 3.001298172310278e-07, "logits/chosen": -1.911577582359314, "logits/rejected": -1.8700872659683228, "logps/chosen": -262.2509765625, "logps/rejected": -355.0532531738281, "loss": 0.5016, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0718178749084473, "rewards/margins": 0.9666481018066406, "rewards/rejected": -3.038465738296509, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 33.60352325439453, "learning_rate": 2.9963868520136763e-07, "logits/chosen": -1.9891901016235352, "logits/rejected": -1.9395248889923096, "logps/chosen": -242.00247192382812, "logps/rejected": -350.08734130859375, "loss": 0.4492, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8663666248321533, "rewards/margins": 1.0947659015655518, "rewards/rejected": -2.961132526397705, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 30.410587310791016, "learning_rate": 2.991473536538021e-07, "logits/chosen": -1.961456298828125, "logits/rejected": -1.9023650884628296, "logps/chosen": -250.53085327148438, "logps/rejected": -353.6197509765625, "loss": 0.4387, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9188801050186157, "rewards/margins": 1.0910704135894775, "rewards/rejected": -3.0099501609802246, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 32.453468322753906, "learning_rate": 2.9865582456319093e-07, "logits/chosen": -1.942993402481079, "logits/rejected": -1.904200792312622, "logps/chosen": -272.5259704589844, "logps/rejected": -355.6471252441406, "loss": 0.5387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.184065580368042, "rewards/margins": 0.8626424670219421, "rewards/rejected": -3.046708106994629, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 36.14287567138672, "learning_rate": 2.981640999051879e-07, "logits/chosen": -1.9518852233886719, "logits/rejected": -1.903838872909546, "logps/chosen": -261.6189270019531, "logps/rejected": -368.6334533691406, "loss": 0.488, "rewards/accuracies": 0.75, "rewards/chosen": -2.062203884124756, "rewards/margins": 1.1216530799865723, "rewards/rejected": -3.183856248855591, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 27.295005798339844, "learning_rate": 2.976721816562329e-07, "logits/chosen": -1.9210399389266968, "logits/rejected": -1.8921953439712524, "logps/chosen": -241.91757202148438, "logps/rejected": -323.5142517089844, "loss": 0.507, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8604011535644531, "rewards/margins": 0.87261563539505, "rewards/rejected": -2.7330167293548584, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 28.899320602416992, "learning_rate": 2.9718007179354394e-07, "logits/chosen": -1.9313371181488037, "logits/rejected": -1.8836867809295654, "logps/chosen": -219.7310028076172, "logps/rejected": -328.97735595703125, "loss": 0.4448, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6648895740509033, "rewards/margins": 1.1079365015029907, "rewards/rejected": -2.7728257179260254, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 24.424346923828125, "learning_rate": 2.9668777229510883e-07, "logits/chosen": -2.030834197998047, "logits/rejected": -1.9829210042953491, "logps/chosen": -259.44525146484375, "logps/rejected": -346.0587463378906, "loss": 0.5431, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0256335735321045, "rewards/margins": 0.9163734316825867, "rewards/rejected": -2.942007064819336, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 47.041019439697266, "learning_rate": 2.961952851396782e-07, "logits/chosen": -1.9653571844100952, "logits/rejected": -1.9116747379302979, "logps/chosen": -246.31103515625, "logps/rejected": -336.708740234375, "loss": 0.5361, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.903097152709961, "rewards/margins": 0.9307646751403809, "rewards/rejected": -2.833862066268921, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 31.659198760986328, "learning_rate": 2.9570261230675635e-07, "logits/chosen": -1.9751007556915283, "logits/rejected": -1.9284700155258179, "logps/chosen": -251.59719848632812, "logps/rejected": -353.7413635253906, "loss": 0.5044, "rewards/accuracies": 0.78125, "rewards/chosen": -1.969775915145874, "rewards/margins": 1.056877851486206, "rewards/rejected": -3.02665376663208, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 27.148176193237305, "learning_rate": 2.9520975577659455e-07, "logits/chosen": -2.05680513381958, "logits/rejected": -2.005464553833008, "logps/chosen": -230.9302215576172, "logps/rejected": -332.4646911621094, "loss": 0.5001, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7806917428970337, "rewards/margins": 1.025880217552185, "rewards/rejected": -2.8065719604492188, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 43.044918060302734, "learning_rate": 2.9471671753018183e-07, "logits/chosen": -1.9811875820159912, "logits/rejected": -1.950135588645935, "logps/chosen": -236.6911163330078, "logps/rejected": -316.8743591308594, "loss": 0.5493, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.845965027809143, "rewards/margins": 0.7946556806564331, "rewards/rejected": -2.640620470046997, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 33.43032455444336, "learning_rate": 2.9422349954923785e-07, "logits/chosen": -1.9930521249771118, "logits/rejected": -1.9426205158233643, "logps/chosen": -239.96011352539062, "logps/rejected": -324.61126708984375, "loss": 0.5157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8383514881134033, "rewards/margins": 0.9339269399642944, "rewards/rejected": -2.772278308868408, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 22.333784103393555, "learning_rate": 2.937301038162048e-07, "logits/chosen": -2.0095386505126953, "logits/rejected": -1.9630706310272217, "logps/chosen": -245.417236328125, "logps/rejected": -322.0905456542969, "loss": 0.5211, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8827292919158936, "rewards/margins": 0.8183779716491699, "rewards/rejected": -2.7011072635650635, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 33.75700759887695, "learning_rate": 2.932365323142391e-07, "logits/chosen": -1.9597835540771484, "logits/rejected": -1.9223884344100952, "logps/chosen": -229.0197296142578, "logps/rejected": -302.28509521484375, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7502546310424805, "rewards/margins": 0.7415592074394226, "rewards/rejected": -2.491813898086548, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 36.968040466308594, "learning_rate": 2.927427870272039e-07, "logits/chosen": -2.0005135536193848, "logits/rejected": -1.94069504737854, "logps/chosen": -237.36911010742188, "logps/rejected": -337.5896301269531, "loss": 0.4354, "rewards/accuracies": 0.8125, "rewards/chosen": -1.802830696105957, "rewards/margins": 1.0337858200073242, "rewards/rejected": -2.8366165161132812, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 40.525657653808594, "learning_rate": 2.922488699396605e-07, "logits/chosen": -1.9717642068862915, "logits/rejected": -1.914671540260315, "logps/chosen": -272.8725891113281, "logps/rejected": -380.7918395996094, "loss": 0.5037, "rewards/accuracies": 0.75, "rewards/chosen": -2.1939022541046143, "rewards/margins": 1.099668264389038, "rewards/rejected": -3.2935702800750732, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 42.64216995239258, "learning_rate": 2.9175478303686143e-07, "logits/chosen": -1.9897191524505615, "logits/rejected": -1.9484837055206299, "logps/chosen": -277.7720031738281, "logps/rejected": -398.8113708496094, "loss": 0.4796, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.243814468383789, "rewards/margins": 1.2226290702819824, "rewards/rejected": -3.4664435386657715, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 39.720890045166016, "learning_rate": 2.9126052830474084e-07, "logits/chosen": -1.9898649454116821, "logits/rejected": -1.940707802772522, "logps/chosen": -265.99029541015625, "logps/rejected": -376.7161560058594, "loss": 0.4602, "rewards/accuracies": 0.8125, "rewards/chosen": -2.114291191101074, "rewards/margins": 1.1316261291503906, "rewards/rejected": -3.245917558670044, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 23.796550750732422, "learning_rate": 2.9076610772990816e-07, "logits/chosen": -1.9658838510513306, "logits/rejected": -1.9365953207015991, "logps/chosen": -255.6982879638672, "logps/rejected": -340.27783203125, "loss": 0.5444, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.030052661895752, "rewards/margins": 0.8299967050552368, "rewards/rejected": -2.8600494861602783, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 28.302946090698242, "learning_rate": 2.9027152329963916e-07, "logits/chosen": -2.00508713722229, "logits/rejected": -1.9621906280517578, "logps/chosen": -232.0532684326172, "logps/rejected": -329.3794250488281, "loss": 0.4733, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7616924047470093, "rewards/margins": 1.038021206855774, "rewards/rejected": -2.799713611602783, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 28.905548095703125, "learning_rate": 2.897767770018682e-07, "logits/chosen": -2.050926923751831, "logits/rejected": -2.0135295391082764, "logps/chosen": -239.0745849609375, "logps/rejected": -332.68048095703125, "loss": 0.4859, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8889503479003906, "rewards/margins": 0.912859320640564, "rewards/rejected": -2.8018100261688232, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 38.8216552734375, "learning_rate": 2.8928187082518036e-07, "logits/chosen": -2.030805826187134, "logits/rejected": -1.9798908233642578, "logps/chosen": -243.2490997314453, "logps/rejected": -342.73138427734375, "loss": 0.4791, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.870365858078003, "rewards/margins": 1.0457369089126587, "rewards/rejected": -2.916102647781372, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 27.284061431884766, "learning_rate": 2.887868067588031e-07, "logits/chosen": -1.9725666046142578, "logits/rejected": -1.923872709274292, "logps/chosen": -252.4422149658203, "logps/rejected": -341.8983459472656, "loss": 0.509, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9614473581314087, "rewards/margins": 0.9417362213134766, "rewards/rejected": -2.9031834602355957, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -2.042152166366577, "eval_logits/rejected": -2.022376537322998, "eval_logps/chosen": -254.48277282714844, "eval_logps/rejected": -294.4091491699219, "eval_loss": 0.6471243500709534, "eval_rewards/accuracies": 0.6424256563186646, "eval_rewards/chosen": -1.954672932624817, "eval_rewards/margins": 0.3619226813316345, "eval_rewards/rejected": -2.3165955543518066, "eval_runtime": 361.7167, "eval_samples_per_second": 11.899, "eval_steps_per_second": 1.487, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 35.6402702331543, "learning_rate": 2.882915867925989e-07, "logits/chosen": -1.9759294986724854, "logits/rejected": -1.916739821434021, "logps/chosen": -263.79522705078125, "logps/rejected": -378.80755615234375, "loss": 0.4474, "rewards/accuracies": 0.78125, "rewards/chosen": -2.116203784942627, "rewards/margins": 1.1553723812103271, "rewards/rejected": -3.271576404571533, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 36.64787673950195, "learning_rate": 2.877962129170564e-07, "logits/chosen": -2.019723415374756, "logits/rejected": -1.9638421535491943, "logps/chosen": -262.76031494140625, "logps/rejected": -372.87237548828125, "loss": 0.4801, "rewards/accuracies": 0.8125, "rewards/chosen": -2.111617088317871, "rewards/margins": 1.127884864807129, "rewards/rejected": -3.239501953125, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 40.79579544067383, "learning_rate": 2.8730068712328325e-07, "logits/chosen": -1.894916296005249, "logits/rejected": -1.8423700332641602, "logps/chosen": -253.98953247070312, "logps/rejected": -352.9727478027344, "loss": 0.5276, "rewards/accuracies": 0.75, "rewards/chosen": -1.9858863353729248, "rewards/margins": 1.0216693878173828, "rewards/rejected": -3.0075554847717285, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 44.158042907714844, "learning_rate": 2.868050114029975e-07, "logits/chosen": -1.9646812677383423, "logits/rejected": -1.923368215560913, "logps/chosen": -238.01992797851562, "logps/rejected": -337.86712646484375, "loss": 0.489, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8446180820465088, "rewards/margins": 1.005642056465149, "rewards/rejected": -2.850260019302368, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 44.28162384033203, "learning_rate": 2.863091877485199e-07, "logits/chosen": -2.047132730484009, "logits/rejected": -1.996974229812622, "logps/chosen": -237.5460662841797, "logps/rejected": -346.73382568359375, "loss": 0.4741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8408377170562744, "rewards/margins": 1.0879124402999878, "rewards/rejected": -2.9287500381469727, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 39.02864456176758, "learning_rate": 2.858132181527657e-07, "logits/chosen": -1.952619194984436, "logits/rejected": -1.9050172567367554, "logps/chosen": -256.36676025390625, "logps/rejected": -358.66461181640625, "loss": 0.4663, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0211822986602783, "rewards/margins": 1.0468981266021729, "rewards/rejected": -3.068080425262451, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 34.92826461791992, "learning_rate": 2.8531710460923696e-07, "logits/chosen": -2.059631824493408, "logits/rejected": -1.9925981760025024, "logps/chosen": -261.72198486328125, "logps/rejected": -377.0589904785156, "loss": 0.4605, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0375783443450928, "rewards/margins": 1.1884291172027588, "rewards/rejected": -3.2260074615478516, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 39.90979766845703, "learning_rate": 2.848208491120141e-07, "logits/chosen": -1.9067188501358032, "logits/rejected": -1.8527987003326416, "logps/chosen": -270.7587890625, "logps/rejected": -378.75274658203125, "loss": 0.5282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1644182205200195, "rewards/margins": 1.1176087856292725, "rewards/rejected": -3.282027006149292, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 35.80415344238281, "learning_rate": 2.8432445365574824e-07, "logits/chosen": -1.903998613357544, "logits/rejected": -1.8490577936172485, "logps/chosen": -264.04229736328125, "logps/rejected": -371.2017822265625, "loss": 0.4384, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.08154559135437, "rewards/margins": 1.130414366722107, "rewards/rejected": -3.2119603157043457, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 36.9718017578125, "learning_rate": 2.8382792023565303e-07, "logits/chosen": -1.993935227394104, "logits/rejected": -1.9364922046661377, "logps/chosen": -258.0193176269531, "logps/rejected": -370.16644287109375, "loss": 0.4617, "rewards/accuracies": 0.75, "rewards/chosen": -2.020143985748291, "rewards/margins": 1.1633901596069336, "rewards/rejected": -3.1835341453552246, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 45.19432067871094, "learning_rate": 2.8333125084749666e-07, "logits/chosen": -2.004828691482544, "logits/rejected": -1.9631010293960571, "logps/chosen": -253.2248992919922, "logps/rejected": -346.3771667480469, "loss": 0.5155, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9616769552230835, "rewards/margins": 0.9789560437202454, "rewards/rejected": -2.9406330585479736, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 20.75895881652832, "learning_rate": 2.8283444748759376e-07, "logits/chosen": -2.092076301574707, "logits/rejected": -2.0257861614227295, "logps/chosen": -234.6975860595703, "logps/rejected": -359.0959777832031, "loss": 0.4114, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.758986473083496, "rewards/margins": 1.3185980319976807, "rewards/rejected": -3.077584743499756, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 28.16330337524414, "learning_rate": 2.823375121527975e-07, "logits/chosen": -2.0109429359436035, "logits/rejected": -1.9524452686309814, "logps/chosen": -249.6481475830078, "logps/rejected": -353.1387634277344, "loss": 0.4825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.944911241531372, "rewards/margins": 1.0881527662277222, "rewards/rejected": -3.033064365386963, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 37.03079605102539, "learning_rate": 2.818404468404916e-07, "logits/chosen": -2.0266358852386475, "logits/rejected": -1.9744689464569092, "logps/chosen": -240.50283813476562, "logps/rejected": -339.1553039550781, "loss": 0.4827, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8341026306152344, "rewards/margins": 1.003930926322937, "rewards/rejected": -2.838033437728882, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 36.03659439086914, "learning_rate": 2.813432535485819e-07, "logits/chosen": -1.960561752319336, "logits/rejected": -1.9336059093475342, "logps/chosen": -238.7672882080078, "logps/rejected": -326.0018615722656, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8381109237670898, "rewards/margins": 0.8607797622680664, "rewards/rejected": -2.6988906860351562, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 47.81509017944336, "learning_rate": 2.8084593427548915e-07, "logits/chosen": -2.023395299911499, "logits/rejected": -1.9670394659042358, "logps/chosen": -268.51361083984375, "logps/rejected": -368.26568603515625, "loss": 0.5119, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1313421726226807, "rewards/margins": 1.0451680421829224, "rewards/rejected": -3.1765103340148926, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 25.33212661743164, "learning_rate": 2.803484910201399e-07, "logits/chosen": -1.9787847995758057, "logits/rejected": -1.9118139743804932, "logps/chosen": -266.91552734375, "logps/rejected": -387.0469055175781, "loss": 0.4215, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1262621879577637, "rewards/margins": 1.2448461055755615, "rewards/rejected": -3.371108293533325, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 25.273517608642578, "learning_rate": 2.7985092578195946e-07, "logits/chosen": -2.0212745666503906, "logits/rejected": -1.9745696783065796, "logps/chosen": -293.9491271972656, "logps/rejected": -386.3792724609375, "loss": 0.5518, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3879616260528564, "rewards/margins": 0.9510875940322876, "rewards/rejected": -3.3390491008758545, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 33.955604553222656, "learning_rate": 2.793532405608633e-07, "logits/chosen": -1.9511983394622803, "logits/rejected": -1.8933565616607666, "logps/chosen": -262.6517028808594, "logps/rejected": -386.84942626953125, "loss": 0.4205, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0563275814056396, "rewards/margins": 1.2760744094848633, "rewards/rejected": -3.332401752471924, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 44.704063415527344, "learning_rate": 2.7885543735724913e-07, "logits/chosen": -2.026851177215576, "logits/rejected": -1.9747167825698853, "logps/chosen": -288.61663818359375, "logps/rejected": -395.96905517578125, "loss": 0.5091, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.300950527191162, "rewards/margins": 1.1435821056365967, "rewards/rejected": -3.444532871246338, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 24.084840774536133, "learning_rate": 2.7835751817198894e-07, "logits/chosen": -1.9841976165771484, "logits/rejected": -1.9366334676742554, "logps/chosen": -263.0357971191406, "logps/rejected": -356.3251037597656, "loss": 0.4992, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0629210472106934, "rewards/margins": 0.9795193672180176, "rewards/rejected": -3.0424399375915527, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 32.312965393066406, "learning_rate": 2.778594850064207e-07, "logits/chosen": -1.9422919750213623, "logits/rejected": -1.8865864276885986, "logps/chosen": -239.6875457763672, "logps/rejected": -365.14276123046875, "loss": 0.4291, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8395541906356812, "rewards/margins": 1.3123962879180908, "rewards/rejected": -3.1519505977630615, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 22.12751007080078, "learning_rate": 2.773613398623408e-07, "logits/chosen": -1.9673751592636108, "logits/rejected": -1.942098617553711, "logps/chosen": -246.42245483398438, "logps/rejected": -350.5736389160156, "loss": 0.4987, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9399378299713135, "rewards/margins": 1.0371708869934082, "rewards/rejected": -2.9771084785461426, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 38.82960510253906, "learning_rate": 2.768630847419955e-07, "logits/chosen": -1.9541194438934326, "logits/rejected": -1.91436767578125, "logps/chosen": -254.92724609375, "logps/rejected": -342.79559326171875, "loss": 0.5284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.035867691040039, "rewards/margins": 0.8781115412712097, "rewards/rejected": -2.9139792919158936, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 40.776729583740234, "learning_rate": 2.7636472164807327e-07, "logits/chosen": -2.0216119289398193, "logits/rejected": -1.967264175415039, "logps/chosen": -263.71466064453125, "logps/rejected": -368.890625, "loss": 0.4835, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0970542430877686, "rewards/margins": 1.0792135000228882, "rewards/rejected": -3.176267623901367, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 33.63459396362305, "learning_rate": 2.758662525836964e-07, "logits/chosen": -2.0420312881469727, "logits/rejected": -2.0015597343444824, "logps/chosen": -246.3864288330078, "logps/rejected": -327.31146240234375, "loss": 0.5176, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9084705114364624, "rewards/margins": 0.8390554189682007, "rewards/rejected": -2.747525930404663, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 36.73130416870117, "learning_rate": 2.7536767955241314e-07, "logits/chosen": -2.0111818313598633, "logits/rejected": -1.9558393955230713, "logps/chosen": -226.2626190185547, "logps/rejected": -343.33563232421875, "loss": 0.4003, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6982786655426025, "rewards/margins": 1.1944520473480225, "rewards/rejected": -2.892730712890625, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 39.36493682861328, "learning_rate": 2.7486900455818985e-07, "logits/chosen": -1.9752775430679321, "logits/rejected": -1.9355007410049438, "logps/chosen": -262.43975830078125, "logps/rejected": -363.19732666015625, "loss": 0.5294, "rewards/accuracies": 0.71875, "rewards/chosen": -2.093728542327881, "rewards/margins": 0.9876585006713867, "rewards/rejected": -3.0813870429992676, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 31.47509002685547, "learning_rate": 2.7437022960540237e-07, "logits/chosen": -1.9407882690429688, "logits/rejected": -1.8859045505523682, "logps/chosen": -286.5377502441406, "logps/rejected": -396.13983154296875, "loss": 0.4307, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.304168701171875, "rewards/margins": 1.1572160720825195, "rewards/rejected": -3.4613850116729736, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 33.036434173583984, "learning_rate": 2.7387135669882865e-07, "logits/chosen": -1.9815852642059326, "logits/rejected": -1.9367036819458008, "logps/chosen": -290.05010986328125, "logps/rejected": -368.4955749511719, "loss": 0.5747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3225066661834717, "rewards/margins": 0.8337349891662598, "rewards/rejected": -3.1562418937683105, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 32.89937973022461, "learning_rate": 2.733723878436401e-07, "logits/chosen": -1.9668214321136475, "logits/rejected": -1.9219114780426025, "logps/chosen": -253.4140167236328, "logps/rejected": -353.35577392578125, "loss": 0.4587, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.967380166053772, "rewards/margins": 1.0450024604797363, "rewards/rejected": -3.0123825073242188, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 37.931121826171875, "learning_rate": 2.7287332504539384e-07, "logits/chosen": -2.0175561904907227, "logits/rejected": -1.967853307723999, "logps/chosen": -237.3586883544922, "logps/rejected": -337.29937744140625, "loss": 0.4649, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7562859058380127, "rewards/margins": 1.0668604373931885, "rewards/rejected": -2.823146343231201, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 42.4247932434082, "learning_rate": 2.723741703100248e-07, "logits/chosen": -1.967000961303711, "logits/rejected": -1.9162113666534424, "logps/chosen": -251.71389770507812, "logps/rejected": -350.8138732910156, "loss": 0.4888, "rewards/accuracies": 0.78125, "rewards/chosen": -1.963323950767517, "rewards/margins": 1.0034139156341553, "rewards/rejected": -2.966738224029541, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 19.454328536987305, "learning_rate": 2.718749256438371e-07, "logits/chosen": -1.999427080154419, "logits/rejected": -1.938997507095337, "logps/chosen": -249.1681671142578, "logps/rejected": -353.614990234375, "loss": 0.457, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9041764736175537, "rewards/margins": 1.1470805406570435, "rewards/rejected": -3.051257371902466, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 31.750324249267578, "learning_rate": 2.7137559305349663e-07, "logits/chosen": -1.999197244644165, "logits/rejected": -1.939035415649414, "logps/chosen": -252.26596069335938, "logps/rejected": -370.0325012207031, "loss": 0.4479, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9269672632217407, "rewards/margins": 1.2371042966842651, "rewards/rejected": -3.164071559906006, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 21.94939613342285, "learning_rate": 2.708761745460224e-07, "logits/chosen": -1.948900580406189, "logits/rejected": -1.903293251991272, "logps/chosen": -229.6333770751953, "logps/rejected": -347.2161560058594, "loss": 0.4265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.796865701675415, "rewards/margins": 1.1574687957763672, "rewards/rejected": -2.9543347358703613, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 28.7042236328125, "learning_rate": 2.7037667212877914e-07, "logits/chosen": -2.028552293777466, "logits/rejected": -1.963505506515503, "logps/chosen": -239.3440399169922, "logps/rejected": -351.4969482421875, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8368728160858154, "rewards/margins": 1.1935064792633057, "rewards/rejected": -3.030379295349121, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 24.264577865600586, "learning_rate": 2.6987708780946847e-07, "logits/chosen": -1.9841711521148682, "logits/rejected": -1.9442100524902344, "logps/chosen": -245.4112091064453, "logps/rejected": -337.2157287597656, "loss": 0.5181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8929179906845093, "rewards/margins": 0.9592446088790894, "rewards/rejected": -2.8521628379821777, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 31.94174575805664, "learning_rate": 2.6937742359612126e-07, "logits/chosen": -2.0503029823303223, "logits/rejected": -2.0222954750061035, "logps/chosen": -259.79241943359375, "logps/rejected": -363.932373046875, "loss": 0.4388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0457522869110107, "rewards/margins": 1.0346379280090332, "rewards/rejected": -3.080390453338623, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 20.523685455322266, "learning_rate": 2.6887768149708977e-07, "logits/chosen": -1.9990094900131226, "logits/rejected": -1.9520622491836548, "logps/chosen": -239.85079956054688, "logps/rejected": -358.41571044921875, "loss": 0.4177, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8550341129302979, "rewards/margins": 1.2236909866333008, "rewards/rejected": -3.0787250995635986, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.0064148902893066, "eval_logits/rejected": -1.985373854637146, "eval_logps/chosen": -252.3707275390625, "eval_logps/rejected": -293.0923156738281, "eval_loss": 0.6542167663574219, "eval_rewards/accuracies": 0.6391728520393372, "eval_rewards/chosen": -1.933552622795105, "eval_rewards/margins": 0.36987459659576416, "eval_rewards/rejected": -2.303427219390869, "eval_runtime": 361.7084, "eval_samples_per_second": 11.899, "eval_steps_per_second": 1.487, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 38.729225158691406, "learning_rate": 2.6837786352103914e-07, "logits/chosen": -1.9263750314712524, "logits/rejected": -1.865478754043579, "logps/chosen": -261.43231201171875, "logps/rejected": -367.8107604980469, "loss": 0.4911, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.041464328765869, "rewards/margins": 1.1291074752807617, "rewards/rejected": -3.170571804046631, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 53.039894104003906, "learning_rate": 2.678779716769394e-07, "logits/chosen": -1.959080696105957, "logits/rejected": -1.9117927551269531, "logps/chosen": -242.5574188232422, "logps/rejected": -357.82598876953125, "loss": 0.4623, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.913165807723999, "rewards/margins": 1.1734858751296997, "rewards/rejected": -3.086651563644409, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 39.7568473815918, "learning_rate": 2.6737800797405763e-07, "logits/chosen": -1.8976761102676392, "logits/rejected": -1.844270944595337, "logps/chosen": -248.05477905273438, "logps/rejected": -364.051513671875, "loss": 0.4569, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.97853684425354, "rewards/margins": 1.1362416744232178, "rewards/rejected": -3.114778518676758, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 46.86324691772461, "learning_rate": 2.668779744219497e-07, "logits/chosen": -1.9212672710418701, "logits/rejected": -1.8795932531356812, "logps/chosen": -265.1170349121094, "logps/rejected": -379.7967224121094, "loss": 0.4656, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.076167583465576, "rewards/margins": 1.1845210790634155, "rewards/rejected": -3.2606887817382812, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 39.65110778808594, "learning_rate": 2.6637787303045215e-07, "logits/chosen": -1.8899033069610596, "logits/rejected": -1.8346529006958008, "logps/chosen": -249.73031616210938, "logps/rejected": -358.6544494628906, "loss": 0.5143, "rewards/accuracies": 0.75, "rewards/chosen": -1.9341949224472046, "rewards/margins": 1.1101762056350708, "rewards/rejected": -3.0443711280822754, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 24.493730545043945, "learning_rate": 2.658777058096744e-07, "logits/chosen": -1.925278663635254, "logits/rejected": -1.88437819480896, "logps/chosen": -237.0755615234375, "logps/rejected": -341.0747375488281, "loss": 0.4757, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8852241039276123, "rewards/margins": 1.046508550643921, "rewards/rejected": -2.931732654571533, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 20.740720748901367, "learning_rate": 2.6537747476999017e-07, "logits/chosen": -1.9574615955352783, "logits/rejected": -1.902316689491272, "logps/chosen": -249.68569946289062, "logps/rejected": -358.1416931152344, "loss": 0.448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.934759497642517, "rewards/margins": 1.1353925466537476, "rewards/rejected": -3.0701520442962646, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 38.02179718017578, "learning_rate": 2.6487718192203e-07, "logits/chosen": -1.9826538562774658, "logits/rejected": -1.9402740001678467, "logps/chosen": -231.21115112304688, "logps/rejected": -328.49127197265625, "loss": 0.4899, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7622559070587158, "rewards/margins": 0.9935979843139648, "rewards/rejected": -2.7558541297912598, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 34.11743927001953, "learning_rate": 2.6437682927667265e-07, "logits/chosen": -1.9324042797088623, "logits/rejected": -1.8920612335205078, "logps/chosen": -231.2100067138672, "logps/rejected": -311.09222412109375, "loss": 0.5335, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7578970193862915, "rewards/margins": 0.8527604937553406, "rewards/rejected": -2.610657215118408, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 46.93685531616211, "learning_rate": 2.638764188450373e-07, "logits/chosen": -1.9869029521942139, "logits/rejected": -1.9425594806671143, "logps/chosen": -228.2207794189453, "logps/rejected": -333.8774108886719, "loss": 0.4581, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7326017618179321, "rewards/margins": 1.058089256286621, "rewards/rejected": -2.7906908988952637, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 26.5582275390625, "learning_rate": 2.6337595263847533e-07, "logits/chosen": -1.9858334064483643, "logits/rejected": -1.9485981464385986, "logps/chosen": -225.65011596679688, "logps/rejected": -323.06298828125, "loss": 0.4971, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7087366580963135, "rewards/margins": 1.025743842124939, "rewards/rejected": -2.734480381011963, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 30.959178924560547, "learning_rate": 2.628754326685626e-07, "logits/chosen": -2.0470666885375977, "logits/rejected": -1.9957401752471924, "logps/chosen": -230.56689453125, "logps/rejected": -340.4515686035156, "loss": 0.4598, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7834981679916382, "rewards/margins": 1.0898150205612183, "rewards/rejected": -2.8733127117156982, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 32.93854904174805, "learning_rate": 2.623748609470905e-07, "logits/chosen": -2.007800579071045, "logits/rejected": -1.9562647342681885, "logps/chosen": -240.7649383544922, "logps/rejected": -336.1905517578125, "loss": 0.5091, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8232452869415283, "rewards/margins": 0.9949762225151062, "rewards/rejected": -2.818221092224121, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 17.481374740600586, "learning_rate": 2.618742394860589e-07, "logits/chosen": -1.9526252746582031, "logits/rejected": -1.9059522151947021, "logps/chosen": -247.42489624023438, "logps/rejected": -364.13885498046875, "loss": 0.4466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9511878490447998, "rewards/margins": 1.1861870288848877, "rewards/rejected": -3.1373753547668457, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 53.94264602661133, "learning_rate": 2.613735702976676e-07, "logits/chosen": -1.947758436203003, "logits/rejected": -1.8854362964630127, "logps/chosen": -248.6236572265625, "logps/rejected": -366.7275390625, "loss": 0.4314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9528688192367554, "rewards/margins": 1.2066624164581299, "rewards/rejected": -3.1595311164855957, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 39.05000305175781, "learning_rate": 2.6087285539430793e-07, "logits/chosen": -1.9782739877700806, "logits/rejected": -1.9168260097503662, "logps/chosen": -284.8238830566406, "logps/rejected": -409.9019470214844, "loss": 0.4554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2637181282043457, "rewards/margins": 1.318053960800171, "rewards/rejected": -3.581772565841675, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 51.16798782348633, "learning_rate": 2.603720967885552e-07, "logits/chosen": -1.9530961513519287, "logits/rejected": -1.8813329935073853, "logps/chosen": -299.49102783203125, "logps/rejected": -429.8968200683594, "loss": 0.4491, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.444570302963257, "rewards/margins": 1.3447846174240112, "rewards/rejected": -3.7893548011779785, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 28.713010787963867, "learning_rate": 2.598712964931602e-07, "logits/chosen": -1.9431720972061157, "logits/rejected": -1.8953588008880615, "logps/chosen": -287.85791015625, "logps/rejected": -397.68023681640625, "loss": 0.5402, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.326961040496826, "rewards/margins": 1.1127294301986694, "rewards/rejected": -3.4396908283233643, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 36.38191604614258, "learning_rate": 2.5937045652104156e-07, "logits/chosen": -1.90447998046875, "logits/rejected": -1.8507049083709717, "logps/chosen": -266.5199890136719, "logps/rejected": -382.275390625, "loss": 0.4709, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.126375436782837, "rewards/margins": 1.1582132577896118, "rewards/rejected": -3.28458833694458, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 22.360328674316406, "learning_rate": 2.58869578885277e-07, "logits/chosen": -1.9762977361679077, "logits/rejected": -1.929149866104126, "logps/chosen": -232.34536743164062, "logps/rejected": -330.082275390625, "loss": 0.4996, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.766662359237671, "rewards/margins": 0.992715060710907, "rewards/rejected": -2.7593777179718018, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 43.83208465576172, "learning_rate": 2.583686655990961e-07, "logits/chosen": -1.9678404331207275, "logits/rejected": -1.9175662994384766, "logps/chosen": -238.7517852783203, "logps/rejected": -344.4751892089844, "loss": 0.4721, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8406603336334229, "rewards/margins": 1.0932737588882446, "rewards/rejected": -2.933933973312378, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 33.517845153808594, "learning_rate": 2.5786771867587125e-07, "logits/chosen": -1.991847276687622, "logits/rejected": -1.9440711736679077, "logps/chosen": -241.57470703125, "logps/rejected": -333.8283386230469, "loss": 0.5087, "rewards/accuracies": 0.75, "rewards/chosen": -1.8588556051254272, "rewards/margins": 0.9679737091064453, "rewards/rejected": -2.826829433441162, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 33.28972244262695, "learning_rate": 2.5736674012911044e-07, "logits/chosen": -1.9488227367401123, "logits/rejected": -1.8874578475952148, "logps/chosen": -247.0536346435547, "logps/rejected": -350.56610107421875, "loss": 0.5145, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9276052713394165, "rewards/margins": 1.0997895002365112, "rewards/rejected": -3.0273940563201904, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 36.2224235534668, "learning_rate": 2.5686573197244853e-07, "logits/chosen": -1.9623922109603882, "logits/rejected": -1.9277002811431885, "logps/chosen": -220.17501831054688, "logps/rejected": -305.6186828613281, "loss": 0.4967, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7027654647827148, "rewards/margins": 0.8390003442764282, "rewards/rejected": -2.5417656898498535, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 30.603832244873047, "learning_rate": 2.5636469621963934e-07, "logits/chosen": -2.057969570159912, "logits/rejected": -2.0100674629211426, "logps/chosen": -246.371826171875, "logps/rejected": -342.1852111816406, "loss": 0.4888, "rewards/accuracies": 0.75, "rewards/chosen": -1.9226688146591187, "rewards/margins": 1.002288579940796, "rewards/rejected": -2.924957275390625, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 22.263578414916992, "learning_rate": 2.5586363488454805e-07, "logits/chosen": -1.9660428762435913, "logits/rejected": -1.916745901107788, "logps/chosen": -224.8061981201172, "logps/rejected": -358.927490234375, "loss": 0.391, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7111608982086182, "rewards/margins": 1.3487448692321777, "rewards/rejected": -3.059905529022217, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 37.52483367919922, "learning_rate": 2.55362549981142e-07, "logits/chosen": -1.8326387405395508, "logits/rejected": -1.7776682376861572, "logps/chosen": -247.7879638671875, "logps/rejected": -350.54248046875, "loss": 0.4738, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9481890201568604, "rewards/margins": 1.0607709884643555, "rewards/rejected": -3.0089597702026367, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 22.981693267822266, "learning_rate": 2.548614435234838e-07, "logits/chosen": -1.9384018182754517, "logits/rejected": -1.8693573474884033, "logps/chosen": -257.7132873535156, "logps/rejected": -371.1829528808594, "loss": 0.4509, "rewards/accuracies": 0.84375, "rewards/chosen": -2.029435396194458, "rewards/margins": 1.17180597782135, "rewards/rejected": -3.2012412548065186, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 36.154457092285156, "learning_rate": 2.543603175257225e-07, "logits/chosen": -1.8804614543914795, "logits/rejected": -1.826551079750061, "logps/chosen": -273.04766845703125, "logps/rejected": -394.6677551269531, "loss": 0.4232, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.17427396774292, "rewards/margins": 1.257433295249939, "rewards/rejected": -3.4317073822021484, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 33.637611389160156, "learning_rate": 2.5385917400208555e-07, "logits/chosen": -1.907813310623169, "logits/rejected": -1.8588817119598389, "logps/chosen": -307.50592041015625, "logps/rejected": -441.1964416503906, "loss": 0.4336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.526552200317383, "rewards/margins": 1.3504663705825806, "rewards/rejected": -3.877018451690674, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 37.197479248046875, "learning_rate": 2.5335801496687115e-07, "logits/chosen": -1.8052974939346313, "logits/rejected": -1.7492759227752686, "logps/chosen": -325.2489013671875, "logps/rejected": -476.228271484375, "loss": 0.3873, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7307815551757812, "rewards/margins": 1.5022097826004028, "rewards/rejected": -4.2329912185668945, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 33.70505905151367, "learning_rate": 2.528568424344396e-07, "logits/chosen": -1.979736328125, "logits/rejected": -1.9391673803329468, "logps/chosen": -330.03533935546875, "logps/rejected": -454.0965270996094, "loss": 0.5275, "rewards/accuracies": 0.75, "rewards/chosen": -2.7677440643310547, "rewards/margins": 1.2013375759124756, "rewards/rejected": -3.969081401824951, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 37.968997955322266, "learning_rate": 2.523556584192056e-07, "logits/chosen": -1.9683992862701416, "logits/rejected": -1.905721664428711, "logps/chosen": -260.7203369140625, "logps/rejected": -371.06634521484375, "loss": 0.4526, "rewards/accuracies": 0.8125, "rewards/chosen": -2.062553882598877, "rewards/margins": 1.1545089483261108, "rewards/rejected": -3.2170627117156982, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 32.79338073730469, "learning_rate": 2.5185446493562986e-07, "logits/chosen": -1.9054248332977295, "logits/rejected": -1.8496837615966797, "logps/chosen": -247.6826171875, "logps/rejected": -366.1922607421875, "loss": 0.4289, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9268531799316406, "rewards/margins": 1.2108924388885498, "rewards/rejected": -3.1377453804016113, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 34.12459945678711, "learning_rate": 2.513532639982113e-07, "logits/chosen": -1.9738709926605225, "logits/rejected": -1.9305200576782227, "logps/chosen": -241.8445587158203, "logps/rejected": -357.56280517578125, "loss": 0.4473, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.876917839050293, "rewards/margins": 1.1424349546432495, "rewards/rejected": -3.019352674484253, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 41.7688102722168, "learning_rate": 2.508520576214786e-07, "logits/chosen": -2.0186164379119873, "logits/rejected": -1.9717800617218018, "logps/chosen": -242.5105438232422, "logps/rejected": -350.2165222167969, "loss": 0.4653, "rewards/accuracies": 0.75, "rewards/chosen": -1.8794488906860352, "rewards/margins": 1.1008975505828857, "rewards/rejected": -2.9803466796875, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 34.770729064941406, "learning_rate": 2.503508478199825e-07, "logits/chosen": -1.8823726177215576, "logits/rejected": -1.8332128524780273, "logps/chosen": -253.80517578125, "logps/rejected": -371.1283264160156, "loss": 0.4767, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.015493154525757, "rewards/margins": 1.1915894746780396, "rewards/rejected": -3.2070822715759277, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 44.03782653808594, "learning_rate": 2.498496366082875e-07, "logits/chosen": -1.9593610763549805, "logits/rejected": -1.9023668766021729, "logps/chosen": -263.1236267089844, "logps/rejected": -379.2723693847656, "loss": 0.4536, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0793633460998535, "rewards/margins": 1.1488624811172485, "rewards/rejected": -3.2282257080078125, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 37.73891830444336, "learning_rate": 2.493484260009636e-07, "logits/chosen": -2.0363147258758545, "logits/rejected": -1.980688452720642, "logps/chosen": -268.5592041015625, "logps/rejected": -378.9376220703125, "loss": 0.4732, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1117188930511475, "rewards/margins": 1.1196707487106323, "rewards/rejected": -3.2313895225524902, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 30.348796844482422, "learning_rate": 2.488472180125786e-07, "logits/chosen": -1.8977482318878174, "logits/rejected": -1.8300949335098267, "logps/chosen": -281.89349365234375, "logps/rejected": -429.6216735839844, "loss": 0.4181, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.258483409881592, "rewards/margins": 1.521997094154358, "rewards/rejected": -3.780480146408081, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -1.950149655342102, "eval_logits/rejected": -1.9264878034591675, "eval_logps/chosen": -292.5314025878906, "eval_logps/rejected": -343.3230285644531, "eval_loss": 0.662646472454071, "eval_rewards/accuracies": 0.6438196897506714, "eval_rewards/chosen": -2.3351595401763916, "eval_rewards/margins": 0.4705745279788971, "eval_rewards/rejected": -2.8057339191436768, "eval_runtime": 361.4458, "eval_samples_per_second": 11.908, "eval_steps_per_second": 1.488, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 37.09590148925781, "learning_rate": 2.483460146576895e-07, "logits/chosen": -1.9040969610214233, "logits/rejected": -1.8556592464447021, "logps/chosen": -272.9593200683594, "logps/rejected": -406.8845520019531, "loss": 0.431, "rewards/accuracies": 0.8125, "rewards/chosen": -2.148843288421631, "rewards/margins": 1.3800032138824463, "rewards/rejected": -3.5288467407226562, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 32.055198669433594, "learning_rate": 2.478448179508349e-07, "logits/chosen": -2.00480055809021, "logits/rejected": -1.9527469873428345, "logps/chosen": -256.01605224609375, "logps/rejected": -347.72222900390625, "loss": 0.5301, "rewards/accuracies": 0.71875, "rewards/chosen": -2.002298593521118, "rewards/margins": 0.9261295199394226, "rewards/rejected": -2.9284281730651855, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 44.06272888183594, "learning_rate": 2.4734362990652655e-07, "logits/chosen": -1.9586946964263916, "logits/rejected": -1.9114172458648682, "logps/chosen": -273.14166259765625, "logps/rejected": -368.00347900390625, "loss": 0.5306, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1746678352355957, "rewards/margins": 0.9780102968215942, "rewards/rejected": -3.1526780128479004, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 34.47047805786133, "learning_rate": 2.4684245253924146e-07, "logits/chosen": -1.986244797706604, "logits/rejected": -1.9359824657440186, "logps/chosen": -251.10995483398438, "logps/rejected": -352.0421142578125, "loss": 0.4759, "rewards/accuracies": 0.75, "rewards/chosen": -1.9371776580810547, "rewards/margins": 1.0870282649993896, "rewards/rejected": -3.0242061614990234, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 38.89225769042969, "learning_rate": 2.463412878634138e-07, "logits/chosen": -1.9501041173934937, "logits/rejected": -1.8936655521392822, "logps/chosen": -250.8838653564453, "logps/rejected": -344.51123046875, "loss": 0.5427, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9464542865753174, "rewards/margins": 0.9589397311210632, "rewards/rejected": -2.9053940773010254, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 25.26555633544922, "learning_rate": 2.4584013789342643e-07, "logits/chosen": -1.9443193674087524, "logits/rejected": -1.9002587795257568, "logps/chosen": -225.0978546142578, "logps/rejected": -334.0148620605469, "loss": 0.4553, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7332658767700195, "rewards/margins": 1.0759772062301636, "rewards/rejected": -2.8092429637908936, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 32.71099090576172, "learning_rate": 2.453390046436034e-07, "logits/chosen": -2.0171279907226562, "logits/rejected": -1.9572296142578125, "logps/chosen": -245.544921875, "logps/rejected": -379.5919494628906, "loss": 0.4288, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8718976974487305, "rewards/margins": 1.3972442150115967, "rewards/rejected": -3.269141674041748, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 38.62004852294922, "learning_rate": 2.448378901282015e-07, "logits/chosen": -1.9090893268585205, "logits/rejected": -1.8737207651138306, "logps/chosen": -282.3365173339844, "logps/rejected": -382.1561279296875, "loss": 0.5569, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.267176389694214, "rewards/margins": 1.0385839939117432, "rewards/rejected": -3.305760145187378, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 37.97184753417969, "learning_rate": 2.4433679636140217e-07, "logits/chosen": -1.8625141382217407, "logits/rejected": -1.8137317895889282, "logps/chosen": -279.2232971191406, "logps/rejected": -399.1757507324219, "loss": 0.4424, "rewards/accuracies": 0.8125, "rewards/chosen": -2.228250503540039, "rewards/margins": 1.2323328256607056, "rewards/rejected": -3.460583209991455, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 34.77642822265625, "learning_rate": 2.438357253573033e-07, "logits/chosen": -1.8747072219848633, "logits/rejected": -1.8190977573394775, "logps/chosen": -271.69622802734375, "logps/rejected": -382.57025146484375, "loss": 0.4836, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1838881969451904, "rewards/margins": 1.130371332168579, "rewards/rejected": -3.3142592906951904, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 43.158084869384766, "learning_rate": 2.4333467912991154e-07, "logits/chosen": -1.8235889673233032, "logits/rejected": -1.7928504943847656, "logps/chosen": -270.31365966796875, "logps/rejected": -363.97369384765625, "loss": 0.5198, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.162893533706665, "rewards/margins": 0.9671275019645691, "rewards/rejected": -3.1300206184387207, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 37.88699722290039, "learning_rate": 2.4283365969313383e-07, "logits/chosen": -1.9659092426300049, "logits/rejected": -1.9283020496368408, "logps/chosen": -271.4593811035156, "logps/rejected": -356.45001220703125, "loss": 0.5729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1644816398620605, "rewards/margins": 0.8912860155105591, "rewards/rejected": -3.05576753616333, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 28.830284118652344, "learning_rate": 2.4233266906076955e-07, "logits/chosen": -2.0077900886535645, "logits/rejected": -1.9621881246566772, "logps/chosen": -234.3231201171875, "logps/rejected": -345.99786376953125, "loss": 0.4605, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7928962707519531, "rewards/margins": 1.129953145980835, "rewards/rejected": -2.922849178314209, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 35.382720947265625, "learning_rate": 2.4183170924650215e-07, "logits/chosen": -1.9590526819229126, "logits/rejected": -1.9149301052093506, "logps/chosen": -251.9228973388672, "logps/rejected": -351.0686340332031, "loss": 0.4939, "rewards/accuracies": 0.75, "rewards/chosen": -1.963244080543518, "rewards/margins": 1.0238615274429321, "rewards/rejected": -2.987105369567871, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 33.39261245727539, "learning_rate": 2.413307822638912e-07, "logits/chosen": -1.9903481006622314, "logits/rejected": -1.9229633808135986, "logps/chosen": -256.7508850097656, "logps/rejected": -358.58599853515625, "loss": 0.4775, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0080180168151855, "rewards/margins": 1.0909160375595093, "rewards/rejected": -3.098933696746826, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 24.769556045532227, "learning_rate": 2.4082989012636434e-07, "logits/chosen": -1.9807764291763306, "logits/rejected": -1.9283912181854248, "logps/chosen": -238.79318237304688, "logps/rejected": -327.9174499511719, "loss": 0.4856, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8459182977676392, "rewards/margins": 0.9301916360855103, "rewards/rejected": -2.7761096954345703, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 32.895530700683594, "learning_rate": 2.403290348472093e-07, "logits/chosen": -1.9188416004180908, "logits/rejected": -1.8691402673721313, "logps/chosen": -237.32272338867188, "logps/rejected": -343.59808349609375, "loss": 0.4718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8218839168548584, "rewards/margins": 1.0747507810592651, "rewards/rejected": -2.896634578704834, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 28.64840316772461, "learning_rate": 2.3982821843956557e-07, "logits/chosen": -1.9465420246124268, "logits/rejected": -1.8731029033660889, "logps/chosen": -266.94635009765625, "logps/rejected": -378.4681701660156, "loss": 0.4874, "rewards/accuracies": 0.75, "rewards/chosen": -2.05368709564209, "rewards/margins": 1.2200250625610352, "rewards/rejected": -3.273712158203125, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 26.374879837036133, "learning_rate": 2.393274429164162e-07, "logits/chosen": -1.9631248712539673, "logits/rejected": -1.928788423538208, "logps/chosen": -279.85980224609375, "logps/rejected": -394.64727783203125, "loss": 0.4983, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.254769802093506, "rewards/margins": 1.1263645887374878, "rewards/rejected": -3.381134510040283, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 48.05499267578125, "learning_rate": 2.388267102905803e-07, "logits/chosen": -1.9941221475601196, "logits/rejected": -1.9271152019500732, "logps/chosen": -276.64971923828125, "logps/rejected": -413.61920166015625, "loss": 0.4044, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.180598497390747, "rewards/margins": 1.423845648765564, "rewards/rejected": -3.6044440269470215, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 29.706451416015625, "learning_rate": 2.383260225747041e-07, "logits/chosen": -1.95602548122406, "logits/rejected": -1.8980586528778076, "logps/chosen": -288.73760986328125, "logps/rejected": -394.91021728515625, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.319633960723877, "rewards/margins": 1.1051318645477295, "rewards/rejected": -3.4247658252716064, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 44.91404724121094, "learning_rate": 2.3782538178125375e-07, "logits/chosen": -1.952826738357544, "logits/rejected": -1.9020576477050781, "logps/chosen": -277.63079833984375, "logps/rejected": -389.0127868652344, "loss": 0.4673, "rewards/accuracies": 0.78125, "rewards/chosen": -2.225379228591919, "rewards/margins": 1.133194923400879, "rewards/rejected": -3.3585739135742188, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 33.253822326660156, "learning_rate": 2.3732478992250662e-07, "logits/chosen": -2.0086519718170166, "logits/rejected": -1.967625379562378, "logps/chosen": -261.67620849609375, "logps/rejected": -357.3586120605469, "loss": 0.5205, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0233311653137207, "rewards/margins": 0.9871104955673218, "rewards/rejected": -3.010441780090332, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 36.89247512817383, "learning_rate": 2.3682424901054326e-07, "logits/chosen": -1.9769260883331299, "logits/rejected": -1.9204738140106201, "logps/chosen": -256.5655517578125, "logps/rejected": -374.6059265136719, "loss": 0.4638, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0030035972595215, "rewards/margins": 1.2141447067260742, "rewards/rejected": -3.2171483039855957, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 32.1376838684082, "learning_rate": 2.3632376105723955e-07, "logits/chosen": -1.9975944757461548, "logits/rejected": -1.967670202255249, "logps/chosen": -249.30068969726562, "logps/rejected": -337.46148681640625, "loss": 0.5011, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9292528629302979, "rewards/margins": 0.9340192675590515, "rewards/rejected": -2.863272190093994, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 40.45782470703125, "learning_rate": 2.3582332807425867e-07, "logits/chosen": -2.033379077911377, "logits/rejected": -1.9965550899505615, "logps/chosen": -260.59124755859375, "logps/rejected": -341.44268798828125, "loss": 0.5483, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0335183143615723, "rewards/margins": 0.8478059768676758, "rewards/rejected": -2.881324291229248, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 26.661699295043945, "learning_rate": 2.3532295207304268e-07, "logits/chosen": -2.0201058387756348, "logits/rejected": -1.9670464992523193, "logps/chosen": -223.9939727783203, "logps/rejected": -316.6408996582031, "loss": 0.4843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7034847736358643, "rewards/margins": 0.9665762186050415, "rewards/rejected": -2.670060873031616, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 36.77285385131836, "learning_rate": 2.3482263506480452e-07, "logits/chosen": -1.9723132848739624, "logits/rejected": -1.9341713190078735, "logps/chosen": -225.6239776611328, "logps/rejected": -338.54949951171875, "loss": 0.4418, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7276761531829834, "rewards/margins": 1.1118366718292236, "rewards/rejected": -2.8395133018493652, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 30.037927627563477, "learning_rate": 2.3432237906052018e-07, "logits/chosen": -1.955492377281189, "logits/rejected": -1.918945074081421, "logps/chosen": -253.19717407226562, "logps/rejected": -351.8016357421875, "loss": 0.4848, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9539806842803955, "rewards/margins": 1.0355241298675537, "rewards/rejected": -2.98950457572937, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 31.166872024536133, "learning_rate": 2.3382218607092038e-07, "logits/chosen": -1.9768812656402588, "logits/rejected": -1.9233033657073975, "logps/chosen": -244.2887420654297, "logps/rejected": -368.73980712890625, "loss": 0.42, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9127399921417236, "rewards/margins": 1.2859939336776733, "rewards/rejected": -3.1987338066101074, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 27.7515811920166, "learning_rate": 2.3332205810648266e-07, "logits/chosen": -2.0263919830322266, "logits/rejected": -1.9568601846694946, "logps/chosen": -254.876708984375, "logps/rejected": -357.9356689453125, "loss": 0.5013, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9671180248260498, "rewards/margins": 1.1164093017578125, "rewards/rejected": -3.0835278034210205, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 35.373233795166016, "learning_rate": 2.3282199717742308e-07, "logits/chosen": -2.0692200660705566, "logits/rejected": -2.0151288509368896, "logps/chosen": -232.6851043701172, "logps/rejected": -336.7629089355469, "loss": 0.4479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7862794399261475, "rewards/margins": 1.0915358066558838, "rewards/rejected": -2.8778152465820312, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 32.82352066040039, "learning_rate": 2.3232200529368824e-07, "logits/chosen": -2.0081865787506104, "logits/rejected": -1.976311445236206, "logps/chosen": -229.7543182373047, "logps/rejected": -341.74761962890625, "loss": 0.4736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7641820907592773, "rewards/margins": 1.1083869934082031, "rewards/rejected": -2.8725690841674805, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 29.07355499267578, "learning_rate": 2.3182208446494727e-07, "logits/chosen": -2.0387377738952637, "logits/rejected": -2.0017287731170654, "logps/chosen": -264.155029296875, "logps/rejected": -359.59515380859375, "loss": 0.5155, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.08296537399292, "rewards/margins": 0.9136285781860352, "rewards/rejected": -2.996593952178955, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 53.28978729248047, "learning_rate": 2.313222367005837e-07, "logits/chosen": -1.9962422847747803, "logits/rejected": -1.929328203201294, "logps/chosen": -272.79522705078125, "logps/rejected": -396.5162658691406, "loss": 0.4635, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1801466941833496, "rewards/margins": 1.2850888967514038, "rewards/rejected": -3.4652352333068848, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 59.67111587524414, "learning_rate": 2.3082246400968758e-07, "logits/chosen": -2.020737648010254, "logits/rejected": -1.9763295650482178, "logps/chosen": -280.27874755859375, "logps/rejected": -382.78692626953125, "loss": 0.5219, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2608189582824707, "rewards/margins": 1.027080774307251, "rewards/rejected": -3.2878994941711426, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 32.914859771728516, "learning_rate": 2.303227684010467e-07, "logits/chosen": -2.0620505809783936, "logits/rejected": -2.0254998207092285, "logps/chosen": -248.0398406982422, "logps/rejected": -344.94549560546875, "loss": 0.5177, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9560155868530273, "rewards/margins": 0.9850342869758606, "rewards/rejected": -2.941049814224243, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 24.60742950439453, "learning_rate": 2.2982315188313952e-07, "logits/chosen": -1.9630451202392578, "logits/rejected": -1.9169738292694092, "logps/chosen": -256.19281005859375, "logps/rejected": -370.25457763671875, "loss": 0.459, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0312106609344482, "rewards/margins": 1.153822898864746, "rewards/rejected": -3.1850333213806152, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 42.18019485473633, "learning_rate": 2.2932361646412644e-07, "logits/chosen": -2.0445556640625, "logits/rejected": -1.9971535205841064, "logps/chosen": -242.07455444335938, "logps/rejected": -329.0810241699219, "loss": 0.515, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8719704151153564, "rewards/margins": 0.9333742260932922, "rewards/rejected": -2.8053441047668457, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 27.816390991210938, "learning_rate": 2.2882416415184174e-07, "logits/chosen": -2.0468382835388184, "logits/rejected": -2.0068085193634033, "logps/chosen": -228.55508422851562, "logps/rejected": -324.1514587402344, "loss": 0.4469, "rewards/accuracies": 0.75, "rewards/chosen": -1.7164595127105713, "rewards/margins": 0.968951404094696, "rewards/rejected": -2.685410976409912, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.0591249465942383, "eval_logits/rejected": -2.0388007164001465, "eval_logps/chosen": -239.3806915283203, "eval_logps/rejected": -280.0088806152344, "eval_loss": 0.6436116099357605, "eval_rewards/accuracies": 0.643122673034668, "eval_rewards/chosen": -1.8036521673202515, "eval_rewards/margins": 0.3689405620098114, "eval_rewards/rejected": -2.1725926399230957, "eval_runtime": 361.5773, "eval_samples_per_second": 11.903, "eval_steps_per_second": 1.488, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 19.452911376953125, "learning_rate": 2.283247969537861e-07, "logits/chosen": -2.0305991172790527, "logits/rejected": -1.9809935092926025, "logps/chosen": -247.6123504638672, "logps/rejected": -362.4576110839844, "loss": 0.444, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9457921981811523, "rewards/margins": 1.154893398284912, "rewards/rejected": -3.1006855964660645, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 36.74274444580078, "learning_rate": 2.2782551687711734e-07, "logits/chosen": -1.9446094036102295, "logits/rejected": -1.8924716711044312, "logps/chosen": -276.7430725097656, "logps/rejected": -385.075927734375, "loss": 0.4759, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2179694175720215, "rewards/margins": 1.1262280941009521, "rewards/rejected": -3.3441975116729736, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 52.06788635253906, "learning_rate": 2.273263259286439e-07, "logits/chosen": -1.9232804775238037, "logits/rejected": -1.8624763488769531, "logps/chosen": -286.44024658203125, "logps/rejected": -400.1329650878906, "loss": 0.4378, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.281569004058838, "rewards/margins": 1.2058217525482178, "rewards/rejected": -3.4873909950256348, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 30.292909622192383, "learning_rate": 2.2682722611481547e-07, "logits/chosen": -2.0581536293029785, "logits/rejected": -2.005876064300537, "logps/chosen": -286.12353515625, "logps/rejected": -408.5050354003906, "loss": 0.4479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2985472679138184, "rewards/margins": 1.2486097812652588, "rewards/rejected": -3.5471572875976562, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 41.2258186340332, "learning_rate": 2.2632821944171573e-07, "logits/chosen": -1.9926458597183228, "logits/rejected": -1.9601705074310303, "logps/chosen": -271.47698974609375, "logps/rejected": -371.081298828125, "loss": 0.5036, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1656384468078613, "rewards/margins": 0.9935787916183472, "rewards/rejected": -3.159217357635498, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 46.32283020019531, "learning_rate": 2.258293079150537e-07, "logits/chosen": -1.9818079471588135, "logits/rejected": -1.9385312795639038, "logps/chosen": -269.8608703613281, "logps/rejected": -383.0013122558594, "loss": 0.4828, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2018613815307617, "rewards/margins": 1.1060611009597778, "rewards/rejected": -3.30792236328125, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 32.64421844482422, "learning_rate": 2.253304935401561e-07, "logits/chosen": -1.9373579025268555, "logits/rejected": -1.8965246677398682, "logps/chosen": -253.9124298095703, "logps/rejected": -378.1776123046875, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0052552223205566, "rewards/margins": 1.25211501121521, "rewards/rejected": -3.257370710372925, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 47.720890045166016, "learning_rate": 2.248317783219593e-07, "logits/chosen": -1.983437180519104, "logits/rejected": -1.9297415018081665, "logps/chosen": -269.78955078125, "logps/rejected": -385.2200622558594, "loss": 0.4729, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.121124029159546, "rewards/margins": 1.1850605010986328, "rewards/rejected": -3.306184768676758, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 35.48196029663086, "learning_rate": 2.2433316426500097e-07, "logits/chosen": -2.0450704097747803, "logits/rejected": -1.9830868244171143, "logps/chosen": -257.6543273925781, "logps/rejected": -396.65533447265625, "loss": 0.4334, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.019641160964966, "rewards/margins": 1.3764359951019287, "rewards/rejected": -3.3960769176483154, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 35.64340591430664, "learning_rate": 2.2383465337341223e-07, "logits/chosen": -1.979543924331665, "logits/rejected": -1.9323008060455322, "logps/chosen": -280.89886474609375, "logps/rejected": -394.08966064453125, "loss": 0.4807, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.2441608905792236, "rewards/margins": 1.1615378856658936, "rewards/rejected": -3.405698776245117, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 71.6368179321289, "learning_rate": 2.2333624765090933e-07, "logits/chosen": -1.9893434047698975, "logits/rejected": -1.9467194080352783, "logps/chosen": -303.05975341796875, "logps/rejected": -436.0597229003906, "loss": 0.4597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5236096382141113, "rewards/margins": 1.3001294136047363, "rewards/rejected": -3.8237388134002686, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 31.558183670043945, "learning_rate": 2.2283794910078617e-07, "logits/chosen": -1.9682697057724, "logits/rejected": -1.918298363685608, "logps/chosen": -309.4798889160156, "logps/rejected": -426.55755615234375, "loss": 0.4678, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5500471591949463, "rewards/margins": 1.2252274751663208, "rewards/rejected": -3.7752742767333984, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 41.0295524597168, "learning_rate": 2.2233975972590564e-07, "logits/chosen": -1.9597307443618774, "logits/rejected": -1.9024890661239624, "logps/chosen": -310.1100158691406, "logps/rejected": -446.66949462890625, "loss": 0.4586, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5539612770080566, "rewards/margins": 1.3794822692871094, "rewards/rejected": -3.933443069458008, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 41.24382781982422, "learning_rate": 2.2184168152869183e-07, "logits/chosen": -1.9590829610824585, "logits/rejected": -1.8940374851226807, "logps/chosen": -316.6585693359375, "logps/rejected": -434.50445556640625, "loss": 0.488, "rewards/accuracies": 0.75, "rewards/chosen": -2.5500733852386475, "rewards/margins": 1.2591197490692139, "rewards/rejected": -3.8091931343078613, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 42.21787643432617, "learning_rate": 2.2134371651112202e-07, "logits/chosen": -1.955320954322815, "logits/rejected": -1.9047415256500244, "logps/chosen": -271.3085021972656, "logps/rejected": -388.6605529785156, "loss": 0.4689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.15678071975708, "rewards/margins": 1.1781138181686401, "rewards/rejected": -3.3348946571350098, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 28.049915313720703, "learning_rate": 2.2084586667471845e-07, "logits/chosen": -1.9859501123428345, "logits/rejected": -1.9206873178482056, "logps/chosen": -239.6188201904297, "logps/rejected": -382.41094970703125, "loss": 0.3858, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8617891073226929, "rewards/margins": 1.4471232891082764, "rewards/rejected": -3.3089122772216797, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 43.666717529296875, "learning_rate": 2.2034813402054062e-07, "logits/chosen": -1.9575588703155518, "logits/rejected": -1.9117521047592163, "logps/chosen": -268.25592041015625, "logps/rejected": -389.5258483886719, "loss": 0.478, "rewards/accuracies": 0.75, "rewards/chosen": -2.13291072845459, "rewards/margins": 1.2282688617706299, "rewards/rejected": -3.3611793518066406, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 23.70035171508789, "learning_rate": 2.198505205491769e-07, "logits/chosen": -1.9278274774551392, "logits/rejected": -1.8680626153945923, "logps/chosen": -271.293212890625, "logps/rejected": -412.00408935546875, "loss": 0.4261, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.182626724243164, "rewards/margins": 1.4099500179290771, "rewards/rejected": -3.592576503753662, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 60.148834228515625, "learning_rate": 2.1935302826073658e-07, "logits/chosen": -1.9124057292938232, "logits/rejected": -1.859548568725586, "logps/chosen": -272.058837890625, "logps/rejected": -396.0384826660156, "loss": 0.458, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.196594715118408, "rewards/margins": 1.2545301914215088, "rewards/rejected": -3.451124906539917, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 44.6929817199707, "learning_rate": 2.188556591548419e-07, "logits/chosen": -1.9498252868652344, "logits/rejected": -1.9000533819198608, "logps/chosen": -293.17327880859375, "logps/rejected": -410.2666931152344, "loss": 0.4582, "rewards/accuracies": 0.75, "rewards/chosen": -2.3982319831848145, "rewards/margins": 1.2046576738357544, "rewards/rejected": -3.6028892993927, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 61.035335540771484, "learning_rate": 2.1835841523061997e-07, "logits/chosen": -1.9273840188980103, "logits/rejected": -1.871167778968811, "logps/chosen": -319.53955078125, "logps/rejected": -485.272705078125, "loss": 0.3905, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.638547420501709, "rewards/margins": 1.657278299331665, "rewards/rejected": -4.295825958251953, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 49.66360855102539, "learning_rate": 2.1786129848669482e-07, "logits/chosen": -1.8796741962432861, "logits/rejected": -1.832911491394043, "logps/chosen": -321.859130859375, "logps/rejected": -448.517578125, "loss": 0.4749, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.723935604095459, "rewards/margins": 1.2590359449386597, "rewards/rejected": -3.9829719066619873, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 37.08704376220703, "learning_rate": 2.1736431092117925e-07, "logits/chosen": -1.9123092889785767, "logits/rejected": -1.858594536781311, "logps/chosen": -294.6947021484375, "logps/rejected": -422.02447509765625, "loss": 0.4856, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.4026026725769043, "rewards/margins": 1.3061363697052002, "rewards/rejected": -3.7087388038635254, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 25.288190841674805, "learning_rate": 2.1686745453166674e-07, "logits/chosen": -1.9344371557235718, "logits/rejected": -1.8671379089355469, "logps/chosen": -273.5270080566406, "logps/rejected": -432.91217041015625, "loss": 0.3671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2144367694854736, "rewards/margins": 1.5974085330963135, "rewards/rejected": -3.811845302581787, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 42.096458435058594, "learning_rate": 2.1637073131522366e-07, "logits/chosen": -1.9658024311065674, "logits/rejected": -1.9028819799423218, "logps/chosen": -284.531982421875, "logps/rejected": -403.46044921875, "loss": 0.3907, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.268610715866089, "rewards/margins": 1.2759894132614136, "rewards/rejected": -3.5445995330810547, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 44.73157501220703, "learning_rate": 2.1587414326838094e-07, "logits/chosen": -1.9406728744506836, "logits/rejected": -1.9113953113555908, "logps/chosen": -286.78033447265625, "logps/rejected": -404.0321960449219, "loss": 0.4764, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.326537609100342, "rewards/margins": 1.1510480642318726, "rewards/rejected": -3.477585554122925, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 32.976470947265625, "learning_rate": 2.1537769238712646e-07, "logits/chosen": -1.9190086126327515, "logits/rejected": -1.8704010248184204, "logps/chosen": -283.75469970703125, "logps/rejected": -407.4646911621094, "loss": 0.4411, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2729952335357666, "rewards/margins": 1.249434471130371, "rewards/rejected": -3.5224297046661377, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 49.289573669433594, "learning_rate": 2.1488138066689668e-07, "logits/chosen": -1.933314323425293, "logits/rejected": -1.8916332721710205, "logps/chosen": -289.12542724609375, "logps/rejected": -387.3733215332031, "loss": 0.5337, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.340834856033325, "rewards/margins": 1.0099698305130005, "rewards/rejected": -3.350804567337036, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 41.76154327392578, "learning_rate": 2.1438521010256848e-07, "logits/chosen": -1.9279146194458008, "logits/rejected": -1.8669952154159546, "logps/chosen": -267.79156494140625, "logps/rejected": -405.36895751953125, "loss": 0.4718, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1564979553222656, "rewards/margins": 1.3485300540924072, "rewards/rejected": -3.5050277709960938, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 50.36730194091797, "learning_rate": 2.138891826884517e-07, "logits/chosen": -1.9704631567001343, "logits/rejected": -1.9255040884017944, "logps/chosen": -300.48077392578125, "logps/rejected": -408.8417663574219, "loss": 0.5186, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.442343235015869, "rewards/margins": 1.106431245803833, "rewards/rejected": -3.5487747192382812, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 29.09944725036621, "learning_rate": 2.1339330041828053e-07, "logits/chosen": -1.9092556238174438, "logits/rejected": -1.8683710098266602, "logps/chosen": -295.45306396484375, "logps/rejected": -411.4949645996094, "loss": 0.4544, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.377427577972412, "rewards/margins": 1.236376166343689, "rewards/rejected": -3.6138038635253906, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 43.08719253540039, "learning_rate": 2.1289756528520615e-07, "logits/chosen": -1.9652965068817139, "logits/rejected": -1.910241723060608, "logps/chosen": -270.1449890136719, "logps/rejected": -379.33782958984375, "loss": 0.4767, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.120330333709717, "rewards/margins": 1.14203679561615, "rewards/rejected": -3.2623672485351562, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 46.50043487548828, "learning_rate": 2.124019792817878e-07, "logits/chosen": -1.965386152267456, "logits/rejected": -1.9089456796646118, "logps/chosen": -251.63650512695312, "logps/rejected": -380.83135986328125, "loss": 0.3978, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.985447883605957, "rewards/margins": 1.2902238368988037, "rewards/rejected": -3.2756717205047607, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 30.213546752929688, "learning_rate": 2.119065443999858e-07, "logits/chosen": -1.9763885736465454, "logits/rejected": -1.918243408203125, "logps/chosen": -278.84918212890625, "logps/rejected": -408.4505310058594, "loss": 0.4381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2185330390930176, "rewards/margins": 1.3290278911590576, "rewards/rejected": -3.5475611686706543, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 54.937530517578125, "learning_rate": 2.1141126263115274e-07, "logits/chosen": -1.9652454853057861, "logits/rejected": -1.9155279397964478, "logps/chosen": -309.233154296875, "logps/rejected": -420.7496643066406, "loss": 0.4867, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5385148525238037, "rewards/margins": 1.1698596477508545, "rewards/rejected": -3.7083740234375, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 42.93281173706055, "learning_rate": 2.1091613596602596e-07, "logits/chosen": -1.9615905284881592, "logits/rejected": -1.9177404642105103, "logps/chosen": -281.1597595214844, "logps/rejected": -367.57080078125, "loss": 0.5949, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.241529703140259, "rewards/margins": 0.9181375503540039, "rewards/rejected": -3.1596672534942627, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 48.12542724609375, "learning_rate": 2.1042116639471936e-07, "logits/chosen": -1.8978999853134155, "logits/rejected": -1.8513730764389038, "logps/chosen": -231.7674102783203, "logps/rejected": -331.33251953125, "loss": 0.4725, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7824760675430298, "rewards/margins": 1.0123255252838135, "rewards/rejected": -2.794801712036133, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 43.14175796508789, "learning_rate": 2.0992635590671527e-07, "logits/chosen": -2.0063693523406982, "logits/rejected": -1.9761041402816772, "logps/chosen": -242.87429809570312, "logps/rejected": -301.395751953125, "loss": 0.6404, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.900983452796936, "rewards/margins": 0.5931377410888672, "rewards/rejected": -2.4941213130950928, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 19.803905487060547, "learning_rate": 2.0943170649085677e-07, "logits/chosen": -2.035249710083008, "logits/rejected": -1.9801585674285889, "logps/chosen": -219.2891082763672, "logps/rejected": -319.1458435058594, "loss": 0.4758, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6221036911010742, "rewards/margins": 1.037827730178833, "rewards/rejected": -2.6599314212799072, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 28.226577758789062, "learning_rate": 2.089372201353394e-07, "logits/chosen": -2.076899766921997, "logits/rejected": -2.0340158939361572, "logps/chosen": -231.8303680419922, "logps/rejected": -348.06005859375, "loss": 0.4365, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7937465906143188, "rewards/margins": 1.1452327966690063, "rewards/rejected": -2.938979148864746, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.063697099685669, "eval_logits/rejected": -2.044318675994873, "eval_logps/chosen": -235.93026733398438, "eval_logps/rejected": -275.3836669921875, "eval_loss": 0.644584059715271, "eval_rewards/accuracies": 0.6466078162193298, "eval_rewards/chosen": -1.7691479921340942, "eval_rewards/margins": 0.35719242691993713, "eval_rewards/rejected": -2.126340389251709, "eval_runtime": 361.7978, "eval_samples_per_second": 11.896, "eval_steps_per_second": 1.487, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 40.870487213134766, "learning_rate": 2.0844289882770332e-07, "logits/chosen": -1.983575463294983, "logits/rejected": -1.934737205505371, "logps/chosen": -248.5260772705078, "logps/rejected": -333.0788269042969, "loss": 0.5257, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9107271432876587, "rewards/margins": 0.8957985043525696, "rewards/rejected": -2.806525707244873, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 54.21998977661133, "learning_rate": 2.0794874455482554e-07, "logits/chosen": -2.0022151470184326, "logits/rejected": -1.9513057470321655, "logps/chosen": -250.685302734375, "logps/rejected": -356.3897705078125, "loss": 0.4587, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9589130878448486, "rewards/margins": 1.0693190097808838, "rewards/rejected": -3.0282320976257324, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 28.664793014526367, "learning_rate": 2.0745475930291124e-07, "logits/chosen": -2.042982578277588, "logits/rejected": -1.992074728012085, "logps/chosen": -241.39132690429688, "logps/rejected": -343.1851501464844, "loss": 0.4982, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8567968606948853, "rewards/margins": 1.0504250526428223, "rewards/rejected": -2.907221794128418, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 35.063785552978516, "learning_rate": 2.0696094505748655e-07, "logits/chosen": -2.0899689197540283, "logits/rejected": -2.0541319847106934, "logps/chosen": -251.8154296875, "logps/rejected": -345.1575622558594, "loss": 0.5, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9246141910552979, "rewards/margins": 0.963638424873352, "rewards/rejected": -2.8882522583007812, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 29.81060028076172, "learning_rate": 2.064673038033901e-07, "logits/chosen": -2.046325206756592, "logits/rejected": -2.010258436203003, "logps/chosen": -248.33450317382812, "logps/rejected": -353.0802001953125, "loss": 0.468, "rewards/accuracies": 0.8125, "rewards/chosen": -1.937429666519165, "rewards/margins": 1.0303479433059692, "rewards/rejected": -2.9677774906158447, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 25.357572555541992, "learning_rate": 2.059738375247656e-07, "logits/chosen": -2.037874460220337, "logits/rejected": -1.9796565771102905, "logps/chosen": -236.61599731445312, "logps/rejected": -335.01434326171875, "loss": 0.4459, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8103773593902588, "rewards/margins": 1.0125434398651123, "rewards/rejected": -2.82292103767395, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 50.976783752441406, "learning_rate": 2.054805482050527e-07, "logits/chosen": -1.9457530975341797, "logits/rejected": -1.9000259637832642, "logps/chosen": -250.84228515625, "logps/rejected": -346.46160888671875, "loss": 0.5289, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.982693076133728, "rewards/margins": 0.9838865995407104, "rewards/rejected": -2.9665796756744385, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 34.938720703125, "learning_rate": 2.0498743782698048e-07, "logits/chosen": -2.0403237342834473, "logits/rejected": -1.9832664728164673, "logps/chosen": -241.64599609375, "logps/rejected": -360.1966247558594, "loss": 0.4663, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8425973653793335, "rewards/margins": 1.244246244430542, "rewards/rejected": -3.086843729019165, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 27.717023849487305, "learning_rate": 2.0449450837255846e-07, "logits/chosen": -2.0231616497039795, "logits/rejected": -1.9732494354248047, "logps/chosen": -229.64053344726562, "logps/rejected": -344.3799133300781, "loss": 0.4386, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7433134317398071, "rewards/margins": 1.1796467304229736, "rewards/rejected": -2.9229602813720703, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 30.53769302368164, "learning_rate": 2.04001761823069e-07, "logits/chosen": -2.0048375129699707, "logits/rejected": -1.9412002563476562, "logps/chosen": -250.18557739257812, "logps/rejected": -374.77783203125, "loss": 0.4483, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9639889001846313, "rewards/margins": 1.242708444595337, "rewards/rejected": -3.206697463989258, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 38.20130157470703, "learning_rate": 2.0350920015905946e-07, "logits/chosen": -2.0103769302368164, "logits/rejected": -1.94830322265625, "logps/chosen": -259.51824951171875, "logps/rejected": -377.5314636230469, "loss": 0.4625, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0336837768554688, "rewards/margins": 1.2202539443969727, "rewards/rejected": -3.2539379596710205, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 46.98945236206055, "learning_rate": 2.0301682536033366e-07, "logits/chosen": -2.0162129402160645, "logits/rejected": -1.9667482376098633, "logps/chosen": -263.54449462890625, "logps/rejected": -404.9740905761719, "loss": 0.4242, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1151363849639893, "rewards/margins": 1.399470329284668, "rewards/rejected": -3.514606475830078, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 36.54393005371094, "learning_rate": 2.0252463940594476e-07, "logits/chosen": -1.936745047569275, "logits/rejected": -1.8924674987792969, "logps/chosen": -290.01336669921875, "logps/rejected": -392.6010437011719, "loss": 0.4883, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.351137161254883, "rewards/margins": 1.0595757961273193, "rewards/rejected": -3.4107131958007812, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 45.22914123535156, "learning_rate": 2.0203264427418666e-07, "logits/chosen": -1.864105224609375, "logits/rejected": -1.813640832901001, "logps/chosen": -281.66448974609375, "logps/rejected": -401.921630859375, "loss": 0.4555, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.2849280834198, "rewards/margins": 1.2517248392105103, "rewards/rejected": -3.5366528034210205, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 54.167076110839844, "learning_rate": 2.0154084194258641e-07, "logits/chosen": -2.038327217102051, "logits/rejected": -1.9919458627700806, "logps/chosen": -302.88470458984375, "logps/rejected": -427.89501953125, "loss": 0.4539, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.4884445667266846, "rewards/margins": 1.2426961660385132, "rewards/rejected": -3.731140613555908, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 39.44807434082031, "learning_rate": 2.010492343878959e-07, "logits/chosen": -1.9321849346160889, "logits/rejected": -1.8639672994613647, "logps/chosen": -290.53521728515625, "logps/rejected": -438.87158203125, "loss": 0.4223, "rewards/accuracies": 0.8125, "rewards/chosen": -2.338209867477417, "rewards/margins": 1.5184687376022339, "rewards/rejected": -3.8566787242889404, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 50.06810760498047, "learning_rate": 2.0055782358608423e-07, "logits/chosen": -1.9538475275039673, "logits/rejected": -1.9071626663208008, "logps/chosen": -310.2945861816406, "logps/rejected": -440.3954162597656, "loss": 0.4679, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.53814435005188, "rewards/margins": 1.3127472400665283, "rewards/rejected": -3.8508918285369873, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 42.652626037597656, "learning_rate": 2.0006661151232984e-07, "logits/chosen": -1.9664814472198486, "logits/rejected": -1.905191421508789, "logps/chosen": -282.2268981933594, "logps/rejected": -413.0838928222656, "loss": 0.4601, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.285202741622925, "rewards/margins": 1.323268175125122, "rewards/rejected": -3.6084704399108887, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 26.795143127441406, "learning_rate": 1.9957560014101218e-07, "logits/chosen": -1.977410078048706, "logits/rejected": -1.9217808246612549, "logps/chosen": -266.11456298828125, "logps/rejected": -375.61370849609375, "loss": 0.4521, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.09472393989563, "rewards/margins": 1.1730115413665771, "rewards/rejected": -3.267735242843628, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 29.609148025512695, "learning_rate": 1.9908479144570418e-07, "logits/chosen": -2.0211422443389893, "logits/rejected": -1.9857227802276611, "logps/chosen": -253.1974639892578, "logps/rejected": -366.4717102050781, "loss": 0.4834, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9984159469604492, "rewards/margins": 1.1275442838668823, "rewards/rejected": -3.125960111618042, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 38.983055114746094, "learning_rate": 1.9859418739916388e-07, "logits/chosen": -1.9919939041137695, "logits/rejected": -1.953507423400879, "logps/chosen": -273.6680603027344, "logps/rejected": -377.12322998046875, "loss": 0.5068, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2157740592956543, "rewards/margins": 1.0374714136123657, "rewards/rejected": -3.2532451152801514, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 41.77549362182617, "learning_rate": 1.9810378997332692e-07, "logits/chosen": -1.909139633178711, "logits/rejected": -1.8482526540756226, "logps/chosen": -264.4571838378906, "logps/rejected": -384.8020935058594, "loss": 0.4278, "rewards/accuracies": 0.78125, "rewards/chosen": -2.095834493637085, "rewards/margins": 1.2369600534439087, "rewards/rejected": -3.3327949047088623, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 23.605958938598633, "learning_rate": 1.9761360113929853e-07, "logits/chosen": -1.9870498180389404, "logits/rejected": -1.9329030513763428, "logps/chosen": -275.0382385253906, "logps/rejected": -385.61199951171875, "loss": 0.5368, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.195175886154175, "rewards/margins": 1.129639983177185, "rewards/rejected": -3.3248162269592285, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 50.84027862548828, "learning_rate": 1.9712362286734545e-07, "logits/chosen": -2.05165433883667, "logits/rejected": -2.0006394386291504, "logps/chosen": -269.555908203125, "logps/rejected": -375.22344970703125, "loss": 0.5192, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1424124240875244, "rewards/margins": 1.0947930812835693, "rewards/rejected": -3.2372055053710938, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 36.0411262512207, "learning_rate": 1.9663385712688794e-07, "logits/chosen": -2.030020236968994, "logits/rejected": -1.9818140268325806, "logps/chosen": -263.0570983886719, "logps/rejected": -387.527587890625, "loss": 0.4541, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0825119018554688, "rewards/margins": 1.264723539352417, "rewards/rejected": -3.3472354412078857, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 33.28034210205078, "learning_rate": 1.9614430588649217e-07, "logits/chosen": -1.9764604568481445, "logits/rejected": -1.927027702331543, "logps/chosen": -260.2097473144531, "logps/rejected": -379.53350830078125, "loss": 0.4356, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0522079467773438, "rewards/margins": 1.1991796493530273, "rewards/rejected": -3.25138783454895, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 43.7881965637207, "learning_rate": 1.956549711138621e-07, "logits/chosen": -1.9924142360687256, "logits/rejected": -1.935875654220581, "logps/chosen": -283.5320129394531, "logps/rejected": -404.827392578125, "loss": 0.4934, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2974987030029297, "rewards/margins": 1.2398579120635986, "rewards/rejected": -3.5373566150665283, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 26.40317153930664, "learning_rate": 1.951658547758317e-07, "logits/chosen": -2.022430658340454, "logits/rejected": -1.9683517217636108, "logps/chosen": -258.402099609375, "logps/rejected": -374.1106872558594, "loss": 0.4205, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.008884906768799, "rewards/margins": 1.2345539331436157, "rewards/rejected": -3.243439197540283, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 33.29716873168945, "learning_rate": 1.9467695883835692e-07, "logits/chosen": -1.9891599416732788, "logits/rejected": -1.9449787139892578, "logps/chosen": -248.6431427001953, "logps/rejected": -382.27557373046875, "loss": 0.4511, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9370352029800415, "rewards/margins": 1.3258932828903198, "rewards/rejected": -3.2629284858703613, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 44.16842269897461, "learning_rate": 1.9418828526650766e-07, "logits/chosen": -2.0228426456451416, "logits/rejected": -1.9621185064315796, "logps/chosen": -251.39242553710938, "logps/rejected": -352.22308349609375, "loss": 0.4477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9484155178070068, "rewards/margins": 1.0773483514785767, "rewards/rejected": -3.025763988494873, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 41.234554290771484, "learning_rate": 1.9369983602446025e-07, "logits/chosen": -2.0109829902648926, "logits/rejected": -1.9568634033203125, "logps/chosen": -247.7287139892578, "logps/rejected": -354.6108093261719, "loss": 0.4685, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9395967721939087, "rewards/margins": 1.1188300848007202, "rewards/rejected": -3.058427095413208, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 36.743648529052734, "learning_rate": 1.9321161307548935e-07, "logits/chosen": -2.006873369216919, "logits/rejected": -1.9556163549423218, "logps/chosen": -246.66427612304688, "logps/rejected": -349.91705322265625, "loss": 0.4968, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9337928295135498, "rewards/margins": 1.051224946975708, "rewards/rejected": -2.985017776489258, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 25.436752319335938, "learning_rate": 1.9272361838196e-07, "logits/chosen": -2.12395977973938, "logits/rejected": -2.058037281036377, "logps/chosen": -245.2313690185547, "logps/rejected": -361.54534912109375, "loss": 0.3993, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9124692678451538, "rewards/margins": 1.2043049335479736, "rewards/rejected": -3.116774320602417, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 41.69729232788086, "learning_rate": 1.922358539053197e-07, "logits/chosen": -1.9981117248535156, "logits/rejected": -1.941663384437561, "logps/chosen": -263.2612609863281, "logps/rejected": -374.8126525878906, "loss": 0.5049, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0853493213653564, "rewards/margins": 1.1175873279571533, "rewards/rejected": -3.202936887741089, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 31.728364944458008, "learning_rate": 1.9174832160609067e-07, "logits/chosen": -1.9576467275619507, "logits/rejected": -1.898319959640503, "logps/chosen": -265.2740173339844, "logps/rejected": -391.9245300292969, "loss": 0.4712, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.125028371810913, "rewards/margins": 1.2731573581695557, "rewards/rejected": -3.3981852531433105, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 39.7555046081543, "learning_rate": 1.91261023443862e-07, "logits/chosen": -1.9749969244003296, "logits/rejected": -1.9226016998291016, "logps/chosen": -274.27178955078125, "logps/rejected": -393.4690856933594, "loss": 0.4483, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.196519613265991, "rewards/margins": 1.2170164585113525, "rewards/rejected": -3.4135360717773438, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 37.342227935791016, "learning_rate": 1.9077396137728166e-07, "logits/chosen": -1.9684898853302002, "logits/rejected": -1.9151008129119873, "logps/chosen": -280.0582580566406, "logps/rejected": -418.4295959472656, "loss": 0.3954, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.2169878482818604, "rewards/margins": 1.4318593740463257, "rewards/rejected": -3.6488471031188965, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 25.664596557617188, "learning_rate": 1.9028713736404866e-07, "logits/chosen": -2.075706958770752, "logits/rejected": -2.021446943283081, "logps/chosen": -267.18267822265625, "logps/rejected": -400.29888916015625, "loss": 0.4517, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1281251907348633, "rewards/margins": 1.324436902999878, "rewards/rejected": -3.452561855316162, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 39.812618255615234, "learning_rate": 1.8980055336090503e-07, "logits/chosen": -2.1293275356292725, "logits/rejected": -2.0732367038726807, "logps/chosen": -236.0831756591797, "logps/rejected": -358.1720886230469, "loss": 0.431, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.827760934829712, "rewards/margins": 1.2310631275177002, "rewards/rejected": -3.058823585510254, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 28.97698211669922, "learning_rate": 1.8931421132362826e-07, "logits/chosen": -2.0683188438415527, "logits/rejected": -2.0173563957214355, "logps/chosen": -246.6348419189453, "logps/rejected": -374.78814697265625, "loss": 0.4488, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9310967922210693, "rewards/margins": 1.2858989238739014, "rewards/rejected": -3.2169957160949707, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.0535407066345215, "eval_logits/rejected": -2.0317139625549316, "eval_logps/chosen": -271.04888916015625, "eval_logps/rejected": -316.684326171875, "eval_loss": 0.6557953357696533, "eval_rewards/accuracies": 0.6449813842773438, "eval_rewards/chosen": -2.1203341484069824, "eval_rewards/margins": 0.4190131723880768, "eval_rewards/rejected": -2.5393471717834473, "eval_runtime": 361.6654, "eval_samples_per_second": 11.901, "eval_steps_per_second": 1.488, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 21.25095558166504, "learning_rate": 1.888281132070232e-07, "logits/chosen": -2.0479507446289062, "logits/rejected": -1.9832979440689087, "logps/chosen": -264.6322326660156, "logps/rejected": -383.3566589355469, "loss": 0.4363, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0596108436584473, "rewards/margins": 1.2815499305725098, "rewards/rejected": -3.341161012649536, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 47.55622863769531, "learning_rate": 1.8834226096491457e-07, "logits/chosen": -1.985553503036499, "logits/rejected": -1.9404199123382568, "logps/chosen": -259.9513854980469, "logps/rejected": -370.4425048828125, "loss": 0.4798, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0600924491882324, "rewards/margins": 1.1396005153656006, "rewards/rejected": -3.199692964553833, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 73.32157135009766, "learning_rate": 1.8785665655013822e-07, "logits/chosen": -1.9949207305908203, "logits/rejected": -1.9449560642242432, "logps/chosen": -290.11468505859375, "logps/rejected": -411.42767333984375, "loss": 0.4582, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.370034694671631, "rewards/margins": 1.2209819555282593, "rewards/rejected": -3.5910167694091797, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 31.25346565246582, "learning_rate": 1.8737130191453443e-07, "logits/chosen": -1.99038827419281, "logits/rejected": -1.9474595785140991, "logps/chosen": -278.84490966796875, "logps/rejected": -375.85589599609375, "loss": 0.5546, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.248175859451294, "rewards/margins": 1.0037610530853271, "rewards/rejected": -3.251936674118042, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 45.17557144165039, "learning_rate": 1.868861990089393e-07, "logits/chosen": -1.9947643280029297, "logits/rejected": -1.9514482021331787, "logps/chosen": -250.06326293945312, "logps/rejected": -357.240478515625, "loss": 0.486, "rewards/accuracies": 0.75, "rewards/chosen": -1.9859851598739624, "rewards/margins": 1.0463157892227173, "rewards/rejected": -3.0323007106781006, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 52.394493103027344, "learning_rate": 1.8640134978317706e-07, "logits/chosen": -2.089376926422119, "logits/rejected": -2.0164506435394287, "logps/chosen": -237.2823028564453, "logps/rejected": -351.81494140625, "loss": 0.4852, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8111803531646729, "rewards/margins": 1.185259461402893, "rewards/rejected": -2.9964399337768555, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 25.866594314575195, "learning_rate": 1.859167561860527e-07, "logits/chosen": -1.9796807765960693, "logits/rejected": -1.9330120086669922, "logps/chosen": -228.75790405273438, "logps/rejected": -355.3408203125, "loss": 0.4716, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7788339853286743, "rewards/margins": 1.2205626964569092, "rewards/rejected": -2.999396562576294, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 43.45176696777344, "learning_rate": 1.8543242016534298e-07, "logits/chosen": -2.0059409141540527, "logits/rejected": -1.9454164505004883, "logps/chosen": -261.5819396972656, "logps/rejected": -392.29901123046875, "loss": 0.4102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.065403699874878, "rewards/margins": 1.3378067016601562, "rewards/rejected": -3.403210401535034, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 40.9916877746582, "learning_rate": 1.8494834366779008e-07, "logits/chosen": -2.0208678245544434, "logits/rejected": -1.976416826248169, "logps/chosen": -276.38690185546875, "logps/rejected": -391.3465881347656, "loss": 0.4899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2208657264709473, "rewards/margins": 1.1508612632751465, "rewards/rejected": -3.3717265129089355, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 50.783226013183594, "learning_rate": 1.844645286390927e-07, "logits/chosen": -2.037166118621826, "logits/rejected": -1.973984956741333, "logps/chosen": -267.40679931640625, "logps/rejected": -384.87554931640625, "loss": 0.4604, "rewards/accuracies": 0.8125, "rewards/chosen": -2.104724407196045, "rewards/margins": 1.242376446723938, "rewards/rejected": -3.3471012115478516, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 41.24894714355469, "learning_rate": 1.8398097702389875e-07, "logits/chosen": -1.963120460510254, "logits/rejected": -1.9057592153549194, "logps/chosen": -260.38134765625, "logps/rejected": -370.3000183105469, "loss": 0.4875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0327491760253906, "rewards/margins": 1.1303297281265259, "rewards/rejected": -3.163078784942627, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 25.644855499267578, "learning_rate": 1.8349769076579712e-07, "logits/chosen": -1.9795176982879639, "logits/rejected": -1.9256212711334229, "logps/chosen": -249.01821899414062, "logps/rejected": -373.9560546875, "loss": 0.3926, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9259611368179321, "rewards/margins": 1.2956678867340088, "rewards/rejected": -3.2216289043426514, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 41.94106674194336, "learning_rate": 1.8301467180731033e-07, "logits/chosen": -1.9166953563690186, "logits/rejected": -1.8661868572235107, "logps/chosen": -269.175537109375, "logps/rejected": -377.1507568359375, "loss": 0.4487, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1400136947631836, "rewards/margins": 1.1192739009857178, "rewards/rejected": -3.2592880725860596, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 52.158477783203125, "learning_rate": 1.8253192208988657e-07, "logits/chosen": -1.9322324991226196, "logits/rejected": -1.8848450183868408, "logps/chosen": -282.6719665527344, "logps/rejected": -407.81072998046875, "loss": 0.442, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.257007598876953, "rewards/margins": 1.2809138298034668, "rewards/rejected": -3.53792142868042, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 47.92372131347656, "learning_rate": 1.8204944355389172e-07, "logits/chosen": -1.9877846240997314, "logits/rejected": -1.9390790462493896, "logps/chosen": -272.74407958984375, "logps/rejected": -395.9543762207031, "loss": 0.428, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.174280881881714, "rewards/margins": 1.2442162036895752, "rewards/rejected": -3.418497085571289, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 41.54286575317383, "learning_rate": 1.8156723813860169e-07, "logits/chosen": -1.988814353942871, "logits/rejected": -1.9397096633911133, "logps/chosen": -281.7684020996094, "logps/rejected": -392.3604736328125, "loss": 0.5166, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2756786346435547, "rewards/margins": 1.1293554306030273, "rewards/rejected": -3.405033826828003, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 36.3850212097168, "learning_rate": 1.8108530778219455e-07, "logits/chosen": -2.0230507850646973, "logits/rejected": -1.9660866260528564, "logps/chosen": -243.6371307373047, "logps/rejected": -353.1454162597656, "loss": 0.4969, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8600499629974365, "rewards/margins": 1.1356605291366577, "rewards/rejected": -2.9957103729248047, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 29.159048080444336, "learning_rate": 1.806036544217429e-07, "logits/chosen": -2.0591073036193848, "logits/rejected": -2.0053396224975586, "logps/chosen": -243.8022918701172, "logps/rejected": -357.51495361328125, "loss": 0.4645, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8782495260238647, "rewards/margins": 1.144733190536499, "rewards/rejected": -3.0229828357696533, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 30.029666900634766, "learning_rate": 1.8012227999320604e-07, "logits/chosen": -2.0071699619293213, "logits/rejected": -1.9563453197479248, "logps/chosen": -217.38845825195312, "logps/rejected": -331.3340148925781, "loss": 0.4515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6387300491333008, "rewards/margins": 1.1501741409301758, "rewards/rejected": -2.7889039516448975, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 36.38576126098633, "learning_rate": 1.7964118643142196e-07, "logits/chosen": -2.0993247032165527, "logits/rejected": -2.065068006515503, "logps/chosen": -239.4434356689453, "logps/rejected": -354.570068359375, "loss": 0.4826, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.863684058189392, "rewards/margins": 1.1226847171783447, "rewards/rejected": -2.9863688945770264, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 36.76717758178711, "learning_rate": 1.791603756700998e-07, "logits/chosen": -2.006556987762451, "logits/rejected": -1.9471065998077393, "logps/chosen": -252.9434356689453, "logps/rejected": -391.0294494628906, "loss": 0.3921, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9897778034210205, "rewards/margins": 1.416285514831543, "rewards/rejected": -3.4060630798339844, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 20.732736587524414, "learning_rate": 1.7867984964181194e-07, "logits/chosen": -2.0571486949920654, "logits/rejected": -1.9969736337661743, "logps/chosen": -249.04635620117188, "logps/rejected": -387.585693359375, "loss": 0.3807, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9302200078964233, "rewards/margins": 1.4255729913711548, "rewards/rejected": -3.355792999267578, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 33.26679992675781, "learning_rate": 1.7819961027798653e-07, "logits/chosen": -2.032064914703369, "logits/rejected": -1.9785715341567993, "logps/chosen": -282.28070068359375, "logps/rejected": -418.87677001953125, "loss": 0.4818, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2626571655273438, "rewards/margins": 1.3945884704589844, "rewards/rejected": -3.657245635986328, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 31.185171127319336, "learning_rate": 1.777196595088993e-07, "logits/chosen": -2.026113510131836, "logits/rejected": -1.9730275869369507, "logps/chosen": -293.0149230957031, "logps/rejected": -412.3173828125, "loss": 0.4702, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3516273498535156, "rewards/margins": 1.2242400646209717, "rewards/rejected": -3.5758674144744873, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 29.499040603637695, "learning_rate": 1.7723999926366607e-07, "logits/chosen": -1.9686431884765625, "logits/rejected": -1.9163358211517334, "logps/chosen": -281.82098388671875, "logps/rejected": -420.391357421875, "loss": 0.4339, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.2946906089782715, "rewards/margins": 1.36869215965271, "rewards/rejected": -3.6633827686309814, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 31.285354614257812, "learning_rate": 1.7676063147023486e-07, "logits/chosen": -1.98250412940979, "logits/rejected": -1.902591347694397, "logps/chosen": -279.9284362792969, "logps/rejected": -419.84893798828125, "loss": 0.4268, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2453508377075195, "rewards/margins": 1.4421086311340332, "rewards/rejected": -3.6874594688415527, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 31.769317626953125, "learning_rate": 1.762815580553782e-07, "logits/chosen": -2.0290043354034424, "logits/rejected": -1.9549000263214111, "logps/chosen": -274.19476318359375, "logps/rejected": -414.11065673828125, "loss": 0.4315, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1953015327453613, "rewards/margins": 1.4572569131851196, "rewards/rejected": -3.6525585651397705, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 23.747516632080078, "learning_rate": 1.7580278094468563e-07, "logits/chosen": -2.0042121410369873, "logits/rejected": -1.9560444355010986, "logps/chosen": -280.6150817871094, "logps/rejected": -397.8692321777344, "loss": 0.4909, "rewards/accuracies": 0.75, "rewards/chosen": -2.235276460647583, "rewards/margins": 1.2275713682174683, "rewards/rejected": -3.4628474712371826, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 56.00666427612305, "learning_rate": 1.753243020625555e-07, "logits/chosen": -2.010807752609253, "logits/rejected": -1.9577172994613647, "logps/chosen": -278.1907043457031, "logps/rejected": -382.7396545410156, "loss": 0.5005, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2005579471588135, "rewards/margins": 1.1054139137268066, "rewards/rejected": -3.30597186088562, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 46.680198669433594, "learning_rate": 1.748461233321874e-07, "logits/chosen": -1.9757673740386963, "logits/rejected": -1.9329439401626587, "logps/chosen": -245.8427276611328, "logps/rejected": -351.6376953125, "loss": 0.5166, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9388740062713623, "rewards/margins": 1.0512489080429077, "rewards/rejected": -2.9901227951049805, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 22.51175308227539, "learning_rate": 1.743682466755747e-07, "logits/chosen": -1.9942877292633057, "logits/rejected": -1.919983148574829, "logps/chosen": -261.4507751464844, "logps/rejected": -416.44683837890625, "loss": 0.4576, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.06864857673645, "rewards/margins": 1.5745538473129272, "rewards/rejected": -3.6432018280029297, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 41.89836883544922, "learning_rate": 1.738906740134964e-07, "logits/chosen": -2.076693058013916, "logits/rejected": -2.0234456062316895, "logps/chosen": -261.53851318359375, "logps/rejected": -374.6880798339844, "loss": 0.4584, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.01017165184021, "rewards/margins": 1.2082760334014893, "rewards/rejected": -3.21844744682312, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 57.876564025878906, "learning_rate": 1.7341340726550982e-07, "logits/chosen": -1.987900972366333, "logits/rejected": -1.9265962839126587, "logps/chosen": -265.7851257324219, "logps/rejected": -385.83160400390625, "loss": 0.4413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09367036819458, "rewards/margins": 1.2371257543563843, "rewards/rejected": -3.330796003341675, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 38.07902908325195, "learning_rate": 1.7293644834994265e-07, "logits/chosen": -1.9308849573135376, "logits/rejected": -1.8795785903930664, "logps/chosen": -267.89190673828125, "logps/rejected": -392.2071838378906, "loss": 0.4149, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.123365879058838, "rewards/margins": 1.2871346473693848, "rewards/rejected": -3.4105002880096436, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 51.1519889831543, "learning_rate": 1.7245979918388512e-07, "logits/chosen": -2.005169153213501, "logits/rejected": -1.963592767715454, "logps/chosen": -295.58154296875, "logps/rejected": -406.9353942871094, "loss": 0.5097, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3682665824890137, "rewards/margins": 1.1456425189971924, "rewards/rejected": -3.5139095783233643, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 34.57376480102539, "learning_rate": 1.7198346168318257e-07, "logits/chosen": -2.038787364959717, "logits/rejected": -1.9917595386505127, "logps/chosen": -248.0997314453125, "logps/rejected": -349.9923400878906, "loss": 0.4997, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9474378824234009, "rewards/margins": 1.0454728603363037, "rewards/rejected": -2.992910861968994, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 32.42025375366211, "learning_rate": 1.7150743776242762e-07, "logits/chosen": -1.9767448902130127, "logits/rejected": -1.9280691146850586, "logps/chosen": -254.2997283935547, "logps/rejected": -351.42498779296875, "loss": 0.4826, "rewards/accuracies": 0.75, "rewards/chosen": -1.9570831060409546, "rewards/margins": 1.0704643726348877, "rewards/rejected": -3.0275473594665527, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 55.10776138305664, "learning_rate": 1.7103172933495266e-07, "logits/chosen": -2.0774331092834473, "logits/rejected": -2.0258259773254395, "logps/chosen": -265.4750671386719, "logps/rejected": -384.88360595703125, "loss": 0.4429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.103172540664673, "rewards/margins": 1.2388051748275757, "rewards/rejected": -3.341977596282959, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 26.25624656677246, "learning_rate": 1.7055633831282151e-07, "logits/chosen": -2.058413028717041, "logits/rejected": -1.9916912317276, "logps/chosen": -249.4329376220703, "logps/rejected": -386.6725769042969, "loss": 0.3662, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9345378875732422, "rewards/margins": 1.4226974248886108, "rewards/rejected": -3.3572354316711426, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 41.0643196105957, "learning_rate": 1.7008126660682273e-07, "logits/chosen": -2.0307328701019287, "logits/rejected": -1.9719680547714233, "logps/chosen": -268.20440673828125, "logps/rejected": -399.9913024902344, "loss": 0.4611, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1359944343566895, "rewards/margins": 1.3297420740127563, "rewards/rejected": -3.4657368659973145, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.0224127769470215, "eval_logits/rejected": -1.998708963394165, "eval_logps/chosen": -306.0948181152344, "eval_logps/rejected": -356.9083251953125, "eval_loss": 0.6646140813827515, "eval_rewards/accuracies": 0.6468401551246643, "eval_rewards/chosen": -2.4707934856414795, "eval_rewards/margins": 0.47079354524612427, "eval_rewards/rejected": -2.941586971282959, "eval_runtime": 361.8561, "eval_samples_per_second": 11.894, "eval_steps_per_second": 1.487, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 48.22035598754883, "learning_rate": 1.6960651612646113e-07, "logits/chosen": -1.9346179962158203, "logits/rejected": -1.8764925003051758, "logps/chosen": -294.4263000488281, "logps/rejected": -445.35711669921875, "loss": 0.464, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4312703609466553, "rewards/margins": 1.4967553615570068, "rewards/rejected": -3.928025484085083, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 39.21717071533203, "learning_rate": 1.6913208877995038e-07, "logits/chosen": -1.9758126735687256, "logits/rejected": -1.9285478591918945, "logps/chosen": -280.5477294921875, "logps/rejected": -409.70391845703125, "loss": 0.5177, "rewards/accuracies": 0.75, "rewards/chosen": -2.2805309295654297, "rewards/margins": 1.2664536237716675, "rewards/rejected": -3.5469844341278076, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 34.211395263671875, "learning_rate": 1.6865798647420565e-07, "logits/chosen": -1.99324631690979, "logits/rejected": -1.945502519607544, "logps/chosen": -286.435302734375, "logps/rejected": -395.6497802734375, "loss": 0.4573, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2897446155548096, "rewards/margins": 1.166741967201233, "rewards/rejected": -3.456486225128174, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 26.087488174438477, "learning_rate": 1.6818421111483519e-07, "logits/chosen": -2.015946626663208, "logits/rejected": -1.9643065929412842, "logps/chosen": -261.059326171875, "logps/rejected": -362.7684020996094, "loss": 0.4922, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0572943687438965, "rewards/margins": 1.0473647117614746, "rewards/rejected": -3.1046595573425293, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 33.3255500793457, "learning_rate": 1.6771076460613342e-07, "logits/chosen": -2.045555830001831, "logits/rejected": -1.9856678247451782, "logps/chosen": -249.6926727294922, "logps/rejected": -372.72137451171875, "loss": 0.4009, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9469709396362305, "rewards/margins": 1.249483346939087, "rewards/rejected": -3.1964540481567383, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 23.117216110229492, "learning_rate": 1.6723764885107284e-07, "logits/chosen": -1.9867252111434937, "logits/rejected": -1.9287971258163452, "logps/chosen": -266.19305419921875, "logps/rejected": -372.9690856933594, "loss": 0.4837, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0741875171661377, "rewards/margins": 1.1506420373916626, "rewards/rejected": -3.2248291969299316, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 44.461463928222656, "learning_rate": 1.6676486575129674e-07, "logits/chosen": -1.957049012184143, "logits/rejected": -1.9023834466934204, "logps/chosen": -268.98956298828125, "logps/rejected": -386.60235595703125, "loss": 0.4103, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1265718936920166, "rewards/margins": 1.2351770401000977, "rewards/rejected": -3.3617489337921143, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 24.83477210998535, "learning_rate": 1.6629241720711096e-07, "logits/chosen": -1.9939937591552734, "logits/rejected": -1.9325447082519531, "logps/chosen": -285.57568359375, "logps/rejected": -397.7808532714844, "loss": 0.4881, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.285452365875244, "rewards/margins": 1.1766626834869385, "rewards/rejected": -3.4621148109436035, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 41.174903869628906, "learning_rate": 1.658203051174769e-07, "logits/chosen": -2.0219919681549072, "logits/rejected": -1.970686674118042, "logps/chosen": -268.61444091796875, "logps/rejected": -394.9859619140625, "loss": 0.432, "rewards/accuracies": 0.8125, "rewards/chosen": -2.079193592071533, "rewards/margins": 1.315537452697754, "rewards/rejected": -3.394731044769287, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 33.40657424926758, "learning_rate": 1.6534853138000365e-07, "logits/chosen": -1.9379314184188843, "logits/rejected": -1.8896305561065674, "logps/chosen": -299.6044006347656, "logps/rejected": -393.23089599609375, "loss": 0.5603, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3959834575653076, "rewards/margins": 1.0061590671539307, "rewards/rejected": -3.402142286300659, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 52.6716423034668, "learning_rate": 1.6487709789094007e-07, "logits/chosen": -2.0082356929779053, "logits/rejected": -1.9558404684066772, "logps/chosen": -283.80218505859375, "logps/rejected": -424.55438232421875, "loss": 0.3816, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.3135826587677, "rewards/margins": 1.4265820980072021, "rewards/rejected": -3.7401645183563232, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 33.208168029785156, "learning_rate": 1.644060065451678e-07, "logits/chosen": -1.9769518375396729, "logits/rejected": -1.9190177917480469, "logps/chosen": -294.9504089355469, "logps/rejected": -429.8594665527344, "loss": 0.46, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3866686820983887, "rewards/margins": 1.3795229196548462, "rewards/rejected": -3.7661919593811035, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 63.43159484863281, "learning_rate": 1.6393525923619279e-07, "logits/chosen": -1.9697481393814087, "logits/rejected": -1.9202378988265991, "logps/chosen": -291.4168701171875, "logps/rejected": -413.87530517578125, "loss": 0.4516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3729403018951416, "rewards/margins": 1.2881883382797241, "rewards/rejected": -3.661128520965576, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 35.852962493896484, "learning_rate": 1.6346485785613852e-07, "logits/chosen": -1.9736683368682861, "logits/rejected": -1.9105730056762695, "logps/chosen": -273.4845275878906, "logps/rejected": -401.42559814453125, "loss": 0.4576, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1712100505828857, "rewards/margins": 1.304930567741394, "rewards/rejected": -3.4761409759521484, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 33.967796325683594, "learning_rate": 1.6299480429573802e-07, "logits/chosen": -2.0450968742370605, "logits/rejected": -1.989073395729065, "logps/chosen": -282.04229736328125, "logps/rejected": -387.19305419921875, "loss": 0.5488, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.236253023147583, "rewards/margins": 1.0867068767547607, "rewards/rejected": -3.3229598999023438, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 36.09415817260742, "learning_rate": 1.6252510044432622e-07, "logits/chosen": -2.006619453430176, "logits/rejected": -1.9583534002304077, "logps/chosen": -267.88201904296875, "logps/rejected": -375.99212646484375, "loss": 0.4557, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.13773250579834, "rewards/margins": 1.1414198875427246, "rewards/rejected": -3.2791523933410645, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 30.58839225769043, "learning_rate": 1.6205574818983228e-07, "logits/chosen": -2.011200428009033, "logits/rejected": -1.9597864151000977, "logps/chosen": -258.14093017578125, "logps/rejected": -376.9452819824219, "loss": 0.4507, "rewards/accuracies": 0.78125, "rewards/chosen": -2.013904571533203, "rewards/margins": 1.2291772365570068, "rewards/rejected": -3.243082046508789, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 78.1897201538086, "learning_rate": 1.6158674941877237e-07, "logits/chosen": -1.9451507329940796, "logits/rejected": -1.8842531442642212, "logps/chosen": -271.4443359375, "logps/rejected": -394.81353759765625, "loss": 0.4639, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1914219856262207, "rewards/margins": 1.257164478302002, "rewards/rejected": -3.4485867023468018, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 33.75755310058594, "learning_rate": 1.6111810601624184e-07, "logits/chosen": -2.038860559463501, "logits/rejected": -1.9730615615844727, "logps/chosen": -248.2669219970703, "logps/rejected": -380.95098876953125, "loss": 0.4422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9481830596923828, "rewards/margins": 1.3177549839019775, "rewards/rejected": -3.2659378051757812, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 39.778480529785156, "learning_rate": 1.6064981986590763e-07, "logits/chosen": -2.0072200298309326, "logits/rejected": -1.9555925130844116, "logps/chosen": -248.47384643554688, "logps/rejected": -364.9747619628906, "loss": 0.4513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9173710346221924, "rewards/margins": 1.2105586528778076, "rewards/rejected": -3.127929449081421, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 29.74614143371582, "learning_rate": 1.6018189285000072e-07, "logits/chosen": -1.9953339099884033, "logits/rejected": -1.943982720375061, "logps/chosen": -273.2976989746094, "logps/rejected": -387.4016418457031, "loss": 0.4671, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.169517993927002, "rewards/margins": 1.164113998413086, "rewards/rejected": -3.333631992340088, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 27.70301628112793, "learning_rate": 1.5971432684930852e-07, "logits/chosen": -1.9857032299041748, "logits/rejected": -1.9375683069229126, "logps/chosen": -263.55621337890625, "logps/rejected": -357.06805419921875, "loss": 0.4876, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.090210199356079, "rewards/margins": 0.9859424829483032, "rewards/rejected": -3.0761525630950928, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 28.47338104248047, "learning_rate": 1.592471237431675e-07, "logits/chosen": -1.9494556188583374, "logits/rejected": -1.9003379344940186, "logps/chosen": -258.4821472167969, "logps/rejected": -358.6229553222656, "loss": 0.5031, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0296130180358887, "rewards/margins": 1.0445277690887451, "rewards/rejected": -3.074141025543213, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 27.11261749267578, "learning_rate": 1.587802854094555e-07, "logits/chosen": -1.9922187328338623, "logits/rejected": -1.9520759582519531, "logps/chosen": -238.9619140625, "logps/rejected": -344.40460205078125, "loss": 0.4567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.860879898071289, "rewards/margins": 1.0597840547561646, "rewards/rejected": -2.920663833618164, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 50.365028381347656, "learning_rate": 1.5831381372458418e-07, "logits/chosen": -2.0092692375183105, "logits/rejected": -1.9559404850006104, "logps/chosen": -253.676025390625, "logps/rejected": -371.489501953125, "loss": 0.4383, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.000629186630249, "rewards/margins": 1.2027219533920288, "rewards/rejected": -3.2033512592315674, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 26.66362953186035, "learning_rate": 1.578477105634914e-07, "logits/chosen": -1.9663670063018799, "logits/rejected": -1.9066731929779053, "logps/chosen": -267.9611511230469, "logps/rejected": -404.2895812988281, "loss": 0.416, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1187379360198975, "rewards/margins": 1.3901035785675049, "rewards/rejected": -3.5088417530059814, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 28.536787033081055, "learning_rate": 1.5738197779963385e-07, "logits/chosen": -2.0132460594177246, "logits/rejected": -1.9331022500991821, "logps/chosen": -275.8514709472656, "logps/rejected": -408.76129150390625, "loss": 0.405, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1780247688293457, "rewards/margins": 1.4043099880218506, "rewards/rejected": -3.582334518432617, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 49.4683837890625, "learning_rate": 1.5691661730497934e-07, "logits/chosen": -2.0169177055358887, "logits/rejected": -1.9501965045928955, "logps/chosen": -255.26498413085938, "logps/rejected": -362.5444030761719, "loss": 0.4865, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9774376153945923, "rewards/margins": 1.1496721506118774, "rewards/rejected": -3.1271095275878906, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 24.46822166442871, "learning_rate": 1.5645163094999969e-07, "logits/chosen": -2.0169854164123535, "logits/rejected": -1.9575512409210205, "logps/chosen": -268.773681640625, "logps/rejected": -380.21295166015625, "loss": 0.4879, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.128267765045166, "rewards/margins": 1.1672557592391968, "rewards/rejected": -3.2955238819122314, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 38.415809631347656, "learning_rate": 1.559870206036626e-07, "logits/chosen": -2.023591995239258, "logits/rejected": -1.9795535802841187, "logps/chosen": -248.5731658935547, "logps/rejected": -344.3983154296875, "loss": 0.4949, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9699647426605225, "rewards/margins": 0.9742344617843628, "rewards/rejected": -2.944199323654175, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 38.590301513671875, "learning_rate": 1.5552278813342443e-07, "logits/chosen": -1.9903109073638916, "logits/rejected": -1.9450523853302002, "logps/chosen": -266.0024719238281, "logps/rejected": -375.02264404296875, "loss": 0.4982, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.097303867340088, "rewards/margins": 1.1330721378326416, "rewards/rejected": -3.2303760051727295, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 34.56684112548828, "learning_rate": 1.550589354052228e-07, "logits/chosen": -2.0108087062835693, "logits/rejected": -1.9296718835830688, "logps/chosen": -250.23782348632812, "logps/rejected": -362.9869079589844, "loss": 0.442, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8913452625274658, "rewards/margins": 1.241668462753296, "rewards/rejected": -3.133013963699341, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 21.778335571289062, "learning_rate": 1.5459546428346914e-07, "logits/chosen": -2.001823902130127, "logits/rejected": -1.946722388267517, "logps/chosen": -233.7056884765625, "logps/rejected": -336.0831298828125, "loss": 0.449, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7883857488632202, "rewards/margins": 1.0894372463226318, "rewards/rejected": -2.8778228759765625, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 42.611602783203125, "learning_rate": 1.5413237663104085e-07, "logits/chosen": -1.9997422695159912, "logits/rejected": -1.950979232788086, "logps/chosen": -258.2232971191406, "logps/rejected": -401.89129638671875, "loss": 0.3996, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.063605308532715, "rewards/margins": 1.40665602684021, "rewards/rejected": -3.470261335372925, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 35.406578063964844, "learning_rate": 1.5366967430927397e-07, "logits/chosen": -1.9369081258773804, "logits/rejected": -1.8973232507705688, "logps/chosen": -295.0162048339844, "logps/rejected": -403.62060546875, "loss": 0.5356, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4257140159606934, "rewards/margins": 1.0877028703689575, "rewards/rejected": -3.5134170055389404, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 40.58312225341797, "learning_rate": 1.5320735917795591e-07, "logits/chosen": -1.963915228843689, "logits/rejected": -1.9011389017105103, "logps/chosen": -288.8177490234375, "logps/rejected": -414.3863220214844, "loss": 0.4336, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.325291156768799, "rewards/margins": 1.3138149976730347, "rewards/rejected": -3.639106273651123, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 40.99516296386719, "learning_rate": 1.5274543309531764e-07, "logits/chosen": -1.9867122173309326, "logits/rejected": -1.9286584854125977, "logps/chosen": -299.43719482421875, "logps/rejected": -432.5682678222656, "loss": 0.4362, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4471497535705566, "rewards/margins": 1.3316915035247803, "rewards/rejected": -3.778841495513916, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 48.45778274536133, "learning_rate": 1.5228389791802663e-07, "logits/chosen": -2.027468204498291, "logits/rejected": -1.9723812341690063, "logps/chosen": -290.87469482421875, "logps/rejected": -418.28839111328125, "loss": 0.455, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3367998600006104, "rewards/margins": 1.3011219501495361, "rewards/rejected": -3.6379218101501465, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 37.24993896484375, "learning_rate": 1.5182275550117895e-07, "logits/chosen": -1.941125512123108, "logits/rejected": -1.8772166967391968, "logps/chosen": -280.60302734375, "logps/rejected": -397.8558044433594, "loss": 0.4524, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2048726081848145, "rewards/margins": 1.271545171737671, "rewards/rejected": -3.4764175415039062, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 37.41301345825195, "learning_rate": 1.5136200769829192e-07, "logits/chosen": -2.0175511837005615, "logits/rejected": -1.9650070667266846, "logps/chosen": -253.857177734375, "logps/rejected": -377.40924072265625, "loss": 0.4546, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0248160362243652, "rewards/margins": 1.2216683626174927, "rewards/rejected": -3.2464842796325684, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.019531726837158, "eval_logits/rejected": -1.9966729879379272, "eval_logps/chosen": -286.5284423828125, "eval_logps/rejected": -335.9582824707031, "eval_loss": 0.6541406512260437, "eval_rewards/accuracies": 0.6435873508453369, "eval_rewards/chosen": -2.275129556655884, "eval_rewards/margins": 0.45695725083351135, "eval_rewards/rejected": -2.7320868968963623, "eval_runtime": 362.8207, "eval_samples_per_second": 11.863, "eval_steps_per_second": 1.483, "step": 11600 }, { "epoch": 2.0003445899379737, "grad_norm": 37.33259201049805, "learning_rate": 1.50901656361297e-07, "logits/chosen": -1.964093565940857, "logits/rejected": -1.9069855213165283, "logps/chosen": -256.0425720214844, "logps/rejected": -386.4458312988281, "loss": 0.4353, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.03315806388855, "rewards/margins": 1.3188011646270752, "rewards/rejected": -3.351959705352783, "step": 11610 }, { "epoch": 2.0020675396278427, "grad_norm": 34.64838790893555, "learning_rate": 1.504417033405319e-07, "logits/chosen": -2.0020484924316406, "logits/rejected": -1.9461925029754639, "logps/chosen": -269.329833984375, "logps/rejected": -405.37811279296875, "loss": 0.3993, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.120792865753174, "rewards/margins": 1.3729735612869263, "rewards/rejected": -3.4937667846679688, "step": 11620 }, { "epoch": 2.003790489317712, "grad_norm": 21.82311248779297, "learning_rate": 1.4998215048473357e-07, "logits/chosen": -1.9972374439239502, "logits/rejected": -1.9257395267486572, "logps/chosen": -255.61538696289062, "logps/rejected": -409.0962829589844, "loss": 0.3355, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.969273567199707, "rewards/margins": 1.598056435585022, "rewards/rejected": -3.5673298835754395, "step": 11630 }, { "epoch": 2.005513439007581, "grad_norm": 33.785865783691406, "learning_rate": 1.4952299964103004e-07, "logits/chosen": -1.9824466705322266, "logits/rejected": -1.9212512969970703, "logps/chosen": -269.06146240234375, "logps/rejected": -411.01416015625, "loss": 0.3717, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.1552460193634033, "rewards/margins": 1.4568287134170532, "rewards/rejected": -3.612074613571167, "step": 11640 }, { "epoch": 2.00723638869745, "grad_norm": 33.466243743896484, "learning_rate": 1.490642526549341e-07, "logits/chosen": -1.918196439743042, "logits/rejected": -1.8455432653427124, "logps/chosen": -267.04290771484375, "logps/rejected": -418.99444580078125, "loss": 0.364, "rewards/accuracies": 0.875, "rewards/chosen": -2.1170568466186523, "rewards/margins": 1.5664006471633911, "rewards/rejected": -3.683457612991333, "step": 11650 }, { "epoch": 2.008959338387319, "grad_norm": 37.37173843383789, "learning_rate": 1.486059113703349e-07, "logits/chosen": -1.9491525888442993, "logits/rejected": -1.8650496006011963, "logps/chosen": -309.7674865722656, "logps/rejected": -457.23260498046875, "loss": 0.37, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5339298248291016, "rewards/margins": 1.5483729839324951, "rewards/rejected": -4.082303047180176, "step": 11660 }, { "epoch": 2.010682288077188, "grad_norm": 31.43915367126465, "learning_rate": 1.4814797762949094e-07, "logits/chosen": -1.918054223060608, "logits/rejected": -1.8322408199310303, "logps/chosen": -318.16180419921875, "logps/rejected": -521.9979248046875, "loss": 0.2937, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.6157901287078857, "rewards/margins": 2.0760855674743652, "rewards/rejected": -4.691876411437988, "step": 11670 }, { "epoch": 2.0124052377670574, "grad_norm": 46.547306060791016, "learning_rate": 1.47690453273023e-07, "logits/chosen": -1.8848421573638916, "logits/rejected": -1.8277667760849, "logps/chosen": -335.9585266113281, "logps/rejected": -503.4609375, "loss": 0.379, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.7837977409362793, "rewards/margins": 1.6997438669204712, "rewards/rejected": -4.483541965484619, "step": 11680 }, { "epoch": 2.0141281874569263, "grad_norm": 44.76530838012695, "learning_rate": 1.4723334013990562e-07, "logits/chosen": -1.9853168725967407, "logits/rejected": -1.9154516458511353, "logps/chosen": -327.79083251953125, "logps/rejected": -475.179931640625, "loss": 0.407, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6683692932128906, "rewards/margins": 1.5524470806121826, "rewards/rejected": -4.220816612243652, "step": 11690 }, { "epoch": 2.0158511371467953, "grad_norm": 29.296173095703125, "learning_rate": 1.4677664006746126e-07, "logits/chosen": -2.017822265625, "logits/rejected": -1.9463411569595337, "logps/chosen": -295.9585266113281, "logps/rejected": -449.8290100097656, "loss": 0.3578, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3865818977355957, "rewards/margins": 1.5512158870697021, "rewards/rejected": -3.9377975463867188, "step": 11700 }, { "epoch": 2.0175740868366643, "grad_norm": 57.6847038269043, "learning_rate": 1.4632035489135169e-07, "logits/chosen": -1.9698623418807983, "logits/rejected": -1.8926331996917725, "logps/chosen": -299.74566650390625, "logps/rejected": -462.079833984375, "loss": 0.3662, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4386746883392334, "rewards/margins": 1.6593185663223267, "rewards/rejected": -4.097992897033691, "step": 11710 }, { "epoch": 2.0192970365265333, "grad_norm": 17.02887725830078, "learning_rate": 1.458644864455712e-07, "logits/chosen": -2.000767230987549, "logits/rejected": -1.9426374435424805, "logps/chosen": -303.05084228515625, "logps/rejected": -451.16986083984375, "loss": 0.4259, "rewards/accuracies": 0.8125, "rewards/chosen": -2.478684902191162, "rewards/margins": 1.5077711343765259, "rewards/rejected": -3.9864563941955566, "step": 11720 }, { "epoch": 2.0210199862164027, "grad_norm": 34.90534591674805, "learning_rate": 1.45409036562439e-07, "logits/chosen": -1.9627971649169922, "logits/rejected": -1.89785897731781, "logps/chosen": -288.6501770019531, "logps/rejected": -457.83319091796875, "loss": 0.3844, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3309381008148193, "rewards/margins": 1.6790539026260376, "rewards/rejected": -4.0099921226501465, "step": 11730 }, { "epoch": 2.0227429359062716, "grad_norm": 23.329877853393555, "learning_rate": 1.4495400707259182e-07, "logits/chosen": -2.0058047771453857, "logits/rejected": -1.9528608322143555, "logps/chosen": -273.7113342285156, "logps/rejected": -411.271728515625, "loss": 0.3947, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2012686729431152, "rewards/margins": 1.3885650634765625, "rewards/rejected": -3.5898337364196777, "step": 11740 }, { "epoch": 2.0244658855961406, "grad_norm": 30.32756805419922, "learning_rate": 1.4449939980497688e-07, "logits/chosen": -1.9976352453231812, "logits/rejected": -1.9390451908111572, "logps/chosen": -281.8433837890625, "logps/rejected": -460.3387756347656, "loss": 0.3094, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.325040340423584, "rewards/margins": 1.7356574535369873, "rewards/rejected": -4.060698509216309, "step": 11750 }, { "epoch": 2.0261888352860096, "grad_norm": 40.211669921875, "learning_rate": 1.4404521658684436e-07, "logits/chosen": -1.9842277765274048, "logits/rejected": -1.9212192296981812, "logps/chosen": -313.7696838378906, "logps/rejected": -475.6824645996094, "loss": 0.4137, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.580165147781372, "rewards/margins": 1.6510244607925415, "rewards/rejected": -4.231189250946045, "step": 11760 }, { "epoch": 2.0279117849758785, "grad_norm": 40.162933349609375, "learning_rate": 1.4359145924373974e-07, "logits/chosen": -1.9128490686416626, "logits/rejected": -1.8453865051269531, "logps/chosen": -299.1348571777344, "logps/rejected": -468.7713928222656, "loss": 0.3288, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4083027839660645, "rewards/margins": 1.7666370868682861, "rewards/rejected": -4.17494010925293, "step": 11770 }, { "epoch": 2.029634734665748, "grad_norm": 43.0914192199707, "learning_rate": 1.4313812959949682e-07, "logits/chosen": -1.8900182247161865, "logits/rejected": -1.825042486190796, "logps/chosen": -314.4072265625, "logps/rejected": -464.59210205078125, "loss": 0.4105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6415810585021973, "rewards/margins": 1.4874277114868164, "rewards/rejected": -4.129008769989014, "step": 11780 }, { "epoch": 2.031357684355617, "grad_norm": 34.921688079833984, "learning_rate": 1.4268522947623053e-07, "logits/chosen": -1.920490026473999, "logits/rejected": -1.8588720560073853, "logps/chosen": -291.70648193359375, "logps/rejected": -509.09130859375, "loss": 0.2749, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.412020206451416, "rewards/margins": 2.150852918624878, "rewards/rejected": -4.562872886657715, "step": 11790 }, { "epoch": 2.033080634045486, "grad_norm": 29.160125732421875, "learning_rate": 1.4223276069432898e-07, "logits/chosen": -1.9423201084136963, "logits/rejected": -1.8831470012664795, "logps/chosen": -338.941650390625, "logps/rejected": -492.81671142578125, "loss": 0.3771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.803985595703125, "rewards/margins": 1.5960160493850708, "rewards/rejected": -4.400001525878906, "step": 11800 }, { "epoch": 2.034803583735355, "grad_norm": 42.92601013183594, "learning_rate": 1.4178072507244704e-07, "logits/chosen": -1.9298042058944702, "logits/rejected": -1.8617265224456787, "logps/chosen": -337.6637878417969, "logps/rejected": -514.6013793945312, "loss": 0.3411, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.806673288345337, "rewards/margins": 1.818861961364746, "rewards/rejected": -4.625535011291504, "step": 11810 }, { "epoch": 2.036526533425224, "grad_norm": 31.830278396606445, "learning_rate": 1.4132912442749804e-07, "logits/chosen": -1.9490562677383423, "logits/rejected": -1.8854787349700928, "logps/chosen": -305.84521484375, "logps/rejected": -473.886474609375, "loss": 0.3824, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.544320821762085, "rewards/margins": 1.6879554986953735, "rewards/rejected": -4.23227596282959, "step": 11820 }, { "epoch": 2.0382494831150932, "grad_norm": 38.818328857421875, "learning_rate": 1.4087796057464741e-07, "logits/chosen": -1.9240907430648804, "logits/rejected": -1.8648512363433838, "logps/chosen": -292.7670593261719, "logps/rejected": -469.896484375, "loss": 0.345, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4006237983703613, "rewards/margins": 1.7592111825942993, "rewards/rejected": -4.159834861755371, "step": 11830 }, { "epoch": 2.039972432804962, "grad_norm": 26.511688232421875, "learning_rate": 1.4042723532730449e-07, "logits/chosen": -1.9328176975250244, "logits/rejected": -1.8597285747528076, "logps/chosen": -308.6951904296875, "logps/rejected": -490.858154296875, "loss": 0.315, "rewards/accuracies": 0.90625, "rewards/chosen": -2.554982900619507, "rewards/margins": 1.8402225971221924, "rewards/rejected": -4.395205497741699, "step": 11840 }, { "epoch": 2.041695382494831, "grad_norm": 38.05181121826172, "learning_rate": 1.3997695049711608e-07, "logits/chosen": -1.898842215538025, "logits/rejected": -1.8268134593963623, "logps/chosen": -320.562744140625, "logps/rejected": -471.35748291015625, "loss": 0.375, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6516222953796387, "rewards/margins": 1.577473521232605, "rewards/rejected": -4.229096412658691, "step": 11850 }, { "epoch": 2.0434183321847, "grad_norm": 58.496402740478516, "learning_rate": 1.3952710789395878e-07, "logits/chosen": -1.9562299251556396, "logits/rejected": -1.9173316955566406, "logps/chosen": -348.47613525390625, "logps/rejected": -480.850341796875, "loss": 0.5004, "rewards/accuracies": 0.75, "rewards/chosen": -2.936743974685669, "rewards/margins": 1.301387071609497, "rewards/rejected": -4.238131046295166, "step": 11860 }, { "epoch": 2.045141281874569, "grad_norm": 49.54454803466797, "learning_rate": 1.3907770932593108e-07, "logits/chosen": -1.9042739868164062, "logits/rejected": -1.8530365228652954, "logps/chosen": -302.0828552246094, "logps/rejected": -465.28497314453125, "loss": 0.3967, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.524749755859375, "rewards/margins": 1.594477891921997, "rewards/rejected": -4.119227409362793, "step": 11870 }, { "epoch": 2.0468642315644385, "grad_norm": 47.53093719482422, "learning_rate": 1.3862875659934742e-07, "logits/chosen": -1.9479024410247803, "logits/rejected": -1.8763694763183594, "logps/chosen": -304.0093078613281, "logps/rejected": -466.21240234375, "loss": 0.3358, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4574196338653564, "rewards/margins": 1.6604318618774414, "rewards/rejected": -4.117851257324219, "step": 11880 }, { "epoch": 2.0485871812543075, "grad_norm": 41.84941482543945, "learning_rate": 1.3818025151873004e-07, "logits/chosen": -1.9622453451156616, "logits/rejected": -1.8968029022216797, "logps/chosen": -307.8858947753906, "logps/rejected": -455.3121032714844, "loss": 0.3952, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4986684322357178, "rewards/margins": 1.536169409751892, "rewards/rejected": -4.0348381996154785, "step": 11890 }, { "epoch": 2.0503101309441765, "grad_norm": 50.097347259521484, "learning_rate": 1.3773219588680167e-07, "logits/chosen": -1.8746473789215088, "logits/rejected": -1.8054075241088867, "logps/chosen": -296.9711608886719, "logps/rejected": -477.3529357910156, "loss": 0.34, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4296257495880127, "rewards/margins": 1.8380768299102783, "rewards/rejected": -4.267703056335449, "step": 11900 }, { "epoch": 2.0520330806340454, "grad_norm": 35.73570251464844, "learning_rate": 1.3728459150447874e-07, "logits/chosen": -2.0279855728149414, "logits/rejected": -1.9843858480453491, "logps/chosen": -297.2587890625, "logps/rejected": -468.70635986328125, "loss": 0.3595, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.439187526702881, "rewards/margins": 1.6750843524932861, "rewards/rejected": -4.114272117614746, "step": 11910 }, { "epoch": 2.0537560303239144, "grad_norm": 53.64319610595703, "learning_rate": 1.3683744017086386e-07, "logits/chosen": -1.964199423789978, "logits/rejected": -1.9108998775482178, "logps/chosen": -313.17486572265625, "logps/rejected": -467.337890625, "loss": 0.4056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5432918071746826, "rewards/margins": 1.5665208101272583, "rewards/rejected": -4.1098127365112305, "step": 11920 }, { "epoch": 2.055478980013784, "grad_norm": 32.15663528442383, "learning_rate": 1.3639074368323873e-07, "logits/chosen": -1.9454278945922852, "logits/rejected": -1.879626989364624, "logps/chosen": -299.2511291503906, "logps/rejected": -476.59307861328125, "loss": 0.3248, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.456939220428467, "rewards/margins": 1.7594232559204102, "rewards/rejected": -4.216361999511719, "step": 11930 }, { "epoch": 2.057201929703653, "grad_norm": 41.21245193481445, "learning_rate": 1.359445038370567e-07, "logits/chosen": -1.9431301355361938, "logits/rejected": -1.872836709022522, "logps/chosen": -269.83331298828125, "logps/rejected": -448.21929931640625, "loss": 0.3361, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.1429476737976074, "rewards/margins": 1.812497854232788, "rewards/rejected": -3.9554457664489746, "step": 11940 }, { "epoch": 2.0589248793935218, "grad_norm": 37.295597076416016, "learning_rate": 1.354987224259359e-07, "logits/chosen": -2.0178167819976807, "logits/rejected": -1.9470899105072021, "logps/chosen": -274.1566467285156, "logps/rejected": -430.16485595703125, "loss": 0.392, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.151776075363159, "rewards/margins": 1.5939236879348755, "rewards/rejected": -3.745699405670166, "step": 11950 }, { "epoch": 2.0606478290833907, "grad_norm": 48.4313850402832, "learning_rate": 1.3505340124165155e-07, "logits/chosen": -1.914493203163147, "logits/rejected": -1.8433141708374023, "logps/chosen": -272.5869140625, "logps/rejected": -461.40234375, "loss": 0.3346, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1910834312438965, "rewards/margins": 1.9050095081329346, "rewards/rejected": -4.09609317779541, "step": 11960 }, { "epoch": 2.0623707787732597, "grad_norm": 38.390506744384766, "learning_rate": 1.3460854207412927e-07, "logits/chosen": -2.0111584663391113, "logits/rejected": -1.9377391338348389, "logps/chosen": -277.7578430175781, "logps/rejected": -447.0340881347656, "loss": 0.3587, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.2062878608703613, "rewards/margins": 1.741132378578186, "rewards/rejected": -3.947420597076416, "step": 11970 }, { "epoch": 2.0640937284631287, "grad_norm": 29.428462982177734, "learning_rate": 1.3416414671143743e-07, "logits/chosen": -1.9599330425262451, "logits/rejected": -1.8841501474380493, "logps/chosen": -291.03515625, "logps/rejected": -460.79461669921875, "loss": 0.3189, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.3488030433654785, "rewards/margins": 1.7235733270645142, "rewards/rejected": -4.0723772048950195, "step": 11980 }, { "epoch": 2.065816678152998, "grad_norm": 30.11941146850586, "learning_rate": 1.3372021693978057e-07, "logits/chosen": -1.9964679479599, "logits/rejected": -1.92724609375, "logps/chosen": -320.68133544921875, "logps/rejected": -466.2008361816406, "loss": 0.4366, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6191534996032715, "rewards/margins": 1.534792184829712, "rewards/rejected": -4.1539459228515625, "step": 11990 }, { "epoch": 2.067539627842867, "grad_norm": 40.9322509765625, "learning_rate": 1.3327675454349135e-07, "logits/chosen": -1.9176896810531616, "logits/rejected": -1.867864966392517, "logps/chosen": -296.06597900390625, "logps/rejected": -464.32696533203125, "loss": 0.3836, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.442272186279297, "rewards/margins": 1.6620765924453735, "rewards/rejected": -4.104348659515381, "step": 12000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -1.9843589067459106, "eval_logits/rejected": -1.9585344791412354, "eval_logps/chosen": -334.6001281738281, "eval_logps/rejected": -394.8880920410156, "eval_loss": 0.6827077865600586, "eval_rewards/accuracies": 0.6463754773139954, "eval_rewards/chosen": -2.755847215652466, "eval_rewards/margins": 0.5655378699302673, "eval_rewards/rejected": -3.3213844299316406, "eval_runtime": 361.8827, "eval_samples_per_second": 11.893, "eval_steps_per_second": 1.487, "step": 12000 }, { "epoch": 2.069262577532736, "grad_norm": 39.82230758666992, "learning_rate": 1.3283376130502405e-07, "logits/chosen": -2.000572443008423, "logits/rejected": -1.9180504083633423, "logps/chosen": -309.64080810546875, "logps/rejected": -490.228759765625, "loss": 0.3601, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5387492179870605, "rewards/margins": 1.8195781707763672, "rewards/rejected": -4.358327388763428, "step": 12010 }, { "epoch": 2.070985527222605, "grad_norm": 43.60273742675781, "learning_rate": 1.3239123900494736e-07, "logits/chosen": -1.9799461364746094, "logits/rejected": -1.9173142910003662, "logps/chosen": -299.7392578125, "logps/rejected": -454.30023193359375, "loss": 0.3878, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4648945331573486, "rewards/margins": 1.5586652755737305, "rewards/rejected": -4.0235595703125, "step": 12020 }, { "epoch": 2.072708476912474, "grad_norm": 81.41376495361328, "learning_rate": 1.31949189421937e-07, "logits/chosen": -2.047072172164917, "logits/rejected": -1.9901902675628662, "logps/chosen": -282.29266357421875, "logps/rejected": -432.1227111816406, "loss": 0.3875, "rewards/accuracies": 0.84375, "rewards/chosen": -2.321225643157959, "rewards/margins": 1.4954288005828857, "rewards/rejected": -3.8166542053222656, "step": 12030 }, { "epoch": 2.0744314266023434, "grad_norm": 35.211814880371094, "learning_rate": 1.3150761433276858e-07, "logits/chosen": -1.9656760692596436, "logits/rejected": -1.8834718465805054, "logps/chosen": -295.324951171875, "logps/rejected": -500.71246337890625, "loss": 0.3061, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.3857743740081787, "rewards/margins": 2.110292673110962, "rewards/rejected": -4.496067047119141, "step": 12040 }, { "epoch": 2.0761543762922123, "grad_norm": 62.94130325317383, "learning_rate": 1.3106651551231041e-07, "logits/chosen": -2.015890121459961, "logits/rejected": -1.931300163269043, "logps/chosen": -311.99566650390625, "logps/rejected": -489.7413024902344, "loss": 0.3276, "rewards/accuracies": 0.875, "rewards/chosen": -2.5656542778015137, "rewards/margins": 1.8272279500961304, "rewards/rejected": -4.392882347106934, "step": 12050 }, { "epoch": 2.0778773259820813, "grad_norm": 56.522701263427734, "learning_rate": 1.3062589473351675e-07, "logits/chosen": -1.9648616313934326, "logits/rejected": -1.8981422185897827, "logps/chosen": -315.14044189453125, "logps/rejected": -496.851318359375, "loss": 0.3486, "rewards/accuracies": 0.84375, "rewards/chosen": -2.584763765335083, "rewards/margins": 1.8556935787200928, "rewards/rejected": -4.440457344055176, "step": 12060 }, { "epoch": 2.0796002756719503, "grad_norm": 60.15129852294922, "learning_rate": 1.301857537674204e-07, "logits/chosen": -1.9758834838867188, "logits/rejected": -1.8920056819915771, "logps/chosen": -300.3418884277344, "logps/rejected": -497.5635681152344, "loss": 0.3305, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.44091796875, "rewards/margins": 2.0436320304870605, "rewards/rejected": -4.484549522399902, "step": 12070 }, { "epoch": 2.0813232253618192, "grad_norm": 39.1005859375, "learning_rate": 1.2974609438312544e-07, "logits/chosen": -1.928297758102417, "logits/rejected": -1.8385089635849, "logps/chosen": -278.0054626464844, "logps/rejected": -509.8894958496094, "loss": 0.2745, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2088065147399902, "rewards/margins": 2.348057270050049, "rewards/rejected": -4.556863784790039, "step": 12080 }, { "epoch": 2.0830461750516887, "grad_norm": 69.02693176269531, "learning_rate": 1.2930691834780023e-07, "logits/chosen": -1.9618841409683228, "logits/rejected": -1.8927383422851562, "logps/chosen": -323.1204528808594, "logps/rejected": -473.625244140625, "loss": 0.4622, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6510982513427734, "rewards/margins": 1.5638573169708252, "rewards/rejected": -4.2149553298950195, "step": 12090 }, { "epoch": 2.0847691247415576, "grad_norm": 49.65925216674805, "learning_rate": 1.288682274266706e-07, "logits/chosen": -1.9403762817382812, "logits/rejected": -1.8725990056991577, "logps/chosen": -287.79583740234375, "logps/rejected": -453.991455078125, "loss": 0.3674, "rewards/accuracies": 0.84375, "rewards/chosen": -2.291826009750366, "rewards/margins": 1.7297157049179077, "rewards/rejected": -4.021541595458984, "step": 12100 }, { "epoch": 2.0864920744314266, "grad_norm": 24.758180618286133, "learning_rate": 1.2843002338301225e-07, "logits/chosen": -1.971364974975586, "logits/rejected": -1.8960367441177368, "logps/chosen": -303.2628479003906, "logps/rejected": -472.09918212890625, "loss": 0.3644, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.450784206390381, "rewards/margins": 1.741528868675232, "rewards/rejected": -4.192313194274902, "step": 12110 }, { "epoch": 2.0882150241212956, "grad_norm": 49.581809997558594, "learning_rate": 1.2799230797814415e-07, "logits/chosen": -1.9963009357452393, "logits/rejected": -1.9327083826065063, "logps/chosen": -286.9825134277344, "logps/rejected": -446.86407470703125, "loss": 0.3856, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3025104999542236, "rewards/margins": 1.653399109840393, "rewards/rejected": -3.955909252166748, "step": 12120 }, { "epoch": 2.0899379738111645, "grad_norm": 25.061363220214844, "learning_rate": 1.2755508297142118e-07, "logits/chosen": -1.9765379428863525, "logits/rejected": -1.891554832458496, "logps/chosen": -271.9354553222656, "logps/rejected": -466.00054931640625, "loss": 0.3136, "rewards/accuracies": 0.875, "rewards/chosen": -2.1719729900360107, "rewards/margins": 1.960875153541565, "rewards/rejected": -4.132847785949707, "step": 12130 }, { "epoch": 2.091660923501034, "grad_norm": 61.87263107299805, "learning_rate": 1.2711835012022697e-07, "logits/chosen": -2.0030934810638428, "logits/rejected": -1.9377405643463135, "logps/chosen": -290.70550537109375, "logps/rejected": -451.60888671875, "loss": 0.3677, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3768577575683594, "rewards/margins": 1.6030426025390625, "rewards/rejected": -3.97990083694458, "step": 12140 }, { "epoch": 2.093383873190903, "grad_norm": 67.82479858398438, "learning_rate": 1.2668211117996703e-07, "logits/chosen": -2.012500286102295, "logits/rejected": -1.9480791091918945, "logps/chosen": -281.7728271484375, "logps/rejected": -462.63885498046875, "loss": 0.3826, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2728779315948486, "rewards/margins": 1.833735466003418, "rewards/rejected": -4.106613636016846, "step": 12150 }, { "epoch": 2.095106822880772, "grad_norm": 41.75953674316406, "learning_rate": 1.2624636790406173e-07, "logits/chosen": -1.9487348794937134, "logits/rejected": -1.8791500329971313, "logps/chosen": -299.1808776855469, "logps/rejected": -456.084716796875, "loss": 0.4183, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.424877166748047, "rewards/margins": 1.6073623895645142, "rewards/rejected": -4.0322394371032715, "step": 12160 }, { "epoch": 2.096829772570641, "grad_norm": 30.884580612182617, "learning_rate": 1.2581112204393936e-07, "logits/chosen": -1.9606748819351196, "logits/rejected": -1.896061658859253, "logps/chosen": -292.00689697265625, "logps/rejected": -462.348388671875, "loss": 0.4046, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3637471199035645, "rewards/margins": 1.7077734470367432, "rewards/rejected": -4.071520805358887, "step": 12170 }, { "epoch": 2.09855272226051, "grad_norm": 30.779600143432617, "learning_rate": 1.2537637534902822e-07, "logits/chosen": -1.91006600856781, "logits/rejected": -1.8402068614959717, "logps/chosen": -300.908203125, "logps/rejected": -510.309814453125, "loss": 0.3294, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.51627779006958, "rewards/margins": 2.0171589851379395, "rewards/rejected": -4.533437252044678, "step": 12180 }, { "epoch": 2.1002756719503792, "grad_norm": 46.58794021606445, "learning_rate": 1.2494212956675096e-07, "logits/chosen": -1.9382431507110596, "logits/rejected": -1.867269515991211, "logps/chosen": -294.2776184082031, "logps/rejected": -447.78924560546875, "loss": 0.3943, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.364957332611084, "rewards/margins": 1.5870397090911865, "rewards/rejected": -3.9519970417022705, "step": 12190 }, { "epoch": 2.101998621640248, "grad_norm": 34.27021789550781, "learning_rate": 1.2450838644251663e-07, "logits/chosen": -1.9983274936676025, "logits/rejected": -1.9339754581451416, "logps/chosen": -304.81097412109375, "logps/rejected": -473.16876220703125, "loss": 0.336, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4736709594726562, "rewards/margins": 1.7213926315307617, "rewards/rejected": -4.195063591003418, "step": 12200 }, { "epoch": 2.103721571330117, "grad_norm": 53.004234313964844, "learning_rate": 1.2407514771971366e-07, "logits/chosen": -1.9715681076049805, "logits/rejected": -1.9261468648910522, "logps/chosen": -305.8539733886719, "logps/rejected": -452.8780212402344, "loss": 0.3897, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.509162664413452, "rewards/margins": 1.5120748281478882, "rewards/rejected": -4.021237373352051, "step": 12210 }, { "epoch": 2.105444521019986, "grad_norm": 56.49449157714844, "learning_rate": 1.236424151397036e-07, "logits/chosen": -1.9193532466888428, "logits/rejected": -1.8550560474395752, "logps/chosen": -287.67779541015625, "logps/rejected": -467.0467834472656, "loss": 0.342, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.3402535915374756, "rewards/margins": 1.8010280132293701, "rewards/rejected": -4.141281604766846, "step": 12220 }, { "epoch": 2.107167470709855, "grad_norm": 79.3811264038086, "learning_rate": 1.2321019044181297e-07, "logits/chosen": -1.969342589378357, "logits/rejected": -1.9075031280517578, "logps/chosen": -309.88726806640625, "logps/rejected": -475.4771423339844, "loss": 0.4198, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5718398094177246, "rewards/margins": 1.682697057723999, "rewards/rejected": -4.2545366287231445, "step": 12230 }, { "epoch": 2.1088904203997245, "grad_norm": 60.88803482055664, "learning_rate": 1.2277847536332747e-07, "logits/chosen": -1.8940273523330688, "logits/rejected": -1.8209311962127686, "logps/chosen": -306.8291015625, "logps/rejected": -492.16131591796875, "loss": 0.3838, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5310661792755127, "rewards/margins": 1.874685287475586, "rewards/rejected": -4.4057512283325195, "step": 12240 }, { "epoch": 2.1106133700895935, "grad_norm": 62.910980224609375, "learning_rate": 1.2234727163948405e-07, "logits/chosen": -1.9293591976165771, "logits/rejected": -1.8716834783554077, "logps/chosen": -302.0365295410156, "logps/rejected": -434.86163330078125, "loss": 0.4651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4879209995269775, "rewards/margins": 1.3595836162567139, "rewards/rejected": -3.8475043773651123, "step": 12250 }, { "epoch": 2.1123363197794625, "grad_norm": 42.876258850097656, "learning_rate": 1.2191658100346464e-07, "logits/chosen": -1.9628655910491943, "logits/rejected": -1.8976800441741943, "logps/chosen": -288.15582275390625, "logps/rejected": -434.14837646484375, "loss": 0.42, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.3457915782928467, "rewards/margins": 1.4467397928237915, "rewards/rejected": -3.7925314903259277, "step": 12260 }, { "epoch": 2.1140592694693314, "grad_norm": 39.460296630859375, "learning_rate": 1.2148640518638848e-07, "logits/chosen": -2.0182292461395264, "logits/rejected": -1.944959044456482, "logps/chosen": -299.56915283203125, "logps/rejected": -482.38226318359375, "loss": 0.362, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.435922145843506, "rewards/margins": 1.85861074924469, "rewards/rejected": -4.294532775878906, "step": 12270 }, { "epoch": 2.1157822191592004, "grad_norm": 51.082515716552734, "learning_rate": 1.2105674591730598e-07, "logits/chosen": -1.9684298038482666, "logits/rejected": -1.8986520767211914, "logps/chosen": -306.10955810546875, "logps/rejected": -487.24322509765625, "loss": 0.3648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.485935688018799, "rewards/margins": 1.8657524585723877, "rewards/rejected": -4.351687908172607, "step": 12280 }, { "epoch": 2.11750516884907, "grad_norm": 55.751399993896484, "learning_rate": 1.2062760492319088e-07, "logits/chosen": -1.898820161819458, "logits/rejected": -1.8377506732940674, "logps/chosen": -316.0989074707031, "logps/rejected": -465.7880859375, "loss": 0.425, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.613844394683838, "rewards/margins": 1.516311526298523, "rewards/rejected": -4.13015604019165, "step": 12290 }, { "epoch": 2.1192281185389388, "grad_norm": 42.29615783691406, "learning_rate": 1.2019898392893412e-07, "logits/chosen": -1.9786655902862549, "logits/rejected": -1.9071992635726929, "logps/chosen": -295.4461669921875, "logps/rejected": -468.9852600097656, "loss": 0.3324, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.3980698585510254, "rewards/margins": 1.7718623876571655, "rewards/rejected": -4.1699323654174805, "step": 12300 }, { "epoch": 2.1209510682288077, "grad_norm": 33.926795959472656, "learning_rate": 1.197708846573366e-07, "logits/chosen": -1.9467910528182983, "logits/rejected": -1.8622026443481445, "logps/chosen": -285.8037109375, "logps/rejected": -477.11962890625, "loss": 0.3178, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.292365789413452, "rewards/margins": 1.9615243673324585, "rewards/rejected": -4.253890514373779, "step": 12310 }, { "epoch": 2.1226740179186767, "grad_norm": 39.43899917602539, "learning_rate": 1.1934330882910173e-07, "logits/chosen": -1.9613946676254272, "logits/rejected": -1.8820440769195557, "logps/chosen": -276.88873291015625, "logps/rejected": -472.463134765625, "loss": 0.3501, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.211195707321167, "rewards/margins": 1.9676494598388672, "rewards/rejected": -4.178845405578613, "step": 12320 }, { "epoch": 2.1243969676085457, "grad_norm": 43.35344314575195, "learning_rate": 1.1891625816282938e-07, "logits/chosen": -1.9653129577636719, "logits/rejected": -1.905160903930664, "logps/chosen": -290.321044921875, "logps/rejected": -454.90887451171875, "loss": 0.3681, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.323530673980713, "rewards/margins": 1.6830909252166748, "rewards/rejected": -4.006621360778809, "step": 12330 }, { "epoch": 2.126119917298415, "grad_norm": 37.30738067626953, "learning_rate": 1.1848973437500862e-07, "logits/chosen": -1.980043649673462, "logits/rejected": -1.9064311981201172, "logps/chosen": -289.3338317871094, "logps/rejected": -467.463623046875, "loss": 0.3416, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.342607021331787, "rewards/margins": 1.8010165691375732, "rewards/rejected": -4.143623352050781, "step": 12340 }, { "epoch": 2.127842866988284, "grad_norm": 26.57041358947754, "learning_rate": 1.1806373918001058e-07, "logits/chosen": -1.9231382608413696, "logits/rejected": -1.8419666290283203, "logps/chosen": -295.4007873535156, "logps/rejected": -453.6814880371094, "loss": 0.3881, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.3837594985961914, "rewards/margins": 1.6471055746078491, "rewards/rejected": -4.03086519241333, "step": 12350 }, { "epoch": 2.129565816678153, "grad_norm": 43.93779754638672, "learning_rate": 1.1763827429008174e-07, "logits/chosen": -1.977709412574768, "logits/rejected": -1.9125217199325562, "logps/chosen": -268.74908447265625, "logps/rejected": -464.39794921875, "loss": 0.2971, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.145785093307495, "rewards/margins": 1.9847934246063232, "rewards/rejected": -4.130578517913818, "step": 12360 }, { "epoch": 2.131288766368022, "grad_norm": 21.26366424560547, "learning_rate": 1.1721334141533726e-07, "logits/chosen": -1.9472122192382812, "logits/rejected": -1.8628746271133423, "logps/chosen": -314.5235290527344, "logps/rejected": -505.862060546875, "loss": 0.3549, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6088333129882812, "rewards/margins": 1.9576690196990967, "rewards/rejected": -4.566501617431641, "step": 12370 }, { "epoch": 2.133011716057891, "grad_norm": 45.90119552612305, "learning_rate": 1.1678894226375394e-07, "logits/chosen": -1.940728783607483, "logits/rejected": -1.8751938343048096, "logps/chosen": -305.2769470214844, "logps/rejected": -515.9562377929688, "loss": 0.2863, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.5387349128723145, "rewards/margins": 2.065279006958008, "rewards/rejected": -4.604013919830322, "step": 12380 }, { "epoch": 2.13473466574776, "grad_norm": 32.29837417602539, "learning_rate": 1.1636507854116301e-07, "logits/chosen": -1.999100923538208, "logits/rejected": -1.9313665628433228, "logps/chosen": -318.89703369140625, "logps/rejected": -516.4935302734375, "loss": 0.3491, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.646089792251587, "rewards/margins": 1.9831323623657227, "rewards/rejected": -4.629221439361572, "step": 12390 }, { "epoch": 2.1364576154376294, "grad_norm": 34.866539001464844, "learning_rate": 1.1594175195124398e-07, "logits/chosen": -1.9335514307022095, "logits/rejected": -1.8725340366363525, "logps/chosen": -352.32989501953125, "logps/rejected": -545.3397216796875, "loss": 0.337, "rewards/accuracies": 0.84375, "rewards/chosen": -2.962782859802246, "rewards/margins": 1.9329183101654053, "rewards/rejected": -4.895700931549072, "step": 12400 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -1.947951316833496, "eval_logits/rejected": -1.921724557876587, "eval_logps/chosen": -380.37890625, "eval_logps/rejected": -445.4347229003906, "eval_loss": 0.7082515358924866, "eval_rewards/accuracies": 0.6424256563186646, "eval_rewards/chosen": -3.2136342525482178, "eval_rewards/margins": 0.6132170557975769, "eval_rewards/rejected": -3.8268511295318604, "eval_runtime": 361.9137, "eval_samples_per_second": 11.892, "eval_steps_per_second": 1.487, "step": 12400 }, { "epoch": 2.1381805651274983, "grad_norm": 52.224456787109375, "learning_rate": 1.1551896419551715e-07, "logits/chosen": -1.9209928512573242, "logits/rejected": -1.845625638961792, "logps/chosen": -325.1629638671875, "logps/rejected": -501.7755432128906, "loss": 0.3743, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.700852394104004, "rewards/margins": 1.7984294891357422, "rewards/rejected": -4.499281883239746, "step": 12410 }, { "epoch": 2.1399035148173673, "grad_norm": 79.16876983642578, "learning_rate": 1.15096716973337e-07, "logits/chosen": -1.8652293682098389, "logits/rejected": -1.7988011837005615, "logps/chosen": -337.16937255859375, "logps/rejected": -534.8143920898438, "loss": 0.3469, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.809027671813965, "rewards/margins": 2.0188217163085938, "rewards/rejected": -4.827849388122559, "step": 12420 }, { "epoch": 2.1416264645072363, "grad_norm": 56.75960922241211, "learning_rate": 1.1467501198188565e-07, "logits/chosen": -1.9250608682632446, "logits/rejected": -1.8526904582977295, "logps/chosen": -330.2320251464844, "logps/rejected": -527.3609008789062, "loss": 0.3074, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7417359352111816, "rewards/margins": 1.9952952861785889, "rewards/rejected": -4.737030982971191, "step": 12430 }, { "epoch": 2.1433494141971057, "grad_norm": 30.24839210510254, "learning_rate": 1.1425385091616563e-07, "logits/chosen": -1.892388105392456, "logits/rejected": -1.8306602239608765, "logps/chosen": -319.5003967285156, "logps/rejected": -474.87286376953125, "loss": 0.418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6500840187072754, "rewards/margins": 1.5815784931182861, "rewards/rejected": -4.231662273406982, "step": 12440 }, { "epoch": 2.1450723638869746, "grad_norm": 67.47943115234375, "learning_rate": 1.1383323546899315e-07, "logits/chosen": -1.9388656616210938, "logits/rejected": -1.8774505853652954, "logps/chosen": -300.49517822265625, "logps/rejected": -471.45458984375, "loss": 0.4041, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.489452838897705, "rewards/margins": 1.7244945764541626, "rewards/rejected": -4.213947772979736, "step": 12450 }, { "epoch": 2.1467953135768436, "grad_norm": 41.299598693847656, "learning_rate": 1.1341316733099133e-07, "logits/chosen": -1.9813731908798218, "logits/rejected": -1.9062292575836182, "logps/chosen": -311.234130859375, "logps/rejected": -516.5731811523438, "loss": 0.3355, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5532143115997314, "rewards/margins": 2.094651699066162, "rewards/rejected": -4.6478657722473145, "step": 12460 }, { "epoch": 2.1485182632667126, "grad_norm": 37.120460510253906, "learning_rate": 1.129936481905836e-07, "logits/chosen": -2.0257461071014404, "logits/rejected": -1.9522682428359985, "logps/chosen": -311.55670166015625, "logps/rejected": -506.30047607421875, "loss": 0.325, "rewards/accuracies": 0.875, "rewards/chosen": -2.5549588203430176, "rewards/margins": 1.9551002979278564, "rewards/rejected": -4.510058879852295, "step": 12470 }, { "epoch": 2.1502412129565815, "grad_norm": 39.97397232055664, "learning_rate": 1.1257467973398674e-07, "logits/chosen": -2.011247158050537, "logits/rejected": -1.9420245885849, "logps/chosen": -311.06890869140625, "logps/rejected": -495.9739685058594, "loss": 0.3435, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5540554523468018, "rewards/margins": 1.8669353723526, "rewards/rejected": -4.420990943908691, "step": 12480 }, { "epoch": 2.1519641626464505, "grad_norm": 58.95415496826172, "learning_rate": 1.1215626364520398e-07, "logits/chosen": -1.8889681100845337, "logits/rejected": -1.823825478553772, "logps/chosen": -331.1417236328125, "logps/rejected": -522.0241088867188, "loss": 0.381, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.752253532409668, "rewards/margins": 1.9356873035430908, "rewards/rejected": -4.687941074371338, "step": 12490 }, { "epoch": 2.15368711233632, "grad_norm": 58.76851272583008, "learning_rate": 1.1173840160601828e-07, "logits/chosen": -1.9338499307632446, "logits/rejected": -1.8589493036270142, "logps/chosen": -323.54364013671875, "logps/rejected": -527.2523193359375, "loss": 0.3432, "rewards/accuracies": 0.875, "rewards/chosen": -2.6904850006103516, "rewards/margins": 2.0569777488708496, "rewards/rejected": -4.747463226318359, "step": 12500 }, { "epoch": 2.155410062026189, "grad_norm": 35.479522705078125, "learning_rate": 1.1132109529598588e-07, "logits/chosen": -1.9746873378753662, "logits/rejected": -1.916210412979126, "logps/chosen": -325.0608825683594, "logps/rejected": -507.01348876953125, "loss": 0.3756, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7096118927001953, "rewards/margins": 1.849020004272461, "rewards/rejected": -4.558631896972656, "step": 12510 }, { "epoch": 2.157133011716058, "grad_norm": 42.347007751464844, "learning_rate": 1.1090434639242935e-07, "logits/chosen": -1.9247783422470093, "logits/rejected": -1.8621553182601929, "logps/chosen": -323.98992919921875, "logps/rejected": -486.0606994628906, "loss": 0.4181, "rewards/accuracies": 0.8125, "rewards/chosen": -2.701956272125244, "rewards/margins": 1.6232349872589111, "rewards/rejected": -4.325191497802734, "step": 12520 }, { "epoch": 2.158855961405927, "grad_norm": 62.47432327270508, "learning_rate": 1.1048815657043057e-07, "logits/chosen": -1.942082405090332, "logits/rejected": -1.8820416927337646, "logps/chosen": -301.18975830078125, "logps/rejected": -475.2323303222656, "loss": 0.3449, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.461287021636963, "rewards/margins": 1.740502119064331, "rewards/rejected": -4.201788902282715, "step": 12530 }, { "epoch": 2.160578911095796, "grad_norm": 38.01189041137695, "learning_rate": 1.1007252750282431e-07, "logits/chosen": -1.9692459106445312, "logits/rejected": -1.9062296152114868, "logps/chosen": -281.21124267578125, "logps/rejected": -437.91192626953125, "loss": 0.4287, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.2666797637939453, "rewards/margins": 1.5799837112426758, "rewards/rejected": -3.846663236618042, "step": 12540 }, { "epoch": 2.162301860785665, "grad_norm": 43.03032302856445, "learning_rate": 1.0965746086019165e-07, "logits/chosen": -2.036771297454834, "logits/rejected": -1.9848034381866455, "logps/chosen": -265.8465881347656, "logps/rejected": -407.20782470703125, "loss": 0.4139, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1340770721435547, "rewards/margins": 1.4216623306274414, "rewards/rejected": -3.555739641189575, "step": 12550 }, { "epoch": 2.164024810475534, "grad_norm": 57.60479736328125, "learning_rate": 1.0924295831085273e-07, "logits/chosen": -1.9816356897354126, "logits/rejected": -1.9209493398666382, "logps/chosen": -287.20751953125, "logps/rejected": -415.9566955566406, "loss": 0.4487, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2888903617858887, "rewards/margins": 1.3815561532974243, "rewards/rejected": -3.6704468727111816, "step": 12560 }, { "epoch": 2.165747760165403, "grad_norm": 31.562345504760742, "learning_rate": 1.0882902152086069e-07, "logits/chosen": -1.9467792510986328, "logits/rejected": -1.8678953647613525, "logps/chosen": -278.6131286621094, "logps/rejected": -439.3788146972656, "loss": 0.3627, "rewards/accuracies": 0.875, "rewards/chosen": -2.2470502853393555, "rewards/margins": 1.6461206674575806, "rewards/rejected": -3.8931708335876465, "step": 12570 }, { "epoch": 2.167470709855272, "grad_norm": 42.792945861816406, "learning_rate": 1.0841565215399454e-07, "logits/chosen": -1.9815813302993774, "logits/rejected": -1.8989368677139282, "logps/chosen": -292.2269592285156, "logps/rejected": -474.1559143066406, "loss": 0.3324, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3025739192962646, "rewards/margins": 1.9217106103897095, "rewards/rejected": -4.2242841720581055, "step": 12580 }, { "epoch": 2.169193659545141, "grad_norm": 43.01104736328125, "learning_rate": 1.0800285187175251e-07, "logits/chosen": -1.9528411626815796, "logits/rejected": -1.883568525314331, "logps/chosen": -281.2275390625, "logps/rejected": -457.57440185546875, "loss": 0.3189, "rewards/accuracies": 0.875, "rewards/chosen": -2.24062442779541, "rewards/margins": 1.8133739233016968, "rewards/rejected": -4.0539984703063965, "step": 12590 }, { "epoch": 2.1709166092350105, "grad_norm": 41.17893981933594, "learning_rate": 1.075906223333454e-07, "logits/chosen": -1.9607336521148682, "logits/rejected": -1.900046706199646, "logps/chosen": -310.1869201660156, "logps/rejected": -484.5005798339844, "loss": 0.3768, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5064454078674316, "rewards/margins": 1.8170220851898193, "rewards/rejected": -4.323467254638672, "step": 12600 }, { "epoch": 2.1726395589248795, "grad_norm": 35.73141860961914, "learning_rate": 1.0717896519569017e-07, "logits/chosen": -2.036128520965576, "logits/rejected": -1.983715295791626, "logps/chosen": -299.83428955078125, "logps/rejected": -466.02716064453125, "loss": 0.399, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4497034549713135, "rewards/margins": 1.6737884283065796, "rewards/rejected": -4.123492240905762, "step": 12610 }, { "epoch": 2.1743625086147484, "grad_norm": 28.763906478881836, "learning_rate": 1.067678821134031e-07, "logits/chosen": -2.064497709274292, "logits/rejected": -1.9927780628204346, "logps/chosen": -318.9910583496094, "logps/rejected": -503.26641845703125, "loss": 0.3777, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.63399600982666, "rewards/margins": 1.878535509109497, "rewards/rejected": -4.512531757354736, "step": 12620 }, { "epoch": 2.1760854583046174, "grad_norm": 47.750335693359375, "learning_rate": 1.0635737473879267e-07, "logits/chosen": -2.014401912689209, "logits/rejected": -1.9549516439437866, "logps/chosen": -287.65740966796875, "logps/rejected": -465.9449768066406, "loss": 0.3961, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.3684074878692627, "rewards/margins": 1.7788422107696533, "rewards/rejected": -4.147249698638916, "step": 12630 }, { "epoch": 2.1778084079944864, "grad_norm": 39.04806137084961, "learning_rate": 1.0594744472185377e-07, "logits/chosen": -2.042396068572998, "logits/rejected": -1.9774303436279297, "logps/chosen": -277.58447265625, "logps/rejected": -448.61944580078125, "loss": 0.3356, "rewards/accuracies": 0.875, "rewards/chosen": -2.2217676639556885, "rewards/margins": 1.7359695434570312, "rewards/rejected": -3.9577369689941406, "step": 12640 }, { "epoch": 2.179531357684356, "grad_norm": 48.59407043457031, "learning_rate": 1.055380937102607e-07, "logits/chosen": -1.9850565195083618, "logits/rejected": -1.9248874187469482, "logps/chosen": -283.30914306640625, "logps/rejected": -431.22015380859375, "loss": 0.4298, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2947239875793457, "rewards/margins": 1.5094715356826782, "rewards/rejected": -3.8041954040527344, "step": 12650 }, { "epoch": 2.1812543073742248, "grad_norm": 44.61518859863281, "learning_rate": 1.0512932334936015e-07, "logits/chosen": -1.943450689315796, "logits/rejected": -1.8831688165664673, "logps/chosen": -290.56170654296875, "logps/rejected": -436.44189453125, "loss": 0.433, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3756332397460938, "rewards/margins": 1.4659080505371094, "rewards/rejected": -3.841541290283203, "step": 12660 }, { "epoch": 2.1829772570640937, "grad_norm": 52.15546798706055, "learning_rate": 1.0472113528216531e-07, "logits/chosen": -2.1301369667053223, "logits/rejected": -2.0479319095611572, "logps/chosen": -263.2397766113281, "logps/rejected": -430.3846740722656, "loss": 0.406, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0902490615844727, "rewards/margins": 1.7033288478851318, "rewards/rejected": -3.7935779094696045, "step": 12670 }, { "epoch": 2.1847002067539627, "grad_norm": 38.04237365722656, "learning_rate": 1.0431353114934857e-07, "logits/chosen": -1.9793756008148193, "logits/rejected": -1.9182277917861938, "logps/chosen": -273.33563232421875, "logps/rejected": -430.2529296875, "loss": 0.3823, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.2080166339874268, "rewards/margins": 1.5849330425262451, "rewards/rejected": -3.792949676513672, "step": 12680 }, { "epoch": 2.1864231564438317, "grad_norm": 32.16179656982422, "learning_rate": 1.0390651258923558e-07, "logits/chosen": -1.9667625427246094, "logits/rejected": -1.9126285314559937, "logps/chosen": -272.6062927246094, "logps/rejected": -436.34906005859375, "loss": 0.445, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.1974709033966064, "rewards/margins": 1.6450570821762085, "rewards/rejected": -3.8425278663635254, "step": 12690 }, { "epoch": 2.188146106133701, "grad_norm": 36.72175979614258, "learning_rate": 1.0350008123779796e-07, "logits/chosen": -1.9585024118423462, "logits/rejected": -1.8859550952911377, "logps/chosen": -280.945556640625, "logps/rejected": -460.8404235839844, "loss": 0.3258, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.225369930267334, "rewards/margins": 1.8467836380004883, "rewards/rejected": -4.072153568267822, "step": 12700 }, { "epoch": 2.18986905582357, "grad_norm": 43.44378662109375, "learning_rate": 1.0309423872864753e-07, "logits/chosen": -2.018216133117676, "logits/rejected": -1.964107871055603, "logps/chosen": -296.74383544921875, "logps/rejected": -466.38507080078125, "loss": 0.3886, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.421928882598877, "rewards/margins": 1.669752836227417, "rewards/rejected": -4.091681480407715, "step": 12710 }, { "epoch": 2.191592005513439, "grad_norm": 42.997657775878906, "learning_rate": 1.02688986693029e-07, "logits/chosen": -2.0137364864349365, "logits/rejected": -1.9375922679901123, "logps/chosen": -292.14178466796875, "logps/rejected": -474.83673095703125, "loss": 0.3107, "rewards/accuracies": 0.90625, "rewards/chosen": -2.3449535369873047, "rewards/margins": 1.8990837335586548, "rewards/rejected": -4.244036674499512, "step": 12720 }, { "epoch": 2.193314955203308, "grad_norm": 41.74778366088867, "learning_rate": 1.0228432675981372e-07, "logits/chosen": -1.9188010692596436, "logits/rejected": -1.8514400720596313, "logps/chosen": -325.906005859375, "logps/rejected": -501.18505859375, "loss": 0.3985, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.747020959854126, "rewards/margins": 1.7683101892471313, "rewards/rejected": -4.515331268310547, "step": 12730 }, { "epoch": 2.195037904893177, "grad_norm": 30.27802276611328, "learning_rate": 1.0188026055549331e-07, "logits/chosen": -1.923540711402893, "logits/rejected": -1.8737980127334595, "logps/chosen": -367.73773193359375, "logps/rejected": -529.8878784179688, "loss": 0.4129, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.150376796722412, "rewards/margins": 1.6354032754898071, "rewards/rejected": -4.7857794761657715, "step": 12740 }, { "epoch": 2.1967608545830464, "grad_norm": 37.93767547607422, "learning_rate": 1.0147678970417304e-07, "logits/chosen": -1.9292017221450806, "logits/rejected": -1.8547041416168213, "logps/chosen": -310.8869323730469, "logps/rejected": -498.99090576171875, "loss": 0.3087, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.556175947189331, "rewards/margins": 1.895201325416565, "rewards/rejected": -4.451376914978027, "step": 12750 }, { "epoch": 2.1984838042729153, "grad_norm": 41.72764587402344, "learning_rate": 1.0107391582756492e-07, "logits/chosen": -1.9289324283599854, "logits/rejected": -1.8683983087539673, "logps/chosen": -341.8710021972656, "logps/rejected": -502.6561584472656, "loss": 0.469, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.890869140625, "rewards/margins": 1.6119649410247803, "rewards/rejected": -4.502834320068359, "step": 12760 }, { "epoch": 2.2002067539627843, "grad_norm": 48.156402587890625, "learning_rate": 1.0067164054498154e-07, "logits/chosen": -2.0199224948883057, "logits/rejected": -1.9415843486785889, "logps/chosen": -305.77471923828125, "logps/rejected": -477.60101318359375, "loss": 0.3633, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4717206954956055, "rewards/margins": 1.7889131307601929, "rewards/rejected": -4.260633945465088, "step": 12770 }, { "epoch": 2.2019297036526533, "grad_norm": 37.46812438964844, "learning_rate": 1.0026996547332969e-07, "logits/chosen": -2.0082437992095947, "logits/rejected": -1.9417692422866821, "logps/chosen": -300.9212341308594, "logps/rejected": -466.66424560546875, "loss": 0.4138, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4866092205047607, "rewards/margins": 1.6510769128799438, "rewards/rejected": -4.137686252593994, "step": 12780 }, { "epoch": 2.2036526533425222, "grad_norm": 58.198490142822266, "learning_rate": 9.986889222710365e-08, "logits/chosen": -1.9383857250213623, "logits/rejected": -1.872574806213379, "logps/chosen": -312.90655517578125, "logps/rejected": -490.7158203125, "loss": 0.3513, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.575570583343506, "rewards/margins": 1.810793161392212, "rewards/rejected": -4.386363506317139, "step": 12790 }, { "epoch": 2.205375603032391, "grad_norm": 49.12572479248047, "learning_rate": 9.946842241837853e-08, "logits/chosen": -1.9933202266693115, "logits/rejected": -1.933647871017456, "logps/chosen": -296.76055908203125, "logps/rejected": -453.9546813964844, "loss": 0.3756, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.406980514526367, "rewards/margins": 1.5803667306900024, "rewards/rejected": -3.987347364425659, "step": 12800 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -2.017103433609009, "eval_logits/rejected": -1.9938302040100098, "eval_logps/chosen": -315.3892822265625, "eval_logps/rejected": -370.35186767578125, "eval_loss": 0.6891571283340454, "eval_rewards/accuracies": 0.6377788186073303, "eval_rewards/chosen": -2.5637378692626953, "eval_rewards/margins": 0.5122846961021423, "eval_rewards/rejected": -3.0760228633880615, "eval_runtime": 362.6532, "eval_samples_per_second": 11.868, "eval_steps_per_second": 1.484, "step": 12800 }, { "epoch": 2.2070985527222606, "grad_norm": 45.974220275878906, "learning_rate": 9.906855765680399e-08, "logits/chosen": -2.0251450538635254, "logits/rejected": -1.9628913402557373, "logps/chosen": -288.62579345703125, "logps/rejected": -465.579345703125, "loss": 0.3406, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.3746047019958496, "rewards/margins": 1.739974021911621, "rewards/rejected": -4.114578723907471, "step": 12810 }, { "epoch": 2.2088215024121296, "grad_norm": 43.94154357910156, "learning_rate": 9.866929954959796e-08, "logits/chosen": -1.9383151531219482, "logits/rejected": -1.876800298690796, "logps/chosen": -278.95458984375, "logps/rejected": -459.8402404785156, "loss": 0.3444, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.241555690765381, "rewards/margins": 1.8143020868301392, "rewards/rejected": -4.0558576583862305, "step": 12820 }, { "epoch": 2.2105444521019986, "grad_norm": 51.672645568847656, "learning_rate": 9.827064970153998e-08, "logits/chosen": -1.9496148824691772, "logits/rejected": -1.8914169073104858, "logps/chosen": -288.90203857421875, "logps/rejected": -435.0799255371094, "loss": 0.4525, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3753180503845215, "rewards/margins": 1.460530161857605, "rewards/rejected": -3.835848331451416, "step": 12830 }, { "epoch": 2.2122674017918675, "grad_norm": 33.459754943847656, "learning_rate": 9.787260971496442e-08, "logits/chosen": -2.019003391265869, "logits/rejected": -1.9668471813201904, "logps/chosen": -292.3218994140625, "logps/rejected": -466.98895263671875, "loss": 0.3397, "rewards/accuracies": 0.875, "rewards/chosen": -2.36293888092041, "rewards/margins": 1.7675050497055054, "rewards/rejected": -4.130444049835205, "step": 12840 }, { "epoch": 2.213990351481737, "grad_norm": 38.255069732666016, "learning_rate": 9.747518118975478e-08, "logits/chosen": -2.018942356109619, "logits/rejected": -1.950378656387329, "logps/chosen": -291.0025634765625, "logps/rejected": -436.2259826660156, "loss": 0.3985, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.370919704437256, "rewards/margins": 1.4875982999801636, "rewards/rejected": -3.858518123626709, "step": 12850 }, { "epoch": 2.215713301171606, "grad_norm": 41.61083984375, "learning_rate": 9.707836572333664e-08, "logits/chosen": -1.9403775930404663, "logits/rejected": -1.87300705909729, "logps/chosen": -309.5350341796875, "logps/rejected": -477.6661682128906, "loss": 0.4175, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5251624584198, "rewards/margins": 1.743912935256958, "rewards/rejected": -4.269075870513916, "step": 12860 }, { "epoch": 2.217436250861475, "grad_norm": 73.10432434082031, "learning_rate": 9.668216491067133e-08, "logits/chosen": -1.9701849222183228, "logits/rejected": -1.9051164388656616, "logps/chosen": -293.7796325683594, "logps/rejected": -461.9593811035156, "loss": 0.385, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.401444911956787, "rewards/margins": 1.718030571937561, "rewards/rejected": -4.119475364685059, "step": 12870 }, { "epoch": 2.219159200551344, "grad_norm": 50.50831604003906, "learning_rate": 9.628658034424992e-08, "logits/chosen": -2.003298282623291, "logits/rejected": -1.933159589767456, "logps/chosen": -281.7684326171875, "logps/rejected": -450.92095947265625, "loss": 0.3727, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.264930009841919, "rewards/margins": 1.6894314289093018, "rewards/rejected": -3.9543614387512207, "step": 12880 }, { "epoch": 2.220882150241213, "grad_norm": 54.304134368896484, "learning_rate": 9.589161361408643e-08, "logits/chosen": -2.011436939239502, "logits/rejected": -1.9597275257110596, "logps/chosen": -290.1167907714844, "logps/rejected": -449.3526916503906, "loss": 0.3589, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.314418315887451, "rewards/margins": 1.64906907081604, "rewards/rejected": -3.963487148284912, "step": 12890 }, { "epoch": 2.222605099931082, "grad_norm": 56.35163879394531, "learning_rate": 9.549726630771149e-08, "logits/chosen": -1.9865226745605469, "logits/rejected": -1.9059772491455078, "logps/chosen": -309.3846740722656, "logps/rejected": -501.5279235839844, "loss": 0.2949, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.540785551071167, "rewards/margins": 1.9388885498046875, "rewards/rejected": -4.479674339294434, "step": 12900 }, { "epoch": 2.224328049620951, "grad_norm": 44.442138671875, "learning_rate": 9.51035400101659e-08, "logits/chosen": -1.9946962594985962, "logits/rejected": -1.9332191944122314, "logps/chosen": -292.8513488769531, "logps/rejected": -463.7665100097656, "loss": 0.3893, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4097213745117188, "rewards/margins": 1.6811046600341797, "rewards/rejected": -4.090826511383057, "step": 12910 }, { "epoch": 2.22605099931082, "grad_norm": 40.461021423339844, "learning_rate": 9.471043630399467e-08, "logits/chosen": -1.9348642826080322, "logits/rejected": -1.8610506057739258, "logps/chosen": -321.6365966796875, "logps/rejected": -519.1051025390625, "loss": 0.3462, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.679136276245117, "rewards/margins": 1.9852793216705322, "rewards/rejected": -4.66441535949707, "step": 12920 }, { "epoch": 2.227773949000689, "grad_norm": 43.913551330566406, "learning_rate": 9.431795676924026e-08, "logits/chosen": -1.9739036560058594, "logits/rejected": -1.9152370691299438, "logps/chosen": -321.7902526855469, "logps/rejected": -491.28924560546875, "loss": 0.44, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.715599298477173, "rewards/margins": 1.676016092300415, "rewards/rejected": -4.391615390777588, "step": 12930 }, { "epoch": 2.229496898690558, "grad_norm": 40.213253021240234, "learning_rate": 9.392610298343622e-08, "logits/chosen": -1.9419832229614258, "logits/rejected": -1.8752925395965576, "logps/chosen": -334.49664306640625, "logps/rejected": -508.883544921875, "loss": 0.3687, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8115615844726562, "rewards/margins": 1.7945139408111572, "rewards/rejected": -4.606075763702393, "step": 12940 }, { "epoch": 2.231219848380427, "grad_norm": 64.1973876953125, "learning_rate": 9.353487652160094e-08, "logits/chosen": -1.953054666519165, "logits/rejected": -1.8800547122955322, "logps/chosen": -303.9375915527344, "logps/rejected": -511.6747131347656, "loss": 0.3222, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.495833396911621, "rewards/margins": 2.0900635719299316, "rewards/rejected": -4.585896968841553, "step": 12950 }, { "epoch": 2.2329427980702965, "grad_norm": 40.988040924072266, "learning_rate": 9.314427895623162e-08, "logits/chosen": -1.9215924739837646, "logits/rejected": -1.8643007278442383, "logps/chosen": -314.4231872558594, "logps/rejected": -465.78125, "loss": 0.3776, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.552246570587158, "rewards/margins": 1.578215479850769, "rewards/rejected": -4.130461692810059, "step": 12960 }, { "epoch": 2.2346657477601655, "grad_norm": 27.41073226928711, "learning_rate": 9.27543118572973e-08, "logits/chosen": -1.9060852527618408, "logits/rejected": -1.8388478755950928, "logps/chosen": -314.9337463378906, "logps/rejected": -499.300048828125, "loss": 0.3349, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.6078109741210938, "rewards/margins": 1.9120609760284424, "rewards/rejected": -4.519871711730957, "step": 12970 }, { "epoch": 2.2363886974500344, "grad_norm": 52.485843658447266, "learning_rate": 9.236497679223324e-08, "logits/chosen": -1.9126790761947632, "logits/rejected": -1.857478141784668, "logps/chosen": -317.0921936035156, "logps/rejected": -492.88433837890625, "loss": 0.3534, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.66068696975708, "rewards/margins": 1.7468478679656982, "rewards/rejected": -4.407535076141357, "step": 12980 }, { "epoch": 2.2381116471399034, "grad_norm": 34.429019927978516, "learning_rate": 9.197627532593402e-08, "logits/chosen": -1.9796491861343384, "logits/rejected": -1.9022690057754517, "logps/chosen": -330.0085144042969, "logps/rejected": -505.8802185058594, "loss": 0.3346, "rewards/accuracies": 0.84375, "rewards/chosen": -2.701561450958252, "rewards/margins": 1.8359028100967407, "rewards/rejected": -4.537464618682861, "step": 12990 }, { "epoch": 2.2398345968297724, "grad_norm": 40.48801803588867, "learning_rate": 9.158820902074788e-08, "logits/chosen": -1.9519132375717163, "logits/rejected": -1.8828893899917603, "logps/chosen": -338.058349609375, "logps/rejected": -521.3416748046875, "loss": 0.3592, "rewards/accuracies": 0.875, "rewards/chosen": -2.836092472076416, "rewards/margins": 1.8669312000274658, "rewards/rejected": -4.703024387359619, "step": 13000 }, { "epoch": 2.241557546519642, "grad_norm": 62.406288146972656, "learning_rate": 9.12007794364697e-08, "logits/chosen": -1.9421573877334595, "logits/rejected": -1.8789962530136108, "logps/chosen": -321.5479431152344, "logps/rejected": -476.6585998535156, "loss": 0.4545, "rewards/accuracies": 0.8125, "rewards/chosen": -2.704465627670288, "rewards/margins": 1.5701141357421875, "rewards/rejected": -4.2745795249938965, "step": 13010 }, { "epoch": 2.2432804962095108, "grad_norm": 42.24813461303711, "learning_rate": 9.081398813033536e-08, "logits/chosen": -1.930538535118103, "logits/rejected": -1.8583247661590576, "logps/chosen": -297.6632385253906, "logps/rejected": -470.6366271972656, "loss": 0.3904, "rewards/accuracies": 0.84375, "rewards/chosen": -2.452331066131592, "rewards/margins": 1.7635552883148193, "rewards/rejected": -4.21588659286499, "step": 13020 }, { "epoch": 2.2450034458993797, "grad_norm": 35.25090408325195, "learning_rate": 9.042783665701531e-08, "logits/chosen": -1.9962295293807983, "logits/rejected": -1.9263139963150024, "logps/chosen": -289.33966064453125, "logps/rejected": -452.6971740722656, "loss": 0.3754, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3384387493133545, "rewards/margins": 1.6710551977157593, "rewards/rejected": -4.009493827819824, "step": 13030 }, { "epoch": 2.2467263955892487, "grad_norm": 30.05388641357422, "learning_rate": 9.004232656860805e-08, "logits/chosen": -2.0175726413726807, "logits/rejected": -1.9478025436401367, "logps/chosen": -280.0323486328125, "logps/rejected": -462.7877502441406, "loss": 0.3584, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.249023199081421, "rewards/margins": 1.8285497426986694, "rewards/rejected": -4.077572822570801, "step": 13040 }, { "epoch": 2.2484493452791177, "grad_norm": 35.090755462646484, "learning_rate": 8.965745941463407e-08, "logits/chosen": -2.0072054862976074, "logits/rejected": -1.943996787071228, "logps/chosen": -298.59716796875, "logps/rejected": -464.1021423339844, "loss": 0.4069, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4602174758911133, "rewards/margins": 1.648674726486206, "rewards/rejected": -4.108892440795898, "step": 13050 }, { "epoch": 2.250172294968987, "grad_norm": 41.31547164916992, "learning_rate": 8.927323674202997e-08, "logits/chosen": -1.9858779907226562, "logits/rejected": -1.9159005880355835, "logps/chosen": -300.620849609375, "logps/rejected": -488.54345703125, "loss": 0.372, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.464571952819824, "rewards/margins": 1.8845546245574951, "rewards/rejected": -4.349126815795898, "step": 13060 }, { "epoch": 2.251895244658856, "grad_norm": 66.54163360595703, "learning_rate": 8.888966009514157e-08, "logits/chosen": -1.9481319189071655, "logits/rejected": -1.8802512884140015, "logps/chosen": -297.29669189453125, "logps/rejected": -451.35125732421875, "loss": 0.4606, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3986568450927734, "rewards/margins": 1.5888535976409912, "rewards/rejected": -3.9875102043151855, "step": 13070 }, { "epoch": 2.253618194348725, "grad_norm": 40.96952438354492, "learning_rate": 8.850673101571816e-08, "logits/chosen": -1.9249690771102905, "logits/rejected": -1.8608089685440063, "logps/chosen": -316.6446228027344, "logps/rejected": -493.554443359375, "loss": 0.3944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6456961631774902, "rewards/margins": 1.778590202331543, "rewards/rejected": -4.424286842346191, "step": 13080 }, { "epoch": 2.255341144038594, "grad_norm": 26.038515090942383, "learning_rate": 8.812445104290625e-08, "logits/chosen": -2.0093979835510254, "logits/rejected": -1.9287300109863281, "logps/chosen": -303.60076904296875, "logps/rejected": -475.38250732421875, "loss": 0.3562, "rewards/accuracies": 0.84375, "rewards/chosen": -2.490363359451294, "rewards/margins": 1.7500556707382202, "rewards/rejected": -4.240418434143066, "step": 13090 }, { "epoch": 2.257064093728463, "grad_norm": 29.471046447753906, "learning_rate": 8.774282171324346e-08, "logits/chosen": -2.070284605026245, "logits/rejected": -1.9959561824798584, "logps/chosen": -299.08770751953125, "logps/rejected": -488.11505126953125, "loss": 0.3522, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.435448169708252, "rewards/margins": 1.9503271579742432, "rewards/rejected": -4.385775089263916, "step": 13100 }, { "epoch": 2.2587870434183324, "grad_norm": 32.652076721191406, "learning_rate": 8.736184456065182e-08, "logits/chosen": -1.935272455215454, "logits/rejected": -1.8669092655181885, "logps/chosen": -301.9248046875, "logps/rejected": -471.02398681640625, "loss": 0.3853, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.469153642654419, "rewards/margins": 1.7321808338165283, "rewards/rejected": -4.201333999633789, "step": 13110 }, { "epoch": 2.2605099931082013, "grad_norm": 41.857730865478516, "learning_rate": 8.698152111643242e-08, "logits/chosen": -2.039344310760498, "logits/rejected": -1.9828838109970093, "logps/chosen": -300.8283996582031, "logps/rejected": -457.0296936035156, "loss": 0.4085, "rewards/accuracies": 0.8125, "rewards/chosen": -2.454498529434204, "rewards/margins": 1.5927093029022217, "rewards/rejected": -4.047207832336426, "step": 13120 }, { "epoch": 2.2622329427980703, "grad_norm": 39.48984146118164, "learning_rate": 8.66018529092585e-08, "logits/chosen": -2.0704100131988525, "logits/rejected": -1.9895254373550415, "logps/chosen": -277.7041320800781, "logps/rejected": -487.31732177734375, "loss": 0.3333, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2258903980255127, "rewards/margins": 2.0747389793395996, "rewards/rejected": -4.300629615783691, "step": 13130 }, { "epoch": 2.2639558924879393, "grad_norm": 21.945051193237305, "learning_rate": 8.622284146516995e-08, "logits/chosen": -1.977802038192749, "logits/rejected": -1.9135671854019165, "logps/chosen": -283.92120361328125, "logps/rejected": -446.6160583496094, "loss": 0.3184, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.2972354888916016, "rewards/margins": 1.6303297281265259, "rewards/rejected": -3.927565097808838, "step": 13140 }, { "epoch": 2.2656788421778082, "grad_norm": 38.17441940307617, "learning_rate": 8.58444883075665e-08, "logits/chosen": -1.9683263301849365, "logits/rejected": -1.8931963443756104, "logps/chosen": -302.1012268066406, "logps/rejected": -507.4934997558594, "loss": 0.2913, "rewards/accuracies": 0.90625, "rewards/chosen": -2.4572086334228516, "rewards/margins": 2.092195987701416, "rewards/rejected": -4.549405097961426, "step": 13150 }, { "epoch": 2.2674017918676777, "grad_norm": 47.274574279785156, "learning_rate": 8.546679495720233e-08, "logits/chosen": -1.9338089227676392, "logits/rejected": -1.8687407970428467, "logps/chosen": -343.2513732910156, "logps/rejected": -502.92822265625, "loss": 0.4, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8605539798736572, "rewards/margins": 1.6396598815917969, "rewards/rejected": -4.500214099884033, "step": 13160 }, { "epoch": 2.2691247415575466, "grad_norm": 40.87839126586914, "learning_rate": 8.508976293217937e-08, "logits/chosen": -1.968770980834961, "logits/rejected": -1.9037364721298218, "logps/chosen": -304.5184020996094, "logps/rejected": -481.93975830078125, "loss": 0.3504, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4937708377838135, "rewards/margins": 1.8095462322235107, "rewards/rejected": -4.303317070007324, "step": 13170 }, { "epoch": 2.2708476912474156, "grad_norm": 39.818702697753906, "learning_rate": 8.471339374794131e-08, "logits/chosen": -1.9877229928970337, "logits/rejected": -1.903790831565857, "logps/chosen": -292.5276794433594, "logps/rejected": -488.23516845703125, "loss": 0.2968, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.377256155014038, "rewards/margins": 2.0034468173980713, "rewards/rejected": -4.380703449249268, "step": 13180 }, { "epoch": 2.2725706409372846, "grad_norm": 46.20803451538086, "learning_rate": 8.433768891726794e-08, "logits/chosen": -1.9753103256225586, "logits/rejected": -1.912003755569458, "logps/chosen": -313.26513671875, "logps/rejected": -465.99542236328125, "loss": 0.4287, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5642807483673096, "rewards/margins": 1.5769232511520386, "rewards/rejected": -4.141203880310059, "step": 13190 }, { "epoch": 2.2742935906271535, "grad_norm": 33.72400665283203, "learning_rate": 8.396264995026859e-08, "logits/chosen": -1.9571205377578735, "logits/rejected": -1.8977305889129639, "logps/chosen": -301.6448059082031, "logps/rejected": -455.74700927734375, "loss": 0.4071, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4547247886657715, "rewards/margins": 1.5802429914474487, "rewards/rejected": -4.03496789932251, "step": 13200 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -2.0059151649475098, "eval_logits/rejected": -1.9810409545898438, "eval_logps/chosen": -331.4143371582031, "eval_logps/rejected": -390.3795166015625, "eval_loss": 0.6989346742630005, "eval_rewards/accuracies": 0.6345260143280029, "eval_rewards/chosen": -2.7239885330200195, "eval_rewards/margins": 0.552310585975647, "eval_rewards/rejected": -3.276298999786377, "eval_runtime": 362.5579, "eval_samples_per_second": 11.871, "eval_steps_per_second": 1.484, "step": 13200 }, { "epoch": 2.2760165403170225, "grad_norm": 22.102046966552734, "learning_rate": 8.358827835437615e-08, "logits/chosen": -2.036517858505249, "logits/rejected": -1.9863923788070679, "logps/chosen": -281.88714599609375, "logps/rejected": -433.1886291503906, "loss": 0.4407, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2851810455322266, "rewards/margins": 1.5200507640838623, "rewards/rejected": -3.8052315711975098, "step": 13210 }, { "epoch": 2.277739490006892, "grad_norm": 28.394975662231445, "learning_rate": 8.3214575634341e-08, "logits/chosen": -2.025442361831665, "logits/rejected": -1.967790961265564, "logps/chosen": -288.62969970703125, "logps/rejected": -452.2958984375, "loss": 0.3523, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3265864849090576, "rewards/margins": 1.6451356410980225, "rewards/rejected": -3.971722364425659, "step": 13220 }, { "epoch": 2.279462439696761, "grad_norm": 45.30950927734375, "learning_rate": 8.284154329222531e-08, "logits/chosen": -1.9815788269042969, "logits/rejected": -1.9127254486083984, "logps/chosen": -273.38775634765625, "logps/rejected": -469.23565673828125, "loss": 0.3406, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2103800773620605, "rewards/margins": 1.9582183361053467, "rewards/rejected": -4.168598175048828, "step": 13230 }, { "epoch": 2.28118538938663, "grad_norm": 33.8141975402832, "learning_rate": 8.246918282739663e-08, "logits/chosen": -1.9854990243911743, "logits/rejected": -1.9326483011245728, "logps/chosen": -310.1800537109375, "logps/rejected": -461.95892333984375, "loss": 0.3827, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5845720767974854, "rewards/margins": 1.5086365938186646, "rewards/rejected": -4.0932087898254395, "step": 13240 }, { "epoch": 2.282908339076499, "grad_norm": 40.896484375, "learning_rate": 8.209749573652184e-08, "logits/chosen": -1.961124062538147, "logits/rejected": -1.900235891342163, "logps/chosen": -290.27764892578125, "logps/rejected": -481.72100830078125, "loss": 0.3407, "rewards/accuracies": 0.875, "rewards/chosen": -2.37351131439209, "rewards/margins": 1.8958784341812134, "rewards/rejected": -4.269390106201172, "step": 13250 }, { "epoch": 2.2846312887663682, "grad_norm": 45.03136444091797, "learning_rate": 8.17264835135612e-08, "logits/chosen": -2.0273993015289307, "logits/rejected": -1.956947684288025, "logps/chosen": -311.03460693359375, "logps/rejected": -503.27447509765625, "loss": 0.3403, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.616345167160034, "rewards/margins": 1.896369218826294, "rewards/rejected": -4.512714385986328, "step": 13260 }, { "epoch": 2.286354238456237, "grad_norm": 30.7396297454834, "learning_rate": 8.13561476497628e-08, "logits/chosen": -1.9267151355743408, "logits/rejected": -1.8649072647094727, "logps/chosen": -301.29852294921875, "logps/rejected": -493.84027099609375, "loss": 0.3372, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.456468105316162, "rewards/margins": 1.9473756551742554, "rewards/rejected": -4.403843402862549, "step": 13270 }, { "epoch": 2.288077188146106, "grad_norm": 49.62660598754883, "learning_rate": 8.098648963365571e-08, "logits/chosen": -1.9719321727752686, "logits/rejected": -1.8957502841949463, "logps/chosen": -288.279541015625, "logps/rejected": -461.49261474609375, "loss": 0.3945, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.3330600261688232, "rewards/margins": 1.7755730152130127, "rewards/rejected": -4.108632564544678, "step": 13280 }, { "epoch": 2.289800137835975, "grad_norm": 70.3248291015625, "learning_rate": 8.061751095104471e-08, "logits/chosen": -1.9417566061019897, "logits/rejected": -1.8656524419784546, "logps/chosen": -297.5235290527344, "logps/rejected": -486.68414306640625, "loss": 0.3338, "rewards/accuracies": 0.875, "rewards/chosen": -2.4242615699768066, "rewards/margins": 1.9188019037246704, "rewards/rejected": -4.343063831329346, "step": 13290 }, { "epoch": 2.291523087525844, "grad_norm": 46.958316802978516, "learning_rate": 8.024921308500412e-08, "logits/chosen": -1.94889235496521, "logits/rejected": -1.87539803981781, "logps/chosen": -317.95404052734375, "logps/rejected": -497.37152099609375, "loss": 0.3888, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6620562076568604, "rewards/margins": 1.806361436843872, "rewards/rejected": -4.468417644500732, "step": 13300 }, { "epoch": 2.293246037215713, "grad_norm": 58.731544494628906, "learning_rate": 7.988159751587157e-08, "logits/chosen": -1.9700193405151367, "logits/rejected": -1.900874376296997, "logps/chosen": -303.85693359375, "logps/rejected": -492.9615173339844, "loss": 0.3764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4616265296936035, "rewards/margins": 1.9344682693481445, "rewards/rejected": -4.39609432220459, "step": 13310 }, { "epoch": 2.2949689869055825, "grad_norm": 54.80722427368164, "learning_rate": 7.951466572124229e-08, "logits/chosen": -1.986997365951538, "logits/rejected": -1.9099671840667725, "logps/chosen": -309.99212646484375, "logps/rejected": -461.88519287109375, "loss": 0.4385, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5312483310699463, "rewards/margins": 1.612308144569397, "rewards/rejected": -4.143556118011475, "step": 13320 }, { "epoch": 2.2966919365954515, "grad_norm": 30.391557693481445, "learning_rate": 7.914841917596335e-08, "logits/chosen": -1.9580758810043335, "logits/rejected": -1.8824771642684937, "logps/chosen": -266.89813232421875, "logps/rejected": -469.8072204589844, "loss": 0.3247, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1005990505218506, "rewards/margins": 2.102206230163574, "rewards/rejected": -4.202805519104004, "step": 13330 }, { "epoch": 2.2984148862853204, "grad_norm": 62.13656234741211, "learning_rate": 7.878285935212741e-08, "logits/chosen": -2.0400404930114746, "logits/rejected": -1.9787954092025757, "logps/chosen": -281.8410949707031, "logps/rejected": -443.72235107421875, "loss": 0.3995, "rewards/accuracies": 0.78125, "rewards/chosen": -2.261660099029541, "rewards/margins": 1.6442092657089233, "rewards/rejected": -3.905869245529175, "step": 13340 }, { "epoch": 2.3001378359751894, "grad_norm": 52.591339111328125, "learning_rate": 7.841798771906685e-08, "logits/chosen": -2.0216293334960938, "logits/rejected": -1.941457748413086, "logps/chosen": -272.722900390625, "logps/rejected": -444.1116638183594, "loss": 0.3824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.153017520904541, "rewards/margins": 1.7805055379867554, "rewards/rejected": -3.933523178100586, "step": 13350 }, { "epoch": 2.301860785665059, "grad_norm": 47.480865478515625, "learning_rate": 7.805380574334793e-08, "logits/chosen": -2.042249917984009, "logits/rejected": -1.9698143005371094, "logps/chosen": -255.32876586914062, "logps/rejected": -449.73492431640625, "loss": 0.297, "rewards/accuracies": 0.875, "rewards/chosen": -1.9736545085906982, "rewards/margins": 1.9943023920059204, "rewards/rejected": -3.967957019805908, "step": 13360 }, { "epoch": 2.3035837353549278, "grad_norm": 23.708370208740234, "learning_rate": 7.769031488876505e-08, "logits/chosen": -1.9960012435913086, "logits/rejected": -1.9139912128448486, "logps/chosen": -261.7926940917969, "logps/rejected": -448.83221435546875, "loss": 0.3438, "rewards/accuracies": 0.875, "rewards/chosen": -2.0615437030792236, "rewards/margins": 1.9043357372283936, "rewards/rejected": -3.965879440307617, "step": 13370 }, { "epoch": 2.3053066850447967, "grad_norm": 69.86125183105469, "learning_rate": 7.732751661633466e-08, "logits/chosen": -1.9258997440338135, "logits/rejected": -1.853322982788086, "logps/chosen": -296.5481872558594, "logps/rejected": -456.8854064941406, "loss": 0.393, "rewards/accuracies": 0.84375, "rewards/chosen": -2.381920337677002, "rewards/margins": 1.6552002429962158, "rewards/rejected": -4.037120819091797, "step": 13380 }, { "epoch": 2.3070296347346657, "grad_norm": 32.16677474975586, "learning_rate": 7.696541238428936e-08, "logits/chosen": -1.9565061330795288, "logits/rejected": -1.878354787826538, "logps/chosen": -281.51202392578125, "logps/rejected": -444.2578125, "loss": 0.3993, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.240410327911377, "rewards/margins": 1.6941583156585693, "rewards/rejected": -3.9345688819885254, "step": 13390 }, { "epoch": 2.3087525844245347, "grad_norm": 24.719867706298828, "learning_rate": 7.66040036480721e-08, "logits/chosen": -1.9789676666259766, "logits/rejected": -1.903477430343628, "logps/chosen": -282.2976989746094, "logps/rejected": -472.1946716308594, "loss": 0.3441, "rewards/accuracies": 0.875, "rewards/chosen": -2.260823965072632, "rewards/margins": 1.9604384899139404, "rewards/rejected": -4.2212629318237305, "step": 13400 }, { "epoch": 2.3104755341144037, "grad_norm": 29.825729370117188, "learning_rate": 7.624329186033054e-08, "logits/chosen": -1.9889189004898071, "logits/rejected": -1.9127864837646484, "logps/chosen": -318.6175842285156, "logps/rejected": -516.7404174804688, "loss": 0.3753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.638373374938965, "rewards/margins": 2.0276589393615723, "rewards/rejected": -4.666031837463379, "step": 13410 }, { "epoch": 2.312198483804273, "grad_norm": 47.65455627441406, "learning_rate": 7.588327847091078e-08, "logits/chosen": -1.9790700674057007, "logits/rejected": -1.918404221534729, "logps/chosen": -320.033935546875, "logps/rejected": -504.40277099609375, "loss": 0.3723, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.659442901611328, "rewards/margins": 1.833289384841919, "rewards/rejected": -4.492732524871826, "step": 13420 }, { "epoch": 2.313921433494142, "grad_norm": 59.22507858276367, "learning_rate": 7.552396492685204e-08, "logits/chosen": -2.055762767791748, "logits/rejected": -1.9831984043121338, "logps/chosen": -303.80889892578125, "logps/rejected": -482.2606506347656, "loss": 0.3658, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4781455993652344, "rewards/margins": 1.7767798900604248, "rewards/rejected": -4.254925727844238, "step": 13430 }, { "epoch": 2.315644383184011, "grad_norm": 33.2974853515625, "learning_rate": 7.516535267238028e-08, "logits/chosen": -1.9197736978530884, "logits/rejected": -1.8640073537826538, "logps/chosen": -284.1615295410156, "logps/rejected": -442.9144592285156, "loss": 0.4245, "rewards/accuracies": 0.8125, "rewards/chosen": -2.301309823989868, "rewards/margins": 1.612192153930664, "rewards/rejected": -3.9135022163391113, "step": 13440 }, { "epoch": 2.31736733287388, "grad_norm": 30.388511657714844, "learning_rate": 7.480744314890303e-08, "logits/chosen": -1.9493885040283203, "logits/rejected": -1.8790403604507446, "logps/chosen": -278.25665283203125, "logps/rejected": -455.18896484375, "loss": 0.3567, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2711548805236816, "rewards/margins": 1.7667537927627563, "rewards/rejected": -4.037908554077148, "step": 13450 }, { "epoch": 2.3190902825637494, "grad_norm": 50.018062591552734, "learning_rate": 7.44502377950029e-08, "logits/chosen": -1.9807054996490479, "logits/rejected": -1.9037548303604126, "logps/chosen": -270.915771484375, "logps/rejected": -469.55560302734375, "loss": 0.3107, "rewards/accuracies": 0.875, "rewards/chosen": -2.1509830951690674, "rewards/margins": 2.046891927719116, "rewards/rejected": -4.197875022888184, "step": 13460 }, { "epoch": 2.3208132322536184, "grad_norm": 37.25325393676758, "learning_rate": 7.409373804643243e-08, "logits/chosen": -1.9210283756256104, "logits/rejected": -1.8541533946990967, "logps/chosen": -303.3565673828125, "logps/rejected": -463.77239990234375, "loss": 0.405, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4753193855285645, "rewards/margins": 1.6655935049057007, "rewards/rejected": -4.140913009643555, "step": 13470 }, { "epoch": 2.3225361819434873, "grad_norm": 39.188995361328125, "learning_rate": 7.373794533610813e-08, "logits/chosen": -2.005866289138794, "logits/rejected": -1.9305928945541382, "logps/chosen": -296.78253173828125, "logps/rejected": -465.75286865234375, "loss": 0.365, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4259719848632812, "rewards/margins": 1.716382384300232, "rewards/rejected": -4.1423540115356445, "step": 13480 }, { "epoch": 2.3242591316333563, "grad_norm": 36.8951416015625, "learning_rate": 7.338286109410416e-08, "logits/chosen": -1.9743963479995728, "logits/rejected": -1.8979825973510742, "logps/chosen": -317.464111328125, "logps/rejected": -500.0098571777344, "loss": 0.3687, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.632472038269043, "rewards/margins": 1.838273048400879, "rewards/rejected": -4.470745086669922, "step": 13490 }, { "epoch": 2.3259820813232253, "grad_norm": 45.553466796875, "learning_rate": 7.302848674764747e-08, "logits/chosen": -1.9755733013153076, "logits/rejected": -1.8955450057983398, "logps/chosen": -308.56915283203125, "logps/rejected": -520.5836791992188, "loss": 0.2724, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.518578052520752, "rewards/margins": 2.175283193588257, "rewards/rejected": -4.693861961364746, "step": 13500 }, { "epoch": 2.3277050310130942, "grad_norm": 21.522863388061523, "learning_rate": 7.267482372111169e-08, "logits/chosen": -1.9071283340454102, "logits/rejected": -1.8417308330535889, "logps/chosen": -284.7821960449219, "logps/rejected": -473.8779296875, "loss": 0.3417, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3408172130584717, "rewards/margins": 1.859559416770935, "rewards/rejected": -4.200376987457275, "step": 13510 }, { "epoch": 2.3294279807029636, "grad_norm": 46.86043167114258, "learning_rate": 7.232187343601112e-08, "logits/chosen": -1.8909887075424194, "logits/rejected": -1.8170280456542969, "logps/chosen": -305.592529296875, "logps/rejected": -503.0406188964844, "loss": 0.3397, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.539792060852051, "rewards/margins": 1.9846267700195312, "rewards/rejected": -4.524418354034424, "step": 13520 }, { "epoch": 2.3311509303928326, "grad_norm": 61.98499298095703, "learning_rate": 7.196963731099532e-08, "logits/chosen": -1.9518464803695679, "logits/rejected": -1.8860689401626587, "logps/chosen": -333.2911682128906, "logps/rejected": -476.44183349609375, "loss": 0.4455, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.777620792388916, "rewards/margins": 1.4731686115264893, "rewards/rejected": -4.250789165496826, "step": 13530 }, { "epoch": 2.3328738800827016, "grad_norm": 68.71394348144531, "learning_rate": 7.161811676184345e-08, "logits/chosen": -1.9195371866226196, "logits/rejected": -1.8478978872299194, "logps/chosen": -319.31939697265625, "logps/rejected": -506.7085876464844, "loss": 0.3555, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6288211345672607, "rewards/margins": 1.9446920156478882, "rewards/rejected": -4.573513031005859, "step": 13540 }, { "epoch": 2.3345968297725705, "grad_norm": 46.51976776123047, "learning_rate": 7.126731320145854e-08, "logits/chosen": -1.9478000402450562, "logits/rejected": -1.8766177892684937, "logps/chosen": -308.91351318359375, "logps/rejected": -477.0372619628906, "loss": 0.4232, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.573697805404663, "rewards/margins": 1.7069047689437866, "rewards/rejected": -4.28060245513916, "step": 13550 }, { "epoch": 2.3363197794624395, "grad_norm": 68.52571868896484, "learning_rate": 7.09172280398615e-08, "logits/chosen": -1.9702144861221313, "logits/rejected": -1.9154142141342163, "logps/chosen": -283.3779296875, "logps/rejected": -439.3309631347656, "loss": 0.4384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2980241775512695, "rewards/margins": 1.5721662044525146, "rewards/rejected": -3.870190382003784, "step": 13560 }, { "epoch": 2.338042729152309, "grad_norm": 37.95341110229492, "learning_rate": 7.056786268418597e-08, "logits/chosen": -1.8973090648651123, "logits/rejected": -1.8277537822723389, "logps/chosen": -307.06524658203125, "logps/rejected": -503.9869079589844, "loss": 0.2993, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5466437339782715, "rewards/margins": 1.9906597137451172, "rewards/rejected": -4.537303447723389, "step": 13570 }, { "epoch": 2.339765678842178, "grad_norm": 28.87031364440918, "learning_rate": 7.021921853867224e-08, "logits/chosen": -1.9527027606964111, "logits/rejected": -1.8996165990829468, "logps/chosen": -284.82232666015625, "logps/rejected": -451.60211181640625, "loss": 0.3477, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3161532878875732, "rewards/margins": 1.67333984375, "rewards/rejected": -3.989492893218994, "step": 13580 }, { "epoch": 2.341488628532047, "grad_norm": 24.49680519104004, "learning_rate": 6.987129700466173e-08, "logits/chosen": -1.932607889175415, "logits/rejected": -1.8766577243804932, "logps/chosen": -298.7451477050781, "logps/rejected": -479.075927734375, "loss": 0.3197, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.4566731452941895, "rewards/margins": 1.8017864227294922, "rewards/rejected": -4.258459568023682, "step": 13590 }, { "epoch": 2.343211578221916, "grad_norm": 50.10162353515625, "learning_rate": 6.952409948059157e-08, "logits/chosen": -1.956814169883728, "logits/rejected": -1.8925914764404297, "logps/chosen": -317.15618896484375, "logps/rejected": -487.13140869140625, "loss": 0.4236, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.572103977203369, "rewards/margins": 1.7272611856460571, "rewards/rejected": -4.299365043640137, "step": 13600 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -1.9798275232315063, "eval_logits/rejected": -1.954161286354065, "eval_logps/chosen": -350.75762939453125, "eval_logps/rejected": -412.56683349609375, "eval_loss": 0.7126924395561218, "eval_rewards/accuracies": 0.6328996419906616, "eval_rewards/chosen": -2.917421579360962, "eval_rewards/margins": 0.5807508230209351, "eval_rewards/rejected": -3.4981720447540283, "eval_runtime": 362.2029, "eval_samples_per_second": 11.883, "eval_steps_per_second": 1.485, "step": 13600 }, { "epoch": 2.344934527911785, "grad_norm": 50.25716018676758, "learning_rate": 6.917762736198874e-08, "logits/chosen": -2.073000431060791, "logits/rejected": -2.001960277557373, "logps/chosen": -299.1363830566406, "logps/rejected": -471.4437561035156, "loss": 0.3621, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.4524221420288086, "rewards/margins": 1.7304941415786743, "rewards/rejected": -4.182916164398193, "step": 13610 }, { "epoch": 2.346657477601654, "grad_norm": 24.899574279785156, "learning_rate": 6.883188204146445e-08, "logits/chosen": -1.9582160711288452, "logits/rejected": -1.8715308904647827, "logps/chosen": -311.832763671875, "logps/rejected": -494.74786376953125, "loss": 0.3389, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5423147678375244, "rewards/margins": 1.894571304321289, "rewards/rejected": -4.436886310577393, "step": 13620 }, { "epoch": 2.348380427291523, "grad_norm": 34.89834213256836, "learning_rate": 6.848686490870853e-08, "logits/chosen": -1.9160064458847046, "logits/rejected": -1.828000783920288, "logps/chosen": -299.5780334472656, "logps/rejected": -505.17413330078125, "loss": 0.2956, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4388580322265625, "rewards/margins": 2.119528293609619, "rewards/rejected": -4.558385848999023, "step": 13630 }, { "epoch": 2.350103376981392, "grad_norm": 29.578866958618164, "learning_rate": 6.81425773504842e-08, "logits/chosen": -1.9359655380249023, "logits/rejected": -1.8576080799102783, "logps/chosen": -296.59368896484375, "logps/rejected": -494.71795654296875, "loss": 0.3015, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.408308744430542, "rewards/margins": 2.0067155361175537, "rewards/rejected": -4.415024280548096, "step": 13640 }, { "epoch": 2.351826326671261, "grad_norm": 48.94502639770508, "learning_rate": 6.77990207506221e-08, "logits/chosen": -1.9907512664794922, "logits/rejected": -1.9204210042953491, "logps/chosen": -322.8336486816406, "logps/rejected": -517.5927734375, "loss": 0.3502, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6770455837249756, "rewards/margins": 1.9912853240966797, "rewards/rejected": -4.668331146240234, "step": 13650 }, { "epoch": 2.35354927636113, "grad_norm": 40.428070068359375, "learning_rate": 6.745619649001477e-08, "logits/chosen": -1.902645468711853, "logits/rejected": -1.8283004760742188, "logps/chosen": -298.4253845214844, "logps/rejected": -486.20819091796875, "loss": 0.3459, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4478378295898438, "rewards/margins": 1.8837738037109375, "rewards/rejected": -4.331611633300781, "step": 13660 }, { "epoch": 2.3552722260509995, "grad_norm": 45.59690856933594, "learning_rate": 6.711410594661116e-08, "logits/chosen": -1.9599357843399048, "logits/rejected": -1.8858649730682373, "logps/chosen": -316.57098388671875, "logps/rejected": -499.40216064453125, "loss": 0.3482, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6074106693267822, "rewards/margins": 1.848226547241211, "rewards/rejected": -4.455636501312256, "step": 13670 }, { "epoch": 2.3569951757408685, "grad_norm": 46.03456115722656, "learning_rate": 6.677275049541129e-08, "logits/chosen": -1.9755738973617554, "logits/rejected": -1.8989111185073853, "logps/chosen": -314.34674072265625, "logps/rejected": -487.97509765625, "loss": 0.3901, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.582007646560669, "rewards/margins": 1.7926870584487915, "rewards/rejected": -4.374695301055908, "step": 13680 }, { "epoch": 2.3587181254307374, "grad_norm": 49.609954833984375, "learning_rate": 6.643213150846053e-08, "logits/chosen": -1.9404627084732056, "logits/rejected": -1.8581126928329468, "logps/chosen": -316.091796875, "logps/rejected": -516.0310668945312, "loss": 0.3365, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.6144022941589355, "rewards/margins": 2.011970281600952, "rewards/rejected": -4.626372814178467, "step": 13690 }, { "epoch": 2.3604410751206064, "grad_norm": 63.84767150878906, "learning_rate": 6.609225035484395e-08, "logits/chosen": -1.9487206935882568, "logits/rejected": -1.8727912902832031, "logps/chosen": -315.8226318359375, "logps/rejected": -488.0445251464844, "loss": 0.3727, "rewards/accuracies": 0.84375, "rewards/chosen": -2.561033248901367, "rewards/margins": 1.7964370250701904, "rewards/rejected": -4.357470512390137, "step": 13700 }, { "epoch": 2.3621640248104754, "grad_norm": 66.00569152832031, "learning_rate": 6.5753108400681e-08, "logits/chosen": -1.878554105758667, "logits/rejected": -1.7945867776870728, "logps/chosen": -309.86102294921875, "logps/rejected": -507.458740234375, "loss": 0.3208, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.55210542678833, "rewards/margins": 1.9888455867767334, "rewards/rejected": -4.540951251983643, "step": 13710 }, { "epoch": 2.3638869745003444, "grad_norm": 76.46798706054688, "learning_rate": 6.541470700912014e-08, "logits/chosen": -1.9059851169586182, "logits/rejected": -1.8347688913345337, "logps/chosen": -328.2723083496094, "logps/rejected": -509.8519592285156, "loss": 0.3993, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.730103015899658, "rewards/margins": 1.8516451120376587, "rewards/rejected": -4.581748008728027, "step": 13720 }, { "epoch": 2.3656099241902138, "grad_norm": 87.67671966552734, "learning_rate": 6.507704754033299e-08, "logits/chosen": -2.0191407203674316, "logits/rejected": -1.9459142684936523, "logps/chosen": -321.8654479980469, "logps/rejected": -542.2857055664062, "loss": 0.3035, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.6557891368865967, "rewards/margins": 2.1971020698547363, "rewards/rejected": -4.852890968322754, "step": 13730 }, { "epoch": 2.3673328738800827, "grad_norm": 54.28536605834961, "learning_rate": 6.474013135150927e-08, "logits/chosen": -2.0295586585998535, "logits/rejected": -1.9532287120819092, "logps/chosen": -315.52532958984375, "logps/rejected": -510.1664123535156, "loss": 0.3501, "rewards/accuracies": 0.875, "rewards/chosen": -2.5716781616210938, "rewards/margins": 2.002004623413086, "rewards/rejected": -4.5736823081970215, "step": 13740 }, { "epoch": 2.3690558235699517, "grad_norm": 40.25489044189453, "learning_rate": 6.440395979685118e-08, "logits/chosen": -1.9747211933135986, "logits/rejected": -1.897438406944275, "logps/chosen": -322.9765319824219, "logps/rejected": -530.2354736328125, "loss": 0.3607, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.66455078125, "rewards/margins": 2.1009037494659424, "rewards/rejected": -4.765454292297363, "step": 13750 }, { "epoch": 2.3707787732598207, "grad_norm": 74.22500610351562, "learning_rate": 6.406853422756778e-08, "logits/chosen": -2.034287929534912, "logits/rejected": -1.9651908874511719, "logps/chosen": -308.0318603515625, "logps/rejected": -464.5968322753906, "loss": 0.4333, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5259594917297363, "rewards/margins": 1.6212389469146729, "rewards/rejected": -4.147198677062988, "step": 13760 }, { "epoch": 2.37250172294969, "grad_norm": 49.615089416503906, "learning_rate": 6.373385599186965e-08, "logits/chosen": -2.0266547203063965, "logits/rejected": -1.9632809162139893, "logps/chosen": -295.71612548828125, "logps/rejected": -483.315185546875, "loss": 0.3696, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.442220687866211, "rewards/margins": 1.852952241897583, "rewards/rejected": -4.295172691345215, "step": 13770 }, { "epoch": 2.374224672639559, "grad_norm": 31.079999923706055, "learning_rate": 6.33999264349638e-08, "logits/chosen": -2.066023588180542, "logits/rejected": -1.9795291423797607, "logps/chosen": -312.97607421875, "logps/rejected": -488.4681091308594, "loss": 0.3466, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5295205116271973, "rewards/margins": 1.8464891910552979, "rewards/rejected": -4.376009941101074, "step": 13780 }, { "epoch": 2.375947622329428, "grad_norm": 42.027183532714844, "learning_rate": 6.306674689904798e-08, "logits/chosen": -1.9941504001617432, "logits/rejected": -1.929997205734253, "logps/chosen": -287.31402587890625, "logps/rejected": -445.3863220214844, "loss": 0.3749, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2951600551605225, "rewards/margins": 1.63910710811615, "rewards/rejected": -3.934267520904541, "step": 13790 }, { "epoch": 2.377670572019297, "grad_norm": 46.127349853515625, "learning_rate": 6.273431872330487e-08, "logits/chosen": -2.059976100921631, "logits/rejected": -1.9758069515228271, "logps/chosen": -284.869873046875, "logps/rejected": -472.40655517578125, "loss": 0.3598, "rewards/accuracies": 0.84375, "rewards/chosen": -2.264007329940796, "rewards/margins": 1.949480652809143, "rewards/rejected": -4.21348762512207, "step": 13800 }, { "epoch": 2.379393521709166, "grad_norm": 43.63248062133789, "learning_rate": 6.240264324389765e-08, "logits/chosen": -1.9813019037246704, "logits/rejected": -1.907204270362854, "logps/chosen": -276.1174011230469, "logps/rejected": -463.06854248046875, "loss": 0.3704, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2198359966278076, "rewards/margins": 1.8832365274429321, "rewards/rejected": -4.1030731201171875, "step": 13810 }, { "epoch": 2.381116471399035, "grad_norm": 46.57829284667969, "learning_rate": 6.207172179396392e-08, "logits/chosen": -2.0180835723876953, "logits/rejected": -1.94087815284729, "logps/chosen": -293.0558166503906, "logps/rejected": -466.24200439453125, "loss": 0.3513, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.32195782661438, "rewards/margins": 1.8142026662826538, "rewards/rejected": -4.136160850524902, "step": 13820 }, { "epoch": 2.3828394210889043, "grad_norm": 45.167728424072266, "learning_rate": 6.174155570361039e-08, "logits/chosen": -2.0552353858947754, "logits/rejected": -2.0002999305725098, "logps/chosen": -278.19854736328125, "logps/rejected": -431.64892578125, "loss": 0.3787, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.217895984649658, "rewards/margins": 1.5822315216064453, "rewards/rejected": -3.8001270294189453, "step": 13830 }, { "epoch": 2.3845623707787733, "grad_norm": 50.23057174682617, "learning_rate": 6.141214629990798e-08, "logits/chosen": -2.023578643798828, "logits/rejected": -1.9553991556167603, "logps/chosen": -293.44683837890625, "logps/rejected": -444.34356689453125, "loss": 0.4139, "rewards/accuracies": 0.8125, "rewards/chosen": -2.359248399734497, "rewards/margins": 1.571588397026062, "rewards/rejected": -3.9308364391326904, "step": 13840 }, { "epoch": 2.3862853204686423, "grad_norm": 69.8079605102539, "learning_rate": 6.10834949068858e-08, "logits/chosen": -1.9609124660491943, "logits/rejected": -1.8988176584243774, "logps/chosen": -291.35009765625, "logps/rejected": -471.74920654296875, "loss": 0.4011, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3631508350372314, "rewards/margins": 1.868528127670288, "rewards/rejected": -4.231679439544678, "step": 13850 }, { "epoch": 2.3880082701585112, "grad_norm": 33.40293502807617, "learning_rate": 6.075560284552658e-08, "logits/chosen": -1.9476524591445923, "logits/rejected": -1.8815271854400635, "logps/chosen": -307.68145751953125, "logps/rejected": -471.6998596191406, "loss": 0.3563, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.5548532009124756, "rewards/margins": 1.6557025909423828, "rewards/rejected": -4.2105560302734375, "step": 13860 }, { "epoch": 2.3897312198483807, "grad_norm": 52.14076614379883, "learning_rate": 6.04284714337607e-08, "logits/chosen": -2.0066094398498535, "logits/rejected": -1.9432475566864014, "logps/chosen": -321.5055236816406, "logps/rejected": -500.7430725097656, "loss": 0.4283, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6546456813812256, "rewards/margins": 1.833840012550354, "rewards/rejected": -4.488485813140869, "step": 13870 }, { "epoch": 2.3914541695382496, "grad_norm": 55.95194625854492, "learning_rate": 6.010210198646143e-08, "logits/chosen": -1.9298763275146484, "logits/rejected": -1.849346399307251, "logps/chosen": -312.5848083496094, "logps/rejected": -474.78411865234375, "loss": 0.4238, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5713329315185547, "rewards/margins": 1.6587108373641968, "rewards/rejected": -4.230043411254883, "step": 13880 }, { "epoch": 2.3931771192281186, "grad_norm": 25.900493621826172, "learning_rate": 5.977649581543908e-08, "logits/chosen": -2.0376148223876953, "logits/rejected": -1.9670966863632202, "logps/chosen": -280.1473083496094, "logps/rejected": -453.1607360839844, "loss": 0.3412, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.2383084297180176, "rewards/margins": 1.7694286108016968, "rewards/rejected": -4.007737159729004, "step": 13890 }, { "epoch": 2.3949000689179876, "grad_norm": 35.56214141845703, "learning_rate": 5.945165422943646e-08, "logits/chosen": -1.968825340270996, "logits/rejected": -1.8976529836654663, "logps/chosen": -292.9938049316406, "logps/rejected": -455.3026428222656, "loss": 0.3827, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3907172679901123, "rewards/margins": 1.6774673461914062, "rewards/rejected": -4.068184852600098, "step": 13900 }, { "epoch": 2.3966230186078565, "grad_norm": 51.492740631103516, "learning_rate": 5.912757853412281e-08, "logits/chosen": -1.979527235031128, "logits/rejected": -1.9144665002822876, "logps/chosen": -276.3684997558594, "logps/rejected": -436.29058837890625, "loss": 0.3807, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1894757747650146, "rewards/margins": 1.6545445919036865, "rewards/rejected": -3.844020366668701, "step": 13910 }, { "epoch": 2.3983459682977255, "grad_norm": 49.091129302978516, "learning_rate": 5.8804270032089236e-08, "logits/chosen": -1.998175859451294, "logits/rejected": -1.9192034006118774, "logps/chosen": -308.27850341796875, "logps/rejected": -475.4336853027344, "loss": 0.3586, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4715120792388916, "rewards/margins": 1.7704658508300781, "rewards/rejected": -4.241978168487549, "step": 13920 }, { "epoch": 2.400068917987595, "grad_norm": 59.99543762207031, "learning_rate": 5.8481730022842984e-08, "logits/chosen": -1.9565117359161377, "logits/rejected": -1.8945804834365845, "logps/chosen": -298.58538818359375, "logps/rejected": -444.9927673339844, "loss": 0.4355, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4644086360931396, "rewards/margins": 1.482774257659912, "rewards/rejected": -3.947183132171631, "step": 13930 }, { "epoch": 2.401791867677464, "grad_norm": 51.859039306640625, "learning_rate": 5.815995980280247e-08, "logits/chosen": -1.949885368347168, "logits/rejected": -1.8741350173950195, "logps/chosen": -291.86346435546875, "logps/rejected": -496.63671875, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -2.3798670768737793, "rewards/margins": 2.029872417449951, "rewards/rejected": -4.4097394943237305, "step": 13940 }, { "epoch": 2.403514817367333, "grad_norm": 50.78218078613281, "learning_rate": 5.783896066529209e-08, "logits/chosen": -2.000544786453247, "logits/rejected": -1.9280424118041992, "logps/chosen": -318.35498046875, "logps/rejected": -497.6119079589844, "loss": 0.4375, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.640174388885498, "rewards/margins": 1.8130782842636108, "rewards/rejected": -4.45325231552124, "step": 13950 }, { "epoch": 2.405237767057202, "grad_norm": 56.98761749267578, "learning_rate": 5.7518733900536966e-08, "logits/chosen": -1.870274305343628, "logits/rejected": -1.824155569076538, "logps/chosen": -317.3321838378906, "logps/rejected": -461.24713134765625, "loss": 0.4094, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6283392906188965, "rewards/margins": 1.4614887237548828, "rewards/rejected": -4.089827537536621, "step": 13960 }, { "epoch": 2.406960716747071, "grad_norm": 28.709196090698242, "learning_rate": 5.719928079565764e-08, "logits/chosen": -1.9703019857406616, "logits/rejected": -1.8923661708831787, "logps/chosen": -294.66741943359375, "logps/rejected": -487.2413024902344, "loss": 0.3619, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.344806432723999, "rewards/margins": 2.0014970302581787, "rewards/rejected": -4.3463029861450195, "step": 13970 }, { "epoch": 2.40868366643694, "grad_norm": 43.88873291015625, "learning_rate": 5.688060263466493e-08, "logits/chosen": -1.9428203105926514, "logits/rejected": -1.8797473907470703, "logps/chosen": -299.7875671386719, "logps/rejected": -490.990478515625, "loss": 0.3742, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.4412450790405273, "rewards/margins": 1.9304412603378296, "rewards/rejected": -4.371685981750488, "step": 13980 }, { "epoch": 2.410406616126809, "grad_norm": 51.615333557128906, "learning_rate": 5.656270069845506e-08, "logits/chosen": -1.971731185913086, "logits/rejected": -1.897513747215271, "logps/chosen": -297.4532775878906, "logps/rejected": -462.0396423339844, "loss": 0.3683, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3899219036102295, "rewards/margins": 1.6951929330825806, "rewards/rejected": -4.0851149559021, "step": 13990 }, { "epoch": 2.412129565816678, "grad_norm": 39.22588348388672, "learning_rate": 5.624557626480422e-08, "logits/chosen": -1.9121043682098389, "logits/rejected": -1.833711862564087, "logps/chosen": -307.8840637207031, "logps/rejected": -490.16162109375, "loss": 0.3527, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5629477500915527, "rewards/margins": 1.8123763799667358, "rewards/rejected": -4.375324249267578, "step": 14000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.0097904205322266, "eval_logits/rejected": -1.9851875305175781, "eval_logps/chosen": -328.8109436035156, "eval_logps/rejected": -387.50384521484375, "eval_loss": 0.700573205947876, "eval_rewards/accuracies": 0.6252323389053345, "eval_rewards/chosen": -2.697953939437866, "eval_rewards/margins": 0.5495878458023071, "eval_rewards/rejected": -3.247542142868042, "eval_runtime": 362.4466, "eval_samples_per_second": 11.875, "eval_steps_per_second": 1.484, "step": 14000 }, { "epoch": 2.413852515506547, "grad_norm": 51.822940826416016, "learning_rate": 5.592923060836338e-08, "logits/chosen": -1.9477201700210571, "logits/rejected": -1.8927860260009766, "logps/chosen": -287.1483459472656, "logps/rejected": -469.8138122558594, "loss": 0.3877, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.307628631591797, "rewards/margins": 1.8646997213363647, "rewards/rejected": -4.172328472137451, "step": 14010 }, { "epoch": 2.415575465196416, "grad_norm": 50.05439758300781, "learning_rate": 5.561366500065348e-08, "logits/chosen": -1.9519977569580078, "logits/rejected": -1.8986446857452393, "logps/chosen": -309.08306884765625, "logps/rejected": -469.7696228027344, "loss": 0.3993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5351595878601074, "rewards/margins": 1.6568044424057007, "rewards/rejected": -4.191964149475098, "step": 14020 }, { "epoch": 2.4172984148862855, "grad_norm": 47.51583480834961, "learning_rate": 5.5298880710059976e-08, "logits/chosen": -1.9260857105255127, "logits/rejected": -1.8538860082626343, "logps/chosen": -309.1075744628906, "logps/rejected": -477.6659240722656, "loss": 0.4107, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.548079013824463, "rewards/margins": 1.741795301437378, "rewards/rejected": -4.289874076843262, "step": 14030 }, { "epoch": 2.4190213645761545, "grad_norm": 80.8395004272461, "learning_rate": 5.498487900182788e-08, "logits/chosen": -1.9633543491363525, "logits/rejected": -1.87796151638031, "logps/chosen": -300.79833984375, "logps/rejected": -481.75360107421875, "loss": 0.3344, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.433958053588867, "rewards/margins": 1.8929589986801147, "rewards/rejected": -4.326916694641113, "step": 14040 }, { "epoch": 2.4207443142660234, "grad_norm": 22.408294677734375, "learning_rate": 5.4671661138056824e-08, "logits/chosen": -1.9889585971832275, "logits/rejected": -1.911306381225586, "logps/chosen": -278.3016357421875, "logps/rejected": -444.74676513671875, "loss": 0.361, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.2276663780212402, "rewards/margins": 1.7204513549804688, "rewards/rejected": -3.94811749458313, "step": 14050 }, { "epoch": 2.4224672639558924, "grad_norm": 22.780344009399414, "learning_rate": 5.4359228377695826e-08, "logits/chosen": -1.9816150665283203, "logits/rejected": -1.8914697170257568, "logps/chosen": -306.04522705078125, "logps/rejected": -515.6046142578125, "loss": 0.313, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.49011492729187, "rewards/margins": 2.1573874950408936, "rewards/rejected": -4.647502422332764, "step": 14060 }, { "epoch": 2.4241902136457614, "grad_norm": 27.179912567138672, "learning_rate": 5.404758197653822e-08, "logits/chosen": -1.9690793752670288, "logits/rejected": -1.8919321298599243, "logps/chosen": -314.31488037109375, "logps/rejected": -506.97283935546875, "loss": 0.3258, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5836429595947266, "rewards/margins": 1.9453452825546265, "rewards/rejected": -4.528988361358643, "step": 14070 }, { "epoch": 2.425913163335631, "grad_norm": 41.746402740478516, "learning_rate": 5.3736723187216504e-08, "logits/chosen": -1.9172413349151611, "logits/rejected": -1.8423162698745728, "logps/chosen": -339.7348327636719, "logps/rejected": -509.649658203125, "loss": 0.3998, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8080036640167236, "rewards/margins": 1.7681306600570679, "rewards/rejected": -4.57613468170166, "step": 14080 }, { "epoch": 2.4276361130254998, "grad_norm": 52.659915924072266, "learning_rate": 5.3426653259197705e-08, "logits/chosen": -1.936475396156311, "logits/rejected": -1.8656017780303955, "logps/chosen": -320.84197998046875, "logps/rejected": -478.6650390625, "loss": 0.423, "rewards/accuracies": 0.8125, "rewards/chosen": -2.63415789604187, "rewards/margins": 1.6408799886703491, "rewards/rejected": -4.275038719177246, "step": 14090 }, { "epoch": 2.4293590627153687, "grad_norm": 34.1100959777832, "learning_rate": 5.311737343877804e-08, "logits/chosen": -1.9781997203826904, "logits/rejected": -1.89706552028656, "logps/chosen": -310.69293212890625, "logps/rejected": -515.9114990234375, "loss": 0.3103, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5112366676330566, "rewards/margins": 2.1067495346069336, "rewards/rejected": -4.617985725402832, "step": 14100 }, { "epoch": 2.4310820124052377, "grad_norm": 61.30091857910156, "learning_rate": 5.280888496907782e-08, "logits/chosen": -1.9872591495513916, "logits/rejected": -1.9317775964736938, "logps/chosen": -316.92645263671875, "logps/rejected": -477.86279296875, "loss": 0.4492, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.662778854370117, "rewards/margins": 1.6108955144882202, "rewards/rejected": -4.273674488067627, "step": 14110 }, { "epoch": 2.4328049620951067, "grad_norm": 76.96726989746094, "learning_rate": 5.250118909003659e-08, "logits/chosen": -1.9548256397247314, "logits/rejected": -1.8854633569717407, "logps/chosen": -321.2222595214844, "logps/rejected": -476.19281005859375, "loss": 0.4203, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.68253755569458, "rewards/margins": 1.5608093738555908, "rewards/rejected": -4.243346214294434, "step": 14120 }, { "epoch": 2.4345279117849756, "grad_norm": 49.95549774169922, "learning_rate": 5.219428703840842e-08, "logits/chosen": -1.987666130065918, "logits/rejected": -1.920340895652771, "logps/chosen": -308.84478759765625, "logps/rejected": -468.25079345703125, "loss": 0.3632, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5330371856689453, "rewards/margins": 1.6346709728240967, "rewards/rejected": -4.167707920074463, "step": 14130 }, { "epoch": 2.436250861474845, "grad_norm": 64.65557861328125, "learning_rate": 5.188818004775636e-08, "logits/chosen": -2.02113676071167, "logits/rejected": -1.9526363611221313, "logps/chosen": -326.83740234375, "logps/rejected": -499.20068359375, "loss": 0.4253, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7379324436187744, "rewards/margins": 1.7475124597549438, "rewards/rejected": -4.485445022583008, "step": 14140 }, { "epoch": 2.437973811164714, "grad_norm": 61.66667175292969, "learning_rate": 5.158286934844802e-08, "logits/chosen": -2.0146563053131104, "logits/rejected": -1.948124647140503, "logps/chosen": -325.88482666015625, "logps/rejected": -500.88446044921875, "loss": 0.4129, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7087466716766357, "rewards/margins": 1.7536227703094482, "rewards/rejected": -4.462369441986084, "step": 14150 }, { "epoch": 2.439696760854583, "grad_norm": 47.33082580566406, "learning_rate": 5.127835616765019e-08, "logits/chosen": -1.9622341394424438, "logits/rejected": -1.8922746181488037, "logps/chosen": -310.03289794921875, "logps/rejected": -479.2196350097656, "loss": 0.4003, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5562658309936523, "rewards/margins": 1.6777102947235107, "rewards/rejected": -4.233975410461426, "step": 14160 }, { "epoch": 2.441419710544452, "grad_norm": 64.16891479492188, "learning_rate": 5.097464172932434e-08, "logits/chosen": -1.92804753780365, "logits/rejected": -1.8650553226470947, "logps/chosen": -309.32421875, "logps/rejected": -461.51483154296875, "loss": 0.4611, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5285778045654297, "rewards/margins": 1.5520374774932861, "rewards/rejected": -4.080615043640137, "step": 14170 }, { "epoch": 2.4431426602343214, "grad_norm": 81.40733337402344, "learning_rate": 5.0671727254221224e-08, "logits/chosen": -1.9670326709747314, "logits/rejected": -1.8911988735198975, "logps/chosen": -326.33709716796875, "logps/rejected": -482.4718322753906, "loss": 0.4075, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6823925971984863, "rewards/margins": 1.6346495151519775, "rewards/rejected": -4.317042350769043, "step": 14180 }, { "epoch": 2.4448656099241903, "grad_norm": 38.953914642333984, "learning_rate": 5.0369613959876377e-08, "logits/chosen": -1.9963645935058594, "logits/rejected": -1.9273340702056885, "logps/chosen": -318.72613525390625, "logps/rejected": -479.4820251464844, "loss": 0.4071, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.592322826385498, "rewards/margins": 1.663661241531372, "rewards/rejected": -4.255983352661133, "step": 14190 }, { "epoch": 2.4465885596140593, "grad_norm": 38.68586730957031, "learning_rate": 5.0068303060605164e-08, "logits/chosen": -1.9081411361694336, "logits/rejected": -1.8482236862182617, "logps/chosen": -296.44232177734375, "logps/rejected": -487.2012634277344, "loss": 0.3017, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4155592918395996, "rewards/margins": 1.9202626943588257, "rewards/rejected": -4.335822105407715, "step": 14200 }, { "epoch": 2.4483115093039283, "grad_norm": 68.72774505615234, "learning_rate": 4.9767795767497536e-08, "logits/chosen": -1.9583053588867188, "logits/rejected": -1.895477294921875, "logps/chosen": -320.45806884765625, "logps/rejected": -490.91729736328125, "loss": 0.395, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6658682823181152, "rewards/margins": 1.7230441570281982, "rewards/rejected": -4.388912200927734, "step": 14210 }, { "epoch": 2.4500344589937972, "grad_norm": 26.315872192382812, "learning_rate": 4.946809328841356e-08, "logits/chosen": -2.011765718460083, "logits/rejected": -1.922503113746643, "logps/chosen": -319.15435791015625, "logps/rejected": -521.7716674804688, "loss": 0.2923, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5985920429229736, "rewards/margins": 2.0739340782165527, "rewards/rejected": -4.672525882720947, "step": 14220 }, { "epoch": 2.451757408683666, "grad_norm": 30.739002227783203, "learning_rate": 4.916919682797843e-08, "logits/chosen": -1.881664514541626, "logits/rejected": -1.7980133295059204, "logps/chosen": -328.3320617675781, "logps/rejected": -528.0885009765625, "loss": 0.3218, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.7098538875579834, "rewards/margins": 2.061765670776367, "rewards/rejected": -4.77161979675293, "step": 14230 }, { "epoch": 2.4534803583735356, "grad_norm": 44.16826248168945, "learning_rate": 4.8871107587577814e-08, "logits/chosen": -1.8941932916641235, "logits/rejected": -1.8240993022918701, "logps/chosen": -336.04376220703125, "logps/rejected": -542.801025390625, "loss": 0.348, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.8132503032684326, "rewards/margins": 2.0796570777893066, "rewards/rejected": -4.89290714263916, "step": 14240 }, { "epoch": 2.4552033080634046, "grad_norm": 45.10354995727539, "learning_rate": 4.857382676535235e-08, "logits/chosen": -1.9018518924713135, "logits/rejected": -1.8337697982788086, "logps/chosen": -337.66351318359375, "logps/rejected": -501.7044982910156, "loss": 0.4686, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8236937522888184, "rewards/margins": 1.6673047542572021, "rewards/rejected": -4.4909987449646, "step": 14250 }, { "epoch": 2.4569262577532736, "grad_norm": 68.63253784179688, "learning_rate": 4.827735555619375e-08, "logits/chosen": -1.9878625869750977, "logits/rejected": -1.9059957265853882, "logps/chosen": -339.18170166015625, "logps/rejected": -518.7178955078125, "loss": 0.3445, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8129630088806152, "rewards/margins": 1.8818851709365845, "rewards/rejected": -4.69484806060791, "step": 14260 }, { "epoch": 2.4586492074431425, "grad_norm": 49.88498306274414, "learning_rate": 4.7981695151739525e-08, "logits/chosen": -1.9431911706924438, "logits/rejected": -1.8706172704696655, "logps/chosen": -338.65606689453125, "logps/rejected": -531.8112182617188, "loss": 0.3569, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.8319337368011475, "rewards/margins": 1.9805126190185547, "rewards/rejected": -4.812446117401123, "step": 14270 }, { "epoch": 2.460372157133012, "grad_norm": 63.758018493652344, "learning_rate": 4.7686846740367993e-08, "logits/chosen": -1.8964427709579468, "logits/rejected": -1.8369576930999756, "logps/chosen": -345.4010009765625, "logps/rejected": -519.8181762695312, "loss": 0.4121, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.928940534591675, "rewards/margins": 1.7523027658462524, "rewards/rejected": -4.681242942810059, "step": 14280 }, { "epoch": 2.462095106822881, "grad_norm": 29.42935562133789, "learning_rate": 4.739281150719404e-08, "logits/chosen": -1.9027845859527588, "logits/rejected": -1.8150513172149658, "logps/chosen": -321.64422607421875, "logps/rejected": -510.31719970703125, "loss": 0.3793, "rewards/accuracies": 0.84375, "rewards/chosen": -2.60392427444458, "rewards/margins": 1.9835846424102783, "rewards/rejected": -4.5875091552734375, "step": 14290 }, { "epoch": 2.46381805651275, "grad_norm": 27.863561630249023, "learning_rate": 4.709959063406374e-08, "logits/chosen": -2.0142741203308105, "logits/rejected": -1.9412200450897217, "logps/chosen": -292.5108337402344, "logps/rejected": -485.62579345703125, "loss": 0.3122, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.3745532035827637, "rewards/margins": 1.925846815109253, "rewards/rejected": -4.3003997802734375, "step": 14300 }, { "epoch": 2.465541006202619, "grad_norm": 73.72631072998047, "learning_rate": 4.680718529955027e-08, "logits/chosen": -1.9436357021331787, "logits/rejected": -1.8767812252044678, "logps/chosen": -326.1226501464844, "logps/rejected": -499.9153747558594, "loss": 0.3878, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7152605056762695, "rewards/margins": 1.7729976177215576, "rewards/rejected": -4.488258361816406, "step": 14310 }, { "epoch": 2.467263955892488, "grad_norm": 45.091766357421875, "learning_rate": 4.6515596678948525e-08, "logits/chosen": -1.919123888015747, "logits/rejected": -1.8597291707992554, "logps/chosen": -303.47039794921875, "logps/rejected": -481.43017578125, "loss": 0.3759, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4740800857543945, "rewards/margins": 1.7939262390136719, "rewards/rejected": -4.268006324768066, "step": 14320 }, { "epoch": 2.468986905582357, "grad_norm": 45.74407196044922, "learning_rate": 4.622482594427093e-08, "logits/chosen": -2.015190601348877, "logits/rejected": -1.9302337169647217, "logps/chosen": -325.8829650878906, "logps/rejected": -513.3617553710938, "loss": 0.3553, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.6998817920684814, "rewards/margins": 1.8989711999893188, "rewards/rejected": -4.59885311126709, "step": 14330 }, { "epoch": 2.470709855272226, "grad_norm": 41.77252960205078, "learning_rate": 4.593487426424234e-08, "logits/chosen": -1.932885766029358, "logits/rejected": -1.8578767776489258, "logps/chosen": -323.0542907714844, "logps/rejected": -523.5946655273438, "loss": 0.3534, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.673053741455078, "rewards/margins": 2.00534725189209, "rewards/rejected": -4.678400993347168, "step": 14340 }, { "epoch": 2.472432804962095, "grad_norm": 34.980709075927734, "learning_rate": 4.5645742804295504e-08, "logits/chosen": -1.9846203327178955, "logits/rejected": -1.9170112609863281, "logps/chosen": -306.7415466308594, "logps/rejected": -486.54913330078125, "loss": 0.3894, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.523918390274048, "rewards/margins": 1.8072974681854248, "rewards/rejected": -4.331215858459473, "step": 14350 }, { "epoch": 2.474155754651964, "grad_norm": 62.719825744628906, "learning_rate": 4.5357432726566505e-08, "logits/chosen": -1.9439918994903564, "logits/rejected": -1.8790658712387085, "logps/chosen": -320.13427734375, "logps/rejected": -467.3065490722656, "loss": 0.4443, "rewards/accuracies": 0.78125, "rewards/chosen": -2.636364221572876, "rewards/margins": 1.518759846687317, "rewards/rejected": -4.155124187469482, "step": 14360 }, { "epoch": 2.475878704341833, "grad_norm": 26.924619674682617, "learning_rate": 4.506994518988988e-08, "logits/chosen": -1.9754998683929443, "logits/rejected": -1.9037275314331055, "logps/chosen": -318.4054260253906, "logps/rejected": -485.1969299316406, "loss": 0.3747, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6214189529418945, "rewards/margins": 1.7304035425186157, "rewards/rejected": -4.351822376251221, "step": 14370 }, { "epoch": 2.4776016540317025, "grad_norm": 29.335342407226562, "learning_rate": 4.478328134979406e-08, "logits/chosen": -2.0286643505096436, "logits/rejected": -1.9526474475860596, "logps/chosen": -301.04669189453125, "logps/rejected": -488.252197265625, "loss": 0.327, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4772324562072754, "rewards/margins": 1.8999398946762085, "rewards/rejected": -4.377171993255615, "step": 14380 }, { "epoch": 2.4793246037215715, "grad_norm": 53.01050567626953, "learning_rate": 4.4497442358496564e-08, "logits/chosen": -1.9692310094833374, "logits/rejected": -1.890504240989685, "logps/chosen": -306.83917236328125, "logps/rejected": -506.3428649902344, "loss": 0.3179, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.535381317138672, "rewards/margins": 2.0242855548858643, "rewards/rejected": -4.559666633605957, "step": 14390 }, { "epoch": 2.4810475534114405, "grad_norm": 57.95623779296875, "learning_rate": 4.4212429364899716e-08, "logits/chosen": -2.0019524097442627, "logits/rejected": -1.92264723777771, "logps/chosen": -300.58099365234375, "logps/rejected": -498.085205078125, "loss": 0.3258, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.425410032272339, "rewards/margins": 2.064704418182373, "rewards/rejected": -4.490114688873291, "step": 14400 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -1.983465552330017, "eval_logits/rejected": -1.9581221342086792, "eval_logps/chosen": -351.1316223144531, "eval_logps/rejected": -412.84381103515625, "eval_loss": 0.7094977498054504, "eval_rewards/accuracies": 0.6291821599006653, "eval_rewards/chosen": -2.921161651611328, "eval_rewards/margins": 0.5797803997993469, "eval_rewards/rejected": -3.5009422302246094, "eval_runtime": 362.7148, "eval_samples_per_second": 11.866, "eval_steps_per_second": 1.483, "step": 14400 }, { "epoch": 2.4827705031013094, "grad_norm": 37.84271240234375, "learning_rate": 4.392824351458582e-08, "logits/chosen": -1.9251970052719116, "logits/rejected": -1.8528591394424438, "logps/chosen": -311.3099365234375, "logps/rejected": -496.4620056152344, "loss": 0.4249, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.5949466228485107, "rewards/margins": 1.8501379489898682, "rewards/rejected": -4.445084571838379, "step": 14410 }, { "epoch": 2.4844934527911784, "grad_norm": 23.653823852539062, "learning_rate": 4.364488594981241e-08, "logits/chosen": -1.965746283531189, "logits/rejected": -1.893243432044983, "logps/chosen": -315.8910217285156, "logps/rejected": -512.0089721679688, "loss": 0.3438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.631085157394409, "rewards/margins": 1.9754117727279663, "rewards/rejected": -4.606496810913086, "step": 14420 }, { "epoch": 2.4862164024810474, "grad_norm": 77.0193099975586, "learning_rate": 4.336235780950781e-08, "logits/chosen": -1.9656648635864258, "logits/rejected": -1.9029594659805298, "logps/chosen": -315.43280029296875, "logps/rejected": -496.3191833496094, "loss": 0.4171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.624117136001587, "rewards/margins": 1.8436698913574219, "rewards/rejected": -4.467787265777588, "step": 14430 }, { "epoch": 2.4879393521709168, "grad_norm": 36.122188568115234, "learning_rate": 4.308066022926671e-08, "logits/chosen": -1.9495820999145508, "logits/rejected": -1.8801301717758179, "logps/chosen": -300.253173828125, "logps/rejected": -480.55316162109375, "loss": 0.3674, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4543848037719727, "rewards/margins": 1.8177156448364258, "rewards/rejected": -4.27209997177124, "step": 14440 }, { "epoch": 2.4896623018607857, "grad_norm": 41.10538864135742, "learning_rate": 4.2799794341345285e-08, "logits/chosen": -1.931875228881836, "logits/rejected": -1.8698323965072632, "logps/chosen": -318.81005859375, "logps/rejected": -494.9874572753906, "loss": 0.3609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.625624179840088, "rewards/margins": 1.811368703842163, "rewards/rejected": -4.436992645263672, "step": 14450 }, { "epoch": 2.4913852515506547, "grad_norm": 39.530853271484375, "learning_rate": 4.25197612746569e-08, "logits/chosen": -1.9851162433624268, "logits/rejected": -1.921587586402893, "logps/chosen": -292.372314453125, "logps/rejected": -463.8827209472656, "loss": 0.383, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3790640830993652, "rewards/margins": 1.732471227645874, "rewards/rejected": -4.111535549163818, "step": 14460 }, { "epoch": 2.4931082012405237, "grad_norm": 38.64424133300781, "learning_rate": 4.224056215476751e-08, "logits/chosen": -2.042806625366211, "logits/rejected": -1.9653587341308594, "logps/chosen": -311.68011474609375, "logps/rejected": -481.64886474609375, "loss": 0.3819, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5181145668029785, "rewards/margins": 1.807458519935608, "rewards/rejected": -4.325572967529297, "step": 14470 }, { "epoch": 2.4948311509303926, "grad_norm": 30.831968307495117, "learning_rate": 4.1962198103890986e-08, "logits/chosen": -1.9921352863311768, "logits/rejected": -1.9395091533660889, "logps/chosen": -296.5747375488281, "logps/rejected": -475.81890869140625, "loss": 0.3511, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4224448204040527, "rewards/margins": 1.7932968139648438, "rewards/rejected": -4.2157416343688965, "step": 14480 }, { "epoch": 2.496554100620262, "grad_norm": 63.212947845458984, "learning_rate": 4.1684670240884675e-08, "logits/chosen": -1.9566596746444702, "logits/rejected": -1.8930752277374268, "logps/chosen": -300.58966064453125, "logps/rejected": -482.56475830078125, "loss": 0.3761, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.492058277130127, "rewards/margins": 1.8249543905258179, "rewards/rejected": -4.317012786865234, "step": 14490 }, { "epoch": 2.498277050310131, "grad_norm": 19.59129524230957, "learning_rate": 4.140797968124515e-08, "logits/chosen": -1.9982140064239502, "logits/rejected": -1.9067881107330322, "logps/chosen": -281.71160888671875, "logps/rejected": -474.36322021484375, "loss": 0.2989, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.241438627243042, "rewards/margins": 1.96945321559906, "rewards/rejected": -4.210892200469971, "step": 14500 }, { "epoch": 2.5, "grad_norm": 40.402915954589844, "learning_rate": 4.113212753710343e-08, "logits/chosen": -2.010701894760132, "logits/rejected": -1.9537899494171143, "logps/chosen": -316.1083984375, "logps/rejected": -458.4609375, "loss": 0.4596, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.5965523719787598, "rewards/margins": 1.4818514585494995, "rewards/rejected": -4.078403949737549, "step": 14510 }, { "epoch": 2.501722949689869, "grad_norm": 55.37042236328125, "learning_rate": 4.0857114917220575e-08, "logits/chosen": -1.956424355506897, "logits/rejected": -1.8823816776275635, "logps/chosen": -308.7487487792969, "logps/rejected": -490.67108154296875, "loss": 0.4248, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.5198333263397217, "rewards/margins": 1.8460098505020142, "rewards/rejected": -4.365843296051025, "step": 14520 }, { "epoch": 2.503445899379738, "grad_norm": 40.08086395263672, "learning_rate": 4.058294292698319e-08, "logits/chosen": -1.9945170879364014, "logits/rejected": -1.9278576374053955, "logps/chosen": -314.1739807128906, "logps/rejected": -491.2374572753906, "loss": 0.3523, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.558720827102661, "rewards/margins": 1.8053739070892334, "rewards/rejected": -4.3640947341918945, "step": 14530 }, { "epoch": 2.505168849069607, "grad_norm": 33.939239501953125, "learning_rate": 4.030961266839919e-08, "logits/chosen": -1.9611730575561523, "logits/rejected": -1.8840157985687256, "logps/chosen": -298.56427001953125, "logps/rejected": -500.73724365234375, "loss": 0.3331, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4321391582489014, "rewards/margins": 2.059800386428833, "rewards/rejected": -4.491939544677734, "step": 14540 }, { "epoch": 2.5068917987594763, "grad_norm": 55.15105438232422, "learning_rate": 4.0037125240093256e-08, "logits/chosen": -2.0137858390808105, "logits/rejected": -1.9408706426620483, "logps/chosen": -286.3623046875, "logps/rejected": -474.01129150390625, "loss": 0.4124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.281116008758545, "rewards/margins": 1.9318211078643799, "rewards/rejected": -4.212936878204346, "step": 14550 }, { "epoch": 2.5086147484493453, "grad_norm": 36.28321838378906, "learning_rate": 3.976548173730221e-08, "logits/chosen": -1.9515411853790283, "logits/rejected": -1.8780558109283447, "logps/chosen": -304.74884033203125, "logps/rejected": -501.0821838378906, "loss": 0.3531, "rewards/accuracies": 0.8125, "rewards/chosen": -2.513805389404297, "rewards/margins": 1.9583349227905273, "rewards/rejected": -4.472140312194824, "step": 14560 }, { "epoch": 2.5103376981392143, "grad_norm": 35.86176300048828, "learning_rate": 3.9494683251870884e-08, "logits/chosen": -1.9258005619049072, "logits/rejected": -1.8741058111190796, "logps/chosen": -294.9167785644531, "logps/rejected": -457.59075927734375, "loss": 0.4305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4149582386016846, "rewards/margins": 1.6256370544433594, "rewards/rejected": -4.040595054626465, "step": 14570 }, { "epoch": 2.5120606478290832, "grad_norm": 64.2865219116211, "learning_rate": 3.922473087224776e-08, "logits/chosen": -1.992349624633789, "logits/rejected": -1.926539659500122, "logps/chosen": -300.4522399902344, "logps/rejected": -459.8357849121094, "loss": 0.4159, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.498250961303711, "rewards/margins": 1.5989882946014404, "rewards/rejected": -4.0972394943237305, "step": 14580 }, { "epoch": 2.5137835975189526, "grad_norm": 49.8903923034668, "learning_rate": 3.8955625683480266e-08, "logits/chosen": -1.959519386291504, "logits/rejected": -1.8935353755950928, "logps/chosen": -280.5421142578125, "logps/rejected": -462.41156005859375, "loss": 0.3679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.267099380493164, "rewards/margins": 1.8396879434585571, "rewards/rejected": -4.10678768157959, "step": 14590 }, { "epoch": 2.5155065472088216, "grad_norm": 39.02354431152344, "learning_rate": 3.868736876721088e-08, "logits/chosen": -1.9743598699569702, "logits/rejected": -1.8878322839736938, "logps/chosen": -289.6495361328125, "logps/rejected": -480.49639892578125, "loss": 0.2947, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.339020252227783, "rewards/margins": 1.9598667621612549, "rewards/rejected": -4.298886299133301, "step": 14600 }, { "epoch": 2.5172294968986906, "grad_norm": 41.63386154174805, "learning_rate": 3.8419961201672205e-08, "logits/chosen": -1.9445478916168213, "logits/rejected": -1.8676742315292358, "logps/chosen": -296.60235595703125, "logps/rejected": -469.178466796875, "loss": 0.3503, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3897957801818848, "rewards/margins": 1.8112754821777344, "rewards/rejected": -4.201071739196777, "step": 14610 }, { "epoch": 2.5189524465885595, "grad_norm": 39.51591491699219, "learning_rate": 3.815340406168332e-08, "logits/chosen": -1.9419586658477783, "logits/rejected": -1.8837162256240845, "logps/chosen": -325.1991271972656, "logps/rejected": -505.8304748535156, "loss": 0.4072, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.729252815246582, "rewards/margins": 1.8019882440567017, "rewards/rejected": -4.531240940093994, "step": 14620 }, { "epoch": 2.5206753962784285, "grad_norm": 36.9972038269043, "learning_rate": 3.788769841864481e-08, "logits/chosen": -1.9145472049713135, "logits/rejected": -1.8516956567764282, "logps/chosen": -306.4422912597656, "logps/rejected": -485.6219787597656, "loss": 0.3561, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5169224739074707, "rewards/margins": 1.8177623748779297, "rewards/rejected": -4.3346848487854, "step": 14630 }, { "epoch": 2.5223983459682975, "grad_norm": 28.115114212036133, "learning_rate": 3.762284534053492e-08, "logits/chosen": -1.9255964756011963, "logits/rejected": -1.8614028692245483, "logps/chosen": -302.2560119628906, "logps/rejected": -491.028076171875, "loss": 0.3954, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.4977028369903564, "rewards/margins": 1.8955367803573608, "rewards/rejected": -4.393239974975586, "step": 14640 }, { "epoch": 2.524121295658167, "grad_norm": 85.41705322265625, "learning_rate": 3.7358845891905164e-08, "logits/chosen": -1.9827568531036377, "logits/rejected": -1.9224497079849243, "logps/chosen": -339.1072692871094, "logps/rejected": -482.38104248046875, "loss": 0.4669, "rewards/accuracies": 0.78125, "rewards/chosen": -2.841937303543091, "rewards/margins": 1.4911645650863647, "rewards/rejected": -4.333102226257324, "step": 14650 }, { "epoch": 2.525844245348036, "grad_norm": 70.93968963623047, "learning_rate": 3.7095701133875586e-08, "logits/chosen": -1.9349651336669922, "logits/rejected": -1.851837396621704, "logps/chosen": -310.9633483886719, "logps/rejected": -514.5816650390625, "loss": 0.284, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.5625016689300537, "rewards/margins": 2.0745997428894043, "rewards/rejected": -4.637101173400879, "step": 14660 }, { "epoch": 2.527567195037905, "grad_norm": 57.41047668457031, "learning_rate": 3.68334121241313e-08, "logits/chosen": -1.8892574310302734, "logits/rejected": -1.8228298425674438, "logps/chosen": -296.69049072265625, "logps/rejected": -478.93963623046875, "loss": 0.34, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4446091651916504, "rewards/margins": 1.8310762643814087, "rewards/rejected": -4.2756853103637695, "step": 14670 }, { "epoch": 2.529290144727774, "grad_norm": 33.72563552856445, "learning_rate": 3.657197991691774e-08, "logits/chosen": -1.9461174011230469, "logits/rejected": -1.8868846893310547, "logps/chosen": -313.5001220703125, "logps/rejected": -478.3130798339844, "loss": 0.3886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.642167091369629, "rewards/margins": 1.6222788095474243, "rewards/rejected": -4.264445781707764, "step": 14680 }, { "epoch": 2.531013094417643, "grad_norm": 78.52323913574219, "learning_rate": 3.6311405563036326e-08, "logits/chosen": -1.9653047323226929, "logits/rejected": -1.882991075515747, "logps/chosen": -321.0413818359375, "logps/rejected": -530.1490478515625, "loss": 0.3417, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6834750175476074, "rewards/margins": 2.0904886722564697, "rewards/rejected": -4.77396297454834, "step": 14690 }, { "epoch": 2.532736044107512, "grad_norm": 68.353271484375, "learning_rate": 3.605169010984049e-08, "logits/chosen": -2.00197172164917, "logits/rejected": -1.921370506286621, "logps/chosen": -340.9608154296875, "logps/rejected": -557.9393310546875, "loss": 0.3971, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.854647159576416, "rewards/margins": 2.165767192840576, "rewards/rejected": -5.02041482925415, "step": 14700 }, { "epoch": 2.534458993797381, "grad_norm": 46.22882843017578, "learning_rate": 3.579283460123153e-08, "logits/chosen": -1.972235918045044, "logits/rejected": -1.9150753021240234, "logps/chosen": -304.63177490234375, "logps/rejected": -484.6707458496094, "loss": 0.3641, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5008139610290527, "rewards/margins": 1.8162987232208252, "rewards/rejected": -4.317112922668457, "step": 14710 }, { "epoch": 2.53618194348725, "grad_norm": 54.91798400878906, "learning_rate": 3.553484007765423e-08, "logits/chosen": -1.9383903741836548, "logits/rejected": -1.8560737371444702, "logps/chosen": -324.42596435546875, "logps/rejected": -520.8067626953125, "loss": 0.3623, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.660179853439331, "rewards/margins": 2.0176262855529785, "rewards/rejected": -4.6778059005737305, "step": 14720 }, { "epoch": 2.537904893177119, "grad_norm": 47.46377182006836, "learning_rate": 3.527770757609253e-08, "logits/chosen": -2.004211902618408, "logits/rejected": -1.9192428588867188, "logps/chosen": -314.0555114746094, "logps/rejected": -510.8456115722656, "loss": 0.351, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.596126079559326, "rewards/margins": 2.015511989593506, "rewards/rejected": -4.611637592315674, "step": 14730 }, { "epoch": 2.539627842866988, "grad_norm": 44.57471466064453, "learning_rate": 3.5021438130065834e-08, "logits/chosen": -1.930315613746643, "logits/rejected": -1.8692620992660522, "logps/chosen": -331.04632568359375, "logps/rejected": -524.072021484375, "loss": 0.3353, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.740408420562744, "rewards/margins": 1.9584133625030518, "rewards/rejected": -4.698822021484375, "step": 14740 }, { "epoch": 2.5413507925568575, "grad_norm": 56.950592041015625, "learning_rate": 3.476603276962439e-08, "logits/chosen": -1.9497531652450562, "logits/rejected": -1.8802579641342163, "logps/chosen": -307.09686279296875, "logps/rejected": -480.75, "loss": 0.3446, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.535259485244751, "rewards/margins": 1.7848014831542969, "rewards/rejected": -4.320060729980469, "step": 14750 }, { "epoch": 2.5430737422467264, "grad_norm": 53.76311492919922, "learning_rate": 3.451149252134544e-08, "logits/chosen": -1.9908113479614258, "logits/rejected": -1.9187204837799072, "logps/chosen": -321.18377685546875, "logps/rejected": -506.8678283691406, "loss": 0.3519, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.673459529876709, "rewards/margins": 1.870560884475708, "rewards/rejected": -4.54401969909668, "step": 14760 }, { "epoch": 2.5447966919365954, "grad_norm": 26.732715606689453, "learning_rate": 3.425781840832889e-08, "logits/chosen": -1.985361099243164, "logits/rejected": -1.9164581298828125, "logps/chosen": -329.1950988769531, "logps/rejected": -504.43463134765625, "loss": 0.43, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7253451347351074, "rewards/margins": 1.7618662118911743, "rewards/rejected": -4.487211227416992, "step": 14770 }, { "epoch": 2.5465196416264644, "grad_norm": 50.72362518310547, "learning_rate": 3.400501145019344e-08, "logits/chosen": -1.9368940591812134, "logits/rejected": -1.8853365182876587, "logps/chosen": -310.21990966796875, "logps/rejected": -477.82171630859375, "loss": 0.4156, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.596402645111084, "rewards/margins": 1.6935937404632568, "rewards/rejected": -4.289996147155762, "step": 14780 }, { "epoch": 2.548242591316334, "grad_norm": 53.15373611450195, "learning_rate": 3.375307266307223e-08, "logits/chosen": -2.008366107940674, "logits/rejected": -1.9284690618515015, "logps/chosen": -325.8512268066406, "logps/rejected": -499.0397033691406, "loss": 0.4668, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7171332836151123, "rewards/margins": 1.7677100896835327, "rewards/rejected": -4.484843730926514, "step": 14790 }, { "epoch": 2.5499655410062028, "grad_norm": 58.55543899536133, "learning_rate": 3.350200305960885e-08, "logits/chosen": -1.9808719158172607, "logits/rejected": -1.9144868850708008, "logps/chosen": -298.7830810546875, "logps/rejected": -494.57135009765625, "loss": 0.3646, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4371559619903564, "rewards/margins": 1.9559059143066406, "rewards/rejected": -4.393062114715576, "step": 14800 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.012669563293457, "eval_logits/rejected": -1.9883649349212646, "eval_logps/chosen": -331.82568359375, "eval_logps/rejected": -389.863037109375, "eval_loss": 0.7040568590164185, "eval_rewards/accuracies": 0.6349906921386719, "eval_rewards/chosen": -2.728101968765259, "eval_rewards/margins": 0.5430322289466858, "eval_rewards/rejected": -3.271134614944458, "eval_runtime": 362.4727, "eval_samples_per_second": 11.874, "eval_steps_per_second": 1.484, "step": 14800 }, { "epoch": 2.5516884906960717, "grad_norm": 39.24565124511719, "learning_rate": 3.3251803648953385e-08, "logits/chosen": -2.0206525325775146, "logits/rejected": -1.9432004690170288, "logps/chosen": -307.79022216796875, "logps/rejected": -479.3343200683594, "loss": 0.3686, "rewards/accuracies": 0.875, "rewards/chosen": -2.4808387756347656, "rewards/margins": 1.8377106189727783, "rewards/rejected": -4.318549633026123, "step": 14810 }, { "epoch": 2.5534114403859407, "grad_norm": 55.82837677001953, "learning_rate": 3.300247543675827e-08, "logits/chosen": -2.0951836109161377, "logits/rejected": -2.010108709335327, "logps/chosen": -294.2719421386719, "logps/rejected": -492.14471435546875, "loss": 0.3158, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3969359397888184, "rewards/margins": 2.0130224227905273, "rewards/rejected": -4.409958839416504, "step": 14820 }, { "epoch": 2.5551343900758097, "grad_norm": 30.70519256591797, "learning_rate": 3.275401942517417e-08, "logits/chosen": -1.9996370077133179, "logits/rejected": -1.922426462173462, "logps/chosen": -292.40216064453125, "logps/rejected": -478.2186584472656, "loss": 0.3606, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.3593692779541016, "rewards/margins": 1.9176502227783203, "rewards/rejected": -4.277019500732422, "step": 14830 }, { "epoch": 2.5568573397656786, "grad_norm": 36.70526123046875, "learning_rate": 3.250643661284594e-08, "logits/chosen": -2.0392510890960693, "logits/rejected": -1.9697567224502563, "logps/chosen": -323.1668395996094, "logps/rejected": -497.4178161621094, "loss": 0.3558, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.6625213623046875, "rewards/margins": 1.754372000694275, "rewards/rejected": -4.416893005371094, "step": 14840 }, { "epoch": 2.558580289455548, "grad_norm": 22.372861862182617, "learning_rate": 3.225972799490892e-08, "logits/chosen": -1.9957027435302734, "logits/rejected": -1.916261911392212, "logps/chosen": -310.05767822265625, "logps/rejected": -503.2296447753906, "loss": 0.3544, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5505528450012207, "rewards/margins": 1.9505903720855713, "rewards/rejected": -4.501142978668213, "step": 14850 }, { "epoch": 2.560303239145417, "grad_norm": 66.57563781738281, "learning_rate": 3.201389456298465e-08, "logits/chosen": -1.9645763635635376, "logits/rejected": -1.9052448272705078, "logps/chosen": -298.2623596191406, "logps/rejected": -488.90277099609375, "loss": 0.4949, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4263710975646973, "rewards/margins": 1.9532301425933838, "rewards/rejected": -4.37960147857666, "step": 14860 }, { "epoch": 2.562026188835286, "grad_norm": 38.0269660949707, "learning_rate": 3.176893730517677e-08, "logits/chosen": -2.0329701900482178, "logits/rejected": -1.9648631811141968, "logps/chosen": -294.7584228515625, "logps/rejected": -463.03900146484375, "loss": 0.3881, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.3629069328308105, "rewards/margins": 1.743474006652832, "rewards/rejected": -4.106380462646484, "step": 14870 }, { "epoch": 2.563749138525155, "grad_norm": 30.3785457611084, "learning_rate": 3.1524857206067344e-08, "logits/chosen": -1.9795843362808228, "logits/rejected": -1.9254392385482788, "logps/chosen": -282.4158935546875, "logps/rejected": -459.23980712890625, "loss": 0.3544, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.2844276428222656, "rewards/margins": 1.7837345600128174, "rewards/rejected": -4.068161964416504, "step": 14880 }, { "epoch": 2.5654720882150244, "grad_norm": 39.38323974609375, "learning_rate": 3.1281655246712866e-08, "logits/chosen": -1.9299129247665405, "logits/rejected": -1.8718467950820923, "logps/chosen": -300.26348876953125, "logps/rejected": -473.24261474609375, "loss": 0.3709, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4187610149383545, "rewards/margins": 1.7480814456939697, "rewards/rejected": -4.166842460632324, "step": 14890 }, { "epoch": 2.5671950379048933, "grad_norm": 51.39818572998047, "learning_rate": 3.103933240464002e-08, "logits/chosen": -1.926270842552185, "logits/rejected": -1.8625341653823853, "logps/chosen": -290.84783935546875, "logps/rejected": -454.15362548828125, "loss": 0.4228, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.364269733428955, "rewards/margins": 1.670400857925415, "rewards/rejected": -4.034670352935791, "step": 14900 }, { "epoch": 2.5689179875947623, "grad_norm": 39.80025100708008, "learning_rate": 3.0797889653842166e-08, "logits/chosen": -1.9870851039886475, "logits/rejected": -1.9160282611846924, "logps/chosen": -278.642333984375, "logps/rejected": -475.19464111328125, "loss": 0.3223, "rewards/accuracies": 0.875, "rewards/chosen": -2.271270275115967, "rewards/margins": 1.9492067098617554, "rewards/rejected": -4.220477104187012, "step": 14910 }, { "epoch": 2.5706409372846313, "grad_norm": 27.736406326293945, "learning_rate": 3.05573279647752e-08, "logits/chosen": -1.9719301462173462, "logits/rejected": -1.907342553138733, "logps/chosen": -304.2413024902344, "logps/rejected": -502.77288818359375, "loss": 0.3758, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.50430965423584, "rewards/margins": 1.9803966283798218, "rewards/rejected": -4.484706401824951, "step": 14920 }, { "epoch": 2.5723638869745002, "grad_norm": 33.87883758544922, "learning_rate": 3.0317648304353544e-08, "logits/chosen": -2.042679786682129, "logits/rejected": -1.9629135131835938, "logps/chosen": -285.1095275878906, "logps/rejected": -494.31915283203125, "loss": 0.3064, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.3042385578155518, "rewards/margins": 2.1175315380096436, "rewards/rejected": -4.421770095825195, "step": 14930 }, { "epoch": 2.574086836664369, "grad_norm": 48.721923828125, "learning_rate": 3.0078851635946424e-08, "logits/chosen": -1.98525071144104, "logits/rejected": -1.913047432899475, "logps/chosen": -289.99627685546875, "logps/rejected": -458.86444091796875, "loss": 0.3901, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.360745668411255, "rewards/margins": 1.7362349033355713, "rewards/rejected": -4.096980571746826, "step": 14940 }, { "epoch": 2.575809786354238, "grad_norm": 57.920841217041016, "learning_rate": 2.984093891937403e-08, "logits/chosen": -2.0541720390319824, "logits/rejected": -1.9860897064208984, "logps/chosen": -309.0328674316406, "logps/rejected": -479.0060119628906, "loss": 0.4331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5226423740386963, "rewards/margins": 1.7741369009017944, "rewards/rejected": -4.296778678894043, "step": 14950 }, { "epoch": 2.5775327360441076, "grad_norm": 50.547115325927734, "learning_rate": 2.960391111090374e-08, "logits/chosen": -1.9819790124893188, "logits/rejected": -1.9073944091796875, "logps/chosen": -294.6394348144531, "logps/rejected": -469.97271728515625, "loss": 0.3863, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3729491233825684, "rewards/margins": 1.787616491317749, "rewards/rejected": -4.160565376281738, "step": 14960 }, { "epoch": 2.5792556857339766, "grad_norm": 40.986026763916016, "learning_rate": 2.936776916324568e-08, "logits/chosen": -2.018409490585327, "logits/rejected": -1.9384181499481201, "logps/chosen": -280.56317138671875, "logps/rejected": -459.3999938964844, "loss": 0.4342, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.256394863128662, "rewards/margins": 1.8092197179794312, "rewards/rejected": -4.065614700317383, "step": 14970 }, { "epoch": 2.5809786354238455, "grad_norm": 57.03447723388672, "learning_rate": 2.913251402554978e-08, "logits/chosen": -2.016087055206299, "logits/rejected": -1.9302079677581787, "logps/chosen": -309.3193664550781, "logps/rejected": -499.822021484375, "loss": 0.3685, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5437188148498535, "rewards/margins": 1.9490035772323608, "rewards/rejected": -4.492722034454346, "step": 14980 }, { "epoch": 2.582701585113715, "grad_norm": 42.181602478027344, "learning_rate": 2.889814664340137e-08, "logits/chosen": -2.038231372833252, "logits/rejected": -1.9799411296844482, "logps/chosen": -285.73004150390625, "logps/rejected": -475.6513671875, "loss": 0.3499, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3344035148620605, "rewards/margins": 1.9017696380615234, "rewards/rejected": -4.236173152923584, "step": 14990 }, { "epoch": 2.584424534803584, "grad_norm": 75.44369506835938, "learning_rate": 2.8664667958817472e-08, "logits/chosen": -2.037766933441162, "logits/rejected": -1.9752750396728516, "logps/chosen": -285.4435729980469, "logps/rejected": -466.686279296875, "loss": 0.3491, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2875800132751465, "rewards/margins": 1.8457456827163696, "rewards/rejected": -4.133325576782227, "step": 15000 }, { "epoch": 2.586147484493453, "grad_norm": 36.41118240356445, "learning_rate": 2.8432078910243212e-08, "logits/chosen": -2.0314011573791504, "logits/rejected": -1.9651511907577515, "logps/chosen": -293.895263671875, "logps/rejected": -456.9076232910156, "loss": 0.4429, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.380807399749756, "rewards/margins": 1.6586002111434937, "rewards/rejected": -4.039408206939697, "step": 15010 }, { "epoch": 2.587870434183322, "grad_norm": 63.26222229003906, "learning_rate": 2.8200380432547688e-08, "logits/chosen": -1.9638545513153076, "logits/rejected": -1.9137016534805298, "logps/chosen": -293.77520751953125, "logps/rejected": -448.73553466796875, "loss": 0.4648, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.426182985305786, "rewards/margins": 1.5396860837936401, "rewards/rejected": -3.9658684730529785, "step": 15020 }, { "epoch": 2.589593383873191, "grad_norm": 39.986629486083984, "learning_rate": 2.7969573457020724e-08, "logits/chosen": -1.9821510314941406, "logits/rejected": -1.9157823324203491, "logps/chosen": -282.78009033203125, "logps/rejected": -451.91705322265625, "loss": 0.3987, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2739741802215576, "rewards/margins": 1.7043097019195557, "rewards/rejected": -3.978283643722534, "step": 15030 }, { "epoch": 2.59131633356306, "grad_norm": 61.45586013793945, "learning_rate": 2.773965891136859e-08, "logits/chosen": -2.029153823852539, "logits/rejected": -1.95956289768219, "logps/chosen": -274.9366455078125, "logps/rejected": -443.38677978515625, "loss": 0.3757, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.19682240486145, "rewards/margins": 1.72360098361969, "rewards/rejected": -3.920423984527588, "step": 15040 }, { "epoch": 2.5930392832529288, "grad_norm": 48.26288986206055, "learning_rate": 2.7510637719710684e-08, "logits/chosen": -2.020521640777588, "logits/rejected": -1.9446289539337158, "logps/chosen": -283.5299072265625, "logps/rejected": -453.5205993652344, "loss": 0.3651, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2770135402679443, "rewards/margins": 1.7751836776733398, "rewards/rejected": -4.052197456359863, "step": 15050 }, { "epoch": 2.594762232942798, "grad_norm": 68.87848663330078, "learning_rate": 2.7282510802575486e-08, "logits/chosen": -1.9736919403076172, "logits/rejected": -1.908149003982544, "logps/chosen": -284.16107177734375, "logps/rejected": -459.76202392578125, "loss": 0.4144, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2958412170410156, "rewards/margins": 1.7624839544296265, "rewards/rejected": -4.05832576751709, "step": 15060 }, { "epoch": 2.596485182632667, "grad_norm": 34.389869689941406, "learning_rate": 2.7055279076897253e-08, "logits/chosen": -2.04569149017334, "logits/rejected": -1.9712460041046143, "logps/chosen": -274.36444091796875, "logps/rejected": -423.3799743652344, "loss": 0.3925, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.1597518920898438, "rewards/margins": 1.5673410892486572, "rewards/rejected": -3.727092742919922, "step": 15070 }, { "epoch": 2.598208132322536, "grad_norm": 48.54868698120117, "learning_rate": 2.682894345601186e-08, "logits/chosen": -2.0070011615753174, "logits/rejected": -1.9270061254501343, "logps/chosen": -261.50347900390625, "logps/rejected": -458.921875, "loss": 0.3104, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.0696921348571777, "rewards/margins": 1.9860970973968506, "rewards/rejected": -4.055788993835449, "step": 15080 }, { "epoch": 2.599931082012405, "grad_norm": 36.92158508300781, "learning_rate": 2.660350484965354e-08, "logits/chosen": -2.023442029953003, "logits/rejected": -1.9527900218963623, "logps/chosen": -275.90313720703125, "logps/rejected": -440.72698974609375, "loss": 0.3993, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.206986904144287, "rewards/margins": 1.676055669784546, "rewards/rejected": -3.883042573928833, "step": 15090 }, { "epoch": 2.6016540317022745, "grad_norm": 33.21492004394531, "learning_rate": 2.637896416395116e-08, "logits/chosen": -1.960444450378418, "logits/rejected": -1.9132283926010132, "logps/chosen": -283.37994384765625, "logps/rejected": -419.58331298828125, "loss": 0.4616, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3161568641662598, "rewards/margins": 1.379992127418518, "rewards/rejected": -3.6961491107940674, "step": 15100 }, { "epoch": 2.6033769813921435, "grad_norm": 52.18138885498047, "learning_rate": 2.6155322301424087e-08, "logits/chosen": -1.9938548803329468, "logits/rejected": -1.9247496128082275, "logps/chosen": -291.7914733886719, "logps/rejected": -462.2745666503906, "loss": 0.4011, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.3737876415252686, "rewards/margins": 1.7412630319595337, "rewards/rejected": -4.115050792694092, "step": 15110 }, { "epoch": 2.6050999310820124, "grad_norm": 54.9022331237793, "learning_rate": 2.5932580160979323e-08, "logits/chosen": -1.9776744842529297, "logits/rejected": -1.9180479049682617, "logps/chosen": -312.2158203125, "logps/rejected": -470.9625549316406, "loss": 0.3937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5373573303222656, "rewards/margins": 1.614047646522522, "rewards/rejected": -4.15140438079834, "step": 15120 }, { "epoch": 2.6068228807718814, "grad_norm": 63.49612808227539, "learning_rate": 2.5710738637907488e-08, "logits/chosen": -1.9729678630828857, "logits/rejected": -1.8941831588745117, "logps/chosen": -286.8149719238281, "logps/rejected": -460.7655334472656, "loss": 0.3565, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.323557138442993, "rewards/margins": 1.7954384088516235, "rewards/rejected": -4.118995189666748, "step": 15130 }, { "epoch": 2.6085458304617504, "grad_norm": 32.351829528808594, "learning_rate": 2.5489798623879094e-08, "logits/chosen": -1.9975192546844482, "logits/rejected": -1.924696683883667, "logps/chosen": -311.10858154296875, "logps/rejected": -488.335693359375, "loss": 0.3702, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.552527904510498, "rewards/margins": 1.7710415124893188, "rewards/rejected": -4.323569297790527, "step": 15140 }, { "epoch": 2.6102687801516193, "grad_norm": 74.86699676513672, "learning_rate": 2.526976100694117e-08, "logits/chosen": -2.087351083755493, "logits/rejected": -2.0167007446289062, "logps/chosen": -293.3951721191406, "logps/rejected": -455.9125061035156, "loss": 0.4055, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.382277011871338, "rewards/margins": 1.6585884094238281, "rewards/rejected": -4.040865421295166, "step": 15150 }, { "epoch": 2.6119917298414888, "grad_norm": 40.00994873046875, "learning_rate": 2.5050626671513725e-08, "logits/chosen": -1.956843376159668, "logits/rejected": -1.8919804096221924, "logps/chosen": -294.46966552734375, "logps/rejected": -477.08221435546875, "loss": 0.3606, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3869712352752686, "rewards/margins": 1.846509337425232, "rewards/rejected": -4.233480930328369, "step": 15160 }, { "epoch": 2.6137146795313577, "grad_norm": 67.9205322265625, "learning_rate": 2.4832396498386143e-08, "logits/chosen": -1.9408683776855469, "logits/rejected": -1.8818705081939697, "logps/chosen": -284.05255126953125, "logps/rejected": -450.400634765625, "loss": 0.4041, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.306239604949951, "rewards/margins": 1.6681127548217773, "rewards/rejected": -3.9743523597717285, "step": 15170 }, { "epoch": 2.6154376292212267, "grad_norm": 32.40700149536133, "learning_rate": 2.461507136471344e-08, "logits/chosen": -1.9982690811157227, "logits/rejected": -1.9377868175506592, "logps/chosen": -317.1585388183594, "logps/rejected": -490.69659423828125, "loss": 0.4173, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.603140354156494, "rewards/margins": 1.754939317703247, "rewards/rejected": -4.358079433441162, "step": 15180 }, { "epoch": 2.6171605789110957, "grad_norm": 46.89610290527344, "learning_rate": 2.4398652144013098e-08, "logits/chosen": -1.9540035724639893, "logits/rejected": -1.893449068069458, "logps/chosen": -295.5348205566406, "logps/rejected": -474.33709716796875, "loss": 0.3773, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4177746772766113, "rewards/margins": 1.8105154037475586, "rewards/rejected": -4.228289604187012, "step": 15190 }, { "epoch": 2.618883528600965, "grad_norm": 36.42682647705078, "learning_rate": 2.418313970616126e-08, "logits/chosen": -1.9989887475967407, "logits/rejected": -1.927923560142517, "logps/chosen": -317.0546875, "logps/rejected": -481.78875732421875, "loss": 0.3596, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6056201457977295, "rewards/margins": 1.686851143836975, "rewards/rejected": -4.292471885681152, "step": 15200 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.0104026794433594, "eval_logits/rejected": -1.9861952066421509, "eval_logps/chosen": -337.950927734375, "eval_logps/rejected": -396.4674072265625, "eval_loss": 0.7045614719390869, "eval_rewards/accuracies": 0.6359200477600098, "eval_rewards/chosen": -2.7893543243408203, "eval_rewards/margins": 0.547823965549469, "eval_rewards/rejected": -3.3371779918670654, "eval_runtime": 361.97, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 15200 }, { "epoch": 2.620606478290834, "grad_norm": 47.936954498291016, "learning_rate": 2.3968534917389345e-08, "logits/chosen": -1.9200124740600586, "logits/rejected": -1.8329826593399048, "logps/chosen": -305.399169921875, "logps/rejected": -492.1451110839844, "loss": 0.3859, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.486237049102783, "rewards/margins": 1.9213802814483643, "rewards/rejected": -4.40761661529541, "step": 15210 }, { "epoch": 2.622329427980703, "grad_norm": 32.09933853149414, "learning_rate": 2.3754838640280633e-08, "logits/chosen": -1.920479416847229, "logits/rejected": -1.8435475826263428, "logps/chosen": -289.9581298828125, "logps/rejected": -468.51580810546875, "loss": 0.3256, "rewards/accuracies": 0.84375, "rewards/chosen": -2.367870807647705, "rewards/margins": 1.8315532207489014, "rewards/rejected": -4.1994242668151855, "step": 15220 }, { "epoch": 2.624052377670572, "grad_norm": 25.09410285949707, "learning_rate": 2.3542051733766692e-08, "logits/chosen": -1.9608709812164307, "logits/rejected": -1.8966529369354248, "logps/chosen": -302.05987548828125, "logps/rejected": -484.9076232910156, "loss": 0.384, "rewards/accuracies": 0.875, "rewards/chosen": -2.4609055519104004, "rewards/margins": 1.8601592779159546, "rewards/rejected": -4.3210649490356445, "step": 15230 }, { "epoch": 2.625775327360441, "grad_norm": 48.44083786010742, "learning_rate": 2.3330175053123985e-08, "logits/chosen": -1.9625259637832642, "logits/rejected": -1.8818233013153076, "logps/chosen": -297.42742919921875, "logps/rejected": -484.22320556640625, "loss": 0.3624, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.373741388320923, "rewards/margins": 1.9391887187957764, "rewards/rejected": -4.312930107116699, "step": 15240 }, { "epoch": 2.62749827705031, "grad_norm": 47.75816345214844, "learning_rate": 2.3119209449970283e-08, "logits/chosen": -1.9868698120117188, "logits/rejected": -1.921992301940918, "logps/chosen": -311.54779052734375, "logps/rejected": -489.3172302246094, "loss": 0.4283, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5729899406433105, "rewards/margins": 1.7840687036514282, "rewards/rejected": -4.357058525085449, "step": 15250 }, { "epoch": 2.6292212267401793, "grad_norm": 36.276824951171875, "learning_rate": 2.2909155772261517e-08, "logits/chosen": -1.9517484903335571, "logits/rejected": -1.890221357345581, "logps/chosen": -315.72760009765625, "logps/rejected": -476.744140625, "loss": 0.4285, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.602705478668213, "rewards/margins": 1.6705633401870728, "rewards/rejected": -4.273268699645996, "step": 15260 }, { "epoch": 2.6309441764300483, "grad_norm": 42.16166687011719, "learning_rate": 2.2700014864288268e-08, "logits/chosen": -1.991088628768921, "logits/rejected": -1.893536925315857, "logps/chosen": -311.6806640625, "logps/rejected": -515.9600219726562, "loss": 0.3154, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.559229612350464, "rewards/margins": 2.110598087310791, "rewards/rejected": -4.669827461242676, "step": 15270 }, { "epoch": 2.6326671261199173, "grad_norm": 54.88694763183594, "learning_rate": 2.2491787566672164e-08, "logits/chosen": -2.0504777431488037, "logits/rejected": -1.9851185083389282, "logps/chosen": -297.4906311035156, "logps/rejected": -487.5647888183594, "loss": 0.2948, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.437282085418701, "rewards/margins": 1.907504677772522, "rewards/rejected": -4.34478759765625, "step": 15280 }, { "epoch": 2.6343900758097862, "grad_norm": 28.790456771850586, "learning_rate": 2.2284474716362662e-08, "logits/chosen": -1.9894415140151978, "logits/rejected": -1.9120299816131592, "logps/chosen": -303.7039489746094, "logps/rejected": -489.24407958984375, "loss": 0.3769, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4811856746673584, "rewards/margins": 1.9076086282730103, "rewards/rejected": -4.3887939453125, "step": 15290 }, { "epoch": 2.6361130254996556, "grad_norm": 29.120548248291016, "learning_rate": 2.2078077146633843e-08, "logits/chosen": -1.901275634765625, "logits/rejected": -1.8261524438858032, "logps/chosen": -323.650390625, "logps/rejected": -522.5540771484375, "loss": 0.3591, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.707955837249756, "rewards/margins": 2.0309720039367676, "rewards/rejected": -4.73892879486084, "step": 15300 }, { "epoch": 2.6378359751895246, "grad_norm": 26.178735733032227, "learning_rate": 2.1872595687080857e-08, "logits/chosen": -2.0176873207092285, "logits/rejected": -1.9345753192901611, "logps/chosen": -294.57379150390625, "logps/rejected": -493.0152893066406, "loss": 0.3066, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.389258623123169, "rewards/margins": 2.015267848968506, "rewards/rejected": -4.404526710510254, "step": 15310 }, { "epoch": 2.6395589248793936, "grad_norm": 63.48350524902344, "learning_rate": 2.166803116361654e-08, "logits/chosen": -1.9528690576553345, "logits/rejected": -1.8989251852035522, "logps/chosen": -315.02606201171875, "logps/rejected": -496.94329833984375, "loss": 0.3566, "rewards/accuracies": 0.84375, "rewards/chosen": -2.62955379486084, "rewards/margins": 1.8155285120010376, "rewards/rejected": -4.445082664489746, "step": 15320 }, { "epoch": 2.6412818745692626, "grad_norm": 44.775238037109375, "learning_rate": 2.1464384398468233e-08, "logits/chosen": -1.9879424571990967, "logits/rejected": -1.9055492877960205, "logps/chosen": -314.1039733886719, "logps/rejected": -520.6514892578125, "loss": 0.3147, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.575599193572998, "rewards/margins": 2.098741054534912, "rewards/rejected": -4.67434024810791, "step": 15330 }, { "epoch": 2.6430048242591315, "grad_norm": 36.74201202392578, "learning_rate": 2.126165621017456e-08, "logits/chosen": -1.9776026010513306, "logits/rejected": -1.9137685298919678, "logps/chosen": -320.1769714355469, "logps/rejected": -502.0813903808594, "loss": 0.3723, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6508679389953613, "rewards/margins": 1.8282616138458252, "rewards/rejected": -4.479129314422607, "step": 15340 }, { "epoch": 2.6447277739490005, "grad_norm": 64.56641387939453, "learning_rate": 2.1059847413581772e-08, "logits/chosen": -1.9074665307998657, "logits/rejected": -1.8412668704986572, "logps/chosen": -309.04925537109375, "logps/rejected": -509.5458984375, "loss": 0.3589, "rewards/accuracies": 0.84375, "rewards/chosen": -2.540052890777588, "rewards/margins": 2.029656171798706, "rewards/rejected": -4.569709300994873, "step": 15350 }, { "epoch": 2.64645072363887, "grad_norm": 33.855587005615234, "learning_rate": 2.0858958819840954e-08, "logits/chosen": -1.976341962814331, "logits/rejected": -1.9194657802581787, "logps/chosen": -312.1601257324219, "logps/rejected": -474.921142578125, "loss": 0.4474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5892739295959473, "rewards/margins": 1.6147493124008179, "rewards/rejected": -4.2040228843688965, "step": 15360 }, { "epoch": 2.648173673328739, "grad_norm": 46.51362609863281, "learning_rate": 2.065899123640441e-08, "logits/chosen": -1.988711953163147, "logits/rejected": -1.9110978841781616, "logps/chosen": -315.85162353515625, "logps/rejected": -497.49072265625, "loss": 0.3439, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.588850498199463, "rewards/margins": 1.880987524986267, "rewards/rejected": -4.4698381423950195, "step": 15370 }, { "epoch": 2.649896623018608, "grad_norm": 46.66869354248047, "learning_rate": 2.0459945467022566e-08, "logits/chosen": -1.9690208435058594, "logits/rejected": -1.905799150466919, "logps/chosen": -308.9378967285156, "logps/rejected": -484.22845458984375, "loss": 0.3544, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5399415493011475, "rewards/margins": 1.789746642112732, "rewards/rejected": -4.32968807220459, "step": 15380 }, { "epoch": 2.651619572708477, "grad_norm": 26.786161422729492, "learning_rate": 2.026182231174059e-08, "logits/chosen": -2.0500638484954834, "logits/rejected": -1.9859079122543335, "logps/chosen": -300.12054443359375, "logps/rejected": -496.5732421875, "loss": 0.3766, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4338181018829346, "rewards/margins": 1.9835102558135986, "rewards/rejected": -4.417328834533691, "step": 15390 }, { "epoch": 2.6533425223983462, "grad_norm": 38.265380859375, "learning_rate": 2.006462256689545e-08, "logits/chosen": -1.9896526336669922, "logits/rejected": -1.9177395105361938, "logps/chosen": -325.6061096191406, "logps/rejected": -474.498779296875, "loss": 0.4421, "rewards/accuracies": 0.8125, "rewards/chosen": -2.668098211288452, "rewards/margins": 1.5707905292510986, "rewards/rejected": -4.238888740539551, "step": 15400 }, { "epoch": 2.655065472088215, "grad_norm": 32.2272834777832, "learning_rate": 1.9868347025112586e-08, "logits/chosen": -2.0001821517944336, "logits/rejected": -1.9231926202774048, "logps/chosen": -306.1611633300781, "logps/rejected": -451.07904052734375, "loss": 0.4127, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4552395343780518, "rewards/margins": 1.5481938123703003, "rewards/rejected": -4.0034332275390625, "step": 15410 }, { "epoch": 2.656788421778084, "grad_norm": 55.408695220947266, "learning_rate": 1.9672996475302406e-08, "logits/chosen": -1.9853451251983643, "logits/rejected": -1.9185739755630493, "logps/chosen": -293.37335205078125, "logps/rejected": -475.57122802734375, "loss": 0.3565, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.397921562194824, "rewards/margins": 1.83285391330719, "rewards/rejected": -4.230775833129883, "step": 15420 }, { "epoch": 2.658511371467953, "grad_norm": 25.616779327392578, "learning_rate": 1.9478571702657708e-08, "logits/chosen": -2.046208620071411, "logits/rejected": -1.9735801219940186, "logps/chosen": -306.2370910644531, "logps/rejected": -488.9190979003906, "loss": 0.3341, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4653913974761963, "rewards/margins": 1.896561861038208, "rewards/rejected": -4.361952781677246, "step": 15430 }, { "epoch": 2.660234321157822, "grad_norm": 31.727825164794922, "learning_rate": 1.9285073488650135e-08, "logits/chosen": -2.0755953788757324, "logits/rejected": -2.0093483924865723, "logps/chosen": -294.185546875, "logps/rejected": -461.57958984375, "loss": 0.3486, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3863959312438965, "rewards/margins": 1.6984243392944336, "rewards/rejected": -4.084820747375488, "step": 15440 }, { "epoch": 2.661957270847691, "grad_norm": 44.71334457397461, "learning_rate": 1.909250261102696e-08, "logits/chosen": -1.8957074880599976, "logits/rejected": -1.8258901834487915, "logps/chosen": -298.90863037109375, "logps/rejected": -472.1896057128906, "loss": 0.3613, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4581780433654785, "rewards/margins": 1.738215684890747, "rewards/rejected": -4.196393966674805, "step": 15450 }, { "epoch": 2.66368022053756, "grad_norm": 41.83999252319336, "learning_rate": 1.8900859843808402e-08, "logits/chosen": -2.0570788383483887, "logits/rejected": -1.9942249059677124, "logps/chosen": -286.05072021484375, "logps/rejected": -461.48846435546875, "loss": 0.3445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2443838119506836, "rewards/margins": 1.8011665344238281, "rewards/rejected": -4.045550346374512, "step": 15460 }, { "epoch": 2.6654031702274295, "grad_norm": 39.91064453125, "learning_rate": 1.8710145957283957e-08, "logits/chosen": -2.051372766494751, "logits/rejected": -1.9770056009292603, "logps/chosen": -307.9310607910156, "logps/rejected": -494.627685546875, "loss": 0.3204, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4663753509521484, "rewards/margins": 1.9346469640731812, "rewards/rejected": -4.401022434234619, "step": 15470 }, { "epoch": 2.6671261199172984, "grad_norm": 40.35519027709961, "learning_rate": 1.8520361718009774e-08, "logits/chosen": -2.0140302181243896, "logits/rejected": -1.9403234720230103, "logps/chosen": -287.68890380859375, "logps/rejected": -465.5856018066406, "loss": 0.3723, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3101298809051514, "rewards/margins": 1.8339691162109375, "rewards/rejected": -4.14409875869751, "step": 15480 }, { "epoch": 2.6688490696071674, "grad_norm": 48.24130630493164, "learning_rate": 1.8331507888805165e-08, "logits/chosen": -1.9857633113861084, "logits/rejected": -1.9298359155654907, "logps/chosen": -321.91168212890625, "logps/rejected": -480.99383544921875, "loss": 0.4169, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.6629233360290527, "rewards/margins": 1.6324180364608765, "rewards/rejected": -4.295340538024902, "step": 15490 }, { "epoch": 2.670572019297037, "grad_norm": 87.59442138671875, "learning_rate": 1.8143585228750036e-08, "logits/chosen": -1.9466676712036133, "logits/rejected": -1.8974924087524414, "logps/chosen": -336.58917236328125, "logps/rejected": -492.77593994140625, "loss": 0.4327, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.816450595855713, "rewards/margins": 1.571930170059204, "rewards/rejected": -4.388380527496338, "step": 15500 }, { "epoch": 2.6722949689869058, "grad_norm": 35.80043029785156, "learning_rate": 1.7956594493181264e-08, "logits/chosen": -1.9505637884140015, "logits/rejected": -1.8752607107162476, "logps/chosen": -310.6455078125, "logps/rejected": -487.4745178222656, "loss": 0.3805, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5150203704833984, "rewards/margins": 1.8283474445343018, "rewards/rejected": -4.343367576599121, "step": 15510 }, { "epoch": 2.6740179186767747, "grad_norm": 60.143489837646484, "learning_rate": 1.7770536433690086e-08, "logits/chosen": -1.9314775466918945, "logits/rejected": -1.8762843608856201, "logps/chosen": -300.8268737792969, "logps/rejected": -463.8331604003906, "loss": 0.412, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4581704139709473, "rewards/margins": 1.6911847591400146, "rewards/rejected": -4.149355411529541, "step": 15520 }, { "epoch": 2.6757408683666437, "grad_norm": 50.70917892456055, "learning_rate": 1.7585411798118926e-08, "logits/chosen": -1.935359001159668, "logits/rejected": -1.8644733428955078, "logps/chosen": -284.1484375, "logps/rejected": -460.7568359375, "loss": 0.3489, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.292328119277954, "rewards/margins": 1.7874534130096436, "rewards/rejected": -4.079781532287598, "step": 15530 }, { "epoch": 2.6774638180565127, "grad_norm": 58.23405075073242, "learning_rate": 1.7401221330558435e-08, "logits/chosen": -1.9572185277938843, "logits/rejected": -1.8863433599472046, "logps/chosen": -317.5622863769531, "logps/rejected": -488.829345703125, "loss": 0.421, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6421070098876953, "rewards/margins": 1.7365738153457642, "rewards/rejected": -4.37868070602417, "step": 15540 }, { "epoch": 2.6791867677463816, "grad_norm": 62.14585494995117, "learning_rate": 1.7217965771344407e-08, "logits/chosen": -2.035008430480957, "logits/rejected": -1.9723542928695679, "logps/chosen": -328.51690673828125, "logps/rejected": -491.39202880859375, "loss": 0.4349, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.727168321609497, "rewards/margins": 1.6709645986557007, "rewards/rejected": -4.398132801055908, "step": 15550 }, { "epoch": 2.6809097174362506, "grad_norm": 48.07640838623047, "learning_rate": 1.703564585705483e-08, "logits/chosen": -1.9981772899627686, "logits/rejected": -1.9443098306655884, "logps/chosen": -298.8880310058594, "logps/rejected": -466.82830810546875, "loss": 0.4117, "rewards/accuracies": 0.84375, "rewards/chosen": -2.470547914505005, "rewards/margins": 1.6721513271331787, "rewards/rejected": -4.142699241638184, "step": 15560 }, { "epoch": 2.68263266712612, "grad_norm": 35.6458854675293, "learning_rate": 1.6854262320507018e-08, "logits/chosen": -2.0329806804656982, "logits/rejected": -1.96586012840271, "logps/chosen": -314.06500244140625, "logps/rejected": -489.5606994628906, "loss": 0.3775, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6031272411346436, "rewards/margins": 1.7959398031234741, "rewards/rejected": -4.3990678787231445, "step": 15570 }, { "epoch": 2.684355616815989, "grad_norm": 34.99605941772461, "learning_rate": 1.6673815890754672e-08, "logits/chosen": -2.046435594558716, "logits/rejected": -1.9838531017303467, "logps/chosen": -289.421630859375, "logps/rejected": -465.7112731933594, "loss": 0.3331, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.3265817165374756, "rewards/margins": 1.7689015865325928, "rewards/rejected": -4.095483303070068, "step": 15580 }, { "epoch": 2.686078566505858, "grad_norm": 49.486114501953125, "learning_rate": 1.6494307293084736e-08, "logits/chosen": -1.973262071609497, "logits/rejected": -1.9005916118621826, "logps/chosen": -281.6139221191406, "logps/rejected": -458.63372802734375, "loss": 0.359, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.257755756378174, "rewards/margins": 1.791918396949768, "rewards/rejected": -4.049674034118652, "step": 15590 }, { "epoch": 2.687801516195727, "grad_norm": 40.97716522216797, "learning_rate": 1.6315737249014694e-08, "logits/chosen": -2.004505157470703, "logits/rejected": -1.931879997253418, "logps/chosen": -306.7490234375, "logps/rejected": -509.3731994628906, "loss": 0.3549, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5400571823120117, "rewards/margins": 2.0500755310058594, "rewards/rejected": -4.590132713317871, "step": 15600 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.008359909057617, "eval_logits/rejected": -1.9840874671936035, "eval_logps/chosen": -343.37371826171875, "eval_logps/rejected": -402.0517883300781, "eval_loss": 0.7066512107849121, "eval_rewards/accuracies": 0.6310408711433411, "eval_rewards/chosen": -2.8435821533203125, "eval_rewards/margins": 0.5494397282600403, "eval_rewards/rejected": -3.393021821975708, "eval_runtime": 362.5155, "eval_samples_per_second": 11.873, "eval_steps_per_second": 1.484, "step": 15600 }, { "epoch": 2.6895244658855963, "grad_norm": 42.8242301940918, "learning_rate": 1.6138106476289603e-08, "logits/chosen": -2.0315780639648438, "logits/rejected": -1.9709250926971436, "logps/chosen": -311.28887939453125, "logps/rejected": -469.7393493652344, "loss": 0.4056, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.5429322719573975, "rewards/margins": 1.616577386856079, "rewards/rejected": -4.159509181976318, "step": 15610 }, { "epoch": 2.6912474155754653, "grad_norm": 51.03504943847656, "learning_rate": 1.5961415688879293e-08, "logits/chosen": -1.9465405941009521, "logits/rejected": -1.8902467489242554, "logps/chosen": -297.16693115234375, "logps/rejected": -472.02789306640625, "loss": 0.3616, "rewards/accuracies": 0.84375, "rewards/chosen": -2.422400951385498, "rewards/margins": 1.7449815273284912, "rewards/rejected": -4.167382717132568, "step": 15620 }, { "epoch": 2.6929703652653343, "grad_norm": 61.478736877441406, "learning_rate": 1.578566559697522e-08, "logits/chosen": -1.9549518823623657, "logits/rejected": -1.8679726123809814, "logps/chosen": -326.21240234375, "logps/rejected": -516.4959716796875, "loss": 0.3472, "rewards/accuracies": 0.84375, "rewards/chosen": -2.703623056411743, "rewards/margins": 1.9685478210449219, "rewards/rejected": -4.672171115875244, "step": 15630 }, { "epoch": 2.6946933149552033, "grad_norm": 29.764585494995117, "learning_rate": 1.5610856906988018e-08, "logits/chosen": -1.9588758945465088, "logits/rejected": -1.8890784978866577, "logps/chosen": -310.8281555175781, "logps/rejected": -514.0198974609375, "loss": 0.3208, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5701849460601807, "rewards/margins": 2.04536771774292, "rewards/rejected": -4.615551948547363, "step": 15640 }, { "epoch": 2.6964162646450722, "grad_norm": 63.78411102294922, "learning_rate": 1.5436990321544303e-08, "logits/chosen": -1.9298512935638428, "logits/rejected": -1.8583509922027588, "logps/chosen": -315.3456726074219, "logps/rejected": -479.90545654296875, "loss": 0.4222, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6030354499816895, "rewards/margins": 1.6636539697647095, "rewards/rejected": -4.266688823699951, "step": 15650 }, { "epoch": 2.698139214334941, "grad_norm": 30.067846298217773, "learning_rate": 1.5264066539484004e-08, "logits/chosen": -2.006969928741455, "logits/rejected": -1.9370698928833008, "logps/chosen": -323.02423095703125, "logps/rejected": -525.4228515625, "loss": 0.3082, "rewards/accuracies": 0.875, "rewards/chosen": -2.691089153289795, "rewards/margins": 2.0306942462921143, "rewards/rejected": -4.721783638000488, "step": 15660 }, { "epoch": 2.6998621640248106, "grad_norm": 37.487632751464844, "learning_rate": 1.5092086255857645e-08, "logits/chosen": -1.9349817037582397, "logits/rejected": -1.85541570186615, "logps/chosen": -308.1513671875, "logps/rejected": -485.69842529296875, "loss": 0.3421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5293941497802734, "rewards/margins": 1.8190078735351562, "rewards/rejected": -4.348402500152588, "step": 15670 }, { "epoch": 2.7015851137146796, "grad_norm": 38.32218933105469, "learning_rate": 1.4921050161923356e-08, "logits/chosen": -1.9333937168121338, "logits/rejected": -1.8729240894317627, "logps/chosen": -349.242919921875, "logps/rejected": -516.079833984375, "loss": 0.3762, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.950295925140381, "rewards/margins": 1.6847566366195679, "rewards/rejected": -4.635052680969238, "step": 15680 }, { "epoch": 2.7033080634045485, "grad_norm": 32.17233657836914, "learning_rate": 1.4750958945144194e-08, "logits/chosen": -1.973449945449829, "logits/rejected": -1.9072134494781494, "logps/chosen": -308.03533935546875, "logps/rejected": -488.24102783203125, "loss": 0.3335, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5459365844726562, "rewards/margins": 1.8067331314086914, "rewards/rejected": -4.352670192718506, "step": 15690 }, { "epoch": 2.7050310130944175, "grad_norm": 26.034517288208008, "learning_rate": 1.4581813289185369e-08, "logits/chosen": -1.9757862091064453, "logits/rejected": -1.902587890625, "logps/chosen": -323.45733642578125, "logps/rejected": -511.4142150878906, "loss": 0.3792, "rewards/accuracies": 0.84375, "rewards/chosen": -2.701112985610962, "rewards/margins": 1.8938236236572266, "rewards/rejected": -4.594936370849609, "step": 15700 }, { "epoch": 2.706753962784287, "grad_norm": 44.360496520996094, "learning_rate": 1.4413613873911572e-08, "logits/chosen": -1.976714849472046, "logits/rejected": -1.9073139429092407, "logps/chosen": -340.33892822265625, "logps/rejected": -525.4049682617188, "loss": 0.3704, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8148560523986816, "rewards/margins": 1.8850845098495483, "rewards/rejected": -4.699941158294678, "step": 15710 }, { "epoch": 2.708476912474156, "grad_norm": 56.355010986328125, "learning_rate": 1.4246361375384152e-08, "logits/chosen": -1.9749386310577393, "logits/rejected": -1.9251251220703125, "logps/chosen": -324.2084045410156, "logps/rejected": -486.76708984375, "loss": 0.3804, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.7012667655944824, "rewards/margins": 1.646211862564087, "rewards/rejected": -4.34747838973999, "step": 15720 }, { "epoch": 2.710199862164025, "grad_norm": 36.656227111816406, "learning_rate": 1.4080056465858331e-08, "logits/chosen": -2.0380120277404785, "logits/rejected": -1.969276785850525, "logps/chosen": -305.8288879394531, "logps/rejected": -495.70880126953125, "loss": 0.3215, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5178353786468506, "rewards/margins": 1.9159759283065796, "rewards/rejected": -4.433811187744141, "step": 15730 }, { "epoch": 2.711922811853894, "grad_norm": 50.60468673706055, "learning_rate": 1.3914699813780628e-08, "logits/chosen": -1.9742889404296875, "logits/rejected": -1.896368384361267, "logps/chosen": -311.3577575683594, "logps/rejected": -490.9815368652344, "loss": 0.3724, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5623440742492676, "rewards/margins": 1.854272484779358, "rewards/rejected": -4.4166154861450195, "step": 15740 }, { "epoch": 2.713645761543763, "grad_norm": 55.93589782714844, "learning_rate": 1.3750292083786224e-08, "logits/chosen": -1.9888490438461304, "logits/rejected": -1.914433479309082, "logps/chosen": -338.60040283203125, "logps/rejected": -534.7534790039062, "loss": 0.314, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.8561387062072754, "rewards/margins": 1.9475526809692383, "rewards/rejected": -4.8036909103393555, "step": 15750 }, { "epoch": 2.7153687112336318, "grad_norm": 40.64079284667969, "learning_rate": 1.3586833936696046e-08, "logits/chosen": -1.922194242477417, "logits/rejected": -1.8549903631210327, "logps/chosen": -318.5240478515625, "logps/rejected": -488.30499267578125, "loss": 0.4345, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.667625904083252, "rewards/margins": 1.720468282699585, "rewards/rejected": -4.388094425201416, "step": 15760 }, { "epoch": 2.717091660923501, "grad_norm": 29.53965187072754, "learning_rate": 1.3424326029514404e-08, "logits/chosen": -1.9496351480484009, "logits/rejected": -1.8891067504882812, "logps/chosen": -313.3179626464844, "logps/rejected": -490.7872009277344, "loss": 0.33, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5970590114593506, "rewards/margins": 1.7862827777862549, "rewards/rejected": -4.383342266082764, "step": 15770 }, { "epoch": 2.71881461061337, "grad_norm": 39.166690826416016, "learning_rate": 1.3262769015426111e-08, "logits/chosen": -2.0432775020599365, "logits/rejected": -1.981764793395996, "logps/chosen": -339.0775451660156, "logps/rejected": -483.3779296875, "loss": 0.4573, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8457586765289307, "rewards/margins": 1.467932939529419, "rewards/rejected": -4.313692092895508, "step": 15780 }, { "epoch": 2.720537560303239, "grad_norm": 29.134658813476562, "learning_rate": 1.3102163543794008e-08, "logits/chosen": -1.9055366516113281, "logits/rejected": -1.851017713546753, "logps/chosen": -334.7145690917969, "logps/rejected": -514.5390014648438, "loss": 0.3397, "rewards/accuracies": 0.875, "rewards/chosen": -2.8018946647644043, "rewards/margins": 1.8263680934906006, "rewards/rejected": -4.628263473510742, "step": 15790 }, { "epoch": 2.722260509993108, "grad_norm": 26.8823184967041, "learning_rate": 1.2942510260156303e-08, "logits/chosen": -2.0487115383148193, "logits/rejected": -1.9665085077285767, "logps/chosen": -306.05621337890625, "logps/rejected": -481.76434326171875, "loss": 0.3401, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4664268493652344, "rewards/margins": 1.84841787815094, "rewards/rejected": -4.314844131469727, "step": 15800 }, { "epoch": 2.7239834596829775, "grad_norm": 41.069698333740234, "learning_rate": 1.2783809806223928e-08, "logits/chosen": -2.0220227241516113, "logits/rejected": -1.9602558612823486, "logps/chosen": -320.92474365234375, "logps/rejected": -495.7259216308594, "loss": 0.3735, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7198517322540283, "rewards/margins": 1.7194585800170898, "rewards/rejected": -4.439309597015381, "step": 15810 }, { "epoch": 2.7257064093728465, "grad_norm": 59.23585891723633, "learning_rate": 1.2626062819878103e-08, "logits/chosen": -1.923966407775879, "logits/rejected": -1.8586641550064087, "logps/chosen": -324.52850341796875, "logps/rejected": -506.334716796875, "loss": 0.4029, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7219691276550293, "rewards/margins": 1.8301641941070557, "rewards/rejected": -4.552133083343506, "step": 15820 }, { "epoch": 2.7274293590627154, "grad_norm": 53.38178634643555, "learning_rate": 1.2469269935167614e-08, "logits/chosen": -1.9988445043563843, "logits/rejected": -1.9467023611068726, "logps/chosen": -310.7342529296875, "logps/rejected": -479.91351318359375, "loss": 0.4356, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5900216102600098, "rewards/margins": 1.7045097351074219, "rewards/rejected": -4.294531345367432, "step": 15830 }, { "epoch": 2.7291523087525844, "grad_norm": 34.80722427368164, "learning_rate": 1.2313431782306232e-08, "logits/chosen": -2.0116004943847656, "logits/rejected": -1.9453134536743164, "logps/chosen": -338.9144287109375, "logps/rejected": -524.4220581054688, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8447375297546387, "rewards/margins": 1.851933479309082, "rewards/rejected": -4.6966705322265625, "step": 15840 }, { "epoch": 2.7308752584424534, "grad_norm": 77.20761108398438, "learning_rate": 1.2158548987670491e-08, "logits/chosen": -1.984557867050171, "logits/rejected": -1.9149999618530273, "logps/chosen": -303.9657897949219, "logps/rejected": -492.3682556152344, "loss": 0.3934, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4949309825897217, "rewards/margins": 1.8940006494522095, "rewards/rejected": -4.388932228088379, "step": 15850 }, { "epoch": 2.7325982081323223, "grad_norm": 65.79174041748047, "learning_rate": 1.2004622173796802e-08, "logits/chosen": -1.9729435443878174, "logits/rejected": -1.9125111103057861, "logps/chosen": -315.96636962890625, "logps/rejected": -466.82855224609375, "loss": 0.4387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6169323921203613, "rewards/margins": 1.550413966178894, "rewards/rejected": -4.167346000671387, "step": 15860 }, { "epoch": 2.7343211578221913, "grad_norm": 54.52164077758789, "learning_rate": 1.1851651959379095e-08, "logits/chosen": -1.9617595672607422, "logits/rejected": -1.8899450302124023, "logps/chosen": -330.5244445800781, "logps/rejected": -515.6116333007812, "loss": 0.4228, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7475571632385254, "rewards/margins": 1.8913471698760986, "rewards/rejected": -4.638903617858887, "step": 15870 }, { "epoch": 2.7360441075120607, "grad_norm": 27.524450302124023, "learning_rate": 1.1699638959266428e-08, "logits/chosen": -1.9716495275497437, "logits/rejected": -1.9012991189956665, "logps/chosen": -293.0732727050781, "logps/rejected": -496.59295654296875, "loss": 0.2969, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.390104055404663, "rewards/margins": 2.0701675415039062, "rewards/rejected": -4.460270881652832, "step": 15880 }, { "epoch": 2.7377670572019297, "grad_norm": 38.46266174316406, "learning_rate": 1.1548583784460414e-08, "logits/chosen": -2.000511407852173, "logits/rejected": -1.9248682260513306, "logps/chosen": -321.2179260253906, "logps/rejected": -485.4606018066406, "loss": 0.4298, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.683239459991455, "rewards/margins": 1.6916545629501343, "rewards/rejected": -4.374893665313721, "step": 15890 }, { "epoch": 2.7394900068917987, "grad_norm": 44.15073013305664, "learning_rate": 1.1398487042112687e-08, "logits/chosen": -2.0028350353240967, "logits/rejected": -1.9129317998886108, "logps/chosen": -323.5985107421875, "logps/rejected": -502.631103515625, "loss": 0.3663, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6213297843933105, "rewards/margins": 1.8939927816390991, "rewards/rejected": -4.515322685241699, "step": 15900 }, { "epoch": 2.741212956581668, "grad_norm": 41.59025192260742, "learning_rate": 1.1249349335522685e-08, "logits/chosen": -1.9395897388458252, "logits/rejected": -1.8692032098770142, "logps/chosen": -298.6854553222656, "logps/rejected": -477.9593811035156, "loss": 0.3544, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.441694974899292, "rewards/margins": 1.8476743698120117, "rewards/rejected": -4.289369583129883, "step": 15910 }, { "epoch": 2.742935906271537, "grad_norm": 51.42915344238281, "learning_rate": 1.1101171264134957e-08, "logits/chosen": -1.9555208683013916, "logits/rejected": -1.8940417766571045, "logps/chosen": -325.4806213378906, "logps/rejected": -512.5076293945312, "loss": 0.3424, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.720991611480713, "rewards/margins": 1.8687635660171509, "rewards/rejected": -4.589756011962891, "step": 15920 }, { "epoch": 2.744658855961406, "grad_norm": 62.49579620361328, "learning_rate": 1.0953953423537054e-08, "logits/chosen": -1.9099289178848267, "logits/rejected": -1.8272504806518555, "logps/chosen": -290.15130615234375, "logps/rejected": -485.7161560058594, "loss": 0.3237, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3355636596679688, "rewards/margins": 2.0281808376312256, "rewards/rejected": -4.363744735717773, "step": 15930 }, { "epoch": 2.746381805651275, "grad_norm": 29.054019927978516, "learning_rate": 1.0807696405456756e-08, "logits/chosen": -1.9707472324371338, "logits/rejected": -1.8854782581329346, "logps/chosen": -311.0867614746094, "logps/rejected": -519.9143676757812, "loss": 0.3125, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.506115198135376, "rewards/margins": 2.107940196990967, "rewards/rejected": -4.614055156707764, "step": 15940 }, { "epoch": 2.748104755341144, "grad_norm": 57.67142105102539, "learning_rate": 1.0662400797760179e-08, "logits/chosen": -2.029792308807373, "logits/rejected": -1.9601253271102905, "logps/chosen": -300.7933654785156, "logps/rejected": -464.7063903808594, "loss": 0.3856, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.413893222808838, "rewards/margins": 1.7165197134017944, "rewards/rejected": -4.130413055419922, "step": 15950 }, { "epoch": 2.749827705031013, "grad_norm": 24.922483444213867, "learning_rate": 1.0518067184448892e-08, "logits/chosen": -1.9965903759002686, "logits/rejected": -1.9195541143417358, "logps/chosen": -313.47747802734375, "logps/rejected": -498.96514892578125, "loss": 0.3671, "rewards/accuracies": 0.875, "rewards/chosen": -2.56428599357605, "rewards/margins": 1.904685378074646, "rewards/rejected": -4.4689717292785645, "step": 15960 }, { "epoch": 2.751550654720882, "grad_norm": 62.52370071411133, "learning_rate": 1.0374696145657946e-08, "logits/chosen": -1.9834229946136475, "logits/rejected": -1.9151852130889893, "logps/chosen": -301.9158630371094, "logps/rejected": -490.0071716308594, "loss": 0.3563, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4508912563323975, "rewards/margins": 1.8997344970703125, "rewards/rejected": -4.350625991821289, "step": 15970 }, { "epoch": 2.7532736044107513, "grad_norm": 59.35655975341797, "learning_rate": 1.0232288257653433e-08, "logits/chosen": -1.9886503219604492, "logits/rejected": -1.9282948970794678, "logps/chosen": -313.227294921875, "logps/rejected": -482.8558044433594, "loss": 0.3589, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5628468990325928, "rewards/margins": 1.7101532220840454, "rewards/rejected": -4.273000240325928, "step": 15980 }, { "epoch": 2.7549965541006203, "grad_norm": 41.66982650756836, "learning_rate": 1.0090844092830092e-08, "logits/chosen": -2.04725980758667, "logits/rejected": -2.006431818008423, "logps/chosen": -312.2140197753906, "logps/rejected": -475.05078125, "loss": 0.4219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.564270257949829, "rewards/margins": 1.6366828680038452, "rewards/rejected": -4.200953483581543, "step": 15990 }, { "epoch": 2.7567195037904892, "grad_norm": 28.929399490356445, "learning_rate": 9.9503642197091e-09, "logits/chosen": -1.931726098060608, "logits/rejected": -1.8573191165924072, "logps/chosen": -300.25225830078125, "logps/rejected": -496.76251220703125, "loss": 0.2868, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.457338333129883, "rewards/margins": 2.001309633255005, "rewards/rejected": -4.458648204803467, "step": 16000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.0016162395477295, "eval_logits/rejected": -1.9770392179489136, "eval_logps/chosen": -349.65234375, "eval_logps/rejected": -409.4747314453125, "eval_loss": 0.71168053150177, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -2.9063684940338135, "eval_rewards/margins": 0.5608826279640198, "eval_rewards/rejected": -3.4672513008117676, "eval_runtime": 362.0623, "eval_samples_per_second": 11.887, "eval_steps_per_second": 1.486, "step": 16000 }, { "epoch": 2.758442453480358, "grad_norm": 81.58589935302734, "learning_rate": 9.810849202935734e-09, "logits/chosen": -2.00762677192688, "logits/rejected": -1.9473369121551514, "logps/chosen": -317.76190185546875, "logps/rejected": -481.61358642578125, "loss": 0.4362, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6177096366882324, "rewards/margins": 1.673418641090393, "rewards/rejected": -4.291128158569336, "step": 16010 }, { "epoch": 2.7601654031702276, "grad_norm": 43.891441345214844, "learning_rate": 9.672299603277146e-09, "logits/chosen": -2.0296239852905273, "logits/rejected": -1.9573570489883423, "logps/chosen": -313.2237548828125, "logps/rejected": -490.2967224121094, "loss": 0.397, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5886740684509277, "rewards/margins": 1.80803644657135, "rewards/rejected": -4.396710395812988, "step": 16020 }, { "epoch": 2.7618883528600966, "grad_norm": 35.099708557128906, "learning_rate": 9.53471597762015e-09, "logits/chosen": -2.025749921798706, "logits/rejected": -1.9519951343536377, "logps/chosen": -320.3111267089844, "logps/rejected": -504.8877868652344, "loss": 0.3507, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.6667256355285645, "rewards/margins": 1.867027997970581, "rewards/rejected": -4.533753871917725, "step": 16030 }, { "epoch": 2.7636113025499656, "grad_norm": 72.72978973388672, "learning_rate": 9.39809887896878e-09, "logits/chosen": -1.9920387268066406, "logits/rejected": -1.921282172203064, "logps/chosen": -331.725341796875, "logps/rejected": -523.0769653320312, "loss": 0.3386, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.762260913848877, "rewards/margins": 1.9533103704452515, "rewards/rejected": -4.715571403503418, "step": 16040 }, { "epoch": 2.7653342522398345, "grad_norm": 28.8155460357666, "learning_rate": 9.262448856442257e-09, "logits/chosen": -1.9660561084747314, "logits/rejected": -1.8904939889907837, "logps/chosen": -292.36077880859375, "logps/rejected": -495.7724609375, "loss": 0.284, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.3717424869537354, "rewards/margins": 2.0586609840393066, "rewards/rejected": -4.430403232574463, "step": 16050 }, { "epoch": 2.7670572019297035, "grad_norm": 45.704063415527344, "learning_rate": 9.127766455272828e-09, "logits/chosen": -1.9846750497817993, "logits/rejected": -1.9225795269012451, "logps/chosen": -299.59930419921875, "logps/rejected": -471.0708923339844, "loss": 0.3752, "rewards/accuracies": 0.84375, "rewards/chosen": -2.42854380607605, "rewards/margins": 1.7419992685317993, "rewards/rejected": -4.1705427169799805, "step": 16060 }, { "epoch": 2.7687801516195725, "grad_norm": 32.10371780395508, "learning_rate": 8.994052216803355e-09, "logits/chosen": -1.96079421043396, "logits/rejected": -1.8969967365264893, "logps/chosen": -306.35546875, "logps/rejected": -493.9599609375, "loss": 0.3177, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4819493293762207, "rewards/margins": 1.9257476329803467, "rewards/rejected": -4.4076972007751465, "step": 16070 }, { "epoch": 2.770503101309442, "grad_norm": 47.946529388427734, "learning_rate": 8.861306678485364e-09, "logits/chosen": -1.9766671657562256, "logits/rejected": -1.8872219324111938, "logps/chosen": -311.2156066894531, "logps/rejected": -490.8643493652344, "loss": 0.3718, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5303075313568115, "rewards/margins": 1.868847131729126, "rewards/rejected": -4.399155139923096, "step": 16080 }, { "epoch": 2.772226050999311, "grad_norm": 37.69658279418945, "learning_rate": 8.72953037387672e-09, "logits/chosen": -1.989370346069336, "logits/rejected": -1.9152278900146484, "logps/chosen": -303.6719055175781, "logps/rejected": -487.06280517578125, "loss": 0.3528, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.500981569290161, "rewards/margins": 1.8659719228744507, "rewards/rejected": -4.3669538497924805, "step": 16090 }, { "epoch": 2.77394900068918, "grad_norm": 53.79957580566406, "learning_rate": 8.59872383263957e-09, "logits/chosen": -2.005476236343384, "logits/rejected": -1.9398939609527588, "logps/chosen": -307.587646484375, "logps/rejected": -513.4273681640625, "loss": 0.3396, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5385444164276123, "rewards/margins": 2.058758497238159, "rewards/rejected": -4.59730339050293, "step": 16100 }, { "epoch": 2.775671950379049, "grad_norm": 28.924591064453125, "learning_rate": 8.468887580538126e-09, "logits/chosen": -1.9373798370361328, "logits/rejected": -1.8563706874847412, "logps/chosen": -302.4486389160156, "logps/rejected": -502.7521057128906, "loss": 0.3368, "rewards/accuracies": 0.84375, "rewards/chosen": -2.444441795349121, "rewards/margins": 2.0490219593048096, "rewards/rejected": -4.493463516235352, "step": 16110 }, { "epoch": 2.777394900068918, "grad_norm": 59.301788330078125, "learning_rate": 8.340022139436714e-09, "logits/chosen": -1.9959636926651, "logits/rejected": -1.9369560480117798, "logps/chosen": -303.8575744628906, "logps/rejected": -461.7627868652344, "loss": 0.3886, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5287814140319824, "rewards/margins": 1.5682741403579712, "rewards/rejected": -4.097055435180664, "step": 16120 }, { "epoch": 2.779117849758787, "grad_norm": 33.732452392578125, "learning_rate": 8.212128027297594e-09, "logits/chosen": -1.9702991247177124, "logits/rejected": -1.90116286277771, "logps/chosen": -321.61285400390625, "logps/rejected": -505.44622802734375, "loss": 0.356, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.682741165161133, "rewards/margins": 1.8666775226593018, "rewards/rejected": -4.5494184494018555, "step": 16130 }, { "epoch": 2.780840799448656, "grad_norm": 55.63058853149414, "learning_rate": 8.085205758178781e-09, "logits/chosen": -2.0472989082336426, "logits/rejected": -1.9743015766143799, "logps/chosen": -312.7616271972656, "logps/rejected": -483.490234375, "loss": 0.3962, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.590057373046875, "rewards/margins": 1.730506181716919, "rewards/rejected": -4.320563316345215, "step": 16140 }, { "epoch": 2.782563749138525, "grad_norm": 35.45902633666992, "learning_rate": 7.959255842232005e-09, "logits/chosen": -2.029773235321045, "logits/rejected": -1.964609146118164, "logps/chosen": -320.7317810058594, "logps/rejected": -496.0630798339844, "loss": 0.3621, "rewards/accuracies": 0.8125, "rewards/chosen": -2.650829553604126, "rewards/margins": 1.7967488765716553, "rewards/rejected": -4.447578430175781, "step": 16150 }, { "epoch": 2.784286698828394, "grad_norm": 42.601402282714844, "learning_rate": 7.834278785700893e-09, "logits/chosen": -1.980169653892517, "logits/rejected": -1.8956329822540283, "logps/chosen": -315.523681640625, "logps/rejected": -526.1173706054688, "loss": 0.3124, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.607433795928955, "rewards/margins": 2.1239945888519287, "rewards/rejected": -4.731429100036621, "step": 16160 }, { "epoch": 2.786009648518263, "grad_norm": 33.85255813598633, "learning_rate": 7.710275090918622e-09, "logits/chosen": -1.938880205154419, "logits/rejected": -1.8739057779312134, "logps/chosen": -316.03765869140625, "logps/rejected": -484.83453369140625, "loss": 0.3936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.615074396133423, "rewards/margins": 1.7018743753433228, "rewards/rejected": -4.316948413848877, "step": 16170 }, { "epoch": 2.7877325982081325, "grad_norm": 75.35025024414062, "learning_rate": 7.587245256306135e-09, "logits/chosen": -1.979379653930664, "logits/rejected": -1.8880045413970947, "logps/chosen": -324.08807373046875, "logps/rejected": -518.1857299804688, "loss": 0.3798, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.6412007808685303, "rewards/margins": 2.028455972671509, "rewards/rejected": -4.669656753540039, "step": 16180 }, { "epoch": 2.7894555478980014, "grad_norm": 40.81918716430664, "learning_rate": 7.465189776369895e-09, "logits/chosen": -1.9683879613876343, "logits/rejected": -1.901315689086914, "logps/chosen": -310.1624755859375, "logps/rejected": -495.50225830078125, "loss": 0.3829, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.554086923599243, "rewards/margins": 1.8621364831924438, "rewards/rejected": -4.416223049163818, "step": 16190 }, { "epoch": 2.7911784975878704, "grad_norm": 53.49696350097656, "learning_rate": 7.344109141700167e-09, "logits/chosen": -1.952439546585083, "logits/rejected": -1.8729956150054932, "logps/chosen": -301.7851257324219, "logps/rejected": -475.7323303222656, "loss": 0.3665, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4563658237457275, "rewards/margins": 1.8075430393218994, "rewards/rejected": -4.263908386230469, "step": 16200 }, { "epoch": 2.7929014472777394, "grad_norm": 56.92274475097656, "learning_rate": 7.224003838968873e-09, "logits/chosen": -1.896593689918518, "logits/rejected": -1.8335720300674438, "logps/chosen": -313.4895324707031, "logps/rejected": -483.8226013183594, "loss": 0.3777, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5999932289123535, "rewards/margins": 1.7070661783218384, "rewards/rejected": -4.307058811187744, "step": 16210 }, { "epoch": 2.794624396967609, "grad_norm": 21.524486541748047, "learning_rate": 7.104874350927714e-09, "logits/chosen": -1.9821815490722656, "logits/rejected": -1.918572187423706, "logps/chosen": -306.9528503417969, "logps/rejected": -486.61187744140625, "loss": 0.3996, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4908456802368164, "rewards/margins": 1.832363486289978, "rewards/rejected": -4.323209285736084, "step": 16220 }, { "epoch": 2.7963473466574778, "grad_norm": 21.50068473815918, "learning_rate": 6.986721156406139e-09, "logits/chosen": -1.9879566431045532, "logits/rejected": -1.9092060327529907, "logps/chosen": -303.32208251953125, "logps/rejected": -502.4454040527344, "loss": 0.3274, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5111184120178223, "rewards/margins": 2.007546901702881, "rewards/rejected": -4.518664836883545, "step": 16230 }, { "epoch": 2.7980702963473467, "grad_norm": 31.76837158203125, "learning_rate": 6.86954473030954e-09, "logits/chosen": -1.9715988636016846, "logits/rejected": -1.8964351415634155, "logps/chosen": -311.85748291015625, "logps/rejected": -518.4403076171875, "loss": 0.3137, "rewards/accuracies": 0.875, "rewards/chosen": -2.5852322578430176, "rewards/margins": 2.072300672531128, "rewards/rejected": -4.657533168792725, "step": 16240 }, { "epoch": 2.7997932460372157, "grad_norm": 42.55023193359375, "learning_rate": 6.753345543617228e-09, "logits/chosen": -1.9966237545013428, "logits/rejected": -1.9349393844604492, "logps/chosen": -317.23382568359375, "logps/rejected": -478.33282470703125, "loss": 0.4424, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.591723918914795, "rewards/margins": 1.652349829673767, "rewards/rejected": -4.244073867797852, "step": 16250 }, { "epoch": 2.8015161957270847, "grad_norm": 56.26144027709961, "learning_rate": 6.638124063380629e-09, "logits/chosen": -1.9839979410171509, "logits/rejected": -1.9063066244125366, "logps/chosen": -324.0694274902344, "logps/rejected": -522.8606567382812, "loss": 0.3618, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.7059319019317627, "rewards/margins": 1.9750534296035767, "rewards/rejected": -4.680985450744629, "step": 16260 }, { "epoch": 2.8032391454169536, "grad_norm": 35.89350128173828, "learning_rate": 6.523880752721395e-09, "logits/chosen": -1.9857683181762695, "logits/rejected": -1.922458291053772, "logps/chosen": -325.7708435058594, "logps/rejected": -498.18524169921875, "loss": 0.4184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.696904420852661, "rewards/margins": 1.761609435081482, "rewards/rejected": -4.4585137367248535, "step": 16270 }, { "epoch": 2.804962095106823, "grad_norm": 69.16283416748047, "learning_rate": 6.410616070829433e-09, "logits/chosen": -2.0395760536193848, "logits/rejected": -1.9760862588882446, "logps/chosen": -323.95672607421875, "logps/rejected": -483.1488342285156, "loss": 0.4317, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6687064170837402, "rewards/margins": 1.643886923789978, "rewards/rejected": -4.312594413757324, "step": 16280 }, { "epoch": 2.806685044796692, "grad_norm": 64.75214385986328, "learning_rate": 6.298330472961216e-09, "logits/chosen": -1.9958524703979492, "logits/rejected": -1.9377946853637695, "logps/chosen": -303.9469299316406, "logps/rejected": -483.7103576660156, "loss": 0.3951, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5054538249969482, "rewards/margins": 1.7941396236419678, "rewards/rejected": -4.299592971801758, "step": 16290 }, { "epoch": 2.808407994486561, "grad_norm": 41.44542694091797, "learning_rate": 6.187024410437946e-09, "logits/chosen": -2.012407064437866, "logits/rejected": -1.9356353282928467, "logps/chosen": -321.70037841796875, "logps/rejected": -496.36846923828125, "loss": 0.3781, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6674365997314453, "rewards/margins": 1.793129324913025, "rewards/rejected": -4.46056604385376, "step": 16300 }, { "epoch": 2.81013094417643, "grad_norm": 48.67670440673828, "learning_rate": 6.076698330643504e-09, "logits/chosen": -1.9949337244033813, "logits/rejected": -1.9309295415878296, "logps/chosen": -314.4491271972656, "logps/rejected": -479.9866638183594, "loss": 0.4198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.606614589691162, "rewards/margins": 1.649374008178711, "rewards/rejected": -4.255989074707031, "step": 16310 }, { "epoch": 2.8118538938662994, "grad_norm": 60.270835876464844, "learning_rate": 5.967352677022946e-09, "logits/chosen": -1.995179533958435, "logits/rejected": -1.932437539100647, "logps/chosen": -317.7287292480469, "logps/rejected": -508.16717529296875, "loss": 0.3644, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6165366172790527, "rewards/margins": 1.921966791152954, "rewards/rejected": -4.538503646850586, "step": 16320 }, { "epoch": 2.8135768435561683, "grad_norm": 39.4362907409668, "learning_rate": 5.858987889080596e-09, "logits/chosen": -2.008793354034424, "logits/rejected": -1.9383611679077148, "logps/chosen": -293.9007568359375, "logps/rejected": -474.19183349609375, "loss": 0.34, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3466713428497314, "rewards/margins": 1.8633272647857666, "rewards/rejected": -4.20999813079834, "step": 16330 }, { "epoch": 2.8152997932460373, "grad_norm": 19.201900482177734, "learning_rate": 5.751604402378263e-09, "logits/chosen": -1.9500610828399658, "logits/rejected": -1.8847576379776, "logps/chosen": -295.54052734375, "logps/rejected": -483.1204528808594, "loss": 0.3152, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.3934226036071777, "rewards/margins": 1.9237467050552368, "rewards/rejected": -4.317169189453125, "step": 16340 }, { "epoch": 2.8170227429359063, "grad_norm": 27.058698654174805, "learning_rate": 5.645202648533493e-09, "logits/chosen": -2.0627427101135254, "logits/rejected": -1.9843612909317017, "logps/chosen": -302.8964538574219, "logps/rejected": -477.3736267089844, "loss": 0.3442, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4570231437683105, "rewards/margins": 1.780082106590271, "rewards/rejected": -4.237105369567871, "step": 16350 }, { "epoch": 2.8187456926257752, "grad_norm": 47.898555755615234, "learning_rate": 5.5397830552179056e-09, "logits/chosen": -2.016932725906372, "logits/rejected": -1.9358619451522827, "logps/chosen": -293.22796630859375, "logps/rejected": -491.0762634277344, "loss": 0.3147, "rewards/accuracies": 0.875, "rewards/chosen": -2.3937313556671143, "rewards/margins": 2.012429714202881, "rewards/rejected": -4.406161308288574, "step": 16360 }, { "epoch": 2.820468642315644, "grad_norm": 40.3588981628418, "learning_rate": 5.435346046155337e-09, "logits/chosen": -1.975415587425232, "logits/rejected": -1.9143269062042236, "logps/chosen": -314.270751953125, "logps/rejected": -466.89990234375, "loss": 0.4307, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5936217308044434, "rewards/margins": 1.581056833267212, "rewards/rejected": -4.174678325653076, "step": 16370 }, { "epoch": 2.822191592005513, "grad_norm": 62.21453094482422, "learning_rate": 5.331892041120279e-09, "logits/chosen": -2.0118536949157715, "logits/rejected": -1.9477832317352295, "logps/chosen": -302.5957336425781, "logps/rejected": -482.06884765625, "loss": 0.3724, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4717679023742676, "rewards/margins": 1.8051445484161377, "rewards/rejected": -4.276912689208984, "step": 16380 }, { "epoch": 2.8239145416953826, "grad_norm": 42.925445556640625, "learning_rate": 5.229421455936106e-09, "logits/chosen": -1.9326406717300415, "logits/rejected": -1.8734573125839233, "logps/chosen": -285.70660400390625, "logps/rejected": -454.9163513183594, "loss": 0.4138, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.316829204559326, "rewards/margins": 1.7159194946289062, "rewards/rejected": -4.032748699188232, "step": 16390 }, { "epoch": 2.8256374913852516, "grad_norm": 34.639102935791016, "learning_rate": 5.127934702473524e-09, "logits/chosen": -1.9523178339004517, "logits/rejected": -1.8782927989959717, "logps/chosen": -295.1117858886719, "logps/rejected": -494.18194580078125, "loss": 0.3243, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.385686159133911, "rewards/margins": 2.016838788986206, "rewards/rejected": -4.402524948120117, "step": 16400 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.008486270904541, "eval_logits/rejected": -1.9840666055679321, "eval_logps/chosen": -342.5143127441406, "eval_logps/rejected": -401.57861328125, "eval_loss": 0.7086198925971985, "eval_rewards/accuracies": 0.6319702863693237, "eval_rewards/chosen": -2.834988594055176, "eval_rewards/margins": 0.5533016324043274, "eval_rewards/rejected": -3.3882901668548584, "eval_runtime": 362.0433, "eval_samples_per_second": 11.888, "eval_steps_per_second": 1.486, "step": 16400 }, { "epoch": 2.8273604410751205, "grad_norm": 43.376346588134766, "learning_rate": 5.027432188648678e-09, "logits/chosen": -2.0023369789123535, "logits/rejected": -1.909185767173767, "logps/chosen": -312.04986572265625, "logps/rejected": -512.2869873046875, "loss": 0.3072, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.559176206588745, "rewards/margins": 2.088606357574463, "rewards/rejected": -4.647782802581787, "step": 16410 }, { "epoch": 2.82908339076499, "grad_norm": 48.757354736328125, "learning_rate": 4.927914318421711e-09, "logits/chosen": -1.896371841430664, "logits/rejected": -1.838361144065857, "logps/chosen": -315.489990234375, "logps/rejected": -470.5518493652344, "loss": 0.4172, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.602351665496826, "rewards/margins": 1.540833830833435, "rewards/rejected": -4.143185615539551, "step": 16420 }, { "epoch": 2.830806340454859, "grad_norm": 62.68126678466797, "learning_rate": 4.829381491795126e-09, "logits/chosen": -1.9145017862319946, "logits/rejected": -1.8501514196395874, "logps/chosen": -312.2740173339844, "logps/rejected": -505.00457763671875, "loss": 0.3148, "rewards/accuracies": 0.875, "rewards/chosen": -2.5775694847106934, "rewards/margins": 1.9415448904037476, "rewards/rejected": -4.5191144943237305, "step": 16430 }, { "epoch": 2.832529290144728, "grad_norm": 29.35466194152832, "learning_rate": 4.731834104812149e-09, "logits/chosen": -1.962580680847168, "logits/rejected": -1.886800765991211, "logps/chosen": -297.8292541503906, "logps/rejected": -497.9293518066406, "loss": 0.3339, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4581551551818848, "rewards/margins": 2.0110676288604736, "rewards/rejected": -4.469222545623779, "step": 16440 }, { "epoch": 2.834252239834597, "grad_norm": 47.078460693359375, "learning_rate": 4.635272549555036e-09, "logits/chosen": -2.0226187705993652, "logits/rejected": -1.942291259765625, "logps/chosen": -304.83380126953125, "logps/rejected": -507.748291015625, "loss": 0.2931, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.464683771133423, "rewards/margins": 2.0731606483459473, "rewards/rejected": -4.537844181060791, "step": 16450 }, { "epoch": 2.835975189524466, "grad_norm": 43.73933029174805, "learning_rate": 4.539697214143656e-09, "logits/chosen": -1.8991577625274658, "logits/rejected": -1.8499176502227783, "logps/chosen": -296.8829040527344, "logps/rejected": -468.86474609375, "loss": 0.4119, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.472661256790161, "rewards/margins": 1.719536542892456, "rewards/rejected": -4.192197799682617, "step": 16460 }, { "epoch": 2.837698139214335, "grad_norm": 48.004390716552734, "learning_rate": 4.445108482733828e-09, "logits/chosen": -2.0137686729431152, "logits/rejected": -1.933941125869751, "logps/chosen": -304.0889587402344, "logps/rejected": -515.68359375, "loss": 0.3317, "rewards/accuracies": 0.875, "rewards/chosen": -2.5189285278320312, "rewards/margins": 2.1211273670196533, "rewards/rejected": -4.6400556564331055, "step": 16470 }, { "epoch": 2.8394210889042037, "grad_norm": 28.406702041625977, "learning_rate": 4.351506735515875e-09, "logits/chosen": -1.9296886920928955, "logits/rejected": -1.8545078039169312, "logps/chosen": -297.65252685546875, "logps/rejected": -497.1302185058594, "loss": 0.3069, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.441197156906128, "rewards/margins": 1.9969764947891235, "rewards/rejected": -4.438174247741699, "step": 16480 }, { "epoch": 2.841144038594073, "grad_norm": 47.717472076416016, "learning_rate": 4.258892348712989e-09, "logits/chosen": -1.9757184982299805, "logits/rejected": -1.9167261123657227, "logps/chosen": -308.49603271484375, "logps/rejected": -456.364990234375, "loss": 0.4019, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.556999444961548, "rewards/margins": 1.4968293905258179, "rewards/rejected": -4.053828716278076, "step": 16490 }, { "epoch": 2.842866988283942, "grad_norm": 33.50767135620117, "learning_rate": 4.1672656945796746e-09, "logits/chosen": -2.0055363178253174, "logits/rejected": -1.9190800189971924, "logps/chosen": -300.1772155761719, "logps/rejected": -456.3914489746094, "loss": 0.356, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4092912673950195, "rewards/margins": 1.6647533178329468, "rewards/rejected": -4.074044227600098, "step": 16500 }, { "epoch": 2.844589937973811, "grad_norm": 54.51416015625, "learning_rate": 4.076627141400557e-09, "logits/chosen": -2.012219190597534, "logits/rejected": -1.9525381326675415, "logps/chosen": -299.2861022949219, "logps/rejected": -469.740234375, "loss": 0.3489, "rewards/accuracies": 0.875, "rewards/chosen": -2.4150969982147217, "rewards/margins": 1.7358524799346924, "rewards/rejected": -4.150949954986572, "step": 16510 }, { "epoch": 2.84631288766368, "grad_norm": 34.60490798950195, "learning_rate": 3.986977053488439e-09, "logits/chosen": -2.0472934246063232, "logits/rejected": -1.9639832973480225, "logps/chosen": -300.44427490234375, "logps/rejected": -517.4644775390625, "loss": 0.2741, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.433382511138916, "rewards/margins": 2.219550132751465, "rewards/rejected": -4.652932167053223, "step": 16520 }, { "epoch": 2.8480358373535495, "grad_norm": 69.30424499511719, "learning_rate": 3.898315791183243e-09, "logits/chosen": -1.9323632717132568, "logits/rejected": -1.8719297647476196, "logps/chosen": -317.5927429199219, "logps/rejected": -465.6849670410156, "loss": 0.3909, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6075639724731445, "rewards/margins": 1.530882477760315, "rewards/rejected": -4.138446807861328, "step": 16530 }, { "epoch": 2.8497587870434185, "grad_norm": 51.688724517822266, "learning_rate": 3.810643710850381e-09, "logits/chosen": -1.9560706615447998, "logits/rejected": -1.8844619989395142, "logps/chosen": -325.9396057128906, "logps/rejected": -501.45703125, "loss": 0.4425, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.724729061126709, "rewards/margins": 1.792863130569458, "rewards/rejected": -4.517591953277588, "step": 16540 }, { "epoch": 2.8514817367332874, "grad_norm": 42.67374038696289, "learning_rate": 3.7239611648793025e-09, "logits/chosen": -1.9616479873657227, "logits/rejected": -1.8878576755523682, "logps/chosen": -333.8500061035156, "logps/rejected": -536.5110473632812, "loss": 0.36, "rewards/accuracies": 0.84375, "rewards/chosen": -2.792130947113037, "rewards/margins": 2.0398573875427246, "rewards/rejected": -4.8319878578186035, "step": 16550 }, { "epoch": 2.8532046864231564, "grad_norm": 42.05401611328125, "learning_rate": 3.6382685016821123e-09, "logits/chosen": -1.9521152973175049, "logits/rejected": -1.8853868246078491, "logps/chosen": -306.54620361328125, "logps/rejected": -492.77752685546875, "loss": 0.3603, "rewards/accuracies": 0.84375, "rewards/chosen": -2.510697364807129, "rewards/margins": 1.8570754528045654, "rewards/rejected": -4.367773056030273, "step": 16560 }, { "epoch": 2.8549276361130254, "grad_norm": 36.88031768798828, "learning_rate": 3.553566065692237e-09, "logits/chosen": -2.013857364654541, "logits/rejected": -1.9316978454589844, "logps/chosen": -305.222412109375, "logps/rejected": -489.1131896972656, "loss": 0.3118, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.505892753601074, "rewards/margins": 1.8779538869857788, "rewards/rejected": -4.383846759796143, "step": 16570 }, { "epoch": 2.8566505858028943, "grad_norm": 50.863494873046875, "learning_rate": 3.4698541973629536e-09, "logits/chosen": -1.9432636499404907, "logits/rejected": -1.8766043186187744, "logps/chosen": -305.536376953125, "logps/rejected": -479.81134033203125, "loss": 0.4423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.536360263824463, "rewards/margins": 1.742417573928833, "rewards/rejected": -4.278777599334717, "step": 16580 }, { "epoch": 2.8583735354927637, "grad_norm": 64.90330505371094, "learning_rate": 3.387133233165973e-09, "logits/chosen": -1.9309171438217163, "logits/rejected": -1.8621307611465454, "logps/chosen": -319.0091857910156, "logps/rejected": -479.45745849609375, "loss": 0.4137, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6426944732666016, "rewards/margins": 1.6571133136749268, "rewards/rejected": -4.299807548522949, "step": 16590 }, { "epoch": 2.8600964851826327, "grad_norm": 64.63720703125, "learning_rate": 3.305403505590276e-09, "logits/chosen": -1.9574962854385376, "logits/rejected": -1.8995040655136108, "logps/chosen": -334.0093994140625, "logps/rejected": -505.06158447265625, "loss": 0.4287, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.7632877826690674, "rewards/margins": 1.7428205013275146, "rewards/rejected": -4.506108283996582, "step": 16600 }, { "epoch": 2.8618194348725017, "grad_norm": 41.6068000793457, "learning_rate": 3.2246653431406424e-09, "logits/chosen": -1.9588810205459595, "logits/rejected": -1.8665924072265625, "logps/chosen": -323.65167236328125, "logps/rejected": -523.3704833984375, "loss": 0.3616, "rewards/accuracies": 0.8125, "rewards/chosen": -2.677800416946411, "rewards/margins": 2.0320656299591064, "rewards/rejected": -4.709865570068359, "step": 16610 }, { "epoch": 2.8635423845623706, "grad_norm": 28.82052993774414, "learning_rate": 3.1449190703362606e-09, "logits/chosen": -1.9433845281600952, "logits/rejected": -1.8670654296875, "logps/chosen": -308.9308776855469, "logps/rejected": -501.6695861816406, "loss": 0.3557, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5142550468444824, "rewards/margins": 1.9610239267349243, "rewards/rejected": -4.475279808044434, "step": 16620 }, { "epoch": 2.86526533425224, "grad_norm": 59.52485656738281, "learning_rate": 3.066165007709648e-09, "logits/chosen": -1.9474287033081055, "logits/rejected": -1.8773529529571533, "logps/chosen": -307.9587707519531, "logps/rejected": -491.36175537109375, "loss": 0.3799, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.450289011001587, "rewards/margins": 1.9436753988265991, "rewards/rejected": -4.3939642906188965, "step": 16630 }, { "epoch": 2.866988283942109, "grad_norm": 50.3684196472168, "learning_rate": 2.9884034718050955e-09, "logits/chosen": -1.9604854583740234, "logits/rejected": -1.8742080926895142, "logps/chosen": -337.62518310546875, "logps/rejected": -530.4407348632812, "loss": 0.3574, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.8089358806610107, "rewards/margins": 1.9887984991073608, "rewards/rejected": -4.79773473739624, "step": 16640 }, { "epoch": 2.868711233631978, "grad_norm": 44.950687408447266, "learning_rate": 2.9116347751776404e-09, "logits/chosen": -1.94940185546875, "logits/rejected": -1.8784650564193726, "logps/chosen": -295.203857421875, "logps/rejected": -489.9127502441406, "loss": 0.3111, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.394976854324341, "rewards/margins": 1.9954516887664795, "rewards/rejected": -4.39042854309082, "step": 16650 }, { "epoch": 2.870434183321847, "grad_norm": 68.06328582763672, "learning_rate": 2.8358592263916238e-09, "logits/chosen": -1.9676002264022827, "logits/rejected": -1.9088243246078491, "logps/chosen": -314.5768127441406, "logps/rejected": -508.9937438964844, "loss": 0.3551, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.598508596420288, "rewards/margins": 1.9561907052993774, "rewards/rejected": -4.554699897766113, "step": 16660 }, { "epoch": 2.872157133011716, "grad_norm": 32.52412796020508, "learning_rate": 2.761077130019579e-09, "logits/chosen": -2.0002925395965576, "logits/rejected": -1.933781623840332, "logps/chosen": -288.32470703125, "logps/rejected": -477.91400146484375, "loss": 0.3317, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.351802349090576, "rewards/margins": 1.9073355197906494, "rewards/rejected": -4.259138107299805, "step": 16670 }, { "epoch": 2.873880082701585, "grad_norm": 46.96864318847656, "learning_rate": 2.6872887866408732e-09, "logits/chosen": -1.9223473072052002, "logits/rejected": -1.860379934310913, "logps/chosen": -296.52386474609375, "logps/rejected": -470.6809997558594, "loss": 0.3683, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.468095541000366, "rewards/margins": 1.752500295639038, "rewards/rejected": -4.2205963134765625, "step": 16680 }, { "epoch": 2.8756030323914543, "grad_norm": 39.60722351074219, "learning_rate": 2.6144944928407075e-09, "logits/chosen": -1.9830448627471924, "logits/rejected": -1.9252612590789795, "logps/chosen": -302.50213623046875, "logps/rejected": -471.08660888671875, "loss": 0.3646, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.4704606533050537, "rewards/margins": 1.7122749090194702, "rewards/rejected": -4.182735443115234, "step": 16690 }, { "epoch": 2.8773259820813233, "grad_norm": 70.59654235839844, "learning_rate": 2.5426945412086453e-09, "logits/chosen": -2.066020965576172, "logits/rejected": -2.0007615089416504, "logps/chosen": -312.9866027832031, "logps/rejected": -481.12127685546875, "loss": 0.4515, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5725064277648926, "rewards/margins": 1.7103774547576904, "rewards/rejected": -4.28288459777832, "step": 16700 }, { "epoch": 2.8790489317711923, "grad_norm": 67.25102996826172, "learning_rate": 2.471889220337725e-09, "logits/chosen": -1.9627320766448975, "logits/rejected": -1.8977569341659546, "logps/chosen": -318.37371826171875, "logps/rejected": -490.7613830566406, "loss": 0.4356, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.6620755195617676, "rewards/margins": 1.7478606700897217, "rewards/rejected": -4.409936428070068, "step": 16710 }, { "epoch": 2.8807718814610612, "grad_norm": 35.83496856689453, "learning_rate": 2.402078814823072e-09, "logits/chosen": -1.9355350732803345, "logits/rejected": -1.8637994527816772, "logps/chosen": -319.97760009765625, "logps/rejected": -495.446533203125, "loss": 0.3536, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6449217796325684, "rewards/margins": 1.8294874429702759, "rewards/rejected": -4.474409580230713, "step": 16720 }, { "epoch": 2.8824948311509306, "grad_norm": 31.255884170532227, "learning_rate": 2.3332636052608714e-09, "logits/chosen": -1.9555209875106812, "logits/rejected": -1.885491967201233, "logps/chosen": -291.7463073730469, "logps/rejected": -487.203125, "loss": 0.3363, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.392470121383667, "rewards/margins": 1.9736398458480835, "rewards/rejected": -4.366110324859619, "step": 16730 }, { "epoch": 2.8842177808407996, "grad_norm": 44.37505340576172, "learning_rate": 2.26544386824723e-09, "logits/chosen": -1.9382679462432861, "logits/rejected": -1.8725357055664062, "logps/chosen": -302.8109436035156, "logps/rejected": -457.8538513183594, "loss": 0.4091, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4617977142333984, "rewards/margins": 1.5926666259765625, "rewards/rejected": -4.054464340209961, "step": 16740 }, { "epoch": 2.8859407305306686, "grad_norm": 32.24553298950195, "learning_rate": 2.198619876377067e-09, "logits/chosen": -2.0115573406219482, "logits/rejected": -1.9303081035614014, "logps/chosen": -306.3038635253906, "logps/rejected": -537.3651123046875, "loss": 0.2954, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.5131142139434814, "rewards/margins": 2.287555456161499, "rewards/rejected": -4.800669193267822, "step": 16750 }, { "epoch": 2.8876636802205375, "grad_norm": 44.070831298828125, "learning_rate": 2.1327918982428916e-09, "logits/chosen": -1.9359534978866577, "logits/rejected": -1.8707046508789062, "logps/chosen": -310.8444519042969, "logps/rejected": -490.2184143066406, "loss": 0.348, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.550225019454956, "rewards/margins": 1.8353372812271118, "rewards/rejected": -4.385562419891357, "step": 16760 }, { "epoch": 2.8893866299104065, "grad_norm": 43.847412109375, "learning_rate": 2.0679601984339713e-09, "logits/chosen": -2.0291895866394043, "logits/rejected": -1.9430195093154907, "logps/chosen": -312.1524353027344, "logps/rejected": -501.88238525390625, "loss": 0.3737, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5466151237487793, "rewards/margins": 1.9837877750396729, "rewards/rejected": -4.530402183532715, "step": 16770 }, { "epoch": 2.8911095796002755, "grad_norm": 42.16927719116211, "learning_rate": 2.004125037535054e-09, "logits/chosen": -1.9637123346328735, "logits/rejected": -1.900246262550354, "logps/chosen": -319.84100341796875, "logps/rejected": -487.20819091796875, "loss": 0.388, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6435866355895996, "rewards/margins": 1.6579605340957642, "rewards/rejected": -4.301547050476074, "step": 16780 }, { "epoch": 2.892832529290145, "grad_norm": 49.28826904296875, "learning_rate": 1.9412866721253974e-09, "logits/chosen": -1.9904868602752686, "logits/rejected": -1.8965473175048828, "logps/chosen": -301.068603515625, "logps/rejected": -472.2461853027344, "loss": 0.4269, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4315168857574463, "rewards/margins": 1.7929401397705078, "rewards/rejected": -4.224457263946533, "step": 16790 }, { "epoch": 2.894555478980014, "grad_norm": 56.4819450378418, "learning_rate": 1.87944535477777e-09, "logits/chosen": -2.0147433280944824, "logits/rejected": -1.9574966430664062, "logps/chosen": -320.1642150878906, "logps/rejected": -481.4913635253906, "loss": 0.3963, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6967387199401855, "rewards/margins": 1.6126458644866943, "rewards/rejected": -4.309384346008301, "step": 16800 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.0081090927124023, "eval_logits/rejected": -1.9834911823272705, "eval_logps/chosen": -345.49188232421875, "eval_logps/rejected": -404.8013916015625, "eval_loss": 0.7103734612464905, "eval_rewards/accuracies": 0.6301115155220032, "eval_rewards/chosen": -2.8647639751434326, "eval_rewards/margins": 0.5557535886764526, "eval_rewards/rejected": -3.420517683029175, "eval_runtime": 361.5732, "eval_samples_per_second": 11.904, "eval_steps_per_second": 1.488, "step": 16800 }, { "epoch": 2.896278428669883, "grad_norm": 52.320068359375, "learning_rate": 1.8186013340573404e-09, "logits/chosen": -1.9541130065917969, "logits/rejected": -1.8771181106567383, "logps/chosen": -291.00482177734375, "logps/rejected": -482.44000244140625, "loss": 0.3611, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3701610565185547, "rewards/margins": 1.969092607498169, "rewards/rejected": -4.3392534255981445, "step": 16810 }, { "epoch": 2.898001378359752, "grad_norm": 47.22905731201172, "learning_rate": 1.7587548545208442e-09, "logits/chosen": -1.989022970199585, "logits/rejected": -1.9109958410263062, "logps/chosen": -310.2891540527344, "logps/rejected": -472.656005859375, "loss": 0.3874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.541001081466675, "rewards/margins": 1.6712042093276978, "rewards/rejected": -4.21220588684082, "step": 16820 }, { "epoch": 2.899724328049621, "grad_norm": 47.421958923339844, "learning_rate": 1.6999061567153638e-09, "logits/chosen": -2.0461344718933105, "logits/rejected": -1.9658887386322021, "logps/chosen": -330.6890563964844, "logps/rejected": -540.8363037109375, "loss": 0.3554, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.750687837600708, "rewards/margins": 2.116511583328247, "rewards/rejected": -4.867198944091797, "step": 16830 }, { "epoch": 2.90144727773949, "grad_norm": 75.02934265136719, "learning_rate": 1.6420554771775786e-09, "logits/chosen": -1.9625543355941772, "logits/rejected": -1.8915119171142578, "logps/chosen": -306.87060546875, "logps/rejected": -479.26080322265625, "loss": 0.353, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.5282883644104004, "rewards/margins": 1.7588777542114258, "rewards/rejected": -4.287166118621826, "step": 16840 }, { "epoch": 2.903170227429359, "grad_norm": 46.775604248046875, "learning_rate": 1.5852030484327372e-09, "logits/chosen": -2.001521348953247, "logits/rejected": -1.933220624923706, "logps/chosen": -313.8436584472656, "logps/rejected": -490.5022888183594, "loss": 0.3136, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.573374032974243, "rewards/margins": 1.8404076099395752, "rewards/rejected": -4.413781642913818, "step": 16850 }, { "epoch": 2.904893177119228, "grad_norm": 41.85394287109375, "learning_rate": 1.5293490989936874e-09, "logits/chosen": -2.0023932456970215, "logits/rejected": -1.91958487033844, "logps/chosen": -310.11151123046875, "logps/rejected": -484.804931640625, "loss": 0.3642, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.537031650543213, "rewards/margins": 1.8104941844940186, "rewards/rejected": -4.347525596618652, "step": 16860 }, { "epoch": 2.906616126809097, "grad_norm": 72.52410888671875, "learning_rate": 1.4744938533599594e-09, "logits/chosen": -1.9951210021972656, "logits/rejected": -1.9260365962982178, "logps/chosen": -307.95196533203125, "logps/rejected": -473.8611755371094, "loss": 0.4291, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.532139301300049, "rewards/margins": 1.7052303552627563, "rewards/rejected": -4.237369537353516, "step": 16870 }, { "epoch": 2.908339076498966, "grad_norm": 40.3792724609375, "learning_rate": 1.4206375320169328e-09, "logits/chosen": -2.002443313598633, "logits/rejected": -1.9358123540878296, "logps/chosen": -307.26422119140625, "logps/rejected": -489.0497131347656, "loss": 0.3624, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5680949687957764, "rewards/margins": 1.8095386028289795, "rewards/rejected": -4.377633571624756, "step": 16880 }, { "epoch": 2.910062026188835, "grad_norm": 43.24536895751953, "learning_rate": 1.3677803514349217e-09, "logits/chosen": -1.9354703426361084, "logits/rejected": -1.8794496059417725, "logps/chosen": -334.2705993652344, "logps/rejected": -511.354248046875, "loss": 0.3672, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.7884416580200195, "rewards/margins": 1.793386459350586, "rewards/rejected": -4.581827640533447, "step": 16890 }, { "epoch": 2.9117849758787044, "grad_norm": 55.82430648803711, "learning_rate": 1.315922524068258e-09, "logits/chosen": -2.0079846382141113, "logits/rejected": -1.9507519006729126, "logps/chosen": -306.2540283203125, "logps/rejected": -466.34576416015625, "loss": 0.3871, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.5238380432128906, "rewards/margins": 1.6495411396026611, "rewards/rejected": -4.173378944396973, "step": 16900 }, { "epoch": 2.9135079255685734, "grad_norm": 34.92049789428711, "learning_rate": 1.2650642583544312e-09, "logits/chosen": -2.0201222896575928, "logits/rejected": -1.9454004764556885, "logps/chosen": -311.7602233886719, "logps/rejected": -509.61407470703125, "loss": 0.3689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5929977893829346, "rewards/margins": 1.9696071147918701, "rewards/rejected": -4.562604904174805, "step": 16910 }, { "epoch": 2.9152308752584424, "grad_norm": 38.00172424316406, "learning_rate": 1.2152057587133668e-09, "logits/chosen": -1.980713129043579, "logits/rejected": -1.9206430912017822, "logps/chosen": -313.1141357421875, "logps/rejected": -477.897216796875, "loss": 0.3751, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.573094606399536, "rewards/margins": 1.6757402420043945, "rewards/rejected": -4.248834609985352, "step": 16920 }, { "epoch": 2.9169538249483113, "grad_norm": 43.91480255126953, "learning_rate": 1.1663472255464824e-09, "logits/chosen": -2.0166168212890625, "logits/rejected": -1.9619123935699463, "logps/chosen": -301.18402099609375, "logps/rejected": -455.1163635253906, "loss": 0.4211, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.437309980392456, "rewards/margins": 1.6082134246826172, "rewards/rejected": -4.045523643493652, "step": 16930 }, { "epoch": 2.9186767746381808, "grad_norm": 37.877899169921875, "learning_rate": 1.1184888552359385e-09, "logits/chosen": -1.9701398611068726, "logits/rejected": -1.9095942974090576, "logps/chosen": -311.5226135253906, "logps/rejected": -466.55084228515625, "loss": 0.4453, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.520601272583008, "rewards/margins": 1.6050732135772705, "rewards/rejected": -4.125674724578857, "step": 16940 }, { "epoch": 2.9203997243280497, "grad_norm": 77.34817504882812, "learning_rate": 1.0716308401438334e-09, "logits/chosen": -1.9589197635650635, "logits/rejected": -1.9072033166885376, "logps/chosen": -321.0328063964844, "logps/rejected": -497.5147399902344, "loss": 0.3983, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.699627161026001, "rewards/margins": 1.7325799465179443, "rewards/rejected": -4.432206630706787, "step": 16950 }, { "epoch": 2.9221226740179187, "grad_norm": 57.7326774597168, "learning_rate": 1.0257733686114545e-09, "logits/chosen": -1.9648206233978271, "logits/rejected": -1.8897243738174438, "logps/chosen": -303.43902587890625, "logps/rejected": -513.9765625, "loss": 0.3355, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.47377347946167, "rewards/margins": 2.1220626831054688, "rewards/rejected": -4.595836639404297, "step": 16960 }, { "epoch": 2.9238456237077877, "grad_norm": 70.4461441040039, "learning_rate": 9.809166249584721e-10, "logits/chosen": -2.0406229496002197, "logits/rejected": -1.9712715148925781, "logps/chosen": -327.0880432128906, "logps/rejected": -501.34478759765625, "loss": 0.3643, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7061314582824707, "rewards/margins": 1.7797197103500366, "rewards/rejected": -4.485851287841797, "step": 16970 }, { "epoch": 2.9255685733976566, "grad_norm": 46.693321228027344, "learning_rate": 9.370607894822468e-10, "logits/chosen": -2.0040597915649414, "logits/rejected": -1.9270622730255127, "logps/chosen": -300.2549133300781, "logps/rejected": -477.6903381347656, "loss": 0.3427, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4538629055023193, "rewards/margins": 1.8043968677520752, "rewards/rejected": -4.2582597732543945, "step": 16980 }, { "epoch": 2.9272915230875256, "grad_norm": 26.898324966430664, "learning_rate": 8.942060384570515e-10, "logits/chosen": -2.0526130199432373, "logits/rejected": -1.981886625289917, "logps/chosen": -286.5238342285156, "logps/rejected": -469.318603515625, "loss": 0.3522, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.330425262451172, "rewards/margins": 1.8455028533935547, "rewards/rejected": -4.175928115844727, "step": 16990 }, { "epoch": 2.929014472777395, "grad_norm": 67.33253479003906, "learning_rate": 8.523525441334611e-10, "logits/chosen": -2.0571649074554443, "logits/rejected": -2.0016846656799316, "logps/chosen": -306.4559326171875, "logps/rejected": -447.0130920410156, "loss": 0.471, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.50343656539917, "rewards/margins": 1.4377968311309814, "rewards/rejected": -3.9412338733673096, "step": 17000 }, { "epoch": 2.930737422467264, "grad_norm": 39.961387634277344, "learning_rate": 8.115004747375476e-10, "logits/chosen": -1.9635521173477173, "logits/rejected": -1.8958520889282227, "logps/chosen": -328.88494873046875, "logps/rejected": -517.787353515625, "loss": 0.4016, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7507503032684326, "rewards/margins": 1.8838965892791748, "rewards/rejected": -4.634646415710449, "step": 17010 }, { "epoch": 2.932460372157133, "grad_norm": 55.9789924621582, "learning_rate": 7.716499944702138e-10, "logits/chosen": -1.965374231338501, "logits/rejected": -1.9213764667510986, "logps/chosen": -316.76361083984375, "logps/rejected": -467.15887451171875, "loss": 0.4726, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6275250911712646, "rewards/margins": 1.5201573371887207, "rewards/rejected": -4.1476826667785645, "step": 17020 }, { "epoch": 2.934183321847002, "grad_norm": 60.556922912597656, "learning_rate": 7.328012635066938e-10, "logits/chosen": -2.0092720985412598, "logits/rejected": -1.9343280792236328, "logps/chosen": -305.34716796875, "logps/rejected": -470.07940673828125, "loss": 0.3834, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.47866153717041, "rewards/margins": 1.6968753337860107, "rewards/rejected": -4.175536155700684, "step": 17030 }, { "epoch": 2.9359062715368713, "grad_norm": 46.01879119873047, "learning_rate": 6.949544379956651e-10, "logits/chosen": -2.027007818222046, "logits/rejected": -1.9483808279037476, "logps/chosen": -331.25067138671875, "logps/rejected": -502.82696533203125, "loss": 0.3512, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.748131513595581, "rewards/margins": 1.7764955759048462, "rewards/rejected": -4.524627208709717, "step": 17040 }, { "epoch": 2.9376292212267403, "grad_norm": 79.57945251464844, "learning_rate": 6.581096700588041e-10, "logits/chosen": -1.9695183038711548, "logits/rejected": -1.912893533706665, "logps/chosen": -328.90557861328125, "logps/rejected": -486.8267517089844, "loss": 0.4495, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7338550090789795, "rewards/margins": 1.6042826175689697, "rewards/rejected": -4.338137626647949, "step": 17050 }, { "epoch": 2.9393521709166093, "grad_norm": 45.0560417175293, "learning_rate": 6.222671077900921e-10, "logits/chosen": -2.001903772354126, "logits/rejected": -1.9271373748779297, "logps/chosen": -312.98992919921875, "logps/rejected": -480.68316650390625, "loss": 0.391, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.579789638519287, "rewards/margins": 1.6826848983764648, "rewards/rejected": -4.262474536895752, "step": 17060 }, { "epoch": 2.9410751206064782, "grad_norm": 48.86662673950195, "learning_rate": 5.874268952552608e-10, "logits/chosen": -1.9958575963974, "logits/rejected": -1.9243223667144775, "logps/chosen": -322.68670654296875, "logps/rejected": -480.5069885253906, "loss": 0.4326, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.666762590408325, "rewards/margins": 1.627129316329956, "rewards/rejected": -4.2938923835754395, "step": 17070 }, { "epoch": 2.942798070296347, "grad_norm": 44.047672271728516, "learning_rate": 5.535891724911812e-10, "logits/chosen": -1.936525583267212, "logits/rejected": -1.8636153936386108, "logps/chosen": -317.54791259765625, "logps/rejected": -521.099609375, "loss": 0.325, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.6366071701049805, "rewards/margins": 2.0487289428710938, "rewards/rejected": -4.685335636138916, "step": 17080 }, { "epoch": 2.944521019986216, "grad_norm": 50.70817184448242, "learning_rate": 5.207540755053085e-10, "logits/chosen": -1.9859488010406494, "logits/rejected": -1.9171874523162842, "logps/chosen": -303.4879150390625, "logps/rejected": -508.82269287109375, "loss": 0.3163, "rewards/accuracies": 0.875, "rewards/chosen": -2.4983277320861816, "rewards/margins": 2.065859317779541, "rewards/rejected": -4.564187049865723, "step": 17090 }, { "epoch": 2.9462439696760856, "grad_norm": 53.58707046508789, "learning_rate": 4.889217362751552e-10, "logits/chosen": -1.9679946899414062, "logits/rejected": -1.895334005355835, "logps/chosen": -321.09271240234375, "logps/rejected": -495.86572265625, "loss": 0.3831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.643812417984009, "rewards/margins": 1.794777274131775, "rewards/rejected": -4.438589572906494, "step": 17100 }, { "epoch": 2.9479669193659546, "grad_norm": 33.87190246582031, "learning_rate": 4.5809228274773517e-10, "logits/chosen": -1.932033896446228, "logits/rejected": -1.870363473892212, "logps/chosen": -339.7185974121094, "logps/rejected": -529.9310913085938, "loss": 0.3229, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.8384857177734375, "rewards/margins": 1.9319543838500977, "rewards/rejected": -4.770440101623535, "step": 17110 }, { "epoch": 2.9496898690558235, "grad_norm": 48.99530029296875, "learning_rate": 4.28265838839037e-10, "logits/chosen": -2.0496230125427246, "logits/rejected": -1.9713125228881836, "logps/chosen": -311.4201965332031, "logps/rejected": -491.86444091796875, "loss": 0.3374, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5792136192321777, "rewards/margins": 1.860452651977539, "rewards/rejected": -4.439666271209717, "step": 17120 }, { "epoch": 2.9514128187456925, "grad_norm": 64.98383331298828, "learning_rate": 3.994425244336352e-10, "logits/chosen": -1.924207091331482, "logits/rejected": -1.8409217596054077, "logps/chosen": -334.62567138671875, "logps/rejected": -523.8311157226562, "loss": 0.3532, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.788783550262451, "rewards/margins": 1.9189989566802979, "rewards/rejected": -4.707782745361328, "step": 17130 }, { "epoch": 2.953135768435562, "grad_norm": 49.69882583618164, "learning_rate": 3.716224553839964e-10, "logits/chosen": -1.9374433755874634, "logits/rejected": -1.8788745403289795, "logps/chosen": -302.5534362792969, "logps/rejected": -494.52984619140625, "loss": 0.3404, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4677860736846924, "rewards/margins": 1.9571502208709717, "rewards/rejected": -4.424935817718506, "step": 17140 }, { "epoch": 2.954858718125431, "grad_norm": 35.88608932495117, "learning_rate": 3.448057435102292e-10, "logits/chosen": -2.0112757682800293, "logits/rejected": -1.9359382390975952, "logps/chosen": -329.676513671875, "logps/rejected": -508.78668212890625, "loss": 0.4217, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.7404181957244873, "rewards/margins": 1.8306620121002197, "rewards/rejected": -4.571080207824707, "step": 17150 }, { "epoch": 2.9565816678153, "grad_norm": 46.91283416748047, "learning_rate": 3.189924965995017e-10, "logits/chosen": -2.0378429889678955, "logits/rejected": -1.9630523920059204, "logps/chosen": -306.4375915527344, "logps/rejected": -535.2238159179688, "loss": 0.3287, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5412473678588867, "rewards/margins": 2.2977943420410156, "rewards/rejected": -4.839041709899902, "step": 17160 }, { "epoch": 2.958304617505169, "grad_norm": 37.330562591552734, "learning_rate": 2.9418281840570823e-10, "logits/chosen": -2.032977342605591, "logits/rejected": -1.966487169265747, "logps/chosen": -339.48095703125, "logps/rejected": -502.02532958984375, "loss": 0.4462, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.794395923614502, "rewards/margins": 1.6556148529052734, "rewards/rejected": -4.450010776519775, "step": 17170 }, { "epoch": 2.960027567195038, "grad_norm": 35.931007385253906, "learning_rate": 2.703768086489422e-10, "logits/chosen": -1.987225890159607, "logits/rejected": -1.9175924062728882, "logps/chosen": -318.9792785644531, "logps/rejected": -495.4854431152344, "loss": 0.391, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6744179725646973, "rewards/margins": 1.7735569477081299, "rewards/rejected": -4.4479756355285645, "step": 17180 }, { "epoch": 2.9617505168849068, "grad_norm": 61.79660415649414, "learning_rate": 2.475745630151349e-10, "logits/chosen": -1.9643968343734741, "logits/rejected": -1.9011493921279907, "logps/chosen": -285.7884521484375, "logps/rejected": -467.2911071777344, "loss": 0.3316, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.328789234161377, "rewards/margins": 1.8242868185043335, "rewards/rejected": -4.153076171875, "step": 17190 }, { "epoch": 2.963473466574776, "grad_norm": 39.53703689575195, "learning_rate": 2.257761731557506e-10, "logits/chosen": -2.0098865032196045, "logits/rejected": -1.9425595998764038, "logps/chosen": -306.86517333984375, "logps/rejected": -490.03814697265625, "loss": 0.3399, "rewards/accuracies": 0.84375, "rewards/chosen": -2.516226291656494, "rewards/margins": 1.8481931686401367, "rewards/rejected": -4.364419460296631, "step": 17200 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.007523775100708, "eval_logits/rejected": -1.9830219745635986, "eval_logps/chosen": -344.9560241699219, "eval_logps/rejected": -404.2797546386719, "eval_loss": 0.709500253200531, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -2.859405517578125, "eval_rewards/margins": 0.5558960437774658, "eval_rewards/rejected": -3.4153013229370117, "eval_runtime": 361.8135, "eval_samples_per_second": 11.896, "eval_steps_per_second": 1.487, "step": 17200 }, { "epoch": 2.965196416264645, "grad_norm": 55.28764343261719, "learning_rate": 2.0498172668728664e-10, "logits/chosen": -1.9488662481307983, "logits/rejected": -1.8814799785614014, "logps/chosen": -321.608642578125, "logps/rejected": -518.12841796875, "loss": 0.3773, "rewards/accuracies": 0.875, "rewards/chosen": -2.6885111331939697, "rewards/margins": 1.9650837182998657, "rewards/rejected": -4.653594017028809, "step": 17210 }, { "epoch": 2.966919365954514, "grad_norm": 39.413063049316406, "learning_rate": 1.8519130719102384e-10, "logits/chosen": -2.002307653427124, "logits/rejected": -1.9336833953857422, "logps/chosen": -318.3397521972656, "logps/rejected": -516.0406494140625, "loss": 0.3841, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6193480491638184, "rewards/margins": 1.9812126159667969, "rewards/rejected": -4.600560665130615, "step": 17220 }, { "epoch": 2.968642315644383, "grad_norm": 25.598228454589844, "learning_rate": 1.6640499421263775e-10, "logits/chosen": -1.9980285167694092, "logits/rejected": -1.936488151550293, "logps/chosen": -319.53515625, "logps/rejected": -508.0516662597656, "loss": 0.4141, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.657583236694336, "rewards/margins": 1.8795464038848877, "rewards/rejected": -4.5371294021606445, "step": 17230 }, { "epoch": 2.9703652653342525, "grad_norm": 40.87997055053711, "learning_rate": 1.4862286326189355e-10, "logits/chosen": -1.9989811182022095, "logits/rejected": -1.9359538555145264, "logps/chosen": -294.2556457519531, "logps/rejected": -466.5439453125, "loss": 0.3939, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3949034214019775, "rewards/margins": 1.7280871868133545, "rewards/rejected": -4.12299108505249, "step": 17240 }, { "epoch": 2.9720882150241215, "grad_norm": 48.124446868896484, "learning_rate": 1.31844985812396e-10, "logits/chosen": -1.9930938482284546, "logits/rejected": -1.9305658340454102, "logps/chosen": -309.66351318359375, "logps/rejected": -486.0123596191406, "loss": 0.3748, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.562588930130005, "rewards/margins": 1.7796005010604858, "rewards/rejected": -4.342188835144043, "step": 17250 }, { "epoch": 2.9738111647139904, "grad_norm": 29.23064422607422, "learning_rate": 1.1607142930114556e-10, "logits/chosen": -1.992112398147583, "logits/rejected": -1.9134495258331299, "logps/chosen": -298.883544921875, "logps/rejected": -494.91015625, "loss": 0.3486, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.429267406463623, "rewards/margins": 2.007957696914673, "rewards/rejected": -4.437224864959717, "step": 17260 }, { "epoch": 2.9755341144038594, "grad_norm": 44.806732177734375, "learning_rate": 1.0130225712845498e-10, "logits/chosen": -1.9773629903793335, "logits/rejected": -1.9037420749664307, "logps/chosen": -329.89276123046875, "logps/rejected": -502.8241271972656, "loss": 0.3789, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.738227128982544, "rewards/margins": 1.7515281438827515, "rewards/rejected": -4.489755153656006, "step": 17270 }, { "epoch": 2.9772570640937284, "grad_norm": 49.066043853759766, "learning_rate": 8.753752865761633e-11, "logits/chosen": -1.939640760421753, "logits/rejected": -1.866666555404663, "logps/chosen": -313.457275390625, "logps/rejected": -492.65960693359375, "loss": 0.363, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5823307037353516, "rewards/margins": 1.8314058780670166, "rewards/rejected": -4.413736820220947, "step": 17280 }, { "epoch": 2.9789800137835973, "grad_norm": 54.72799301147461, "learning_rate": 7.477729921456788e-11, "logits/chosen": -1.9939829111099243, "logits/rejected": -1.9264724254608154, "logps/chosen": -325.5101013183594, "logps/rejected": -495.202880859375, "loss": 0.353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6511871814727783, "rewards/margins": 1.7408859729766846, "rewards/rejected": -4.392073154449463, "step": 17290 }, { "epoch": 2.9807029634734663, "grad_norm": 43.085575103759766, "learning_rate": 6.302162008786638e-11, "logits/chosen": -2.0136828422546387, "logits/rejected": -1.948874831199646, "logps/chosen": -305.8410339355469, "logps/rejected": -484.3267517089844, "loss": 0.3581, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.491542339324951, "rewards/margins": 1.835982084274292, "rewards/rejected": -4.327524185180664, "step": 17300 }, { "epoch": 2.9824259131633357, "grad_norm": 60.86738204956055, "learning_rate": 5.2270538528326194e-11, "logits/chosen": -2.021362066268921, "logits/rejected": -1.9485132694244385, "logps/chosen": -311.72430419921875, "logps/rejected": -504.989013671875, "loss": 0.3807, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5608646869659424, "rewards/margins": 1.969520926475525, "rewards/rejected": -4.530385971069336, "step": 17310 }, { "epoch": 2.9841488628532047, "grad_norm": 37.73630905151367, "learning_rate": 4.252409774885279e-11, "logits/chosen": -2.0035107135772705, "logits/rejected": -1.9430601596832275, "logps/chosen": -313.00714111328125, "logps/rejected": -477.59161376953125, "loss": 0.4121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5748836994171143, "rewards/margins": 1.6871376037597656, "rewards/rejected": -4.262021064758301, "step": 17320 }, { "epoch": 2.9858718125430737, "grad_norm": 31.04465675354004, "learning_rate": 3.37823369243595e-11, "logits/chosen": -1.9464353322982788, "logits/rejected": -1.8741804361343384, "logps/chosen": -316.3692321777344, "logps/rejected": -497.06591796875, "loss": 0.3924, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5962769985198975, "rewards/margins": 1.8650003671646118, "rewards/rejected": -4.461277484893799, "step": 17330 }, { "epoch": 2.987594762232943, "grad_norm": 55.72785949707031, "learning_rate": 2.6045291191462148e-11, "logits/chosen": -2.006436824798584, "logits/rejected": -1.9283565282821655, "logps/chosen": -298.8694152832031, "logps/rejected": -486.3223571777344, "loss": 0.3129, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.4506101608276367, "rewards/margins": 1.899383544921875, "rewards/rejected": -4.349993705749512, "step": 17340 }, { "epoch": 2.989317711922812, "grad_norm": 24.38024139404297, "learning_rate": 1.9312991648534616e-11, "logits/chosen": -2.022604465484619, "logits/rejected": -1.938359022140503, "logps/chosen": -293.2810974121094, "logps/rejected": -486.90289306640625, "loss": 0.3126, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.3894364833831787, "rewards/margins": 1.9684984683990479, "rewards/rejected": -4.357934951782227, "step": 17350 }, { "epoch": 2.991040661612681, "grad_norm": 63.593135833740234, "learning_rate": 1.3585465355347992e-11, "logits/chosen": -2.0631136894226074, "logits/rejected": -1.9972121715545654, "logps/chosen": -298.95355224609375, "logps/rejected": -495.7401428222656, "loss": 0.366, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.4375734329223633, "rewards/margins": 1.9781090021133423, "rewards/rejected": -4.415682792663574, "step": 17360 }, { "epoch": 2.99276361130255, "grad_norm": 42.5406608581543, "learning_rate": 8.862735333181603e-12, "logits/chosen": -1.9741065502166748, "logits/rejected": -1.9104198217391968, "logps/chosen": -310.2678527832031, "logps/rejected": -486.5558166503906, "loss": 0.3891, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.5711617469787598, "rewards/margins": 1.7596864700317383, "rewards/rejected": -4.330848693847656, "step": 17370 }, { "epoch": 2.994486560992419, "grad_norm": 80.14839172363281, "learning_rate": 5.144820564573216e-12, "logits/chosen": -1.970440149307251, "logits/rejected": -1.9158207178115845, "logps/chosen": -322.6686096191406, "logps/rejected": -477.5149841308594, "loss": 0.4479, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6907784938812256, "rewards/margins": 1.5461736917495728, "rewards/rejected": -4.236952304840088, "step": 17380 }, { "epoch": 2.996209510682288, "grad_norm": 39.31589126586914, "learning_rate": 2.431735993346784e-12, "logits/chosen": -2.007418155670166, "logits/rejected": -1.9457228183746338, "logps/chosen": -308.9580993652344, "logps/rejected": -486.5531311035156, "loss": 0.3115, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.535252571105957, "rewards/margins": 1.8011573553085327, "rewards/rejected": -4.3364105224609375, "step": 17390 }, { "epoch": 2.997932460372157, "grad_norm": 25.230167388916016, "learning_rate": 7.234925244459233e-13, "logits/chosen": -1.987540602684021, "logits/rejected": -1.929103136062622, "logps/chosen": -325.2969970703125, "logps/rejected": -491.44866943359375, "loss": 0.4602, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6901509761810303, "rewards/margins": 1.735664963722229, "rewards/rejected": -4.425815582275391, "step": 17400 }, { "epoch": 2.9996554100620263, "grad_norm": 55.49097442626953, "learning_rate": 2.009702398941826e-14, "logits/chosen": -1.9490423202514648, "logits/rejected": -1.874058723449707, "logps/chosen": -304.45489501953125, "logps/rejected": -497.83935546875, "loss": 0.2924, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4959452152252197, "rewards/margins": 1.9391368627548218, "rewards/rejected": -4.43508243560791, "step": 17410 }, { "epoch": 3.0, "step": 17412, "total_flos": 0.0, "train_loss": 0.49131362667692796, "train_runtime": 84120.8122, "train_samples_per_second": 3.312, "train_steps_per_second": 0.207 } ], "logging_steps": 10, "max_steps": 17412, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }