{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.58203125, "learning_rate": 2.617801047120419e-08, "logits/chosen": -3.1532161235809326, "logits/rejected": -3.1690337657928467, "logps/chosen": -305.45306396484375, "logps/rejected": -294.4603576660156, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007838421151973307, "rewards/margins": -0.00040248289587907493, "rewards/rejected": -0.000381359423045069, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.5390625, "learning_rate": 2.617801047120419e-07, "logits/chosen": -3.177987813949585, "logits/rejected": -3.2059593200683594, "logps/chosen": -299.1102294921875, "logps/rejected": -249.10623168945312, "loss": 0.5001, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.0008526805322617292, "rewards/margins": -0.00045007685548625886, "rewards/rejected": -0.0004026036476716399, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.59765625, "learning_rate": 5.235602094240838e-07, "logits/chosen": -3.1716275215148926, "logits/rejected": -3.166067123413086, "logps/chosen": -238.83120727539062, "logps/rejected": -244.2283935546875, "loss": 0.5, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.0001240858546225354, "rewards/margins": 5.3543342801276594e-05, "rewards/rejected": 7.054249726934358e-05, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.765625, "learning_rate": 7.853403141361258e-07, "logits/chosen": -3.194286823272705, "logits/rejected": -3.2046267986297607, "logps/chosen": -268.1184387207031, "logps/rejected": -239.86087036132812, "loss": 0.4997, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.002198445377871394, "rewards/margins": 0.0013555358164012432, "rewards/rejected": 0.0008429096196778119, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.72265625, "learning_rate": 1.0471204188481676e-06, "logits/chosen": -3.1798417568206787, "logits/rejected": -3.185044765472412, "logps/chosen": -273.47900390625, "logps/rejected": -255.7032928466797, "loss": 0.4993, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.005988434888422489, "rewards/margins": 0.0028830617666244507, "rewards/rejected": 0.003105373587459326, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.7578125, "learning_rate": 1.3089005235602096e-06, "logits/chosen": -3.162355899810791, "logits/rejected": -3.1799404621124268, "logps/chosen": -256.9862060546875, "logps/rejected": -239.87069702148438, "loss": 0.4985, "rewards/accuracies": 0.671875, "rewards/chosen": 0.012596851214766502, "rewards/margins": 0.006152496673166752, "rewards/rejected": 0.0064443545415997505, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.73046875, "learning_rate": 1.5706806282722515e-06, "logits/chosen": -3.1871049404144287, "logits/rejected": -3.200637102127075, "logps/chosen": -294.3240661621094, "logps/rejected": -262.1870422363281, "loss": 0.4969, "rewards/accuracies": 0.6875, "rewards/chosen": 0.022759366780519485, "rewards/margins": 0.012819233350455761, "rewards/rejected": 0.009940135292708874, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.75390625, "learning_rate": 1.8324607329842933e-06, "logits/chosen": -3.1636972427368164, "logits/rejected": -3.161069869995117, "logps/chosen": -266.68853759765625, "logps/rejected": -243.20263671875, "loss": 0.496, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.031995244324207306, "rewards/margins": 0.016239028424024582, "rewards/rejected": 0.015756219625473022, "step": 70 }, { "epoch": 0.04, "grad_norm": 0.8359375, "learning_rate": 2.094240837696335e-06, "logits/chosen": -3.1705234050750732, "logits/rejected": -3.1865649223327637, "logps/chosen": -271.360595703125, "logps/rejected": -252.78170776367188, "loss": 0.4962, "rewards/accuracies": 0.625, "rewards/chosen": 0.038576819002628326, "rewards/margins": 0.015578309074044228, "rewards/rejected": 0.022998513653874397, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.72265625, "learning_rate": 2.356020942408377e-06, "logits/chosen": -3.166350841522217, "logits/rejected": -3.1725335121154785, "logps/chosen": -240.39999389648438, "logps/rejected": -236.23782348632812, "loss": 0.4956, "rewards/accuracies": 0.625, "rewards/chosen": 0.035779114812612534, "rewards/margins": 0.018319377675652504, "rewards/rejected": 0.01745973899960518, "step": 90 }, { "epoch": 0.05, "grad_norm": 0.7109375, "learning_rate": 2.617801047120419e-06, "logits/chosen": -3.169689655303955, "logits/rejected": -3.2062766551971436, "logps/chosen": -260.24462890625, "logps/rejected": -230.7229766845703, "loss": 0.4929, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.04880410060286522, "rewards/margins": 0.029587719589471817, "rewards/rejected": 0.0192163847386837, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.65625, "learning_rate": 2.8795811518324613e-06, "logits/chosen": -3.1644022464752197, "logits/rejected": -3.178515672683716, "logps/chosen": -257.0642395019531, "logps/rejected": -233.0090789794922, "loss": 0.4904, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.05435089394450188, "rewards/margins": 0.03999961167573929, "rewards/rejected": 0.014351281337440014, "step": 110 }, { "epoch": 0.06, "grad_norm": 0.7265625, "learning_rate": 3.141361256544503e-06, "logits/chosen": -3.1647660732269287, "logits/rejected": -3.181644916534424, "logps/chosen": -300.6939392089844, "logps/rejected": -279.0010070800781, "loss": 0.4918, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.0594431571662426, "rewards/margins": 0.035575076937675476, "rewards/rejected": 0.023868080228567123, "step": 120 }, { "epoch": 0.07, "grad_norm": 0.6171875, "learning_rate": 3.403141361256545e-06, "logits/chosen": -3.1200623512268066, "logits/rejected": -3.1410274505615234, "logps/chosen": -265.76513671875, "logps/rejected": -246.6106414794922, "loss": 0.4896, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.05789119750261307, "rewards/margins": 0.045422304421663284, "rewards/rejected": 0.012468896806240082, "step": 130 }, { "epoch": 0.07, "grad_norm": 0.671875, "learning_rate": 3.6649214659685865e-06, "logits/chosen": -3.1760947704315186, "logits/rejected": -3.1799349784851074, "logps/chosen": -258.2982482910156, "logps/rejected": -239.6028289794922, "loss": 0.4864, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.06110318750143051, "rewards/margins": 0.06362718343734741, "rewards/rejected": -0.00252399779856205, "step": 140 }, { "epoch": 0.08, "grad_norm": 0.625, "learning_rate": 3.926701570680629e-06, "logits/chosen": -3.1222329139709473, "logits/rejected": -3.133145332336426, "logps/chosen": -265.1998291015625, "logps/rejected": -251.78952026367188, "loss": 0.4875, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05429766699671745, "rewards/margins": 0.05967814847826958, "rewards/rejected": -0.005380480550229549, "step": 150 }, { "epoch": 0.08, "grad_norm": 0.703125, "learning_rate": 4.18848167539267e-06, "logits/chosen": -3.176487445831299, "logits/rejected": -3.190802812576294, "logps/chosen": -272.86651611328125, "logps/rejected": -244.38687133789062, "loss": 0.4852, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.05848199874162674, "rewards/margins": 0.07154129445552826, "rewards/rejected": -0.013059285469353199, "step": 160 }, { "epoch": 0.09, "grad_norm": 0.71484375, "learning_rate": 4.450261780104713e-06, "logits/chosen": -3.153256893157959, "logits/rejected": -3.1715409755706787, "logps/chosen": -274.42901611328125, "logps/rejected": -259.0591125488281, "loss": 0.4835, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.06613625586032867, "rewards/margins": 0.0808890238404274, "rewards/rejected": -0.014752751216292381, "step": 170 }, { "epoch": 0.09, "grad_norm": 0.51953125, "learning_rate": 4.712041884816754e-06, "logits/chosen": -3.116530179977417, "logits/rejected": -3.125654697418213, "logps/chosen": -264.8692932128906, "logps/rejected": -256.16748046875, "loss": 0.486, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.05057697370648384, "rewards/margins": 0.0704292505979538, "rewards/rejected": -0.019852278754115105, "step": 180 }, { "epoch": 0.1, "grad_norm": 0.6171875, "learning_rate": 4.9738219895287965e-06, "logits/chosen": -3.123274564743042, "logits/rejected": -3.1312222480773926, "logps/chosen": -288.4281311035156, "logps/rejected": -258.3103332519531, "loss": 0.4814, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05620621517300606, "rewards/margins": 0.09435133635997772, "rewards/rejected": -0.038145121186971664, "step": 190 }, { "epoch": 0.1, "grad_norm": 0.703125, "learning_rate": 4.999661831436499e-06, "logits/chosen": -3.126032590866089, "logits/rejected": -3.1409640312194824, "logps/chosen": -271.5264587402344, "logps/rejected": -251.412109375, "loss": 0.4787, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.0683365911245346, "rewards/margins": 0.10697062313556671, "rewards/rejected": -0.038634032011032104, "step": 200 }, { "epoch": 0.11, "grad_norm": 0.6953125, "learning_rate": 4.9984929711403395e-06, "logits/chosen": -3.0909173488616943, "logits/rejected": -3.0889172554016113, "logps/chosen": -235.9461212158203, "logps/rejected": -240.8140106201172, "loss": 0.4855, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.03789529204368591, "rewards/margins": 0.07996337115764618, "rewards/rejected": -0.04206807166337967, "step": 210 }, { "epoch": 0.12, "grad_norm": 0.609375, "learning_rate": 4.996489634487865e-06, "logits/chosen": -3.149304151535034, "logits/rejected": -3.1437649726867676, "logps/chosen": -276.66497802734375, "logps/rejected": -261.4449157714844, "loss": 0.4814, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.030237609520554543, "rewards/margins": 0.1089547872543335, "rewards/rejected": -0.0787171721458435, "step": 220 }, { "epoch": 0.12, "grad_norm": 0.69921875, "learning_rate": 4.9936524905772466e-06, "logits/chosen": -3.092778444290161, "logits/rejected": -3.1226553916931152, "logps/chosen": -265.37457275390625, "logps/rejected": -260.417724609375, "loss": 0.4759, "rewards/accuracies": 0.703125, "rewards/chosen": 0.06305978447198868, "rewards/margins": 0.13326093554496765, "rewards/rejected": -0.07020114362239838, "step": 230 }, { "epoch": 0.13, "grad_norm": 0.76953125, "learning_rate": 4.9899824869915e-06, "logits/chosen": -3.1257612705230713, "logits/rejected": -3.142120838165283, "logps/chosen": -251.7377166748047, "logps/rejected": -258.2868957519531, "loss": 0.4783, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.034979041665792465, "rewards/margins": 0.12733003497123718, "rewards/rejected": -0.09235100448131561, "step": 240 }, { "epoch": 0.13, "grad_norm": 0.75, "learning_rate": 4.985480849482012e-06, "logits/chosen": -3.1416521072387695, "logits/rejected": -3.1663966178894043, "logps/chosen": -280.97442626953125, "logps/rejected": -259.81915283203125, "loss": 0.4749, "rewards/accuracies": 0.703125, "rewards/chosen": 0.03899794816970825, "rewards/margins": 0.14351439476013184, "rewards/rejected": -0.10451646894216537, "step": 250 }, { "epoch": 0.14, "grad_norm": 0.78125, "learning_rate": 4.980149081559142e-06, "logits/chosen": -3.166862964630127, "logits/rejected": -3.1826682090759277, "logps/chosen": -277.7224426269531, "logps/rejected": -264.73248291015625, "loss": 0.4769, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02976931631565094, "rewards/margins": 0.13806195557117462, "rewards/rejected": -0.10829265415668488, "step": 260 }, { "epoch": 0.14, "grad_norm": 0.6171875, "learning_rate": 4.9739889639900655e-06, "logits/chosen": -3.1009061336517334, "logits/rejected": -3.1220781803131104, "logps/chosen": -271.1175842285156, "logps/rejected": -270.35601806640625, "loss": 0.4696, "rewards/accuracies": 0.75, "rewards/chosen": 0.041884977370500565, "rewards/margins": 0.18558195233345032, "rewards/rejected": -0.14369697868824005, "step": 270 }, { "epoch": 0.15, "grad_norm": 0.77734375, "learning_rate": 4.967002554204009e-06, "logits/chosen": -3.1314620971679688, "logits/rejected": -3.147207021713257, "logps/chosen": -272.50836181640625, "logps/rejected": -278.05487060546875, "loss": 0.4785, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0026828604750335217, "rewards/margins": 0.14521858096122742, "rewards/rejected": -0.1425357311964035, "step": 280 }, { "epoch": 0.15, "grad_norm": 0.62109375, "learning_rate": 4.959192185605089e-06, "logits/chosen": -3.1541976928710938, "logits/rejected": -3.1754889488220215, "logps/chosen": -304.40740966796875, "logps/rejected": -284.8604736328125, "loss": 0.4718, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0017113524954766035, "rewards/margins": 0.18049772083759308, "rewards/rejected": -0.17878638207912445, "step": 290 }, { "epoch": 0.16, "grad_norm": 0.859375, "learning_rate": 4.950560466792969e-06, "logits/chosen": -3.1485819816589355, "logits/rejected": -3.1677544116973877, "logps/chosen": -263.881103515625, "logps/rejected": -263.1119689941406, "loss": 0.4741, "rewards/accuracies": 0.6875, "rewards/chosen": -0.045438483357429504, "rewards/margins": 0.1866464614868164, "rewards/rejected": -0.23208491504192352, "step": 300 }, { "epoch": 0.16, "grad_norm": 0.80078125, "learning_rate": 4.9411102806916185e-06, "logits/chosen": -3.122454881668091, "logits/rejected": -3.144866943359375, "logps/chosen": -278.1111145019531, "logps/rejected": -262.2369689941406, "loss": 0.4695, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.052451539784669876, "rewards/margins": 0.22689659893512726, "rewards/rejected": -0.2793481647968292, "step": 310 }, { "epoch": 0.17, "grad_norm": 0.71484375, "learning_rate": 4.930844783586424e-06, "logits/chosen": -3.116986036300659, "logits/rejected": -3.1314234733581543, "logps/chosen": -280.05523681640625, "logps/rejected": -292.8658447265625, "loss": 0.4642, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18633142113685608, "rewards/margins": 0.2993203103542328, "rewards/rejected": -0.4856516718864441, "step": 320 }, { "epoch": 0.17, "grad_norm": 0.8828125, "learning_rate": 4.919767404070033e-06, "logits/chosen": -3.0919606685638428, "logits/rejected": -3.1069421768188477, "logps/chosen": -290.76690673828125, "logps/rejected": -299.6058044433594, "loss": 0.4644, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24150581657886505, "rewards/margins": 0.32079803943634033, "rewards/rejected": -0.562303900718689, "step": 330 }, { "epoch": 0.18, "grad_norm": 0.8984375, "learning_rate": 4.907881841897216e-06, "logits/chosen": -3.113529920578003, "logits/rejected": -3.13189697265625, "logps/chosen": -316.0239562988281, "logps/rejected": -312.1597900390625, "loss": 0.4576, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3316032886505127, "rewards/margins": 0.40208154916763306, "rewards/rejected": -0.733684778213501, "step": 340 }, { "epoch": 0.18, "grad_norm": 1.03125, "learning_rate": 4.89519206674919e-06, "logits/chosen": -3.0299558639526367, "logits/rejected": -3.0673482418060303, "logps/chosen": -298.59539794921875, "logps/rejected": -356.8745422363281, "loss": 0.4522, "rewards/accuracies": 0.71875, "rewards/chosen": -0.496961772441864, "rewards/margins": 0.5310899615287781, "rewards/rejected": -1.028051733970642, "step": 350 }, { "epoch": 0.19, "grad_norm": 0.984375, "learning_rate": 4.881702316907769e-06, "logits/chosen": -3.04780912399292, "logits/rejected": -3.0520381927490234, "logps/chosen": -342.5206604003906, "logps/rejected": -376.9095153808594, "loss": 0.4588, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.7044845819473267, "rewards/margins": 0.4986873269081116, "rewards/rejected": -1.203171968460083, "step": 360 }, { "epoch": 0.19, "grad_norm": 1.515625, "learning_rate": 4.86741709783982e-06, "logits/chosen": -3.0080177783966064, "logits/rejected": -3.0334441661834717, "logps/chosen": -322.6118469238281, "logps/rejected": -343.72320556640625, "loss": 0.4541, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.7056992053985596, "rewards/margins": 0.5343005061149597, "rewards/rejected": -1.239999771118164, "step": 370 }, { "epoch": 0.2, "grad_norm": 0.88671875, "learning_rate": 4.852341180692471e-06, "logits/chosen": -2.9617342948913574, "logits/rejected": -2.9721832275390625, "logps/chosen": -318.089111328125, "logps/rejected": -369.661865234375, "loss": 0.4526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5964111089706421, "rewards/margins": 0.5622240900993347, "rewards/rejected": -1.1586352586746216, "step": 380 }, { "epoch": 0.2, "grad_norm": 0.9375, "learning_rate": 4.836479600699579e-06, "logits/chosen": -3.004973888397217, "logits/rejected": -3.015404224395752, "logps/chosen": -322.59222412109375, "logps/rejected": -350.57952880859375, "loss": 0.4555, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4821473956108093, "rewards/margins": 0.5106935501098633, "rewards/rejected": -0.9928409457206726, "step": 390 }, { "epoch": 0.21, "grad_norm": 1.359375, "learning_rate": 4.819837655500014e-06, "logits/chosen": -2.982062816619873, "logits/rejected": -3.0214340686798096, "logps/chosen": -328.14764404296875, "logps/rejected": -358.10198974609375, "loss": 0.4518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6790810823440552, "rewards/margins": 0.5925682783126831, "rewards/rejected": -1.2716493606567383, "step": 400 }, { "epoch": 0.21, "grad_norm": 1.1953125, "learning_rate": 4.802420903368286e-06, "logits/chosen": -2.996058225631714, "logits/rejected": -3.0428948402404785, "logps/chosen": -311.92181396484375, "logps/rejected": -364.3048095703125, "loss": 0.4564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5532656908035278, "rewards/margins": 0.49352067708969116, "rewards/rejected": -1.0467865467071533, "step": 410 }, { "epoch": 0.22, "grad_norm": 1.140625, "learning_rate": 4.784235161358124e-06, "logits/chosen": -2.960181951522827, "logits/rejected": -2.9744322299957275, "logps/chosen": -318.44744873046875, "logps/rejected": -352.4103088378906, "loss": 0.4597, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.5166889429092407, "rewards/margins": 0.5014506578445435, "rewards/rejected": -1.0181396007537842, "step": 420 }, { "epoch": 0.23, "grad_norm": 1.09375, "learning_rate": 4.765286503359632e-06, "logits/chosen": -2.887781858444214, "logits/rejected": -2.90468430519104, "logps/chosen": -332.93792724609375, "logps/rejected": -379.0559997558594, "loss": 0.4516, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.6485487222671509, "rewards/margins": 0.5632439255714417, "rewards/rejected": -1.2117927074432373, "step": 430 }, { "epoch": 0.23, "grad_norm": 0.9921875, "learning_rate": 4.745581258070654e-06, "logits/chosen": -2.8706612586975098, "logits/rejected": -2.8921449184417725, "logps/chosen": -356.7577209472656, "logps/rejected": -414.74224853515625, "loss": 0.4523, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9310439229011536, "rewards/margins": 0.6457870602607727, "rewards/rejected": -1.5768309831619263, "step": 440 }, { "epoch": 0.24, "grad_norm": 1.4765625, "learning_rate": 4.725126006883047e-06, "logits/chosen": -2.8595480918884277, "logits/rejected": -2.884725332260132, "logps/chosen": -306.8425598144531, "logps/rejected": -388.7693786621094, "loss": 0.4424, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.637574315071106, "rewards/margins": 0.7818979620933533, "rewards/rejected": -1.4194722175598145, "step": 450 }, { "epoch": 0.24, "grad_norm": 1.0546875, "learning_rate": 4.70392758168454e-06, "logits/chosen": -2.8378663063049316, "logits/rejected": -2.8632078170776367, "logps/chosen": -362.0655822753906, "logps/rejected": -393.5525817871094, "loss": 0.456, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.843133807182312, "rewards/margins": 0.6695644855499268, "rewards/rejected": -1.5126984119415283, "step": 460 }, { "epoch": 0.25, "grad_norm": 1.59375, "learning_rate": 4.68199306257695e-06, "logits/chosen": -2.90700101852417, "logits/rejected": -2.930222511291504, "logps/chosen": -368.11419677734375, "logps/rejected": -419.2084045410156, "loss": 0.4412, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.8187941312789917, "rewards/margins": 0.7371198534965515, "rewards/rejected": -1.5559141635894775, "step": 470 }, { "epoch": 0.25, "grad_norm": 0.8984375, "learning_rate": 4.659329775511478e-06, "logits/chosen": -2.8993983268737793, "logits/rejected": -2.930690050125122, "logps/chosen": -331.52081298828125, "logps/rejected": -389.1390686035156, "loss": 0.4494, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6849466562271118, "rewards/margins": 0.7577627301216125, "rewards/rejected": -1.4427093267440796, "step": 480 }, { "epoch": 0.26, "grad_norm": 1.140625, "learning_rate": 4.635945289841902e-06, "logits/chosen": -2.8397905826568604, "logits/rejected": -2.8895552158355713, "logps/chosen": -351.8777770996094, "logps/rejected": -399.30126953125, "loss": 0.4437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6446736454963684, "rewards/margins": 0.8137520551681519, "rewards/rejected": -1.458425760269165, "step": 490 }, { "epoch": 0.26, "grad_norm": 2.265625, "learning_rate": 4.611847415796476e-06, "logits/chosen": -2.8411877155303955, "logits/rejected": -2.8607966899871826, "logps/chosen": -360.9768371582031, "logps/rejected": -416.825927734375, "loss": 0.4382, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8380700945854187, "rewards/margins": 0.860516369342804, "rewards/rejected": -1.6985862255096436, "step": 500 }, { "epoch": 0.27, "grad_norm": 1.75, "learning_rate": 4.587044201869378e-06, "logits/chosen": -2.848315715789795, "logits/rejected": -2.8671722412109375, "logps/chosen": -326.2365417480469, "logps/rejected": -389.13720703125, "loss": 0.4425, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7817949056625366, "rewards/margins": 0.8195638656616211, "rewards/rejected": -1.6013587713241577, "step": 510 }, { "epoch": 0.27, "grad_norm": 1.4453125, "learning_rate": 4.561543932132574e-06, "logits/chosen": -2.7974531650543213, "logits/rejected": -2.828226327896118, "logps/chosen": -348.9537658691406, "logps/rejected": -422.35186767578125, "loss": 0.4319, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7357637286186218, "rewards/margins": 0.9210460782051086, "rewards/rejected": -1.6568095684051514, "step": 520 }, { "epoch": 0.28, "grad_norm": 1.6015625, "learning_rate": 4.535355123469009e-06, "logits/chosen": -2.8311352729797363, "logits/rejected": -2.845012664794922, "logps/chosen": -350.1925964355469, "logps/rejected": -428.8135681152344, "loss": 0.4413, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9110676050186157, "rewards/margins": 0.8541079759597778, "rewards/rejected": -1.765175461769104, "step": 530 }, { "epoch": 0.28, "grad_norm": 2.546875, "learning_rate": 4.508486522728037e-06, "logits/chosen": -2.7914469242095947, "logits/rejected": -2.821166515350342, "logps/chosen": -362.1348876953125, "logps/rejected": -431.3775329589844, "loss": 0.425, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.72906094789505, "rewards/margins": 1.1009550094604492, "rewards/rejected": -1.8300158977508545, "step": 540 }, { "epoch": 0.29, "grad_norm": 2.953125, "learning_rate": 4.480947103804044e-06, "logits/chosen": -2.7665412425994873, "logits/rejected": -2.7718183994293213, "logps/chosen": -364.27362060546875, "logps/rejected": -409.62042236328125, "loss": 0.4476, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.9586559534072876, "rewards/margins": 0.7505531311035156, "rewards/rejected": -1.7092090845108032, "step": 550 }, { "epoch": 0.29, "grad_norm": 2.21875, "learning_rate": 4.452746064639239e-06, "logits/chosen": -2.7609431743621826, "logits/rejected": -2.7761027812957764, "logps/chosen": -350.8570556640625, "logps/rejected": -438.7533264160156, "loss": 0.4401, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.9020726084709167, "rewards/margins": 0.9013819694519043, "rewards/rejected": -1.8034546375274658, "step": 560 }, { "epoch": 0.3, "grad_norm": 1.6171875, "learning_rate": 4.423892824151617e-06, "logits/chosen": -2.724292755126953, "logits/rejected": -2.7128217220306396, "logps/chosen": -369.49822998046875, "logps/rejected": -421.8377380371094, "loss": 0.4407, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9183570742607117, "rewards/margins": 0.7936559319496155, "rewards/rejected": -1.7120128870010376, "step": 570 }, { "epoch": 0.3, "grad_norm": 2.296875, "learning_rate": 4.3943970190891164e-06, "logits/chosen": -2.767193555831909, "logits/rejected": -2.7671852111816406, "logps/chosen": -338.62347412109375, "logps/rejected": -402.12713623046875, "loss": 0.4297, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6558475494384766, "rewards/margins": 0.883314311504364, "rewards/rejected": -1.5391619205474854, "step": 580 }, { "epoch": 0.31, "grad_norm": 1.9765625, "learning_rate": 4.364268500811025e-06, "logits/chosen": -2.71061635017395, "logits/rejected": -2.7537200450897217, "logps/chosen": -356.565185546875, "logps/rejected": -418.8211975097656, "loss": 0.4445, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9843032956123352, "rewards/margins": 0.7892019152641296, "rewards/rejected": -1.773505449295044, "step": 590 }, { "epoch": 0.31, "grad_norm": 1.7421875, "learning_rate": 4.333517331997704e-06, "logits/chosen": -2.7916760444641113, "logits/rejected": -2.771669864654541, "logps/chosen": -304.84649658203125, "logps/rejected": -379.7848815917969, "loss": 0.4453, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.6202788949012756, "rewards/margins": 0.7137486338615417, "rewards/rejected": -1.3340275287628174, "step": 600 }, { "epoch": 0.32, "grad_norm": 1.34375, "learning_rate": 4.302153783289737e-06, "logits/chosen": -2.771042823791504, "logits/rejected": -2.7721784114837646, "logps/chosen": -304.8459167480469, "logps/rejected": -382.05029296875, "loss": 0.4336, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.48120832443237305, "rewards/margins": 0.8254661560058594, "rewards/rejected": -1.3066743612289429, "step": 610 }, { "epoch": 0.32, "grad_norm": 1.171875, "learning_rate": 4.270188329857613e-06, "logits/chosen": -2.6971840858459473, "logits/rejected": -2.7405362129211426, "logps/chosen": -365.59869384765625, "logps/rejected": -422.66082763671875, "loss": 0.4337, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.7066925168037415, "rewards/margins": 0.9992557764053345, "rewards/rejected": -1.7059482336044312, "step": 620 }, { "epoch": 0.33, "grad_norm": 2.546875, "learning_rate": 4.237631647903115e-06, "logits/chosen": -2.729823589324951, "logits/rejected": -2.7406442165374756, "logps/chosen": -343.63861083984375, "logps/rejected": -398.4572448730469, "loss": 0.4474, "rewards/accuracies": 0.65625, "rewards/chosen": -0.834626317024231, "rewards/margins": 0.7315307855606079, "rewards/rejected": -1.5661571025848389, "step": 630 }, { "epoch": 0.33, "grad_norm": 3.15625, "learning_rate": 4.204494611093548e-06, "logits/chosen": -2.6876988410949707, "logits/rejected": -2.710921049118042, "logps/chosen": -344.4961242675781, "logps/rejected": -413.8218688964844, "loss": 0.4355, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9162249565124512, "rewards/margins": 0.9599907994270325, "rewards/rejected": -1.8762153387069702, "step": 640 }, { "epoch": 0.34, "grad_norm": 3.3125, "learning_rate": 4.170788286930024e-06, "logits/chosen": -2.720813751220703, "logits/rejected": -2.7221953868865967, "logps/chosen": -396.78424072265625, "logps/rejected": -472.07293701171875, "loss": 0.4488, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4064452648162842, "rewards/margins": 0.8613995313644409, "rewards/rejected": -2.2678446769714355, "step": 650 }, { "epoch": 0.35, "grad_norm": 1.578125, "learning_rate": 4.136523933051005e-06, "logits/chosen": -2.759519338607788, "logits/rejected": -2.7709243297576904, "logps/chosen": -343.8990173339844, "logps/rejected": -425.5016174316406, "loss": 0.4433, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.881717324256897, "rewards/margins": 0.9298511743545532, "rewards/rejected": -1.8115684986114502, "step": 660 }, { "epoch": 0.35, "grad_norm": 2.390625, "learning_rate": 4.101712993472348e-06, "logits/chosen": -2.7461307048797607, "logits/rejected": -2.7893545627593994, "logps/chosen": -306.47528076171875, "logps/rejected": -357.6148986816406, "loss": 0.4434, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.500246524810791, "rewards/margins": 0.6672911643981934, "rewards/rejected": -1.1675376892089844, "step": 670 }, { "epoch": 0.36, "grad_norm": 1.53125, "learning_rate": 4.066367094765091e-06, "logits/chosen": -2.6456458568573, "logits/rejected": -2.6387665271759033, "logps/chosen": -346.2305603027344, "logps/rejected": -411.1793518066406, "loss": 0.4375, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.803577721118927, "rewards/margins": 0.8857296109199524, "rewards/rejected": -1.689307451248169, "step": 680 }, { "epoch": 0.36, "grad_norm": 1.875, "learning_rate": 4.030498042172277e-06, "logits/chosen": -2.729139804840088, "logits/rejected": -2.7350516319274902, "logps/chosen": -343.1637268066406, "logps/rejected": -424.0751037597656, "loss": 0.4338, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.6723712086677551, "rewards/margins": 0.953707218170166, "rewards/rejected": -1.6260782480239868, "step": 690 }, { "epoch": 0.37, "grad_norm": 1.5, "learning_rate": 3.994117815666095e-06, "logits/chosen": -2.737205743789673, "logits/rejected": -2.756513833999634, "logps/chosen": -358.9730529785156, "logps/rejected": -435.59930419921875, "loss": 0.4352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9257775545120239, "rewards/margins": 0.9314867258071899, "rewards/rejected": -1.8572641611099243, "step": 700 }, { "epoch": 0.37, "grad_norm": 2.59375, "learning_rate": 3.957238565946672e-06, "logits/chosen": -2.7129783630371094, "logits/rejected": -2.740182399749756, "logps/chosen": -382.54144287109375, "logps/rejected": -479.25079345703125, "loss": 0.425, "rewards/accuracies": 0.75, "rewards/chosen": -1.0284990072250366, "rewards/margins": 1.2022464275360107, "rewards/rejected": -2.230745315551758, "step": 710 }, { "epoch": 0.38, "grad_norm": 3.109375, "learning_rate": 3.919872610383831e-06, "logits/chosen": -2.700688600540161, "logits/rejected": -2.715548038482666, "logps/chosen": -346.7179870605469, "logps/rejected": -442.24468994140625, "loss": 0.429, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9038440585136414, "rewards/margins": 1.1126606464385986, "rewards/rejected": -2.0165047645568848, "step": 720 }, { "epoch": 0.38, "grad_norm": 5.4375, "learning_rate": 3.882032428903195e-06, "logits/chosen": -2.732431650161743, "logits/rejected": -2.741513252258301, "logps/chosen": -341.4494323730469, "logps/rejected": -418.03485107421875, "loss": 0.4393, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.9002019166946411, "rewards/margins": 0.9896795153617859, "rewards/rejected": -1.8898814916610718, "step": 730 }, { "epoch": 0.39, "grad_norm": 2.828125, "learning_rate": 3.84373065981799e-06, "logits/chosen": -2.6866860389709473, "logits/rejected": -2.7075419425964355, "logps/chosen": -335.77630615234375, "logps/rejected": -392.2721252441406, "loss": 0.4315, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7698886394500732, "rewards/margins": 0.8988865613937378, "rewards/rejected": -1.668775200843811, "step": 740 }, { "epoch": 0.39, "grad_norm": 3.03125, "learning_rate": 3.8049800956079552e-06, "logits/chosen": -2.728248357772827, "logits/rejected": -2.719029426574707, "logps/chosen": -333.34747314453125, "logps/rejected": -403.9107971191406, "loss": 0.4329, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.688866376876831, "rewards/margins": 0.9195898771286011, "rewards/rejected": -1.6084562540054321, "step": 750 }, { "epoch": 0.4, "grad_norm": 2.640625, "learning_rate": 3.765793678646753e-06, "logits/chosen": -2.7588818073272705, "logits/rejected": -2.764564037322998, "logps/chosen": -327.6832580566406, "logps/rejected": -397.34967041015625, "loss": 0.4445, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7366557121276855, "rewards/margins": 0.8244975805282593, "rewards/rejected": -1.5611531734466553, "step": 760 }, { "epoch": 0.4, "grad_norm": 3.40625, "learning_rate": 3.726184496879323e-06, "logits/chosen": -2.7015931606292725, "logits/rejected": -2.7093541622161865, "logps/chosen": -328.6782531738281, "logps/rejected": -392.6251525878906, "loss": 0.4434, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.6729280948638916, "rewards/margins": 0.7624879479408264, "rewards/rejected": -1.4354161024093628, "step": 770 }, { "epoch": 0.41, "grad_norm": 2.046875, "learning_rate": 3.686165779450619e-06, "logits/chosen": -2.7212226390838623, "logits/rejected": -2.7281336784362793, "logps/chosen": -332.78985595703125, "logps/rejected": -407.54339599609375, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": -0.5121452808380127, "rewards/margins": 1.0209487676620483, "rewards/rejected": -1.5330939292907715, "step": 780 }, { "epoch": 0.41, "grad_norm": 2.0625, "learning_rate": 3.645750892287178e-06, "logits/chosen": -2.6992287635803223, "logits/rejected": -2.7106306552886963, "logps/chosen": -349.5966491699219, "logps/rejected": -479.349609375, "loss": 0.4291, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9312782287597656, "rewards/margins": 1.2202502489089966, "rewards/rejected": -2.1515283584594727, "step": 790 }, { "epoch": 0.42, "grad_norm": 4.4375, "learning_rate": 3.604953333633009e-06, "logits/chosen": -2.6964669227600098, "logits/rejected": -2.718437671661377, "logps/chosen": -368.614990234375, "logps/rejected": -455.652099609375, "loss": 0.444, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1283608675003052, "rewards/margins": 0.9377814531326294, "rewards/rejected": -2.0661423206329346, "step": 800 }, { "epoch": 0.42, "grad_norm": 1.796875, "learning_rate": 3.56378672954129e-06, "logits/chosen": -2.734504222869873, "logits/rejected": -2.741596221923828, "logps/chosen": -345.35894775390625, "logps/rejected": -441.3074645996094, "loss": 0.4339, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.8633454442024231, "rewards/margins": 1.0725480318069458, "rewards/rejected": -1.9358936548233032, "step": 810 }, { "epoch": 0.43, "grad_norm": 4.34375, "learning_rate": 3.5222648293233806e-06, "logits/chosen": -2.726121425628662, "logits/rejected": -2.757293701171875, "logps/chosen": -328.99981689453125, "logps/rejected": -409.4175720214844, "loss": 0.4433, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.8074792623519897, "rewards/margins": 0.893712043762207, "rewards/rejected": -1.7011913061141968, "step": 820 }, { "epoch": 0.43, "grad_norm": 2.015625, "learning_rate": 3.4804015009566573e-06, "logits/chosen": -2.7143707275390625, "logits/rejected": -2.7266762256622314, "logps/chosen": -343.6038513183594, "logps/rejected": -408.5287170410156, "loss": 0.4446, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0202709436416626, "rewards/margins": 0.8006850481033325, "rewards/rejected": -1.8209559917449951, "step": 830 }, { "epoch": 0.44, "grad_norm": 8.25, "learning_rate": 3.4382107264527244e-06, "logits/chosen": -2.731356143951416, "logits/rejected": -2.749807357788086, "logps/chosen": -387.00372314453125, "logps/rejected": -461.8328552246094, "loss": 0.4274, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.057356834411621, "rewards/margins": 1.1233993768692017, "rewards/rejected": -2.180756092071533, "step": 840 }, { "epoch": 0.44, "grad_norm": 1.328125, "learning_rate": 3.3957065971875387e-06, "logits/chosen": -2.736109972000122, "logits/rejected": -2.761101245880127, "logps/chosen": -378.2186279296875, "logps/rejected": -433.41986083984375, "loss": 0.4517, "rewards/accuracies": 0.6875, "rewards/chosen": -1.291465401649475, "rewards/margins": 0.7508509159088135, "rewards/rejected": -2.042316198348999, "step": 850 }, { "epoch": 0.45, "grad_norm": 3.15625, "learning_rate": 3.352903309194999e-06, "logits/chosen": -2.7496652603149414, "logits/rejected": -2.766449451446533, "logps/chosen": -347.00531005859375, "logps/rejected": -450.0355529785156, "loss": 0.4337, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9623804092407227, "rewards/margins": 1.0263848304748535, "rewards/rejected": -1.9887651205062866, "step": 860 }, { "epoch": 0.46, "grad_norm": 0.9765625, "learning_rate": 3.309815158425591e-06, "logits/chosen": -2.6927943229675293, "logits/rejected": -2.7125327587127686, "logps/chosen": -312.2998046875, "logps/rejected": -379.62017822265625, "loss": 0.43, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39604613184928894, "rewards/margins": 0.9399534463882446, "rewards/rejected": -1.335999608039856, "step": 870 }, { "epoch": 0.46, "grad_norm": 4.53125, "learning_rate": 3.266456535971654e-06, "logits/chosen": -2.790156602859497, "logits/rejected": -2.7961854934692383, "logps/chosen": -305.0512390136719, "logps/rejected": -366.155029296875, "loss": 0.4325, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.48138341307640076, "rewards/margins": 0.8310259580612183, "rewards/rejected": -1.3124094009399414, "step": 880 }, { "epoch": 0.47, "grad_norm": 1.0625, "learning_rate": 3.2228419232608692e-06, "logits/chosen": -2.701418399810791, "logits/rejected": -2.688974380493164, "logps/chosen": -316.68133544921875, "logps/rejected": -413.67156982421875, "loss": 0.4314, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6776655316352844, "rewards/margins": 0.9983049631118774, "rewards/rejected": -1.6759703159332275, "step": 890 }, { "epoch": 0.47, "grad_norm": 2.21875, "learning_rate": 3.1789858872195888e-06, "logits/chosen": -2.6642849445343018, "logits/rejected": -2.647975444793701, "logps/chosen": -365.28936767578125, "logps/rejected": -515.3648681640625, "loss": 0.438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0754855871200562, "rewards/margins": 1.3716323375701904, "rewards/rejected": -2.447117567062378, "step": 900 }, { "epoch": 0.48, "grad_norm": 3.625, "learning_rate": 3.1349030754075945e-06, "logits/chosen": -2.668508291244507, "logits/rejected": -2.6765646934509277, "logps/chosen": -357.05712890625, "logps/rejected": -427.643798828125, "loss": 0.4391, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8638874888420105, "rewards/margins": 0.9187320470809937, "rewards/rejected": -1.7826197147369385, "step": 910 }, { "epoch": 0.48, "grad_norm": 4.15625, "learning_rate": 3.0906082111259313e-06, "logits/chosen": -2.721989154815674, "logits/rejected": -2.7402305603027344, "logps/chosen": -341.566650390625, "logps/rejected": -425.6533203125, "loss": 0.4271, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6936538815498352, "rewards/margins": 1.0446574687957764, "rewards/rejected": -1.7383114099502563, "step": 920 }, { "epoch": 0.49, "grad_norm": 2.25, "learning_rate": 3.046116088499449e-06, "logits/chosen": -2.732478380203247, "logits/rejected": -2.7298645973205566, "logps/chosen": -366.4847717285156, "logps/rejected": -460.4178161621094, "loss": 0.4317, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7452308535575867, "rewards/margins": 1.1475111246109009, "rewards/rejected": -1.8927419185638428, "step": 930 }, { "epoch": 0.49, "grad_norm": 1.46875, "learning_rate": 3.0014415675356813e-06, "logits/chosen": -2.7274584770202637, "logits/rejected": -2.7278664112091064, "logps/chosen": -354.26751708984375, "logps/rejected": -469.7132873535156, "loss": 0.4211, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7849363088607788, "rewards/margins": 1.4399199485778809, "rewards/rejected": -2.224856376647949, "step": 940 }, { "epoch": 0.5, "grad_norm": 2.765625, "learning_rate": 2.9565995691617242e-06, "logits/chosen": -2.7352840900421143, "logits/rejected": -2.737842321395874, "logps/chosen": -358.52020263671875, "logps/rejected": -454.4913024902344, "loss": 0.4305, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7539668083190918, "rewards/margins": 1.1879950761795044, "rewards/rejected": -1.941961646080017, "step": 950 }, { "epoch": 0.5, "grad_norm": 0.9765625, "learning_rate": 2.9116050702407706e-06, "logits/chosen": -2.7479987144470215, "logits/rejected": -2.7785484790802, "logps/chosen": -313.64837646484375, "logps/rejected": -373.56689453125, "loss": 0.4343, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4597587585449219, "rewards/margins": 0.8521866798400879, "rewards/rejected": -1.3119454383850098, "step": 960 }, { "epoch": 0.51, "grad_norm": 2.171875, "learning_rate": 2.8664730985699537e-06, "logits/chosen": -2.7080225944519043, "logits/rejected": -2.717515707015991, "logps/chosen": -313.6962890625, "logps/rejected": -389.29205322265625, "loss": 0.4272, "rewards/accuracies": 0.734375, "rewards/chosen": -0.46188563108444214, "rewards/margins": 0.994129478931427, "rewards/rejected": -1.4560149908065796, "step": 970 }, { "epoch": 0.51, "grad_norm": 2.5, "learning_rate": 2.8212187278611907e-06, "logits/chosen": -2.7235171794891357, "logits/rejected": -2.736121416091919, "logps/chosen": -342.36273193359375, "logps/rejected": -417.322509765625, "loss": 0.4259, "rewards/accuracies": 0.75, "rewards/chosen": -0.6042460203170776, "rewards/margins": 1.0531026124954224, "rewards/rejected": -1.6573486328125, "step": 980 }, { "epoch": 0.52, "grad_norm": 1.7265625, "learning_rate": 2.7758570727066843e-06, "logits/chosen": -2.690983533859253, "logits/rejected": -2.6975481510162354, "logps/chosen": -342.63079833984375, "logps/rejected": -407.81231689453125, "loss": 0.4475, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.8637169599533081, "rewards/margins": 0.7765380144119263, "rewards/rejected": -1.6402549743652344, "step": 990 }, { "epoch": 0.52, "grad_norm": 1.7109375, "learning_rate": 2.730403283530767e-06, "logits/chosen": -2.6636836528778076, "logits/rejected": -2.661403179168701, "logps/chosen": -344.44744873046875, "logps/rejected": -413.1937561035156, "loss": 0.4248, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.7091799974441528, "rewards/margins": 1.0423099994659424, "rewards/rejected": -1.7514899969100952, "step": 1000 }, { "epoch": 0.53, "grad_norm": 4.21875, "learning_rate": 2.6848725415297888e-06, "logits/chosen": -2.6942477226257324, "logits/rejected": -2.7120838165283203, "logps/chosen": -336.60760498046875, "logps/rejected": -421.9783630371094, "loss": 0.4339, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.7389696836471558, "rewards/margins": 0.9657734036445618, "rewards/rejected": -1.7047427892684937, "step": 1010 }, { "epoch": 0.53, "grad_norm": 1.140625, "learning_rate": 2.639280053601719e-06, "logits/chosen": -2.700098991394043, "logits/rejected": -2.7320022583007812, "logps/chosen": -346.1884460449219, "logps/rejected": -410.5491638183594, "loss": 0.4324, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5768822431564331, "rewards/margins": 0.9592208862304688, "rewards/rejected": -1.5361031293869019, "step": 1020 }, { "epoch": 0.54, "grad_norm": 3.046875, "learning_rate": 2.59364104726716e-06, "logits/chosen": -2.7298641204833984, "logits/rejected": -2.73411226272583, "logps/chosen": -346.91912841796875, "logps/rejected": -417.67034912109375, "loss": 0.43, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7697745561599731, "rewards/margins": 1.0892785787582397, "rewards/rejected": -1.8590532541275024, "step": 1030 }, { "epoch": 0.54, "grad_norm": 2.609375, "learning_rate": 2.547970765583491e-06, "logits/chosen": -2.716035842895508, "logits/rejected": -2.702650547027588, "logps/chosen": -330.34124755859375, "logps/rejected": -416.9476623535156, "loss": 0.4345, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7259209156036377, "rewards/margins": 0.974997878074646, "rewards/rejected": -1.7009187936782837, "step": 1040 }, { "epoch": 0.55, "grad_norm": 1.390625, "learning_rate": 2.502284462053799e-06, "logits/chosen": -2.656066656112671, "logits/rejected": -2.6666572093963623, "logps/chosen": -331.7149658203125, "logps/rejected": -423.1683654785156, "loss": 0.4287, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5932595729827881, "rewards/margins": 1.035936713218689, "rewards/rejected": -1.6291964054107666, "step": 1050 }, { "epoch": 0.55, "grad_norm": 1.9296875, "learning_rate": 2.456597395532338e-06, "logits/chosen": -2.7209274768829346, "logits/rejected": -2.735020399093628, "logps/chosen": -328.7770080566406, "logps/rejected": -391.19354248046875, "loss": 0.4294, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6377438306808472, "rewards/margins": 0.9876799583435059, "rewards/rejected": -1.625423789024353, "step": 1060 }, { "epoch": 0.56, "grad_norm": 3.515625, "learning_rate": 2.4109248251281953e-06, "logits/chosen": -2.7338156700134277, "logits/rejected": -2.7391562461853027, "logps/chosen": -343.69390869140625, "logps/rejected": -414.94805908203125, "loss": 0.432, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6750501990318298, "rewards/margins": 1.0820457935333252, "rewards/rejected": -1.7570960521697998, "step": 1070 }, { "epoch": 0.57, "grad_norm": 0.99609375, "learning_rate": 2.365282005108875e-06, "logits/chosen": -2.7210652828216553, "logits/rejected": -2.7339107990264893, "logps/chosen": -335.1302490234375, "logps/rejected": -390.1886901855469, "loss": 0.4303, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6980403661727905, "rewards/margins": 1.0391963720321655, "rewards/rejected": -1.737236738204956, "step": 1080 }, { "epoch": 0.57, "grad_norm": 2.921875, "learning_rate": 2.319684179805491e-06, "logits/chosen": -2.6966023445129395, "logits/rejected": -2.7194454669952393, "logps/chosen": -343.26934814453125, "logps/rejected": -400.3363037109375, "loss": 0.4381, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8400837779045105, "rewards/margins": 0.9520319104194641, "rewards/rejected": -1.7921158075332642, "step": 1090 }, { "epoch": 0.58, "grad_norm": 2.0, "learning_rate": 2.2741465785212905e-06, "logits/chosen": -2.7013275623321533, "logits/rejected": -2.7162578105926514, "logps/chosen": -336.082763671875, "logps/rejected": -424.86895751953125, "loss": 0.4383, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7391124963760376, "rewards/margins": 0.9916101694107056, "rewards/rejected": -1.7307227849960327, "step": 1100 }, { "epoch": 0.58, "grad_norm": 1.359375, "learning_rate": 2.2286844104451848e-06, "logits/chosen": -2.7336266040802, "logits/rejected": -2.7385802268981934, "logps/chosen": -368.2712707519531, "logps/rejected": -442.3502502441406, "loss": 0.4393, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.8786946535110474, "rewards/margins": 0.9058005213737488, "rewards/rejected": -1.7844951152801514, "step": 1110 }, { "epoch": 0.59, "grad_norm": 1.1328125, "learning_rate": 2.183312859572008e-06, "logits/chosen": -2.6661019325256348, "logits/rejected": -2.687495708465576, "logps/chosen": -360.59503173828125, "logps/rejected": -445.728759765625, "loss": 0.4259, "rewards/accuracies": 0.75, "rewards/chosen": -0.6851642727851868, "rewards/margins": 1.106838345527649, "rewards/rejected": -1.7920026779174805, "step": 1120 }, { "epoch": 0.59, "grad_norm": 3.015625, "learning_rate": 2.1380470796311843e-06, "logits/chosen": -2.6705803871154785, "logits/rejected": -2.668128252029419, "logps/chosen": -339.4599609375, "logps/rejected": -421.2923278808594, "loss": 0.4272, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.642924964427948, "rewards/margins": 1.065707802772522, "rewards/rejected": -1.7086328268051147, "step": 1130 }, { "epoch": 0.6, "grad_norm": 3.96875, "learning_rate": 2.092902189025507e-06, "logits/chosen": -2.6446332931518555, "logits/rejected": -2.655050754547119, "logps/chosen": -355.1614074707031, "logps/rejected": -427.96636962890625, "loss": 0.4265, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7822728157043457, "rewards/margins": 1.1439166069030762, "rewards/rejected": -1.9261894226074219, "step": 1140 }, { "epoch": 0.6, "grad_norm": 0.98828125, "learning_rate": 2.0478932657817105e-06, "logits/chosen": -2.737699031829834, "logits/rejected": -2.732815742492676, "logps/chosen": -356.5775451660156, "logps/rejected": -431.93609619140625, "loss": 0.4282, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8207361102104187, "rewards/margins": 1.0940120220184326, "rewards/rejected": -1.914747953414917, "step": 1150 }, { "epoch": 0.61, "grad_norm": 2.546875, "learning_rate": 2.0030353425145376e-06, "logits/chosen": -2.676217555999756, "logits/rejected": -2.6970784664154053, "logps/chosen": -318.20355224609375, "logps/rejected": -386.79388427734375, "loss": 0.4289, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.741962730884552, "rewards/margins": 1.0068390369415283, "rewards/rejected": -1.748801589012146, "step": 1160 }, { "epoch": 0.61, "grad_norm": 0.7890625, "learning_rate": 1.958343401405964e-06, "logits/chosen": -2.6743831634521484, "logits/rejected": -2.6914896965026855, "logps/chosen": -323.74053955078125, "logps/rejected": -417.9308166503906, "loss": 0.4318, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6001055240631104, "rewards/margins": 1.030330777168274, "rewards/rejected": -1.6304363012313843, "step": 1170 }, { "epoch": 0.62, "grad_norm": 5.53125, "learning_rate": 1.9138323692012734e-06, "logits/chosen": -2.719978094100952, "logits/rejected": -2.7362709045410156, "logps/chosen": -316.5953063964844, "logps/rejected": -410.26593017578125, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": -0.5119751691818237, "rewards/margins": 1.1661722660064697, "rewards/rejected": -1.678147554397583, "step": 1180 }, { "epoch": 0.62, "grad_norm": 1.7890625, "learning_rate": 1.8695171122236443e-06, "logits/chosen": -2.6844494342803955, "logits/rejected": -2.6935813426971436, "logps/chosen": -319.3164367675781, "logps/rejected": -416.75616455078125, "loss": 0.4208, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.5857919454574585, "rewards/margins": 1.1580404043197632, "rewards/rejected": -1.7438323497772217, "step": 1190 }, { "epoch": 0.63, "grad_norm": 3.0625, "learning_rate": 1.8254124314089225e-06, "logits/chosen": -2.755115509033203, "logits/rejected": -2.7330613136291504, "logps/chosen": -322.7129821777344, "logps/rejected": -407.7967529296875, "loss": 0.4345, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4961959719657898, "rewards/margins": 1.078574776649475, "rewards/rejected": -1.5747709274291992, "step": 1200 }, { "epoch": 0.63, "grad_norm": 1.9375, "learning_rate": 1.781533057362221e-06, "logits/chosen": -2.746798038482666, "logits/rejected": -2.7739720344543457, "logps/chosen": -299.58160400390625, "logps/rejected": -374.47930908203125, "loss": 0.4296, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.4686378538608551, "rewards/margins": 0.9811908602714539, "rewards/rejected": -1.4498287439346313, "step": 1210 }, { "epoch": 0.64, "grad_norm": 3.296875, "learning_rate": 1.7378936454380277e-06, "logits/chosen": -2.7310540676116943, "logits/rejected": -2.7376887798309326, "logps/chosen": -327.89410400390625, "logps/rejected": -404.711669921875, "loss": 0.4279, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5759900808334351, "rewards/margins": 1.1363575458526611, "rewards/rejected": -1.7123476266860962, "step": 1220 }, { "epoch": 0.64, "grad_norm": 1.5390625, "learning_rate": 1.6945087708454273e-06, "logits/chosen": -2.6734468936920166, "logits/rejected": -2.714264392852783, "logps/chosen": -343.795654296875, "logps/rejected": -427.343505859375, "loss": 0.4219, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5730609893798828, "rewards/margins": 1.2771522998809814, "rewards/rejected": -1.8502132892608643, "step": 1230 }, { "epoch": 0.65, "grad_norm": 1.8125, "learning_rate": 1.651392923780105e-06, "logits/chosen": -2.6670920848846436, "logits/rejected": -2.6546778678894043, "logps/chosen": -322.3703918457031, "logps/rejected": -434.3629455566406, "loss": 0.4254, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.6311038136482239, "rewards/margins": 1.2766520977020264, "rewards/rejected": -1.9077558517456055, "step": 1240 }, { "epoch": 0.65, "grad_norm": 2.59375, "learning_rate": 1.608560504584737e-06, "logits/chosen": -2.737743377685547, "logits/rejected": -2.720360517501831, "logps/chosen": -334.63275146484375, "logps/rejected": -431.16827392578125, "loss": 0.4287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8308721780776978, "rewards/margins": 1.0826470851898193, "rewards/rejected": -1.913519263267517, "step": 1250 }, { "epoch": 0.66, "grad_norm": 5.21875, "learning_rate": 1.5660258189393945e-06, "logits/chosen": -2.7399215698242188, "logits/rejected": -2.750929117202759, "logps/chosen": -338.90924072265625, "logps/rejected": -417.3045959472656, "loss": 0.4214, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5204821825027466, "rewards/margins": 1.254575252532959, "rewards/rejected": -1.7750571966171265, "step": 1260 }, { "epoch": 0.66, "grad_norm": 5.96875, "learning_rate": 1.5238030730835578e-06, "logits/chosen": -2.716184616088867, "logits/rejected": -2.690531015396118, "logps/chosen": -311.8260192871094, "logps/rejected": -440.0191345214844, "loss": 0.4252, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6099028587341309, "rewards/margins": 1.3696248531341553, "rewards/rejected": -1.979527473449707, "step": 1270 }, { "epoch": 0.67, "grad_norm": 8.5625, "learning_rate": 1.4819063690713565e-06, "logits/chosen": -2.723191022872925, "logits/rejected": -2.699627637863159, "logps/chosen": -343.42608642578125, "logps/rejected": -442.84033203125, "loss": 0.424, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7352994680404663, "rewards/margins": 1.2973926067352295, "rewards/rejected": -2.0326919555664062, "step": 1280 }, { "epoch": 0.68, "grad_norm": 1.3359375, "learning_rate": 1.4403497000615885e-06, "logits/chosen": -2.709167242050171, "logits/rejected": -2.711165189743042, "logps/chosen": -346.2407531738281, "logps/rejected": -453.6136779785156, "loss": 0.4365, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9276860952377319, "rewards/margins": 1.1097049713134766, "rewards/rejected": -2.037391185760498, "step": 1290 }, { "epoch": 0.68, "grad_norm": 3.109375, "learning_rate": 1.3991469456441273e-06, "logits/chosen": -2.6968870162963867, "logits/rejected": -2.7066521644592285, "logps/chosen": -349.227294921875, "logps/rejected": -449.37274169921875, "loss": 0.4264, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.8388339281082153, "rewards/margins": 1.244845986366272, "rewards/rejected": -2.083679676055908, "step": 1300 }, { "epoch": 0.69, "grad_norm": 1.90625, "learning_rate": 1.3583118672042441e-06, "logits/chosen": -2.682274580001831, "logits/rejected": -2.689542055130005, "logps/chosen": -358.6896057128906, "logps/rejected": -461.7701721191406, "loss": 0.4298, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8582962155342102, "rewards/margins": 1.2384599447250366, "rewards/rejected": -2.0967559814453125, "step": 1310 }, { "epoch": 0.69, "grad_norm": 2.734375, "learning_rate": 1.3178581033264218e-06, "logits/chosen": -2.7299036979675293, "logits/rejected": -2.725554943084717, "logps/chosen": -337.6006774902344, "logps/rejected": -427.9097595214844, "loss": 0.4254, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8014458417892456, "rewards/margins": 1.1019701957702637, "rewards/rejected": -1.9034160375595093, "step": 1320 }, { "epoch": 0.7, "grad_norm": 5.71875, "learning_rate": 1.2777991652391757e-06, "logits/chosen": -2.7372395992279053, "logits/rejected": -2.713787078857422, "logps/chosen": -349.9215087890625, "logps/rejected": -458.30218505859375, "loss": 0.4146, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.605326235294342, "rewards/margins": 1.2629462480545044, "rewards/rejected": -1.8682724237442017, "step": 1330 }, { "epoch": 0.7, "grad_norm": 2.796875, "learning_rate": 1.2381484323024178e-06, "logits/chosen": -2.710744619369507, "logits/rejected": -2.7276930809020996, "logps/chosen": -340.2386779785156, "logps/rejected": -410.47406005859375, "loss": 0.4286, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.6984527111053467, "rewards/margins": 0.98872309923172, "rewards/rejected": -1.687175989151001, "step": 1340 }, { "epoch": 0.71, "grad_norm": 4.375, "learning_rate": 1.1989191475388518e-06, "logits/chosen": -2.7345008850097656, "logits/rejected": -2.737495183944702, "logps/chosen": -341.25408935546875, "logps/rejected": -463.845703125, "loss": 0.423, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.720673680305481, "rewards/margins": 1.3686655759811401, "rewards/rejected": -2.089339256286621, "step": 1350 }, { "epoch": 0.71, "grad_norm": 1.484375, "learning_rate": 1.160124413210918e-06, "logits/chosen": -2.710549831390381, "logits/rejected": -2.7327733039855957, "logps/chosen": -357.73516845703125, "logps/rejected": -460.0575256347656, "loss": 0.4351, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.7213457822799683, "rewards/margins": 1.1925950050354004, "rewards/rejected": -1.9139407873153687, "step": 1360 }, { "epoch": 0.72, "grad_norm": 5.8125, "learning_rate": 1.1217771864447396e-06, "logits/chosen": -2.745375871658325, "logits/rejected": -2.734910488128662, "logps/chosen": -334.57159423828125, "logps/rejected": -431.4033203125, "loss": 0.4065, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6131495833396912, "rewards/margins": 1.3364530801773071, "rewards/rejected": -1.949602484703064, "step": 1370 }, { "epoch": 0.72, "grad_norm": 3.5, "learning_rate": 1.08389027490255e-06, "logits/chosen": -2.7352352142333984, "logits/rejected": -2.731990098953247, "logps/chosen": -318.87939453125, "logps/rejected": -447.78167724609375, "loss": 0.4207, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.6276998519897461, "rewards/margins": 1.426599383354187, "rewards/rejected": -2.0542993545532227, "step": 1380 }, { "epoch": 0.73, "grad_norm": 3.765625, "learning_rate": 1.046476332505036e-06, "logits/chosen": -2.7105917930603027, "logits/rejected": -2.7000508308410645, "logps/chosen": -351.5965881347656, "logps/rejected": -464.24041748046875, "loss": 0.4229, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.666134238243103, "rewards/margins": 1.3925716876983643, "rewards/rejected": -2.0587058067321777, "step": 1390 }, { "epoch": 0.73, "grad_norm": 1.78125, "learning_rate": 1.0095478552050348e-06, "logits/chosen": -2.7293667793273926, "logits/rejected": -2.748169422149658, "logps/chosen": -347.258056640625, "logps/rejected": -454.30126953125, "loss": 0.428, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7874716520309448, "rewards/margins": 1.2901605367660522, "rewards/rejected": -2.077632188796997, "step": 1400 }, { "epoch": 0.74, "grad_norm": 3.6875, "learning_rate": 9.731171768139808e-07, "logits/chosen": -2.747467279434204, "logits/rejected": -2.7329678535461426, "logps/chosen": -328.9295959472656, "logps/rejected": -426.88812255859375, "loss": 0.4343, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7400668859481812, "rewards/margins": 1.056199073791504, "rewards/rejected": -1.796265959739685, "step": 1410 }, { "epoch": 0.74, "grad_norm": 2.40625, "learning_rate": 9.371964648825221e-07, "logits/chosen": -2.7056431770324707, "logits/rejected": -2.711988925933838, "logps/chosen": -358.60186767578125, "logps/rejected": -416.0870056152344, "loss": 0.4388, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9142545461654663, "rewards/margins": 0.8788741827011108, "rewards/rejected": -1.7931289672851562, "step": 1420 }, { "epoch": 0.75, "grad_norm": 3.09375, "learning_rate": 9.017977166366445e-07, "logits/chosen": -2.712580919265747, "logits/rejected": -2.7144558429718018, "logps/chosen": -340.9812316894531, "logps/rejected": -484.244873046875, "loss": 0.4154, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7170324921607971, "rewards/margins": 1.5092626810073853, "rewards/rejected": -2.2262954711914062, "step": 1430 }, { "epoch": 0.75, "grad_norm": 4.5625, "learning_rate": 8.669327549707096e-07, "logits/chosen": -2.7783093452453613, "logits/rejected": -2.782752513885498, "logps/chosen": -336.6181335449219, "logps/rejected": -427.4580078125, "loss": 0.4337, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7095115184783936, "rewards/margins": 1.0761725902557373, "rewards/rejected": -1.7856842279434204, "step": 1440 }, { "epoch": 0.76, "grad_norm": 6.625, "learning_rate": 8.326132244986932e-07, "logits/chosen": -2.7186310291290283, "logits/rejected": -2.7277238368988037, "logps/chosen": -330.8462219238281, "logps/rejected": -445.0597229003906, "loss": 0.4328, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7316503524780273, "rewards/margins": 1.2994225025177002, "rewards/rejected": -2.0310728549957275, "step": 1450 }, { "epoch": 0.76, "grad_norm": 6.40625, "learning_rate": 7.988505876649863e-07, "logits/chosen": -2.673827648162842, "logits/rejected": -2.6909382343292236, "logps/chosen": -328.7157287597656, "logps/rejected": -426.40997314453125, "loss": 0.4208, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6509988307952881, "rewards/margins": 1.171143651008606, "rewards/rejected": -1.8221423625946045, "step": 1460 }, { "epoch": 0.77, "grad_norm": 5.03125, "learning_rate": 7.656561209160248e-07, "logits/chosen": -2.688586711883545, "logits/rejected": -2.6996278762817383, "logps/chosen": -335.6667785644531, "logps/rejected": -412.533447265625, "loss": 0.4229, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6491155624389648, "rewards/margins": 1.1151740550994873, "rewards/rejected": -1.7642898559570312, "step": 1470 }, { "epoch": 0.77, "grad_norm": 5.71875, "learning_rate": 7.330409109340563e-07, "logits/chosen": -2.7187418937683105, "logits/rejected": -2.713865041732788, "logps/chosen": -330.7655334472656, "logps/rejected": -445.03485107421875, "loss": 0.421, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6798223257064819, "rewards/margins": 1.387519121170044, "rewards/rejected": -2.0673413276672363, "step": 1480 }, { "epoch": 0.78, "grad_norm": 4.21875, "learning_rate": 7.010158509342682e-07, "logits/chosen": -2.705104351043701, "logits/rejected": -2.7204995155334473, "logps/chosen": -348.8349304199219, "logps/rejected": -465.5157775878906, "loss": 0.4239, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7643205523490906, "rewards/margins": 1.3054136037826538, "rewards/rejected": -2.0697340965270996, "step": 1490 }, { "epoch": 0.79, "grad_norm": 3.25, "learning_rate": 6.695916370265529e-07, "logits/chosen": -2.7456018924713135, "logits/rejected": -2.75111722946167, "logps/chosen": -298.90582275390625, "logps/rejected": -406.1940002441406, "loss": 0.4243, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6092133522033691, "rewards/margins": 1.119093894958496, "rewards/rejected": -1.7283073663711548, "step": 1500 }, { "epoch": 0.79, "grad_norm": 3.75, "learning_rate": 6.387787646430854e-07, "logits/chosen": -2.7123939990997314, "logits/rejected": -2.7206320762634277, "logps/chosen": -311.45892333984375, "logps/rejected": -402.0213317871094, "loss": 0.4307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5885223746299744, "rewards/margins": 1.1126749515533447, "rewards/rejected": -1.7011972665786743, "step": 1510 }, { "epoch": 0.8, "grad_norm": 2.28125, "learning_rate": 6.085875250329401e-07, "logits/chosen": -2.7589831352233887, "logits/rejected": -2.747678756713867, "logps/chosen": -326.8139953613281, "logps/rejected": -408.4124450683594, "loss": 0.4281, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6763142347335815, "rewards/margins": 1.2141085863113403, "rewards/rejected": -1.8904228210449219, "step": 1520 }, { "epoch": 0.8, "grad_norm": 4.4375, "learning_rate": 5.79028001824894e-07, "logits/chosen": -2.676466941833496, "logits/rejected": -2.6653060913085938, "logps/chosen": -302.88861083984375, "logps/rejected": -453.6151428222656, "loss": 0.4205, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5426191687583923, "rewards/margins": 1.233906626701355, "rewards/rejected": -1.776525855064392, "step": 1530 }, { "epoch": 0.81, "grad_norm": 2.265625, "learning_rate": 5.501100676595761e-07, "logits/chosen": -2.7586090564727783, "logits/rejected": -2.7702584266662598, "logps/chosen": -345.1263732910156, "logps/rejected": -439.22808837890625, "loss": 0.4249, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6815992593765259, "rewards/margins": 1.174318790435791, "rewards/rejected": -1.8559181690216064, "step": 1540 }, { "epoch": 0.81, "grad_norm": 3.3125, "learning_rate": 5.218433808920884e-07, "logits/chosen": -2.741339921951294, "logits/rejected": -2.7451634407043457, "logps/chosen": -329.6106262207031, "logps/rejected": -418.102294921875, "loss": 0.418, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.48116618394851685, "rewards/margins": 1.2408123016357422, "rewards/rejected": -1.7219784259796143, "step": 1550 }, { "epoch": 0.82, "grad_norm": 2.78125, "learning_rate": 4.942373823661928e-07, "logits/chosen": -2.7156219482421875, "logits/rejected": -2.7195382118225098, "logps/chosen": -338.1180114746094, "logps/rejected": -429.4137268066406, "loss": 0.4242, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5551623702049255, "rewards/margins": 1.1304242610931396, "rewards/rejected": -1.6855865716934204, "step": 1560 }, { "epoch": 0.82, "grad_norm": 0.64453125, "learning_rate": 4.6730129226114363e-07, "logits/chosen": -2.74899959564209, "logits/rejected": -2.7066941261291504, "logps/chosen": -316.0484619140625, "logps/rejected": -443.41156005859375, "loss": 0.4244, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.6658861637115479, "rewards/margins": 1.3104979991912842, "rewards/rejected": -1.976383924484253, "step": 1570 }, { "epoch": 0.83, "grad_norm": 3.03125, "learning_rate": 4.4104410701222703e-07, "logits/chosen": -2.764725923538208, "logits/rejected": -2.7511062622070312, "logps/chosen": -345.6468200683594, "logps/rejected": -458.3433532714844, "loss": 0.4297, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.7146095037460327, "rewards/margins": 1.2348105907440186, "rewards/rejected": -1.9494202136993408, "step": 1580 }, { "epoch": 0.83, "grad_norm": 2.609375, "learning_rate": 4.154745963060197e-07, "logits/chosen": -2.690899610519409, "logits/rejected": -2.68369460105896, "logps/chosen": -327.4097900390625, "logps/rejected": -445.50457763671875, "loss": 0.4267, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.7736159563064575, "rewards/margins": 1.229644775390625, "rewards/rejected": -2.003260850906372, "step": 1590 }, { "epoch": 0.84, "grad_norm": 2.640625, "learning_rate": 3.9060130015138863e-07, "logits/chosen": -2.7254629135131836, "logits/rejected": -2.7183403968811035, "logps/chosen": -338.1430358886719, "logps/rejected": -464.69049072265625, "loss": 0.4245, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7131890058517456, "rewards/margins": 1.2739613056182861, "rewards/rejected": -1.987149953842163, "step": 1600 }, { "epoch": 0.84, "grad_norm": 1.015625, "learning_rate": 3.664325260271953e-07, "logits/chosen": -2.7088608741760254, "logits/rejected": -2.730421304702759, "logps/chosen": -357.98126220703125, "logps/rejected": -414.66668701171875, "loss": 0.4309, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7439313530921936, "rewards/margins": 0.9089797139167786, "rewards/rejected": -1.652910828590393, "step": 1610 }, { "epoch": 0.85, "grad_norm": 4.28125, "learning_rate": 3.429763461076677e-07, "logits/chosen": -2.7392077445983887, "logits/rejected": -2.7532036304473877, "logps/chosen": -340.41436767578125, "logps/rejected": -411.6546936035156, "loss": 0.4351, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.7221731543540955, "rewards/margins": 0.9596914052963257, "rewards/rejected": -1.6818645000457764, "step": 1620 }, { "epoch": 0.85, "grad_norm": 6.1875, "learning_rate": 3.202405945663556e-07, "logits/chosen": -2.739431858062744, "logits/rejected": -2.7153592109680176, "logps/chosen": -351.1690368652344, "logps/rejected": -460.6399841308594, "loss": 0.4206, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7035288214683533, "rewards/margins": 1.3128876686096191, "rewards/rejected": -2.016416549682617, "step": 1630 }, { "epoch": 0.86, "grad_norm": 4.34375, "learning_rate": 2.982328649595856e-07, "logits/chosen": -2.7347230911254883, "logits/rejected": -2.7274715900421143, "logps/chosen": -338.5975646972656, "logps/rejected": -435.02972412109375, "loss": 0.4329, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.722823441028595, "rewards/margins": 1.025390625, "rewards/rejected": -1.7482140064239502, "step": 1640 }, { "epoch": 0.86, "grad_norm": 2.75, "learning_rate": 2.7696050769026954e-07, "logits/chosen": -2.7211785316467285, "logits/rejected": -2.684309720993042, "logps/chosen": -344.1480407714844, "logps/rejected": -481.3314514160156, "loss": 0.4299, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7990375757217407, "rewards/margins": 1.399259328842163, "rewards/rejected": -2.1982970237731934, "step": 1650 }, { "epoch": 0.87, "grad_norm": 2.015625, "learning_rate": 2.564306275529341e-07, "logits/chosen": -2.7370145320892334, "logits/rejected": -2.7407755851745605, "logps/chosen": -321.891357421875, "logps/rejected": -401.0546569824219, "loss": 0.4248, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7101179957389832, "rewards/margins": 1.1242733001708984, "rewards/rejected": -1.8343912363052368, "step": 1660 }, { "epoch": 0.87, "grad_norm": 3.671875, "learning_rate": 2.3665008136077332e-07, "logits/chosen": -2.7449889183044434, "logits/rejected": -2.7408945560455322, "logps/chosen": -360.5602722167969, "logps/rejected": -467.7608337402344, "loss": 0.4302, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.823918342590332, "rewards/margins": 1.2585489749908447, "rewards/rejected": -2.0824673175811768, "step": 1670 }, { "epoch": 0.88, "grad_norm": 4.25, "learning_rate": 2.1762547565553293e-07, "logits/chosen": -2.6576642990112305, "logits/rejected": -2.6262636184692383, "logps/chosen": -352.53857421875, "logps/rejected": -485.3980407714844, "loss": 0.419, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7759647965431213, "rewards/margins": 1.4105063676834106, "rewards/rejected": -2.1864712238311768, "step": 1680 }, { "epoch": 0.88, "grad_norm": 1.2578125, "learning_rate": 1.993631645009747e-07, "logits/chosen": -2.70914888381958, "logits/rejected": -2.6992902755737305, "logps/chosen": -348.275390625, "logps/rejected": -432.3793029785156, "loss": 0.4294, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9621282815933228, "rewards/margins": 1.0199404954910278, "rewards/rejected": -1.982068657875061, "step": 1690 }, { "epoch": 0.89, "grad_norm": 4.125, "learning_rate": 1.818692473606748e-07, "logits/chosen": -2.749687671661377, "logits/rejected": -2.743114948272705, "logps/chosen": -356.88311767578125, "logps/rejected": -437.4881896972656, "loss": 0.4101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5851279497146606, "rewards/margins": 1.345589280128479, "rewards/rejected": -1.93071711063385, "step": 1700 }, { "epoch": 0.9, "grad_norm": 3.015625, "learning_rate": 1.6514956706084885e-07, "logits/chosen": -2.731550931930542, "logits/rejected": -2.7247581481933594, "logps/chosen": -367.1907653808594, "logps/rejected": -460.9814453125, "loss": 0.4151, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.8159070014953613, "rewards/margins": 1.3104612827301025, "rewards/rejected": -2.1263680458068848, "step": 1710 }, { "epoch": 0.9, "grad_norm": 1.921875, "learning_rate": 1.4920970783889737e-07, "logits/chosen": -2.728755235671997, "logits/rejected": -2.716214895248413, "logps/chosen": -326.6175842285156, "logps/rejected": -425.10528564453125, "loss": 0.4179, "rewards/accuracies": 0.75, "rewards/chosen": -0.5755528807640076, "rewards/margins": 1.2382352352142334, "rewards/rejected": -1.8137880563735962, "step": 1720 }, { "epoch": 0.91, "grad_norm": 1.921875, "learning_rate": 1.340549934783164e-07, "logits/chosen": -2.7430858612060547, "logits/rejected": -2.7155654430389404, "logps/chosen": -319.4033508300781, "logps/rejected": -460.38494873046875, "loss": 0.4219, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5762430429458618, "rewards/margins": 1.5360796451568604, "rewards/rejected": -2.1123225688934326, "step": 1730 }, { "epoch": 0.91, "grad_norm": 0.8046875, "learning_rate": 1.196904855305961e-07, "logits/chosen": -2.7446229457855225, "logits/rejected": -2.741819143295288, "logps/chosen": -347.061279296875, "logps/rejected": -454.8154296875, "loss": 0.4227, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.7210251688957214, "rewards/margins": 1.3714605569839478, "rewards/rejected": -2.0924859046936035, "step": 1740 }, { "epoch": 0.92, "grad_norm": 6.78125, "learning_rate": 1.0612098162470302e-07, "logits/chosen": -2.754495143890381, "logits/rejected": -2.770519733428955, "logps/chosen": -356.2037658691406, "logps/rejected": -428.56915283203125, "loss": 0.4204, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6976330876350403, "rewards/margins": 1.1086335182189941, "rewards/rejected": -1.8062665462493896, "step": 1750 }, { "epoch": 0.92, "grad_norm": 1.703125, "learning_rate": 9.335101386471285e-08, "logits/chosen": -2.7438502311706543, "logits/rejected": -2.732393741607666, "logps/chosen": -325.47747802734375, "logps/rejected": -466.6300354003906, "loss": 0.4096, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5821502208709717, "rewards/margins": 1.489473819732666, "rewards/rejected": -2.0716240406036377, "step": 1760 }, { "epoch": 0.93, "grad_norm": 1.125, "learning_rate": 8.138484731612273e-08, "logits/chosen": -2.6938061714172363, "logits/rejected": -2.6772360801696777, "logps/chosen": -339.33856201171875, "logps/rejected": -459.6874084472656, "loss": 0.4259, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.7721551060676575, "rewards/margins": 1.2288380861282349, "rewards/rejected": -2.000993251800537, "step": 1770 }, { "epoch": 0.93, "grad_norm": 5.15625, "learning_rate": 7.022647858135501e-08, "logits/chosen": -2.744938850402832, "logits/rejected": -2.7732322216033936, "logps/chosen": -332.72509765625, "logps/rejected": -421.79949951171875, "loss": 0.4295, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7302526235580444, "rewards/margins": 1.0942634344100952, "rewards/rejected": -1.82451593875885, "step": 1780 }, { "epoch": 0.94, "grad_norm": 1.7734375, "learning_rate": 5.987963446492384e-08, "logits/chosen": -2.6715445518493652, "logits/rejected": -2.6765220165252686, "logps/chosen": -348.5739440917969, "logps/rejected": -482.542236328125, "loss": 0.4117, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6485568284988403, "rewards/margins": 1.4271323680877686, "rewards/rejected": -2.0756890773773193, "step": 1790 }, { "epoch": 0.94, "grad_norm": 4.46875, "learning_rate": 5.034777072871394e-08, "logits/chosen": -2.7057948112487793, "logits/rejected": -2.7085628509521484, "logps/chosen": -318.53497314453125, "logps/rejected": -424.2813415527344, "loss": 0.4289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7039138674736023, "rewards/margins": 1.1032226085662842, "rewards/rejected": -1.8071365356445312, "step": 1800 }, { "epoch": 0.95, "grad_norm": 2.5625, "learning_rate": 4.163407093778243e-08, "logits/chosen": -2.6782615184783936, "logits/rejected": -2.6741080284118652, "logps/chosen": -333.125732421875, "logps/rejected": -420.4185485839844, "loss": 0.4345, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.8739233016967773, "rewards/margins": 1.0754514932632446, "rewards/rejected": -1.949374794960022, "step": 1810 }, { "epoch": 0.95, "grad_norm": 2.3125, "learning_rate": 3.37414453970758e-08, "logits/chosen": -2.742910146713257, "logits/rejected": -2.7536118030548096, "logps/chosen": -337.20489501953125, "logps/rejected": -413.38238525390625, "loss": 0.4246, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6338103413581848, "rewards/margins": 1.2298805713653564, "rewards/rejected": -1.863690972328186, "step": 1820 }, { "epoch": 0.96, "grad_norm": 7.125, "learning_rate": 2.6672530179410183e-08, "logits/chosen": -2.7278337478637695, "logits/rejected": -2.7276864051818848, "logps/chosen": -324.9429626464844, "logps/rejected": -436.3682556152344, "loss": 0.4287, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.6213011145591736, "rewards/margins": 1.2622824907302856, "rewards/rejected": -1.883583426475525, "step": 1830 }, { "epoch": 0.96, "grad_norm": 1.5859375, "learning_rate": 2.04296862450451e-08, "logits/chosen": -2.7341113090515137, "logits/rejected": -2.7228341102600098, "logps/chosen": -359.82659912109375, "logps/rejected": -492.15771484375, "loss": 0.4246, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8840007781982422, "rewards/margins": 1.547300100326538, "rewards/rejected": -2.431300640106201, "step": 1840 }, { "epoch": 0.97, "grad_norm": 1.3046875, "learning_rate": 1.501499865314171e-08, "logits/chosen": -2.655301809310913, "logits/rejected": -2.6794886589050293, "logps/chosen": -379.09075927734375, "logps/rejected": -465.2655334472656, "loss": 0.4153, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.6638426184654236, "rewards/margins": 1.2455824613571167, "rewards/rejected": -1.909425139427185, "step": 1850 }, { "epoch": 0.97, "grad_norm": 1.8046875, "learning_rate": 1.0430275865371265e-08, "logits/chosen": -2.700707197189331, "logits/rejected": -2.6735973358154297, "logps/chosen": -324.67864990234375, "logps/rejected": -426.3045959472656, "loss": 0.4269, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7671259641647339, "rewards/margins": 1.2393693923950195, "rewards/rejected": -2.006495475769043, "step": 1860 }, { "epoch": 0.98, "grad_norm": 2.96875, "learning_rate": 6.677049141901315e-09, "logits/chosen": -2.714787006378174, "logits/rejected": -2.729104518890381, "logps/chosen": -332.42156982421875, "logps/rejected": -465.78497314453125, "loss": 0.4213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6763681769371033, "rewards/margins": 1.4941723346710205, "rewards/rejected": -2.1705403327941895, "step": 1870 }, { "epoch": 0.98, "grad_norm": 1.4140625, "learning_rate": 3.756572029968708e-09, "logits/chosen": -2.7444651126861572, "logits/rejected": -2.756309986114502, "logps/chosen": -348.5865783691406, "logps/rejected": -453.53350830078125, "loss": 0.4206, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7736285328865051, "rewards/margins": 1.2819445133209229, "rewards/rejected": -2.055572986602783, "step": 1880 }, { "epoch": 0.99, "grad_norm": 3.234375, "learning_rate": 1.6698199452053199e-09, "logits/chosen": -2.7279868125915527, "logits/rejected": -2.7107343673706055, "logps/chosen": -339.8231506347656, "logps/rejected": -435.91436767578125, "loss": 0.4382, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6851938962936401, "rewards/margins": 0.9626883268356323, "rewards/rejected": -1.6478822231292725, "step": 1890 }, { "epoch": 0.99, "grad_norm": 1.25, "learning_rate": 4.1748984585560094e-10, "logits/chosen": -2.7459967136383057, "logits/rejected": -2.7115063667297363, "logps/chosen": -353.6007385253906, "logps/rejected": -478.2798767089844, "loss": 0.4255, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7070942521095276, "rewards/margins": 1.4682670831680298, "rewards/rejected": -2.175361156463623, "step": 1900 }, { "epoch": 1.0, "grad_norm": 0.84765625, "learning_rate": 0.0, "logits/chosen": -2.7471261024475098, "logits/rejected": -2.7326393127441406, "logps/chosen": -346.9029846191406, "logps/rejected": -450.1078186035156, "loss": 0.4197, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7211757302284241, "rewards/margins": 1.336022973060608, "rewards/rejected": -2.0571985244750977, "step": 1910 }, { "epoch": 1.0, "step": 1910, "total_flos": 0.0, "train_loss": 0.4411179514611579, "train_runtime": 83930.2059, "train_samples_per_second": 0.728, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }