diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,40 +1,2905 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9997382884061764, "eval_steps": 100, - "global_step": 2, + "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.5, - "grad_norm": 0.58984375, - "learning_rate": 5e-06, - "logits/chosen": -3.1927170753479004, - "logits/rejected": -3.1942551136016846, - "logps/chosen": -293.9735107421875, - "logps/rejected": -260.54400634765625, + "epoch": 0.0, + "grad_norm": 0.58203125, + "learning_rate": 2.617801047120419e-08, + "logits/chosen": -3.1532161235809326, + "logits/rejected": -3.1690337657928467, + "logps/chosen": -305.45306396484375, + "logps/rejected": -294.4603576660156, "loss": 0.5, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.0009545064531266689, - "rewards/margins": -0.0002557558473199606, - "rewards/rejected": -0.0006987505475990474, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0007838421151973307, + "rewards/margins": -0.00040248289587907493, + "rewards/rejected": -0.000381359423045069, "step": 1 }, + { + "epoch": 0.01, + "grad_norm": 0.5390625, + "learning_rate": 2.617801047120419e-07, + "logits/chosen": -3.177987813949585, + "logits/rejected": -3.2059593200683594, + "logps/chosen": -299.1102294921875, + "logps/rejected": -249.10623168945312, + "loss": 0.5001, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": -0.0008526805322617292, + "rewards/margins": -0.00045007685548625886, + "rewards/rejected": -0.0004026036476716399, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.59765625, + "learning_rate": 5.235602094240838e-07, + "logits/chosen": -3.1716275215148926, + "logits/rejected": -3.166067123413086, + "logps/chosen": -238.83120727539062, + "logps/rejected": -244.2283935546875, + "loss": 0.5, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 0.0001240858546225354, + "rewards/margins": 5.3543342801276594e-05, + "rewards/rejected": 7.054249726934358e-05, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.765625, + "learning_rate": 7.853403141361258e-07, + "logits/chosen": -3.194286823272705, + "logits/rejected": -3.2046267986297607, + "logps/chosen": -268.1184387207031, + "logps/rejected": -239.86087036132812, + "loss": 0.4997, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.002198445377871394, + "rewards/margins": 0.0013555358164012432, + "rewards/rejected": 0.0008429096196778119, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.72265625, + "learning_rate": 1.0471204188481676e-06, + "logits/chosen": -3.1798417568206787, + "logits/rejected": -3.185044765472412, + "logps/chosen": -273.47900390625, + "logps/rejected": -255.7032928466797, + "loss": 0.4993, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": 0.005988434888422489, + "rewards/margins": 0.0028830617666244507, + "rewards/rejected": 0.003105373587459326, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 0.7578125, + "learning_rate": 1.3089005235602096e-06, + "logits/chosen": -3.162355899810791, + "logits/rejected": -3.1799404621124268, + "logps/chosen": -256.9862060546875, + "logps/rejected": -239.87069702148438, + "loss": 0.4985, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.012596851214766502, + "rewards/margins": 0.006152496673166752, + "rewards/rejected": 0.0064443545415997505, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 0.73046875, + "learning_rate": 1.5706806282722515e-06, + "logits/chosen": -3.1871049404144287, + "logits/rejected": -3.200637102127075, + "logps/chosen": -294.3240661621094, + "logps/rejected": -262.1870422363281, + "loss": 0.4969, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.022759366780519485, + "rewards/margins": 0.012819233350455761, + "rewards/rejected": 0.009940135292708874, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 0.75390625, + "learning_rate": 1.8324607329842933e-06, + "logits/chosen": -3.1636972427368164, + "logits/rejected": -3.161069869995117, + "logps/chosen": -266.68853759765625, + "logps/rejected": -243.20263671875, + "loss": 0.496, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": 0.031995244324207306, + "rewards/margins": 0.016239028424024582, + "rewards/rejected": 0.015756219625473022, + "step": 70 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 2.094240837696335e-06, + "logits/chosen": -3.1705234050750732, + "logits/rejected": -3.1865649223327637, + "logps/chosen": -271.360595703125, + "logps/rejected": -252.78170776367188, + "loss": 0.4962, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.038576819002628326, + "rewards/margins": 0.015578309074044228, + "rewards/rejected": 0.022998513653874397, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 0.72265625, + "learning_rate": 2.356020942408377e-06, + "logits/chosen": -3.166350841522217, + "logits/rejected": -3.1725335121154785, + "logps/chosen": -240.39999389648438, + "logps/rejected": -236.23782348632812, + "loss": 0.4956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.035779114812612534, + "rewards/margins": 0.018319377675652504, + "rewards/rejected": 0.01745973899960518, + "step": 90 + }, + { + "epoch": 0.05, + "grad_norm": 0.7109375, + "learning_rate": 2.617801047120419e-06, + "logits/chosen": -3.169689655303955, + "logits/rejected": -3.2062766551971436, + "logps/chosen": -260.24462890625, + "logps/rejected": -230.7229766845703, + "loss": 0.4929, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.04880410060286522, + "rewards/margins": 0.029587719589471817, + "rewards/rejected": 0.0192163847386837, + "step": 100 + }, + { + "epoch": 0.06, + "grad_norm": 0.65625, + "learning_rate": 2.8795811518324613e-06, + "logits/chosen": -3.1644022464752197, + "logits/rejected": -3.178515672683716, + "logps/chosen": -257.0642395019531, + "logps/rejected": -233.0090789794922, + "loss": 0.4904, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": 0.05435089394450188, + "rewards/margins": 0.03999961167573929, + "rewards/rejected": 0.014351281337440014, + "step": 110 + }, + { + "epoch": 0.06, + "grad_norm": 0.7265625, + "learning_rate": 3.141361256544503e-06, + "logits/chosen": -3.1647660732269287, + "logits/rejected": -3.181644916534424, + "logps/chosen": -300.6939392089844, + "logps/rejected": -279.0010070800781, + "loss": 0.4918, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": 0.0594431571662426, + "rewards/margins": 0.035575076937675476, + "rewards/rejected": 0.023868080228567123, + "step": 120 + }, + { + "epoch": 0.07, + "grad_norm": 0.6171875, + "learning_rate": 3.403141361256545e-06, + "logits/chosen": -3.1200623512268066, + "logits/rejected": -3.1410274505615234, + "logps/chosen": -265.76513671875, + "logps/rejected": -246.6106414794922, + "loss": 0.4896, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": 0.05789119750261307, + "rewards/margins": 0.045422304421663284, + "rewards/rejected": 0.012468896806240082, + "step": 130 + }, + { + "epoch": 0.07, + "grad_norm": 0.671875, + "learning_rate": 3.6649214659685865e-06, + "logits/chosen": -3.1760947704315186, + "logits/rejected": -3.1799349784851074, + "logps/chosen": -258.2982482910156, + "logps/rejected": -239.6028289794922, + "loss": 0.4864, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": 0.06110318750143051, + "rewards/margins": 0.06362718343734741, + "rewards/rejected": -0.00252399779856205, + "step": 140 + }, + { + "epoch": 0.08, + "grad_norm": 0.625, + "learning_rate": 3.926701570680629e-06, + "logits/chosen": -3.1222329139709473, + "logits/rejected": -3.133145332336426, + "logps/chosen": -265.1998291015625, + "logps/rejected": -251.78952026367188, + "loss": 0.4875, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.05429766699671745, + "rewards/margins": 0.05967814847826958, + "rewards/rejected": -0.005380480550229549, + "step": 150 + }, + { + "epoch": 0.08, + "grad_norm": 0.703125, + "learning_rate": 4.18848167539267e-06, + "logits/chosen": -3.176487445831299, + "logits/rejected": -3.190802812576294, + "logps/chosen": -272.86651611328125, + "logps/rejected": -244.38687133789062, + "loss": 0.4852, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": 0.05848199874162674, + "rewards/margins": 0.07154129445552826, + "rewards/rejected": -0.013059285469353199, + "step": 160 + }, + { + "epoch": 0.09, + "grad_norm": 0.71484375, + "learning_rate": 4.450261780104713e-06, + "logits/chosen": -3.153256893157959, + "logits/rejected": -3.1715409755706787, + "logps/chosen": -274.42901611328125, + "logps/rejected": -259.0591125488281, + "loss": 0.4835, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": 0.06613625586032867, + "rewards/margins": 0.0808890238404274, + "rewards/rejected": -0.014752751216292381, + "step": 170 + }, + { + "epoch": 0.09, + "grad_norm": 0.51953125, + "learning_rate": 4.712041884816754e-06, + "logits/chosen": -3.116530179977417, + "logits/rejected": -3.125654697418213, + "logps/chosen": -264.8692932128906, + "logps/rejected": -256.16748046875, + "loss": 0.486, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": 0.05057697370648384, + "rewards/margins": 0.0704292505979538, + "rewards/rejected": -0.019852278754115105, + "step": 180 + }, + { + "epoch": 0.1, + "grad_norm": 0.6171875, + "learning_rate": 4.9738219895287965e-06, + "logits/chosen": -3.123274564743042, + "logits/rejected": -3.1312222480773926, + "logps/chosen": -288.4281311035156, + "logps/rejected": -258.3103332519531, + "loss": 0.4814, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.05620621517300606, + "rewards/margins": 0.09435133635997772, + "rewards/rejected": -0.038145121186971664, + "step": 190 + }, + { + "epoch": 0.1, + "grad_norm": 0.703125, + "learning_rate": 4.999661831436499e-06, + "logits/chosen": -3.126032590866089, + "logits/rejected": -3.1409640312194824, + "logps/chosen": -271.5264587402344, + "logps/rejected": -251.412109375, + "loss": 0.4787, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": 0.0683365911245346, + "rewards/margins": 0.10697062313556671, + "rewards/rejected": -0.038634032011032104, + "step": 200 + }, + { + "epoch": 0.11, + "grad_norm": 0.6953125, + "learning_rate": 4.9984929711403395e-06, + "logits/chosen": -3.0909173488616943, + "logits/rejected": -3.0889172554016113, + "logps/chosen": -235.9461212158203, + "logps/rejected": -240.8140106201172, + "loss": 0.4855, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.03789529204368591, + "rewards/margins": 0.07996337115764618, + "rewards/rejected": -0.04206807166337967, + "step": 210 + }, + { + "epoch": 0.12, + "grad_norm": 0.609375, + "learning_rate": 4.996489634487865e-06, + "logits/chosen": -3.149304151535034, + "logits/rejected": -3.1437649726867676, + "logps/chosen": -276.66497802734375, + "logps/rejected": -261.4449157714844, + "loss": 0.4814, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": 0.030237609520554543, + "rewards/margins": 0.1089547872543335, + "rewards/rejected": -0.0787171721458435, + "step": 220 + }, + { + "epoch": 0.12, + "grad_norm": 0.69921875, + "learning_rate": 4.9936524905772466e-06, + "logits/chosen": -3.092778444290161, + "logits/rejected": -3.1226553916931152, + "logps/chosen": -265.37457275390625, + "logps/rejected": -260.417724609375, + "loss": 0.4759, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.06305978447198868, + "rewards/margins": 0.13326093554496765, + "rewards/rejected": -0.07020114362239838, + "step": 230 + }, + { + "epoch": 0.13, + "grad_norm": 0.76953125, + "learning_rate": 4.9899824869915e-06, + "logits/chosen": -3.1257612705230713, + "logits/rejected": -3.142120838165283, + "logps/chosen": -251.7377166748047, + "logps/rejected": -258.2868957519531, + "loss": 0.4783, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.034979041665792465, + "rewards/margins": 0.12733003497123718, + "rewards/rejected": -0.09235100448131561, + "step": 240 + }, + { + "epoch": 0.13, + "grad_norm": 0.75, + "learning_rate": 4.985480849482012e-06, + "logits/chosen": -3.1416521072387695, + "logits/rejected": -3.1663966178894043, + "logps/chosen": -280.97442626953125, + "logps/rejected": -259.81915283203125, + "loss": 0.4749, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.03899794816970825, + "rewards/margins": 0.14351439476013184, + "rewards/rejected": -0.10451646894216537, + "step": 250 + }, + { + "epoch": 0.14, + "grad_norm": 0.78125, + "learning_rate": 4.980149081559142e-06, + "logits/chosen": -3.166862964630127, + "logits/rejected": -3.1826682090759277, + "logps/chosen": -277.7224426269531, + "logps/rejected": -264.73248291015625, + "loss": 0.4769, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.02976931631565094, + "rewards/margins": 0.13806195557117462, + "rewards/rejected": -0.10829265415668488, + "step": 260 + }, + { + "epoch": 0.14, + "grad_norm": 0.6171875, + "learning_rate": 4.9739889639900655e-06, + "logits/chosen": -3.1009061336517334, + "logits/rejected": -3.1220781803131104, + "logps/chosen": -271.1175842285156, + "logps/rejected": -270.35601806640625, + "loss": 0.4696, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.041884977370500565, + "rewards/margins": 0.18558195233345032, + "rewards/rejected": -0.14369697868824005, + "step": 270 + }, + { + "epoch": 0.15, + "grad_norm": 0.77734375, + "learning_rate": 4.967002554204009e-06, + "logits/chosen": -3.1314620971679688, + "logits/rejected": -3.147207021713257, + "logps/chosen": -272.50836181640625, + "logps/rejected": -278.05487060546875, + "loss": 0.4785, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0026828604750335217, + "rewards/margins": 0.14521858096122742, + "rewards/rejected": -0.1425357311964035, + "step": 280 + }, + { + "epoch": 0.15, + "grad_norm": 0.62109375, + "learning_rate": 4.959192185605089e-06, + "logits/chosen": -3.1541976928710938, + "logits/rejected": -3.1754889488220215, + "logps/chosen": -304.40740966796875, + "logps/rejected": -284.8604736328125, + "loss": 0.4718, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.0017113524954766035, + "rewards/margins": 0.18049772083759308, + "rewards/rejected": -0.17878638207912445, + "step": 290 + }, + { + "epoch": 0.16, + "grad_norm": 0.859375, + "learning_rate": 4.950560466792969e-06, + "logits/chosen": -3.1485819816589355, + "logits/rejected": -3.1677544116973877, + "logps/chosen": -263.881103515625, + "logps/rejected": -263.1119689941406, + "loss": 0.4741, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.045438483357429504, + "rewards/margins": 0.1866464614868164, + "rewards/rejected": -0.23208491504192352, + "step": 300 + }, + { + "epoch": 0.16, + "grad_norm": 0.80078125, + "learning_rate": 4.9411102806916185e-06, + "logits/chosen": -3.122454881668091, + "logits/rejected": -3.144866943359375, + "logps/chosen": -278.1111145019531, + "logps/rejected": -262.2369689941406, + "loss": 0.4695, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.052451539784669876, + "rewards/margins": 0.22689659893512726, + "rewards/rejected": -0.2793481647968292, + "step": 310 + }, + { + "epoch": 0.17, + "grad_norm": 0.71484375, + "learning_rate": 4.930844783586424e-06, + "logits/chosen": -3.116986036300659, + "logits/rejected": -3.1314234733581543, + "logps/chosen": -280.05523681640625, + "logps/rejected": -292.8658447265625, + "loss": 0.4642, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.18633142113685608, + "rewards/margins": 0.2993203103542328, + "rewards/rejected": -0.4856516718864441, + "step": 320 + }, + { + "epoch": 0.17, + "grad_norm": 0.8828125, + "learning_rate": 4.919767404070033e-06, + "logits/chosen": -3.0919606685638428, + "logits/rejected": -3.1069421768188477, + "logps/chosen": -290.76690673828125, + "logps/rejected": -299.6058044433594, + "loss": 0.4644, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24150581657886505, + "rewards/margins": 0.32079803943634033, + "rewards/rejected": -0.562303900718689, + "step": 330 + }, + { + "epoch": 0.18, + "grad_norm": 0.8984375, + "learning_rate": 4.907881841897216e-06, + "logits/chosen": -3.113529920578003, + "logits/rejected": -3.13189697265625, + "logps/chosen": -316.0239562988281, + "logps/rejected": -312.1597900390625, + "loss": 0.4576, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.3316032886505127, + "rewards/margins": 0.40208154916763306, + "rewards/rejected": -0.733684778213501, + "step": 340 + }, + { + "epoch": 0.18, + "grad_norm": 1.03125, + "learning_rate": 4.89519206674919e-06, + "logits/chosen": -3.0299558639526367, + "logits/rejected": -3.0673482418060303, + "logps/chosen": -298.59539794921875, + "logps/rejected": -356.8745422363281, + "loss": 0.4522, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.496961772441864, + "rewards/margins": 0.5310899615287781, + "rewards/rejected": -1.028051733970642, + "step": 350 + }, + { + "epoch": 0.19, + "grad_norm": 0.984375, + "learning_rate": 4.881702316907769e-06, + "logits/chosen": -3.04780912399292, + "logits/rejected": -3.0520381927490234, + "logps/chosen": -342.5206604003906, + "logps/rejected": -376.9095153808594, + "loss": 0.4588, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.7044845819473267, + "rewards/margins": 0.4986873269081116, + "rewards/rejected": -1.203171968460083, + "step": 360 + }, + { + "epoch": 0.19, + "grad_norm": 1.515625, + "learning_rate": 4.86741709783982e-06, + "logits/chosen": -3.0080177783966064, + "logits/rejected": -3.0334441661834717, + "logps/chosen": -322.6118469238281, + "logps/rejected": -343.72320556640625, + "loss": 0.4541, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.7056992053985596, + "rewards/margins": 0.5343005061149597, + "rewards/rejected": -1.239999771118164, + "step": 370 + }, + { + "epoch": 0.2, + "grad_norm": 0.88671875, + "learning_rate": 4.852341180692471e-06, + "logits/chosen": -2.9617342948913574, + "logits/rejected": -2.9721832275390625, + "logps/chosen": -318.089111328125, + "logps/rejected": -369.661865234375, + "loss": 0.4526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5964111089706421, + "rewards/margins": 0.5622240900993347, + "rewards/rejected": -1.1586352586746216, + "step": 380 + }, + { + "epoch": 0.2, + "grad_norm": 0.9375, + "learning_rate": 4.836479600699579e-06, + "logits/chosen": -3.004973888397217, + "logits/rejected": -3.015404224395752, + "logps/chosen": -322.59222412109375, + "logps/rejected": -350.57952880859375, + "loss": 0.4555, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4821473956108093, + "rewards/margins": 0.5106935501098633, + "rewards/rejected": -0.9928409457206726, + "step": 390 + }, + { + "epoch": 0.21, + "grad_norm": 1.359375, + "learning_rate": 4.819837655500014e-06, + "logits/chosen": -2.982062816619873, + "logits/rejected": -3.0214340686798096, + "logps/chosen": -328.14764404296875, + "logps/rejected": -358.10198974609375, + "loss": 0.4518, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6790810823440552, + "rewards/margins": 0.5925682783126831, + "rewards/rejected": -1.2716493606567383, + "step": 400 + }, + { + "epoch": 0.21, + "grad_norm": 1.1953125, + "learning_rate": 4.802420903368286e-06, + "logits/chosen": -2.996058225631714, + "logits/rejected": -3.0428948402404785, + "logps/chosen": -311.92181396484375, + "logps/rejected": -364.3048095703125, + "loss": 0.4564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5532656908035278, + "rewards/margins": 0.49352067708969116, + "rewards/rejected": -1.0467865467071533, + "step": 410 + }, + { + "epoch": 0.22, + "grad_norm": 1.140625, + "learning_rate": 4.784235161358124e-06, + "logits/chosen": -2.960181951522827, + "logits/rejected": -2.9744322299957275, + "logps/chosen": -318.44744873046875, + "logps/rejected": -352.4103088378906, + "loss": 0.4597, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.5166889429092407, + "rewards/margins": 0.5014506578445435, + "rewards/rejected": -1.0181396007537842, + "step": 420 + }, + { + "epoch": 0.23, + "grad_norm": 1.09375, + "learning_rate": 4.765286503359632e-06, + "logits/chosen": -2.887781858444214, + "logits/rejected": -2.90468430519104, + "logps/chosen": -332.93792724609375, + "logps/rejected": -379.0559997558594, + "loss": 0.4516, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.6485487222671509, + "rewards/margins": 0.5632439255714417, + "rewards/rejected": -1.2117927074432373, + "step": 430 + }, + { + "epoch": 0.23, + "grad_norm": 0.9921875, + "learning_rate": 4.745581258070654e-06, + "logits/chosen": -2.8706612586975098, + "logits/rejected": -2.8921449184417725, + "logps/chosen": -356.7577209472656, + "logps/rejected": -414.74224853515625, + "loss": 0.4523, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9310439229011536, + "rewards/margins": 0.6457870602607727, + "rewards/rejected": -1.5768309831619263, + "step": 440 + }, + { + "epoch": 0.24, + "grad_norm": 1.4765625, + "learning_rate": 4.725126006883047e-06, + "logits/chosen": -2.8595480918884277, + "logits/rejected": -2.884725332260132, + "logps/chosen": -306.8425598144531, + "logps/rejected": -388.7693786621094, + "loss": 0.4424, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.637574315071106, + "rewards/margins": 0.7818979620933533, + "rewards/rejected": -1.4194722175598145, + "step": 450 + }, + { + "epoch": 0.24, + "grad_norm": 1.0546875, + "learning_rate": 4.70392758168454e-06, + "logits/chosen": -2.8378663063049316, + "logits/rejected": -2.8632078170776367, + "logps/chosen": -362.0655822753906, + "logps/rejected": -393.5525817871094, + "loss": 0.456, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.843133807182312, + "rewards/margins": 0.6695644855499268, + "rewards/rejected": -1.5126984119415283, + "step": 460 + }, + { + "epoch": 0.25, + "grad_norm": 1.59375, + "learning_rate": 4.68199306257695e-06, + "logits/chosen": -2.90700101852417, + "logits/rejected": -2.930222511291504, + "logps/chosen": -368.11419677734375, + "logps/rejected": -419.2084045410156, + "loss": 0.4412, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.8187941312789917, + "rewards/margins": 0.7371198534965515, + "rewards/rejected": -1.5559141635894775, + "step": 470 + }, + { + "epoch": 0.25, + "grad_norm": 0.8984375, + "learning_rate": 4.659329775511478e-06, + "logits/chosen": -2.8993983268737793, + "logits/rejected": -2.930690050125122, + "logps/chosen": -331.52081298828125, + "logps/rejected": -389.1390686035156, + "loss": 0.4494, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6849466562271118, + "rewards/margins": 0.7577627301216125, + "rewards/rejected": -1.4427093267440796, + "step": 480 + }, + { + "epoch": 0.26, + "grad_norm": 1.140625, + "learning_rate": 4.635945289841902e-06, + "logits/chosen": -2.8397905826568604, + "logits/rejected": -2.8895552158355713, + "logps/chosen": -351.8777770996094, + "logps/rejected": -399.30126953125, + "loss": 0.4437, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6446736454963684, + "rewards/margins": 0.8137520551681519, + "rewards/rejected": -1.458425760269165, + "step": 490 + }, + { + "epoch": 0.26, + "grad_norm": 2.265625, + "learning_rate": 4.611847415796476e-06, + "logits/chosen": -2.8411877155303955, + "logits/rejected": -2.8607966899871826, + "logps/chosen": -360.9768371582031, + "logps/rejected": -416.825927734375, + "loss": 0.4382, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8380700945854187, + "rewards/margins": 0.860516369342804, + "rewards/rejected": -1.6985862255096436, + "step": 500 + }, + { + "epoch": 0.27, + "grad_norm": 1.75, + "learning_rate": 4.587044201869378e-06, + "logits/chosen": -2.848315715789795, + "logits/rejected": -2.8671722412109375, + "logps/chosen": -326.2365417480469, + "logps/rejected": -389.13720703125, + "loss": 0.4425, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7817949056625366, + "rewards/margins": 0.8195638656616211, + "rewards/rejected": -1.6013587713241577, + "step": 510 + }, + { + "epoch": 0.27, + "grad_norm": 1.4453125, + "learning_rate": 4.561543932132574e-06, + "logits/chosen": -2.7974531650543213, + "logits/rejected": -2.828226327896118, + "logps/chosen": -348.9537658691406, + "logps/rejected": -422.35186767578125, + "loss": 0.4319, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7357637286186218, + "rewards/margins": 0.9210460782051086, + "rewards/rejected": -1.6568095684051514, + "step": 520 + }, + { + "epoch": 0.28, + "grad_norm": 1.6015625, + "learning_rate": 4.535355123469009e-06, + "logits/chosen": -2.8311352729797363, + "logits/rejected": -2.845012664794922, + "logps/chosen": -350.1925964355469, + "logps/rejected": -428.8135681152344, + "loss": 0.4413, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9110676050186157, + "rewards/margins": 0.8541079759597778, + "rewards/rejected": -1.765175461769104, + "step": 530 + }, + { + "epoch": 0.28, + "grad_norm": 2.546875, + "learning_rate": 4.508486522728037e-06, + "logits/chosen": -2.7914469242095947, + "logits/rejected": -2.821166515350342, + "logps/chosen": -362.1348876953125, + "logps/rejected": -431.3775329589844, + "loss": 0.425, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.72906094789505, + "rewards/margins": 1.1009550094604492, + "rewards/rejected": -1.8300158977508545, + "step": 540 + }, + { + "epoch": 0.29, + "grad_norm": 2.953125, + "learning_rate": 4.480947103804044e-06, + "logits/chosen": -2.7665412425994873, + "logits/rejected": -2.7718183994293213, + "logps/chosen": -364.27362060546875, + "logps/rejected": -409.62042236328125, + "loss": 0.4476, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.9586559534072876, + "rewards/margins": 0.7505531311035156, + "rewards/rejected": -1.7092090845108032, + "step": 550 + }, + { + "epoch": 0.29, + "grad_norm": 2.21875, + "learning_rate": 4.452746064639239e-06, + "logits/chosen": -2.7609431743621826, + "logits/rejected": -2.7761027812957764, + "logps/chosen": -350.8570556640625, + "logps/rejected": -438.7533264160156, + "loss": 0.4401, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.9020726084709167, + "rewards/margins": 0.9013819694519043, + "rewards/rejected": -1.8034546375274658, + "step": 560 + }, + { + "epoch": 0.3, + "grad_norm": 1.6171875, + "learning_rate": 4.423892824151617e-06, + "logits/chosen": -2.724292755126953, + "logits/rejected": -2.7128217220306396, + "logps/chosen": -369.49822998046875, + "logps/rejected": -421.8377380371094, + "loss": 0.4407, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9183570742607117, + "rewards/margins": 0.7936559319496155, + "rewards/rejected": -1.7120128870010376, + "step": 570 + }, + { + "epoch": 0.3, + "grad_norm": 2.296875, + "learning_rate": 4.3943970190891164e-06, + "logits/chosen": -2.767193555831909, + "logits/rejected": -2.7671852111816406, + "logps/chosen": -338.62347412109375, + "logps/rejected": -402.12713623046875, + "loss": 0.4297, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6558475494384766, + "rewards/margins": 0.883314311504364, + "rewards/rejected": -1.5391619205474854, + "step": 580 + }, + { + "epoch": 0.31, + "grad_norm": 1.9765625, + "learning_rate": 4.364268500811025e-06, + "logits/chosen": -2.71061635017395, + "logits/rejected": -2.7537200450897217, + "logps/chosen": -356.565185546875, + "logps/rejected": -418.8211975097656, + "loss": 0.4445, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9843032956123352, + "rewards/margins": 0.7892019152641296, + "rewards/rejected": -1.773505449295044, + "step": 590 + }, + { + "epoch": 0.31, + "grad_norm": 1.7421875, + "learning_rate": 4.333517331997704e-06, + "logits/chosen": -2.7916760444641113, + "logits/rejected": -2.771669864654541, + "logps/chosen": -304.84649658203125, + "logps/rejected": -379.7848815917969, + "loss": 0.4453, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.6202788949012756, + "rewards/margins": 0.7137486338615417, + "rewards/rejected": -1.3340275287628174, + "step": 600 + }, + { + "epoch": 0.32, + "grad_norm": 1.34375, + "learning_rate": 4.302153783289737e-06, + "logits/chosen": -2.771042823791504, + "logits/rejected": -2.7721784114837646, + "logps/chosen": -304.8459167480469, + "logps/rejected": -382.05029296875, + "loss": 0.4336, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.48120832443237305, + "rewards/margins": 0.8254661560058594, + "rewards/rejected": -1.3066743612289429, + "step": 610 + }, + { + "epoch": 0.32, + "grad_norm": 1.171875, + "learning_rate": 4.270188329857613e-06, + "logits/chosen": -2.6971840858459473, + "logits/rejected": -2.7405362129211426, + "logps/chosen": -365.59869384765625, + "logps/rejected": -422.66082763671875, + "loss": 0.4337, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.7066925168037415, + "rewards/margins": 0.9992557764053345, + "rewards/rejected": -1.7059482336044312, + "step": 620 + }, + { + "epoch": 0.33, + "grad_norm": 2.546875, + "learning_rate": 4.237631647903115e-06, + "logits/chosen": -2.729823589324951, + "logits/rejected": -2.7406442165374756, + "logps/chosen": -343.63861083984375, + "logps/rejected": -398.4572448730469, + "loss": 0.4474, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.834626317024231, + "rewards/margins": 0.7315307855606079, + "rewards/rejected": -1.5661571025848389, + "step": 630 + }, + { + "epoch": 0.33, + "grad_norm": 3.15625, + "learning_rate": 4.204494611093548e-06, + "logits/chosen": -2.6876988410949707, + "logits/rejected": -2.710921049118042, + "logps/chosen": -344.4961242675781, + "logps/rejected": -413.8218688964844, + "loss": 0.4355, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9162249565124512, + "rewards/margins": 0.9599907994270325, + "rewards/rejected": -1.8762153387069702, + "step": 640 + }, + { + "epoch": 0.34, + "grad_norm": 3.3125, + "learning_rate": 4.170788286930024e-06, + "logits/chosen": -2.720813751220703, + "logits/rejected": -2.7221953868865967, + "logps/chosen": -396.78424072265625, + "logps/rejected": -472.07293701171875, + "loss": 0.4488, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4064452648162842, + "rewards/margins": 0.8613995313644409, + "rewards/rejected": -2.2678446769714355, + "step": 650 + }, + { + "epoch": 0.35, + "grad_norm": 1.578125, + "learning_rate": 4.136523933051005e-06, + "logits/chosen": -2.759519338607788, + "logits/rejected": -2.7709243297576904, + "logps/chosen": -343.8990173339844, + "logps/rejected": -425.5016174316406, + "loss": 0.4433, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.881717324256897, + "rewards/margins": 0.9298511743545532, + "rewards/rejected": -1.8115684986114502, + "step": 660 + }, + { + "epoch": 0.35, + "grad_norm": 2.390625, + "learning_rate": 4.101712993472348e-06, + "logits/chosen": -2.7461307048797607, + "logits/rejected": -2.7893545627593994, + "logps/chosen": -306.47528076171875, + "logps/rejected": -357.6148986816406, + "loss": 0.4434, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.500246524810791, + "rewards/margins": 0.6672911643981934, + "rewards/rejected": -1.1675376892089844, + "step": 670 + }, + { + "epoch": 0.36, + "grad_norm": 1.53125, + "learning_rate": 4.066367094765091e-06, + "logits/chosen": -2.6456458568573, + "logits/rejected": -2.6387665271759033, + "logps/chosen": -346.2305603027344, + "logps/rejected": -411.1793518066406, + "loss": 0.4375, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.803577721118927, + "rewards/margins": 0.8857296109199524, + "rewards/rejected": -1.689307451248169, + "step": 680 + }, + { + "epoch": 0.36, + "grad_norm": 1.875, + "learning_rate": 4.030498042172277e-06, + "logits/chosen": -2.729139804840088, + "logits/rejected": -2.7350516319274902, + "logps/chosen": -343.1637268066406, + "logps/rejected": -424.0751037597656, + "loss": 0.4338, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.6723712086677551, + "rewards/margins": 0.953707218170166, + "rewards/rejected": -1.6260782480239868, + "step": 690 + }, + { + "epoch": 0.37, + "grad_norm": 1.5, + "learning_rate": 3.994117815666095e-06, + "logits/chosen": -2.737205743789673, + "logits/rejected": -2.756513833999634, + "logps/chosen": -358.9730529785156, + "logps/rejected": -435.59930419921875, + "loss": 0.4352, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9257775545120239, + "rewards/margins": 0.9314867258071899, + "rewards/rejected": -1.8572641611099243, + "step": 700 + }, + { + "epoch": 0.37, + "grad_norm": 2.59375, + "learning_rate": 3.957238565946672e-06, + "logits/chosen": -2.7129783630371094, + "logits/rejected": -2.740182399749756, + "logps/chosen": -382.54144287109375, + "logps/rejected": -479.25079345703125, + "loss": 0.425, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0284990072250366, + "rewards/margins": 1.2022464275360107, + "rewards/rejected": -2.230745315551758, + "step": 710 + }, + { + "epoch": 0.38, + "grad_norm": 3.109375, + "learning_rate": 3.919872610383831e-06, + "logits/chosen": -2.700688600540161, + "logits/rejected": -2.715548038482666, + "logps/chosen": -346.7179870605469, + "logps/rejected": -442.24468994140625, + "loss": 0.429, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9038440585136414, + "rewards/margins": 1.1126606464385986, + "rewards/rejected": -2.0165047645568848, + "step": 720 + }, + { + "epoch": 0.38, + "grad_norm": 5.4375, + "learning_rate": 3.882032428903195e-06, + "logits/chosen": -2.732431650161743, + "logits/rejected": -2.741513252258301, + "logps/chosen": -341.4494323730469, + "logps/rejected": -418.03485107421875, + "loss": 0.4393, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.9002019166946411, + "rewards/margins": 0.9896795153617859, + "rewards/rejected": -1.8898814916610718, + "step": 730 + }, + { + "epoch": 0.39, + "grad_norm": 2.828125, + "learning_rate": 3.84373065981799e-06, + "logits/chosen": -2.6866860389709473, + "logits/rejected": -2.7075419425964355, + "logps/chosen": -335.77630615234375, + "logps/rejected": -392.2721252441406, + "loss": 0.4315, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7698886394500732, + "rewards/margins": 0.8988865613937378, + "rewards/rejected": -1.668775200843811, + "step": 740 + }, + { + "epoch": 0.39, + "grad_norm": 3.03125, + "learning_rate": 3.8049800956079552e-06, + "logits/chosen": -2.728248357772827, + "logits/rejected": -2.719029426574707, + "logps/chosen": -333.34747314453125, + "logps/rejected": -403.9107971191406, + "loss": 0.4329, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.688866376876831, + "rewards/margins": 0.9195898771286011, + "rewards/rejected": -1.6084562540054321, + "step": 750 + }, + { + "epoch": 0.4, + "grad_norm": 2.640625, + "learning_rate": 3.765793678646753e-06, + "logits/chosen": -2.7588818073272705, + "logits/rejected": -2.764564037322998, + "logps/chosen": -327.6832580566406, + "logps/rejected": -397.34967041015625, + "loss": 0.4445, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7366557121276855, + "rewards/margins": 0.8244975805282593, + "rewards/rejected": -1.5611531734466553, + "step": 760 + }, + { + "epoch": 0.4, + "grad_norm": 3.40625, + "learning_rate": 3.726184496879323e-06, + "logits/chosen": -2.7015931606292725, + "logits/rejected": -2.7093541622161865, + "logps/chosen": -328.6782531738281, + "logps/rejected": -392.6251525878906, + "loss": 0.4434, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.6729280948638916, + "rewards/margins": 0.7624879479408264, + "rewards/rejected": -1.4354161024093628, + "step": 770 + }, + { + "epoch": 0.41, + "grad_norm": 2.046875, + "learning_rate": 3.686165779450619e-06, + "logits/chosen": -2.7212226390838623, + "logits/rejected": -2.7281336784362793, + "logps/chosen": -332.78985595703125, + "logps/rejected": -407.54339599609375, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5121452808380127, + "rewards/margins": 1.0209487676620483, + "rewards/rejected": -1.5330939292907715, + "step": 780 + }, + { + "epoch": 0.41, + "grad_norm": 2.0625, + "learning_rate": 3.645750892287178e-06, + "logits/chosen": -2.6992287635803223, + "logits/rejected": -2.7106306552886963, + "logps/chosen": -349.5966491699219, + "logps/rejected": -479.349609375, + "loss": 0.4291, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9312782287597656, + "rewards/margins": 1.2202502489089966, + "rewards/rejected": -2.1515283584594727, + "step": 790 + }, + { + "epoch": 0.42, + "grad_norm": 4.4375, + "learning_rate": 3.604953333633009e-06, + "logits/chosen": -2.6964669227600098, + "logits/rejected": -2.718437671661377, + "logps/chosen": -368.614990234375, + "logps/rejected": -455.652099609375, + "loss": 0.444, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1283608675003052, + "rewards/margins": 0.9377814531326294, + "rewards/rejected": -2.0661423206329346, + "step": 800 + }, + { + "epoch": 0.42, + "grad_norm": 1.796875, + "learning_rate": 3.56378672954129e-06, + "logits/chosen": -2.734504222869873, + "logits/rejected": -2.741596221923828, + "logps/chosen": -345.35894775390625, + "logps/rejected": -441.3074645996094, + "loss": 0.4339, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.8633454442024231, + "rewards/margins": 1.0725480318069458, + "rewards/rejected": -1.9358936548233032, + "step": 810 + }, + { + "epoch": 0.43, + "grad_norm": 4.34375, + "learning_rate": 3.5222648293233806e-06, + "logits/chosen": -2.726121425628662, + "logits/rejected": -2.757293701171875, + "logps/chosen": -328.99981689453125, + "logps/rejected": -409.4175720214844, + "loss": 0.4433, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.8074792623519897, + "rewards/margins": 0.893712043762207, + "rewards/rejected": -1.7011913061141968, + "step": 820 + }, + { + "epoch": 0.43, + "grad_norm": 2.015625, + "learning_rate": 3.4804015009566573e-06, + "logits/chosen": -2.7143707275390625, + "logits/rejected": -2.7266762256622314, + "logps/chosen": -343.6038513183594, + "logps/rejected": -408.5287170410156, + "loss": 0.4446, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0202709436416626, + "rewards/margins": 0.8006850481033325, + "rewards/rejected": -1.8209559917449951, + "step": 830 + }, + { + "epoch": 0.44, + "grad_norm": 8.25, + "learning_rate": 3.4382107264527244e-06, + "logits/chosen": -2.731356143951416, + "logits/rejected": -2.749807357788086, + "logps/chosen": -387.00372314453125, + "logps/rejected": -461.8328552246094, + "loss": 0.4274, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.057356834411621, + "rewards/margins": 1.1233993768692017, + "rewards/rejected": -2.180756092071533, + "step": 840 + }, + { + "epoch": 0.44, + "grad_norm": 1.328125, + "learning_rate": 3.3957065971875387e-06, + "logits/chosen": -2.736109972000122, + "logits/rejected": -2.761101245880127, + "logps/chosen": -378.2186279296875, + "logps/rejected": -433.41986083984375, + "loss": 0.4517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.291465401649475, + "rewards/margins": 0.7508509159088135, + "rewards/rejected": -2.042316198348999, + "step": 850 + }, + { + "epoch": 0.45, + "grad_norm": 3.15625, + "learning_rate": 3.352903309194999e-06, + "logits/chosen": -2.7496652603149414, + "logits/rejected": -2.766449451446533, + "logps/chosen": -347.00531005859375, + "logps/rejected": -450.0355529785156, + "loss": 0.4337, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9623804092407227, + "rewards/margins": 1.0263848304748535, + "rewards/rejected": -1.9887651205062866, + "step": 860 + }, + { + "epoch": 0.46, + "grad_norm": 0.9765625, + "learning_rate": 3.309815158425591e-06, + "logits/chosen": -2.6927943229675293, + "logits/rejected": -2.7125327587127686, + "logps/chosen": -312.2998046875, + "logps/rejected": -379.62017822265625, + "loss": 0.43, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.39604613184928894, + "rewards/margins": 0.9399534463882446, + "rewards/rejected": -1.335999608039856, + "step": 870 + }, + { + "epoch": 0.46, + "grad_norm": 4.53125, + "learning_rate": 3.266456535971654e-06, + "logits/chosen": -2.790156602859497, + "logits/rejected": -2.7961854934692383, + "logps/chosen": -305.0512390136719, + "logps/rejected": -366.155029296875, + "loss": 0.4325, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.48138341307640076, + "rewards/margins": 0.8310259580612183, + "rewards/rejected": -1.3124094009399414, + "step": 880 + }, + { + "epoch": 0.47, + "grad_norm": 1.0625, + "learning_rate": 3.2228419232608692e-06, + "logits/chosen": -2.701418399810791, + "logits/rejected": -2.688974380493164, + "logps/chosen": -316.68133544921875, + "logps/rejected": -413.67156982421875, + "loss": 0.4314, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6776655316352844, + "rewards/margins": 0.9983049631118774, + "rewards/rejected": -1.6759703159332275, + "step": 890 + }, + { + "epoch": 0.47, + "grad_norm": 2.21875, + "learning_rate": 3.1789858872195888e-06, + "logits/chosen": -2.6642849445343018, + "logits/rejected": -2.647975444793701, + "logps/chosen": -365.28936767578125, + "logps/rejected": -515.3648681640625, + "loss": 0.438, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0754855871200562, + "rewards/margins": 1.3716323375701904, + "rewards/rejected": -2.447117567062378, + "step": 900 + }, + { + "epoch": 0.48, + "grad_norm": 3.625, + "learning_rate": 3.1349030754075945e-06, + "logits/chosen": -2.668508291244507, + "logits/rejected": -2.6765646934509277, + "logps/chosen": -357.05712890625, + "logps/rejected": -427.643798828125, + "loss": 0.4391, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8638874888420105, + "rewards/margins": 0.9187320470809937, + "rewards/rejected": -1.7826197147369385, + "step": 910 + }, + { + "epoch": 0.48, + "grad_norm": 4.15625, + "learning_rate": 3.0906082111259313e-06, + "logits/chosen": -2.721989154815674, + "logits/rejected": -2.7402305603027344, + "logps/chosen": -341.566650390625, + "logps/rejected": -425.6533203125, + "loss": 0.4271, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6936538815498352, + "rewards/margins": 1.0446574687957764, + "rewards/rejected": -1.7383114099502563, + "step": 920 + }, + { + "epoch": 0.49, + "grad_norm": 2.25, + "learning_rate": 3.046116088499449e-06, + "logits/chosen": -2.732478380203247, + "logits/rejected": -2.7298645973205566, + "logps/chosen": -366.4847717285156, + "logps/rejected": -460.4178161621094, + "loss": 0.4317, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7452308535575867, + "rewards/margins": 1.1475111246109009, + "rewards/rejected": -1.8927419185638428, + "step": 930 + }, + { + "epoch": 0.49, + "grad_norm": 1.46875, + "learning_rate": 3.0014415675356813e-06, + "logits/chosen": -2.7274584770202637, + "logits/rejected": -2.7278664112091064, + "logps/chosen": -354.26751708984375, + "logps/rejected": -469.7132873535156, + "loss": 0.4211, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7849363088607788, + "rewards/margins": 1.4399199485778809, + "rewards/rejected": -2.224856376647949, + "step": 940 + }, + { + "epoch": 0.5, + "grad_norm": 2.765625, + "learning_rate": 2.9565995691617242e-06, + "logits/chosen": -2.7352840900421143, + "logits/rejected": -2.737842321395874, + "logps/chosen": -358.52020263671875, + "logps/rejected": -454.4913024902344, + "loss": 0.4305, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7539668083190918, + "rewards/margins": 1.1879950761795044, + "rewards/rejected": -1.941961646080017, + "step": 950 + }, + { + "epoch": 0.5, + "grad_norm": 0.9765625, + "learning_rate": 2.9116050702407706e-06, + "logits/chosen": -2.7479987144470215, + "logits/rejected": -2.7785484790802, + "logps/chosen": -313.64837646484375, + "logps/rejected": -373.56689453125, + "loss": 0.4343, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.4597587585449219, + "rewards/margins": 0.8521866798400879, + "rewards/rejected": -1.3119454383850098, + "step": 960 + }, + { + "epoch": 0.51, + "grad_norm": 2.171875, + "learning_rate": 2.8664730985699537e-06, + "logits/chosen": -2.7080225944519043, + "logits/rejected": -2.717515707015991, + "logps/chosen": -313.6962890625, + "logps/rejected": -389.29205322265625, + "loss": 0.4272, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.46188563108444214, + "rewards/margins": 0.994129478931427, + "rewards/rejected": -1.4560149908065796, + "step": 970 + }, + { + "epoch": 0.51, + "grad_norm": 2.5, + "learning_rate": 2.8212187278611907e-06, + "logits/chosen": -2.7235171794891357, + "logits/rejected": -2.736121416091919, + "logps/chosen": -342.36273193359375, + "logps/rejected": -417.322509765625, + "loss": 0.4259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6042460203170776, + "rewards/margins": 1.0531026124954224, + "rewards/rejected": -1.6573486328125, + "step": 980 + }, + { + "epoch": 0.52, + "grad_norm": 1.7265625, + "learning_rate": 2.7758570727066843e-06, + "logits/chosen": -2.690983533859253, + "logits/rejected": -2.6975481510162354, + "logps/chosen": -342.63079833984375, + "logps/rejected": -407.81231689453125, + "loss": 0.4475, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.8637169599533081, + "rewards/margins": 0.7765380144119263, + "rewards/rejected": -1.6402549743652344, + "step": 990 + }, + { + "epoch": 0.52, + "grad_norm": 1.7109375, + "learning_rate": 2.730403283530767e-06, + "logits/chosen": -2.6636836528778076, + "logits/rejected": -2.661403179168701, + "logps/chosen": -344.44744873046875, + "logps/rejected": -413.1937561035156, + "loss": 0.4248, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.7091799974441528, + "rewards/margins": 1.0423099994659424, + "rewards/rejected": -1.7514899969100952, + "step": 1000 + }, + { + "epoch": 0.53, + "grad_norm": 4.21875, + "learning_rate": 2.6848725415297888e-06, + "logits/chosen": -2.6942477226257324, + "logits/rejected": -2.7120838165283203, + "logps/chosen": -336.60760498046875, + "logps/rejected": -421.9783630371094, + "loss": 0.4339, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.7389696836471558, + "rewards/margins": 0.9657734036445618, + "rewards/rejected": -1.7047427892684937, + "step": 1010 + }, + { + "epoch": 0.53, + "grad_norm": 1.140625, + "learning_rate": 2.639280053601719e-06, + "logits/chosen": -2.700098991394043, + "logits/rejected": -2.7320022583007812, + "logps/chosen": -346.1884460449219, + "logps/rejected": -410.5491638183594, + "loss": 0.4324, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5768822431564331, + "rewards/margins": 0.9592208862304688, + "rewards/rejected": -1.5361031293869019, + "step": 1020 + }, + { + "epoch": 0.54, + "grad_norm": 3.046875, + "learning_rate": 2.59364104726716e-06, + "logits/chosen": -2.7298641204833984, + "logits/rejected": -2.73411226272583, + "logps/chosen": -346.91912841796875, + "logps/rejected": -417.67034912109375, + "loss": 0.43, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7697745561599731, + "rewards/margins": 1.0892785787582397, + "rewards/rejected": -1.8590532541275024, + "step": 1030 + }, + { + "epoch": 0.54, + "grad_norm": 2.609375, + "learning_rate": 2.547970765583491e-06, + "logits/chosen": -2.716035842895508, + "logits/rejected": -2.702650547027588, + "logps/chosen": -330.34124755859375, + "logps/rejected": -416.9476623535156, + "loss": 0.4345, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7259209156036377, + "rewards/margins": 0.974997878074646, + "rewards/rejected": -1.7009187936782837, + "step": 1040 + }, + { + "epoch": 0.55, + "grad_norm": 1.390625, + "learning_rate": 2.502284462053799e-06, + "logits/chosen": -2.656066656112671, + "logits/rejected": -2.6666572093963623, + "logps/chosen": -331.7149658203125, + "logps/rejected": -423.1683654785156, + "loss": 0.4287, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.5932595729827881, + "rewards/margins": 1.035936713218689, + "rewards/rejected": -1.6291964054107666, + "step": 1050 + }, + { + "epoch": 0.55, + "grad_norm": 1.9296875, + "learning_rate": 2.456597395532338e-06, + "logits/chosen": -2.7209274768829346, + "logits/rejected": -2.735020399093628, + "logps/chosen": -328.7770080566406, + "logps/rejected": -391.19354248046875, + "loss": 0.4294, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6377438306808472, + "rewards/margins": 0.9876799583435059, + "rewards/rejected": -1.625423789024353, + "step": 1060 + }, + { + "epoch": 0.56, + "grad_norm": 3.515625, + "learning_rate": 2.4109248251281953e-06, + "logits/chosen": -2.7338156700134277, + "logits/rejected": -2.7391562461853027, + "logps/chosen": -343.69390869140625, + "logps/rejected": -414.94805908203125, + "loss": 0.432, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6750501990318298, + "rewards/margins": 1.0820457935333252, + "rewards/rejected": -1.7570960521697998, + "step": 1070 + }, + { + "epoch": 0.57, + "grad_norm": 0.99609375, + "learning_rate": 2.365282005108875e-06, + "logits/chosen": -2.7210652828216553, + "logits/rejected": -2.7339107990264893, + "logps/chosen": -335.1302490234375, + "logps/rejected": -390.1886901855469, + "loss": 0.4303, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6980403661727905, + "rewards/margins": 1.0391963720321655, + "rewards/rejected": -1.737236738204956, + "step": 1080 + }, + { + "epoch": 0.57, + "grad_norm": 2.921875, + "learning_rate": 2.319684179805491e-06, + "logits/chosen": -2.6966023445129395, + "logits/rejected": -2.7194454669952393, + "logps/chosen": -343.26934814453125, + "logps/rejected": -400.3363037109375, + "loss": 0.4381, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8400837779045105, + "rewards/margins": 0.9520319104194641, + "rewards/rejected": -1.7921158075332642, + "step": 1090 + }, + { + "epoch": 0.58, + "grad_norm": 2.0, + "learning_rate": 2.2741465785212905e-06, + "logits/chosen": -2.7013275623321533, + "logits/rejected": -2.7162578105926514, + "logps/chosen": -336.082763671875, + "logps/rejected": -424.86895751953125, + "loss": 0.4383, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.7391124963760376, + "rewards/margins": 0.9916101694107056, + "rewards/rejected": -1.7307227849960327, + "step": 1100 + }, + { + "epoch": 0.58, + "grad_norm": 1.359375, + "learning_rate": 2.2286844104451848e-06, + "logits/chosen": -2.7336266040802, + "logits/rejected": -2.7385802268981934, + "logps/chosen": -368.2712707519531, + "logps/rejected": -442.3502502441406, + "loss": 0.4393, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.8786946535110474, + "rewards/margins": 0.9058005213737488, + "rewards/rejected": -1.7844951152801514, + "step": 1110 + }, + { + "epoch": 0.59, + "grad_norm": 1.1328125, + "learning_rate": 2.183312859572008e-06, + "logits/chosen": -2.6661019325256348, + "logits/rejected": -2.687495708465576, + "logps/chosen": -360.59503173828125, + "logps/rejected": -445.728759765625, + "loss": 0.4259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6851642727851868, + "rewards/margins": 1.106838345527649, + "rewards/rejected": -1.7920026779174805, + "step": 1120 + }, + { + "epoch": 0.59, + "grad_norm": 3.015625, + "learning_rate": 2.1380470796311843e-06, + "logits/chosen": -2.6705803871154785, + "logits/rejected": -2.668128252029419, + "logps/chosen": -339.4599609375, + "logps/rejected": -421.2923278808594, + "loss": 0.4272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.642924964427948, + "rewards/margins": 1.065707802772522, + "rewards/rejected": -1.7086328268051147, + "step": 1130 + }, + { + "epoch": 0.6, + "grad_norm": 3.96875, + "learning_rate": 2.092902189025507e-06, + "logits/chosen": -2.6446332931518555, + "logits/rejected": -2.655050754547119, + "logps/chosen": -355.1614074707031, + "logps/rejected": -427.96636962890625, + "loss": 0.4265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7822728157043457, + "rewards/margins": 1.1439166069030762, + "rewards/rejected": -1.9261894226074219, + "step": 1140 + }, + { + "epoch": 0.6, + "grad_norm": 0.98828125, + "learning_rate": 2.0478932657817105e-06, + "logits/chosen": -2.737699031829834, + "logits/rejected": -2.732815742492676, + "logps/chosen": -356.5775451660156, + "logps/rejected": -431.93609619140625, + "loss": 0.4282, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8207361102104187, + "rewards/margins": 1.0940120220184326, + "rewards/rejected": -1.914747953414917, + "step": 1150 + }, + { + "epoch": 0.61, + "grad_norm": 2.546875, + "learning_rate": 2.0030353425145376e-06, + "logits/chosen": -2.676217555999756, + "logits/rejected": -2.6970784664154053, + "logps/chosen": -318.20355224609375, + "logps/rejected": -386.79388427734375, + "loss": 0.4289, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.741962730884552, + "rewards/margins": 1.0068390369415283, + "rewards/rejected": -1.748801589012146, + "step": 1160 + }, + { + "epoch": 0.61, + "grad_norm": 0.7890625, + "learning_rate": 1.958343401405964e-06, + "logits/chosen": -2.6743831634521484, + "logits/rejected": -2.6914896965026855, + "logps/chosen": -323.74053955078125, + "logps/rejected": -417.9308166503906, + "loss": 0.4318, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6001055240631104, + "rewards/margins": 1.030330777168274, + "rewards/rejected": -1.6304363012313843, + "step": 1170 + }, + { + "epoch": 0.62, + "grad_norm": 5.53125, + "learning_rate": 1.9138323692012734e-06, + "logits/chosen": -2.719978094100952, + "logits/rejected": -2.7362709045410156, + "logps/chosen": -316.5953063964844, + "logps/rejected": -410.26593017578125, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5119751691818237, + "rewards/margins": 1.1661722660064697, + "rewards/rejected": -1.678147554397583, + "step": 1180 + }, + { + "epoch": 0.62, + "grad_norm": 1.7890625, + "learning_rate": 1.8695171122236443e-06, + "logits/chosen": -2.6844494342803955, + "logits/rejected": -2.6935813426971436, + "logps/chosen": -319.3164367675781, + "logps/rejected": -416.75616455078125, + "loss": 0.4208, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -0.5857919454574585, + "rewards/margins": 1.1580404043197632, + "rewards/rejected": -1.7438323497772217, + "step": 1190 + }, + { + "epoch": 0.63, + "grad_norm": 3.0625, + "learning_rate": 1.8254124314089225e-06, + "logits/chosen": -2.755115509033203, + "logits/rejected": -2.7330613136291504, + "logps/chosen": -322.7129821777344, + "logps/rejected": -407.7967529296875, + "loss": 0.4345, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4961959719657898, + "rewards/margins": 1.078574776649475, + "rewards/rejected": -1.5747709274291992, + "step": 1200 + }, + { + "epoch": 0.63, + "grad_norm": 1.9375, + "learning_rate": 1.781533057362221e-06, + "logits/chosen": -2.746798038482666, + "logits/rejected": -2.7739720344543457, + "logps/chosen": -299.58160400390625, + "logps/rejected": -374.47930908203125, + "loss": 0.4296, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.4686378538608551, + "rewards/margins": 0.9811908602714539, + "rewards/rejected": -1.4498287439346313, + "step": 1210 + }, + { + "epoch": 0.64, + "grad_norm": 3.296875, + "learning_rate": 1.7378936454380277e-06, + "logits/chosen": -2.7310540676116943, + "logits/rejected": -2.7376887798309326, + "logps/chosen": -327.89410400390625, + "logps/rejected": -404.711669921875, + "loss": 0.4279, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5759900808334351, + "rewards/margins": 1.1363575458526611, + "rewards/rejected": -1.7123476266860962, + "step": 1220 + }, + { + "epoch": 0.64, + "grad_norm": 1.5390625, + "learning_rate": 1.6945087708454273e-06, + "logits/chosen": -2.6734468936920166, + "logits/rejected": -2.714264392852783, + "logps/chosen": -343.795654296875, + "logps/rejected": -427.343505859375, + "loss": 0.4219, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5730609893798828, + "rewards/margins": 1.2771522998809814, + "rewards/rejected": -1.8502132892608643, + "step": 1230 + }, + { + "epoch": 0.65, + "grad_norm": 1.8125, + "learning_rate": 1.651392923780105e-06, + "logits/chosen": -2.6670920848846436, + "logits/rejected": -2.6546778678894043, + "logps/chosen": -322.3703918457031, + "logps/rejected": -434.3629455566406, + "loss": 0.4254, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.6311038136482239, + "rewards/margins": 1.2766520977020264, + "rewards/rejected": -1.9077558517456055, + "step": 1240 + }, + { + "epoch": 0.65, + "grad_norm": 2.59375, + "learning_rate": 1.608560504584737e-06, + "logits/chosen": -2.737743377685547, + "logits/rejected": -2.720360517501831, + "logps/chosen": -334.63275146484375, + "logps/rejected": -431.16827392578125, + "loss": 0.4287, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8308721780776978, + "rewards/margins": 1.0826470851898193, + "rewards/rejected": -1.913519263267517, + "step": 1250 + }, + { + "epoch": 0.66, + "grad_norm": 5.21875, + "learning_rate": 1.5660258189393945e-06, + "logits/chosen": -2.7399215698242188, + "logits/rejected": -2.750929117202759, + "logps/chosen": -338.90924072265625, + "logps/rejected": -417.3045959472656, + "loss": 0.4214, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5204821825027466, + "rewards/margins": 1.254575252532959, + "rewards/rejected": -1.7750571966171265, + "step": 1260 + }, + { + "epoch": 0.66, + "grad_norm": 5.96875, + "learning_rate": 1.5238030730835578e-06, + "logits/chosen": -2.716184616088867, + "logits/rejected": -2.690531015396118, + "logps/chosen": -311.8260192871094, + "logps/rejected": -440.0191345214844, + "loss": 0.4252, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6099028587341309, + "rewards/margins": 1.3696248531341553, + "rewards/rejected": -1.979527473449707, + "step": 1270 + }, + { + "epoch": 0.67, + "grad_norm": 8.5625, + "learning_rate": 1.4819063690713565e-06, + "logits/chosen": -2.723191022872925, + "logits/rejected": -2.699627637863159, + "logps/chosen": -343.42608642578125, + "logps/rejected": -442.84033203125, + "loss": 0.424, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.7352994680404663, + "rewards/margins": 1.2973926067352295, + "rewards/rejected": -2.0326919555664062, + "step": 1280 + }, + { + "epoch": 0.68, + "grad_norm": 1.3359375, + "learning_rate": 1.4403497000615885e-06, + "logits/chosen": -2.709167242050171, + "logits/rejected": -2.711165189743042, + "logps/chosen": -346.2407531738281, + "logps/rejected": -453.6136779785156, + "loss": 0.4365, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9276860952377319, + "rewards/margins": 1.1097049713134766, + "rewards/rejected": -2.037391185760498, + "step": 1290 + }, + { + "epoch": 0.68, + "grad_norm": 3.109375, + "learning_rate": 1.3991469456441273e-06, + "logits/chosen": -2.6968870162963867, + "logits/rejected": -2.7066521644592285, + "logps/chosen": -349.227294921875, + "logps/rejected": -449.37274169921875, + "loss": 0.4264, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.8388339281082153, + "rewards/margins": 1.244845986366272, + "rewards/rejected": -2.083679676055908, + "step": 1300 + }, + { + "epoch": 0.69, + "grad_norm": 1.90625, + "learning_rate": 1.3583118672042441e-06, + "logits/chosen": -2.682274580001831, + "logits/rejected": -2.689542055130005, + "logps/chosen": -358.6896057128906, + "logps/rejected": -461.7701721191406, + "loss": 0.4298, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8582962155342102, + "rewards/margins": 1.2384599447250366, + "rewards/rejected": -2.0967559814453125, + "step": 1310 + }, + { + "epoch": 0.69, + "grad_norm": 2.734375, + "learning_rate": 1.3178581033264218e-06, + "logits/chosen": -2.7299036979675293, + "logits/rejected": -2.725554943084717, + "logps/chosen": -337.6006774902344, + "logps/rejected": -427.9097595214844, + "loss": 0.4254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8014458417892456, + "rewards/margins": 1.1019701957702637, + "rewards/rejected": -1.9034160375595093, + "step": 1320 + }, + { + "epoch": 0.7, + "grad_norm": 5.71875, + "learning_rate": 1.2777991652391757e-06, + "logits/chosen": -2.7372395992279053, + "logits/rejected": -2.713787078857422, + "logps/chosen": -349.9215087890625, + "logps/rejected": -458.30218505859375, + "loss": 0.4146, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.605326235294342, + "rewards/margins": 1.2629462480545044, + "rewards/rejected": -1.8682724237442017, + "step": 1330 + }, + { + "epoch": 0.7, + "grad_norm": 2.796875, + "learning_rate": 1.2381484323024178e-06, + "logits/chosen": -2.710744619369507, + "logits/rejected": -2.7276930809020996, + "logps/chosen": -340.2386779785156, + "logps/rejected": -410.47406005859375, + "loss": 0.4286, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.6984527111053467, + "rewards/margins": 0.98872309923172, + "rewards/rejected": -1.687175989151001, + "step": 1340 + }, + { + "epoch": 0.71, + "grad_norm": 4.375, + "learning_rate": 1.1989191475388518e-06, + "logits/chosen": -2.7345008850097656, + "logits/rejected": -2.737495183944702, + "logps/chosen": -341.25408935546875, + "logps/rejected": -463.845703125, + "loss": 0.423, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.720673680305481, + "rewards/margins": 1.3686655759811401, + "rewards/rejected": -2.089339256286621, + "step": 1350 + }, + { + "epoch": 0.71, + "grad_norm": 1.484375, + "learning_rate": 1.160124413210918e-06, + "logits/chosen": -2.710549831390381, + "logits/rejected": -2.7327733039855957, + "logps/chosen": -357.73516845703125, + "logps/rejected": -460.0575256347656, + "loss": 0.4351, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.7213457822799683, + "rewards/margins": 1.1925950050354004, + "rewards/rejected": -1.9139407873153687, + "step": 1360 + }, + { + "epoch": 0.72, + "grad_norm": 5.8125, + "learning_rate": 1.1217771864447396e-06, + "logits/chosen": -2.745375871658325, + "logits/rejected": -2.734910488128662, + "logps/chosen": -334.57159423828125, + "logps/rejected": -431.4033203125, + "loss": 0.4065, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6131495833396912, + "rewards/margins": 1.3364530801773071, + "rewards/rejected": -1.949602484703064, + "step": 1370 + }, + { + "epoch": 0.72, + "grad_norm": 3.5, + "learning_rate": 1.08389027490255e-06, + "logits/chosen": -2.7352352142333984, + "logits/rejected": -2.731990098953247, + "logps/chosen": -318.87939453125, + "logps/rejected": -447.78167724609375, + "loss": 0.4207, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.6276998519897461, + "rewards/margins": 1.426599383354187, + "rewards/rejected": -2.0542993545532227, + "step": 1380 + }, + { + "epoch": 0.73, + "grad_norm": 3.765625, + "learning_rate": 1.046476332505036e-06, + "logits/chosen": -2.7105917930603027, + "logits/rejected": -2.7000508308410645, + "logps/chosen": -351.5965881347656, + "logps/rejected": -464.24041748046875, + "loss": 0.4229, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.666134238243103, + "rewards/margins": 1.3925716876983643, + "rewards/rejected": -2.0587058067321777, + "step": 1390 + }, + { + "epoch": 0.73, + "grad_norm": 1.78125, + "learning_rate": 1.0095478552050348e-06, + "logits/chosen": -2.7293667793273926, + "logits/rejected": -2.748169422149658, + "logps/chosen": -347.258056640625, + "logps/rejected": -454.30126953125, + "loss": 0.428, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7874716520309448, + "rewards/margins": 1.2901605367660522, + "rewards/rejected": -2.077632188796997, + "step": 1400 + }, + { + "epoch": 0.74, + "grad_norm": 3.6875, + "learning_rate": 9.731171768139808e-07, + "logits/chosen": -2.747467279434204, + "logits/rejected": -2.7329678535461426, + "logps/chosen": -328.9295959472656, + "logps/rejected": -426.88812255859375, + "loss": 0.4343, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7400668859481812, + "rewards/margins": 1.056199073791504, + "rewards/rejected": -1.796265959739685, + "step": 1410 + }, + { + "epoch": 0.74, + "grad_norm": 2.40625, + "learning_rate": 9.371964648825221e-07, + "logits/chosen": -2.7056431770324707, + "logits/rejected": -2.711988925933838, + "logps/chosen": -358.60186767578125, + "logps/rejected": -416.0870056152344, + "loss": 0.4388, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9142545461654663, + "rewards/margins": 0.8788741827011108, + "rewards/rejected": -1.7931289672851562, + "step": 1420 + }, + { + "epoch": 0.75, + "grad_norm": 3.09375, + "learning_rate": 9.017977166366445e-07, + "logits/chosen": -2.712580919265747, + "logits/rejected": -2.7144558429718018, + "logps/chosen": -340.9812316894531, + "logps/rejected": -484.244873046875, + "loss": 0.4154, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.7170324921607971, + "rewards/margins": 1.5092626810073853, + "rewards/rejected": -2.2262954711914062, + "step": 1430 + }, + { + "epoch": 0.75, + "grad_norm": 4.5625, + "learning_rate": 8.669327549707096e-07, + "logits/chosen": -2.7783093452453613, + "logits/rejected": -2.782752513885498, + "logps/chosen": -336.6181335449219, + "logps/rejected": -427.4580078125, + "loss": 0.4337, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7095115184783936, + "rewards/margins": 1.0761725902557373, + "rewards/rejected": -1.7856842279434204, + "step": 1440 + }, + { + "epoch": 0.76, + "grad_norm": 6.625, + "learning_rate": 8.326132244986932e-07, + "logits/chosen": -2.7186310291290283, + "logits/rejected": -2.7277238368988037, + "logps/chosen": -330.8462219238281, + "logps/rejected": -445.0597229003906, + "loss": 0.4328, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7316503524780273, + "rewards/margins": 1.2994225025177002, + "rewards/rejected": -2.0310728549957275, + "step": 1450 + }, + { + "epoch": 0.76, + "grad_norm": 6.40625, + "learning_rate": 7.988505876649863e-07, + "logits/chosen": -2.673827648162842, + "logits/rejected": -2.6909382343292236, + "logps/chosen": -328.7157287597656, + "logps/rejected": -426.40997314453125, + "loss": 0.4208, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6509988307952881, + "rewards/margins": 1.171143651008606, + "rewards/rejected": -1.8221423625946045, + "step": 1460 + }, + { + "epoch": 0.77, + "grad_norm": 5.03125, + "learning_rate": 7.656561209160248e-07, + "logits/chosen": -2.688586711883545, + "logits/rejected": -2.6996278762817383, + "logps/chosen": -335.6667785644531, + "logps/rejected": -412.533447265625, + "loss": 0.4229, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6491155624389648, + "rewards/margins": 1.1151740550994873, + "rewards/rejected": -1.7642898559570312, + "step": 1470 + }, + { + "epoch": 0.77, + "grad_norm": 5.71875, + "learning_rate": 7.330409109340563e-07, + "logits/chosen": -2.7187418937683105, + "logits/rejected": -2.713865041732788, + "logps/chosen": -330.7655334472656, + "logps/rejected": -445.03485107421875, + "loss": 0.421, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6798223257064819, + "rewards/margins": 1.387519121170044, + "rewards/rejected": -2.0673413276672363, + "step": 1480 + }, + { + "epoch": 0.78, + "grad_norm": 4.21875, + "learning_rate": 7.010158509342682e-07, + "logits/chosen": -2.705104351043701, + "logits/rejected": -2.7204995155334473, + "logps/chosen": -348.8349304199219, + "logps/rejected": -465.5157775878906, + "loss": 0.4239, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7643205523490906, + "rewards/margins": 1.3054136037826538, + "rewards/rejected": -2.0697340965270996, + "step": 1490 + }, + { + "epoch": 0.79, + "grad_norm": 3.25, + "learning_rate": 6.695916370265529e-07, + "logits/chosen": -2.7456018924713135, + "logits/rejected": -2.75111722946167, + "logps/chosen": -298.90582275390625, + "logps/rejected": -406.1940002441406, + "loss": 0.4243, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6092133522033691, + "rewards/margins": 1.119093894958496, + "rewards/rejected": -1.7283073663711548, + "step": 1500 + }, + { + "epoch": 0.79, + "grad_norm": 3.75, + "learning_rate": 6.387787646430854e-07, + "logits/chosen": -2.7123939990997314, + "logits/rejected": -2.7206320762634277, + "logps/chosen": -311.45892333984375, + "logps/rejected": -402.0213317871094, + "loss": 0.4307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5885223746299744, + "rewards/margins": 1.1126749515533447, + "rewards/rejected": -1.7011972665786743, + "step": 1510 + }, + { + "epoch": 0.8, + "grad_norm": 2.28125, + "learning_rate": 6.085875250329401e-07, + "logits/chosen": -2.7589831352233887, + "logits/rejected": -2.747678756713867, + "logps/chosen": -326.8139953613281, + "logps/rejected": -408.4124450683594, + "loss": 0.4281, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6763142347335815, + "rewards/margins": 1.2141085863113403, + "rewards/rejected": -1.8904228210449219, + "step": 1520 + }, + { + "epoch": 0.8, + "grad_norm": 4.4375, + "learning_rate": 5.79028001824894e-07, + "logits/chosen": -2.676466941833496, + "logits/rejected": -2.6653060913085938, + "logps/chosen": -302.88861083984375, + "logps/rejected": -453.6151428222656, + "loss": 0.4205, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5426191687583923, + "rewards/margins": 1.233906626701355, + "rewards/rejected": -1.776525855064392, + "step": 1530 + }, + { + "epoch": 0.81, + "grad_norm": 2.265625, + "learning_rate": 5.501100676595761e-07, + "logits/chosen": -2.7586090564727783, + "logits/rejected": -2.7702584266662598, + "logps/chosen": -345.1263732910156, + "logps/rejected": -439.22808837890625, + "loss": 0.4249, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6815992593765259, + "rewards/margins": 1.174318790435791, + "rewards/rejected": -1.8559181690216064, + "step": 1540 + }, + { + "epoch": 0.81, + "grad_norm": 3.3125, + "learning_rate": 5.218433808920884e-07, + "logits/chosen": -2.741339921951294, + "logits/rejected": -2.7451634407043457, + "logps/chosen": -329.6106262207031, + "logps/rejected": -418.102294921875, + "loss": 0.418, + "rewards/accuracies": 0.778124988079071, + "rewards/chosen": -0.48116618394851685, + "rewards/margins": 1.2408123016357422, + "rewards/rejected": -1.7219784259796143, + "step": 1550 + }, + { + "epoch": 0.82, + "grad_norm": 2.78125, + "learning_rate": 4.942373823661928e-07, + "logits/chosen": -2.7156219482421875, + "logits/rejected": -2.7195382118225098, + "logps/chosen": -338.1180114746094, + "logps/rejected": -429.4137268066406, + "loss": 0.4242, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5551623702049255, + "rewards/margins": 1.1304242610931396, + "rewards/rejected": -1.6855865716934204, + "step": 1560 + }, + { + "epoch": 0.82, + "grad_norm": 0.64453125, + "learning_rate": 4.6730129226114363e-07, + "logits/chosen": -2.74899959564209, + "logits/rejected": -2.7066941261291504, + "logps/chosen": -316.0484619140625, + "logps/rejected": -443.41156005859375, + "loss": 0.4244, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.6658861637115479, + "rewards/margins": 1.3104979991912842, + "rewards/rejected": -1.976383924484253, + "step": 1570 + }, + { + "epoch": 0.83, + "grad_norm": 3.03125, + "learning_rate": 4.4104410701222703e-07, + "logits/chosen": -2.764725923538208, + "logits/rejected": -2.7511062622070312, + "logps/chosen": -345.6468200683594, + "logps/rejected": -458.3433532714844, + "loss": 0.4297, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.7146095037460327, + "rewards/margins": 1.2348105907440186, + "rewards/rejected": -1.9494202136993408, + "step": 1580 + }, + { + "epoch": 0.83, + "grad_norm": 2.609375, + "learning_rate": 4.154745963060197e-07, + "logits/chosen": -2.690899610519409, + "logits/rejected": -2.68369460105896, + "logps/chosen": -327.4097900390625, + "logps/rejected": -445.50457763671875, + "loss": 0.4267, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.7736159563064575, + "rewards/margins": 1.229644775390625, + "rewards/rejected": -2.003260850906372, + "step": 1590 + }, + { + "epoch": 0.84, + "grad_norm": 2.640625, + "learning_rate": 3.9060130015138863e-07, + "logits/chosen": -2.7254629135131836, + "logits/rejected": -2.7183403968811035, + "logps/chosen": -338.1430358886719, + "logps/rejected": -464.69049072265625, + "loss": 0.4245, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.7131890058517456, + "rewards/margins": 1.2739613056182861, + "rewards/rejected": -1.987149953842163, + "step": 1600 + }, + { + "epoch": 0.84, + "grad_norm": 1.015625, + "learning_rate": 3.664325260271953e-07, + "logits/chosen": -2.7088608741760254, + "logits/rejected": -2.730421304702759, + "logps/chosen": -357.98126220703125, + "logps/rejected": -414.66668701171875, + "loss": 0.4309, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7439313530921936, + "rewards/margins": 0.9089797139167786, + "rewards/rejected": -1.652910828590393, + "step": 1610 + }, + { + "epoch": 0.85, + "grad_norm": 4.28125, + "learning_rate": 3.429763461076677e-07, + "logits/chosen": -2.7392077445983887, + "logits/rejected": -2.7532036304473877, + "logps/chosen": -340.41436767578125, + "logps/rejected": -411.6546936035156, + "loss": 0.4351, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.7221731543540955, + "rewards/margins": 0.9596914052963257, + "rewards/rejected": -1.6818645000457764, + "step": 1620 + }, + { + "epoch": 0.85, + "grad_norm": 6.1875, + "learning_rate": 3.202405945663556e-07, + "logits/chosen": -2.739431858062744, + "logits/rejected": -2.7153592109680176, + "logps/chosen": -351.1690368652344, + "logps/rejected": -460.6399841308594, + "loss": 0.4206, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7035288214683533, + "rewards/margins": 1.3128876686096191, + "rewards/rejected": -2.016416549682617, + "step": 1630 + }, + { + "epoch": 0.86, + "grad_norm": 4.34375, + "learning_rate": 2.982328649595856e-07, + "logits/chosen": -2.7347230911254883, + "logits/rejected": -2.7274715900421143, + "logps/chosen": -338.5975646972656, + "logps/rejected": -435.02972412109375, + "loss": 0.4329, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.722823441028595, + "rewards/margins": 1.025390625, + "rewards/rejected": -1.7482140064239502, + "step": 1640 + }, + { + "epoch": 0.86, + "grad_norm": 2.75, + "learning_rate": 2.7696050769026954e-07, + "logits/chosen": -2.7211785316467285, + "logits/rejected": -2.684309720993042, + "logps/chosen": -344.1480407714844, + "logps/rejected": -481.3314514160156, + "loss": 0.4299, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7990375757217407, + "rewards/margins": 1.399259328842163, + "rewards/rejected": -2.1982970237731934, + "step": 1650 + }, + { + "epoch": 0.87, + "grad_norm": 2.015625, + "learning_rate": 2.564306275529341e-07, + "logits/chosen": -2.7370145320892334, + "logits/rejected": -2.7407755851745605, + "logps/chosen": -321.891357421875, + "logps/rejected": -401.0546569824219, + "loss": 0.4248, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7101179957389832, + "rewards/margins": 1.1242733001708984, + "rewards/rejected": -1.8343912363052368, + "step": 1660 + }, + { + "epoch": 0.87, + "grad_norm": 3.671875, + "learning_rate": 2.3665008136077332e-07, + "logits/chosen": -2.7449889183044434, + "logits/rejected": -2.7408945560455322, + "logps/chosen": -360.5602722167969, + "logps/rejected": -467.7608337402344, + "loss": 0.4302, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.823918342590332, + "rewards/margins": 1.2585489749908447, + "rewards/rejected": -2.0824673175811768, + "step": 1670 + }, + { + "epoch": 0.88, + "grad_norm": 4.25, + "learning_rate": 2.1762547565553293e-07, + "logits/chosen": -2.6576642990112305, + "logits/rejected": -2.6262636184692383, + "logps/chosen": -352.53857421875, + "logps/rejected": -485.3980407714844, + "loss": 0.419, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7759647965431213, + "rewards/margins": 1.4105063676834106, + "rewards/rejected": -2.1864712238311768, + "step": 1680 + }, + { + "epoch": 0.88, + "grad_norm": 1.2578125, + "learning_rate": 1.993631645009747e-07, + "logits/chosen": -2.70914888381958, + "logits/rejected": -2.6992902755737305, + "logps/chosen": -348.275390625, + "logps/rejected": -432.3793029785156, + "loss": 0.4294, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9621282815933228, + "rewards/margins": 1.0199404954910278, + "rewards/rejected": -1.982068657875061, + "step": 1690 + }, + { + "epoch": 0.89, + "grad_norm": 4.125, + "learning_rate": 1.818692473606748e-07, + "logits/chosen": -2.749687671661377, + "logits/rejected": -2.743114948272705, + "logps/chosen": -356.88311767578125, + "logps/rejected": -437.4881896972656, + "loss": 0.4101, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5851279497146606, + "rewards/margins": 1.345589280128479, + "rewards/rejected": -1.93071711063385, + "step": 1700 + }, + { + "epoch": 0.9, + "grad_norm": 3.015625, + "learning_rate": 1.6514956706084885e-07, + "logits/chosen": -2.731550931930542, + "logits/rejected": -2.7247581481933594, + "logps/chosen": -367.1907653808594, + "logps/rejected": -460.9814453125, + "loss": 0.4151, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.8159070014953613, + "rewards/margins": 1.3104612827301025, + "rewards/rejected": -2.1263680458068848, + "step": 1710 + }, + { + "epoch": 0.9, + "grad_norm": 1.921875, + "learning_rate": 1.4920970783889737e-07, + "logits/chosen": -2.728755235671997, + "logits/rejected": -2.716214895248413, + "logps/chosen": -326.6175842285156, + "logps/rejected": -425.10528564453125, + "loss": 0.4179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5755528807640076, + "rewards/margins": 1.2382352352142334, + "rewards/rejected": -1.8137880563735962, + "step": 1720 + }, + { + "epoch": 0.91, + "grad_norm": 1.921875, + "learning_rate": 1.340549934783164e-07, + "logits/chosen": -2.7430858612060547, + "logits/rejected": -2.7155654430389404, + "logps/chosen": -319.4033508300781, + "logps/rejected": -460.38494873046875, + "loss": 0.4219, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.5762430429458618, + "rewards/margins": 1.5360796451568604, + "rewards/rejected": -2.1123225688934326, + "step": 1730 + }, + { + "epoch": 0.91, + "grad_norm": 0.8046875, + "learning_rate": 1.196904855305961e-07, + "logits/chosen": -2.7446229457855225, + "logits/rejected": -2.741819143295288, + "logps/chosen": -347.061279296875, + "logps/rejected": -454.8154296875, + "loss": 0.4227, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.7210251688957214, + "rewards/margins": 1.3714605569839478, + "rewards/rejected": -2.0924859046936035, + "step": 1740 + }, + { + "epoch": 0.92, + "grad_norm": 6.78125, + "learning_rate": 1.0612098162470302e-07, + "logits/chosen": -2.754495143890381, + "logits/rejected": -2.770519733428955, + "logps/chosen": -356.2037658691406, + "logps/rejected": -428.56915283203125, + "loss": 0.4204, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6976330876350403, + "rewards/margins": 1.1086335182189941, + "rewards/rejected": -1.8062665462493896, + "step": 1750 + }, + { + "epoch": 0.92, + "grad_norm": 1.703125, + "learning_rate": 9.335101386471285e-08, + "logits/chosen": -2.7438502311706543, + "logits/rejected": -2.732393741607666, + "logps/chosen": -325.47747802734375, + "logps/rejected": -466.6300354003906, + "loss": 0.4096, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5821502208709717, + "rewards/margins": 1.489473819732666, + "rewards/rejected": -2.0716240406036377, + "step": 1760 + }, + { + "epoch": 0.93, + "grad_norm": 1.125, + "learning_rate": 8.138484731612273e-08, + "logits/chosen": -2.6938061714172363, + "logits/rejected": -2.6772360801696777, + "logps/chosen": -339.33856201171875, + "logps/rejected": -459.6874084472656, + "loss": 0.4259, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.7721551060676575, + "rewards/margins": 1.2288380861282349, + "rewards/rejected": -2.000993251800537, + "step": 1770 + }, + { + "epoch": 0.93, + "grad_norm": 5.15625, + "learning_rate": 7.022647858135501e-08, + "logits/chosen": -2.744938850402832, + "logits/rejected": -2.7732322216033936, + "logps/chosen": -332.72509765625, + "logps/rejected": -421.79949951171875, + "loss": 0.4295, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7302526235580444, + "rewards/margins": 1.0942634344100952, + "rewards/rejected": -1.82451593875885, + "step": 1780 + }, + { + "epoch": 0.94, + "grad_norm": 1.7734375, + "learning_rate": 5.987963446492384e-08, + "logits/chosen": -2.6715445518493652, + "logits/rejected": -2.6765220165252686, + "logps/chosen": -348.5739440917969, + "logps/rejected": -482.542236328125, + "loss": 0.4117, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.6485568284988403, + "rewards/margins": 1.4271323680877686, + "rewards/rejected": -2.0756890773773193, + "step": 1790 + }, + { + "epoch": 0.94, + "grad_norm": 4.46875, + "learning_rate": 5.034777072871394e-08, + "logits/chosen": -2.7057948112487793, + "logits/rejected": -2.7085628509521484, + "logps/chosen": -318.53497314453125, + "logps/rejected": -424.2813415527344, + "loss": 0.4289, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7039138674736023, + "rewards/margins": 1.1032226085662842, + "rewards/rejected": -1.8071365356445312, + "step": 1800 + }, + { + "epoch": 0.95, + "grad_norm": 2.5625, + "learning_rate": 4.163407093778243e-08, + "logits/chosen": -2.6782615184783936, + "logits/rejected": -2.6741080284118652, + "logps/chosen": -333.125732421875, + "logps/rejected": -420.4185485839844, + "loss": 0.4345, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.8739233016967773, + "rewards/margins": 1.0754514932632446, + "rewards/rejected": -1.949374794960022, + "step": 1810 + }, + { + "epoch": 0.95, + "grad_norm": 2.3125, + "learning_rate": 3.37414453970758e-08, + "logits/chosen": -2.742910146713257, + "logits/rejected": -2.7536118030548096, + "logps/chosen": -337.20489501953125, + "logps/rejected": -413.38238525390625, + "loss": 0.4246, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6338103413581848, + "rewards/margins": 1.2298805713653564, + "rewards/rejected": -1.863690972328186, + "step": 1820 + }, + { + "epoch": 0.96, + "grad_norm": 7.125, + "learning_rate": 2.6672530179410183e-08, + "logits/chosen": -2.7278337478637695, + "logits/rejected": -2.7276864051818848, + "logps/chosen": -324.9429626464844, + "logps/rejected": -436.3682556152344, + "loss": 0.4287, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.6213011145591736, + "rewards/margins": 1.2622824907302856, + "rewards/rejected": -1.883583426475525, + "step": 1830 + }, + { + "epoch": 0.96, + "grad_norm": 1.5859375, + "learning_rate": 2.04296862450451e-08, + "logits/chosen": -2.7341113090515137, + "logits/rejected": -2.7228341102600098, + "logps/chosen": -359.82659912109375, + "logps/rejected": -492.15771484375, + "loss": 0.4246, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8840007781982422, + "rewards/margins": 1.547300100326538, + "rewards/rejected": -2.431300640106201, + "step": 1840 + }, + { + "epoch": 0.97, + "grad_norm": 1.3046875, + "learning_rate": 1.501499865314171e-08, + "logits/chosen": -2.655301809310913, + "logits/rejected": -2.6794886589050293, + "logps/chosen": -379.09075927734375, + "logps/rejected": -465.2655334472656, + "loss": 0.4153, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -0.6638426184654236, + "rewards/margins": 1.2455824613571167, + "rewards/rejected": -1.909425139427185, + "step": 1850 + }, + { + "epoch": 0.97, + "grad_norm": 1.8046875, + "learning_rate": 1.0430275865371265e-08, + "logits/chosen": -2.700707197189331, + "logits/rejected": -2.6735973358154297, + "logps/chosen": -324.67864990234375, + "logps/rejected": -426.3045959472656, + "loss": 0.4269, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7671259641647339, + "rewards/margins": 1.2393693923950195, + "rewards/rejected": -2.006495475769043, + "step": 1860 + }, + { + "epoch": 0.98, + "grad_norm": 2.96875, + "learning_rate": 6.677049141901315e-09, + "logits/chosen": -2.714787006378174, + "logits/rejected": -2.729104518890381, + "logps/chosen": -332.42156982421875, + "logps/rejected": -465.78497314453125, + "loss": 0.4213, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6763681769371033, + "rewards/margins": 1.4941723346710205, + "rewards/rejected": -2.1705403327941895, + "step": 1870 + }, + { + "epoch": 0.98, + "grad_norm": 1.4140625, + "learning_rate": 3.756572029968708e-09, + "logits/chosen": -2.7444651126861572, + "logits/rejected": -2.756309986114502, + "logps/chosen": -348.5865783691406, + "logps/rejected": -453.53350830078125, + "loss": 0.4206, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7736285328865051, + "rewards/margins": 1.2819445133209229, + "rewards/rejected": -2.055572986602783, + "step": 1880 + }, + { + "epoch": 0.99, + "grad_norm": 3.234375, + "learning_rate": 1.6698199452053199e-09, + "logits/chosen": -2.7279868125915527, + "logits/rejected": -2.7107343673706055, + "logps/chosen": -339.8231506347656, + "logps/rejected": -435.91436767578125, + "loss": 0.4382, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6851938962936401, + "rewards/margins": 0.9626883268356323, + "rewards/rejected": -1.6478822231292725, + "step": 1890 + }, + { + "epoch": 0.99, + "grad_norm": 1.25, + "learning_rate": 4.1748984585560094e-10, + "logits/chosen": -2.7459967136383057, + "logits/rejected": -2.7115063667297363, + "logps/chosen": -353.6007385253906, + "logps/rejected": -478.2798767089844, + "loss": 0.4255, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7070942521095276, + "rewards/margins": 1.4682670831680298, + "rewards/rejected": -2.175361156463623, + "step": 1900 + }, + { + "epoch": 1.0, + "grad_norm": 0.84765625, + "learning_rate": 0.0, + "logits/chosen": -2.7471261024475098, + "logits/rejected": -2.7326393127441406, + "logps/chosen": -346.9029846191406, + "logps/rejected": -450.1078186035156, + "loss": 0.4197, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7211757302284241, + "rewards/margins": 1.336022973060608, + "rewards/rejected": -2.0571985244750977, + "step": 1910 + }, { "epoch": 1.0, - "step": 2, + "step": 1910, "total_flos": 0.0, - "train_loss": 0.5000961124897003, - "train_runtime": 87.0205, - "train_samples_per_second": 0.701, + "train_loss": 0.4411179514611579, + "train_runtime": 83930.2059, + "train_samples_per_second": 0.728, "train_steps_per_second": 0.023 } ], "logging_steps": 10, - "max_steps": 2, + "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,