{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 4176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.744801810219906, "learning_rate": 1.1961722488038277e-09, "logits/chosen": -2.481754779815674, "logits/rejected": -2.508425712585449, "logps/chosen": -415.2900390625, "logps/rejected": -256.90203857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.7256023826089337, "learning_rate": 1.1961722488038278e-08, "logits/chosen": -2.3546273708343506, "logits/rejected": -2.258561372756958, "logps/chosen": -285.6593322753906, "logps/rejected": -256.5340576171875, "loss": 0.6932, "rewards/accuracies": 0.0972222238779068, "rewards/chosen": -0.00019415299175307155, "rewards/margins": -0.00012803316349163651, "rewards/margins_max": 0.0011134507367387414, "rewards/margins_min": -0.001636704197153449, "rewards/margins_std": 0.0011796443723142147, "rewards/rejected": -6.611982826143503e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.3185006719657775, "learning_rate": 2.3923444976076555e-08, "logits/chosen": -2.2974579334259033, "logits/rejected": -2.195504665374756, "logps/chosen": -227.86630249023438, "logps/rejected": -247.80667114257812, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0003401043068151921, "rewards/margins": -0.000454053923022002, "rewards/margins_max": 0.0026182287838310003, "rewards/margins_min": -0.0034712045453488827, "rewards/margins_std": 0.0027132206596434116, "rewards/rejected": 0.0001139496307587251, "step": 20 }, { "epoch": 0.01, "grad_norm": 3.1888704532735774, "learning_rate": 3.588516746411483e-08, "logits/chosen": -2.4950051307678223, "logits/rejected": -2.3585734367370605, "logps/chosen": -327.5831298828125, "logps/rejected": -277.7046813964844, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00012490439985413104, "rewards/margins": -0.0002924039145000279, "rewards/margins_max": 0.0038055398035794497, "rewards/margins_min": -0.004406157415360212, "rewards/margins_std": 0.0036161807365715504, "rewards/rejected": 0.00016749951464589685, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.4538656683781253, "learning_rate": 4.784688995215311e-08, "logits/chosen": -2.4015650749206543, "logits/rejected": -2.295456647872925, "logps/chosen": -252.20578002929688, "logps/rejected": -236.5134735107422, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00013311101065482944, "rewards/margins": -1.4377932302522822e-06, "rewards/margins_max": 0.002963937120512128, "rewards/margins_min": -0.0032969701569527388, "rewards/margins_std": 0.002770522143691778, "rewards/rejected": -0.00013167322322260588, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.686019844714698, "learning_rate": 5.980861244019139e-08, "logits/chosen": -2.497512102127075, "logits/rejected": -2.4310920238494873, "logps/chosen": -318.91717529296875, "logps/rejected": -323.89068603515625, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00019797799177467823, "rewards/margins": 9.350036270916462e-05, "rewards/margins_max": 0.003976952284574509, "rewards/margins_min": -0.004525828640908003, "rewards/margins_std": 0.003774531651288271, "rewards/rejected": -0.00029147841269150376, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.4438452660608943, "learning_rate": 7.177033492822967e-08, "logits/chosen": -2.459364414215088, "logits/rejected": -2.2450814247131348, "logps/chosen": -299.24847412109375, "logps/rejected": -252.3929443359375, "loss": 0.6929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.616997754666954e-05, "rewards/margins": -2.539743400120642e-05, "rewards/margins_max": 0.003886754158884287, "rewards/margins_min": -0.004287646152079105, "rewards/margins_std": 0.003580376971513033, "rewards/rejected": -5.0772534450516105e-05, "step": 60 }, { "epoch": 0.02, "grad_norm": 3.3140747357852565, "learning_rate": 8.373205741626794e-08, "logits/chosen": -2.3426272869110107, "logits/rejected": -2.259082078933716, "logps/chosen": -276.5811462402344, "logps/rejected": -260.48822021484375, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0003733697230927646, "rewards/margins": 0.00028731845668517053, "rewards/margins_max": 0.004445819184184074, "rewards/margins_min": -0.0037965658120810986, "rewards/margins_std": 0.0036791344173252583, "rewards/rejected": 8.605121547589079e-05, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.416214166129825, "learning_rate": 9.569377990430622e-08, "logits/chosen": -2.2033050060272217, "logits/rejected": -2.2411110401153564, "logps/chosen": -213.8606414794922, "logps/rejected": -250.7822723388672, "loss": 0.6928, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -7.874786206230056e-06, "rewards/margins": 0.0005829391884617507, "rewards/margins_max": 0.004679401405155659, "rewards/margins_min": -0.002689824905246496, "rewards/margins_std": 0.0033095483668148518, "rewards/rejected": -0.000590813928283751, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.600111364767266, "learning_rate": 1.076555023923445e-07, "logits/chosen": -2.377185583114624, "logits/rejected": -2.2353789806365967, "logps/chosen": -273.81182861328125, "logps/rejected": -251.4143829345703, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0008434997871518135, "rewards/margins": -1.6722106011002325e-05, "rewards/margins_max": 0.004678300581872463, "rewards/margins_min": -0.004732917062938213, "rewards/margins_std": 0.004126701038330793, "rewards/rejected": -0.0008267777739092708, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.417577363662982, "learning_rate": 1.1961722488038278e-07, "logits/chosen": -2.3500170707702637, "logits/rejected": -2.3235528469085693, "logps/chosen": -299.5725402832031, "logps/rejected": -314.0147705078125, "loss": 0.6928, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00042938842670992017, "rewards/margins": 0.0005152079975232482, "rewards/margins_max": 0.006287244614213705, "rewards/margins_min": -0.004308235365897417, "rewards/margins_std": 0.004770913161337376, "rewards/rejected": -0.0009445963660255075, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -2.285825252532959, "eval_logits/rejected": -2.1755590438842773, "eval_logps/chosen": -271.71380615234375, "eval_logps/rejected": -261.3662414550781, "eval_loss": 0.6928360462188721, "eval_rewards/accuracies": 0.5665000081062317, "eval_rewards/chosen": -0.0005076593370176852, "eval_rewards/margins": 0.0007294937968254089, "eval_rewards/margins_max": 0.007028755731880665, "eval_rewards/margins_min": -0.0052985576912760735, "eval_rewards/margins_std": 0.004023722838610411, "eval_rewards/rejected": -0.0012371530756354332, "eval_runtime": 1495.6967, "eval_samples_per_second": 2.674, "eval_steps_per_second": 0.167, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.3727695632721324, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -2.4297595024108887, "logits/rejected": -2.3567004203796387, "logps/chosen": -264.10272216796875, "logps/rejected": -260.05914306640625, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8811673726304434e-05, "rewards/margins": 0.0006207687547430396, "rewards/margins_max": 0.004516714718192816, "rewards/margins_min": -0.003514329669997096, "rewards/margins_std": 0.0036770254373550415, "rewards/rejected": -0.0006395805394276977, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.2707293658706544, "learning_rate": 1.4354066985645933e-07, "logits/chosen": -2.408071994781494, "logits/rejected": -2.240220069885254, "logps/chosen": -256.7394104003906, "logps/rejected": -215.1817626953125, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0001598354720044881, "rewards/margins": 0.0010401509935036302, "rewards/margins_max": 0.005896436516195536, "rewards/margins_min": -0.003224579617381096, "rewards/margins_std": 0.00400746101513505, "rewards/rejected": -0.001199986319988966, "step": 120 }, { "epoch": 0.03, "grad_norm": 3.129791401258727, "learning_rate": 1.555023923444976e-07, "logits/chosen": -2.4550232887268066, "logits/rejected": -2.3440098762512207, "logps/chosen": -299.2154235839844, "logps/rejected": -285.93115234375, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.8765596653101966e-05, "rewards/margins": 0.0018414685036987066, "rewards/margins_max": 0.0066340044140815735, "rewards/margins_min": -0.002378536621108651, "rewards/margins_std": 0.004152114503085613, "rewards/rejected": -0.001870234147645533, "step": 130 }, { "epoch": 0.03, "grad_norm": 2.810312904231474, "learning_rate": 1.6746411483253589e-07, "logits/chosen": -2.4631824493408203, "logits/rejected": -2.376560926437378, "logps/chosen": -277.13677978515625, "logps/rejected": -260.3792419433594, "loss": 0.692, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00046903896145522594, "rewards/margins": 0.0031966641545295715, "rewards/margins_max": 0.009032927453517914, "rewards/margins_min": -0.0024510668590664864, "rewards/margins_std": 0.005039413925260305, "rewards/rejected": -0.0027276247274130583, "step": 140 }, { "epoch": 0.04, "grad_norm": 2.673609477341278, "learning_rate": 1.7942583732057415e-07, "logits/chosen": -2.5478498935699463, "logits/rejected": -2.4153032302856445, "logps/chosen": -315.06866455078125, "logps/rejected": -316.5470275878906, "loss": 0.6926, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.002179680857807398, "rewards/margins": 0.0008115528034977615, "rewards/margins_max": 0.007841795682907104, "rewards/margins_min": -0.00833915639668703, "rewards/margins_std": 0.007188429590314627, "rewards/rejected": -0.0029912334866821766, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.6323580477010315, "learning_rate": 1.9138755980861244e-07, "logits/chosen": -2.3698267936706543, "logits/rejected": -2.354356288909912, "logps/chosen": -241.5719757080078, "logps/rejected": -250.2275390625, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0004969940055161715, "rewards/margins": 0.002713206224143505, "rewards/margins_max": 0.009774195961654186, "rewards/margins_min": -0.003615331370383501, "rewards/margins_std": 0.006041725166141987, "rewards/rejected": -0.00321020046249032, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.3420843690812103, "learning_rate": 2.033492822966507e-07, "logits/chosen": -2.3809444904327393, "logits/rejected": -2.243154764175415, "logps/chosen": -280.31561279296875, "logps/rejected": -229.35989379882812, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.002677258802577853, "rewards/margins": 0.0020244556944817305, "rewards/margins_max": 0.013485155999660492, "rewards/margins_min": -0.007769724819809198, "rewards/margins_std": 0.009472411125898361, "rewards/rejected": -0.004701714497059584, "step": 170 }, { "epoch": 0.04, "grad_norm": 3.2128438743274192, "learning_rate": 2.15311004784689e-07, "logits/chosen": -2.4034183025360107, "logits/rejected": -2.279552936553955, "logps/chosen": -295.80950927734375, "logps/rejected": -290.2058410644531, "loss": 0.691, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.002126263454556465, "rewards/margins": 0.003915103618055582, "rewards/margins_max": 0.012811603024601936, "rewards/margins_min": -0.004002536181360483, "rewards/margins_std": 0.007635393179953098, "rewards/rejected": -0.006041367072612047, "step": 180 }, { "epoch": 0.05, "grad_norm": 2.0432338797846907, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.4159984588623047, "logits/rejected": -2.3051364421844482, "logps/chosen": -212.4981689453125, "logps/rejected": -184.82437133789062, "loss": 0.6919, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0017597004771232605, "rewards/margins": 0.002568793948739767, "rewards/margins_max": 0.013890397734940052, "rewards/margins_min": -0.007022005971521139, "rewards/margins_std": 0.009462077170610428, "rewards/rejected": -0.00432849396020174, "step": 190 }, { "epoch": 0.05, "grad_norm": 4.689874276629473, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.4625015258789062, "logits/rejected": -2.327069044113159, "logps/chosen": -251.8259735107422, "logps/rejected": -225.61746215820312, "loss": 0.6908, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0033970631193369627, "rewards/margins": 0.0032132845371961594, "rewards/margins_max": 0.01489048171788454, "rewards/margins_min": -0.0057045575231313705, "rewards/margins_std": 0.009178846143186092, "rewards/rejected": -0.006610347423702478, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.2858331203460693, "eval_logits/rejected": -2.176124095916748, "eval_logps/chosen": -272.1465759277344, "eval_logps/rejected": -262.0458679199219, "eval_loss": 0.6916428804397583, "eval_rewards/accuracies": 0.6019999980926514, "eval_rewards/chosen": -0.004834940657019615, "eval_rewards/margins": 0.0031981458887457848, "eval_rewards/margins_max": 0.02634728141129017, "eval_rewards/margins_min": -0.01737046428024769, "eval_rewards/margins_std": 0.0141350282356143, "eval_rewards/rejected": -0.0080330865457654, "eval_runtime": 1495.9818, "eval_samples_per_second": 2.674, "eval_steps_per_second": 0.167, "step": 200 }, { "epoch": 0.05, "grad_norm": 2.9394510161433467, "learning_rate": 2.511961722488038e-07, "logits/chosen": -2.438068151473999, "logits/rejected": -2.2896053791046143, "logps/chosen": -278.2657470703125, "logps/rejected": -255.22091674804688, "loss": 0.692, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0075510716997087, "rewards/margins": 0.0016397204017266631, "rewards/margins_max": 0.01669425331056118, "rewards/margins_min": -0.014115629717707634, "rewards/margins_std": 0.01389253605157137, "rewards/rejected": -0.009190792217850685, "step": 210 }, { "epoch": 0.05, "grad_norm": 2.0313599776792977, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.416560173034668, "logits/rejected": -2.2866508960723877, "logps/chosen": -242.6060791015625, "logps/rejected": -233.04611206054688, "loss": 0.6906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002334106247872114, "rewards/margins": 0.004502108320593834, "rewards/margins_max": 0.0231974758207798, "rewards/margins_min": -0.01248755119740963, "rewards/margins_std": 0.015686124563217163, "rewards/rejected": -0.006836214568465948, "step": 220 }, { "epoch": 0.06, "grad_norm": 3.2674602115604805, "learning_rate": 2.7511961722488034e-07, "logits/chosen": -2.435661792755127, "logits/rejected": -2.305683135986328, "logps/chosen": -262.1702575683594, "logps/rejected": -256.0978088378906, "loss": 0.6893, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0018732731696218252, "rewards/margins": 0.009099382907152176, "rewards/margins_max": 0.026288732886314392, "rewards/margins_min": -0.007434495724737644, "rewards/margins_std": 0.014912542887032032, "rewards/rejected": -0.010972656309604645, "step": 230 }, { "epoch": 0.06, "grad_norm": 2.7642807791839403, "learning_rate": 2.8708133971291866e-07, "logits/chosen": -2.4456396102905273, "logits/rejected": -2.284302234649658, "logps/chosen": -244.33309936523438, "logps/rejected": -237.9929962158203, "loss": 0.6898, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006622855551540852, "rewards/margins": 0.0064144073985517025, "rewards/margins_max": 0.0257855411618948, "rewards/margins_min": -0.00851544737815857, "rewards/margins_std": 0.015483888797461987, "rewards/rejected": -0.013037264347076416, "step": 240 }, { "epoch": 0.06, "grad_norm": 2.4960458001678663, "learning_rate": 2.990430622009569e-07, "logits/chosen": -2.3569631576538086, "logits/rejected": -2.339345932006836, "logps/chosen": -260.26513671875, "logps/rejected": -286.07720947265625, "loss": 0.6895, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005638328846544027, "rewards/margins": 0.007306135259568691, "rewards/margins_max": 0.027475446462631226, "rewards/margins_min": -0.008061405271291733, "rewards/margins_std": 0.01602182537317276, "rewards/rejected": -0.012944464571774006, "step": 250 }, { "epoch": 0.06, "grad_norm": 2.9469795669660206, "learning_rate": 3.110047846889952e-07, "logits/chosen": -2.2550477981567383, "logits/rejected": -2.3483004570007324, "logps/chosen": -243.9563446044922, "logps/rejected": -280.0416259765625, "loss": 0.6882, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0039217351004481316, "rewards/margins": 0.01320717204362154, "rewards/margins_max": 0.03990231081843376, "rewards/margins_min": -0.008333822712302208, "rewards/margins_std": 0.02189290151000023, "rewards/rejected": -0.017128905281424522, "step": 260 }, { "epoch": 0.06, "grad_norm": 2.623877239804059, "learning_rate": 3.229665071770335e-07, "logits/chosen": -2.494788885116577, "logits/rejected": -2.2321584224700928, "logps/chosen": -316.99365234375, "logps/rejected": -240.0007781982422, "loss": 0.6891, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.011083757504820824, "rewards/margins": 0.005596992559731007, "rewards/margins_max": 0.04543240740895271, "rewards/margins_min": -0.02876010164618492, "rewards/margins_std": 0.03349614888429642, "rewards/rejected": -0.016680750995874405, "step": 270 }, { "epoch": 0.07, "grad_norm": 2.512390141908436, "learning_rate": 3.3492822966507177e-07, "logits/chosen": -2.433187246322632, "logits/rejected": -2.3799986839294434, "logps/chosen": -249.8145294189453, "logps/rejected": -254.520751953125, "loss": 0.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.012405243702232838, "rewards/margins": 0.00736930500715971, "rewards/margins_max": 0.04089691862463951, "rewards/margins_min": -0.018680576235055923, "rewards/margins_std": 0.02640439197421074, "rewards/rejected": -0.0197745468467474, "step": 280 }, { "epoch": 0.07, "grad_norm": 2.7846602202817192, "learning_rate": 3.4688995215311004e-07, "logits/chosen": -2.3288509845733643, "logits/rejected": -2.247399091720581, "logps/chosen": -246.5977020263672, "logps/rejected": -219.98806762695312, "loss": 0.6877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012363080866634846, "rewards/margins": 0.007946793921291828, "rewards/margins_max": 0.04615107923746109, "rewards/margins_min": -0.026964480057358742, "rewards/margins_std": 0.03265797719359398, "rewards/rejected": -0.020309874787926674, "step": 290 }, { "epoch": 0.07, "grad_norm": 2.7114876400544348, "learning_rate": 3.588516746411483e-07, "logits/chosen": -2.5003726482391357, "logits/rejected": -2.456261396408081, "logps/chosen": -239.73556518554688, "logps/rejected": -242.52676391601562, "loss": 0.6872, "rewards/accuracies": 0.625, "rewards/chosen": -0.013501867651939392, "rewards/margins": 0.01004777941852808, "rewards/margins_max": 0.047369394451379776, "rewards/margins_min": -0.02645121142268181, "rewards/margins_std": 0.03306268900632858, "rewards/rejected": -0.023549648001790047, "step": 300 }, { "epoch": 0.07, "eval_logits/chosen": -2.276411533355713, "eval_logits/rejected": -2.166684150695801, "eval_logps/chosen": -273.3193359375, "eval_logps/rejected": -263.7999267578125, "eval_loss": 0.6890299320220947, "eval_rewards/accuracies": 0.6069999933242798, "eval_rewards/chosen": -0.016562845557928085, "eval_rewards/margins": 0.00901089608669281, "eval_rewards/margins_max": 0.07224840670824051, "eval_rewards/margins_min": -0.04667587950825691, "eval_rewards/margins_std": 0.038151100277900696, "eval_rewards/rejected": -0.025573737919330597, "eval_runtime": 1496.299, "eval_samples_per_second": 2.673, "eval_steps_per_second": 0.167, "step": 300 }, { "epoch": 0.07, "grad_norm": 2.223509612961429, "learning_rate": 3.7081339712918656e-07, "logits/chosen": -2.504915714263916, "logits/rejected": -2.2530007362365723, "logps/chosen": -249.86563110351562, "logps/rejected": -196.68069458007812, "loss": 0.6888, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01248231716454029, "rewards/margins": 0.008637329563498497, "rewards/margins_max": 0.04928823560476303, "rewards/margins_min": -0.024911368265748024, "rewards/margins_std": 0.03280934318900108, "rewards/rejected": -0.021119646728038788, "step": 310 }, { "epoch": 0.08, "grad_norm": 2.927336947675243, "learning_rate": 3.827751196172249e-07, "logits/chosen": -2.4442334175109863, "logits/rejected": -2.3791956901550293, "logps/chosen": -266.7646789550781, "logps/rejected": -352.1415710449219, "loss": 0.6862, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.014878204092383385, "rewards/margins": 0.01517223846167326, "rewards/margins_max": 0.07721895724534988, "rewards/margins_min": -0.04059915989637375, "rewards/margins_std": 0.052051711827516556, "rewards/rejected": -0.03005044534802437, "step": 320 }, { "epoch": 0.08, "grad_norm": 2.716965100409286, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -2.4895529747009277, "logits/rejected": -2.343742609024048, "logps/chosen": -317.1722412109375, "logps/rejected": -265.96710205078125, "loss": 0.684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011329003609716892, "rewards/margins": 0.02624618634581566, "rewards/margins_max": 0.0863041877746582, "rewards/margins_min": -0.01587349735200405, "rewards/margins_std": 0.0461648590862751, "rewards/rejected": -0.037575189024209976, "step": 330 }, { "epoch": 0.08, "grad_norm": 2.937631482367434, "learning_rate": 4.066985645933014e-07, "logits/chosen": -2.4698424339294434, "logits/rejected": -2.42517352104187, "logps/chosen": -321.38336181640625, "logps/rejected": -269.46392822265625, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": -0.011882015503942966, "rewards/margins": 0.025472700595855713, "rewards/margins_max": 0.08070038259029388, "rewards/margins_min": -0.021532099694013596, "rewards/margins_std": 0.04498646780848503, "rewards/rejected": -0.037354715168476105, "step": 340 }, { "epoch": 0.08, "grad_norm": 2.944737223897925, "learning_rate": 4.1866028708133973e-07, "logits/chosen": -2.363245964050293, "logits/rejected": -2.18719220161438, "logps/chosen": -252.5238037109375, "logps/rejected": -215.97488403320312, "loss": 0.6866, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.02260414883494377, "rewards/margins": 0.0049986266531050205, "rewards/margins_max": 0.05810749530792236, "rewards/margins_min": -0.04638128727674484, "rewards/margins_std": 0.046332523226737976, "rewards/rejected": -0.027602773159742355, "step": 350 }, { "epoch": 0.09, "grad_norm": 2.9913618975639285, "learning_rate": 4.30622009569378e-07, "logits/chosen": -2.4596152305603027, "logits/rejected": -2.346587896347046, "logps/chosen": -274.7042541503906, "logps/rejected": -251.25399780273438, "loss": 0.684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.018944965675473213, "rewards/margins": 0.020642247051000595, "rewards/margins_max": 0.07854585349559784, "rewards/margins_min": -0.04149174317717552, "rewards/margins_std": 0.05351649597287178, "rewards/rejected": -0.03958721458911896, "step": 360 }, { "epoch": 0.09, "grad_norm": 2.046448641336004, "learning_rate": 4.425837320574162e-07, "logits/chosen": -2.586242198944092, "logits/rejected": -2.3883042335510254, "logps/chosen": -289.3235778808594, "logps/rejected": -231.667236328125, "loss": 0.6879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029600298032164574, "rewards/margins": 0.013325506821274757, "rewards/margins_max": 0.07799428701400757, "rewards/margins_min": -0.05791464447975159, "rewards/margins_std": 0.061714719980955124, "rewards/rejected": -0.04292580485343933, "step": 370 }, { "epoch": 0.09, "grad_norm": 3.074248840882752, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.440675973892212, "logits/rejected": -2.2678444385528564, "logps/chosen": -269.9767761230469, "logps/rejected": -229.9214324951172, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027350476011633873, "rewards/margins": 0.018574869260191917, "rewards/margins_max": 0.08652856945991516, "rewards/margins_min": -0.04643934965133667, "rewards/margins_std": 0.06029176712036133, "rewards/rejected": -0.04592534154653549, "step": 380 }, { "epoch": 0.09, "grad_norm": 2.6458884606412996, "learning_rate": 4.665071770334928e-07, "logits/chosen": -2.322763681411743, "logits/rejected": -2.1636500358581543, "logps/chosen": -295.81475830078125, "logps/rejected": -265.8274841308594, "loss": 0.6799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02743806317448616, "rewards/margins": 0.028139978647232056, "rewards/margins_max": 0.09491652250289917, "rewards/margins_min": -0.036690663546323776, "rewards/margins_std": 0.05786781385540962, "rewards/rejected": -0.055578041821718216, "step": 390 }, { "epoch": 0.1, "grad_norm": 3.1278142845262296, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.286860466003418, "logits/rejected": -2.2807908058166504, "logps/chosen": -255.78762817382812, "logps/rejected": -262.1524963378906, "loss": 0.6838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.024156948551535606, "rewards/margins": 0.026079341769218445, "rewards/margins_max": 0.09697680175304413, "rewards/margins_min": -0.038907431066036224, "rewards/margins_std": 0.06209443137049675, "rewards/rejected": -0.0502362921833992, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -2.2694156169891357, "eval_logits/rejected": -2.1606152057647705, "eval_logps/chosen": -275.453857421875, "eval_logps/rejected": -267.0469055175781, "eval_loss": 0.6843879222869873, "eval_rewards/accuracies": 0.6019999980926514, "eval_rewards/chosen": -0.037907905876636505, "eval_rewards/margins": 0.02013549767434597, "eval_rewards/margins_max": 0.15662512183189392, "eval_rewards/margins_min": -0.09699905663728714, "eval_rewards/margins_std": 0.08162891864776611, "eval_rewards/rejected": -0.05804340913891792, "eval_runtime": 1495.1262, "eval_samples_per_second": 2.675, "eval_steps_per_second": 0.167, "step": 400 }, { "epoch": 0.1, "grad_norm": 2.8679171680954525, "learning_rate": 4.904306220095694e-07, "logits/chosen": -2.437248945236206, "logits/rejected": -2.2588629722595215, "logps/chosen": -312.7847595214844, "logps/rejected": -266.1875305175781, "loss": 0.6797, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.034851789474487305, "rewards/margins": 0.029562119394540787, "rewards/margins_max": 0.15038950741291046, "rewards/margins_min": -0.0622372031211853, "rewards/margins_std": 0.09603596478700638, "rewards/rejected": -0.06441390514373779, "step": 410 }, { "epoch": 0.1, "grad_norm": 3.6657107291965203, "learning_rate": 4.999996505732917e-07, "logits/chosen": -2.3876137733459473, "logits/rejected": -2.323385238647461, "logps/chosen": -294.72467041015625, "logps/rejected": -299.84619140625, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.04627472162246704, "rewards/margins": 0.016848457977175713, "rewards/margins_max": 0.09452972561120987, "rewards/margins_min": -0.06856296956539154, "rewards/margins_std": 0.07243378460407257, "rewards/rejected": -0.0631231889128685, "step": 420 }, { "epoch": 0.1, "grad_norm": 2.892546803615872, "learning_rate": 4.999874207410648e-07, "logits/chosen": -2.342020273208618, "logits/rejected": -2.3208391666412354, "logps/chosen": -239.465087890625, "logps/rejected": -265.7638854980469, "loss": 0.6764, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02282531186938286, "rewards/margins": 0.032338697463274, "rewards/margins_max": 0.13463613390922546, "rewards/margins_min": -0.03967132791876793, "rewards/margins_std": 0.07847634702920914, "rewards/rejected": -0.05516401678323746, "step": 430 }, { "epoch": 0.11, "grad_norm": 2.7569204297902883, "learning_rate": 4.999577205502039e-07, "logits/chosen": -2.3258602619171143, "logits/rejected": -2.255052089691162, "logps/chosen": -232.8247528076172, "logps/rejected": -225.0179901123047, "loss": 0.69, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.043977439403533936, "rewards/margins": 0.00877896137535572, "rewards/margins_max": 0.10356054455041885, "rewards/margins_min": -0.08472234010696411, "rewards/margins_std": 0.08403487503528595, "rewards/rejected": -0.05275639891624451, "step": 440 }, { "epoch": 0.11, "grad_norm": 2.9637151487466125, "learning_rate": 4.999105520763054e-07, "logits/chosen": -2.3890719413757324, "logits/rejected": -2.175832748413086, "logps/chosen": -280.5219421386719, "logps/rejected": -259.12115478515625, "loss": 0.6854, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04905271530151367, "rewards/margins": 0.004657468758523464, "rewards/margins_max": 0.09759913384914398, "rewards/margins_min": -0.10071220248937607, "rewards/margins_std": 0.08687575161457062, "rewards/rejected": -0.05371018499135971, "step": 450 }, { "epoch": 0.11, "grad_norm": 3.592225257174, "learning_rate": 4.998459186157357e-07, "logits/chosen": -2.4658498764038086, "logits/rejected": -2.2778396606445312, "logps/chosen": -283.7544250488281, "logps/rejected": -273.81951904296875, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05411519855260849, "rewards/margins": 0.023511165753006935, "rewards/margins_max": 0.14344921708106995, "rewards/margins_min": -0.0850573182106018, "rewards/margins_std": 0.10127091407775879, "rewards/rejected": -0.07762636989355087, "step": 460 }, { "epoch": 0.11, "grad_norm": 3.2810414565890444, "learning_rate": 4.997638246854011e-07, "logits/chosen": -2.4672646522521973, "logits/rejected": -2.377880573272705, "logps/chosen": -284.3224792480469, "logps/rejected": -292.3838806152344, "loss": 0.6869, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.06258692592382431, "rewards/margins": 0.016758792102336884, "rewards/margins_max": 0.1475452482700348, "rewards/margins_min": -0.09731698036193848, "rewards/margins_std": 0.11060823500156403, "rewards/rejected": -0.0793457180261612, "step": 470 }, { "epoch": 0.11, "grad_norm": 3.801090365127773, "learning_rate": 4.996642760224317e-07, "logits/chosen": -2.3300671577453613, "logits/rejected": -2.2400989532470703, "logps/chosen": -278.75213623046875, "logps/rejected": -277.0263366699219, "loss": 0.679, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03630394488573074, "rewards/margins": 0.03409670665860176, "rewards/margins_max": 0.1376000940799713, "rewards/margins_min": -0.06601649522781372, "rewards/margins_std": 0.08970925956964493, "rewards/rejected": -0.07040064036846161, "step": 480 }, { "epoch": 0.12, "grad_norm": 2.546210734795443, "learning_rate": 4.995472795837813e-07, "logits/chosen": -2.554861545562744, "logits/rejected": -2.279754161834717, "logps/chosen": -247.8121337890625, "logps/rejected": -226.10964965820312, "loss": 0.6754, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.046758852899074554, "rewards/margins": 0.028865143656730652, "rewards/margins_max": 0.12532363831996918, "rewards/margins_min": -0.05888228863477707, "rewards/margins_std": 0.08213096857070923, "rewards/rejected": -0.0756240040063858, "step": 490 }, { "epoch": 0.12, "grad_norm": 2.6204591258543677, "learning_rate": 4.994128435457401e-07, "logits/chosen": -2.4862895011901855, "logits/rejected": -2.323430061340332, "logps/chosen": -298.39312744140625, "logps/rejected": -276.8516540527344, "loss": 0.6798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05121801421046257, "rewards/margins": 0.024608764797449112, "rewards/margins_max": 0.15406695008277893, "rewards/margins_min": -0.10288794338703156, "rewards/margins_std": 0.11485779285430908, "rewards/rejected": -0.07582677900791168, "step": 500 }, { "epoch": 0.12, "eval_logits/chosen": -2.2652719020843506, "eval_logits/rejected": -2.1578903198242188, "eval_logps/chosen": -277.9360656738281, "eval_logps/rejected": -270.9397277832031, "eval_loss": 0.6788930892944336, "eval_rewards/accuracies": 0.6175000071525574, "eval_rewards/chosen": -0.06273017823696136, "eval_rewards/margins": 0.03424164652824402, "eval_rewards/margins_max": 0.24493688344955444, "eval_rewards/margins_min": -0.14137069880962372, "eval_rewards/margins_std": 0.12482821941375732, "eval_rewards/rejected": -0.09697181731462479, "eval_runtime": 1493.3874, "eval_samples_per_second": 2.678, "eval_steps_per_second": 0.167, "step": 500 }, { "epoch": 0.12, "grad_norm": 2.7564157725889165, "learning_rate": 4.992609773033638e-07, "logits/chosen": -2.4324123859405518, "logits/rejected": -2.287686824798584, "logps/chosen": -306.4532775878906, "logps/rejected": -306.77630615234375, "loss": 0.6744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06588558852672577, "rewards/margins": 0.04000314325094223, "rewards/margins_max": 0.18204821646213531, "rewards/margins_min": -0.08453138172626495, "rewards/margins_std": 0.12070759385824203, "rewards/rejected": -0.1058887392282486, "step": 510 }, { "epoch": 0.12, "grad_norm": 2.854281699282652, "learning_rate": 4.990916914698176e-07, "logits/chosen": -2.467494010925293, "logits/rejected": -2.4888877868652344, "logps/chosen": -265.64129638671875, "logps/rejected": -297.2715759277344, "loss": 0.6751, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.05467858165502548, "rewards/margins": 0.049697332084178925, "rewards/margins_max": 0.1833997666835785, "rewards/margins_min": -0.06105310842394829, "rewards/margins_std": 0.110865019261837, "rewards/rejected": -0.1043759137392044, "step": 520 }, { "epoch": 0.13, "grad_norm": 2.7916782625631273, "learning_rate": 4.989049978756335e-07, "logits/chosen": -2.440230369567871, "logits/rejected": -2.247058629989624, "logps/chosen": -251.9412841796875, "logps/rejected": -237.79849243164062, "loss": 0.671, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05540058761835098, "rewards/margins": 0.05279461666941643, "rewards/margins_max": 0.21537013351917267, "rewards/margins_min": -0.07653968036174774, "rewards/margins_std": 0.13138195872306824, "rewards/rejected": -0.10819520801305771, "step": 530 }, { "epoch": 0.13, "grad_norm": 2.92089762124905, "learning_rate": 4.987009095678842e-07, "logits/chosen": -2.5387706756591797, "logits/rejected": -2.266632556915283, "logps/chosen": -332.76495361328125, "logps/rejected": -266.73138427734375, "loss": 0.666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07755639404058456, "rewards/margins": 0.06593222171068192, "rewards/margins_max": 0.27216702699661255, "rewards/margins_min": -0.12844005227088928, "rewards/margins_std": 0.17713655531406403, "rewards/rejected": -0.1434885859489441, "step": 540 }, { "epoch": 0.13, "grad_norm": 2.648184705164278, "learning_rate": 4.984794408092712e-07, "logits/chosen": -2.2772836685180664, "logits/rejected": -2.341265916824341, "logps/chosen": -217.606201171875, "logps/rejected": -243.30752563476562, "loss": 0.6755, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0623566210269928, "rewards/margins": 0.02618757262825966, "rewards/margins_max": 0.13937994837760925, "rewards/margins_min": -0.10764981806278229, "rewards/margins_std": 0.1129981055855751, "rewards/rejected": -0.08854419738054276, "step": 550 }, { "epoch": 0.13, "grad_norm": 3.198024652668761, "learning_rate": 4.982406070771277e-07, "logits/chosen": -2.4093024730682373, "logits/rejected": -2.2449746131896973, "logps/chosen": -255.7589569091797, "logps/rejected": -248.7128448486328, "loss": 0.6783, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07280144840478897, "rewards/margins": 0.03337310627102852, "rewards/margins_max": 0.15352794528007507, "rewards/margins_min": -0.08159597218036652, "rewards/margins_std": 0.10725705325603485, "rewards/rejected": -0.10617456585168839, "step": 560 }, { "epoch": 0.14, "grad_norm": 2.8057639943698134, "learning_rate": 4.979844250623374e-07, "logits/chosen": -2.2806472778320312, "logits/rejected": -2.189868450164795, "logps/chosen": -264.5873107910156, "logps/rejected": -299.18084716796875, "loss": 0.6668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06882619857788086, "rewards/margins": 0.061507683247327805, "rewards/margins_max": 0.2631301283836365, "rewards/margins_min": -0.10709092766046524, "rewards/margins_std": 0.16340143978595734, "rewards/rejected": -0.13033390045166016, "step": 570 }, { "epoch": 0.14, "grad_norm": 2.7594464786382864, "learning_rate": 4.977109126681678e-07, "logits/chosen": -2.4081342220306396, "logits/rejected": -2.3221030235290527, "logps/chosen": -323.609619140625, "logps/rejected": -298.1337890625, "loss": 0.6743, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0895765870809555, "rewards/margins": 0.037911128252744675, "rewards/margins_max": 0.2113153636455536, "rewards/margins_min": -0.14314982295036316, "rewards/margins_std": 0.15852686762809753, "rewards/rejected": -0.12748773396015167, "step": 580 }, { "epoch": 0.14, "grad_norm": 2.694969177494478, "learning_rate": 4.974200890090191e-07, "logits/chosen": -2.41780424118042, "logits/rejected": -2.322680950164795, "logps/chosen": -246.09207153320312, "logps/rejected": -257.63238525390625, "loss": 0.6695, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08706489950418472, "rewards/margins": 0.05959819629788399, "rewards/margins_max": 0.2615725100040436, "rewards/margins_min": -0.0946430116891861, "rewards/margins_std": 0.15711984038352966, "rewards/rejected": -0.14666306972503662, "step": 590 }, { "epoch": 0.14, "grad_norm": 3.05731727983619, "learning_rate": 4.971119744090886e-07, "logits/chosen": -2.361783504486084, "logits/rejected": -2.2509255409240723, "logps/chosen": -261.0163269042969, "logps/rejected": -251.2213592529297, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10072357952594757, "rewards/margins": 0.03956902027130127, "rewards/margins_max": 0.22408254444599152, "rewards/margins_min": -0.1324639618396759, "rewards/margins_std": 0.15912732481956482, "rewards/rejected": -0.14029261469841003, "step": 600 }, { "epoch": 0.14, "eval_logits/chosen": -2.2503182888031006, "eval_logits/rejected": -2.14314866065979, "eval_logps/chosen": -282.5006103515625, "eval_logps/rejected": -277.26123046875, "eval_loss": 0.6730125546455383, "eval_rewards/accuracies": 0.6209999918937683, "eval_rewards/chosen": -0.10837589204311371, "eval_rewards/margins": 0.05181092396378517, "eval_rewards/margins_max": 0.3502222001552582, "eval_rewards/margins_min": -0.1977950930595398, "eval_rewards/margins_std": 0.17817051708698273, "eval_rewards/rejected": -0.16018681228160858, "eval_runtime": 1494.4422, "eval_samples_per_second": 2.677, "eval_steps_per_second": 0.167, "step": 600 }, { "epoch": 0.15, "grad_norm": 3.3817967394133173, "learning_rate": 4.967865904009499e-07, "logits/chosen": -2.4531264305114746, "logits/rejected": -2.324897050857544, "logps/chosen": -345.6144104003906, "logps/rejected": -290.9465026855469, "loss": 0.6599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11289738118648529, "rewards/margins": 0.07216618955135345, "rewards/margins_max": 0.26448854804039, "rewards/margins_min": -0.08863689005374908, "rewards/margins_std": 0.1578410416841507, "rewards/rejected": -0.18506357073783875, "step": 610 }, { "epoch": 0.15, "grad_norm": 3.2164660007429813, "learning_rate": 4.964439597240486e-07, "logits/chosen": -2.4797024726867676, "logits/rejected": -2.3193254470825195, "logps/chosen": -384.9510803222656, "logps/rejected": -318.635498046875, "loss": 0.666, "rewards/accuracies": 0.625, "rewards/chosen": -0.13338720798492432, "rewards/margins": 0.07245330512523651, "rewards/margins_max": 0.3238731324672699, "rewards/margins_min": -0.14915403723716736, "rewards/margins_std": 0.2075492888689041, "rewards/rejected": -0.20584049820899963, "step": 620 }, { "epoch": 0.15, "grad_norm": 2.9974923779770104, "learning_rate": 4.960841063231124e-07, "logits/chosen": -2.4050519466400146, "logits/rejected": -2.2076239585876465, "logps/chosen": -367.3083190917969, "logps/rejected": -321.6839904785156, "loss": 0.6512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09992216527462006, "rewards/margins": 0.1155349463224411, "rewards/margins_max": 0.3388310670852661, "rewards/margins_min": -0.07244092226028442, "rewards/margins_std": 0.1846884787082672, "rewards/rejected": -0.21545711159706116, "step": 630 }, { "epoch": 0.15, "grad_norm": 3.03481845620686, "learning_rate": 4.95707055346479e-07, "logits/chosen": -2.3924412727355957, "logits/rejected": -2.18711519241333, "logps/chosen": -322.00164794921875, "logps/rejected": -268.35552978515625, "loss": 0.6593, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13142088055610657, "rewards/margins": 0.07422800362110138, "rewards/margins_max": 0.3032975494861603, "rewards/margins_min": -0.09201741218566895, "rewards/margins_std": 0.17782273888587952, "rewards/rejected": -0.20564886927604675, "step": 640 }, { "epoch": 0.16, "grad_norm": 3.1032810014463705, "learning_rate": 4.95312833144337e-07, "logits/chosen": -2.43044114112854, "logits/rejected": -2.268944263458252, "logps/chosen": -298.31256103515625, "logps/rejected": -269.2052917480469, "loss": 0.666, "rewards/accuracies": 0.625, "rewards/chosen": -0.12601418793201447, "rewards/margins": 0.06721445173025131, "rewards/margins_max": 0.31008675694465637, "rewards/margins_min": -0.16182588040828705, "rewards/margins_std": 0.2098376750946045, "rewards/rejected": -0.19322863221168518, "step": 650 }, { "epoch": 0.16, "grad_norm": 2.9635064648692397, "learning_rate": 4.949014672668858e-07, "logits/chosen": -2.4570531845092773, "logits/rejected": -2.361572504043579, "logps/chosen": -259.6556701660156, "logps/rejected": -268.4245300292969, "loss": 0.6606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09763294458389282, "rewards/margins": 0.10392501205205917, "rewards/margins_max": 0.3641187846660614, "rewards/margins_min": -0.09045568853616714, "rewards/margins_std": 0.2053099423646927, "rewards/rejected": -0.2015579640865326, "step": 660 }, { "epoch": 0.16, "grad_norm": 3.201136720940289, "learning_rate": 4.944729864624097e-07, "logits/chosen": -2.5336649417877197, "logits/rejected": -2.292915105819702, "logps/chosen": -342.21240234375, "logps/rejected": -290.4122009277344, "loss": 0.6616, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.17614498734474182, "rewards/margins": 0.06392403692007065, "rewards/margins_max": 0.33409273624420166, "rewards/margins_min": -0.16225367784500122, "rewards/margins_std": 0.21784797310829163, "rewards/rejected": -0.24006900191307068, "step": 670 }, { "epoch": 0.16, "grad_norm": 3.0878662262832726, "learning_rate": 4.940274206752687e-07, "logits/chosen": -2.41398286819458, "logits/rejected": -2.2174134254455566, "logps/chosen": -321.3291931152344, "logps/rejected": -277.26416015625, "loss": 0.6586, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.16976942121982574, "rewards/margins": 0.051950544118881226, "rewards/margins_max": 0.36415714025497437, "rewards/margins_min": -0.20703688263893127, "rewards/margins_std": 0.25374338030815125, "rewards/rejected": -0.22171998023986816, "step": 680 }, { "epoch": 0.17, "grad_norm": 3.0003406013417973, "learning_rate": 4.935648010438058e-07, "logits/chosen": -2.351980686187744, "logits/rejected": -2.2652995586395264, "logps/chosen": -265.3300476074219, "logps/rejected": -284.9433288574219, "loss": 0.6678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13920913636684418, "rewards/margins": 0.07906569540500641, "rewards/margins_max": 0.325488805770874, "rewards/margins_min": -0.16198305785655975, "rewards/margins_std": 0.21863476932048798, "rewards/rejected": -0.21827483177185059, "step": 690 }, { "epoch": 0.17, "grad_norm": 2.9512830368706933, "learning_rate": 4.930851598981713e-07, "logits/chosen": -2.410790205001831, "logits/rejected": -2.2298710346221924, "logps/chosen": -284.4775695800781, "logps/rejected": -262.7158508300781, "loss": 0.659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19152745604515076, "rewards/margins": 0.06756166368722916, "rewards/margins_max": 0.32807445526123047, "rewards/margins_min": -0.1789148449897766, "rewards/margins_std": 0.22235098481178284, "rewards/rejected": -0.2590891420841217, "step": 700 }, { "epoch": 0.17, "eval_logits/chosen": -2.247408628463745, "eval_logits/rejected": -2.141589641571045, "eval_logps/chosen": -289.1077575683594, "eval_logps/rejected": -285.9119873046875, "eval_loss": 0.6672701239585876, "eval_rewards/accuracies": 0.6230000257492065, "eval_rewards/chosen": -0.1744469553232193, "eval_rewards/margins": 0.07224775105714798, "eval_rewards/margins_max": 0.4793892204761505, "eval_rewards/margins_min": -0.2752997875213623, "eval_rewards/margins_std": 0.24602006375789642, "eval_rewards/rejected": -0.24669469892978668, "eval_runtime": 1495.0926, "eval_samples_per_second": 2.675, "eval_steps_per_second": 0.167, "step": 700 }, { "epoch": 0.17, "grad_norm": 3.2081628399090105, "learning_rate": 4.925885307580632e-07, "logits/chosen": -2.3697640895843506, "logits/rejected": -2.197603702545166, "logps/chosen": -281.3732604980469, "logps/rejected": -263.16571044921875, "loss": 0.6735, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14310531318187714, "rewards/margins": 0.08512021601200104, "rewards/margins_max": 0.3119520843029022, "rewards/margins_min": -0.14834533631801605, "rewards/margins_std": 0.20381960272789001, "rewards/rejected": -0.22822551429271698, "step": 710 }, { "epoch": 0.17, "grad_norm": 3.070091957926823, "learning_rate": 4.920749483303846e-07, "logits/chosen": -2.1541881561279297, "logits/rejected": -2.1850485801696777, "logps/chosen": -269.72149658203125, "logps/rejected": -295.59857177734375, "loss": 0.6577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14654551446437836, "rewards/margins": 0.08586689084768295, "rewards/margins_max": 0.3441448509693146, "rewards/margins_min": -0.11081180721521378, "rewards/margins_std": 0.2033281773328781, "rewards/rejected": -0.2324124127626419, "step": 720 }, { "epoch": 0.17, "grad_norm": 3.1588839880449484, "learning_rate": 4.915444485068181e-07, "logits/chosen": -2.4117140769958496, "logits/rejected": -2.266968250274658, "logps/chosen": -333.79913330078125, "logps/rejected": -326.06646728515625, "loss": 0.6548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1978965848684311, "rewards/margins": 0.07981382310390472, "rewards/margins_max": 0.3897927403450012, "rewards/margins_min": -0.2062167227268219, "rewards/margins_std": 0.2775718569755554, "rewards/rejected": -0.2777103781700134, "step": 730 }, { "epoch": 0.18, "grad_norm": 3.0429900613021386, "learning_rate": 4.90997068361318e-07, "logits/chosen": -2.4479782581329346, "logits/rejected": -2.3585877418518066, "logps/chosen": -265.07147216796875, "logps/rejected": -283.250732421875, "loss": 0.6498, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15747538208961487, "rewards/margins": 0.09201022237539291, "rewards/margins_max": 0.38584285974502563, "rewards/margins_min": -0.18726351857185364, "rewards/margins_std": 0.2509940266609192, "rewards/rejected": -0.24948564171791077, "step": 740 }, { "epoch": 0.18, "grad_norm": 3.096815035919843, "learning_rate": 4.904328461475189e-07, "logits/chosen": -2.4886362552642822, "logits/rejected": -2.373009204864502, "logps/chosen": -298.14654541015625, "logps/rejected": -307.07977294921875, "loss": 0.6611, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22404134273529053, "rewards/margins": 0.08591306954622269, "rewards/margins_max": 0.44171851873397827, "rewards/margins_min": -0.23418653011322021, "rewards/margins_std": 0.3089810311794281, "rewards/rejected": -0.309954434633255, "step": 750 }, { "epoch": 0.18, "grad_norm": 2.6428805317083124, "learning_rate": 4.898518212960625e-07, "logits/chosen": -2.414667844772339, "logits/rejected": -2.400287389755249, "logps/chosen": -278.496337890625, "logps/rejected": -306.20050048828125, "loss": 0.6493, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15601886808872223, "rewards/margins": 0.08781120926141739, "rewards/margins_max": 0.346504271030426, "rewards/margins_min": -0.16563686728477478, "rewards/margins_std": 0.22753441333770752, "rewards/rejected": -0.24383005499839783, "step": 760 }, { "epoch": 0.18, "grad_norm": 3.3735642381530715, "learning_rate": 4.89254034411842e-07, "logits/chosen": -2.4528491497039795, "logits/rejected": -2.272374153137207, "logps/chosen": -275.2156677246094, "logps/rejected": -278.3580322265625, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": -0.21257421374320984, "rewards/margins": 0.06481559574604034, "rewards/margins_max": 0.3688669204711914, "rewards/margins_min": -0.23419871926307678, "rewards/margins_std": 0.2707606256008148, "rewards/rejected": -0.277389794588089, "step": 770 }, { "epoch": 0.19, "grad_norm": 3.8777876640371116, "learning_rate": 4.886395272711646e-07, "logits/chosen": -2.4776694774627686, "logits/rejected": -2.289794921875, "logps/chosen": -321.1734313964844, "logps/rejected": -271.56268310546875, "loss": 0.6568, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.2189309298992157, "rewards/margins": 0.0647321417927742, "rewards/margins_max": 0.42538976669311523, "rewards/margins_min": -0.2633572220802307, "rewards/margins_std": 0.30429449677467346, "rewards/rejected": -0.2836630940437317, "step": 780 }, { "epoch": 0.19, "grad_norm": 3.902395192705535, "learning_rate": 4.880083428188314e-07, "logits/chosen": -2.4492197036743164, "logits/rejected": -2.3283753395080566, "logps/chosen": -309.585205078125, "logps/rejected": -287.45660400390625, "loss": 0.6393, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18748125433921814, "rewards/margins": 0.13159336149692535, "rewards/margins_max": 0.4702691435813904, "rewards/margins_min": -0.14326314628124237, "rewards/margins_std": 0.27160799503326416, "rewards/rejected": -0.3190745711326599, "step": 790 }, { "epoch": 0.19, "grad_norm": 4.035930902621156, "learning_rate": 4.873605251651373e-07, "logits/chosen": -2.3822567462921143, "logits/rejected": -2.178744077682495, "logps/chosen": -320.4302062988281, "logps/rejected": -280.50030517578125, "loss": 0.6583, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21300539374351501, "rewards/margins": 0.0919991284608841, "rewards/margins_max": 0.3677797317504883, "rewards/margins_min": -0.22795164585113525, "rewards/margins_std": 0.26251500844955444, "rewards/rejected": -0.3050045371055603, "step": 800 }, { "epoch": 0.19, "eval_logits/chosen": -2.2443323135375977, "eval_logits/rejected": -2.13911771774292, "eval_logps/chosen": -294.9917297363281, "eval_logps/rejected": -293.98046875, "eval_loss": 0.6608997583389282, "eval_rewards/accuracies": 0.6190000176429749, "eval_rewards/chosen": -0.2332865297794342, "eval_rewards/margins": 0.09409263730049133, "eval_rewards/margins_max": 0.5862663388252258, "eval_rewards/margins_min": -0.3297277092933655, "eval_rewards/margins_std": 0.29900792241096497, "eval_rewards/rejected": -0.32737913727760315, "eval_runtime": 1496.3526, "eval_samples_per_second": 2.673, "eval_steps_per_second": 0.167, "step": 800 }, { "epoch": 0.19, "grad_norm": 3.1920450754831164, "learning_rate": 4.866961195827869e-07, "logits/chosen": -2.3050453662872314, "logits/rejected": -2.2720446586608887, "logps/chosen": -254.5433807373047, "logps/rejected": -267.4716491699219, "loss": 0.6687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.21264970302581787, "rewards/margins": 0.10019989311695099, "rewards/margins_max": 0.3428945243358612, "rewards/margins_min": -0.14311207830905914, "rewards/margins_std": 0.22143355011940002, "rewards/rejected": -0.31284958124160767, "step": 810 }, { "epoch": 0.2, "grad_norm": 3.5681856527567426, "learning_rate": 4.860151725037318e-07, "logits/chosen": -2.3456642627716064, "logits/rejected": -2.2527120113372803, "logps/chosen": -299.58184814453125, "logps/rejected": -298.1337890625, "loss": 0.6411, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21730554103851318, "rewards/margins": 0.12024341523647308, "rewards/margins_max": 0.4458374083042145, "rewards/margins_min": -0.22148564457893372, "rewards/margins_std": 0.3022783398628235, "rewards/rejected": -0.3375489413738251, "step": 820 }, { "epoch": 0.2, "grad_norm": 3.091077170162436, "learning_rate": 4.853177315159253e-07, "logits/chosen": -2.4664669036865234, "logits/rejected": -2.308396577835083, "logps/chosen": -358.7574462890625, "logps/rejected": -319.35284423828125, "loss": 0.6425, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2542029917240143, "rewards/margins": 0.11788780987262726, "rewards/margins_max": 0.4211495816707611, "rewards/margins_min": -0.1925216019153595, "rewards/margins_std": 0.2704932987689972, "rewards/rejected": -0.37209075689315796, "step": 830 }, { "epoch": 0.2, "grad_norm": 2.9336129300929255, "learning_rate": 4.846038453599967e-07, "logits/chosen": -2.466611385345459, "logits/rejected": -2.2284209728240967, "logps/chosen": -313.41143798828125, "logps/rejected": -292.84100341796875, "loss": 0.6551, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2437991350889206, "rewards/margins": 0.10974772274494171, "rewards/margins_max": 0.3211839199066162, "rewards/margins_min": -0.1025761142373085, "rewards/margins_std": 0.18767371773719788, "rewards/rejected": -0.3535468280315399, "step": 840 }, { "epoch": 0.2, "grad_norm": 3.5650562685341924, "learning_rate": 4.838735639258449e-07, "logits/chosen": -2.4552314281463623, "logits/rejected": -2.433683156967163, "logps/chosen": -288.6471862792969, "logps/rejected": -312.9539489746094, "loss": 0.651, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.29027825593948364, "rewards/margins": 0.04669810086488724, "rewards/margins_max": 0.3593463599681854, "rewards/margins_min": -0.29872941970825195, "rewards/margins_std": 0.29716697335243225, "rewards/rejected": -0.3369763493537903, "step": 850 }, { "epoch": 0.21, "grad_norm": 3.346160020156413, "learning_rate": 4.831269382491519e-07, "logits/chosen": -2.346867084503174, "logits/rejected": -2.357644557952881, "logps/chosen": -274.8031311035156, "logps/rejected": -309.43524169921875, "loss": 0.6463, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22909191250801086, "rewards/margins": 0.11739081144332886, "rewards/margins_max": 0.47297605872154236, "rewards/margins_min": -0.22226938605308533, "rewards/margins_std": 0.29832276701927185, "rewards/rejected": -0.3464827239513397, "step": 860 }, { "epoch": 0.21, "grad_norm": 2.9174492104104757, "learning_rate": 4.823640205078166e-07, "logits/chosen": -2.4121623039245605, "logits/rejected": -2.3513846397399902, "logps/chosen": -251.77487182617188, "logps/rejected": -275.61962890625, "loss": 0.6588, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.23442283272743225, "rewards/margins": 0.1049787774682045, "rewards/margins_max": 0.41606220602989197, "rewards/margins_min": -0.17227521538734436, "rewards/margins_std": 0.268618643283844, "rewards/rejected": -0.33940163254737854, "step": 870 }, { "epoch": 0.21, "grad_norm": 4.111087108805471, "learning_rate": 4.815848640183081e-07, "logits/chosen": -2.3766472339630127, "logits/rejected": -2.230008125305176, "logps/chosen": -333.911376953125, "logps/rejected": -323.9111633300781, "loss": 0.6443, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21184630692005157, "rewards/margins": 0.14703086018562317, "rewards/margins_max": 0.532608687877655, "rewards/margins_min": -0.1875142604112625, "rewards/margins_std": 0.32982760667800903, "rewards/rejected": -0.35887715220451355, "step": 880 }, { "epoch": 0.21, "grad_norm": 3.158348367968953, "learning_rate": 4.807895232319393e-07, "logits/chosen": -2.3449349403381348, "logits/rejected": -2.146338939666748, "logps/chosen": -305.1022033691406, "logps/rejected": -244.75057983398438, "loss": 0.6514, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2648373246192932, "rewards/margins": 0.08561765402555466, "rewards/margins_max": 0.4140554964542389, "rewards/margins_min": -0.2376786768436432, "rewards/margins_std": 0.29512304067611694, "rewards/rejected": -0.3504549562931061, "step": 890 }, { "epoch": 0.22, "grad_norm": 3.8597828267330607, "learning_rate": 4.799780537310621e-07, "logits/chosen": -2.36902117729187, "logits/rejected": -2.2361388206481934, "logps/chosen": -320.94952392578125, "logps/rejected": -316.57574462890625, "loss": 0.6461, "rewards/accuracies": 0.75, "rewards/chosen": -0.23059234023094177, "rewards/margins": 0.16576087474822998, "rewards/margins_max": 0.48638468980789185, "rewards/margins_min": -0.1291709989309311, "rewards/margins_std": 0.27711325883865356, "rewards/rejected": -0.39635321497917175, "step": 900 }, { "epoch": 0.22, "eval_logits/chosen": -2.231218099594116, "eval_logits/rejected": -2.1258907318115234, "eval_logps/chosen": -299.71417236328125, "eval_logps/rejected": -300.8446350097656, "eval_loss": 0.6548712253570557, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.2805112600326538, "eval_rewards/margins": 0.11550935357809067, "eval_rewards/margins_max": 0.6829928755760193, "eval_rewards/margins_min": -0.37661242485046387, "eval_rewards/margins_std": 0.34651389718055725, "eval_rewards/rejected": -0.3960206210613251, "eval_runtime": 1547.4013, "eval_samples_per_second": 2.585, "eval_steps_per_second": 0.162, "step": 900 }, { "epoch": 0.22, "grad_norm": 3.0797528450128504, "learning_rate": 4.791505122251827e-07, "logits/chosen": -2.468552350997925, "logits/rejected": -2.2628140449523926, "logps/chosen": -257.92535400390625, "logps/rejected": -258.787841796875, "loss": 0.6231, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.22096392512321472, "rewards/margins": 0.17900450527668, "rewards/margins_max": 0.567663848400116, "rewards/margins_min": -0.13577169179916382, "rewards/margins_std": 0.31748077273368835, "rewards/rejected": -0.3999684453010559, "step": 910 }, { "epoch": 0.22, "grad_norm": 4.0228901611171395, "learning_rate": 4.783069565469985e-07, "logits/chosen": -2.266120433807373, "logits/rejected": -2.237914562225342, "logps/chosen": -299.3618469238281, "logps/rejected": -310.4810485839844, "loss": 0.632, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2717237174510956, "rewards/margins": 0.14964650571346283, "rewards/margins_max": 0.5369307398796082, "rewards/margins_min": -0.18737640976905823, "rewards/margins_std": 0.32104068994522095, "rewards/rejected": -0.421370267868042, "step": 920 }, { "epoch": 0.22, "grad_norm": 3.6548892366296477, "learning_rate": 4.77447445648357e-07, "logits/chosen": -2.2938146591186523, "logits/rejected": -2.1845481395721436, "logps/chosen": -283.7642822265625, "logps/rejected": -264.29534912109375, "loss": 0.6538, "rewards/accuracies": 0.625, "rewards/chosen": -0.3067338764667511, "rewards/margins": 0.1226409450173378, "rewards/margins_max": 0.5580971837043762, "rewards/margins_min": -0.2774401903152466, "rewards/margins_std": 0.3810023367404938, "rewards/rejected": -0.4293747842311859, "step": 930 }, { "epoch": 0.23, "grad_norm": 3.434051170736584, "learning_rate": 4.765720395961349e-07, "logits/chosen": -2.347121238708496, "logits/rejected": -2.277678966522217, "logps/chosen": -294.29876708984375, "logps/rejected": -319.03753662109375, "loss": 0.6335, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.27976617217063904, "rewards/margins": 0.20592932403087616, "rewards/margins_max": 0.6754550337791443, "rewards/margins_min": -0.17637373507022858, "rewards/margins_std": 0.37887194752693176, "rewards/rejected": -0.4856955111026764, "step": 940 }, { "epoch": 0.23, "grad_norm": 2.9089785657127774, "learning_rate": 4.7568079956804144e-07, "logits/chosen": -2.4770052433013916, "logits/rejected": -2.297044515609741, "logps/chosen": -345.19305419921875, "logps/rejected": -351.08392333984375, "loss": 0.6533, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3537083566188812, "rewards/margins": 0.12422943115234375, "rewards/margins_max": 0.5163439512252808, "rewards/margins_min": -0.34537768363952637, "rewards/margins_std": 0.387656033039093, "rewards/rejected": -0.47793787717819214, "step": 950 }, { "epoch": 0.23, "grad_norm": 3.557234771556511, "learning_rate": 4.74773787848342e-07, "logits/chosen": -2.4619688987731934, "logits/rejected": -2.3016021251678467, "logps/chosen": -323.8647155761719, "logps/rejected": -290.92730712890625, "loss": 0.6299, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.29603904485702515, "rewards/margins": 0.14972428977489471, "rewards/margins_max": 0.5326004028320312, "rewards/margins_min": -0.21037602424621582, "rewards/margins_std": 0.3298071324825287, "rewards/rejected": -0.4457632899284363, "step": 960 }, { "epoch": 0.23, "grad_norm": 3.6538106491896696, "learning_rate": 4.7385106782350637e-07, "logits/chosen": -2.3749279975891113, "logits/rejected": -2.214154005050659, "logps/chosen": -339.3372497558594, "logps/rejected": -342.37493896484375, "loss": 0.6272, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3063492476940155, "rewards/margins": 0.20046338438987732, "rewards/margins_max": 0.7209583520889282, "rewards/margins_min": -0.36391767859458923, "rewards/margins_std": 0.4686485230922699, "rewards/rejected": -0.5068126916885376, "step": 970 }, { "epoch": 0.23, "grad_norm": 3.2248638567909227, "learning_rate": 4.729127039777781e-07, "logits/chosen": -2.268017292022705, "logits/rejected": -2.1956000328063965, "logps/chosen": -255.5393524169922, "logps/rejected": -265.38848876953125, "loss": 0.6419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3071519732475281, "rewards/margins": 0.14565886557102203, "rewards/margins_max": 0.5417459607124329, "rewards/margins_min": -0.25262564420700073, "rewards/margins_std": 0.35112863779067993, "rewards/rejected": -0.4528108239173889, "step": 980 }, { "epoch": 0.24, "grad_norm": 3.6682656587432754, "learning_rate": 4.719587618886685e-07, "logits/chosen": -2.490283489227295, "logits/rejected": -2.29603910446167, "logps/chosen": -331.20855712890625, "logps/rejected": -334.89520263671875, "loss": 0.6709, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3136901557445526, "rewards/margins": 0.13260194659233093, "rewards/margins_max": 0.6070823073387146, "rewards/margins_min": -0.315726637840271, "rewards/margins_std": 0.4030987322330475, "rewards/rejected": -0.44629210233688354, "step": 990 }, { "epoch": 0.24, "grad_norm": 3.0832357820767484, "learning_rate": 4.709893082223737e-07, "logits/chosen": -2.4309589862823486, "logits/rejected": -2.3141777515411377, "logps/chosen": -312.267578125, "logps/rejected": -320.28179931640625, "loss": 0.6294, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2972213923931122, "rewards/margins": 0.11866515874862671, "rewards/margins_max": 0.49390506744384766, "rewards/margins_min": -0.27916720509529114, "rewards/margins_std": 0.34936681389808655, "rewards/rejected": -0.4158865511417389, "step": 1000 }, { "epoch": 0.24, "eval_logits/chosen": -2.22768235206604, "eval_logits/rejected": -2.123034715652466, "eval_logps/chosen": -304.5846252441406, "eval_logps/rejected": -308.093017578125, "eval_loss": 0.6490492224693298, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.3292158842086792, "eval_rewards/margins": 0.13928921520709991, "eval_rewards/margins_max": 0.7941285967826843, "eval_rewards/margins_min": -0.4288816750049591, "eval_rewards/margins_std": 0.4002571702003479, "eval_rewards/rejected": -0.4685050845146179, "eval_runtime": 1505.5808, "eval_samples_per_second": 2.657, "eval_steps_per_second": 0.166, "step": 1000 }, { "epoch": 0.24, "grad_norm": 3.4432344635773777, "learning_rate": 4.7000441072911554e-07, "logits/chosen": -2.300849199295044, "logits/rejected": -2.253727674484253, "logps/chosen": -267.82220458984375, "logps/rejected": -318.17791748046875, "loss": 0.6289, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27558863162994385, "rewards/margins": 0.17123334109783173, "rewards/margins_max": 0.595753014087677, "rewards/margins_min": -0.16741998493671417, "rewards/margins_std": 0.3577662706375122, "rewards/rejected": -0.44682198762893677, "step": 1010 }, { "epoch": 0.24, "grad_norm": 4.302577448341239, "learning_rate": 4.690041382384071e-07, "logits/chosen": -2.338097333908081, "logits/rejected": -2.2996907234191895, "logps/chosen": -250.04800415039062, "logps/rejected": -281.29998779296875, "loss": 0.6228, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2898126542568207, "rewards/margins": 0.19900104403495789, "rewards/margins_max": 0.6168231964111328, "rewards/margins_min": -0.11563953012228012, "rewards/margins_std": 0.33153098821640015, "rewards/rejected": -0.48881372809410095, "step": 1020 }, { "epoch": 0.25, "grad_norm": 2.938454168658438, "learning_rate": 4.679885606542423e-07, "logits/chosen": -2.454998254776001, "logits/rejected": -2.374480724334717, "logps/chosen": -283.3580017089844, "logps/rejected": -307.2841491699219, "loss": 0.6381, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.360912561416626, "rewards/margins": 0.18049256503582, "rewards/margins_max": 0.7126591801643372, "rewards/margins_min": -0.265884131193161, "rewards/margins_std": 0.432709276676178, "rewards/rejected": -0.5414050817489624, "step": 1030 }, { "epoch": 0.25, "grad_norm": 3.7113478905476986, "learning_rate": 4.669577489502108e-07, "logits/chosen": -2.4574291706085205, "logits/rejected": -2.256026268005371, "logps/chosen": -316.6407165527344, "logps/rejected": -323.5616149902344, "loss": 0.6204, "rewards/accuracies": 0.625, "rewards/chosen": -0.43015462160110474, "rewards/margins": 0.12525419890880585, "rewards/margins_max": 0.5398000478744507, "rewards/margins_min": -0.3461098074913025, "rewards/margins_std": 0.39081233739852905, "rewards/rejected": -0.5554088354110718, "step": 1040 }, { "epoch": 0.25, "grad_norm": 4.526254781337553, "learning_rate": 4.6591177516453795e-07, "logits/chosen": -2.189232349395752, "logits/rejected": -2.190669298171997, "logps/chosen": -265.4393005371094, "logps/rejected": -281.74981689453125, "loss": 0.6398, "rewards/accuracies": 0.625, "rewards/chosen": -0.37716370820999146, "rewards/margins": 0.09194508194923401, "rewards/margins_max": 0.4913526177406311, "rewards/margins_min": -0.3186010718345642, "rewards/margins_std": 0.369209349155426, "rewards/rejected": -0.4691087603569031, "step": 1050 }, { "epoch": 0.25, "grad_norm": 3.4821141867103598, "learning_rate": 4.6485071239505037e-07, "logits/chosen": -2.3295235633850098, "logits/rejected": -2.2163546085357666, "logps/chosen": -302.76910400390625, "logps/rejected": -318.57830810546875, "loss": 0.6258, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30321723222732544, "rewards/margins": 0.25288861989974976, "rewards/margins_max": 0.7242304086685181, "rewards/margins_min": -0.220504492521286, "rewards/margins_std": 0.42571085691452026, "rewards/rejected": -0.5561057925224304, "step": 1060 }, { "epoch": 0.26, "grad_norm": 3.165868926370762, "learning_rate": 4.6377463479406777e-07, "logits/chosen": -2.399397373199463, "logits/rejected": -2.2123234272003174, "logps/chosen": -326.80975341796875, "logps/rejected": -314.40325927734375, "loss": 0.6469, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4343215823173523, "rewards/margins": 0.1511266976594925, "rewards/margins_max": 0.6228338479995728, "rewards/margins_min": -0.3154509961605072, "rewards/margins_std": 0.406162828207016, "rewards/rejected": -0.5854482650756836, "step": 1070 }, { "epoch": 0.26, "grad_norm": 3.44507291830152, "learning_rate": 4.6268361756322037e-07, "logits/chosen": -2.408081293106079, "logits/rejected": -2.2022228240966797, "logps/chosen": -336.84600830078125, "logps/rejected": -313.1626281738281, "loss": 0.6443, "rewards/accuracies": 0.625, "rewards/chosen": -0.3460560441017151, "rewards/margins": 0.2014591246843338, "rewards/margins_max": 0.6191409230232239, "rewards/margins_min": -0.20395508408546448, "rewards/margins_std": 0.3673318028450012, "rewards/rejected": -0.5475151538848877, "step": 1080 }, { "epoch": 0.26, "grad_norm": 4.040369408969618, "learning_rate": 4.6157773694819396e-07, "logits/chosen": -2.3254928588867188, "logits/rejected": -2.3290889263153076, "logps/chosen": -310.9165954589844, "logps/rejected": -390.9909362792969, "loss": 0.6378, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4329812526702881, "rewards/margins": 0.1924930065870285, "rewards/margins_max": 0.7771722078323364, "rewards/margins_min": -0.26164352893829346, "rewards/margins_std": 0.4558800160884857, "rewards/rejected": -0.6254743337631226, "step": 1090 }, { "epoch": 0.26, "grad_norm": 3.492755340220409, "learning_rate": 4.60457070233401e-07, "logits/chosen": -2.2125890254974365, "logits/rejected": -2.1011548042297363, "logps/chosen": -273.10614013671875, "logps/rejected": -288.22625732421875, "loss": 0.6195, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3316890299320221, "rewards/margins": 0.25433018803596497, "rewards/margins_max": 0.6655761003494263, "rewards/margins_min": -0.11815662682056427, "rewards/margins_std": 0.3515794277191162, "rewards/rejected": -0.5860191583633423, "step": 1100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2261343002319336, "eval_logits/rejected": -2.1223251819610596, "eval_logps/chosen": -312.2675476074219, "eval_logps/rejected": -318.60760498046875, "eval_loss": 0.6438679099082947, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -0.40604496002197266, "eval_rewards/margins": 0.1676054447889328, "eval_rewards/margins_max": 0.9327769875526428, "eval_rewards/margins_min": -0.49414122104644775, "eval_rewards/margins_std": 0.4674181640148163, "eval_rewards/rejected": -0.5736504793167114, "eval_runtime": 1578.6786, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.158, "step": 1100 }, { "epoch": 0.27, "grad_norm": 4.146878569198257, "learning_rate": 4.5932169573657987e-07, "logits/chosen": -2.5002052783966064, "logits/rejected": -2.3910346031188965, "logps/chosen": -339.526123046875, "logps/rejected": -384.61077880859375, "loss": 0.598, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.351265013217926, "rewards/margins": 0.32924920320510864, "rewards/margins_max": 0.8524104356765747, "rewards/margins_min": -0.20300130546092987, "rewards/margins_std": 0.4669255316257477, "rewards/rejected": -0.6805142164230347, "step": 1110 }, { "epoch": 0.27, "grad_norm": 4.39512973355234, "learning_rate": 4.581716928033216e-07, "logits/chosen": -2.316105365753174, "logits/rejected": -2.289231538772583, "logps/chosen": -309.3738708496094, "logps/rejected": -333.35101318359375, "loss": 0.6481, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4345484673976898, "rewards/margins": 0.08426953852176666, "rewards/margins_max": 0.7386177182197571, "rewards/margins_min": -0.599795401096344, "rewards/margins_std": 0.6043834686279297, "rewards/rejected": -0.5188180208206177, "step": 1120 }, { "epoch": 0.27, "grad_norm": 3.971264860889653, "learning_rate": 4.5700714180152467e-07, "logits/chosen": -2.288471221923828, "logits/rejected": -2.207989454269409, "logps/chosen": -248.96957397460938, "logps/rejected": -276.5824279785156, "loss": 0.6445, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.38618502020835876, "rewards/margins": 0.16013480722904205, "rewards/margins_max": 0.6282051801681519, "rewards/margins_min": -0.3568410575389862, "rewards/margins_std": 0.4367251992225647, "rewards/rejected": -0.546319842338562, "step": 1130 }, { "epoch": 0.27, "grad_norm": 4.099908985895535, "learning_rate": 4.5582812411577887e-07, "logits/chosen": -2.3780834674835205, "logits/rejected": -2.240727186203003, "logps/chosen": -309.24920654296875, "logps/rejected": -314.5780334472656, "loss": 0.6644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4676491618156433, "rewards/margins": 0.15005311369895935, "rewards/margins_max": 0.6073339581489563, "rewards/margins_min": -0.3337249457836151, "rewards/margins_std": 0.42210274934768677, "rewards/rejected": -0.617702305316925, "step": 1140 }, { "epoch": 0.28, "grad_norm": 4.285690566406007, "learning_rate": 4.546347221416772e-07, "logits/chosen": -2.3988311290740967, "logits/rejected": -2.273453950881958, "logps/chosen": -302.38189697265625, "logps/rejected": -313.4936828613281, "loss": 0.6117, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4687812924385071, "rewards/margins": 0.18205413222312927, "rewards/margins_max": 0.7198780179023743, "rewards/margins_min": -0.30025526881217957, "rewards/margins_std": 0.4603729844093323, "rewards/rejected": -0.650835394859314, "step": 1150 }, { "epoch": 0.28, "grad_norm": 4.707007139785244, "learning_rate": 4.534270192800581e-07, "logits/chosen": -2.2688496112823486, "logits/rejected": -2.179121255874634, "logps/chosen": -283.2395324707031, "logps/rejected": -308.91552734375, "loss": 0.6238, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.35302022099494934, "rewards/margins": 0.22418050467967987, "rewards/margins_max": 0.7438470721244812, "rewards/margins_min": -0.23345494270324707, "rewards/margins_std": 0.4375784397125244, "rewards/rejected": -0.577200710773468, "step": 1160 }, { "epoch": 0.28, "grad_norm": 4.170970118537932, "learning_rate": 4.5220509993117684e-07, "logits/chosen": -2.3964593410491943, "logits/rejected": -2.206247329711914, "logps/chosen": -329.1454772949219, "logps/rejected": -318.5571594238281, "loss": 0.6518, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46167173981666565, "rewards/margins": 0.11766606569290161, "rewards/margins_max": 0.6312614679336548, "rewards/margins_min": -0.37555062770843506, "rewards/margins_std": 0.4447709619998932, "rewards/rejected": -0.5793377757072449, "step": 1170 }, { "epoch": 0.28, "grad_norm": 5.402556287621441, "learning_rate": 4.509690494888071e-07, "logits/chosen": -2.4327127933502197, "logits/rejected": -2.2058887481689453, "logps/chosen": -370.6610412597656, "logps/rejected": -341.78546142578125, "loss": 0.6403, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4291810095310211, "rewards/margins": 0.1413087546825409, "rewards/margins_max": 0.5999075174331665, "rewards/margins_min": -0.3418174684047699, "rewards/margins_std": 0.41507115960121155, "rewards/rejected": -0.5704898238182068, "step": 1180 }, { "epoch": 0.28, "grad_norm": 4.256005475922697, "learning_rate": 4.4971895433427356e-07, "logits/chosen": -2.3157317638397217, "logits/rejected": -2.241391658782959, "logps/chosen": -251.4292755126953, "logps/rejected": -274.1908874511719, "loss": 0.6144, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3352741003036499, "rewards/margins": 0.21672995388507843, "rewards/margins_max": 0.6320234537124634, "rewards/margins_min": -0.22713084518909454, "rewards/margins_std": 0.3785281181335449, "rewards/rejected": -0.5520040392875671, "step": 1190 }, { "epoch": 0.29, "grad_norm": 4.0443210864788055, "learning_rate": 4.4845490183041454e-07, "logits/chosen": -2.3574349880218506, "logits/rejected": -2.272819757461548, "logps/chosen": -329.6828918457031, "logps/rejected": -367.85040283203125, "loss": 0.5908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4124225080013275, "rewards/margins": 0.3094863295555115, "rewards/margins_max": 0.9087298512458801, "rewards/margins_min": -0.2066916525363922, "rewards/margins_std": 0.4878038465976715, "rewards/rejected": -0.7219088673591614, "step": 1200 }, { "epoch": 0.29, "eval_logits/chosen": -2.219698667526245, "eval_logits/rejected": -2.116809606552124, "eval_logps/chosen": -315.89862060546875, "eval_logps/rejected": -324.5404357910156, "eval_loss": 0.6382519006729126, "eval_rewards/accuracies": 0.6365000009536743, "eval_rewards/chosen": -0.44235560297966003, "eval_rewards/margins": 0.19062314927577972, "eval_rewards/margins_max": 1.0209616422653198, "eval_rewards/margins_min": -0.5223661661148071, "eval_rewards/margins_std": 0.5060446262359619, "eval_rewards/rejected": -0.6329786777496338, "eval_runtime": 1555.3363, "eval_samples_per_second": 2.572, "eval_steps_per_second": 0.161, "step": 1200 }, { "epoch": 0.29, "grad_norm": 4.902239089395744, "learning_rate": 4.4717698031547733e-07, "logits/chosen": -2.3533434867858887, "logits/rejected": -2.172482967376709, "logps/chosen": -341.41058349609375, "logps/rejected": -326.9712829589844, "loss": 0.6234, "rewards/accuracies": 0.625, "rewards/chosen": -0.45537644624710083, "rewards/margins": 0.2056373655796051, "rewards/margins_max": 0.8173977732658386, "rewards/margins_min": -0.30659928917884827, "rewards/margins_std": 0.512613832950592, "rewards/rejected": -0.6610138416290283, "step": 1210 }, { "epoch": 0.29, "grad_norm": 4.293567174936635, "learning_rate": 4.458852790969445e-07, "logits/chosen": -2.402947425842285, "logits/rejected": -2.3117432594299316, "logps/chosen": -296.55157470703125, "logps/rejected": -332.2838134765625, "loss": 0.6083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.44581732153892517, "rewards/margins": 0.20890851318836212, "rewards/margins_max": 0.7212175130844116, "rewards/margins_min": -0.26086920499801636, "rewards/margins_std": 0.43562060594558716, "rewards/rejected": -0.6547258496284485, "step": 1220 }, { "epoch": 0.29, "grad_norm": 3.88417222835653, "learning_rate": 4.4457988844529204e-07, "logits/chosen": -2.3324713706970215, "logits/rejected": -2.2652242183685303, "logps/chosen": -288.92828369140625, "logps/rejected": -347.032958984375, "loss": 0.6511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4611320495605469, "rewards/margins": 0.24559442698955536, "rewards/margins_max": 0.9546945691108704, "rewards/margins_min": -0.36741599440574646, "rewards/margins_std": 0.5900758504867554, "rewards/rejected": -0.7067264914512634, "step": 1230 }, { "epoch": 0.3, "grad_norm": 4.553751042774551, "learning_rate": 4.432608995876819e-07, "logits/chosen": -2.4468352794647217, "logits/rejected": -2.2406821250915527, "logps/chosen": -307.21649169921875, "logps/rejected": -307.28424072265625, "loss": 0.6614, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5195098519325256, "rewards/margins": 0.1029490977525711, "rewards/margins_max": 0.7289954423904419, "rewards/margins_min": -0.5001469850540161, "rewards/margins_std": 0.5541279911994934, "rewards/rejected": -0.6224589943885803, "step": 1240 }, { "epoch": 0.3, "grad_norm": 4.496612545410703, "learning_rate": 4.419284047015854e-07, "logits/chosen": -2.4668831825256348, "logits/rejected": -2.3057703971862793, "logps/chosen": -335.5724182128906, "logps/rejected": -308.8241882324219, "loss": 0.63, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47055473923683167, "rewards/margins": 0.21934108436107635, "rewards/margins_max": 0.856197714805603, "rewards/margins_min": -0.3771001696586609, "rewards/margins_std": 0.5560771226882935, "rewards/rejected": -0.6898958086967468, "step": 1250 }, { "epoch": 0.3, "grad_norm": 3.9229539093880654, "learning_rate": 4.4058249690834235e-07, "logits/chosen": -2.3976314067840576, "logits/rejected": -2.3277549743652344, "logps/chosen": -288.1208190917969, "logps/rejected": -307.2737731933594, "loss": 0.5996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.49328312277793884, "rewards/margins": 0.2340044528245926, "rewards/margins_max": 0.7340611219406128, "rewards/margins_min": -0.26106178760528564, "rewards/margins_std": 0.44272905588150024, "rewards/rejected": -0.7272875905036926, "step": 1260 }, { "epoch": 0.3, "grad_norm": 4.263583299011223, "learning_rate": 4.39223270266653e-07, "logits/chosen": -2.4664294719696045, "logits/rejected": -2.286088466644287, "logps/chosen": -298.78448486328125, "logps/rejected": -309.5777282714844, "loss": 0.5951, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.44215768575668335, "rewards/margins": 0.2309143990278244, "rewards/margins_max": 0.7198029160499573, "rewards/margins_min": -0.338601291179657, "rewards/margins_std": 0.46803468465805054, "rewards/rejected": -0.6730721592903137, "step": 1270 }, { "epoch": 0.31, "grad_norm": 4.9706467181229765, "learning_rate": 4.378508197660045e-07, "logits/chosen": -2.3879165649414062, "logits/rejected": -2.2405476570129395, "logps/chosen": -345.5390625, "logps/rejected": -353.7935485839844, "loss": 0.5602, "rewards/accuracies": 0.75, "rewards/chosen": -0.4764694273471832, "rewards/margins": 0.3744627833366394, "rewards/margins_max": 1.0052845478057861, "rewards/margins_min": -0.20339909195899963, "rewards/margins_std": 0.5431888103485107, "rewards/rejected": -0.8509323000907898, "step": 1280 }, { "epoch": 0.31, "grad_norm": 5.392609780953355, "learning_rate": 4.364652413200325e-07, "logits/chosen": -2.448927164077759, "logits/rejected": -2.2756385803222656, "logps/chosen": -349.70941162109375, "logps/rejected": -344.00341796875, "loss": 0.5732, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5425821542739868, "rewards/margins": 0.35032421350479126, "rewards/margins_max": 0.9795330166816711, "rewards/margins_min": -0.19635611772537231, "rewards/margins_std": 0.5289679765701294, "rewards/rejected": -0.8929063081741333, "step": 1290 }, { "epoch": 0.31, "grad_norm": 4.845160691122921, "learning_rate": 4.35066631759819e-07, "logits/chosen": -2.350041627883911, "logits/rejected": -2.282637119293213, "logps/chosen": -325.2442626953125, "logps/rejected": -354.0623474121094, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.55057692527771, "rewards/margins": 0.3553193211555481, "rewards/margins_max": 0.9742587208747864, "rewards/margins_min": -0.1493239849805832, "rewards/margins_std": 0.49752187728881836, "rewards/rejected": -0.9058961868286133, "step": 1300 }, { "epoch": 0.31, "eval_logits/chosen": -2.196049213409424, "eval_logits/rejected": -2.0949549674987793, "eval_logps/chosen": -335.8221740722656, "eval_logps/rejected": -349.71954345703125, "eval_loss": 0.6348028779029846, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": -0.6415910720825195, "eval_rewards/margins": 0.24317891895771027, "eval_rewards/margins_max": 1.3116979598999023, "eval_rewards/margins_min": -0.6515898704528809, "eval_rewards/margins_std": 0.6444133520126343, "eval_rewards/rejected": -0.8847700357437134, "eval_runtime": 1537.2399, "eval_samples_per_second": 2.602, "eval_steps_per_second": 0.163, "step": 1300 }, { "epoch": 0.31, "grad_norm": 4.295348183992599, "learning_rate": 4.3365508882712445e-07, "logits/chosen": -2.3973958492279053, "logits/rejected": -2.2862548828125, "logps/chosen": -351.510498046875, "logps/rejected": -353.1122131347656, "loss": 0.6006, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5742763876914978, "rewards/margins": 0.30094432830810547, "rewards/margins_max": 0.8947485089302063, "rewards/margins_min": -0.2811780869960785, "rewards/margins_std": 0.5243841409683228, "rewards/rejected": -0.875220775604248, "step": 1310 }, { "epoch": 0.32, "grad_norm": 4.318138527539103, "learning_rate": 4.322307111675573e-07, "logits/chosen": -2.2596309185028076, "logits/rejected": -2.1446681022644043, "logps/chosen": -317.4941101074219, "logps/rejected": -320.9417419433594, "loss": 0.622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6093077063560486, "rewards/margins": 0.2886090576648712, "rewards/margins_max": 0.7762603759765625, "rewards/margins_min": -0.23630282282829285, "rewards/margins_std": 0.4524054527282715, "rewards/rejected": -0.8979167938232422, "step": 1320 }, { "epoch": 0.32, "grad_norm": 4.599283635294277, "learning_rate": 4.3079359832368055e-07, "logits/chosen": -2.413256883621216, "logits/rejected": -2.157926082611084, "logps/chosen": -341.5835876464844, "logps/rejected": -305.91644287109375, "loss": 0.5873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6509702205657959, "rewards/margins": 0.18986381590366364, "rewards/margins_max": 0.8637007474899292, "rewards/margins_min": -0.42064419388771057, "rewards/margins_std": 0.5823715925216675, "rewards/rejected": -0.8408341407775879, "step": 1330 }, { "epoch": 0.32, "grad_norm": 4.370916804464237, "learning_rate": 4.2934385072805467e-07, "logits/chosen": -2.3677573204040527, "logits/rejected": -2.227693796157837, "logps/chosen": -311.3836364746094, "logps/rejected": -295.03778076171875, "loss": 0.6044, "rewards/accuracies": 0.625, "rewards/chosen": -0.5849432945251465, "rewards/margins": 0.22360841929912567, "rewards/margins_max": 0.9079456329345703, "rewards/margins_min": -0.5098749399185181, "rewards/margins_std": 0.6370505094528198, "rewards/rejected": -0.8085516691207886, "step": 1340 }, { "epoch": 0.32, "grad_norm": 6.293357301978896, "learning_rate": 4.278815696962195e-07, "logits/chosen": -2.4053139686584473, "logits/rejected": -2.303435802459717, "logps/chosen": -347.4659423828125, "logps/rejected": -387.4821472167969, "loss": 0.5945, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6723080277442932, "rewards/margins": 0.3411722183227539, "rewards/margins_max": 1.020204782485962, "rewards/margins_min": -0.29561054706573486, "rewards/margins_std": 0.5973449945449829, "rewards/rejected": -1.013480305671692, "step": 1350 }, { "epoch": 0.33, "grad_norm": 3.7348654197540125, "learning_rate": 4.264068574196129e-07, "logits/chosen": -2.3460612297058105, "logits/rejected": -2.161550760269165, "logps/chosen": -328.048583984375, "logps/rejected": -325.20782470703125, "loss": 0.6347, "rewards/accuracies": 0.75, "rewards/chosen": -0.5509380102157593, "rewards/margins": 0.29191073775291443, "rewards/margins_max": 0.8628549575805664, "rewards/margins_min": -0.23190875351428986, "rewards/margins_std": 0.48494529724121094, "rewards/rejected": -0.8428487777709961, "step": 1360 }, { "epoch": 0.33, "grad_norm": 4.521248559590332, "learning_rate": 4.2491981695843016e-07, "logits/chosen": -2.230130195617676, "logits/rejected": -2.325134754180908, "logps/chosen": -299.5582580566406, "logps/rejected": -378.7133483886719, "loss": 0.6037, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5805849432945251, "rewards/margins": 0.35733962059020996, "rewards/margins_max": 1.181233286857605, "rewards/margins_min": -0.25447431206703186, "rewards/margins_std": 0.6446479558944702, "rewards/rejected": -0.9379245638847351, "step": 1370 }, { "epoch": 0.33, "grad_norm": 4.664936948556624, "learning_rate": 4.2342055223442093e-07, "logits/chosen": -2.3150832653045654, "logits/rejected": -2.3037054538726807, "logps/chosen": -321.69891357421875, "logps/rejected": -354.2572937011719, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": -0.5547856688499451, "rewards/margins": 0.3423914909362793, "rewards/margins_max": 1.0076104402542114, "rewards/margins_min": -0.2788556218147278, "rewards/margins_std": 0.5616511106491089, "rewards/rejected": -0.8971772193908691, "step": 1380 }, { "epoch": 0.33, "grad_norm": 4.616477950516231, "learning_rate": 4.2190916802362687e-07, "logits/chosen": -2.391636610031128, "logits/rejected": -2.367901086807251, "logps/chosen": -311.2222595214844, "logps/rejected": -355.91180419921875, "loss": 0.6098, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6496304273605347, "rewards/margins": 0.31357210874557495, "rewards/margins_max": 1.1335046291351318, "rewards/margins_min": -0.35718679428100586, "rewards/margins_std": 0.661542534828186, "rewards/rejected": -0.9632023572921753, "step": 1390 }, { "epoch": 0.34, "grad_norm": 6.253346272165757, "learning_rate": 4.203857699490593e-07, "logits/chosen": -2.389662265777588, "logits/rejected": -2.2011070251464844, "logps/chosen": -330.96502685546875, "logps/rejected": -305.61834716796875, "loss": 0.6562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6669104099273682, "rewards/margins": 0.13130736351013184, "rewards/margins_max": 0.813090443611145, "rewards/margins_min": -0.5520807504653931, "rewards/margins_std": 0.6045147776603699, "rewards/rejected": -0.7982178330421448, "step": 1400 }, { "epoch": 0.34, "eval_logits/chosen": -2.2013356685638428, "eval_logits/rejected": -2.0991361141204834, "eval_logps/chosen": -328.429931640625, "eval_logps/rejected": -344.4360046386719, "eval_loss": 0.6243847608566284, "eval_rewards/accuracies": 0.6445000171661377, "eval_rewards/chosen": -0.567669153213501, "eval_rewards/margins": 0.26426535844802856, "eval_rewards/margins_max": 1.335082769393921, "eval_rewards/margins_min": -0.6113796830177307, "eval_rewards/margins_std": 0.6395717859268188, "eval_rewards/rejected": -0.8319344520568848, "eval_runtime": 1559.2977, "eval_samples_per_second": 2.565, "eval_steps_per_second": 0.16, "step": 1400 }, { "epoch": 0.34, "grad_norm": 5.73068975374788, "learning_rate": 4.1885046447331816e-07, "logits/chosen": -2.2985005378723145, "logits/rejected": -2.266115665435791, "logps/chosen": -343.3243408203125, "logps/rejected": -364.4938049316406, "loss": 0.5876, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4864772856235504, "rewards/margins": 0.3863835632801056, "rewards/margins_max": 1.1123895645141602, "rewards/margins_min": -0.2642289698123932, "rewards/margins_std": 0.6134835481643677, "rewards/rejected": -0.8728609085083008, "step": 1410 }, { "epoch": 0.34, "grad_norm": 5.834842196098292, "learning_rate": 4.173033588911511e-07, "logits/chosen": -2.352750062942505, "logits/rejected": -2.288503885269165, "logps/chosen": -349.5456848144531, "logps/rejected": -385.8179626464844, "loss": 0.6125, "rewards/accuracies": 0.75, "rewards/chosen": -0.5210774540901184, "rewards/margins": 0.36380624771118164, "rewards/margins_max": 1.0118687152862549, "rewards/margins_min": -0.29262062907218933, "rewards/margins_std": 0.6008248329162598, "rewards/rejected": -0.8848837018013, "step": 1420 }, { "epoch": 0.34, "grad_norm": 4.885278869290606, "learning_rate": 4.157445613219559e-07, "logits/chosen": -2.2541935443878174, "logits/rejected": -2.2487776279449463, "logps/chosen": -319.67291259765625, "logps/rejected": -355.86480712890625, "loss": 0.5754, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6026453971862793, "rewards/margins": 0.2730913758277893, "rewards/margins_max": 0.9882809519767761, "rewards/margins_min": -0.3042627274990082, "rewards/margins_std": 0.5660111308097839, "rewards/rejected": -0.8757368326187134, "step": 1430 }, { "epoch": 0.34, "grad_norm": 5.491894905468418, "learning_rate": 4.141741807022243e-07, "logits/chosen": -2.4047532081604004, "logits/rejected": -2.162701368331909, "logps/chosen": -350.0587158203125, "logps/rejected": -320.4582214355469, "loss": 0.6557, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5737323760986328, "rewards/margins": 0.2283284217119217, "rewards/margins_max": 0.8615309596061707, "rewards/margins_min": -0.3444269597530365, "rewards/margins_std": 0.546771228313446, "rewards/rejected": -0.8020607233047485, "step": 1440 }, { "epoch": 0.35, "grad_norm": 4.911681066689785, "learning_rate": 4.1259232677792865e-07, "logits/chosen": -2.326155662536621, "logits/rejected": -2.315795660018921, "logps/chosen": -294.3655090332031, "logps/rejected": -296.48907470703125, "loss": 0.6156, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4602038860321045, "rewards/margins": 0.1950714886188507, "rewards/margins_max": 0.8077942728996277, "rewards/margins_min": -0.34487098455429077, "rewards/margins_std": 0.5171275734901428, "rewards/rejected": -0.655275285243988, "step": 1450 }, { "epoch": 0.35, "grad_norm": 4.962464585646324, "learning_rate": 4.1099911009685294e-07, "logits/chosen": -2.257016181945801, "logits/rejected": -2.2236053943634033, "logps/chosen": -345.76617431640625, "logps/rejected": -338.87127685546875, "loss": 0.6106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5407674312591553, "rewards/margins": 0.27966317534446716, "rewards/margins_max": 1.0980759859085083, "rewards/margins_min": -0.4291163384914398, "rewards/margins_std": 0.6700000762939453, "rewards/rejected": -0.8204305768013, "step": 1460 }, { "epoch": 0.35, "grad_norm": 4.478882725905213, "learning_rate": 4.093946420008668e-07, "logits/chosen": -2.2895779609680176, "logits/rejected": -2.153841733932495, "logps/chosen": -325.39404296875, "logps/rejected": -337.7164001464844, "loss": 0.5828, "rewards/accuracies": 0.75, "rewards/chosen": -0.5845311880111694, "rewards/margins": 0.3467514216899872, "rewards/margins_max": 1.150130033493042, "rewards/margins_min": -0.3016558885574341, "rewards/margins_std": 0.6503480076789856, "rewards/rejected": -0.9312825202941895, "step": 1470 }, { "epoch": 0.35, "grad_norm": 5.340456298877897, "learning_rate": 4.0777903461814443e-07, "logits/chosen": -2.318495273590088, "logits/rejected": -2.177454948425293, "logps/chosen": -350.3999328613281, "logps/rejected": -343.06512451171875, "loss": 0.612, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6733035445213318, "rewards/margins": 0.2953178286552429, "rewards/margins_max": 1.0614795684814453, "rewards/margins_min": -0.37385374307632446, "rewards/margins_std": 0.6298120617866516, "rewards/rejected": -0.9686213731765747, "step": 1480 }, { "epoch": 0.36, "grad_norm": 5.755627407736663, "learning_rate": 4.061524008553285e-07, "logits/chosen": -2.244074821472168, "logits/rejected": -2.175053119659424, "logps/chosen": -294.0035095214844, "logps/rejected": -321.0830078125, "loss": 0.5803, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.525590717792511, "rewards/margins": 0.3730551302433014, "rewards/margins_max": 1.0708504915237427, "rewards/margins_min": -0.29104113578796387, "rewards/margins_std": 0.6074938774108887, "rewards/rejected": -0.8986458778381348, "step": 1490 }, { "epoch": 0.36, "grad_norm": 6.400502673912399, "learning_rate": 4.045148543896396e-07, "logits/chosen": -2.3792638778686523, "logits/rejected": -2.262115478515625, "logps/chosen": -345.04510498046875, "logps/rejected": -337.29046630859375, "loss": 0.6223, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6897913217544556, "rewards/margins": 0.18555831909179688, "rewards/margins_max": 0.9883977770805359, "rewards/margins_min": -0.5756305456161499, "rewards/margins_std": 0.6863231062889099, "rewards/rejected": -0.8753496408462524, "step": 1500 }, { "epoch": 0.36, "eval_logits/chosen": -2.1758041381835938, "eval_logits/rejected": -2.0745131969451904, "eval_logps/chosen": -340.6632995605469, "eval_logps/rejected": -361.3519592285156, "eval_loss": 0.6182387471199036, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": -0.6900023818016052, "eval_rewards/margins": 0.3110922574996948, "eval_rewards/margins_max": 1.5452226400375366, "eval_rewards/margins_min": -0.6697593331336975, "eval_rewards/margins_std": 0.729656994342804, "eval_rewards/rejected": -1.0010945796966553, "eval_runtime": 1553.8009, "eval_samples_per_second": 2.574, "eval_steps_per_second": 0.161, "step": 1500 }, { "epoch": 0.36, "grad_norm": 5.349410600371064, "learning_rate": 4.028665096609323e-07, "logits/chosen": -2.3207499980926514, "logits/rejected": -2.2707111835479736, "logps/chosen": -379.7764587402344, "logps/rejected": -395.59332275390625, "loss": 0.6105, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7563599348068237, "rewards/margins": 0.2189294546842575, "rewards/margins_max": 0.8680636286735535, "rewards/margins_min": -0.44832438230514526, "rewards/margins_std": 0.5729413032531738, "rewards/rejected": -0.9752894639968872, "step": 1510 }, { "epoch": 0.36, "grad_norm": 5.577556335356216, "learning_rate": 4.01207481863697e-07, "logits/chosen": -2.4579689502716064, "logits/rejected": -2.2358219623565674, "logps/chosen": -388.2066345214844, "logps/rejected": -396.67938232421875, "loss": 0.5945, "rewards/accuracies": 0.75, "rewards/chosen": -0.59691321849823, "rewards/margins": 0.4801778197288513, "rewards/margins_max": 1.365128993988037, "rewards/margins_min": -0.251661479473114, "rewards/margins_std": 0.7063818573951721, "rewards/rejected": -1.077091097831726, "step": 1520 }, { "epoch": 0.37, "grad_norm": 6.278927805541885, "learning_rate": 3.9953788693901e-07, "logits/chosen": -2.240408420562744, "logits/rejected": -2.136202335357666, "logps/chosen": -375.01312255859375, "logps/rejected": -376.2455139160156, "loss": 0.6408, "rewards/accuracies": 0.625, "rewards/chosen": -0.7717885375022888, "rewards/margins": 0.19439414143562317, "rewards/margins_max": 0.9049777984619141, "rewards/margins_min": -0.7034814953804016, "rewards/margins_std": 0.7309711575508118, "rewards/rejected": -0.9661828279495239, "step": 1530 }, { "epoch": 0.37, "grad_norm": 5.2048872062781655, "learning_rate": 3.978578415664306e-07, "logits/chosen": -2.211455821990967, "logits/rejected": -2.1268532276153564, "logps/chosen": -299.19049072265625, "logps/rejected": -319.3480529785156, "loss": 0.55, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5583456158638, "rewards/margins": 0.4287661015987396, "rewards/margins_max": 1.2433301210403442, "rewards/margins_min": -0.20347020030021667, "rewards/margins_std": 0.670458972454071, "rewards/rejected": -0.9871117472648621, "step": 1540 }, { "epoch": 0.37, "grad_norm": 4.885387016591406, "learning_rate": 3.9616746315584733e-07, "logits/chosen": -2.371225118637085, "logits/rejected": -2.1451239585876465, "logps/chosen": -363.99273681640625, "logps/rejected": -330.90802001953125, "loss": 0.6088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6165622472763062, "rewards/margins": 0.35272058844566345, "rewards/margins_max": 0.9494983553886414, "rewards/margins_min": -0.2435455024242401, "rewards/margins_std": 0.5442965626716614, "rewards/rejected": -0.9692827463150024, "step": 1550 }, { "epoch": 0.37, "grad_norm": 6.119908922303106, "learning_rate": 3.9446686983927236e-07, "logits/chosen": -2.333672046661377, "logits/rejected": -2.259251356124878, "logps/chosen": -305.39007568359375, "logps/rejected": -381.6943359375, "loss": 0.5541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6414279341697693, "rewards/margins": 0.46348199248313904, "rewards/margins_max": 1.2339155673980713, "rewards/margins_min": -0.2919246554374695, "rewards/margins_std": 0.6692392826080322, "rewards/rejected": -1.1049100160598755, "step": 1560 }, { "epoch": 0.38, "grad_norm": 7.08385648104498, "learning_rate": 3.927561804625863e-07, "logits/chosen": -2.2995896339416504, "logits/rejected": -2.236870288848877, "logps/chosen": -364.95037841796875, "logps/rejected": -410.6171875, "loss": 0.5872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7278547286987305, "rewards/margins": 0.34689298272132874, "rewards/margins_max": 1.1937592029571533, "rewards/margins_min": -0.42084938287734985, "rewards/margins_std": 0.7209205627441406, "rewards/rejected": -1.0747478008270264, "step": 1570 }, { "epoch": 0.38, "grad_norm": 7.522292258936595, "learning_rate": 3.910355145772323e-07, "logits/chosen": -2.2771050930023193, "logits/rejected": -2.179549217224121, "logps/chosen": -346.96087646484375, "logps/rejected": -399.2838439941406, "loss": 0.616, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.781330943107605, "rewards/margins": 0.5005442500114441, "rewards/margins_max": 1.4885427951812744, "rewards/margins_min": -0.2982368469238281, "rewards/margins_std": 0.8142154812812805, "rewards/rejected": -1.2818753719329834, "step": 1580 }, { "epoch": 0.38, "grad_norm": 5.305402255326198, "learning_rate": 3.893049924318613e-07, "logits/chosen": -2.255272626876831, "logits/rejected": -2.2085766792297363, "logps/chosen": -324.4107971191406, "logps/rejected": -378.49627685546875, "loss": 0.545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6099139451980591, "rewards/margins": 0.4345978796482086, "rewards/margins_max": 1.2811332941055298, "rewards/margins_min": -0.14750699698925018, "rewards/margins_std": 0.6478888988494873, "rewards/rejected": -1.0445117950439453, "step": 1590 }, { "epoch": 0.38, "grad_norm": 7.2682683258660115, "learning_rate": 3.875647349639286e-07, "logits/chosen": -2.321608304977417, "logits/rejected": -2.13478422164917, "logps/chosen": -338.0977478027344, "logps/rejected": -312.79266357421875, "loss": 0.5927, "rewards/accuracies": 0.75, "rewards/chosen": -0.6523749828338623, "rewards/margins": 0.33668074011802673, "rewards/margins_max": 0.958840012550354, "rewards/margins_min": -0.3040617108345032, "rewards/margins_std": 0.5534626245498657, "rewards/rejected": -0.9890558123588562, "step": 1600 }, { "epoch": 0.38, "eval_logits/chosen": -2.1604669094085693, "eval_logits/rejected": -2.058563232421875, "eval_logps/chosen": -346.3847961425781, "eval_logps/rejected": -372.3523864746094, "eval_loss": 0.6113224029541016, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": -0.747217059135437, "eval_rewards/margins": 0.36388134956359863, "eval_rewards/margins_max": 1.7606072425842285, "eval_rewards/margins_min": -0.7175586819648743, "eval_rewards/margins_std": 0.8175954222679138, "eval_rewards/rejected": -1.1110984086990356, "eval_runtime": 1501.9862, "eval_samples_per_second": 2.663, "eval_steps_per_second": 0.166, "step": 1600 }, { "epoch": 0.39, "grad_norm": 7.727717699916928, "learning_rate": 3.8581486379124185e-07, "logits/chosen": -2.3541316986083984, "logits/rejected": -2.2890353202819824, "logps/chosen": -381.8040466308594, "logps/rejected": -385.6960144042969, "loss": 0.6272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.838807225227356, "rewards/margins": 0.3595799207687378, "rewards/margins_max": 1.295213222503662, "rewards/margins_min": -0.5221343636512756, "rewards/margins_std": 0.8010128736495972, "rewards/rejected": -1.1983870267868042, "step": 1610 }, { "epoch": 0.39, "grad_norm": 7.125462299540833, "learning_rate": 3.840555012034622e-07, "logits/chosen": -2.117300033569336, "logits/rejected": -1.9965636730194092, "logps/chosen": -314.2472839355469, "logps/rejected": -344.9120178222656, "loss": 0.5517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7075129747390747, "rewards/margins": 0.3268389403820038, "rewards/margins_max": 1.0486294031143188, "rewards/margins_min": -0.312084436416626, "rewards/margins_std": 0.5949170589447021, "rewards/rejected": -1.0343519449234009, "step": 1620 }, { "epoch": 0.39, "grad_norm": 5.660020783187398, "learning_rate": 3.822867701535578e-07, "logits/chosen": -2.210599422454834, "logits/rejected": -2.147108316421509, "logps/chosen": -338.4330139160156, "logps/rejected": -369.67999267578125, "loss": 0.5614, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8056265115737915, "rewards/margins": 0.47563666105270386, "rewards/margins_max": 1.4032061100006104, "rewards/margins_min": -0.37327060103416443, "rewards/margins_std": 0.7978726625442505, "rewards/rejected": -1.2812631130218506, "step": 1630 }, { "epoch": 0.39, "grad_norm": 8.087946044319306, "learning_rate": 3.805087942492112e-07, "logits/chosen": -2.1932036876678467, "logits/rejected": -2.0676512718200684, "logps/chosen": -331.3818359375, "logps/rejected": -379.365234375, "loss": 0.5854, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7025831937789917, "rewards/margins": 0.48354464769363403, "rewards/margins_max": 1.2820312976837158, "rewards/margins_min": -0.1533391922712326, "rewards/margins_std": 0.632487416267395, "rewards/rejected": -1.1861279010772705, "step": 1640 }, { "epoch": 0.4, "grad_norm": 6.221671946731372, "learning_rate": 3.787216977441814e-07, "logits/chosen": -2.2697160243988037, "logits/rejected": -2.133417844772339, "logps/chosen": -326.9266662597656, "logps/rejected": -386.13385009765625, "loss": 0.5686, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8050438761711121, "rewards/margins": 0.3961057960987091, "rewards/margins_max": 1.3627588748931885, "rewards/margins_min": -0.4730393886566162, "rewards/margins_std": 0.8281580209732056, "rewards/rejected": -1.201149821281433, "step": 1650 }, { "epoch": 0.4, "grad_norm": 7.733427725687215, "learning_rate": 3.7692560552961976e-07, "logits/chosen": -2.290802001953125, "logits/rejected": -2.2568416595458984, "logps/chosen": -321.7680969238281, "logps/rejected": -368.76556396484375, "loss": 0.6035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8094627261161804, "rewards/margins": 0.31803658604621887, "rewards/margins_max": 1.2083570957183838, "rewards/margins_min": -0.3470551669597626, "rewards/margins_std": 0.6810132265090942, "rewards/rejected": -1.1274992227554321, "step": 1660 }, { "epoch": 0.4, "grad_norm": 7.498903321512461, "learning_rate": 3.7512064312534276e-07, "logits/chosen": -2.201200008392334, "logits/rejected": -2.1391210556030273, "logps/chosen": -378.8607482910156, "logps/rejected": -407.89556884765625, "loss": 0.5184, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8032592535018921, "rewards/margins": 0.4537124037742615, "rewards/margins_max": 1.5868498086929321, "rewards/margins_min": -0.40839165449142456, "rewards/margins_std": 0.894082248210907, "rewards/rejected": -1.2569717168807983, "step": 1670 }, { "epoch": 0.4, "grad_norm": 8.04649195083446, "learning_rate": 3.7330693667105937e-07, "logits/chosen": -2.3786263465881348, "logits/rejected": -2.192265748977661, "logps/chosen": -396.43341064453125, "logps/rejected": -371.79583740234375, "loss": 0.589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8771692514419556, "rewards/margins": 0.38096219301223755, "rewards/margins_max": 1.2824891805648804, "rewards/margins_min": -0.45276355743408203, "rewards/margins_std": 0.8022412061691284, "rewards/rejected": -1.2581312656402588, "step": 1680 }, { "epoch": 0.4, "grad_norm": 7.094471361226549, "learning_rate": 3.7148461291755626e-07, "logits/chosen": -2.3189330101013184, "logits/rejected": -2.2371087074279785, "logps/chosen": -343.8477783203125, "logps/rejected": -424.7857360839844, "loss": 0.5667, "rewards/accuracies": 0.75, "rewards/chosen": -0.8474286794662476, "rewards/margins": 0.6017926335334778, "rewards/margins_max": 1.4043396711349487, "rewards/margins_min": -0.17678961157798767, "rewards/margins_std": 0.7105208039283752, "rewards/rejected": -1.4492213726043701, "step": 1690 }, { "epoch": 0.41, "grad_norm": 6.1658594101410555, "learning_rate": 3.6965379921783945e-07, "logits/chosen": -2.3411362171173096, "logits/rejected": -2.2549006938934326, "logps/chosen": -367.9657287597656, "logps/rejected": -410.591064453125, "loss": 0.5646, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8265560865402222, "rewards/margins": 0.5460506081581116, "rewards/margins_max": 1.572155237197876, "rewards/margins_min": -0.3843008875846863, "rewards/margins_std": 0.8602036237716675, "rewards/rejected": -1.3726065158843994, "step": 1700 }, { "epoch": 0.41, "eval_logits/chosen": -2.1420977115631104, "eval_logits/rejected": -2.040786027908325, "eval_logps/chosen": -364.9300537109375, "eval_logps/rejected": -398.6274108886719, "eval_loss": 0.6103922128677368, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": -0.9326696395874023, "eval_rewards/margins": 0.4411788582801819, "eval_rewards/margins_max": 2.142662286758423, "eval_rewards/margins_min": -0.8518405556678772, "eval_rewards/margins_std": 0.9846555590629578, "eval_rewards/rejected": -1.3738484382629395, "eval_runtime": 1540.617, "eval_samples_per_second": 2.596, "eval_steps_per_second": 0.162, "step": 1700 }, { "epoch": 0.41, "grad_norm": 6.632219231878366, "learning_rate": 3.6781462351823455e-07, "logits/chosen": -2.2701971530914307, "logits/rejected": -2.2717127799987793, "logps/chosen": -366.0466003417969, "logps/rejected": -456.79058837890625, "loss": 0.5959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9349940419197083, "rewards/margins": 0.43973350524902344, "rewards/margins_max": 1.6179587841033936, "rewards/margins_min": -0.566657543182373, "rewards/margins_std": 0.9640477299690247, "rewards/rejected": -1.374727487564087, "step": 1710 }, { "epoch": 0.41, "grad_norm": 8.710397965354037, "learning_rate": 3.6596721434944513e-07, "logits/chosen": -2.3106508255004883, "logits/rejected": -2.240830898284912, "logps/chosen": -365.48638916015625, "logps/rejected": -406.5863037109375, "loss": 0.5982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9161807298660278, "rewards/margins": 0.36479100584983826, "rewards/margins_max": 1.2381747961044312, "rewards/margins_min": -0.45624440908432007, "rewards/margins_std": 0.7551611661911011, "rewards/rejected": -1.2809715270996094, "step": 1720 }, { "epoch": 0.41, "grad_norm": 8.513874623001644, "learning_rate": 3.6411170081757025e-07, "logits/chosen": -2.361016273498535, "logits/rejected": -2.251931667327881, "logps/chosen": -351.30438232421875, "logps/rejected": -384.8724060058594, "loss": 0.5818, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7639766931533813, "rewards/margins": 0.43196067214012146, "rewards/margins_max": 1.2066916227340698, "rewards/margins_min": -0.3082756996154785, "rewards/margins_std": 0.6784676313400269, "rewards/rejected": -1.1959373950958252, "step": 1730 }, { "epoch": 0.42, "grad_norm": 6.818301469157579, "learning_rate": 3.622482125950821e-07, "logits/chosen": -2.349828004837036, "logits/rejected": -2.2826290130615234, "logps/chosen": -374.9007263183594, "logps/rejected": -395.3811950683594, "loss": 0.617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7560773491859436, "rewards/margins": 0.39648574590682983, "rewards/margins_max": 1.1901953220367432, "rewards/margins_min": -0.38299646973609924, "rewards/margins_std": 0.699086606502533, "rewards/rejected": -1.1525630950927734, "step": 1740 }, { "epoch": 0.42, "grad_norm": 8.304632838043295, "learning_rate": 3.603768799117637e-07, "logits/chosen": -2.2400736808776855, "logits/rejected": -2.1306400299072266, "logps/chosen": -345.6285095214844, "logps/rejected": -372.13494873046875, "loss": 0.5981, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7159274816513062, "rewards/margins": 0.436894953250885, "rewards/margins_max": 1.4176517724990845, "rewards/margins_min": -0.4486263394355774, "rewards/margins_std": 0.8396552801132202, "rewards/rejected": -1.1528222560882568, "step": 1750 }, { "epoch": 0.42, "grad_norm": 7.291857151994796, "learning_rate": 3.584978335456078e-07, "logits/chosen": -2.194343090057373, "logits/rejected": -2.2342498302459717, "logps/chosen": -319.1991882324219, "logps/rejected": -395.12127685546875, "loss": 0.6107, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6497689485549927, "rewards/margins": 0.4227171838283539, "rewards/margins_max": 1.525146484375, "rewards/margins_min": -0.5981516242027283, "rewards/margins_std": 0.9339895248413086, "rewards/rejected": -1.072486400604248, "step": 1760 }, { "epoch": 0.42, "grad_norm": 6.902710434781076, "learning_rate": 3.5661120481367757e-07, "logits/chosen": -2.3961105346679688, "logits/rejected": -2.269143581390381, "logps/chosen": -385.5531005859375, "logps/rejected": -383.8748779296875, "loss": 0.6216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7216976881027222, "rewards/margins": 0.3068796396255493, "rewards/margins_max": 1.0051348209381104, "rewards/margins_min": -0.3713780641555786, "rewards/margins_std": 0.6087722182273865, "rewards/rejected": -1.028577446937561, "step": 1770 }, { "epoch": 0.43, "grad_norm": 6.83310571927231, "learning_rate": 3.547171255629292e-07, "logits/chosen": -2.2466726303100586, "logits/rejected": -2.0853893756866455, "logps/chosen": -317.86041259765625, "logps/rejected": -314.1649169921875, "loss": 0.5811, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5781975388526917, "rewards/margins": 0.3770865499973297, "rewards/margins_max": 1.1990553140640259, "rewards/margins_min": -0.35469183325767517, "rewards/margins_std": 0.6884657144546509, "rewards/rejected": -0.9552841186523438, "step": 1780 }, { "epoch": 0.43, "grad_norm": 7.21974974475938, "learning_rate": 3.528157281609984e-07, "logits/chosen": -2.3084144592285156, "logits/rejected": -2.301987409591675, "logps/chosen": -283.7975158691406, "logps/rejected": -300.7301940917969, "loss": 0.6316, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7296976447105408, "rewards/margins": 0.26906687021255493, "rewards/margins_max": 1.215813398361206, "rewards/margins_min": -0.537036120891571, "rewards/margins_std": 0.7726439237594604, "rewards/rejected": -0.9987645149230957, "step": 1790 }, { "epoch": 0.43, "grad_norm": 7.5631433638881, "learning_rate": 3.5090714548694916e-07, "logits/chosen": -2.1786742210388184, "logits/rejected": -2.058842897415161, "logps/chosen": -361.3190612792969, "logps/rejected": -381.50537109375, "loss": 0.5765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.633159875869751, "rewards/margins": 0.3374477028846741, "rewards/margins_max": 1.1128568649291992, "rewards/margins_min": -0.4220476746559143, "rewards/margins_std": 0.6941351294517517, "rewards/rejected": -0.9706076383590698, "step": 1800 }, { "epoch": 0.43, "eval_logits/chosen": -2.1617860794067383, "eval_logits/rejected": -2.057659149169922, "eval_logps/chosen": -341.7124938964844, "eval_logps/rejected": -372.19976806640625, "eval_loss": 0.5996834635734558, "eval_rewards/accuracies": 0.6710000038146973, "eval_rewards/chosen": -0.7004943490028381, "eval_rewards/margins": 0.40907835960388184, "eval_rewards/margins_max": 1.902069330215454, "eval_rewards/margins_min": -0.70124751329422, "eval_rewards/margins_std": 0.8612772822380066, "eval_rewards/rejected": -1.1095727682113647, "eval_runtime": 1569.5556, "eval_samples_per_second": 2.548, "eval_steps_per_second": 0.159, "step": 1800 }, { "epoch": 0.43, "grad_norm": 10.222348014315559, "learning_rate": 3.489915109219882e-07, "logits/chosen": -2.2581310272216797, "logits/rejected": -2.1304614543914795, "logps/chosen": -304.2193908691406, "logps/rejected": -339.60369873046875, "loss": 0.5883, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.741899847984314, "rewards/margins": 0.4426964819431305, "rewards/margins_max": 1.3874841928482056, "rewards/margins_min": -0.3182061016559601, "rewards/margins_std": 0.7757003307342529, "rewards/rejected": -1.1845964193344116, "step": 1810 }, { "epoch": 0.44, "grad_norm": 6.723534020811073, "learning_rate": 3.4706895834014294e-07, "logits/chosen": -2.3538193702697754, "logits/rejected": -2.2900586128234863, "logps/chosen": -357.71221923828125, "logps/rejected": -405.8221740722656, "loss": 0.5764, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7458327412605286, "rewards/margins": 0.504269003868103, "rewards/margins_max": 1.5634111166000366, "rewards/margins_min": -0.3171696364879608, "rewards/margins_std": 0.8601846694946289, "rewards/rejected": -1.2501016855239868, "step": 1820 }, { "epoch": 0.44, "grad_norm": 8.943619347130511, "learning_rate": 3.451396220989064e-07, "logits/chosen": -2.3319504261016846, "logits/rejected": -2.1187522411346436, "logps/chosen": -356.7348937988281, "logps/rejected": -363.59796142578125, "loss": 0.5527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8076604604721069, "rewards/margins": 0.354184627532959, "rewards/margins_max": 1.2168200016021729, "rewards/margins_min": -0.4756907820701599, "rewards/margins_std": 0.7870656847953796, "rewards/rejected": -1.161845088005066, "step": 1830 }, { "epoch": 0.44, "grad_norm": 9.62908927944707, "learning_rate": 3.43203637029847e-07, "logits/chosen": -2.3815670013427734, "logits/rejected": -2.217191219329834, "logps/chosen": -418.98126220703125, "logps/rejected": -427.2189025878906, "loss": 0.6028, "rewards/accuracies": 0.625, "rewards/chosen": -0.9497702717781067, "rewards/margins": 0.4112814962863922, "rewards/margins_max": 1.710329294204712, "rewards/margins_min": -0.6006239652633667, "rewards/margins_std": 1.0522825717926025, "rewards/rejected": -1.3610517978668213, "step": 1840 }, { "epoch": 0.44, "grad_norm": 7.93874487539014, "learning_rate": 3.4126113842918643e-07, "logits/chosen": -2.3161070346832275, "logits/rejected": -2.166799306869507, "logps/chosen": -361.44561767578125, "logps/rejected": -392.30419921875, "loss": 0.5609, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8738459348678589, "rewards/margins": 0.5069698691368103, "rewards/margins_max": 1.366966962814331, "rewards/margins_min": -0.24426352977752686, "rewards/margins_std": 0.7145646214485168, "rewards/rejected": -1.380815863609314, "step": 1850 }, { "epoch": 0.45, "grad_norm": 15.446231241643137, "learning_rate": 3.3931226204834397e-07, "logits/chosen": -2.3169639110565186, "logits/rejected": -2.273132085800171, "logps/chosen": -403.765380859375, "logps/rejected": -465.0936584472656, "loss": 0.6171, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9261840581893921, "rewards/margins": 0.6405378580093384, "rewards/margins_max": 1.8157743215560913, "rewards/margins_min": -0.4265405535697937, "rewards/margins_std": 0.9659549593925476, "rewards/rejected": -1.56672203540802, "step": 1860 }, { "epoch": 0.45, "grad_norm": 10.999952836369062, "learning_rate": 3.3735714408445e-07, "logits/chosen": -2.2628073692321777, "logits/rejected": -2.252011775970459, "logps/chosen": -309.3660583496094, "logps/rejected": -381.1944274902344, "loss": 0.5666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7401235699653625, "rewards/margins": 0.5099393725395203, "rewards/margins_max": 1.7542574405670166, "rewards/margins_min": -0.4272642135620117, "rewards/margins_std": 0.9705731272697449, "rewards/rejected": -1.2500629425048828, "step": 1870 }, { "epoch": 0.45, "grad_norm": 9.221371449315443, "learning_rate": 3.3539592117082746e-07, "logits/chosen": -2.233426570892334, "logits/rejected": -2.1249279975891113, "logps/chosen": -369.30072021484375, "logps/rejected": -411.7880859375, "loss": 0.6368, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.032252550125122, "rewards/margins": 0.3263944089412689, "rewards/margins_max": 1.3488740921020508, "rewards/margins_min": -0.5255783796310425, "rewards/margins_std": 0.8703895807266235, "rewards/rejected": -1.3586469888687134, "step": 1880 }, { "epoch": 0.45, "grad_norm": 7.4762155418450185, "learning_rate": 3.3342873036744346e-07, "logits/chosen": -2.2763772010803223, "logits/rejected": -2.194484233856201, "logps/chosen": -363.32489013671875, "logps/rejected": -424.7459411621094, "loss": 0.5664, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7643159031867981, "rewards/margins": 0.5254711508750916, "rewards/margins_max": 1.5481106042861938, "rewards/margins_min": -0.30798643827438354, "rewards/margins_std": 0.8306853175163269, "rewards/rejected": -1.2897870540618896, "step": 1890 }, { "epoch": 0.45, "grad_norm": 9.734992581733499, "learning_rate": 3.3145570915133067e-07, "logits/chosen": -2.2219715118408203, "logits/rejected": -2.103447198867798, "logps/chosen": -331.6070556640625, "logps/rejected": -369.47576904296875, "loss": 0.6009, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7518938779830933, "rewards/margins": 0.39933839440345764, "rewards/margins_max": 1.338181972503662, "rewards/margins_min": -0.4513072967529297, "rewards/margins_std": 0.8004790544509888, "rewards/rejected": -1.1512322425842285, "step": 1900 }, { "epoch": 0.45, "eval_logits/chosen": -2.1541647911071777, "eval_logits/rejected": -2.049808979034424, "eval_logps/chosen": -347.6073913574219, "eval_logps/rejected": -381.99359130859375, "eval_loss": 0.5961129665374756, "eval_rewards/accuracies": 0.6704999804496765, "eval_rewards/chosen": -0.7594434022903442, "eval_rewards/margins": 0.4480668306350708, "eval_rewards/margins_max": 2.0696640014648438, "eval_rewards/margins_min": -0.7257018089294434, "eval_rewards/margins_std": 0.9258064031600952, "eval_rewards/rejected": -1.2075101137161255, "eval_runtime": 1584.6059, "eval_samples_per_second": 2.524, "eval_steps_per_second": 0.158, "step": 1900 }, { "epoch": 0.46, "grad_norm": 7.477769584833915, "learning_rate": 3.294769954069802e-07, "logits/chosen": -2.3154220581054688, "logits/rejected": -2.1774978637695312, "logps/chosen": -344.8447570800781, "logps/rejected": -374.01861572265625, "loss": 0.5993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.733178973197937, "rewards/margins": 0.3985547721385956, "rewards/margins_max": 1.4404284954071045, "rewards/margins_min": -0.4965454936027527, "rewards/margins_std": 0.8946989178657532, "rewards/rejected": -1.1317336559295654, "step": 1910 }, { "epoch": 0.46, "grad_norm": 6.736873154505905, "learning_rate": 3.274927274167048e-07, "logits/chosen": -2.2562804222106934, "logits/rejected": -2.176208019256592, "logps/chosen": -316.04119873046875, "logps/rejected": -363.6830139160156, "loss": 0.5834, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6197851896286011, "rewards/margins": 0.47865238785743713, "rewards/margins_max": 1.3647626638412476, "rewards/margins_min": -0.30827054381370544, "rewards/margins_std": 0.7570652961730957, "rewards/rejected": -1.0984375476837158, "step": 1920 }, { "epoch": 0.46, "grad_norm": 8.004341854605576, "learning_rate": 3.2550304385097575e-07, "logits/chosen": -2.2887442111968994, "logits/rejected": -2.1940433979034424, "logps/chosen": -320.30047607421875, "logps/rejected": -346.361328125, "loss": 0.5894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5873011350631714, "rewards/margins": 0.4243379533290863, "rewards/margins_max": 1.2461038827896118, "rewards/margins_min": -0.32229143381118774, "rewards/margins_std": 0.696189284324646, "rewards/rejected": -1.011638879776001, "step": 1930 }, { "epoch": 0.46, "grad_norm": 6.645031855774493, "learning_rate": 3.235080837587314e-07, "logits/chosen": -2.354066848754883, "logits/rejected": -2.3206450939178467, "logps/chosen": -268.8098449707031, "logps/rejected": -362.1439514160156, "loss": 0.5804, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5732561349868774, "rewards/margins": 0.45921745896339417, "rewards/margins_max": 1.2656594514846802, "rewards/margins_min": -0.2484348565340042, "rewards/margins_std": 0.6817165613174438, "rewards/rejected": -1.0324736833572388, "step": 1940 }, { "epoch": 0.47, "grad_norm": 9.431275999565512, "learning_rate": 3.215079865576599e-07, "logits/chosen": -2.250476837158203, "logits/rejected": -2.273078441619873, "logps/chosen": -328.11419677734375, "logps/rejected": -382.8624572753906, "loss": 0.5956, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6076136231422424, "rewards/margins": 0.4649823307991028, "rewards/margins_max": 1.3210949897766113, "rewards/margins_min": -0.28047478199005127, "rewards/margins_std": 0.6983139514923096, "rewards/rejected": -1.0725960731506348, "step": 1950 }, { "epoch": 0.47, "grad_norm": 8.350448377072007, "learning_rate": 3.1950289202445594e-07, "logits/chosen": -2.2141032218933105, "logits/rejected": -2.1793408393859863, "logps/chosen": -325.1326904296875, "logps/rejected": -378.72723388671875, "loss": 0.575, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6701504588127136, "rewards/margins": 0.4475323259830475, "rewards/margins_max": 1.4365272521972656, "rewards/margins_min": -0.47374558448791504, "rewards/margins_std": 0.877734363079071, "rewards/rejected": -1.1176828145980835, "step": 1960 }, { "epoch": 0.47, "grad_norm": 9.23290218994662, "learning_rate": 3.174929402850528e-07, "logits/chosen": -2.423187017440796, "logits/rejected": -2.1965746879577637, "logps/chosen": -341.0179748535156, "logps/rejected": -366.5667724609375, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -0.7348090410232544, "rewards/margins": 0.34791040420532227, "rewards/margins_max": 1.1202346086502075, "rewards/margins_min": -0.47647666931152344, "rewards/margins_std": 0.6999490261077881, "rewards/rejected": -1.0827195644378662, "step": 1970 }, { "epoch": 0.47, "grad_norm": 8.382331030849485, "learning_rate": 3.15478271804829e-07, "logits/chosen": -2.31251859664917, "logits/rejected": -2.1721599102020264, "logps/chosen": -354.3834228515625, "logps/rejected": -407.0205993652344, "loss": 0.5233, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7153571844100952, "rewards/margins": 0.5832840204238892, "rewards/margins_max": 1.5437829494476318, "rewards/margins_min": -0.12932512164115906, "rewards/margins_std": 0.7581546306610107, "rewards/rejected": -1.2986410856246948, "step": 1980 }, { "epoch": 0.48, "grad_norm": 10.996470838893929, "learning_rate": 3.1345902737879257e-07, "logits/chosen": -2.136340856552124, "logits/rejected": -2.020050525665283, "logps/chosen": -336.372314453125, "logps/rejected": -396.25799560546875, "loss": 0.5746, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8744724988937378, "rewards/margins": 0.5016102194786072, "rewards/margins_max": 1.526653528213501, "rewards/margins_min": -0.4043669104576111, "rewards/margins_std": 0.8628823161125183, "rewards/rejected": -1.3760826587677002, "step": 1990 }, { "epoch": 0.48, "grad_norm": 7.248998901703897, "learning_rate": 3.1143534812174103e-07, "logits/chosen": -2.3891992568969727, "logits/rejected": -2.2027313709259033, "logps/chosen": -389.3179626464844, "logps/rejected": -393.8450622558594, "loss": 0.6246, "rewards/accuracies": 0.75, "rewards/chosen": -0.8192800283432007, "rewards/margins": 0.5883496999740601, "rewards/margins_max": 1.8675416707992554, "rewards/margins_min": -0.48337239027023315, "rewards/margins_std": 1.0604313611984253, "rewards/rejected": -1.4076298475265503, "step": 2000 }, { "epoch": 0.48, "eval_logits/chosen": -2.1615405082702637, "eval_logits/rejected": -2.055985927581787, "eval_logps/chosen": -343.02667236328125, "eval_logps/rejected": -377.97052001953125, "eval_loss": 0.5927345156669617, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": -0.7136361002922058, "eval_rewards/margins": 0.45364394783973694, "eval_rewards/margins_max": 2.0828120708465576, "eval_rewards/margins_min": -0.6959524750709534, "eval_rewards/margins_std": 0.918849527835846, "eval_rewards/rejected": -1.1672801971435547, "eval_runtime": 1536.3917, "eval_samples_per_second": 2.604, "eval_steps_per_second": 0.163, "step": 2000 }, { "epoch": 0.48, "grad_norm": 8.285635078690502, "learning_rate": 3.094073754584001e-07, "logits/chosen": -2.211477518081665, "logits/rejected": -2.0843701362609863, "logps/chosen": -338.3864440917969, "logps/rejected": -390.53607177734375, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -0.7731207609176636, "rewards/margins": 0.6617603898048401, "rewards/margins_max": 1.839966058731079, "rewards/margins_min": -0.35784536600112915, "rewards/margins_std": 0.9657366871833801, "rewards/rejected": -1.4348812103271484, "step": 2010 }, { "epoch": 0.48, "grad_norm": 5.979361004833463, "learning_rate": 3.0737525111353976e-07, "logits/chosen": -2.378627061843872, "logits/rejected": -2.2880947589874268, "logps/chosen": -364.23028564453125, "logps/rejected": -379.94024658203125, "loss": 0.5529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8452831506729126, "rewards/margins": 0.4095057547092438, "rewards/margins_max": 1.2729713916778564, "rewards/margins_min": -0.4757913649082184, "rewards/margins_std": 0.7845117449760437, "rewards/rejected": -1.2547887563705444, "step": 2020 }, { "epoch": 0.49, "grad_norm": 8.877270475281723, "learning_rate": 3.053391171020702e-07, "logits/chosen": -2.223383665084839, "logits/rejected": -2.1204400062561035, "logps/chosen": -361.8363952636719, "logps/rejected": -419.35321044921875, "loss": 0.578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7907285690307617, "rewards/margins": 0.621681272983551, "rewards/margins_max": 1.8572428226470947, "rewards/margins_min": -0.47413793206214905, "rewards/margins_std": 0.997836709022522, "rewards/rejected": -1.412409782409668, "step": 2030 }, { "epoch": 0.49, "grad_norm": 15.109358336825093, "learning_rate": 3.0329911571911693e-07, "logits/chosen": -2.202918767929077, "logits/rejected": -2.169145107269287, "logps/chosen": -332.624755859375, "logps/rejected": -386.75836181640625, "loss": 0.5898, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.931067168712616, "rewards/margins": 0.39683061838150024, "rewards/margins_max": 1.552000641822815, "rewards/margins_min": -0.7417031526565552, "rewards/margins_std": 1.0494534969329834, "rewards/rejected": -1.3278977870941162, "step": 2040 }, { "epoch": 0.49, "grad_norm": 11.821918910381378, "learning_rate": 3.012553895300765e-07, "logits/chosen": -2.158900737762451, "logits/rejected": -2.1002602577209473, "logps/chosen": -340.70062255859375, "logps/rejected": -398.469970703125, "loss": 0.6072, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9658359289169312, "rewards/margins": 0.4641633629798889, "rewards/margins_max": 1.3377294540405273, "rewards/margins_min": -0.3739112913608551, "rewards/margins_std": 0.7836833596229553, "rewards/rejected": -1.4299992322921753, "step": 2050 }, { "epoch": 0.49, "grad_norm": 12.751792033798315, "learning_rate": 2.9920808136065336e-07, "logits/chosen": -2.3554189205169678, "logits/rejected": -2.167008399963379, "logps/chosen": -351.1730041503906, "logps/rejected": -407.7794494628906, "loss": 0.6262, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8655673265457153, "rewards/margins": 0.506353497505188, "rewards/margins_max": 1.7573869228363037, "rewards/margins_min": -0.4727141261100769, "rewards/margins_std": 0.9939647912979126, "rewards/rejected": -1.3719208240509033, "step": 2060 }, { "epoch": 0.5, "grad_norm": 7.278395719558554, "learning_rate": 2.971573342868786e-07, "logits/chosen": -2.330958604812622, "logits/rejected": -2.14420485496521, "logps/chosen": -312.994140625, "logps/rejected": -360.21771240234375, "loss": 0.5712, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7200855016708374, "rewards/margins": 0.5679798126220703, "rewards/margins_max": 1.6605669260025024, "rewards/margins_min": -0.36372336745262146, "rewards/margins_std": 0.903790295124054, "rewards/rejected": -1.2880651950836182, "step": 2070 }, { "epoch": 0.5, "grad_norm": 8.50153740835527, "learning_rate": 2.9510329162511054e-07, "logits/chosen": -2.187025785446167, "logits/rejected": -2.174272298812866, "logps/chosen": -371.7486877441406, "logps/rejected": -397.32781982421875, "loss": 0.6234, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.788277804851532, "rewards/margins": 0.4696384072303772, "rewards/margins_max": 1.6484591960906982, "rewards/margins_min": -0.5580712556838989, "rewards/margins_std": 0.9815530776977539, "rewards/rejected": -1.2579162120819092, "step": 2080 }, { "epoch": 0.5, "grad_norm": 9.840891201940885, "learning_rate": 2.930460969220202e-07, "logits/chosen": -2.2919814586639404, "logits/rejected": -2.184452533721924, "logps/chosen": -328.14556884765625, "logps/rejected": -403.58270263671875, "loss": 0.5983, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8198297619819641, "rewards/margins": 0.49597787857055664, "rewards/margins_max": 1.5857298374176025, "rewards/margins_min": -0.5425015091896057, "rewards/margins_std": 0.9532259106636047, "rewards/rejected": -1.315807580947876, "step": 2090 }, { "epoch": 0.5, "grad_norm": 6.680713668436098, "learning_rate": 2.909858939445584e-07, "logits/chosen": -2.3211870193481445, "logits/rejected": -2.2100892066955566, "logps/chosen": -318.2855529785156, "logps/rejected": -362.3763732910156, "loss": 0.5758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7094635367393494, "rewards/margins": 0.4552193284034729, "rewards/margins_max": 1.4264600276947021, "rewards/margins_min": -0.2823147177696228, "rewards/margins_std": 0.7549293041229248, "rewards/rejected": -1.1646828651428223, "step": 2100 }, { "epoch": 0.5, "eval_logits/chosen": -2.154378890991211, "eval_logits/rejected": -2.049565076828003, "eval_logps/chosen": -342.5803527832031, "eval_logps/rejected": -377.820556640625, "eval_loss": 0.590307354927063, "eval_rewards/accuracies": 0.6865000128746033, "eval_rewards/chosen": -0.7091726660728455, "eval_rewards/margins": 0.4566076397895813, "eval_rewards/margins_max": 2.089244842529297, "eval_rewards/margins_min": -0.6922873258590698, "eval_rewards/margins_std": 0.9199235439300537, "eval_rewards/rejected": -1.1657804250717163, "eval_runtime": 1509.9925, "eval_samples_per_second": 2.649, "eval_steps_per_second": 0.166, "step": 2100 }, { "epoch": 0.51, "grad_norm": 9.129622451312848, "learning_rate": 2.8892282666990894e-07, "logits/chosen": -2.3455488681793213, "logits/rejected": -2.1557815074920654, "logps/chosen": -321.3612365722656, "logps/rejected": -347.50885009765625, "loss": 0.552, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6190410852432251, "rewards/margins": 0.5925929546356201, "rewards/margins_max": 1.5952322483062744, "rewards/margins_min": -0.32552486658096313, "rewards/margins_std": 0.8690015077590942, "rewards/rejected": -1.2116341590881348, "step": 2110 }, { "epoch": 0.51, "grad_norm": 10.685429601177425, "learning_rate": 2.868570392754272e-07, "logits/chosen": -2.3623011112213135, "logits/rejected": -2.288388729095459, "logps/chosen": -397.1722106933594, "logps/rejected": -438.0806579589844, "loss": 0.6129, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8597472906112671, "rewards/margins": 0.3890746533870697, "rewards/margins_max": 1.3771077394485474, "rewards/margins_min": -0.5370277762413025, "rewards/margins_std": 0.8462254405021667, "rewards/rejected": -1.2488219738006592, "step": 2120 }, { "epoch": 0.51, "grad_norm": 9.359169577253168, "learning_rate": 2.8478867612856394e-07, "logits/chosen": -2.297898530960083, "logits/rejected": -2.126291275024414, "logps/chosen": -363.72357177734375, "logps/rejected": -378.7575988769531, "loss": 0.5831, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8068483471870422, "rewards/margins": 0.5872797966003418, "rewards/margins_max": 1.9363224506378174, "rewards/margins_min": -0.4493553042411804, "rewards/margins_std": 1.06607186794281, "rewards/rejected": -1.3941280841827393, "step": 2130 }, { "epoch": 0.51, "grad_norm": 7.108674201380113, "learning_rate": 2.827178817767762e-07, "logits/chosen": -2.2061030864715576, "logits/rejected": -2.0863699913024902, "logps/chosen": -356.1858215332031, "logps/rejected": -369.397705078125, "loss": 0.5677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6814908385276794, "rewards/margins": 0.4906036853790283, "rewards/margins_max": 1.652635931968689, "rewards/margins_min": -0.4360484480857849, "rewards/margins_std": 0.9383756518363953, "rewards/rejected": -1.1720945835113525, "step": 2140 }, { "epoch": 0.51, "grad_norm": 11.976318516146575, "learning_rate": 2.8064480093742565e-07, "logits/chosen": -2.24428129196167, "logits/rejected": -2.2006611824035645, "logps/chosen": -314.3396301269531, "logps/rejected": -370.1969299316406, "loss": 0.5514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7830640077590942, "rewards/margins": 0.5004338026046753, "rewards/margins_max": 1.5879522562026978, "rewards/margins_min": -0.5409722328186035, "rewards/margins_std": 0.9820780754089355, "rewards/rejected": -1.2834978103637695, "step": 2150 }, { "epoch": 0.52, "grad_norm": 8.00576622102123, "learning_rate": 2.7856957848766497e-07, "logits/chosen": -2.243147134780884, "logits/rejected": -2.020589828491211, "logps/chosen": -357.06353759765625, "logps/rejected": -413.5083923339844, "loss": 0.6047, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9727823138237, "rewards/margins": 0.5786701440811157, "rewards/margins_max": 2.0888283252716064, "rewards/margins_min": -0.6575952768325806, "rewards/margins_std": 1.2244327068328857, "rewards/rejected": -1.551452398300171, "step": 2160 }, { "epoch": 0.52, "grad_norm": 8.561026119157132, "learning_rate": 2.7649235945431336e-07, "logits/chosen": -2.1802055835723877, "logits/rejected": -2.081882953643799, "logps/chosen": -352.650146484375, "logps/rejected": -422.68096923828125, "loss": 0.5644, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8436532020568848, "rewards/margins": 0.5075746178627014, "rewards/margins_max": 1.5993292331695557, "rewards/margins_min": -0.4662080705165863, "rewards/margins_std": 0.9233331680297852, "rewards/rejected": -1.351227879524231, "step": 2170 }, { "epoch": 0.52, "grad_norm": 7.275029615623783, "learning_rate": 2.74413289003721e-07, "logits/chosen": -2.296869993209839, "logits/rejected": -2.1813907623291016, "logps/chosen": -346.771240234375, "logps/rejected": -413.9910583496094, "loss": 0.5804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8352276682853699, "rewards/margins": 0.6454394459724426, "rewards/margins_max": 2.04996657371521, "rewards/margins_min": -0.5391429662704468, "rewards/margins_std": 1.1311023235321045, "rewards/rejected": -1.480666995048523, "step": 2180 }, { "epoch": 0.52, "grad_norm": 8.101717882128552, "learning_rate": 2.7233251243162434e-07, "logits/chosen": -2.3574635982513428, "logits/rejected": -2.238738536834717, "logps/chosen": -405.80853271484375, "logps/rejected": -417.9353942871094, "loss": 0.601, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8843311071395874, "rewards/margins": 0.40951403975486755, "rewards/margins_max": 1.348105549812317, "rewards/margins_min": -0.3659106492996216, "rewards/margins_std": 0.7601439356803894, "rewards/rejected": -1.2938449382781982, "step": 2190 }, { "epoch": 0.53, "grad_norm": 7.956278777089184, "learning_rate": 2.7025017515299207e-07, "logits/chosen": -2.2218921184539795, "logits/rejected": -2.1124062538146973, "logps/chosen": -350.0830993652344, "logps/rejected": -363.72637939453125, "loss": 0.5821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8745296597480774, "rewards/margins": 0.3519309163093567, "rewards/margins_max": 1.6782519817352295, "rewards/margins_min": -0.5373163819313049, "rewards/margins_std": 1.0014039278030396, "rewards/rejected": -1.2264604568481445, "step": 2200 }, { "epoch": 0.53, "eval_logits/chosen": -2.1375954151153564, "eval_logits/rejected": -2.0335936546325684, "eval_logps/chosen": -353.47235107421875, "eval_logps/rejected": -392.70196533203125, "eval_loss": 0.5914161205291748, "eval_rewards/accuracies": 0.6769999861717224, "eval_rewards/chosen": -0.8180926442146301, "eval_rewards/margins": 0.49650150537490845, "eval_rewards/margins_max": 2.2775261402130127, "eval_rewards/margins_min": -0.7769955992698669, "eval_rewards/margins_std": 1.0092629194259644, "eval_rewards/rejected": -1.3145942687988281, "eval_runtime": 1564.5041, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.16, "step": 2200 }, { "epoch": 0.53, "grad_norm": 8.994820269855964, "learning_rate": 2.6816642269186275e-07, "logits/chosen": -2.1785659790039062, "logits/rejected": -2.1119942665100098, "logps/chosen": -331.712890625, "logps/rejected": -363.6921691894531, "loss": 0.5937, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8564426302909851, "rewards/margins": 0.36001232266426086, "rewards/margins_max": 1.2967666387557983, "rewards/margins_min": -0.4843088686466217, "rewards/margins_std": 0.7921321988105774, "rewards/rejected": -1.216455101966858, "step": 2210 }, { "epoch": 0.53, "grad_norm": 9.14840872847205, "learning_rate": 2.660814006711748e-07, "logits/chosen": -2.1119871139526367, "logits/rejected": -2.163943290710449, "logps/chosen": -334.981201171875, "logps/rejected": -426.54571533203125, "loss": 0.5694, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8568441271781921, "rewards/margins": 0.4937872290611267, "rewards/margins_max": 1.4551918506622314, "rewards/margins_min": -0.41088181734085083, "rewards/margins_std": 0.8557828068733215, "rewards/rejected": -1.3506313562393188, "step": 2220 }, { "epoch": 0.53, "grad_norm": 7.2714521113808805, "learning_rate": 2.639952548025899e-07, "logits/chosen": -2.23884916305542, "logits/rejected": -2.078699827194214, "logps/chosen": -358.21514892578125, "logps/rejected": -387.00469970703125, "loss": 0.5732, "rewards/accuracies": 0.625, "rewards/chosen": -0.7057204842567444, "rewards/margins": 0.6662472486495972, "rewards/margins_max": 2.4422647953033447, "rewards/margins_min": -0.4950762689113617, "rewards/margins_std": 1.3406832218170166, "rewards/rejected": -1.3719676733016968, "step": 2230 }, { "epoch": 0.54, "grad_norm": 8.53442949566087, "learning_rate": 2.619081308763097e-07, "logits/chosen": -2.2453503608703613, "logits/rejected": -2.123828887939453, "logps/chosen": -360.81982421875, "logps/rejected": -395.600830078125, "loss": 0.5448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7899690270423889, "rewards/margins": 0.5292414426803589, "rewards/margins_max": 1.5981518030166626, "rewards/margins_min": -0.3389052748680115, "rewards/margins_std": 0.8711957931518555, "rewards/rejected": -1.3192105293273926, "step": 2240 }, { "epoch": 0.54, "grad_norm": 9.612240743923376, "learning_rate": 2.598201747508875e-07, "logits/chosen": -2.2830798625946045, "logits/rejected": -2.2598392963409424, "logps/chosen": -371.29473876953125, "logps/rejected": -408.0329895019531, "loss": 0.5714, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6862138509750366, "rewards/margins": 0.47710585594177246, "rewards/margins_max": 1.3919486999511719, "rewards/margins_min": -0.3695757985115051, "rewards/margins_std": 0.7841047644615173, "rewards/rejected": -1.1633195877075195, "step": 2250 }, { "epoch": 0.54, "grad_norm": 7.598506238604103, "learning_rate": 2.577315323430346e-07, "logits/chosen": -2.312260150909424, "logits/rejected": -2.1881906986236572, "logps/chosen": -350.95452880859375, "logps/rejected": -414.6181640625, "loss": 0.5487, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7382746934890747, "rewards/margins": 0.6465110182762146, "rewards/margins_max": 1.7670600414276123, "rewards/margins_min": -0.3906131386756897, "rewards/margins_std": 0.9748347997665405, "rewards/rejected": -1.384785532951355, "step": 2260 }, { "epoch": 0.54, "grad_norm": 10.758113549368833, "learning_rate": 2.5564234961742315e-07, "logits/chosen": -2.2698960304260254, "logits/rejected": -2.127373218536377, "logps/chosen": -397.01715087890625, "logps/rejected": -435.0760803222656, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7826915979385376, "rewards/margins": 0.6340346336364746, "rewards/margins_max": 1.9195044040679932, "rewards/margins_min": -0.4933221936225891, "rewards/margins_std": 1.0657789707183838, "rewards/rejected": -1.4167262315750122, "step": 2270 }, { "epoch": 0.55, "grad_norm": 6.781284877142453, "learning_rate": 2.5355277257648553e-07, "logits/chosen": -2.3609373569488525, "logits/rejected": -2.226810932159424, "logps/chosen": -340.54168701171875, "logps/rejected": -374.62347412109375, "loss": 0.6089, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8493808507919312, "rewards/margins": 0.32950735092163086, "rewards/margins_max": 1.4013553857803345, "rewards/margins_min": -0.7397369742393494, "rewards/margins_std": 0.9628899693489075, "rewards/rejected": -1.178888201713562, "step": 2280 }, { "epoch": 0.55, "grad_norm": 9.013221777426873, "learning_rate": 2.514629472502108e-07, "logits/chosen": -2.2310891151428223, "logits/rejected": -2.1314873695373535, "logps/chosen": -377.0622863769531, "logps/rejected": -402.41497802734375, "loss": 0.5353, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6762335896492004, "rewards/margins": 0.5654059052467346, "rewards/margins_max": 1.5738112926483154, "rewards/margins_min": -0.2663133144378662, "rewards/margins_std": 0.8062018156051636, "rewards/rejected": -1.2416393756866455, "step": 2290 }, { "epoch": 0.55, "grad_norm": 6.794837663857307, "learning_rate": 2.4937301968593915e-07, "logits/chosen": -2.2112669944763184, "logits/rejected": -2.1359317302703857, "logps/chosen": -318.8021545410156, "logps/rejected": -377.1285095214844, "loss": 0.5703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7856169939041138, "rewards/margins": 0.594869077205658, "rewards/margins_max": 1.7734527587890625, "rewards/margins_min": -0.3492550551891327, "rewards/margins_std": 0.9743860363960266, "rewards/rejected": -1.3804861307144165, "step": 2300 }, { "epoch": 0.55, "eval_logits/chosen": -2.1370773315429688, "eval_logits/rejected": -2.033658027648926, "eval_logps/chosen": -355.53326416015625, "eval_logps/rejected": -396.2969665527344, "eval_loss": 0.5908156633377075, "eval_rewards/accuracies": 0.6865000128746033, "eval_rewards/chosen": -0.8387019038200378, "eval_rewards/margins": 0.5118422508239746, "eval_rewards/margins_max": 2.351066827774048, "eval_rewards/margins_min": -0.790500819683075, "eval_rewards/margins_std": 1.0380847454071045, "eval_rewards/rejected": -1.3505440950393677, "eval_runtime": 1601.0999, "eval_samples_per_second": 2.498, "eval_steps_per_second": 0.156, "step": 2300 }, { "epoch": 0.55, "grad_norm": 8.55474769884641, "learning_rate": 2.47283135938156e-07, "logits/chosen": -2.3023080825805664, "logits/rejected": -2.201951503753662, "logps/chosen": -330.50457763671875, "logps/rejected": -361.70074462890625, "loss": 0.6086, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7735743522644043, "rewards/margins": 0.5472443699836731, "rewards/margins_max": 1.4782692193984985, "rewards/margins_min": -0.36417728662490845, "rewards/margins_std": 0.8189528584480286, "rewards/rejected": -1.3208186626434326, "step": 2310 }, { "epoch": 0.56, "grad_norm": 5.726884570048719, "learning_rate": 2.451934420582846e-07, "logits/chosen": -2.295928955078125, "logits/rejected": -2.1835927963256836, "logps/chosen": -352.1939697265625, "logps/rejected": -414.6656799316406, "loss": 0.5698, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8815978765487671, "rewards/margins": 0.6570836901664734, "rewards/margins_max": 2.1854469776153564, "rewards/margins_min": -0.5321916341781616, "rewards/margins_std": 1.2462656497955322, "rewards/rejected": -1.5386816263198853, "step": 2320 }, { "epoch": 0.56, "grad_norm": 8.430150426787277, "learning_rate": 2.4310408408447903e-07, "logits/chosen": -2.2279810905456543, "logits/rejected": -2.081627368927002, "logps/chosen": -302.0141906738281, "logps/rejected": -340.5296630859375, "loss": 0.6657, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7267032861709595, "rewards/margins": 0.48589572310447693, "rewards/margins_max": 1.6361875534057617, "rewards/margins_min": -0.5151346921920776, "rewards/margins_std": 0.9902347326278687, "rewards/rejected": -1.2125990390777588, "step": 2330 }, { "epoch": 0.56, "grad_norm": 6.903279904328087, "learning_rate": 2.41015208031419e-07, "logits/chosen": -2.3411154747009277, "logits/rejected": -2.1315674781799316, "logps/chosen": -354.97784423828125, "logps/rejected": -372.99932861328125, "loss": 0.5764, "rewards/accuracies": 0.625, "rewards/chosen": -0.8175373077392578, "rewards/margins": 0.3957211971282959, "rewards/margins_max": 1.6154868602752686, "rewards/margins_min": -0.5010572671890259, "rewards/margins_std": 0.9424025416374207, "rewards/rejected": -1.2132585048675537, "step": 2340 }, { "epoch": 0.56, "grad_norm": 10.473000099260076, "learning_rate": 2.389269598801048e-07, "logits/chosen": -2.3265719413757324, "logits/rejected": -2.0716915130615234, "logps/chosen": -352.95025634765625, "logps/rejected": -359.391357421875, "loss": 0.5602, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7586410045623779, "rewards/margins": 0.5185399055480957, "rewards/margins_max": 1.669735312461853, "rewards/margins_min": -0.5012242197990417, "rewards/margins_std": 0.9487419128417969, "rewards/rejected": -1.2771809101104736, "step": 2350 }, { "epoch": 0.57, "grad_norm": 7.148878273637695, "learning_rate": 2.3683948556765624e-07, "logits/chosen": -2.2011232376098633, "logits/rejected": -2.082606554031372, "logps/chosen": -312.028076171875, "logps/rejected": -350.52154541015625, "loss": 0.588, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6675496101379395, "rewards/margins": 0.5226207971572876, "rewards/margins_max": 1.435473918914795, "rewards/margins_min": -0.308535635471344, "rewards/margins_std": 0.7646826505661011, "rewards/rejected": -1.1901702880859375, "step": 2360 }, { "epoch": 0.57, "grad_norm": 10.822150185214936, "learning_rate": 2.34752930977113e-07, "logits/chosen": -2.166903018951416, "logits/rejected": -2.127718448638916, "logps/chosen": -340.00079345703125, "logps/rejected": -395.65087890625, "loss": 0.5702, "rewards/accuracies": 0.625, "rewards/chosen": -0.7993613481521606, "rewards/margins": 0.470763623714447, "rewards/margins_max": 1.8239552974700928, "rewards/margins_min": -0.436536967754364, "rewards/margins_std": 1.0380691289901733, "rewards/rejected": -1.270124912261963, "step": 2370 }, { "epoch": 0.57, "grad_norm": 7.345587058292688, "learning_rate": 2.3266744192724052e-07, "logits/chosen": -2.17414927482605, "logits/rejected": -2.0803465843200684, "logps/chosen": -366.0765075683594, "logps/rejected": -395.1354064941406, "loss": 0.5827, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8468583226203918, "rewards/margins": 0.49055153131484985, "rewards/margins_max": 1.517333984375, "rewards/margins_min": -0.5585001707077026, "rewards/margins_std": 0.9400030970573425, "rewards/rejected": -1.3374097347259521, "step": 2380 }, { "epoch": 0.57, "grad_norm": 5.554964082809511, "learning_rate": 2.3058316416233864e-07, "logits/chosen": -2.250479221343994, "logits/rejected": -2.1593432426452637, "logps/chosen": -329.5320129394531, "logps/rejected": -383.1172790527344, "loss": 0.5301, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6815643310546875, "rewards/margins": 0.5962087512016296, "rewards/margins_max": 1.8362629413604736, "rewards/margins_min": -0.4838317930698395, "rewards/margins_std": 1.0325170755386353, "rewards/rejected": -1.277773141860962, "step": 2390 }, { "epoch": 0.57, "grad_norm": 10.656720371995977, "learning_rate": 2.2850024334205654e-07, "logits/chosen": -2.214466094970703, "logits/rejected": -2.113741636276245, "logps/chosen": -347.58795166015625, "logps/rejected": -378.8655090332031, "loss": 0.5852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8682153820991516, "rewards/margins": 0.45568814873695374, "rewards/margins_max": 1.7477096319198608, "rewards/margins_min": -0.7294108867645264, "rewards/margins_std": 1.0804404020309448, "rewards/rejected": -1.3239034414291382, "step": 2400 }, { "epoch": 0.57, "eval_logits/chosen": -2.143878221511841, "eval_logits/rejected": -2.038877010345459, "eval_logps/chosen": -345.3100280761719, "eval_logps/rejected": -384.9013977050781, "eval_loss": 0.5860528349876404, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": -0.7364694476127625, "eval_rewards/margins": 0.5001190900802612, "eval_rewards/margins_max": 2.2801120281219482, "eval_rewards/margins_min": -0.7261773943901062, "eval_rewards/margins_std": 0.9931062459945679, "eval_rewards/rejected": -1.236588478088379, "eval_runtime": 1565.8564, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.16, "step": 2400 }, { "epoch": 0.58, "grad_norm": 11.321317324063859, "learning_rate": 2.264188250312138e-07, "logits/chosen": -2.252739667892456, "logits/rejected": -2.054727077484131, "logps/chosen": -344.7042236328125, "logps/rejected": -349.3314208984375, "loss": 0.5246, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7048185467720032, "rewards/margins": 0.575404942035675, "rewards/margins_max": 1.6823570728302002, "rewards/margins_min": -0.25047996640205383, "rewards/margins_std": 0.852095901966095, "rewards/rejected": -1.2802236080169678, "step": 2410 }, { "epoch": 0.58, "grad_norm": 5.85497686497184, "learning_rate": 2.2433905468962674e-07, "logits/chosen": -2.317852735519409, "logits/rejected": -2.1942315101623535, "logps/chosen": -371.1949768066406, "logps/rejected": -390.58770751953125, "loss": 0.5359, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8063470125198364, "rewards/margins": 0.5994687676429749, "rewards/margins_max": 1.6925287246704102, "rewards/margins_min": -0.2816247344017029, "rewards/margins_std": 0.8841756582260132, "rewards/rejected": -1.405815839767456, "step": 2420 }, { "epoch": 0.58, "grad_norm": 9.717142935106443, "learning_rate": 2.222610776619439e-07, "logits/chosen": -2.307480573654175, "logits/rejected": -2.1449503898620605, "logps/chosen": -361.1840515136719, "logps/rejected": -385.5247802734375, "loss": 0.5289, "rewards/accuracies": 0.6875, "rewards/chosen": -0.801993191242218, "rewards/margins": 0.6295140981674194, "rewards/margins_max": 1.947885513305664, "rewards/margins_min": -0.4305938184261322, "rewards/margins_std": 1.0765188932418823, "rewards/rejected": -1.4315072298049927, "step": 2430 }, { "epoch": 0.58, "grad_norm": 7.143425924270992, "learning_rate": 2.201850391674877e-07, "logits/chosen": -2.330867052078247, "logits/rejected": -2.1417508125305176, "logps/chosen": -377.4809265136719, "logps/rejected": -407.08599853515625, "loss": 0.5339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9291650652885437, "rewards/margins": 0.6469839811325073, "rewards/margins_max": 1.9287936687469482, "rewards/margins_min": -0.4411528706550598, "rewards/margins_std": 1.079876184463501, "rewards/rejected": -1.5761489868164062, "step": 2440 }, { "epoch": 0.59, "grad_norm": 22.173365601320963, "learning_rate": 2.181110842901066e-07, "logits/chosen": -2.2900538444519043, "logits/rejected": -2.10183048248291, "logps/chosen": -361.0449523925781, "logps/rejected": -392.08544921875, "loss": 0.5933, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9875124096870422, "rewards/margins": 0.5746131539344788, "rewards/margins_max": 1.8985391855239868, "rewards/margins_min": -0.5715238451957703, "rewards/margins_std": 1.1146481037139893, "rewards/rejected": -1.562125563621521, "step": 2450 }, { "epoch": 0.59, "grad_norm": 10.133649734108326, "learning_rate": 2.160393579680353e-07, "logits/chosen": -2.2130608558654785, "logits/rejected": -2.1114115715026855, "logps/chosen": -351.90057373046875, "logps/rejected": -439.4019470214844, "loss": 0.5249, "rewards/accuracies": 0.75, "rewards/chosen": -1.0215137004852295, "rewards/margins": 0.8079718351364136, "rewards/margins_max": 2.273721218109131, "rewards/margins_min": -0.20698101818561554, "rewards/margins_std": 1.1034561395645142, "rewards/rejected": -1.8294856548309326, "step": 2460 }, { "epoch": 0.59, "grad_norm": 7.78260246074613, "learning_rate": 2.1397000498376634e-07, "logits/chosen": -2.2493505477905273, "logits/rejected": -2.059276580810547, "logps/chosen": -375.99200439453125, "logps/rejected": -418.6184997558594, "loss": 0.5481, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.072912335395813, "rewards/margins": 0.5297819375991821, "rewards/margins_max": 1.7399168014526367, "rewards/margins_min": -0.47067588567733765, "rewards/margins_std": 0.9852256774902344, "rewards/rejected": -1.6026942729949951, "step": 2470 }, { "epoch": 0.59, "grad_norm": 12.558248683050975, "learning_rate": 2.1190316995393144e-07, "logits/chosen": -2.260863780975342, "logits/rejected": -2.030372381210327, "logps/chosen": -349.14813232421875, "logps/rejected": -366.12872314453125, "loss": 0.56, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.998975932598114, "rewards/margins": 0.5108473896980286, "rewards/margins_max": 1.6825597286224365, "rewards/margins_min": -0.5741672515869141, "rewards/margins_std": 1.0053203105926514, "rewards/rejected": -1.5098233222961426, "step": 2480 }, { "epoch": 0.6, "grad_norm": 8.44287209623588, "learning_rate": 2.098389973191953e-07, "logits/chosen": -2.234262466430664, "logits/rejected": -2.1752798557281494, "logps/chosen": -380.1053771972656, "logps/rejected": -433.7974548339844, "loss": 0.555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9529165029525757, "rewards/margins": 0.5886826515197754, "rewards/margins_max": 1.7806297540664673, "rewards/margins_min": -0.4360727369785309, "rewards/margins_std": 0.9865517616271973, "rewards/rejected": -1.5415990352630615, "step": 2490 }, { "epoch": 0.6, "grad_norm": 8.778687887318311, "learning_rate": 2.0777763133416118e-07, "logits/chosen": -2.3188843727111816, "logits/rejected": -2.0748417377471924, "logps/chosen": -340.01092529296875, "logps/rejected": -372.6244201660156, "loss": 0.5554, "rewards/accuracies": 0.75, "rewards/chosen": -0.8833228945732117, "rewards/margins": 0.7618860006332397, "rewards/margins_max": 1.9406477212905884, "rewards/margins_min": -0.12824508547782898, "rewards/margins_std": 0.9326462745666504, "rewards/rejected": -1.6452089548110962, "step": 2500 }, { "epoch": 0.6, "eval_logits/chosen": -2.1346750259399414, "eval_logits/rejected": -2.0290520191192627, "eval_logps/chosen": -352.1300354003906, "eval_logps/rejected": -396.08642578125, "eval_loss": 0.5850762724876404, "eval_rewards/accuracies": 0.6869999766349792, "eval_rewards/chosen": -0.8046696186065674, "eval_rewards/margins": 0.5437690615653992, "eval_rewards/margins_max": 2.488455057144165, "eval_rewards/margins_min": -0.78057461977005, "eval_rewards/margins_std": 1.0799211263656616, "eval_rewards/rejected": -1.3484388589859009, "eval_runtime": 1519.1623, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.165, "step": 2500 }, { "epoch": 0.6, "grad_norm": 6.255298352341528, "learning_rate": 2.057192160572898e-07, "logits/chosen": -2.3098647594451904, "logits/rejected": -2.130199670791626, "logps/chosen": -333.4775695800781, "logps/rejected": -375.1148986816406, "loss": 0.6122, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7306645512580872, "rewards/margins": 0.4927193224430084, "rewards/margins_max": 1.4374754428863525, "rewards/margins_min": -0.43880996108055115, "rewards/margins_std": 0.8438846468925476, "rewards/rejected": -1.2233836650848389, "step": 2510 }, { "epoch": 0.6, "grad_norm": 9.279221438272087, "learning_rate": 2.0366389534083185e-07, "logits/chosen": -2.2382874488830566, "logits/rejected": -2.103217124938965, "logps/chosen": -338.45806884765625, "logps/rejected": -370.3023376464844, "loss": 0.5592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6555181741714478, "rewards/margins": 0.6042823791503906, "rewards/margins_max": 1.8908348083496094, "rewards/margins_min": -0.39471834897994995, "rewards/margins_std": 1.0250580310821533, "rewards/rejected": -1.2598004341125488, "step": 2520 }, { "epoch": 0.61, "grad_norm": 8.121828369852194, "learning_rate": 2.0161181282077469e-07, "logits/chosen": -2.214073896408081, "logits/rejected": -2.201019525527954, "logps/chosen": -302.781005859375, "logps/rejected": -379.20294189453125, "loss": 0.5524, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.738277792930603, "rewards/margins": 0.652738630771637, "rewards/margins_max": 1.7349151372909546, "rewards/margins_min": -0.22372667491436005, "rewards/margins_std": 0.8831348419189453, "rewards/rejected": -1.3910163640975952, "step": 2530 }, { "epoch": 0.61, "grad_norm": 8.924192110997184, "learning_rate": 1.9956311190680468e-07, "logits/chosen": -2.202094554901123, "logits/rejected": -2.141479253768921, "logps/chosen": -337.9544982910156, "logps/rejected": -414.3702697753906, "loss": 0.5674, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9090129137039185, "rewards/margins": 0.5950706601142883, "rewards/margins_max": 1.922425627708435, "rewards/margins_min": -0.5352715253829956, "rewards/margins_std": 1.0928497314453125, "rewards/rejected": -1.504083514213562, "step": 2540 }, { "epoch": 0.61, "grad_norm": 7.524762030098968, "learning_rate": 1.9751793577228455e-07, "logits/chosen": -2.2999870777130127, "logits/rejected": -2.2381348609924316, "logps/chosen": -367.44476318359375, "logps/rejected": -420.818359375, "loss": 0.5536, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8627947568893433, "rewards/margins": 0.6434819102287292, "rewards/margins_max": 2.022932529449463, "rewards/margins_min": -0.5195577144622803, "rewards/margins_std": 1.1449410915374756, "rewards/rejected": -1.5062767267227173, "step": 2550 }, { "epoch": 0.61, "grad_norm": 9.223010717589856, "learning_rate": 1.9547642734424823e-07, "logits/chosen": -2.1504464149475098, "logits/rejected": -2.1337990760803223, "logps/chosen": -355.55328369140625, "logps/rejected": -436.2899475097656, "loss": 0.5524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8137137293815613, "rewards/margins": 0.8529330492019653, "rewards/margins_max": 2.391366720199585, "rewards/margins_min": -0.34231704473495483, "rewards/margins_std": 1.220885992050171, "rewards/rejected": -1.6666467189788818, "step": 2560 }, { "epoch": 0.62, "grad_norm": 13.106545689730464, "learning_rate": 1.9343872929341196e-07, "logits/chosen": -2.2564780712127686, "logits/rejected": -2.210069179534912, "logps/chosen": -354.81927490234375, "logps/rejected": -416.82861328125, "loss": 0.5376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7428320050239563, "rewards/margins": 0.6815324425697327, "rewards/margins_max": 2.213290214538574, "rewards/margins_min": -0.45670971274375916, "rewards/margins_std": 1.1947391033172607, "rewards/rejected": -1.4243643283843994, "step": 2570 }, { "epoch": 0.62, "grad_norm": 10.093453701146476, "learning_rate": 1.9140498402420416e-07, "logits/chosen": -2.231541872024536, "logits/rejected": -2.182448148727417, "logps/chosen": -376.1432189941406, "logps/rejected": -446.85162353515625, "loss": 0.5543, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8583730459213257, "rewards/margins": 0.7337325215339661, "rewards/margins_max": 2.128575086593628, "rewards/margins_min": -0.39311498403549194, "rewards/margins_std": 1.1432095766067505, "rewards/rejected": -1.5921056270599365, "step": 2580 }, { "epoch": 0.62, "grad_norm": 10.396047178424844, "learning_rate": 1.8937533366481308e-07, "logits/chosen": -2.219251871109009, "logits/rejected": -2.167388916015625, "logps/chosen": -343.32586669921875, "logps/rejected": -408.667724609375, "loss": 0.5631, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9631455540657043, "rewards/margins": 0.5252482295036316, "rewards/margins_max": 1.5739165544509888, "rewards/margins_min": -0.42900729179382324, "rewards/margins_std": 0.9175116419792175, "rewards/rejected": -1.4883939027786255, "step": 2590 }, { "epoch": 0.62, "grad_norm": 11.942094967054501, "learning_rate": 1.8734992005725463e-07, "logits/chosen": -2.0592405796051025, "logits/rejected": -2.061183452606201, "logps/chosen": -376.0555114746094, "logps/rejected": -455.09759521484375, "loss": 0.5772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0244112014770508, "rewards/margins": 0.7232916951179504, "rewards/margins_max": 2.389145851135254, "rewards/margins_min": -0.6807640194892883, "rewards/margins_std": 1.3609912395477295, "rewards/rejected": -1.747702956199646, "step": 2600 }, { "epoch": 0.62, "eval_logits/chosen": -2.134948492050171, "eval_logits/rejected": -2.029139995574951, "eval_logps/chosen": -355.49664306640625, "eval_logps/rejected": -401.94158935546875, "eval_loss": 0.5848167538642883, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": -0.8383359313011169, "eval_rewards/margins": 0.5686543583869934, "eval_rewards/margins_max": 2.6189122200012207, "eval_rewards/margins_min": -0.809954822063446, "eval_rewards/margins_std": 1.1308537721633911, "eval_rewards/rejected": -1.4069902896881104, "eval_runtime": 1603.3361, "eval_samples_per_second": 2.495, "eval_steps_per_second": 0.156, "step": 2600 }, { "epoch": 0.62, "grad_norm": 7.481512553643951, "learning_rate": 1.853288847474594e-07, "logits/chosen": -2.2641968727111816, "logits/rejected": -2.1381983757019043, "logps/chosen": -385.5421142578125, "logps/rejected": -414.9905700683594, "loss": 0.5434, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9095985293388367, "rewards/margins": 0.552988588809967, "rewards/margins_max": 1.8610680103302002, "rewards/margins_min": -0.38848552107810974, "rewards/margins_std": 1.011828064918518, "rewards/rejected": -1.4625871181488037, "step": 2610 }, { "epoch": 0.63, "grad_norm": 8.415431117655, "learning_rate": 1.8331236897538065e-07, "logits/chosen": -2.2634575366973877, "logits/rejected": -2.2281742095947266, "logps/chosen": -355.7486877441406, "logps/rejected": -412.7940979003906, "loss": 0.6055, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8892784118652344, "rewards/margins": 0.5209530591964722, "rewards/margins_max": 1.837864875793457, "rewards/margins_min": -0.6562310457229614, "rewards/margins_std": 1.1063191890716553, "rewards/rejected": -1.410231351852417, "step": 2620 }, { "epoch": 0.63, "grad_norm": 7.236862644732225, "learning_rate": 1.8130051366512447e-07, "logits/chosen": -2.2386887073516846, "logits/rejected": -2.0710043907165527, "logps/chosen": -352.4814758300781, "logps/rejected": -436.6332092285156, "loss": 0.5508, "rewards/accuracies": 0.75, "rewards/chosen": -0.7595894932746887, "rewards/margins": 0.7041056156158447, "rewards/margins_max": 2.0144927501678467, "rewards/margins_min": -0.29318708181381226, "rewards/margins_std": 1.0378332138061523, "rewards/rejected": -1.4636952877044678, "step": 2630 }, { "epoch": 0.63, "grad_norm": 11.50118415375028, "learning_rate": 1.792934594151003e-07, "logits/chosen": -2.391502857208252, "logits/rejected": -2.300816059112549, "logps/chosen": -341.96954345703125, "logps/rejected": -369.87335205078125, "loss": 0.6071, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8969739675521851, "rewards/margins": 0.41168031096458435, "rewards/margins_max": 1.6322259902954102, "rewards/margins_min": -0.6757373809814453, "rewards/margins_std": 1.0282056331634521, "rewards/rejected": -1.3086541891098022, "step": 2640 }, { "epoch": 0.63, "grad_norm": 8.358140515594242, "learning_rate": 1.7729134648819605e-07, "logits/chosen": -2.172358274459839, "logits/rejected": -2.0474321842193604, "logps/chosen": -311.364990234375, "logps/rejected": -355.5836181640625, "loss": 0.5633, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7487236261367798, "rewards/margins": 0.554851233959198, "rewards/margins_max": 1.915903091430664, "rewards/margins_min": -0.4874037802219391, "rewards/margins_std": 1.0828654766082764, "rewards/rejected": -1.303574800491333, "step": 2650 }, { "epoch": 0.64, "grad_norm": 8.886210493707459, "learning_rate": 1.7529431480197533e-07, "logits/chosen": -2.284759998321533, "logits/rejected": -2.139875888824463, "logps/chosen": -342.4052734375, "logps/rejected": -399.605712890625, "loss": 0.5661, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8138421177864075, "rewards/margins": 0.5458594560623169, "rewards/margins_max": 1.8594259023666382, "rewards/margins_min": -0.49750369787216187, "rewards/margins_std": 1.0364296436309814, "rewards/rejected": -1.3597016334533691, "step": 2660 }, { "epoch": 0.64, "grad_norm": 8.22023559603948, "learning_rate": 1.7330250391889961e-07, "logits/chosen": -2.3451480865478516, "logits/rejected": -2.070934534072876, "logps/chosen": -344.791259765625, "logps/rejected": -349.4190979003906, "loss": 0.5587, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7241935133934021, "rewards/margins": 0.6361091136932373, "rewards/margins_max": 1.739561676979065, "rewards/margins_min": -0.24547407031059265, "rewards/margins_std": 0.8842795491218567, "rewards/rejected": -1.3603025674819946, "step": 2670 }, { "epoch": 0.64, "grad_norm": 9.491538043625688, "learning_rate": 1.713160530365747e-07, "logits/chosen": -2.3210301399230957, "logits/rejected": -2.275268316268921, "logps/chosen": -369.0946960449219, "logps/rejected": -405.9383850097656, "loss": 0.6011, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8611409068107605, "rewards/margins": 0.3265928328037262, "rewards/margins_max": 1.300518274307251, "rewards/margins_min": -0.748623251914978, "rewards/margins_std": 0.9056294560432434, "rewards/rejected": -1.187733769416809, "step": 2680 }, { "epoch": 0.64, "grad_norm": 9.163795252656902, "learning_rate": 1.693351009780231e-07, "logits/chosen": -2.244758129119873, "logits/rejected": -2.081564426422119, "logps/chosen": -358.62750244140625, "logps/rejected": -397.1409606933594, "loss": 0.57, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8103052973747253, "rewards/margins": 0.6798052787780762, "rewards/margins_max": 2.06199312210083, "rewards/margins_min": -0.4610458016395569, "rewards/margins_std": 1.1559914350509644, "rewards/rejected": -1.4901106357574463, "step": 2690 }, { "epoch": 0.65, "grad_norm": 8.016334071037495, "learning_rate": 1.6735978618198215e-07, "logits/chosen": -2.305821657180786, "logits/rejected": -2.280149459838867, "logps/chosen": -315.2251892089844, "logps/rejected": -394.9398498535156, "loss": 0.5886, "rewards/accuracies": 0.625, "rewards/chosen": -0.7464590072631836, "rewards/margins": 0.39061635732650757, "rewards/margins_max": 1.4922980070114136, "rewards/margins_min": -0.5402814149856567, "rewards/margins_std": 0.9152474403381348, "rewards/rejected": -1.137075424194336, "step": 2700 }, { "epoch": 0.65, "eval_logits/chosen": -2.1470866203308105, "eval_logits/rejected": -2.0422286987304688, "eval_logps/chosen": -346.0683898925781, "eval_logps/rejected": -389.058349609375, "eval_loss": 0.5817098617553711, "eval_rewards/accuracies": 0.690500020980835, "eval_rewards/chosen": -0.7440533638000488, "eval_rewards/margins": 0.5341048240661621, "eval_rewards/margins_max": 2.4311182498931885, "eval_rewards/margins_min": -0.7282640337944031, "eval_rewards/margins_std": 1.043923258781433, "eval_rewards/rejected": -1.278158187866211, "eval_runtime": 1511.0452, "eval_samples_per_second": 2.647, "eval_steps_per_second": 0.165, "step": 2700 }, { "epoch": 0.65, "grad_norm": 8.898370194955076, "learning_rate": 1.6539024669322954e-07, "logits/chosen": -2.2500927448272705, "logits/rejected": -2.1628692150115967, "logps/chosen": -355.16265869140625, "logps/rejected": -393.99542236328125, "loss": 0.5655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7464293241500854, "rewards/margins": 0.6963762640953064, "rewards/margins_max": 1.910645842552185, "rewards/margins_min": -0.22545702755451202, "rewards/margins_std": 0.9582613110542297, "rewards/rejected": -1.4428056478500366, "step": 2710 }, { "epoch": 0.65, "grad_norm": 8.448233100305048, "learning_rate": 1.6342662015293584e-07, "logits/chosen": -2.2581653594970703, "logits/rejected": -2.0949902534484863, "logps/chosen": -366.9489440917969, "logps/rejected": -393.984130859375, "loss": 0.6299, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.751304566860199, "rewards/margins": 0.5684714913368225, "rewards/margins_max": 1.7647361755371094, "rewards/margins_min": -0.4300934672355652, "rewards/margins_std": 0.980744481086731, "rewards/rejected": -1.3197760581970215, "step": 2720 }, { "epoch": 0.65, "grad_norm": 17.263509132230148, "learning_rate": 1.6146904378904536e-07, "logits/chosen": -2.4308507442474365, "logits/rejected": -2.310580015182495, "logps/chosen": -411.7900390625, "logps/rejected": -437.73797607421875, "loss": 0.6281, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8868436813354492, "rewards/margins": 0.3524634540081024, "rewards/margins_max": 1.5443838834762573, "rewards/margins_min": -0.7296649813652039, "rewards/margins_std": 1.022323727607727, "rewards/rejected": -1.239307165145874, "step": 2730 }, { "epoch": 0.66, "grad_norm": 10.897217953719169, "learning_rate": 1.5951765440668635e-07, "logits/chosen": -2.3328700065612793, "logits/rejected": -2.144836187362671, "logps/chosen": -356.7285461425781, "logps/rejected": -367.534423828125, "loss": 0.5423, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6329866647720337, "rewards/margins": 0.6544613838195801, "rewards/margins_max": 2.148946762084961, "rewards/margins_min": -0.3031309247016907, "rewards/margins_std": 1.1030254364013672, "rewards/rejected": -1.2874481678009033, "step": 2740 }, { "epoch": 0.66, "grad_norm": 6.713005697569865, "learning_rate": 1.5757258837860998e-07, "logits/chosen": -2.248119592666626, "logits/rejected": -2.1280505657196045, "logps/chosen": -348.53314208984375, "logps/rejected": -400.66278076171875, "loss": 0.5626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8887054324150085, "rewards/margins": 0.7401953935623169, "rewards/margins_max": 2.0942726135253906, "rewards/margins_min": -0.38710081577301025, "rewards/margins_std": 1.11003577709198, "rewards/rejected": -1.6289007663726807, "step": 2750 }, { "epoch": 0.66, "grad_norm": 7.472227172975976, "learning_rate": 1.5563398163566034e-07, "logits/chosen": -2.20658802986145, "logits/rejected": -2.165048837661743, "logps/chosen": -340.1798095703125, "logps/rejected": -428.27117919921875, "loss": 0.5194, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8839414715766907, "rewards/margins": 0.6610814929008484, "rewards/margins_max": 1.9468320608139038, "rewards/margins_min": -0.24807903170585632, "rewards/margins_std": 0.9920464754104614, "rewards/rejected": -1.545022964477539, "step": 2760 }, { "epoch": 0.66, "grad_norm": 12.283682468236826, "learning_rate": 1.5370196965727438e-07, "logits/chosen": -2.1551387310028076, "logits/rejected": -2.0748305320739746, "logps/chosen": -349.5186462402344, "logps/rejected": -411.78125, "loss": 0.5194, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.843592643737793, "rewards/margins": 0.7036651372909546, "rewards/margins_max": 2.0399329662323, "rewards/margins_min": -0.3526110053062439, "rewards/margins_std": 1.096990942955017, "rewards/rejected": -1.5472577810287476, "step": 2770 }, { "epoch": 0.67, "grad_norm": 6.49007744130159, "learning_rate": 1.5177668746201454e-07, "logits/chosen": -2.1466658115386963, "logits/rejected": -2.1873269081115723, "logps/chosen": -329.50732421875, "logps/rejected": -421.3229064941406, "loss": 0.5777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9477412104606628, "rewards/margins": 0.696178138256073, "rewards/margins_max": 2.1885428428649902, "rewards/margins_min": -0.612154483795166, "rewards/margins_std": 1.2407530546188354, "rewards/rejected": -1.6439193487167358, "step": 2780 }, { "epoch": 0.67, "grad_norm": 8.944033495490556, "learning_rate": 1.4985826959813254e-07, "logits/chosen": -2.333343505859375, "logits/rejected": -2.14142107963562, "logps/chosen": -405.83197021484375, "logps/rejected": -441.6629333496094, "loss": 0.5669, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9833502769470215, "rewards/margins": 0.5409424901008606, "rewards/margins_max": 1.7926830053329468, "rewards/margins_min": -0.5090065002441406, "rewards/margins_std": 1.0471017360687256, "rewards/rejected": -1.5242927074432373, "step": 2790 }, { "epoch": 0.67, "grad_norm": 6.7259882771562545, "learning_rate": 1.4794685013416674e-07, "logits/chosen": -2.2594521045684814, "logits/rejected": -2.063821315765381, "logps/chosen": -377.37652587890625, "logps/rejected": -410.6341247558594, "loss": 0.6359, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8451797366142273, "rewards/margins": 0.517044186592102, "rewards/margins_max": 1.5615179538726807, "rewards/margins_min": -0.4892970025539398, "rewards/margins_std": 0.9270143508911133, "rewards/rejected": -1.3622238636016846, "step": 2800 }, { "epoch": 0.67, "eval_logits/chosen": -2.139292001724243, "eval_logits/rejected": -2.0337820053100586, "eval_logps/chosen": -352.04095458984375, "eval_logps/rejected": -398.0761413574219, "eval_loss": 0.5827240347862244, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.8037792444229126, "eval_rewards/margins": 0.5645570158958435, "eval_rewards/margins_max": 2.5874712467193604, "eval_rewards/margins_min": -0.7822895050048828, "eval_rewards/margins_std": 1.1114647388458252, "eval_rewards/rejected": -1.3683362007141113, "eval_runtime": 1526.0751, "eval_samples_per_second": 2.621, "eval_steps_per_second": 0.164, "step": 2800 }, { "epoch": 0.67, "grad_norm": 8.203460744617272, "learning_rate": 1.460425626495725e-07, "logits/chosen": -2.3165714740753174, "logits/rejected": -2.185732841491699, "logps/chosen": -319.2771911621094, "logps/rejected": -368.2125549316406, "loss": 0.6151, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8163260221481323, "rewards/margins": 0.5329941511154175, "rewards/margins_max": 1.8180688619613647, "rewards/margins_min": -0.6117970943450928, "rewards/margins_std": 1.0984268188476562, "rewards/rejected": -1.3493201732635498, "step": 2810 }, { "epoch": 0.68, "grad_norm": 8.67856714965069, "learning_rate": 1.4414554022538737e-07, "logits/chosen": -2.4573521614074707, "logits/rejected": -2.2219181060791016, "logps/chosen": -346.9584045410156, "logps/rejected": -379.37591552734375, "loss": 0.5389, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7269964218139648, "rewards/margins": 0.6475740671157837, "rewards/margins_max": 1.9422130584716797, "rewards/margins_min": -0.30908530950546265, "rewards/margins_std": 1.0065529346466064, "rewards/rejected": -1.374570369720459, "step": 2820 }, { "epoch": 0.68, "grad_norm": 10.433937789746158, "learning_rate": 1.4225591543493025e-07, "logits/chosen": -2.1318726539611816, "logits/rejected": -2.1110665798187256, "logps/chosen": -315.83306884765625, "logps/rejected": -414.3623962402344, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": -0.8180092573165894, "rewards/margins": 0.557115375995636, "rewards/margins_max": 1.7082513570785522, "rewards/margins_min": -0.4385001063346863, "rewards/margins_std": 0.9662193059921265, "rewards/rejected": -1.3751246929168701, "step": 2830 }, { "epoch": 0.68, "grad_norm": 7.755350788380245, "learning_rate": 1.4037382033453698e-07, "logits/chosen": -2.2425472736358643, "logits/rejected": -2.1612067222595215, "logps/chosen": -364.8679504394531, "logps/rejected": -443.75030517578125, "loss": 0.5396, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8928340673446655, "rewards/margins": 0.6894606351852417, "rewards/margins_max": 2.1586272716522217, "rewards/margins_min": -0.38689035177230835, "rewards/margins_std": 1.1497681140899658, "rewards/rejected": -1.5822948217391968, "step": 2840 }, { "epoch": 0.68, "grad_norm": 9.10646746185309, "learning_rate": 1.384993864543314e-07, "logits/chosen": -2.3344109058380127, "logits/rejected": -2.217395782470703, "logps/chosen": -350.96246337890625, "logps/rejected": -441.06231689453125, "loss": 0.5532, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8194843530654907, "rewards/margins": 0.7351148128509521, "rewards/margins_max": 2.0805861949920654, "rewards/margins_min": -0.27767643332481384, "rewards/margins_std": 1.072550654411316, "rewards/rejected": -1.5545990467071533, "step": 2850 }, { "epoch": 0.68, "grad_norm": 9.405176449975608, "learning_rate": 1.366327447890332e-07, "logits/chosen": -2.395904064178467, "logits/rejected": -2.2710351943969727, "logps/chosen": -374.7344665527344, "logps/rejected": -451.0226135253906, "loss": 0.5403, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8131138682365417, "rewards/margins": 0.7941919565200806, "rewards/margins_max": 2.1848716735839844, "rewards/margins_min": -0.32637813687324524, "rewards/margins_std": 1.113545536994934, "rewards/rejected": -1.6073057651519775, "step": 2860 }, { "epoch": 0.69, "grad_norm": 6.917240839033676, "learning_rate": 1.3477402578880356e-07, "logits/chosen": -2.3734850883483887, "logits/rejected": -2.208554744720459, "logps/chosen": -384.0926208496094, "logps/rejected": -455.77764892578125, "loss": 0.5427, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9692071080207825, "rewards/margins": 0.7382278442382812, "rewards/margins_max": 2.095735788345337, "rewards/margins_min": -0.18673917651176453, "rewards/margins_std": 1.0263745784759521, "rewards/rejected": -1.7074350118637085, "step": 2870 }, { "epoch": 0.69, "grad_norm": 10.77973940674106, "learning_rate": 1.3292335935012854e-07, "logits/chosen": -2.257936477661133, "logits/rejected": -2.168585777282715, "logps/chosen": -410.82000732421875, "logps/rejected": -446.6611328125, "loss": 0.5832, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9769846796989441, "rewards/margins": 0.7440393567085266, "rewards/margins_max": 2.303378105163574, "rewards/margins_min": -0.4075072407722473, "rewards/margins_std": 1.2136720418930054, "rewards/rejected": -1.7210241556167603, "step": 2880 }, { "epoch": 0.69, "grad_norm": 7.90711264747897, "learning_rate": 1.3108087480674166e-07, "logits/chosen": -2.280320644378662, "logits/rejected": -2.2297027111053467, "logps/chosen": -399.75225830078125, "logps/rejected": -470.48724365234375, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0159825086593628, "rewards/margins": 0.7090276479721069, "rewards/margins_max": 2.0818214416503906, "rewards/margins_min": -0.5684320330619812, "rewards/margins_std": 1.16270911693573, "rewards/rejected": -1.7250101566314697, "step": 2890 }, { "epoch": 0.69, "grad_norm": 7.495110507427805, "learning_rate": 1.2924670092058465e-07, "logits/chosen": -2.3289551734924316, "logits/rejected": -2.1326732635498047, "logps/chosen": -371.326416015625, "logps/rejected": -363.91552734375, "loss": 0.5778, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8273908495903015, "rewards/margins": 0.5302734971046448, "rewards/margins_max": 1.7274143695831299, "rewards/margins_min": -0.4215204119682312, "rewards/margins_std": 0.9660667181015015, "rewards/rejected": -1.3576643466949463, "step": 2900 }, { "epoch": 0.69, "eval_logits/chosen": -2.1376724243164062, "eval_logits/rejected": -2.0322623252868652, "eval_logps/chosen": -351.26409912109375, "eval_logps/rejected": -397.5572814941406, "eval_loss": 0.5805562138557434, "eval_rewards/accuracies": 0.6940000057220459, "eval_rewards/chosen": -0.7960103154182434, "eval_rewards/margins": 0.5671369433403015, "eval_rewards/margins_max": 2.5883193016052246, "eval_rewards/margins_min": -0.7722490429878235, "eval_rewards/margins_std": 1.1082133054733276, "eval_rewards/rejected": -1.3631473779678345, "eval_runtime": 1524.789, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.164, "step": 2900 }, { "epoch": 0.7, "grad_norm": 18.123754840042203, "learning_rate": 1.2742096587280966e-07, "logits/chosen": -2.2022037506103516, "logits/rejected": -2.0976521968841553, "logps/chosen": -353.94854736328125, "logps/rejected": -370.7405090332031, "loss": 0.6033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8805254697799683, "rewards/margins": 0.47915711998939514, "rewards/margins_max": 1.6357091665267944, "rewards/margins_min": -0.5535486936569214, "rewards/margins_std": 1.0017355680465698, "rewards/rejected": -1.3596824407577515, "step": 2910 }, { "epoch": 0.7, "grad_norm": 7.899473641159457, "learning_rate": 1.2560379725482073e-07, "logits/chosen": -2.2610809803009033, "logits/rejected": -2.13193941116333, "logps/chosen": -346.4670104980469, "logps/rejected": -377.5896911621094, "loss": 0.5492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7713009715080261, "rewards/margins": 0.5974099636077881, "rewards/margins_max": 1.6422069072723389, "rewards/margins_min": -0.3003827631473541, "rewards/margins_std": 0.8733491897583008, "rewards/rejected": -1.3687108755111694, "step": 2920 }, { "epoch": 0.7, "grad_norm": 8.118519237783318, "learning_rate": 1.237953220593579e-07, "logits/chosen": -2.3072988986968994, "logits/rejected": -2.1986711025238037, "logps/chosen": -403.4251403808594, "logps/rejected": -418.2538146972656, "loss": 0.5862, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9623511433601379, "rewards/margins": 0.5040423274040222, "rewards/margins_max": 1.7518939971923828, "rewards/margins_min": -0.6664860248565674, "rewards/margins_std": 1.0851104259490967, "rewards/rejected": -1.4663935899734497, "step": 2930 }, { "epoch": 0.7, "grad_norm": 6.6812732264344294, "learning_rate": 1.2199566667162127e-07, "logits/chosen": -2.396554470062256, "logits/rejected": -2.1564297676086426, "logps/chosen": -389.1228332519531, "logps/rejected": -427.5606384277344, "loss": 0.4998, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8328034281730652, "rewards/margins": 0.8399865031242371, "rewards/margins_max": 2.3138556480407715, "rewards/margins_min": -0.3017105162143707, "rewards/margins_std": 1.1915481090545654, "rewards/rejected": -1.6727898120880127, "step": 2940 }, { "epoch": 0.71, "grad_norm": 9.905852589623155, "learning_rate": 1.2020495686043924e-07, "logits/chosen": -2.2875280380249023, "logits/rejected": -2.1613175868988037, "logps/chosen": -359.3728332519531, "logps/rejected": -412.16448974609375, "loss": 0.5272, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7654290795326233, "rewards/margins": 0.8201557993888855, "rewards/margins_max": 2.1511101722717285, "rewards/margins_min": -0.36065372824668884, "rewards/margins_std": 1.1050117015838623, "rewards/rejected": -1.5855847597122192, "step": 2950 }, { "epoch": 0.71, "grad_norm": 10.940199164716878, "learning_rate": 1.1842331776947931e-07, "logits/chosen": -2.274101972579956, "logits/rejected": -2.11092209815979, "logps/chosen": -404.9562072753906, "logps/rejected": -382.06536865234375, "loss": 0.5824, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8620790243148804, "rewards/margins": 0.4577743411064148, "rewards/margins_max": 1.4929348230361938, "rewards/margins_min": -0.6569734215736389, "rewards/margins_std": 0.9340102076530457, "rewards/rejected": -1.31985342502594, "step": 2960 }, { "epoch": 0.71, "grad_norm": 8.581328025068622, "learning_rate": 1.1665087390850187e-07, "logits/chosen": -2.2240023612976074, "logits/rejected": -2.2176856994628906, "logps/chosen": -287.99407958984375, "logps/rejected": -382.7607421875, "loss": 0.5941, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8186147809028625, "rewards/margins": 0.5471965670585632, "rewards/margins_max": 2.0354976654052734, "rewards/margins_min": -0.5602542757987976, "rewards/margins_std": 1.166197419166565, "rewards/rejected": -1.3658114671707153, "step": 2970 }, { "epoch": 0.71, "grad_norm": 7.778123097095739, "learning_rate": 1.1488774914465918e-07, "logits/chosen": -2.277890682220459, "logits/rejected": -2.237089157104492, "logps/chosen": -335.88848876953125, "logps/rejected": -446.7124938964844, "loss": 0.5692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8786054849624634, "rewards/margins": 0.7675966024398804, "rewards/margins_max": 2.256605386734009, "rewards/margins_min": -0.36861157417297363, "rewards/margins_std": 1.196476936340332, "rewards/rejected": -1.6462020874023438, "step": 2980 }, { "epoch": 0.72, "grad_norm": 9.531929167016482, "learning_rate": 1.1313406669383877e-07, "logits/chosen": -2.2536275386810303, "logits/rejected": -2.1243231296539307, "logps/chosen": -411.26605224609375, "logps/rejected": -447.8019104003906, "loss": 0.6123, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.079096794128418, "rewards/margins": 0.6233213543891907, "rewards/margins_max": 2.2061333656311035, "rewards/margins_min": -0.7484980225563049, "rewards/margins_std": 1.3047716617584229, "rewards/rejected": -1.7024180889129639, "step": 2990 }, { "epoch": 0.72, "grad_norm": 7.524558739382138, "learning_rate": 1.1138994911205284e-07, "logits/chosen": -2.3330841064453125, "logits/rejected": -2.1604387760162354, "logps/chosen": -352.4581604003906, "logps/rejected": -469.4291076660156, "loss": 0.5122, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7326239347457886, "rewards/margins": 1.0046582221984863, "rewards/margins_max": 2.3721349239349365, "rewards/margins_min": -0.022612880915403366, "rewards/margins_std": 1.0809820890426636, "rewards/rejected": -1.737282156944275, "step": 3000 }, { "epoch": 0.72, "eval_logits/chosen": -2.137059211730957, "eval_logits/rejected": -2.030626058578491, "eval_logps/chosen": -354.7577819824219, "eval_logps/rejected": -403.6569519042969, "eval_loss": 0.5801701545715332, "eval_rewards/accuracies": 0.6909999847412109, "eval_rewards/chosen": -0.83094722032547, "eval_rewards/margins": 0.5931965112686157, "eval_rewards/margins_max": 2.7199370861053467, "eval_rewards/margins_min": -0.8026529550552368, "eval_rewards/margins_std": 1.1605943441390991, "eval_rewards/rejected": -1.4241435527801514, "eval_runtime": 1532.3298, "eval_samples_per_second": 2.61, "eval_steps_per_second": 0.163, "step": 3000 }, { "epoch": 0.72, "grad_norm": 15.231768607169155, "learning_rate": 1.0965551828687297e-07, "logits/chosen": -2.2573747634887695, "logits/rejected": -2.1658437252044678, "logps/chosen": -334.1466979980469, "logps/rejected": -444.46484375, "loss": 0.524, "rewards/accuracies": 0.75, "rewards/chosen": -0.8231682777404785, "rewards/margins": 0.7372006177902222, "rewards/margins_max": 2.0043468475341797, "rewards/margins_min": -0.2653668224811554, "rewards/margins_std": 1.0091005563735962, "rewards/rejected": -1.5603687763214111, "step": 3010 }, { "epoch": 0.72, "grad_norm": 8.200512976999518, "learning_rate": 1.0793089542891229e-07, "logits/chosen": -2.222470283508301, "logits/rejected": -2.050097942352295, "logps/chosen": -353.1983947753906, "logps/rejected": -388.42681884765625, "loss": 0.493, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7806388139724731, "rewards/margins": 0.746993899345398, "rewards/margins_max": 1.827378511428833, "rewards/margins_min": -0.13711689412593842, "rewards/margins_std": 0.8838556408882141, "rewards/rejected": -1.5276325941085815, "step": 3020 }, { "epoch": 0.73, "grad_norm": 9.043235349741835, "learning_rate": 1.062162010633545e-07, "logits/chosen": -2.310997486114502, "logits/rejected": -2.1224937438964844, "logps/chosen": -355.1472473144531, "logps/rejected": -385.2495422363281, "loss": 0.5933, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8114493489265442, "rewards/margins": 0.7061553001403809, "rewards/margins_max": 1.9671236276626587, "rewards/margins_min": -0.36878952383995056, "rewards/margins_std": 1.0379085540771484, "rewards/rejected": -1.5176045894622803, "step": 3030 }, { "epoch": 0.73, "grad_norm": 8.519290352222347, "learning_rate": 1.0451155502153138e-07, "logits/chosen": -2.246466875076294, "logits/rejected": -2.131251811981201, "logps/chosen": -372.54791259765625, "logps/rejected": -372.6636657714844, "loss": 0.6203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9832660555839539, "rewards/margins": 0.36633047461509705, "rewards/margins_max": 1.5019137859344482, "rewards/margins_min": -0.7529887557029724, "rewards/margins_std": 1.0146019458770752, "rewards/rejected": -1.349596619606018, "step": 3040 }, { "epoch": 0.73, "grad_norm": 10.147512016355059, "learning_rate": 1.028170764325479e-07, "logits/chosen": -2.3300507068634033, "logits/rejected": -2.174025535583496, "logps/chosen": -392.2646484375, "logps/rejected": -419.32501220703125, "loss": 0.5731, "rewards/accuracies": 0.6875, "rewards/chosen": -0.930245041847229, "rewards/margins": 0.5309632420539856, "rewards/margins_max": 2.113675832748413, "rewards/margins_min": -0.7868351936340332, "rewards/margins_std": 1.2782131433486938, "rewards/rejected": -1.4612082242965698, "step": 3050 }, { "epoch": 0.73, "grad_norm": 7.5199416487032895, "learning_rate": 1.0113288371495707e-07, "logits/chosen": -2.2938389778137207, "logits/rejected": -2.2126317024230957, "logps/chosen": -391.1658630371094, "logps/rejected": -419.5048828125, "loss": 0.6032, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0022815465927124, "rewards/margins": 0.520848274230957, "rewards/margins_max": 2.1273388862609863, "rewards/margins_min": -0.894837498664856, "rewards/margins_std": 1.313733696937561, "rewards/rejected": -1.523129940032959, "step": 3060 }, { "epoch": 0.74, "grad_norm": 19.572441719251987, "learning_rate": 9.945909456848434e-08, "logits/chosen": -2.225040912628174, "logits/rejected": -2.0976192951202393, "logps/chosen": -372.3159484863281, "logps/rejected": -370.4169921875, "loss": 0.625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8342713117599487, "rewards/margins": 0.4293574392795563, "rewards/margins_max": 1.5873908996582031, "rewards/margins_min": -0.49454063177108765, "rewards/margins_std": 0.9341064691543579, "rewards/rejected": -1.2636287212371826, "step": 3070 }, { "epoch": 0.74, "grad_norm": 6.831793107538385, "learning_rate": 9.779582596580203e-08, "logits/chosen": -2.173398017883301, "logits/rejected": -2.0887134075164795, "logps/chosen": -319.0833435058594, "logps/rejected": -407.9541015625, "loss": 0.523, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8416653871536255, "rewards/margins": 0.7973026633262634, "rewards/margins_max": 2.0009312629699707, "rewards/margins_min": -0.34980857372283936, "rewards/margins_std": 1.0175703763961792, "rewards/rejected": -1.6389681100845337, "step": 3080 }, { "epoch": 0.74, "grad_norm": 8.909308514034194, "learning_rate": 9.614319414435499e-08, "logits/chosen": -2.3257970809936523, "logits/rejected": -2.1586790084838867, "logps/chosen": -351.21612548828125, "logps/rejected": -384.20220947265625, "loss": 0.5076, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8276811838150024, "rewards/margins": 0.6658140420913696, "rewards/margins_max": 1.8174184560775757, "rewards/margins_min": -0.25405535101890564, "rewards/margins_std": 0.9171269536018372, "rewards/rejected": -1.493495225906372, "step": 3090 }, { "epoch": 0.74, "grad_norm": 8.645134906058164, "learning_rate": 9.450131459823688e-08, "logits/chosen": -2.3899850845336914, "logits/rejected": -2.2390995025634766, "logps/chosen": -393.9625244140625, "logps/rejected": -407.4827575683594, "loss": 0.5337, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7428141832351685, "rewards/margins": 0.5780808329582214, "rewards/margins_max": 1.5134384632110596, "rewards/margins_min": -0.2902988791465759, "rewards/margins_std": 0.7956008315086365, "rewards/rejected": -1.3208950757980347, "step": 3100 }, { "epoch": 0.74, "eval_logits/chosen": -2.1313130855560303, "eval_logits/rejected": -2.025378465652466, "eval_logps/chosen": -357.8872375488281, "eval_logps/rejected": -408.40167236328125, "eval_loss": 0.5811232328414917, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.862241804599762, "eval_rewards/margins": 0.6093496680259705, "eval_rewards/margins_max": 2.7945051193237305, "eval_rewards/margins_min": -0.8310177326202393, "eval_rewards/margins_std": 1.1946074962615967, "eval_rewards/rejected": -1.471591591835022, "eval_runtime": 1548.2168, "eval_samples_per_second": 2.584, "eval_steps_per_second": 0.161, "step": 3100 }, { "epoch": 0.74, "grad_norm": 8.221554353870827, "learning_rate": 9.287030207011929e-08, "logits/chosen": -2.1603076457977295, "logits/rejected": -2.08491587638855, "logps/chosen": -360.92266845703125, "logps/rejected": -435.125732421875, "loss": 0.5836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9610616564750671, "rewards/margins": 0.7061542272567749, "rewards/margins_max": 2.2416043281555176, "rewards/margins_min": -0.7093074321746826, "rewards/margins_std": 1.3259179592132568, "rewards/rejected": -1.6672159433364868, "step": 3110 }, { "epoch": 0.75, "grad_norm": 15.77363194066634, "learning_rate": 9.125027054323256e-08, "logits/chosen": -2.2946836948394775, "logits/rejected": -2.083644390106201, "logps/chosen": -410.8192443847656, "logps/rejected": -423.0220642089844, "loss": 0.5972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.009279727935791, "rewards/margins": 0.6496905088424683, "rewards/margins_max": 2.0685534477233887, "rewards/margins_min": -0.5475030541419983, "rewards/margins_std": 1.1892989873886108, "rewards/rejected": -1.6589702367782593, "step": 3120 }, { "epoch": 0.75, "grad_norm": 14.927370347721894, "learning_rate": 8.964133323340081e-08, "logits/chosen": -2.2040863037109375, "logits/rejected": -2.089371681213379, "logps/chosen": -315.8217468261719, "logps/rejected": -331.59185791015625, "loss": 0.6178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9262818098068237, "rewards/margins": 0.3326750099658966, "rewards/margins_max": 1.5433346033096313, "rewards/margins_min": -0.7282071709632874, "rewards/margins_std": 1.0230998992919922, "rewards/rejected": -1.258956789970398, "step": 3130 }, { "epoch": 0.75, "grad_norm": 7.073798400422113, "learning_rate": 8.804360258112861e-08, "logits/chosen": -2.371908187866211, "logits/rejected": -2.2101285457611084, "logps/chosen": -353.1387634277344, "logps/rejected": -374.95758056640625, "loss": 0.5656, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8321869969367981, "rewards/margins": 0.5218116044998169, "rewards/margins_max": 1.668221116065979, "rewards/margins_min": -0.36608409881591797, "rewards/margins_std": 0.9201259613037109, "rewards/rejected": -1.3539987802505493, "step": 3140 }, { "epoch": 0.75, "grad_norm": 6.975378604580979, "learning_rate": 8.645719024374446e-08, "logits/chosen": -2.2691850662231445, "logits/rejected": -2.1298177242279053, "logps/chosen": -378.91156005859375, "logps/rejected": -395.57281494140625, "loss": 0.5733, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7717379927635193, "rewards/margins": 0.39803680777549744, "rewards/margins_max": 1.295255184173584, "rewards/margins_min": -0.4729923605918884, "rewards/margins_std": 0.7877697348594666, "rewards/rejected": -1.1697747707366943, "step": 3150 }, { "epoch": 0.76, "grad_norm": 8.793251605845867, "learning_rate": 8.488220708759667e-08, "logits/chosen": -2.318495035171509, "logits/rejected": -2.1720662117004395, "logps/chosen": -412.4773864746094, "logps/rejected": -429.54766845703125, "loss": 0.5086, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9840967059135437, "rewards/margins": 0.6757220029830933, "rewards/margins_max": 1.696758508682251, "rewards/margins_min": -0.3646393418312073, "rewards/margins_std": 0.9350331425666809, "rewards/rejected": -1.6598186492919922, "step": 3160 }, { "epoch": 0.76, "grad_norm": 26.478627925487494, "learning_rate": 8.331876318030585e-08, "logits/chosen": -2.255403995513916, "logits/rejected": -2.1471354961395264, "logps/chosen": -357.1045227050781, "logps/rejected": -393.75433349609375, "loss": 0.6382, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8231166005134583, "rewards/margins": 0.5701841115951538, "rewards/margins_max": 1.6920678615570068, "rewards/margins_min": -0.4817492365837097, "rewards/margins_std": 0.9719716906547546, "rewards/rejected": -1.3933007717132568, "step": 3170 }, { "epoch": 0.76, "grad_norm": 7.1005023295045, "learning_rate": 8.176696778307269e-08, "logits/chosen": -2.2078447341918945, "logits/rejected": -2.093635082244873, "logps/chosen": -381.47711181640625, "logps/rejected": -411.4353942871094, "loss": 0.5417, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.924704372882843, "rewards/margins": 0.5208083391189575, "rewards/margins_max": 1.600974440574646, "rewards/margins_min": -0.582036018371582, "rewards/margins_std": 0.9650928378105164, "rewards/rejected": -1.4455125331878662, "step": 3180 }, { "epoch": 0.76, "grad_norm": 8.910460644597775, "learning_rate": 8.022692934304238e-08, "logits/chosen": -2.2856853008270264, "logits/rejected": -2.115417718887329, "logps/chosen": -376.12969970703125, "logps/rejected": -395.3996887207031, "loss": 0.6002, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8988031148910522, "rewards/margins": 0.5730702877044678, "rewards/margins_max": 1.824355125427246, "rewards/margins_min": -0.5747734904289246, "rewards/margins_std": 1.0453455448150635, "rewards/rejected": -1.47187340259552, "step": 3190 }, { "epoch": 0.77, "grad_norm": 14.75537838912476, "learning_rate": 7.869875548572588e-08, "logits/chosen": -2.242248058319092, "logits/rejected": -2.2185468673706055, "logps/chosen": -319.61151123046875, "logps/rejected": -379.0608825683594, "loss": 0.5356, "rewards/accuracies": 0.75, "rewards/chosen": -0.7732641100883484, "rewards/margins": 0.6938229203224182, "rewards/margins_max": 1.8196117877960205, "rewards/margins_min": -0.5819738507270813, "rewards/margins_std": 1.0817651748657227, "rewards/rejected": -1.4670870304107666, "step": 3200 }, { "epoch": 0.77, "eval_logits/chosen": -2.134446382522583, "eval_logits/rejected": -2.029043674468994, "eval_logps/chosen": -353.6847229003906, "eval_logps/rejected": -402.294677734375, "eval_loss": 0.5798461437225342, "eval_rewards/accuracies": 0.6909999847412109, "eval_rewards/chosen": -0.8202168941497803, "eval_rewards/margins": 0.5903045535087585, "eval_rewards/margins_max": 2.6922214031219482, "eval_rewards/margins_min": -0.7970418930053711, "eval_rewards/margins_std": 1.1511950492858887, "eval_rewards/rejected": -1.410521388053894, "eval_runtime": 1549.703, "eval_samples_per_second": 2.581, "eval_steps_per_second": 0.161, "step": 3200 }, { "epoch": 0.77, "grad_norm": 8.245546468249305, "learning_rate": 7.718255300747817e-08, "logits/chosen": -2.2367072105407715, "logits/rejected": -2.1595067977905273, "logps/chosen": -336.65435791015625, "logps/rejected": -446.177978515625, "loss": 0.5149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7477490305900574, "rewards/margins": 0.8679901957511902, "rewards/margins_max": 2.615446090698242, "rewards/margins_min": -0.3728719651699066, "rewards/margins_std": 1.35932457447052, "rewards/rejected": -1.6157392263412476, "step": 3210 }, { "epoch": 0.77, "grad_norm": 20.15984706275695, "learning_rate": 7.567842786803502e-08, "logits/chosen": -2.2334787845611572, "logits/rejected": -2.0680527687072754, "logps/chosen": -365.5607604980469, "logps/rejected": -433.08038330078125, "loss": 0.5375, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9566822052001953, "rewards/margins": 0.6228682994842529, "rewards/margins_max": 1.950193166732788, "rewards/margins_min": -0.6751230955123901, "rewards/margins_std": 1.1991097927093506, "rewards/rejected": -1.5795505046844482, "step": 3220 }, { "epoch": 0.77, "grad_norm": 8.030800179675678, "learning_rate": 7.418648518310797e-08, "logits/chosen": -2.273526668548584, "logits/rejected": -2.095607280731201, "logps/chosen": -346.53533935546875, "logps/rejected": -361.431396484375, "loss": 0.585, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6967358589172363, "rewards/margins": 0.668177604675293, "rewards/margins_max": 2.370635509490967, "rewards/margins_min": -0.4398622512817383, "rewards/margins_std": 1.258202314376831, "rewards/rejected": -1.3649133443832397, "step": 3230 }, { "epoch": 0.78, "grad_norm": 9.323906179302197, "learning_rate": 7.270682921703853e-08, "logits/chosen": -2.2592129707336426, "logits/rejected": -2.177143096923828, "logps/chosen": -354.3105773925781, "logps/rejected": -406.95916748046875, "loss": 0.5514, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8031813502311707, "rewards/margins": 0.5103456974029541, "rewards/margins_max": 1.6575119495391846, "rewards/margins_min": -0.4452950954437256, "rewards/margins_std": 0.961545467376709, "rewards/rejected": -1.3135271072387695, "step": 3240 }, { "epoch": 0.78, "grad_norm": 29.54111525034854, "learning_rate": 7.123956337551116e-08, "logits/chosen": -2.2409348487854004, "logits/rejected": -2.1845171451568604, "logps/chosen": -352.8401794433594, "logps/rejected": -377.38543701171875, "loss": 0.6188, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8801943063735962, "rewards/margins": 0.531808078289032, "rewards/margins_max": 2.0200257301330566, "rewards/margins_min": -0.949517548084259, "rewards/margins_std": 1.3527804613113403, "rewards/rejected": -1.4120023250579834, "step": 3250 }, { "epoch": 0.78, "grad_norm": 8.798994388343122, "learning_rate": 6.978479019832725e-08, "logits/chosen": -2.2430479526519775, "logits/rejected": -2.060004472732544, "logps/chosen": -399.75762939453125, "logps/rejected": -423.31695556640625, "loss": 0.5363, "rewards/accuracies": 0.75, "rewards/chosen": -0.9198942184448242, "rewards/margins": 0.6667669415473938, "rewards/margins_max": 1.9378042221069336, "rewards/margins_min": -0.4396510720252991, "rewards/margins_std": 1.0915218591690063, "rewards/rejected": -1.5866611003875732, "step": 3260 }, { "epoch": 0.78, "grad_norm": 9.575441068445539, "learning_rate": 6.83426113522389e-08, "logits/chosen": -2.176809310913086, "logits/rejected": -2.105595350265503, "logps/chosen": -359.45050048828125, "logps/rejected": -385.4041748046875, "loss": 0.5075, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8201757669448853, "rewards/margins": 0.49509182572364807, "rewards/margins_max": 1.438071608543396, "rewards/margins_min": -0.3610461950302124, "rewards/margins_std": 0.807999312877655, "rewards/rejected": -1.315267562866211, "step": 3270 }, { "epoch": 0.79, "grad_norm": 13.961464844719968, "learning_rate": 6.691312762384396e-08, "logits/chosen": -2.3408875465393066, "logits/rejected": -2.1948342323303223, "logps/chosen": -338.2998352050781, "logps/rejected": -344.25067138671875, "loss": 0.5734, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8336318135261536, "rewards/margins": 0.3687313497066498, "rewards/margins_max": 1.3273518085479736, "rewards/margins_min": -0.5507371425628662, "rewards/margins_std": 0.8559874296188354, "rewards/rejected": -1.2023632526397705, "step": 3280 }, { "epoch": 0.79, "grad_norm": 16.474145226308405, "learning_rate": 6.54964389125428e-08, "logits/chosen": -2.14575457572937, "logits/rejected": -2.084578514099121, "logps/chosen": -344.61572265625, "logps/rejected": -423.445556640625, "loss": 0.5814, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9800515174865723, "rewards/margins": 0.5990694165229797, "rewards/margins_max": 1.9624567031860352, "rewards/margins_min": -0.5665584802627563, "rewards/margins_std": 1.1210383176803589, "rewards/rejected": -1.5791209936141968, "step": 3290 }, { "epoch": 0.79, "grad_norm": 8.959586642812189, "learning_rate": 6.409264422355642e-08, "logits/chosen": -2.3165104389190674, "logits/rejected": -2.2302772998809814, "logps/chosen": -365.2793884277344, "logps/rejected": -449.20599365234375, "loss": 0.5481, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7845451235771179, "rewards/margins": 0.8389819860458374, "rewards/margins_max": 2.4345955848693848, "rewards/margins_min": -0.3447968363761902, "rewards/margins_std": 1.2920169830322266, "rewards/rejected": -1.6235271692276, "step": 3300 }, { "epoch": 0.79, "eval_logits/chosen": -2.130585193634033, "eval_logits/rejected": -2.024704933166504, "eval_logps/chosen": -351.1285400390625, "eval_logps/rejected": -399.08367919921875, "eval_loss": 0.5790132284164429, "eval_rewards/accuracies": 0.6945000290870667, "eval_rewards/chosen": -0.7946546077728271, "eval_rewards/margins": 0.583756685256958, "eval_rewards/margins_max": 2.6645469665527344, "eval_rewards/margins_min": -0.7789820432662964, "eval_rewards/margins_std": 1.1355273723602295, "eval_rewards/rejected": -1.3784115314483643, "eval_runtime": 1531.8604, "eval_samples_per_second": 2.611, "eval_steps_per_second": 0.163, "step": 3300 }, { "epoch": 0.79, "grad_norm": 13.726354624564268, "learning_rate": 6.27018416610078e-08, "logits/chosen": -2.2425320148468018, "logits/rejected": -2.162163019180298, "logps/chosen": -318.7059020996094, "logps/rejected": -426.3963317871094, "loss": 0.5736, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7830327749252319, "rewards/margins": 0.5505672693252563, "rewards/margins_max": 1.9457142353057861, "rewards/margins_min": -0.5719438791275024, "rewards/margins_std": 1.1443437337875366, "rewards/rejected": -1.3335999250411987, "step": 3310 }, { "epoch": 0.8, "grad_norm": 4.475541935504021, "learning_rate": 6.132412842106572e-08, "logits/chosen": -2.2637441158294678, "logits/rejected": -2.1286253929138184, "logps/chosen": -382.79150390625, "logps/rejected": -440.62689208984375, "loss": 0.5836, "rewards/accuracies": 0.75, "rewards/chosen": -0.8995658755302429, "rewards/margins": 0.5994361639022827, "rewards/margins_max": 1.7083218097686768, "rewards/margins_min": -0.4656209945678711, "rewards/margins_std": 0.9842336773872375, "rewards/rejected": -1.4990020990371704, "step": 3320 }, { "epoch": 0.8, "grad_norm": 9.470571613401713, "learning_rate": 5.995960078515255e-08, "logits/chosen": -2.309659481048584, "logits/rejected": -2.1483631134033203, "logps/chosen": -370.1103820800781, "logps/rejected": -396.7162780761719, "loss": 0.5968, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9388855695724487, "rewards/margins": 0.46289610862731934, "rewards/margins_max": 1.5064226388931274, "rewards/margins_min": -0.6487630605697632, "rewards/margins_std": 0.9421648979187012, "rewards/rejected": -1.401781439781189, "step": 3330 }, { "epoch": 0.8, "grad_norm": 10.829324146714299, "learning_rate": 5.860835411321494e-08, "logits/chosen": -2.1813974380493164, "logits/rejected": -2.0915889739990234, "logps/chosen": -367.8969421386719, "logps/rejected": -412.524169921875, "loss": 0.5737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9319075345993042, "rewards/margins": 0.46096378564834595, "rewards/margins_max": 1.6062819957733154, "rewards/margins_min": -0.6786168813705444, "rewards/margins_std": 1.0213310718536377, "rewards/rejected": -1.3928712606430054, "step": 3340 }, { "epoch": 0.8, "grad_norm": 7.877269115206038, "learning_rate": 5.7270482837060455e-08, "logits/chosen": -2.3484013080596924, "logits/rejected": -2.1863367557525635, "logps/chosen": -365.36236572265625, "logps/rejected": -368.1033630371094, "loss": 0.5494, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7965494394302368, "rewards/margins": 0.4626794457435608, "rewards/margins_max": 1.433593511581421, "rewards/margins_min": -0.40905672311782837, "rewards/margins_std": 0.8293336629867554, "rewards/rejected": -1.2592289447784424, "step": 3350 }, { "epoch": 0.8, "grad_norm": 10.727781889107082, "learning_rate": 5.5946080453757425e-08, "logits/chosen": -2.210631847381592, "logits/rejected": -2.1203572750091553, "logps/chosen": -354.32342529296875, "logps/rejected": -390.8208923339844, "loss": 0.5266, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7129632234573364, "rewards/margins": 0.5516061782836914, "rewards/margins_max": 1.5592617988586426, "rewards/margins_min": -0.39137059450149536, "rewards/margins_std": 0.9011121988296509, "rewards/rejected": -1.2645695209503174, "step": 3360 }, { "epoch": 0.81, "grad_norm": 6.94882679649279, "learning_rate": 5.4635239519101706e-08, "logits/chosen": -2.1713509559631348, "logits/rejected": -2.241694688796997, "logps/chosen": -341.897705078125, "logps/rejected": -412.19415283203125, "loss": 0.608, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8169388771057129, "rewards/margins": 0.43762272596359253, "rewards/margins_max": 1.4810574054718018, "rewards/margins_min": -0.5821611881256104, "rewards/margins_std": 0.9281484484672546, "rewards/rejected": -1.2545615434646606, "step": 3370 }, { "epoch": 0.81, "grad_norm": 14.786071636128067, "learning_rate": 5.333805164114744e-08, "logits/chosen": -2.2553021907806396, "logits/rejected": -2.1070806980133057, "logps/chosen": -384.5975646972656, "logps/rejected": -434.3223571777344, "loss": 0.5621, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8704391717910767, "rewards/margins": 0.7068461179733276, "rewards/margins_max": 1.8835039138793945, "rewards/margins_min": -0.2678353786468506, "rewards/margins_std": 0.9482309222221375, "rewards/rejected": -1.5772855281829834, "step": 3380 }, { "epoch": 0.81, "grad_norm": 19.03694415918484, "learning_rate": 5.205460747380588e-08, "logits/chosen": -2.3047525882720947, "logits/rejected": -2.211268663406372, "logps/chosen": -343.63201904296875, "logps/rejected": -390.42620849609375, "loss": 0.5813, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8590719103813171, "rewards/margins": 0.6166432499885559, "rewards/margins_max": 1.9427121877670288, "rewards/margins_min": -0.33624500036239624, "rewards/margins_std": 1.0190200805664062, "rewards/rejected": -1.4757152795791626, "step": 3390 }, { "epoch": 0.81, "grad_norm": 11.707684228536282, "learning_rate": 5.0784996710509785e-08, "logits/chosen": -2.226902484893799, "logits/rejected": -2.1316277980804443, "logps/chosen": -435.12762451171875, "logps/rejected": -494.62493896484375, "loss": 0.5216, "rewards/accuracies": 0.6875, "rewards/chosen": -0.952497661113739, "rewards/margins": 0.7511574029922485, "rewards/margins_max": 2.335672378540039, "rewards/margins_min": -0.395571768283844, "rewards/margins_std": 1.2278518676757812, "rewards/rejected": -1.7036550045013428, "step": 3400 }, { "epoch": 0.81, "eval_logits/chosen": -2.132641553878784, "eval_logits/rejected": -2.0275228023529053, "eval_logps/chosen": -357.4659729003906, "eval_logps/rejected": -407.9558410644531, "eval_loss": 0.5802480578422546, "eval_rewards/accuracies": 0.6869999766349792, "eval_rewards/chosen": -0.8580292463302612, "eval_rewards/margins": 0.6091036796569824, "eval_rewards/margins_max": 2.782928705215454, "eval_rewards/margins_min": -0.8238416314125061, "eval_rewards/margins_std": 1.1893301010131836, "eval_rewards/rejected": -1.467132806777954, "eval_runtime": 1510.1156, "eval_samples_per_second": 2.649, "eval_steps_per_second": 0.166, "step": 3400 }, { "epoch": 0.82, "grad_norm": 8.728889940438313, "learning_rate": 4.952930807794503e-08, "logits/chosen": -2.2597286701202393, "logits/rejected": -2.17844557762146, "logps/chosen": -340.1341857910156, "logps/rejected": -419.45001220703125, "loss": 0.551, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9266648292541504, "rewards/margins": 0.6093469262123108, "rewards/margins_max": 1.7101576328277588, "rewards/margins_min": -0.3558119237422943, "rewards/margins_std": 0.9267221689224243, "rewards/rejected": -1.5360116958618164, "step": 3410 }, { "epoch": 0.82, "grad_norm": 9.19625831478148, "learning_rate": 4.828762932985009e-08, "logits/chosen": -2.3623757362365723, "logits/rejected": -2.1383187770843506, "logps/chosen": -395.03076171875, "logps/rejected": -415.6767578125, "loss": 0.6073, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0331547260284424, "rewards/margins": 0.6337083578109741, "rewards/margins_max": 2.2001430988311768, "rewards/margins_min": -0.6054942607879639, "rewards/margins_std": 1.2887663841247559, "rewards/rejected": -1.6668630838394165, "step": 3420 }, { "epoch": 0.82, "grad_norm": 13.593397534942055, "learning_rate": 4.706004724088328e-08, "logits/chosen": -2.2128801345825195, "logits/rejected": -2.098672389984131, "logps/chosen": -393.31634521484375, "logps/rejected": -441.32464599609375, "loss": 0.6241, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0529236793518066, "rewards/margins": 0.5473018288612366, "rewards/margins_max": 1.8711349964141846, "rewards/margins_min": -0.5184833407402039, "rewards/margins_std": 1.0769404172897339, "rewards/rejected": -1.6002256870269775, "step": 3430 }, { "epoch": 0.82, "grad_norm": 10.124215178036835, "learning_rate": 4.584664760055881e-08, "logits/chosen": -2.275381088256836, "logits/rejected": -2.168309450149536, "logps/chosen": -303.4335632324219, "logps/rejected": -341.6516418457031, "loss": 0.5774, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7584865093231201, "rewards/margins": 0.4985629618167877, "rewards/margins_max": 1.4203546047210693, "rewards/margins_min": -0.35362544655799866, "rewards/margins_std": 0.7958400845527649, "rewards/rejected": -1.257049322128296, "step": 3440 }, { "epoch": 0.83, "grad_norm": 7.355312902872992, "learning_rate": 4.4647515207250934e-08, "logits/chosen": -2.404358148574829, "logits/rejected": -2.2829818725585938, "logps/chosen": -371.11468505859375, "logps/rejected": -384.892333984375, "loss": 0.6228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8142098188400269, "rewards/margins": 0.3368798792362213, "rewards/margins_max": 1.2985020875930786, "rewards/margins_min": -0.5661624670028687, "rewards/margins_std": 0.8433007001876831, "rewards/rejected": -1.1510899066925049, "step": 3450 }, { "epoch": 0.83, "grad_norm": 8.694758833731838, "learning_rate": 4.346273386226812e-08, "logits/chosen": -2.2963764667510986, "logits/rejected": -2.1740527153015137, "logps/chosen": -354.6163024902344, "logps/rejected": -398.3978271484375, "loss": 0.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7273462414741516, "rewards/margins": 0.5810668468475342, "rewards/margins_max": 1.8487069606781006, "rewards/margins_min": -0.5208570957183838, "rewards/margins_std": 1.059582233428955, "rewards/rejected": -1.3084131479263306, "step": 3460 }, { "epoch": 0.83, "grad_norm": 10.953721926717492, "learning_rate": 4.2292386363996484e-08, "logits/chosen": -2.2753405570983887, "logits/rejected": -2.142350912094116, "logps/chosen": -381.29132080078125, "logps/rejected": -428.2992248535156, "loss": 0.5595, "rewards/accuracies": 0.75, "rewards/chosen": -0.9294906854629517, "rewards/margins": 0.6927562952041626, "rewards/margins_max": 2.077968120574951, "rewards/margins_min": -0.3660210371017456, "rewards/margins_std": 1.106979250907898, "rewards/rejected": -1.6222469806671143, "step": 3470 }, { "epoch": 0.83, "grad_norm": 13.487234225761295, "learning_rate": 4.1136554502113676e-08, "logits/chosen": -2.1718063354492188, "logits/rejected": -2.2057089805603027, "logps/chosen": -343.5164489746094, "logps/rejected": -436.0372009277344, "loss": 0.556, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8319752812385559, "rewards/margins": 0.6065745949745178, "rewards/margins_max": 1.893119215965271, "rewards/margins_min": -0.3629196286201477, "rewards/margins_std": 1.00821852684021, "rewards/rejected": -1.4385499954223633, "step": 3480 }, { "epoch": 0.84, "grad_norm": 8.01881722877727, "learning_rate": 3.999531905187256e-08, "logits/chosen": -2.2799549102783203, "logits/rejected": -2.1733109951019287, "logps/chosen": -369.2069091796875, "logps/rejected": -431.19873046875, "loss": 0.57, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8160541653633118, "rewards/margins": 0.6780989766120911, "rewards/margins_max": 2.048820972442627, "rewards/margins_min": -0.45701780915260315, "rewards/margins_std": 1.1230634450912476, "rewards/rejected": -1.4941531419754028, "step": 3490 }, { "epoch": 0.84, "grad_norm": 9.715174369876712, "learning_rate": 3.886875976845661e-08, "logits/chosen": -2.4811336994171143, "logits/rejected": -2.3460984230041504, "logps/chosen": -398.22540283203125, "logps/rejected": -431.91387939453125, "loss": 0.5254, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9367545247077942, "rewards/margins": 0.6325701475143433, "rewards/margins_max": 1.7396118640899658, "rewards/margins_min": -0.3208602964878082, "rewards/margins_std": 0.9033064842224121, "rewards/rejected": -1.5693246126174927, "step": 3500 }, { "epoch": 0.84, "eval_logits/chosen": -2.137748956680298, "eval_logits/rejected": -2.033116340637207, "eval_logps/chosen": -353.97015380859375, "eval_logps/rejected": -402.92132568359375, "eval_loss": 0.5791488289833069, "eval_rewards/accuracies": 0.690500020980835, "eval_rewards/chosen": -0.8230710029602051, "eval_rewards/margins": 0.5937168598175049, "eval_rewards/margins_max": 2.7039239406585693, "eval_rewards/margins_min": -0.7954254150390625, "eval_rewards/margins_std": 1.1546231508255005, "eval_rewards/rejected": -1.41678786277771, "eval_runtime": 1526.0723, "eval_samples_per_second": 2.621, "eval_steps_per_second": 0.164, "step": 3500 }, { "epoch": 0.84, "grad_norm": 8.176315472423372, "learning_rate": 3.775695538140608e-08, "logits/chosen": -2.303211212158203, "logits/rejected": -2.144487142562866, "logps/chosen": -322.45404052734375, "logps/rejected": -375.42236328125, "loss": 0.5304, "rewards/accuracies": 0.75, "rewards/chosen": -0.7990286946296692, "rewards/margins": 0.8339442014694214, "rewards/margins_max": 2.411538600921631, "rewards/margins_min": -0.4224955141544342, "rewards/margins_std": 1.27775239944458, "rewards/rejected": -1.6329729557037354, "step": 3510 }, { "epoch": 0.84, "grad_norm": 17.601236542698675, "learning_rate": 3.665998358911593e-08, "logits/chosen": -2.3472819328308105, "logits/rejected": -2.155184745788574, "logps/chosen": -318.1134033203125, "logps/rejected": -390.4021911621094, "loss": 0.5685, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7277802228927612, "rewards/margins": 0.7009325623512268, "rewards/margins_max": 2.2405009269714355, "rewards/margins_min": -0.3171394467353821, "rewards/margins_std": 1.1813361644744873, "rewards/rejected": -1.4287127256393433, "step": 3520 }, { "epoch": 0.85, "grad_norm": 11.749508461917872, "learning_rate": 3.557792105340621e-08, "logits/chosen": -2.3927388191223145, "logits/rejected": -2.2641429901123047, "logps/chosen": -356.36968994140625, "logps/rejected": -398.8579406738281, "loss": 0.5935, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.800003707408905, "rewards/margins": 0.5922523736953735, "rewards/margins_max": 1.9293792247772217, "rewards/margins_min": -0.46332770586013794, "rewards/margins_std": 1.0827903747558594, "rewards/rejected": -1.3922561407089233, "step": 3530 }, { "epoch": 0.85, "grad_norm": 9.265937889953113, "learning_rate": 3.4510843394163966e-08, "logits/chosen": -2.350940227508545, "logits/rejected": -2.220615863800049, "logps/chosen": -358.4539794921875, "logps/rejected": -464.9853515625, "loss": 0.5792, "rewards/accuracies": 0.75, "rewards/chosen": -0.7825900316238403, "rewards/margins": 0.9537205696105957, "rewards/margins_max": 2.3956093788146973, "rewards/margins_min": -0.213771253824234, "rewards/margins_std": 1.205636978149414, "rewards/rejected": -1.7363107204437256, "step": 3540 }, { "epoch": 0.85, "grad_norm": 13.880365107678578, "learning_rate": 3.345882518405918e-08, "logits/chosen": -2.240265369415283, "logits/rejected": -2.243673086166382, "logps/chosen": -281.1455078125, "logps/rejected": -381.0218811035156, "loss": 0.5475, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6216169595718384, "rewards/margins": 0.6986228823661804, "rewards/margins_max": 1.9428884983062744, "rewards/margins_min": -0.2776763439178467, "rewards/margins_std": 0.9915906190872192, "rewards/rejected": -1.3202400207519531, "step": 3550 }, { "epoch": 0.85, "grad_norm": 9.040807820913322, "learning_rate": 3.242193994333278e-08, "logits/chosen": -2.2032573223114014, "logits/rejected": -2.1340651512145996, "logps/chosen": -333.020751953125, "logps/rejected": -371.69854736328125, "loss": 0.5797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9424605369567871, "rewards/margins": 0.459073930978775, "rewards/margins_max": 1.7106342315673828, "rewards/margins_min": -0.6318844556808472, "rewards/margins_std": 1.0428351163864136, "rewards/rejected": -1.4015344381332397, "step": 3560 }, { "epoch": 0.85, "grad_norm": 7.050678994237729, "learning_rate": 3.14002601346591e-08, "logits/chosen": -2.2537546157836914, "logits/rejected": -2.2554469108581543, "logps/chosen": -354.36322021484375, "logps/rejected": -430.35205078125, "loss": 0.5381, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7406297922134399, "rewards/margins": 0.6457460522651672, "rewards/margins_max": 1.6351501941680908, "rewards/margins_min": -0.3264870047569275, "rewards/margins_std": 0.893252968788147, "rewards/rejected": -1.3863757848739624, "step": 3570 }, { "epoch": 0.86, "grad_norm": 11.209560019904606, "learning_rate": 3.039385715808121e-08, "logits/chosen": -2.357199192047119, "logits/rejected": -2.1211209297180176, "logps/chosen": -314.12640380859375, "logps/rejected": -344.11773681640625, "loss": 0.557, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7144862413406372, "rewards/margins": 0.664794385433197, "rewards/margins_max": 1.8493525981903076, "rewards/margins_min": -0.4147608280181885, "rewards/margins_std": 1.041449785232544, "rewards/rejected": -1.3792805671691895, "step": 3580 }, { "epoch": 0.86, "grad_norm": 12.537017332239243, "learning_rate": 2.9402801346021937e-08, "logits/chosen": -2.3326268196105957, "logits/rejected": -2.074568510055542, "logps/chosen": -413.81561279296875, "logps/rejected": -414.9437561035156, "loss": 0.5963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9259670376777649, "rewards/margins": 0.5444958209991455, "rewards/margins_max": 2.021422863006592, "rewards/margins_min": -0.5557324290275574, "rewards/margins_std": 1.1533992290496826, "rewards/rejected": -1.470462679862976, "step": 3590 }, { "epoch": 0.86, "grad_norm": 12.725584700965426, "learning_rate": 2.8427161958368002e-08, "logits/chosen": -2.2213528156280518, "logits/rejected": -2.0597572326660156, "logps/chosen": -368.15863037109375, "logps/rejected": -395.1624755859375, "loss": 0.5838, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8663617968559265, "rewards/margins": 0.5736824870109558, "rewards/margins_max": 1.7366317510604858, "rewards/margins_min": -0.505719780921936, "rewards/margins_std": 1.005513310432434, "rewards/rejected": -1.4400444030761719, "step": 3600 }, { "epoch": 0.86, "eval_logits/chosen": -2.1327924728393555, "eval_logits/rejected": -2.027250051498413, "eval_logps/chosen": -352.2793273925781, "eval_logps/rejected": -400.8924560546875, "eval_loss": 0.5783558487892151, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.8061626553535461, "eval_rewards/margins": 0.5903366208076477, "eval_rewards/margins_max": 2.6855058670043945, "eval_rewards/margins_min": -0.783281147480011, "eval_rewards/margins_std": 1.144561767578125, "eval_rewards/rejected": -1.3964993953704834, "eval_runtime": 1514.5548, "eval_samples_per_second": 2.641, "eval_steps_per_second": 0.165, "step": 3600 }, { "epoch": 0.86, "grad_norm": 8.219008060943812, "learning_rate": 2.7467007177630174e-08, "logits/chosen": -2.3055763244628906, "logits/rejected": -2.2115702629089355, "logps/chosen": -397.17376708984375, "logps/rejected": -479.5096130371094, "loss": 0.4846, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9177542924880981, "rewards/margins": 0.7675668001174927, "rewards/margins_max": 2.097571849822998, "rewards/margins_min": -0.4052547812461853, "rewards/margins_std": 1.1146230697631836, "rewards/rejected": -1.6853210926055908, "step": 3610 }, { "epoch": 0.87, "grad_norm": 14.531144467862758, "learning_rate": 2.652240410417819e-08, "logits/chosen": -2.3802151679992676, "logits/rejected": -2.176363706588745, "logps/chosen": -376.2300109863281, "logps/rejected": -373.46502685546875, "loss": 0.5841, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9010432362556458, "rewards/margins": 0.3579115867614746, "rewards/margins_max": 1.215152382850647, "rewards/margins_min": -0.6653003096580505, "rewards/margins_std": 0.8294261693954468, "rewards/rejected": -1.2589547634124756, "step": 3620 }, { "epoch": 0.87, "grad_norm": 7.5981156472615075, "learning_rate": 2.5593418751551437e-08, "logits/chosen": -2.3440566062927246, "logits/rejected": -2.1402828693389893, "logps/chosen": -399.32293701171875, "logps/rejected": -408.205322265625, "loss": 0.5495, "rewards/accuracies": 0.75, "rewards/chosen": -0.8664854764938354, "rewards/margins": 0.6742043495178223, "rewards/margins_max": 1.8542773723602295, "rewards/margins_min": -0.34133607149124146, "rewards/margins_std": 0.9912660717964172, "rewards/rejected": -1.5406900644302368, "step": 3630 }, { "epoch": 0.87, "grad_norm": 8.156932816844554, "learning_rate": 2.4680116041845834e-08, "logits/chosen": -2.1183152198791504, "logits/rejected": -2.061955451965332, "logps/chosen": -352.27838134765625, "logps/rejected": -430.496826171875, "loss": 0.5434, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8345056772232056, "rewards/margins": 0.7603470087051392, "rewards/margins_max": 2.2744932174682617, "rewards/margins_min": -0.5490296483039856, "rewards/margins_std": 1.2327032089233398, "rewards/rejected": -1.5948525667190552, "step": 3640 }, { "epoch": 0.87, "grad_norm": 8.699386259332465, "learning_rate": 2.3782559801176354e-08, "logits/chosen": -2.2701752185821533, "logits/rejected": -2.114495038986206, "logps/chosen": -382.0144348144531, "logps/rejected": -473.03741455078125, "loss": 0.5193, "rewards/accuracies": 0.75, "rewards/chosen": -0.8448399305343628, "rewards/margins": 0.8452709913253784, "rewards/margins_max": 2.237246036529541, "rewards/margins_min": -0.38787809014320374, "rewards/margins_std": 1.1944913864135742, "rewards/rejected": -1.6901108026504517, "step": 3650 }, { "epoch": 0.88, "grad_norm": 13.204759051989315, "learning_rate": 2.290081275521688e-08, "logits/chosen": -2.1678922176361084, "logits/rejected": -2.104832172393799, "logps/chosen": -333.69256591796875, "logps/rejected": -370.3411560058594, "loss": 0.593, "rewards/accuracies": 0.625, "rewards/chosen": -0.9413816332817078, "rewards/margins": 0.4408838748931885, "rewards/margins_max": 1.7454280853271484, "rewards/margins_min": -0.8067914843559265, "rewards/margins_std": 1.1289927959442139, "rewards/rejected": -1.382265329360962, "step": 3660 }, { "epoch": 0.88, "grad_norm": 8.6057276260426, "learning_rate": 2.2034936524816388e-08, "logits/chosen": -2.139958620071411, "logits/rejected": -2.1557977199554443, "logps/chosen": -367.608642578125, "logps/rejected": -477.5233459472656, "loss": 0.5918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8218317031860352, "rewards/margins": 0.617006242275238, "rewards/margins_max": 2.428431987762451, "rewards/margins_min": -0.7195814251899719, "rewards/margins_std": 1.4059988260269165, "rewards/rejected": -1.438838005065918, "step": 3670 }, { "epoch": 0.88, "grad_norm": 11.574737506304139, "learning_rate": 2.118499162169285e-08, "logits/chosen": -2.287055492401123, "logits/rejected": -2.043408155441284, "logps/chosen": -439.4178161621094, "logps/rejected": -448.04083251953125, "loss": 0.5616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9677483439445496, "rewards/margins": 0.7290508151054382, "rewards/margins_max": 2.051032543182373, "rewards/margins_min": -0.5750768184661865, "rewards/margins_std": 1.177049994468689, "rewards/rejected": -1.6967992782592773, "step": 3680 }, { "epoch": 0.88, "grad_norm": 10.452570651056707, "learning_rate": 2.035103744420408e-08, "logits/chosen": -2.343034267425537, "logits/rejected": -2.170119285583496, "logps/chosen": -423.13494873046875, "logps/rejected": -443.84210205078125, "loss": 0.5728, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8548452258110046, "rewards/margins": 0.6448922157287598, "rewards/margins_max": 1.8757946491241455, "rewards/margins_min": -0.265564501285553, "rewards/margins_std": 0.9764896631240845, "rewards/rejected": -1.4997375011444092, "step": 3690 }, { "epoch": 0.89, "grad_norm": 12.213828732411779, "learning_rate": 1.953313227319689e-08, "logits/chosen": -2.284311532974243, "logits/rejected": -2.094996929168701, "logps/chosen": -387.90838623046875, "logps/rejected": -431.8561096191406, "loss": 0.5567, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.811794638633728, "rewards/margins": 0.6923728585243225, "rewards/margins_max": 2.321518898010254, "rewards/margins_min": -0.552316427230835, "rewards/margins_std": 1.3003530502319336, "rewards/rejected": -1.5041675567626953, "step": 3700 }, { "epoch": 0.89, "eval_logits/chosen": -2.1356961727142334, "eval_logits/rejected": -2.030524492263794, "eval_logps/chosen": -352.0264892578125, "eval_logps/rejected": -400.4902648925781, "eval_loss": 0.5783910155296326, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.8036341667175293, "eval_rewards/margins": 0.5888431072235107, "eval_rewards/margins_max": 2.6775901317596436, "eval_rewards/margins_min": -0.7816547751426697, "eval_rewards/margins_std": 1.1418603658676147, "eval_rewards/rejected": -1.3924771547317505, "eval_runtime": 1532.5377, "eval_samples_per_second": 2.61, "eval_steps_per_second": 0.163, "step": 3700 }, { "epoch": 0.89, "grad_norm": 7.998463483121012, "learning_rate": 1.873133326793397e-08, "logits/chosen": -2.1978397369384766, "logits/rejected": -2.0661773681640625, "logps/chosen": -325.09112548828125, "logps/rejected": -377.57720947265625, "loss": 0.5789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7560814023017883, "rewards/margins": 0.588628888130188, "rewards/margins_max": 1.7896034717559814, "rewards/margins_min": -0.4375428259372711, "rewards/margins_std": 0.9815313220024109, "rewards/rejected": -1.3447102308273315, "step": 3710 }, { "epoch": 0.89, "grad_norm": 9.587621066884525, "learning_rate": 1.794569646209948e-08, "logits/chosen": -2.156107187271118, "logits/rejected": -2.0480566024780273, "logps/chosen": -376.7787780761719, "logps/rejected": -373.9519958496094, "loss": 0.5729, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9024326205253601, "rewards/margins": 0.3353961110115051, "rewards/margins_max": 1.3991725444793701, "rewards/margins_min": -0.5804914832115173, "rewards/margins_std": 0.8762849569320679, "rewards/rejected": -1.2378286123275757, "step": 3720 }, { "epoch": 0.89, "grad_norm": 9.692478008427342, "learning_rate": 1.7176276759883146e-08, "logits/chosen": -2.2497398853302, "logits/rejected": -2.194068193435669, "logps/chosen": -350.18963623046875, "logps/rejected": -401.6360168457031, "loss": 0.5504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7462789416313171, "rewards/margins": 0.7022184133529663, "rewards/margins_max": 1.886450171470642, "rewards/margins_min": -0.1826545149087906, "rewards/margins_std": 0.9383076429367065, "rewards/rejected": -1.4484975337982178, "step": 3730 }, { "epoch": 0.9, "grad_norm": 8.433648028426537, "learning_rate": 1.642312793214293e-08, "logits/chosen": -2.165067434310913, "logits/rejected": -2.06459379196167, "logps/chosen": -332.3847961425781, "logps/rejected": -419.3334045410156, "loss": 0.5189, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8300457000732422, "rewards/margins": 0.7776058912277222, "rewards/margins_max": 2.3387045860290527, "rewards/margins_min": -0.358219712972641, "rewards/margins_std": 1.225597620010376, "rewards/rejected": -1.6076514720916748, "step": 3740 }, { "epoch": 0.9, "grad_norm": 9.096705634775796, "learning_rate": 1.568630261264789e-08, "logits/chosen": -2.2922096252441406, "logits/rejected": -2.1537282466888428, "logps/chosen": -338.7301330566406, "logps/rejected": -365.5114440917969, "loss": 0.5827, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7972782254219055, "rewards/margins": 0.6061583757400513, "rewards/margins_max": 1.971874475479126, "rewards/margins_min": -0.3591035008430481, "rewards/margins_std": 1.039502501487732, "rewards/rejected": -1.4034364223480225, "step": 3750 }, { "epoch": 0.9, "grad_norm": 8.049749183301458, "learning_rate": 1.49658522943992e-08, "logits/chosen": -2.180112361907959, "logits/rejected": -2.077213764190674, "logps/chosen": -292.9327392578125, "logps/rejected": -396.641845703125, "loss": 0.5009, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7181032299995422, "rewards/margins": 0.7541881799697876, "rewards/margins_max": 2.266845464706421, "rewards/margins_min": -0.2670992612838745, "rewards/margins_std": 1.1569912433624268, "rewards/rejected": -1.4722915887832642, "step": 3760 }, { "epoch": 0.9, "grad_norm": 7.753746111775207, "learning_rate": 1.4261827326032122e-08, "logits/chosen": -2.271407127380371, "logits/rejected": -2.1097826957702637, "logps/chosen": -387.440185546875, "logps/rejected": -433.08282470703125, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8959513902664185, "rewards/margins": 0.6993917226791382, "rewards/margins_max": 2.1504006385803223, "rewards/margins_min": -0.42309173941612244, "rewards/margins_std": 1.1664221286773682, "rewards/rejected": -1.5953431129455566, "step": 3770 }, { "epoch": 0.91, "grad_norm": 13.042700258355126, "learning_rate": 1.3574276908296906e-08, "logits/chosen": -2.1976513862609863, "logits/rejected": -2.0769054889678955, "logps/chosen": -297.0116882324219, "logps/rejected": -395.8526916503906, "loss": 0.5212, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7780067920684814, "rewards/margins": 0.821733295917511, "rewards/margins_max": 2.15226411819458, "rewards/margins_min": -0.2936638295650482, "rewards/margins_std": 1.1076699495315552, "rewards/rejected": -1.5997402667999268, "step": 3780 }, { "epoch": 0.91, "grad_norm": 12.284297341795362, "learning_rate": 1.2903249090620849e-08, "logits/chosen": -2.286663770675659, "logits/rejected": -2.1142868995666504, "logps/chosen": -408.79931640625, "logps/rejected": -443.96783447265625, "loss": 0.5491, "rewards/accuracies": 0.75, "rewards/chosen": -0.8428421020507812, "rewards/margins": 0.7392042875289917, "rewards/margins_max": 2.0289976596832275, "rewards/margins_min": -0.35057735443115234, "rewards/margins_std": 1.0364800691604614, "rewards/rejected": -1.5820465087890625, "step": 3790 }, { "epoch": 0.91, "grad_norm": 10.241534575274496, "learning_rate": 1.2248790767750012e-08, "logits/chosen": -2.1532607078552246, "logits/rejected": -2.135650873184204, "logps/chosen": -297.8315124511719, "logps/rejected": -405.40570068359375, "loss": 0.5429, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8696303367614746, "rewards/margins": 0.7533867955207825, "rewards/margins_max": 2.06772518157959, "rewards/margins_min": -0.2650609612464905, "rewards/margins_std": 1.0539860725402832, "rewards/rejected": -1.6230170726776123, "step": 3800 }, { "epoch": 0.91, "eval_logits/chosen": -2.131572723388672, "eval_logits/rejected": -2.0261316299438477, "eval_logps/chosen": -353.10443115234375, "eval_logps/rejected": -402.15386962890625, "eval_loss": 0.5783755779266357, "eval_rewards/accuracies": 0.6934999823570251, "eval_rewards/chosen": -0.8144132494926453, "eval_rewards/margins": 0.5946996808052063, "eval_rewards/margins_max": 2.7068185806274414, "eval_rewards/margins_min": -0.7897871732711792, "eval_rewards/margins_std": 1.153843641281128, "eval_rewards/rejected": -1.4091129302978516, "eval_runtime": 1521.7637, "eval_samples_per_second": 2.629, "eval_steps_per_second": 0.164, "step": 3800 }, { "epoch": 0.91, "grad_norm": 17.62329093276228, "learning_rate": 1.1610947676472277e-08, "logits/chosen": -2.2370047569274902, "logits/rejected": -2.129960536956787, "logps/chosen": -371.01580810546875, "logps/rejected": -443.95928955078125, "loss": 0.5977, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9854887127876282, "rewards/margins": 0.7810400128364563, "rewards/margins_max": 2.7080159187316895, "rewards/margins_min": -0.7159267663955688, "rewards/margins_std": 1.5382016897201538, "rewards/rejected": -1.7665287256240845, "step": 3810 }, { "epoch": 0.91, "grad_norm": 9.659019690266353, "learning_rate": 1.0989764392420692e-08, "logits/chosen": -2.325295925140381, "logits/rejected": -2.1801047325134277, "logps/chosen": -401.95135498046875, "logps/rejected": -454.5005798339844, "loss": 0.5203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9426606893539429, "rewards/margins": 0.6538997888565063, "rewards/margins_max": 2.292274236679077, "rewards/margins_min": -0.41830310225486755, "rewards/margins_std": 1.2633287906646729, "rewards/rejected": -1.5965605974197388, "step": 3820 }, { "epoch": 0.92, "grad_norm": 8.438850803329762, "learning_rate": 1.0385284326958593e-08, "logits/chosen": -2.314499616622925, "logits/rejected": -2.119990587234497, "logps/chosen": -383.75335693359375, "logps/rejected": -410.32745361328125, "loss": 0.5725, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.748749315738678, "rewards/margins": 0.6275306940078735, "rewards/margins_max": 1.7518634796142578, "rewards/margins_min": -0.39614757895469666, "rewards/margins_std": 0.9608545303344727, "rewards/rejected": -1.3762798309326172, "step": 3830 }, { "epoch": 0.92, "grad_norm": 10.788202935188083, "learning_rate": 9.797549724145731e-09, "logits/chosen": -2.391742467880249, "logits/rejected": -2.169917583465576, "logps/chosen": -397.5116271972656, "logps/rejected": -425.3232421875, "loss": 0.5199, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.83563232421875, "rewards/margins": 0.6723750829696655, "rewards/margins_max": 2.1205077171325684, "rewards/margins_min": -0.48921841382980347, "rewards/margins_std": 1.1693025827407837, "rewards/rejected": -1.508007526397705, "step": 3840 }, { "epoch": 0.92, "grad_norm": 8.37864264226942, "learning_rate": 9.226601657785993e-09, "logits/chosen": -2.2369818687438965, "logits/rejected": -2.2329630851745605, "logps/chosen": -332.69207763671875, "logps/rejected": -467.9469299316406, "loss": 0.5678, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8125410079956055, "rewards/margins": 0.8859883546829224, "rewards/margins_max": 3.1938388347625732, "rewards/margins_min": -0.5991750359535217, "rewards/margins_std": 1.735571265220642, "rewards/rejected": -1.6985292434692383, "step": 3850 }, { "epoch": 0.92, "grad_norm": 8.340087544006911, "learning_rate": 8.672480028556972e-09, "logits/chosen": -2.0964598655700684, "logits/rejected": -2.0185728073120117, "logps/chosen": -333.4912109375, "logps/rejected": -419.2801208496094, "loss": 0.5414, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.813727080821991, "rewards/margins": 0.5758689641952515, "rewards/margins_max": 1.8022687435150146, "rewards/margins_min": -0.3718183636665344, "rewards/margins_std": 1.0023366212844849, "rewards/rejected": -1.3895961046218872, "step": 3860 }, { "epoch": 0.93, "grad_norm": 9.195239900547092, "learning_rate": 8.13522356122151e-09, "logits/chosen": -2.369772434234619, "logits/rejected": -2.1620211601257324, "logps/chosen": -345.564697265625, "logps/rejected": -394.4166259765625, "loss": 0.5905, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7392550706863403, "rewards/margins": 0.5238005518913269, "rewards/margins_max": 1.7102943658828735, "rewards/margins_min": -0.2584485411643982, "rewards/margins_std": 0.8947032690048218, "rewards/rejected": -1.2630555629730225, "step": 3870 }, { "epoch": 0.93, "grad_norm": 9.66148180414658, "learning_rate": 7.614869801921525e-09, "logits/chosen": -2.204747200012207, "logits/rejected": -2.1386661529541016, "logps/chosen": -353.8135070800781, "logps/rejected": -382.6553955078125, "loss": 0.5788, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9248906373977661, "rewards/margins": 0.3916553854942322, "rewards/margins_max": 1.3607351779937744, "rewards/margins_min": -0.6305935978889465, "rewards/margins_std": 0.8848153352737427, "rewards/rejected": -1.316546082496643, "step": 3880 }, { "epoch": 0.93, "grad_norm": 20.94631635742458, "learning_rate": 7.111455115553944e-09, "logits/chosen": -2.1940722465515137, "logits/rejected": -2.1592557430267334, "logps/chosen": -340.5252380371094, "logps/rejected": -443.606201171875, "loss": 0.5936, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9086848497390747, "rewards/margins": 0.6710085868835449, "rewards/margins_max": 2.46950626373291, "rewards/margins_min": -0.5943151116371155, "rewards/margins_std": 1.388077974319458, "rewards/rejected": -1.5796934366226196, "step": 3890 }, { "epoch": 0.93, "grad_norm": 17.987973752122034, "learning_rate": 6.6250146832294296e-09, "logits/chosen": -2.257678508758545, "logits/rejected": -2.108079671859741, "logps/chosen": -360.92510986328125, "logps/rejected": -390.8533020019531, "loss": 0.5582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8340743184089661, "rewards/margins": 0.6402066946029663, "rewards/margins_max": 1.9235937595367432, "rewards/margins_min": -0.5671850442886353, "rewards/margins_std": 1.1226530075073242, "rewards/rejected": -1.4742809534072876, "step": 3900 }, { "epoch": 0.93, "eval_logits/chosen": -2.1322901248931885, "eval_logits/rejected": -2.026811361312866, "eval_logps/chosen": -353.2459411621094, "eval_logps/rejected": -402.41162109375, "eval_loss": 0.5784348249435425, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.8158286213874817, "eval_rewards/margins": 0.5958622097969055, "eval_rewards/margins_max": 2.7125163078308105, "eval_rewards/margins_min": -0.791420578956604, "eval_rewards/margins_std": 1.1563152074813843, "eval_rewards/rejected": -1.4116909503936768, "eval_runtime": 1500.7733, "eval_samples_per_second": 2.665, "eval_steps_per_second": 0.167, "step": 3900 }, { "epoch": 0.94, "grad_norm": 13.36570402438206, "learning_rate": 6.155582499813655e-09, "logits/chosen": -2.2506978511810303, "logits/rejected": -2.1304123401641846, "logps/chosen": -370.6759338378906, "logps/rejected": -417.4082946777344, "loss": 0.6375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9938521385192871, "rewards/margins": 0.46526747941970825, "rewards/margins_max": 1.7388111352920532, "rewards/margins_min": -0.5178717374801636, "rewards/margins_std": 0.9952529668807983, "rewards/rejected": -1.4591195583343506, "step": 3910 }, { "epoch": 0.94, "grad_norm": 10.29716285222176, "learning_rate": 5.703191371551841e-09, "logits/chosen": -2.2457427978515625, "logits/rejected": -1.9213478565216064, "logps/chosen": -445.5091247558594, "logps/rejected": -417.0675354003906, "loss": 0.5531, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9814988374710083, "rewards/margins": 0.582013726234436, "rewards/margins_max": 1.7901108264923096, "rewards/margins_min": -0.47941121459007263, "rewards/margins_std": 1.0097873210906982, "rewards/rejected": -1.5635125637054443, "step": 3920 }, { "epoch": 0.94, "grad_norm": 12.268456082508406, "learning_rate": 5.267872913775756e-09, "logits/chosen": -2.385425567626953, "logits/rejected": -2.2777578830718994, "logps/chosen": -351.8330078125, "logps/rejected": -361.5228576660156, "loss": 0.5746, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7818518280982971, "rewards/margins": 0.47268444299697876, "rewards/margins_max": 1.4626641273498535, "rewards/margins_min": -0.5286623239517212, "rewards/margins_std": 0.9073006510734558, "rewards/rejected": -1.2545362710952759, "step": 3930 }, { "epoch": 0.94, "grad_norm": 8.220292675441787, "learning_rate": 4.8496575486943744e-09, "logits/chosen": -2.3342947959899902, "logits/rejected": -2.0535120964050293, "logps/chosen": -421.1119689941406, "logps/rejected": -441.86199951171875, "loss": 0.5446, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9969315528869629, "rewards/margins": 0.7184998989105225, "rewards/margins_max": 2.1075263023376465, "rewards/margins_min": -0.5700963139533997, "rewards/margins_std": 1.209326982498169, "rewards/rejected": -1.7154314517974854, "step": 3940 }, { "epoch": 0.95, "grad_norm": 5.9183741868131925, "learning_rate": 4.448574503268076e-09, "logits/chosen": -2.155308246612549, "logits/rejected": -2.046985149383545, "logps/chosen": -334.6973876953125, "logps/rejected": -409.38458251953125, "loss": 0.5778, "rewards/accuracies": 0.625, "rewards/chosen": -0.8813613653182983, "rewards/margins": 0.5831890106201172, "rewards/margins_max": 1.9092601537704468, "rewards/margins_min": -0.5654799342155457, "rewards/margins_std": 1.1305408477783203, "rewards/rejected": -1.464550256729126, "step": 3950 }, { "epoch": 0.95, "grad_norm": 27.78497145350117, "learning_rate": 4.064651807165781e-09, "logits/chosen": -2.242802858352661, "logits/rejected": -2.088991641998291, "logps/chosen": -318.96124267578125, "logps/rejected": -378.01226806640625, "loss": 0.5595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8208603858947754, "rewards/margins": 0.8398618698120117, "rewards/margins_max": 2.3969340324401855, "rewards/margins_min": -0.3654637336730957, "rewards/margins_std": 1.2372076511383057, "rewards/rejected": -1.6607223749160767, "step": 3960 }, { "epoch": 0.95, "grad_norm": 8.874727486738237, "learning_rate": 3.697916290806291e-09, "logits/chosen": -2.386133909225464, "logits/rejected": -2.1743955612182617, "logps/chosen": -359.5025939941406, "logps/rejected": -376.75970458984375, "loss": 0.5543, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8064457178115845, "rewards/margins": 0.6292827725410461, "rewards/margins_max": 1.8308775424957275, "rewards/margins_min": -0.42312318086624146, "rewards/margins_std": 0.9967203140258789, "rewards/rejected": -1.4357284307479858, "step": 3970 }, { "epoch": 0.95, "grad_norm": 15.005184040714166, "learning_rate": 3.3483935834831e-09, "logits/chosen": -2.2909064292907715, "logits/rejected": -2.117558002471924, "logps/chosen": -373.4837341308594, "logps/rejected": -412.1748962402344, "loss": 0.5407, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8489245176315308, "rewards/margins": 0.6362956762313843, "rewards/margins_max": 2.146629810333252, "rewards/margins_min": -0.4375552237033844, "rewards/margins_std": 1.192993402481079, "rewards/rejected": -1.485220193862915, "step": 3980 }, { "epoch": 0.96, "grad_norm": 8.804975178308284, "learning_rate": 3.0161081115735456e-09, "logits/chosen": -2.2982571125030518, "logits/rejected": -2.1207261085510254, "logps/chosen": -391.1363830566406, "logps/rejected": -421.528564453125, "loss": 0.5557, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8516534566879272, "rewards/margins": 0.5386523008346558, "rewards/margins_max": 1.8274084329605103, "rewards/margins_min": -0.42443856596946716, "rewards/margins_std": 0.996340274810791, "rewards/rejected": -1.390305757522583, "step": 3990 }, { "epoch": 0.96, "grad_norm": 8.792486312287203, "learning_rate": 2.7010830968314802e-09, "logits/chosen": -2.1945462226867676, "logits/rejected": -2.1107583045959473, "logps/chosen": -321.17218017578125, "logps/rejected": -388.5024108886719, "loss": 0.5487, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6967689394950867, "rewards/margins": 0.7160161137580872, "rewards/margins_max": 1.580930471420288, "rewards/margins_min": -0.14464738965034485, "rewards/margins_std": 0.7794433832168579, "rewards/rejected": -1.4127850532531738, "step": 4000 }, { "epoch": 0.96, "eval_logits/chosen": -2.1295669078826904, "eval_logits/rejected": -2.023876190185547, "eval_logps/chosen": -352.8768005371094, "eval_logps/rejected": -401.8472595214844, "eval_loss": 0.5783280730247498, "eval_rewards/accuracies": 0.6915000081062317, "eval_rewards/chosen": -0.8121373653411865, "eval_rewards/margins": 0.5939096808433533, "eval_rewards/margins_max": 2.7036924362182617, "eval_rewards/margins_min": -0.7878880500793457, "eval_rewards/margins_std": 1.1522502899169922, "eval_rewards/rejected": -1.4060471057891846, "eval_runtime": 1499.8852, "eval_samples_per_second": 2.667, "eval_steps_per_second": 0.167, "step": 4000 }, { "epoch": 0.96, "grad_norm": 14.323654159516883, "learning_rate": 2.4033405547646545e-09, "logits/chosen": -2.2879276275634766, "logits/rejected": -2.1684353351593018, "logps/chosen": -317.5887451171875, "logps/rejected": -447.36041259765625, "loss": 0.5374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8081300854682922, "rewards/margins": 0.7110003232955933, "rewards/margins_max": 2.1918561458587646, "rewards/margins_min": -0.2621138095855713, "rewards/margins_std": 1.1125352382659912, "rewards/rejected": -1.5191304683685303, "step": 4010 }, { "epoch": 0.96, "grad_norm": 17.400477330467833, "learning_rate": 2.122901293095919e-09, "logits/chosen": -2.2576680183410645, "logits/rejected": -2.125464916229248, "logps/chosen": -350.72723388671875, "logps/rejected": -431.97528076171875, "loss": 0.5712, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7537821531295776, "rewards/margins": 0.8883110284805298, "rewards/margins_max": 2.729142665863037, "rewards/margins_min": -0.33858248591423035, "rewards/margins_std": 1.4183647632598877, "rewards/rejected": -1.642093300819397, "step": 4020 }, { "epoch": 0.97, "grad_norm": 21.55595816184914, "learning_rate": 1.8597849103094143e-09, "logits/chosen": -2.2039694786071777, "logits/rejected": -2.1063923835754395, "logps/chosen": -360.505859375, "logps/rejected": -416.98895263671875, "loss": 0.5925, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8756182789802551, "rewards/margins": 0.5064601302146912, "rewards/margins_max": 1.7495571374893188, "rewards/margins_min": -0.37489575147628784, "rewards/margins_std": 0.9356142282485962, "rewards/rejected": -1.3820784091949463, "step": 4030 }, { "epoch": 0.97, "grad_norm": 11.566852801874603, "learning_rate": 1.614009794280613e-09, "logits/chosen": -2.32755184173584, "logits/rejected": -2.1789393424987793, "logps/chosen": -384.1612854003906, "logps/rejected": -434.8182678222656, "loss": 0.5293, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0300973653793335, "rewards/margins": 0.5265058279037476, "rewards/margins_max": 1.5259325504302979, "rewards/margins_min": -0.533371090888977, "rewards/margins_std": 0.9078731536865234, "rewards/rejected": -1.5566033124923706, "step": 4040 }, { "epoch": 0.97, "grad_norm": 8.8325283715413, "learning_rate": 1.3855931209914295e-09, "logits/chosen": -2.361175775527954, "logits/rejected": -2.2897026538848877, "logps/chosen": -363.19451904296875, "logps/rejected": -436.4124450683594, "loss": 0.5682, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.904403805732727, "rewards/margins": 0.5715718269348145, "rewards/margins_max": 1.9459638595581055, "rewards/margins_min": -0.6068178415298462, "rewards/margins_std": 1.1224706172943115, "rewards/rejected": -1.4759756326675415, "step": 4050 }, { "epoch": 0.97, "grad_norm": 9.90820832255186, "learning_rate": 1.1745508533298754e-09, "logits/chosen": -2.3179311752319336, "logits/rejected": -2.170384168624878, "logps/chosen": -367.3550109863281, "logps/rejected": -389.77069091796875, "loss": 0.5568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9128246307373047, "rewards/margins": 0.5387172698974609, "rewards/margins_max": 1.665052056312561, "rewards/margins_min": -0.40296775102615356, "rewards/margins_std": 0.9177685976028442, "rewards/rejected": -1.4515416622161865, "step": 4060 }, { "epoch": 0.97, "grad_norm": 7.330300860091821, "learning_rate": 9.808977399744511e-10, "logits/chosen": -2.2126502990722656, "logits/rejected": -2.1006791591644287, "logps/chosen": -349.21795654296875, "logps/rejected": -390.6170959472656, "loss": 0.6207, "rewards/accuracies": 0.625, "rewards/chosen": -0.928858757019043, "rewards/margins": 0.5110155940055847, "rewards/margins_max": 2.1441538333892822, "rewards/margins_min": -0.7736586332321167, "rewards/margins_std": 1.3188291788101196, "rewards/rejected": -1.439874529838562, "step": 4070 }, { "epoch": 0.98, "grad_norm": 8.20218837658823, "learning_rate": 8.046473143635268e-10, "logits/chosen": -2.2460570335388184, "logits/rejected": -2.1219441890716553, "logps/chosen": -344.0184631347656, "logps/rejected": -410.72998046875, "loss": 0.5447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7547734379768372, "rewards/margins": 0.7155425548553467, "rewards/margins_max": 1.843017578125, "rewards/margins_min": -0.38036468625068665, "rewards/margins_std": 1.0035768747329712, "rewards/rejected": -1.470315933227539, "step": 4080 }, { "epoch": 0.98, "grad_norm": 8.368618939764456, "learning_rate": 6.458118937494317e-10, "logits/chosen": -2.1966946125030518, "logits/rejected": -2.1100621223449707, "logps/chosen": -365.1218566894531, "logps/rejected": -452.59197998046875, "loss": 0.5314, "rewards/accuracies": 0.75, "rewards/chosen": -0.782892644405365, "rewards/margins": 0.9436525106430054, "rewards/margins_max": 2.6691668033599854, "rewards/margins_min": -0.24203410744667053, "rewards/margins_std": 1.2975715398788452, "rewards/rejected": -1.7265453338623047, "step": 4090 }, { "epoch": 0.98, "grad_norm": 16.044423422149478, "learning_rate": 5.044025783377259e-10, "logits/chosen": -2.386979341506958, "logits/rejected": -2.2479074001312256, "logps/chosen": -389.9580078125, "logps/rejected": -435.470947265625, "loss": 0.5322, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8630214929580688, "rewards/margins": 0.7707746624946594, "rewards/margins_max": 1.8760143518447876, "rewards/margins_min": -0.251091331243515, "rewards/margins_std": 0.9490987658500671, "rewards/rejected": -1.6337960958480835, "step": 4100 }, { "epoch": 0.98, "eval_logits/chosen": -2.133305072784424, "eval_logits/rejected": -2.027928590774536, "eval_logps/chosen": -353.0736389160156, "eval_logps/rejected": -402.1445617675781, "eval_loss": 0.5783573985099792, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.814105749130249, "eval_rewards/margins": 0.594914436340332, "eval_rewards/margins_max": 2.709044933319092, "eval_rewards/margins_min": -0.7893127202987671, "eval_rewards/margins_std": 1.154624581336975, "eval_rewards/rejected": -1.4090203046798706, "eval_runtime": 1500.0264, "eval_samples_per_second": 2.667, "eval_steps_per_second": 0.167, "step": 4100 }, { "epoch": 0.98, "grad_norm": 8.683035382548539, "learning_rate": 3.8042925051148813e-10, "logits/chosen": -2.2525622844696045, "logits/rejected": -2.1328423023223877, "logps/chosen": -367.0737609863281, "logps/rejected": -412.88494873046875, "loss": 0.5563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8216296434402466, "rewards/margins": 0.6967242360115051, "rewards/margins_max": 2.2108843326568604, "rewards/margins_min": -0.3301966190338135, "rewards/margins_std": 1.1610016822814941, "rewards/rejected": -1.5183539390563965, "step": 4110 }, { "epoch": 0.99, "grad_norm": 7.22261743034795, "learning_rate": 2.7390057414064525e-10, "logits/chosen": -2.2294363975524902, "logits/rejected": -2.149284839630127, "logps/chosen": -391.35015869140625, "logps/rejected": -419.497314453125, "loss": 0.5569, "rewards/accuracies": 0.6875, "rewards/chosen": -0.974208652973175, "rewards/margins": 0.4896170198917389, "rewards/margins_max": 1.7576297521591187, "rewards/margins_min": -0.7427107095718384, "rewards/margins_std": 1.1269792318344116, "rewards/rejected": -1.4638257026672363, "step": 4120 }, { "epoch": 0.99, "grad_norm": 9.229836421686668, "learning_rate": 1.8482399397654057e-10, "logits/chosen": -2.3102428913116455, "logits/rejected": -2.232504367828369, "logps/chosen": -380.5609130859375, "logps/rejected": -438.815673828125, "loss": 0.5815, "rewards/accuracies": 0.75, "rewards/chosen": -0.8976086378097534, "rewards/margins": 0.49089011549949646, "rewards/margins_max": 1.4279097318649292, "rewards/margins_min": -0.4023559093475342, "rewards/margins_std": 0.8319241404533386, "rewards/rejected": -1.3884989023208618, "step": 4130 }, { "epoch": 0.99, "grad_norm": 36.78824042800991, "learning_rate": 1.1320573513159959e-10, "logits/chosen": -2.2408447265625, "logits/rejected": -2.127920627593994, "logps/chosen": -349.3087463378906, "logps/rejected": -365.7108154296875, "loss": 0.6128, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9973850250244141, "rewards/margins": 0.3007507622241974, "rewards/margins_max": 1.6107676029205322, "rewards/margins_min": -0.9421485066413879, "rewards/margins_std": 1.1356065273284912, "rewards/rejected": -1.298135757446289, "step": 4140 }, { "epoch": 0.99, "grad_norm": 16.430632922750952, "learning_rate": 5.905080264431705e-11, "logits/chosen": -2.243288993835449, "logits/rejected": -2.1521263122558594, "logps/chosen": -359.5000915527344, "logps/rejected": -432.80413818359375, "loss": 0.5703, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8617836236953735, "rewards/margins": 0.8007880449295044, "rewards/margins_max": 2.3553054332733154, "rewards/margins_min": -0.3089454174041748, "rewards/margins_std": 1.2010096311569214, "rewards/rejected": -1.662571668624878, "step": 4150 }, { "epoch": 1.0, "grad_norm": 9.217794077822134, "learning_rate": 2.2362981129508963e-11, "logits/chosen": -2.2625725269317627, "logits/rejected": -2.132338047027588, "logps/chosen": -351.02301025390625, "logps/rejected": -425.73431396484375, "loss": 0.5467, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6961579322814941, "rewards/margins": 0.786538302898407, "rewards/margins_max": 1.9862388372421265, "rewards/margins_min": -0.15629743039608002, "rewards/margins_std": 0.9888018369674683, "rewards/rejected": -1.4826964139938354, "step": 4160 }, { "epoch": 1.0, "grad_norm": 19.632246684047175, "learning_rate": 3.144834513746364e-12, "logits/chosen": -2.3043723106384277, "logits/rejected": -2.19461727142334, "logps/chosen": -388.92059326171875, "logps/rejected": -429.6858825683594, "loss": 0.5429, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8300372362136841, "rewards/margins": 0.6888028383255005, "rewards/margins_max": 1.9750791788101196, "rewards/margins_min": -0.39340880513191223, "rewards/margins_std": 1.0533735752105713, "rewards/rejected": -1.5188400745391846, "step": 4170 }, { "epoch": 1.0, "step": 4176, "total_flos": 0.0, "train_loss": 0.6004282865259383, "train_runtime": 118629.0501, "train_samples_per_second": 0.563, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 4176, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }