{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 371900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"grad_norm": 1.0679160356521606,
"learning_rate": 3.125e-06,
"loss": 7.5946,
"step": 1000
},
{
"epoch": 0.11,
"grad_norm": 0.7341670989990234,
"learning_rate": 6.25e-06,
"loss": 5.8196,
"step": 2000
},
{
"epoch": 0.16,
"grad_norm": 0.8071977496147156,
"learning_rate": 9.375000000000001e-06,
"loss": 5.3842,
"step": 3000
},
{
"epoch": 0.22,
"grad_norm": 0.9924404621124268,
"learning_rate": 1.25e-05,
"loss": 5.1603,
"step": 4000
},
{
"epoch": 0.27,
"grad_norm": 0.9524043202400208,
"learning_rate": 1.5625e-05,
"loss": 4.999,
"step": 5000
},
{
"epoch": 0.32,
"grad_norm": 1.0067890882492065,
"learning_rate": 1.8750000000000002e-05,
"loss": 4.8622,
"step": 6000
},
{
"epoch": 0.38,
"grad_norm": 1.0616331100463867,
"learning_rate": 2.1875e-05,
"loss": 4.7445,
"step": 7000
},
{
"epoch": 0.43,
"grad_norm": 1.4308688640594482,
"learning_rate": 2.5e-05,
"loss": 4.6507,
"step": 8000
},
{
"epoch": 0.48,
"grad_norm": 1.0376019477844238,
"learning_rate": 2.8125000000000003e-05,
"loss": 4.558,
"step": 9000
},
{
"epoch": 0.54,
"grad_norm": 1.0812149047851562,
"learning_rate": 3.125e-05,
"loss": 4.4792,
"step": 10000
},
{
"epoch": 0.59,
"grad_norm": 1.039007544517517,
"learning_rate": 3.4375e-05,
"loss": 4.4158,
"step": 11000
},
{
"epoch": 0.65,
"grad_norm": 1.0472838878631592,
"learning_rate": 3.7500000000000003e-05,
"loss": 4.3502,
"step": 12000
},
{
"epoch": 0.7,
"grad_norm": 1.0667674541473389,
"learning_rate": 4.061875e-05,
"loss": 4.2864,
"step": 13000
},
{
"epoch": 0.75,
"grad_norm": 1.0470460653305054,
"learning_rate": 4.374375e-05,
"loss": 4.2366,
"step": 14000
},
{
"epoch": 0.81,
"grad_norm": 1.0131665468215942,
"learning_rate": 4.6865625e-05,
"loss": 4.1872,
"step": 15000
},
{
"epoch": 0.86,
"grad_norm": 1.0265806913375854,
"learning_rate": 4.9990625000000004e-05,
"loss": 4.1391,
"step": 16000
},
{
"epoch": 0.91,
"grad_norm": 0.9650479555130005,
"learning_rate": 5.3115625000000005e-05,
"loss": 4.0979,
"step": 17000
},
{
"epoch": 0.97,
"grad_norm": 1.008900761604309,
"learning_rate": 5.6240625e-05,
"loss": 4.0574,
"step": 18000
},
{
"epoch": 1.0,
"eval_accuracy": 0.3094689486155737,
"eval_loss": 4.26678466796875,
"eval_runtime": 152.067,
"eval_samples_per_second": 380.891,
"eval_steps_per_second": 5.958,
"step": 18595
},
{
"epoch": 1.02,
"grad_norm": 0.9984724521636963,
"learning_rate": 5.93625e-05,
"loss": 4.0058,
"step": 19000
},
{
"epoch": 1.08,
"grad_norm": 1.0308632850646973,
"learning_rate": 6.24875e-05,
"loss": 3.9657,
"step": 20000
},
{
"epoch": 1.13,
"grad_norm": 0.9864353537559509,
"learning_rate": 6.56125e-05,
"loss": 3.925,
"step": 21000
},
{
"epoch": 1.18,
"grad_norm": 1.0101842880249023,
"learning_rate": 6.8734375e-05,
"loss": 3.8953,
"step": 22000
},
{
"epoch": 1.24,
"grad_norm": 0.9528496861457825,
"learning_rate": 7.185937500000001e-05,
"loss": 3.8541,
"step": 23000
},
{
"epoch": 1.29,
"grad_norm": 1.021189570426941,
"learning_rate": 7.4978125e-05,
"loss": 3.8288,
"step": 24000
},
{
"epoch": 1.34,
"grad_norm": 0.9835717082023621,
"learning_rate": 7.8103125e-05,
"loss": 3.7995,
"step": 25000
},
{
"epoch": 1.4,
"grad_norm": 1.0188689231872559,
"learning_rate": 8.1228125e-05,
"loss": 3.7699,
"step": 26000
},
{
"epoch": 1.45,
"grad_norm": 0.9111623764038086,
"learning_rate": 8.435e-05,
"loss": 3.7484,
"step": 27000
},
{
"epoch": 1.51,
"grad_norm": 0.8624160289764404,
"learning_rate": 8.747500000000001e-05,
"loss": 3.7345,
"step": 28000
},
{
"epoch": 1.56,
"grad_norm": 0.8700354695320129,
"learning_rate": 9.0596875e-05,
"loss": 3.7102,
"step": 29000
},
{
"epoch": 1.61,
"grad_norm": 0.8851209878921509,
"learning_rate": 9.3721875e-05,
"loss": 3.6873,
"step": 30000
},
{
"epoch": 1.67,
"grad_norm": 0.8330232501029968,
"learning_rate": 9.684375e-05,
"loss": 3.6671,
"step": 31000
},
{
"epoch": 1.72,
"grad_norm": 0.8477538824081421,
"learning_rate": 9.9965625e-05,
"loss": 3.6533,
"step": 32000
},
{
"epoch": 1.77,
"grad_norm": 0.8847412467002869,
"learning_rate": 9.970903206825538e-05,
"loss": 3.6399,
"step": 33000
},
{
"epoch": 1.83,
"grad_norm": 0.8788692951202393,
"learning_rate": 9.941482789055604e-05,
"loss": 3.6138,
"step": 34000
},
{
"epoch": 1.88,
"grad_norm": 0.8514883518218994,
"learning_rate": 9.912091791703443e-05,
"loss": 3.6049,
"step": 35000
},
{
"epoch": 1.94,
"grad_norm": 0.8380881547927856,
"learning_rate": 9.882671373933511e-05,
"loss": 3.5864,
"step": 36000
},
{
"epoch": 1.99,
"grad_norm": 0.7953729033470154,
"learning_rate": 9.853250956163578e-05,
"loss": 3.574,
"step": 37000
},
{
"epoch": 2.0,
"eval_accuracy": 0.36377712108475674,
"eval_loss": 3.7340097427368164,
"eval_runtime": 153.4304,
"eval_samples_per_second": 377.507,
"eval_steps_per_second": 5.905,
"step": 37190
},
{
"epoch": 2.04,
"grad_norm": 0.8078764081001282,
"learning_rate": 9.823830538393646e-05,
"loss": 3.5281,
"step": 38000
},
{
"epoch": 2.1,
"grad_norm": 0.8253265619277954,
"learning_rate": 9.794439541041483e-05,
"loss": 3.52,
"step": 39000
},
{
"epoch": 2.15,
"grad_norm": 0.8200889825820923,
"learning_rate": 9.765048543689321e-05,
"loss": 3.5065,
"step": 40000
},
{
"epoch": 2.2,
"grad_norm": 0.7781252861022949,
"learning_rate": 9.735628125919388e-05,
"loss": 3.4945,
"step": 41000
},
{
"epoch": 2.26,
"grad_norm": 0.7886627912521362,
"learning_rate": 9.706207708149456e-05,
"loss": 3.4924,
"step": 42000
},
{
"epoch": 2.31,
"grad_norm": 0.7948366403579712,
"learning_rate": 9.676787290379524e-05,
"loss": 3.4834,
"step": 43000
},
{
"epoch": 2.37,
"grad_norm": 0.7732539176940918,
"learning_rate": 9.647425713445132e-05,
"loss": 3.4722,
"step": 44000
},
{
"epoch": 2.42,
"grad_norm": 0.769277036190033,
"learning_rate": 9.618005295675198e-05,
"loss": 3.4659,
"step": 45000
},
{
"epoch": 2.47,
"grad_norm": 0.7610235214233398,
"learning_rate": 9.588584877905267e-05,
"loss": 3.4572,
"step": 46000
},
{
"epoch": 2.53,
"grad_norm": 0.7955553531646729,
"learning_rate": 9.559164460135335e-05,
"loss": 3.4458,
"step": 47000
},
{
"epoch": 2.58,
"grad_norm": 0.772249698638916,
"learning_rate": 9.529773462783173e-05,
"loss": 3.4369,
"step": 48000
},
{
"epoch": 2.64,
"grad_norm": 0.7617048025131226,
"learning_rate": 9.50035304501324e-05,
"loss": 3.4327,
"step": 49000
},
{
"epoch": 2.69,
"grad_norm": 0.7853600382804871,
"learning_rate": 9.470962047661077e-05,
"loss": 3.4226,
"step": 50000
},
{
"epoch": 2.74,
"grad_norm": 0.7761564254760742,
"learning_rate": 9.441541629891145e-05,
"loss": 3.4227,
"step": 51000
},
{
"epoch": 2.8,
"grad_norm": 0.7742950916290283,
"learning_rate": 9.412121212121212e-05,
"loss": 3.4126,
"step": 52000
},
{
"epoch": 2.85,
"grad_norm": 0.7954655885696411,
"learning_rate": 9.38270079435128e-05,
"loss": 3.4063,
"step": 53000
},
{
"epoch": 2.9,
"grad_norm": 0.7167351841926575,
"learning_rate": 9.353309796999118e-05,
"loss": 3.3961,
"step": 54000
},
{
"epoch": 2.96,
"grad_norm": 0.7676237225532532,
"learning_rate": 9.323889379229186e-05,
"loss": 3.3998,
"step": 55000
},
{
"epoch": 3.0,
"eval_accuracy": 0.3792631275512326,
"eval_loss": 3.594615936279297,
"eval_runtime": 153.2776,
"eval_samples_per_second": 377.883,
"eval_steps_per_second": 5.911,
"step": 55785
},
{
"epoch": 3.01,
"grad_norm": 0.7646710276603699,
"learning_rate": 9.294468961459253e-05,
"loss": 3.3752,
"step": 56000
},
{
"epoch": 3.07,
"grad_norm": 0.7935709953308105,
"learning_rate": 9.26507796410709e-05,
"loss": 3.3398,
"step": 57000
},
{
"epoch": 3.12,
"grad_norm": 0.746233344078064,
"learning_rate": 9.235657546337158e-05,
"loss": 3.3393,
"step": 58000
},
{
"epoch": 3.17,
"grad_norm": 0.7573293447494507,
"learning_rate": 9.206266548984997e-05,
"loss": 3.3332,
"step": 59000
},
{
"epoch": 3.23,
"grad_norm": 0.7440087199211121,
"learning_rate": 9.176875551632832e-05,
"loss": 3.3342,
"step": 60000
},
{
"epoch": 3.28,
"grad_norm": 0.8584343194961548,
"learning_rate": 9.1474551338629e-05,
"loss": 3.3328,
"step": 61000
},
{
"epoch": 3.33,
"grad_norm": 0.7430059909820557,
"learning_rate": 9.118034716092969e-05,
"loss": 3.3274,
"step": 62000
},
{
"epoch": 3.39,
"grad_norm": 0.7765605449676514,
"learning_rate": 9.088614298323037e-05,
"loss": 3.3256,
"step": 63000
},
{
"epoch": 3.44,
"grad_norm": 0.7461695075035095,
"learning_rate": 9.059193880553104e-05,
"loss": 3.323,
"step": 64000
},
{
"epoch": 3.5,
"grad_norm": 0.767082691192627,
"learning_rate": 9.029802883200942e-05,
"loss": 3.318,
"step": 65000
},
{
"epoch": 3.55,
"grad_norm": 0.749569296836853,
"learning_rate": 9.00038246543101e-05,
"loss": 3.3153,
"step": 66000
},
{
"epoch": 3.6,
"grad_norm": 0.7436603307723999,
"learning_rate": 8.971020888496617e-05,
"loss": 3.3119,
"step": 67000
},
{
"epoch": 3.66,
"grad_norm": 0.7696906924247742,
"learning_rate": 8.941600470726684e-05,
"loss": 3.3112,
"step": 68000
},
{
"epoch": 3.71,
"grad_norm": 0.7421345114707947,
"learning_rate": 8.912180052956752e-05,
"loss": 3.3105,
"step": 69000
},
{
"epoch": 3.76,
"grad_norm": 0.753808319568634,
"learning_rate": 8.88275963518682e-05,
"loss": 3.2991,
"step": 70000
},
{
"epoch": 3.82,
"grad_norm": 0.7684649229049683,
"learning_rate": 8.853368637834658e-05,
"loss": 3.2998,
"step": 71000
},
{
"epoch": 3.87,
"grad_norm": 0.7607173919677734,
"learning_rate": 8.823948220064724e-05,
"loss": 3.2958,
"step": 72000
},
{
"epoch": 3.93,
"grad_norm": 0.7405970096588135,
"learning_rate": 8.794557222712563e-05,
"loss": 3.2941,
"step": 73000
},
{
"epoch": 3.98,
"grad_norm": 0.7150483131408691,
"learning_rate": 8.765166225360401e-05,
"loss": 3.2908,
"step": 74000
},
{
"epoch": 4.0,
"eval_accuracy": 0.38712933630182045,
"eval_loss": 3.5169150829315186,
"eval_runtime": 153.253,
"eval_samples_per_second": 377.944,
"eval_steps_per_second": 5.912,
"step": 74380
},
{
"epoch": 4.03,
"grad_norm": 0.7505189776420593,
"learning_rate": 8.735745807590468e-05,
"loss": 3.254,
"step": 75000
},
{
"epoch": 4.09,
"grad_norm": 0.8018031120300293,
"learning_rate": 8.706325389820536e-05,
"loss": 3.2441,
"step": 76000
},
{
"epoch": 4.14,
"grad_norm": 0.7522112727165222,
"learning_rate": 8.676934392468373e-05,
"loss": 3.2338,
"step": 77000
},
{
"epoch": 4.19,
"grad_norm": 0.7681379318237305,
"learning_rate": 8.647513974698441e-05,
"loss": 3.2398,
"step": 78000
},
{
"epoch": 4.25,
"grad_norm": 0.7102425694465637,
"learning_rate": 8.61809355692851e-05,
"loss": 3.2401,
"step": 79000
},
{
"epoch": 4.3,
"grad_norm": 0.7491008043289185,
"learning_rate": 8.588702559576346e-05,
"loss": 3.2365,
"step": 80000
},
{
"epoch": 4.36,
"grad_norm": 0.7204309701919556,
"learning_rate": 8.559282141806415e-05,
"loss": 3.2395,
"step": 81000
},
{
"epoch": 4.41,
"grad_norm": 0.7859947085380554,
"learning_rate": 8.529861724036483e-05,
"loss": 3.2378,
"step": 82000
},
{
"epoch": 4.46,
"grad_norm": 0.76870197057724,
"learning_rate": 8.50044130626655e-05,
"loss": 3.2353,
"step": 83000
},
{
"epoch": 4.52,
"grad_norm": 0.7622324228286743,
"learning_rate": 8.471050308914386e-05,
"loss": 3.2353,
"step": 84000
},
{
"epoch": 4.57,
"grad_norm": 0.7557118535041809,
"learning_rate": 8.441629891144455e-05,
"loss": 3.2375,
"step": 85000
},
{
"epoch": 4.62,
"grad_norm": 0.7363464832305908,
"learning_rate": 8.412238893792293e-05,
"loss": 3.233,
"step": 86000
},
{
"epoch": 4.68,
"grad_norm": 0.7295689582824707,
"learning_rate": 8.38281847602236e-05,
"loss": 3.2331,
"step": 87000
},
{
"epoch": 4.73,
"grad_norm": 0.7375525832176208,
"learning_rate": 8.353456899087967e-05,
"loss": 3.2309,
"step": 88000
},
{
"epoch": 4.79,
"grad_norm": 0.7149308919906616,
"learning_rate": 8.324036481318035e-05,
"loss": 3.228,
"step": 89000
},
{
"epoch": 4.84,
"grad_norm": 0.7252637147903442,
"learning_rate": 8.294616063548103e-05,
"loss": 3.2236,
"step": 90000
},
{
"epoch": 4.89,
"grad_norm": 0.7209696173667908,
"learning_rate": 8.26522506619594e-05,
"loss": 3.23,
"step": 91000
},
{
"epoch": 4.95,
"grad_norm": 0.7185742855072021,
"learning_rate": 8.235804648426007e-05,
"loss": 3.2244,
"step": 92000
},
{
"epoch": 5.0,
"eval_accuracy": 0.3919204455122256,
"eval_loss": 3.484323501586914,
"eval_runtime": 153.2201,
"eval_samples_per_second": 378.025,
"eval_steps_per_second": 5.913,
"step": 92975
},
{
"epoch": 5.0,
"grad_norm": 0.7267788648605347,
"learning_rate": 8.206384230656075e-05,
"loss": 3.2244,
"step": 93000
},
{
"epoch": 5.06,
"grad_norm": 0.7365211248397827,
"learning_rate": 8.176993233303914e-05,
"loss": 3.1676,
"step": 94000
},
{
"epoch": 5.11,
"grad_norm": 0.7716740369796753,
"learning_rate": 8.147572815533982e-05,
"loss": 3.1672,
"step": 95000
},
{
"epoch": 5.16,
"grad_norm": 0.7571492791175842,
"learning_rate": 8.118181818181818e-05,
"loss": 3.1731,
"step": 96000
},
{
"epoch": 5.22,
"grad_norm": 0.7364664673805237,
"learning_rate": 8.088761400411886e-05,
"loss": 3.1719,
"step": 97000
},
{
"epoch": 5.27,
"grad_norm": 0.7242543697357178,
"learning_rate": 8.059370403059724e-05,
"loss": 3.1739,
"step": 98000
},
{
"epoch": 5.32,
"grad_norm": 0.7749021053314209,
"learning_rate": 8.029949985289792e-05,
"loss": 3.1771,
"step": 99000
},
{
"epoch": 5.38,
"grad_norm": 0.740552544593811,
"learning_rate": 8.000529567519859e-05,
"loss": 3.1771,
"step": 100000
},
{
"epoch": 5.43,
"grad_norm": 0.7202236652374268,
"learning_rate": 7.971109149749927e-05,
"loss": 3.1755,
"step": 101000
},
{
"epoch": 5.49,
"grad_norm": 0.7516536712646484,
"learning_rate": 7.941688731979995e-05,
"loss": 3.1761,
"step": 102000
},
{
"epoch": 5.54,
"grad_norm": 0.7896652221679688,
"learning_rate": 7.912268314210062e-05,
"loss": 3.1821,
"step": 103000
},
{
"epoch": 5.59,
"grad_norm": 0.7660993337631226,
"learning_rate": 7.882906737275669e-05,
"loss": 3.1744,
"step": 104000
},
{
"epoch": 5.65,
"grad_norm": 0.7442207336425781,
"learning_rate": 7.853486319505737e-05,
"loss": 3.17,
"step": 105000
},
{
"epoch": 5.7,
"grad_norm": 0.7453558444976807,
"learning_rate": 7.824065901735806e-05,
"loss": 3.1778,
"step": 106000
},
{
"epoch": 5.75,
"grad_norm": 0.7240561842918396,
"learning_rate": 7.794674904383643e-05,
"loss": 3.1805,
"step": 107000
},
{
"epoch": 5.81,
"grad_norm": 0.7229368090629578,
"learning_rate": 7.76525448661371e-05,
"loss": 3.1764,
"step": 108000
},
{
"epoch": 5.86,
"grad_norm": 0.7163822054862976,
"learning_rate": 7.735834068843777e-05,
"loss": 3.1734,
"step": 109000
},
{
"epoch": 5.92,
"grad_norm": 0.7416737675666809,
"learning_rate": 7.706413651073846e-05,
"loss": 3.174,
"step": 110000
},
{
"epoch": 5.97,
"grad_norm": 0.7351878881454468,
"learning_rate": 7.677022653721683e-05,
"loss": 3.1723,
"step": 111000
},
{
"epoch": 6.0,
"eval_accuracy": 0.3955316301279006,
"eval_loss": 3.442295551300049,
"eval_runtime": 153.2217,
"eval_samples_per_second": 378.021,
"eval_steps_per_second": 5.913,
"step": 111570
},
{
"epoch": 6.02,
"grad_norm": 0.7372068762779236,
"learning_rate": 7.647631656369521e-05,
"loss": 3.1476,
"step": 112000
},
{
"epoch": 6.08,
"grad_norm": 0.734592854976654,
"learning_rate": 7.618211238599588e-05,
"loss": 3.1148,
"step": 113000
},
{
"epoch": 6.13,
"grad_norm": 0.731022298336029,
"learning_rate": 7.588790820829656e-05,
"loss": 3.1139,
"step": 114000
},
{
"epoch": 6.18,
"grad_norm": 0.7731876373291016,
"learning_rate": 7.559429243895265e-05,
"loss": 3.1289,
"step": 115000
},
{
"epoch": 6.24,
"grad_norm": 0.7593401074409485,
"learning_rate": 7.530008826125331e-05,
"loss": 3.1261,
"step": 116000
},
{
"epoch": 6.29,
"grad_norm": 0.7750897407531738,
"learning_rate": 7.5005884083554e-05,
"loss": 3.1222,
"step": 117000
},
{
"epoch": 6.35,
"grad_norm": 0.7594813704490662,
"learning_rate": 7.471167990585468e-05,
"loss": 3.128,
"step": 118000
},
{
"epoch": 6.4,
"grad_norm": 0.7291180491447449,
"learning_rate": 7.441747572815534e-05,
"loss": 3.1269,
"step": 119000
},
{
"epoch": 6.45,
"grad_norm": 0.7330898642539978,
"learning_rate": 7.412327155045603e-05,
"loss": 3.1344,
"step": 120000
},
{
"epoch": 6.51,
"grad_norm": 0.7279812693595886,
"learning_rate": 7.38290673727567e-05,
"loss": 3.1298,
"step": 121000
},
{
"epoch": 6.56,
"grad_norm": 0.7451930642127991,
"learning_rate": 7.353515739923508e-05,
"loss": 3.1308,
"step": 122000
},
{
"epoch": 6.61,
"grad_norm": 0.7272422909736633,
"learning_rate": 7.324124742571345e-05,
"loss": 3.1364,
"step": 123000
},
{
"epoch": 6.67,
"grad_norm": 0.7475588917732239,
"learning_rate": 7.294704324801413e-05,
"loss": 3.1333,
"step": 124000
},
{
"epoch": 6.72,
"grad_norm": 0.7194586992263794,
"learning_rate": 7.265283907031481e-05,
"loss": 3.1342,
"step": 125000
},
{
"epoch": 6.78,
"grad_norm": 0.7266201972961426,
"learning_rate": 7.235863489261548e-05,
"loss": 3.1322,
"step": 126000
},
{
"epoch": 6.83,
"grad_norm": 0.7517876029014587,
"learning_rate": 7.206472491909385e-05,
"loss": 3.1335,
"step": 127000
},
{
"epoch": 6.88,
"grad_norm": 0.7235228419303894,
"learning_rate": 7.177081494557223e-05,
"loss": 3.1315,
"step": 128000
},
{
"epoch": 6.94,
"grad_norm": 0.7359281182289124,
"learning_rate": 7.147661076787291e-05,
"loss": 3.1309,
"step": 129000
},
{
"epoch": 6.99,
"grad_norm": 0.7120690941810608,
"learning_rate": 7.11824065901736e-05,
"loss": 3.1287,
"step": 130000
},
{
"epoch": 7.0,
"eval_accuracy": 0.3986633612117726,
"eval_loss": 3.4223978519439697,
"eval_runtime": 153.3442,
"eval_samples_per_second": 377.719,
"eval_steps_per_second": 5.908,
"step": 130165
},
{
"epoch": 7.04,
"grad_norm": 0.7617224454879761,
"learning_rate": 7.088820241247426e-05,
"loss": 3.0854,
"step": 131000
},
{
"epoch": 7.1,
"grad_norm": 0.7914534211158752,
"learning_rate": 7.059429243895263e-05,
"loss": 3.0764,
"step": 132000
},
{
"epoch": 7.15,
"grad_norm": 0.7849559187889099,
"learning_rate": 7.030008826125331e-05,
"loss": 3.0822,
"step": 133000
},
{
"epoch": 7.21,
"grad_norm": 0.7480010390281677,
"learning_rate": 7.000588408355398e-05,
"loss": 3.0807,
"step": 134000
},
{
"epoch": 7.26,
"grad_norm": 0.7867963910102844,
"learning_rate": 6.971167990585466e-05,
"loss": 3.0868,
"step": 135000
},
{
"epoch": 7.31,
"grad_norm": 0.7274609208106995,
"learning_rate": 6.941806413651074e-05,
"loss": 3.0906,
"step": 136000
},
{
"epoch": 7.37,
"grad_norm": 0.7924407720565796,
"learning_rate": 6.912385995881142e-05,
"loss": 3.0901,
"step": 137000
},
{
"epoch": 7.42,
"grad_norm": 0.751484215259552,
"learning_rate": 6.882965578111209e-05,
"loss": 3.0894,
"step": 138000
},
{
"epoch": 7.48,
"grad_norm": 0.761904239654541,
"learning_rate": 6.853574580759047e-05,
"loss": 3.098,
"step": 139000
},
{
"epoch": 7.53,
"grad_norm": 0.7824081778526306,
"learning_rate": 6.824154162989115e-05,
"loss": 3.0904,
"step": 140000
},
{
"epoch": 7.58,
"grad_norm": 0.7663974761962891,
"learning_rate": 6.794733745219183e-05,
"loss": 3.0915,
"step": 141000
},
{
"epoch": 7.64,
"grad_norm": 0.7567524909973145,
"learning_rate": 6.765342747867019e-05,
"loss": 3.0938,
"step": 142000
},
{
"epoch": 7.69,
"grad_norm": 0.7632171511650085,
"learning_rate": 6.735922330097087e-05,
"loss": 3.0936,
"step": 143000
},
{
"epoch": 7.74,
"grad_norm": 0.7414125204086304,
"learning_rate": 6.706501912327155e-05,
"loss": 3.0933,
"step": 144000
},
{
"epoch": 7.8,
"grad_norm": 0.7134759426116943,
"learning_rate": 6.677081494557223e-05,
"loss": 3.0941,
"step": 145000
},
{
"epoch": 7.85,
"grad_norm": 0.7270065546035767,
"learning_rate": 6.64769049720506e-05,
"loss": 3.0953,
"step": 146000
},
{
"epoch": 7.91,
"grad_norm": 0.7573560476303101,
"learning_rate": 6.618299499852897e-05,
"loss": 3.094,
"step": 147000
},
{
"epoch": 7.96,
"grad_norm": 0.7566157579421997,
"learning_rate": 6.588879082082966e-05,
"loss": 3.0995,
"step": 148000
},
{
"epoch": 8.0,
"eval_accuracy": 0.40014243202720035,
"eval_loss": 3.411925792694092,
"eval_runtime": 153.1198,
"eval_samples_per_second": 378.272,
"eval_steps_per_second": 5.917,
"step": 148760
},
{
"epoch": 8.01,
"grad_norm": 0.7916214466094971,
"learning_rate": 6.559488084730804e-05,
"loss": 3.0778,
"step": 149000
},
{
"epoch": 8.07,
"grad_norm": 0.7633106708526611,
"learning_rate": 6.530067666960871e-05,
"loss": 3.0389,
"step": 150000
},
{
"epoch": 8.12,
"grad_norm": 0.7663868069648743,
"learning_rate": 6.500647249190939e-05,
"loss": 3.0467,
"step": 151000
},
{
"epoch": 8.17,
"grad_norm": 0.772544801235199,
"learning_rate": 6.471226831421007e-05,
"loss": 3.0511,
"step": 152000
},
{
"epoch": 8.23,
"grad_norm": 0.7798665165901184,
"learning_rate": 6.441835834068844e-05,
"loss": 3.0481,
"step": 153000
},
{
"epoch": 8.28,
"grad_norm": 0.7501771450042725,
"learning_rate": 6.412415416298911e-05,
"loss": 3.0541,
"step": 154000
},
{
"epoch": 8.34,
"grad_norm": 0.7601937651634216,
"learning_rate": 6.383024418946749e-05,
"loss": 3.0572,
"step": 155000
},
{
"epoch": 8.39,
"grad_norm": 0.7670091986656189,
"learning_rate": 6.353604001176817e-05,
"loss": 3.0551,
"step": 156000
},
{
"epoch": 8.44,
"grad_norm": 0.7640057802200317,
"learning_rate": 6.324183583406885e-05,
"loss": 3.0579,
"step": 157000
},
{
"epoch": 8.5,
"grad_norm": 0.7653687596321106,
"learning_rate": 6.294763165636952e-05,
"loss": 3.0599,
"step": 158000
},
{
"epoch": 8.55,
"grad_norm": 0.7580332159996033,
"learning_rate": 6.265372168284789e-05,
"loss": 3.0579,
"step": 159000
},
{
"epoch": 8.6,
"grad_norm": 0.7833405137062073,
"learning_rate": 6.235951750514857e-05,
"loss": 3.0605,
"step": 160000
},
{
"epoch": 8.66,
"grad_norm": 0.7054916024208069,
"learning_rate": 6.206560753162696e-05,
"loss": 3.0629,
"step": 161000
},
{
"epoch": 8.71,
"grad_norm": 0.7401750087738037,
"learning_rate": 6.177169755810533e-05,
"loss": 3.0622,
"step": 162000
},
{
"epoch": 8.77,
"grad_norm": 0.7965439558029175,
"learning_rate": 6.147749338040601e-05,
"loss": 3.0641,
"step": 163000
},
{
"epoch": 8.82,
"grad_norm": 0.747765064239502,
"learning_rate": 6.118328920270668e-05,
"loss": 3.0636,
"step": 164000
},
{
"epoch": 8.87,
"grad_norm": 0.7850915193557739,
"learning_rate": 6.088908502500735e-05,
"loss": 3.0647,
"step": 165000
},
{
"epoch": 8.93,
"grad_norm": 0.762381911277771,
"learning_rate": 6.0595469255663425e-05,
"loss": 3.0674,
"step": 166000
},
{
"epoch": 8.98,
"grad_norm": 0.7457260489463806,
"learning_rate": 6.0301265077964107e-05,
"loss": 3.0666,
"step": 167000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4014027022046734,
"eval_loss": 3.4093098640441895,
"eval_runtime": 153.0419,
"eval_samples_per_second": 378.465,
"eval_steps_per_second": 5.92,
"step": 167355
},
{
"epoch": 9.03,
"grad_norm": 0.8109008073806763,
"learning_rate": 6.000706090026479e-05,
"loss": 3.0281,
"step": 168000
},
{
"epoch": 9.09,
"grad_norm": 0.7763618230819702,
"learning_rate": 5.9712856722565455e-05,
"loss": 3.0099,
"step": 169000
},
{
"epoch": 9.14,
"grad_norm": 0.7697040438652039,
"learning_rate": 5.941865254486614e-05,
"loss": 3.0167,
"step": 170000
},
{
"epoch": 9.2,
"grad_norm": 0.8015623688697815,
"learning_rate": 5.9124742571344514e-05,
"loss": 3.021,
"step": 171000
},
{
"epoch": 9.25,
"grad_norm": 0.8062806725502014,
"learning_rate": 5.883083259782289e-05,
"loss": 3.0259,
"step": 172000
},
{
"epoch": 9.3,
"grad_norm": 0.7650690078735352,
"learning_rate": 5.853662842012357e-05,
"loss": 3.0219,
"step": 173000
},
{
"epoch": 9.36,
"grad_norm": 0.7815614342689514,
"learning_rate": 5.824271844660194e-05,
"loss": 3.0225,
"step": 174000
},
{
"epoch": 9.41,
"grad_norm": 0.7516615986824036,
"learning_rate": 5.794880847308032e-05,
"loss": 3.0309,
"step": 175000
},
{
"epoch": 9.46,
"grad_norm": 0.7918343544006348,
"learning_rate": 5.7654604295381e-05,
"loss": 3.0323,
"step": 176000
},
{
"epoch": 9.52,
"grad_norm": 0.7646675109863281,
"learning_rate": 5.7360400117681676e-05,
"loss": 3.0324,
"step": 177000
},
{
"epoch": 9.57,
"grad_norm": 0.764613151550293,
"learning_rate": 5.706619593998235e-05,
"loss": 3.0294,
"step": 178000
},
{
"epoch": 9.63,
"grad_norm": 0.7855122685432434,
"learning_rate": 5.6771991762283025e-05,
"loss": 3.0325,
"step": 179000
},
{
"epoch": 9.68,
"grad_norm": 0.7663726210594177,
"learning_rate": 5.6477787584583706e-05,
"loss": 3.0352,
"step": 180000
},
{
"epoch": 9.73,
"grad_norm": 0.741322934627533,
"learning_rate": 5.618387761106208e-05,
"loss": 3.0389,
"step": 181000
},
{
"epoch": 9.79,
"grad_norm": 0.8002005219459534,
"learning_rate": 5.5889967637540454e-05,
"loss": 3.0358,
"step": 182000
},
{
"epoch": 9.84,
"grad_norm": 0.7639887928962708,
"learning_rate": 5.5595763459841135e-05,
"loss": 3.0366,
"step": 183000
},
{
"epoch": 9.9,
"grad_norm": 0.7922505736351013,
"learning_rate": 5.530155928214181e-05,
"loss": 3.0392,
"step": 184000
},
{
"epoch": 9.95,
"grad_norm": 0.7338058352470398,
"learning_rate": 5.5007355104442484e-05,
"loss": 3.0395,
"step": 185000
},
{
"epoch": 10.0,
"eval_accuracy": 0.40239089912954695,
"eval_loss": 3.3992717266082764,
"eval_runtime": 153.2452,
"eval_samples_per_second": 377.963,
"eval_steps_per_second": 5.912,
"step": 185950
},
{
"epoch": 10.0,
"grad_norm": 0.7729306817054749,
"learning_rate": 5.471344513092086e-05,
"loss": 3.0359,
"step": 186000
},
{
"epoch": 10.06,
"grad_norm": 0.8014710545539856,
"learning_rate": 5.441924095322154e-05,
"loss": 2.9824,
"step": 187000
},
{
"epoch": 10.11,
"grad_norm": 0.7774161100387573,
"learning_rate": 5.412503677552222e-05,
"loss": 2.9844,
"step": 188000
},
{
"epoch": 10.16,
"grad_norm": 0.7719259262084961,
"learning_rate": 5.383112680200059e-05,
"loss": 2.9878,
"step": 189000
},
{
"epoch": 10.22,
"grad_norm": 0.77918940782547,
"learning_rate": 5.353692262430127e-05,
"loss": 2.9942,
"step": 190000
},
{
"epoch": 10.27,
"grad_norm": 0.803722620010376,
"learning_rate": 5.3242718446601943e-05,
"loss": 2.9966,
"step": 191000
},
{
"epoch": 10.33,
"grad_norm": 0.7969011664390564,
"learning_rate": 5.294851426890262e-05,
"loss": 3.001,
"step": 192000
},
{
"epoch": 10.38,
"grad_norm": 0.7830320000648499,
"learning_rate": 5.2654604295380995e-05,
"loss": 3.0019,
"step": 193000
},
{
"epoch": 10.43,
"grad_norm": 0.788979709148407,
"learning_rate": 5.236069432185937e-05,
"loss": 3.003,
"step": 194000
},
{
"epoch": 10.49,
"grad_norm": 0.7922942042350769,
"learning_rate": 5.2066490144160054e-05,
"loss": 3.0082,
"step": 195000
},
{
"epoch": 10.54,
"grad_norm": 0.7848927974700928,
"learning_rate": 5.177258017063843e-05,
"loss": 3.0071,
"step": 196000
},
{
"epoch": 10.59,
"grad_norm": 0.7817524075508118,
"learning_rate": 5.14783759929391e-05,
"loss": 3.0102,
"step": 197000
},
{
"epoch": 10.65,
"grad_norm": 0.7726117968559265,
"learning_rate": 5.118417181523978e-05,
"loss": 3.0095,
"step": 198000
},
{
"epoch": 10.7,
"grad_norm": 0.7940185070037842,
"learning_rate": 5.089026184171816e-05,
"loss": 3.0116,
"step": 199000
},
{
"epoch": 10.76,
"grad_norm": 0.8069621324539185,
"learning_rate": 5.059605766401884e-05,
"loss": 3.0125,
"step": 200000
},
{
"epoch": 10.81,
"grad_norm": 0.7631500959396362,
"learning_rate": 5.03021476904972e-05,
"loss": 3.0131,
"step": 201000
},
{
"epoch": 10.86,
"grad_norm": 0.7978447675704956,
"learning_rate": 5.0007943512797884e-05,
"loss": 3.0115,
"step": 202000
},
{
"epoch": 10.92,
"grad_norm": 0.8167823553085327,
"learning_rate": 4.971403353927626e-05,
"loss": 3.0125,
"step": 203000
},
{
"epoch": 10.97,
"grad_norm": 0.8195194005966187,
"learning_rate": 4.9419829361576935e-05,
"loss": 3.0097,
"step": 204000
},
{
"epoch": 11.0,
"eval_accuracy": 0.40306617822464075,
"eval_loss": 3.4086954593658447,
"eval_runtime": 154.7236,
"eval_samples_per_second": 374.352,
"eval_steps_per_second": 5.856,
"step": 204545
},
{
"epoch": 11.02,
"grad_norm": 0.7763456106185913,
"learning_rate": 4.9125625183877617e-05,
"loss": 2.9904,
"step": 205000
},
{
"epoch": 11.08,
"grad_norm": 0.8207082152366638,
"learning_rate": 4.883142100617829e-05,
"loss": 2.9575,
"step": 206000
},
{
"epoch": 11.13,
"grad_norm": 0.8150632977485657,
"learning_rate": 4.853751103265667e-05,
"loss": 2.9651,
"step": 207000
},
{
"epoch": 11.19,
"grad_norm": 0.8188782334327698,
"learning_rate": 4.824360105913504e-05,
"loss": 2.9688,
"step": 208000
},
{
"epoch": 11.24,
"grad_norm": 0.898553192615509,
"learning_rate": 4.794939688143572e-05,
"loss": 2.9716,
"step": 209000
},
{
"epoch": 11.29,
"grad_norm": 0.8035799860954285,
"learning_rate": 4.7655192703736395e-05,
"loss": 2.977,
"step": 210000
},
{
"epoch": 11.35,
"grad_norm": 0.8373109102249146,
"learning_rate": 4.736098852603707e-05,
"loss": 2.9771,
"step": 211000
},
{
"epoch": 11.4,
"grad_norm": 0.8262482285499573,
"learning_rate": 4.7067078552515446e-05,
"loss": 2.9828,
"step": 212000
},
{
"epoch": 11.45,
"grad_norm": 0.8329269289970398,
"learning_rate": 4.677287437481612e-05,
"loss": 2.9779,
"step": 213000
},
{
"epoch": 11.51,
"grad_norm": 0.8244719505310059,
"learning_rate": 4.64789644012945e-05,
"loss": 2.9854,
"step": 214000
},
{
"epoch": 11.56,
"grad_norm": 0.800463855266571,
"learning_rate": 4.618476022359517e-05,
"loss": 2.9844,
"step": 215000
},
{
"epoch": 11.62,
"grad_norm": 0.8081605434417725,
"learning_rate": 4.589085025007355e-05,
"loss": 2.9834,
"step": 216000
},
{
"epoch": 11.67,
"grad_norm": 0.8294093608856201,
"learning_rate": 4.5596646072374224e-05,
"loss": 2.9862,
"step": 217000
},
{
"epoch": 11.72,
"grad_norm": 0.8135597109794617,
"learning_rate": 4.530273609885261e-05,
"loss": 2.9889,
"step": 218000
},
{
"epoch": 11.78,
"grad_norm": 0.8094319105148315,
"learning_rate": 4.500853192115328e-05,
"loss": 2.9857,
"step": 219000
},
{
"epoch": 11.83,
"grad_norm": 0.8075074553489685,
"learning_rate": 4.471462194763166e-05,
"loss": 2.9917,
"step": 220000
},
{
"epoch": 11.88,
"grad_norm": 0.8017106652259827,
"learning_rate": 4.4420417769932335e-05,
"loss": 2.988,
"step": 221000
},
{
"epoch": 11.94,
"grad_norm": 0.8178462982177734,
"learning_rate": 4.412621359223301e-05,
"loss": 2.9942,
"step": 222000
},
{
"epoch": 11.99,
"grad_norm": 0.8240616917610168,
"learning_rate": 4.383230361871139e-05,
"loss": 2.9923,
"step": 223000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4041937035262776,
"eval_loss": 3.402968406677246,
"eval_runtime": 154.7261,
"eval_samples_per_second": 374.345,
"eval_steps_per_second": 5.856,
"step": 223140
},
{
"epoch": 12.05,
"grad_norm": 0.816884458065033,
"learning_rate": 4.353809944101207e-05,
"loss": 2.9496,
"step": 224000
},
{
"epoch": 12.1,
"grad_norm": 0.8183420896530151,
"learning_rate": 4.3244189467490445e-05,
"loss": 2.9446,
"step": 225000
},
{
"epoch": 12.15,
"grad_norm": 0.8411849737167358,
"learning_rate": 4.2950279493968815e-05,
"loss": 2.9469,
"step": 226000
},
{
"epoch": 12.21,
"grad_norm": 0.836554765701294,
"learning_rate": 4.26560753162695e-05,
"loss": 2.948,
"step": 227000
},
{
"epoch": 12.26,
"grad_norm": 0.8519123196601868,
"learning_rate": 4.236187113857017e-05,
"loss": 2.952,
"step": 228000
},
{
"epoch": 12.32,
"grad_norm": 0.8291841745376587,
"learning_rate": 4.2067666960870846e-05,
"loss": 2.9543,
"step": 229000
},
{
"epoch": 12.37,
"grad_norm": 0.8258066177368164,
"learning_rate": 4.177375698734922e-05,
"loss": 2.959,
"step": 230000
},
{
"epoch": 12.42,
"grad_norm": 0.8320387005805969,
"learning_rate": 4.14798470138276e-05,
"loss": 2.9631,
"step": 231000
},
{
"epoch": 12.48,
"grad_norm": 0.8495768308639526,
"learning_rate": 4.1185642836128275e-05,
"loss": 2.9607,
"step": 232000
},
{
"epoch": 12.53,
"grad_norm": 0.7984282970428467,
"learning_rate": 4.0891438658428956e-05,
"loss": 2.9598,
"step": 233000
},
{
"epoch": 12.58,
"grad_norm": 0.8098889589309692,
"learning_rate": 4.059723448072963e-05,
"loss": 2.9605,
"step": 234000
},
{
"epoch": 12.64,
"grad_norm": 0.8174999356269836,
"learning_rate": 4.030332450720801e-05,
"loss": 2.9623,
"step": 235000
},
{
"epoch": 12.69,
"grad_norm": 0.8120893239974976,
"learning_rate": 4.000912032950868e-05,
"loss": 2.9613,
"step": 236000
},
{
"epoch": 12.75,
"grad_norm": 0.8061412572860718,
"learning_rate": 3.9714916151809357e-05,
"loss": 2.9632,
"step": 237000
},
{
"epoch": 12.8,
"grad_norm": 0.8751248121261597,
"learning_rate": 3.942071197411004e-05,
"loss": 2.9681,
"step": 238000
},
{
"epoch": 12.85,
"grad_norm": 0.8276800513267517,
"learning_rate": 3.912680200058841e-05,
"loss": 2.9684,
"step": 239000
},
{
"epoch": 12.91,
"grad_norm": 0.861995279788971,
"learning_rate": 3.883259782288909e-05,
"loss": 2.967,
"step": 240000
},
{
"epoch": 12.96,
"grad_norm": 0.8120051026344299,
"learning_rate": 3.853898205354516e-05,
"loss": 2.9703,
"step": 241000
},
{
"epoch": 13.0,
"eval_accuracy": 0.404724884565365,
"eval_loss": 3.393812894821167,
"eval_runtime": 154.9346,
"eval_samples_per_second": 373.842,
"eval_steps_per_second": 5.848,
"step": 241735
},
{
"epoch": 13.01,
"grad_norm": 0.8200232982635498,
"learning_rate": 3.824477787584584e-05,
"loss": 2.9557,
"step": 242000
},
{
"epoch": 13.07,
"grad_norm": 0.857276201248169,
"learning_rate": 3.795057369814651e-05,
"loss": 2.9224,
"step": 243000
},
{
"epoch": 13.12,
"grad_norm": 0.8572235107421875,
"learning_rate": 3.765666372462489e-05,
"loss": 2.9268,
"step": 244000
},
{
"epoch": 13.18,
"grad_norm": 0.8236098885536194,
"learning_rate": 3.7362459546925564e-05,
"loss": 2.9321,
"step": 245000
},
{
"epoch": 13.23,
"grad_norm": 0.8436420559883118,
"learning_rate": 3.7068255369226245e-05,
"loss": 2.9301,
"step": 246000
},
{
"epoch": 13.28,
"grad_norm": 0.850407063961029,
"learning_rate": 3.6774345395704615e-05,
"loss": 2.9385,
"step": 247000
},
{
"epoch": 13.34,
"grad_norm": 0.8579990863800049,
"learning_rate": 3.64801412180053e-05,
"loss": 2.934,
"step": 248000
},
{
"epoch": 13.39,
"grad_norm": 0.8318254351615906,
"learning_rate": 3.618593704030597e-05,
"loss": 2.9366,
"step": 249000
},
{
"epoch": 13.44,
"grad_norm": 0.8493559956550598,
"learning_rate": 3.589202706678435e-05,
"loss": 2.9407,
"step": 250000
},
{
"epoch": 13.5,
"grad_norm": 0.8592551350593567,
"learning_rate": 3.559782288908502e-05,
"loss": 2.942,
"step": 251000
},
{
"epoch": 13.55,
"grad_norm": 0.8678461909294128,
"learning_rate": 3.5303618711385704e-05,
"loss": 2.9419,
"step": 252000
},
{
"epoch": 13.61,
"grad_norm": 0.8739249110221863,
"learning_rate": 3.500941453368638e-05,
"loss": 2.9452,
"step": 253000
},
{
"epoch": 13.66,
"grad_norm": 0.8372821807861328,
"learning_rate": 3.4715504560164756e-05,
"loss": 2.944,
"step": 254000
},
{
"epoch": 13.71,
"grad_norm": 0.8296213746070862,
"learning_rate": 3.442159458664313e-05,
"loss": 2.9454,
"step": 255000
},
{
"epoch": 13.77,
"grad_norm": 0.7957202792167664,
"learning_rate": 3.412739040894381e-05,
"loss": 2.9452,
"step": 256000
},
{
"epoch": 13.82,
"grad_norm": 0.8241844773292542,
"learning_rate": 3.3833480435422185e-05,
"loss": 2.9427,
"step": 257000
},
{
"epoch": 13.87,
"grad_norm": 0.8526179194450378,
"learning_rate": 3.353927625772286e-05,
"loss": 2.9461,
"step": 258000
},
{
"epoch": 13.93,
"grad_norm": 0.8483251333236694,
"learning_rate": 3.3245072080023534e-05,
"loss": 2.95,
"step": 259000
},
{
"epoch": 13.98,
"grad_norm": 0.8320598602294922,
"learning_rate": 3.2950867902324215e-05,
"loss": 2.9483,
"step": 260000
},
{
"epoch": 14.0,
"eval_accuracy": 0.4050780423650972,
"eval_loss": 3.400024175643921,
"eval_runtime": 154.9428,
"eval_samples_per_second": 373.822,
"eval_steps_per_second": 5.847,
"step": 260330
},
{
"epoch": 14.04,
"grad_norm": 0.8741620182991028,
"learning_rate": 3.265725213298029e-05,
"loss": 2.9159,
"step": 261000
},
{
"epoch": 14.09,
"grad_norm": 0.8629675507545471,
"learning_rate": 3.236304795528097e-05,
"loss": 2.9064,
"step": 262000
},
{
"epoch": 14.14,
"grad_norm": 0.8614948987960815,
"learning_rate": 3.206913798175935e-05,
"loss": 2.9098,
"step": 263000
},
{
"epoch": 14.2,
"grad_norm": 0.8749313354492188,
"learning_rate": 3.177493380406002e-05,
"loss": 2.9085,
"step": 264000
},
{
"epoch": 14.25,
"grad_norm": 0.8873867392539978,
"learning_rate": 3.1480729626360696e-05,
"loss": 2.9161,
"step": 265000
},
{
"epoch": 14.3,
"grad_norm": 0.8229598999023438,
"learning_rate": 3.118681965283907e-05,
"loss": 2.9146,
"step": 266000
},
{
"epoch": 14.36,
"grad_norm": 0.8612464070320129,
"learning_rate": 3.089261547513975e-05,
"loss": 2.9189,
"step": 267000
},
{
"epoch": 14.41,
"grad_norm": 0.8653061985969543,
"learning_rate": 3.059841129744043e-05,
"loss": 2.9204,
"step": 268000
},
{
"epoch": 14.47,
"grad_norm": 0.8890775442123413,
"learning_rate": 3.03045013239188e-05,
"loss": 2.9217,
"step": 269000
},
{
"epoch": 14.52,
"grad_norm": 0.8957571983337402,
"learning_rate": 3.001029714621948e-05,
"loss": 2.9262,
"step": 270000
},
{
"epoch": 14.57,
"grad_norm": 0.8220794796943665,
"learning_rate": 2.9716092968520155e-05,
"loss": 2.9236,
"step": 271000
},
{
"epoch": 14.63,
"grad_norm": 0.841556191444397,
"learning_rate": 2.942188879082083e-05,
"loss": 2.9248,
"step": 272000
},
{
"epoch": 14.68,
"grad_norm": 0.8488039374351501,
"learning_rate": 2.9127684613121508e-05,
"loss": 2.9272,
"step": 273000
},
{
"epoch": 14.74,
"grad_norm": 0.8521600365638733,
"learning_rate": 2.8833774639599885e-05,
"loss": 2.9286,
"step": 274000
},
{
"epoch": 14.79,
"grad_norm": 0.878585696220398,
"learning_rate": 2.8539570461900563e-05,
"loss": 2.9254,
"step": 275000
},
{
"epoch": 14.84,
"grad_norm": 0.8403483033180237,
"learning_rate": 2.8245954692556636e-05,
"loss": 2.9266,
"step": 276000
},
{
"epoch": 14.9,
"grad_norm": 0.8805426955223083,
"learning_rate": 2.795175051485731e-05,
"loss": 2.932,
"step": 277000
},
{
"epoch": 14.95,
"grad_norm": 0.8541871309280396,
"learning_rate": 2.7657546337157992e-05,
"loss": 2.9286,
"step": 278000
},
{
"epoch": 15.0,
"eval_accuracy": 0.40484641061819276,
"eval_loss": 3.40685772895813,
"eval_runtime": 155.3799,
"eval_samples_per_second": 372.77,
"eval_steps_per_second": 5.831,
"step": 278925
},
{
"epoch": 15.0,
"grad_norm": 0.851596474647522,
"learning_rate": 2.7363636363636362e-05,
"loss": 2.9252,
"step": 279000
},
{
"epoch": 15.06,
"grad_norm": 0.8839541077613831,
"learning_rate": 2.7069726390114743e-05,
"loss": 2.8894,
"step": 280000
},
{
"epoch": 15.11,
"grad_norm": 0.9067598581314087,
"learning_rate": 2.6775522212415417e-05,
"loss": 2.8899,
"step": 281000
},
{
"epoch": 15.17,
"grad_norm": 0.8887162208557129,
"learning_rate": 2.6481318034716095e-05,
"loss": 2.8969,
"step": 282000
},
{
"epoch": 15.22,
"grad_norm": 0.9024575352668762,
"learning_rate": 2.618740806119447e-05,
"loss": 2.8941,
"step": 283000
},
{
"epoch": 15.27,
"grad_norm": 0.9097668528556824,
"learning_rate": 2.5893203883495147e-05,
"loss": 2.899,
"step": 284000
},
{
"epoch": 15.33,
"grad_norm": 0.8730289936065674,
"learning_rate": 2.559899970579582e-05,
"loss": 2.8947,
"step": 285000
},
{
"epoch": 15.38,
"grad_norm": 0.8862543702125549,
"learning_rate": 2.53047955280965e-05,
"loss": 2.9005,
"step": 286000
},
{
"epoch": 15.43,
"grad_norm": 0.8661286234855652,
"learning_rate": 2.5010885554574877e-05,
"loss": 2.903,
"step": 287000
},
{
"epoch": 15.49,
"grad_norm": 0.8767553567886353,
"learning_rate": 2.4716975581053254e-05,
"loss": 2.9094,
"step": 288000
},
{
"epoch": 15.54,
"grad_norm": 0.8723335266113281,
"learning_rate": 2.442277140335393e-05,
"loss": 2.9033,
"step": 289000
},
{
"epoch": 15.6,
"grad_norm": 0.8956754207611084,
"learning_rate": 2.4128567225654606e-05,
"loss": 2.9059,
"step": 290000
},
{
"epoch": 15.65,
"grad_norm": 0.8903990387916565,
"learning_rate": 2.3834363047955284e-05,
"loss": 2.9088,
"step": 291000
},
{
"epoch": 15.7,
"grad_norm": 0.8938170671463013,
"learning_rate": 2.354015887025596e-05,
"loss": 2.9057,
"step": 292000
},
{
"epoch": 15.76,
"grad_norm": 0.9013971090316772,
"learning_rate": 2.3245954692556633e-05,
"loss": 2.9101,
"step": 293000
},
{
"epoch": 15.81,
"grad_norm": 0.8826420307159424,
"learning_rate": 2.2952044719035014e-05,
"loss": 2.9129,
"step": 294000
},
{
"epoch": 15.86,
"grad_norm": 0.8986074328422546,
"learning_rate": 2.2658134745513388e-05,
"loss": 2.9102,
"step": 295000
},
{
"epoch": 15.92,
"grad_norm": 0.8564696311950684,
"learning_rate": 2.2363930567814065e-05,
"loss": 2.9093,
"step": 296000
},
{
"epoch": 15.97,
"grad_norm": 0.8886106610298157,
"learning_rate": 2.206972639011474e-05,
"loss": 2.9143,
"step": 297000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4056046552606841,
"eval_loss": 3.4019551277160645,
"eval_runtime": 154.6299,
"eval_samples_per_second": 374.578,
"eval_steps_per_second": 5.859,
"step": 297520
},
{
"epoch": 16.03,
"grad_norm": 0.8847858905792236,
"learning_rate": 2.1775522212415418e-05,
"loss": 2.8953,
"step": 298000
},
{
"epoch": 16.08,
"grad_norm": 0.8800062537193298,
"learning_rate": 2.1481318034716096e-05,
"loss": 2.8747,
"step": 299000
},
{
"epoch": 16.13,
"grad_norm": 0.9220243692398071,
"learning_rate": 2.118711385701677e-05,
"loss": 2.8779,
"step": 300000
},
{
"epoch": 16.19,
"grad_norm": 0.9044596552848816,
"learning_rate": 2.0893203883495147e-05,
"loss": 2.8786,
"step": 301000
},
{
"epoch": 16.24,
"grad_norm": 0.9245201945304871,
"learning_rate": 2.0598999705795825e-05,
"loss": 2.883,
"step": 302000
},
{
"epoch": 16.29,
"grad_norm": 0.9172502160072327,
"learning_rate": 2.03050897322742e-05,
"loss": 2.8844,
"step": 303000
},
{
"epoch": 16.35,
"grad_norm": 0.8956001996994019,
"learning_rate": 2.0010885554574877e-05,
"loss": 2.8817,
"step": 304000
},
{
"epoch": 16.4,
"grad_norm": 0.9131536483764648,
"learning_rate": 1.971697558105325e-05,
"loss": 2.8848,
"step": 305000
},
{
"epoch": 16.46,
"grad_norm": 0.8909400105476379,
"learning_rate": 1.942277140335393e-05,
"loss": 2.8869,
"step": 306000
},
{
"epoch": 16.51,
"grad_norm": 0.9004851579666138,
"learning_rate": 1.9128567225654607e-05,
"loss": 2.8903,
"step": 307000
},
{
"epoch": 16.56,
"grad_norm": 0.8977559804916382,
"learning_rate": 1.883436304795528e-05,
"loss": 2.8893,
"step": 308000
},
{
"epoch": 16.62,
"grad_norm": 0.9052317142486572,
"learning_rate": 1.854045307443366e-05,
"loss": 2.8895,
"step": 309000
},
{
"epoch": 16.67,
"grad_norm": 0.9036689400672913,
"learning_rate": 1.8246248896734333e-05,
"loss": 2.8875,
"step": 310000
},
{
"epoch": 16.72,
"grad_norm": 0.9088098406791687,
"learning_rate": 1.795204471903501e-05,
"loss": 2.8949,
"step": 311000
},
{
"epoch": 16.78,
"grad_norm": 0.8923940658569336,
"learning_rate": 1.7658134745513388e-05,
"loss": 2.8941,
"step": 312000
},
{
"epoch": 16.83,
"grad_norm": 0.8931061625480652,
"learning_rate": 1.7363930567814063e-05,
"loss": 2.8923,
"step": 313000
},
{
"epoch": 16.89,
"grad_norm": 0.8664661049842834,
"learning_rate": 1.707002059429244e-05,
"loss": 2.896,
"step": 314000
},
{
"epoch": 16.94,
"grad_norm": 0.880660355091095,
"learning_rate": 1.6775816416593114e-05,
"loss": 2.8917,
"step": 315000
},
{
"epoch": 16.99,
"grad_norm": 0.8778538703918457,
"learning_rate": 1.6481612238893792e-05,
"loss": 2.8935,
"step": 316000
},
{
"epoch": 17.0,
"eval_accuracy": 0.4054564593112435,
"eval_loss": 3.409980058670044,
"eval_runtime": 155.1392,
"eval_samples_per_second": 373.349,
"eval_steps_per_second": 5.84,
"step": 316115
},
{
"epoch": 17.05,
"grad_norm": 0.8941461443901062,
"learning_rate": 1.618740806119447e-05,
"loss": 2.8677,
"step": 317000
},
{
"epoch": 17.1,
"grad_norm": 0.8646876215934753,
"learning_rate": 1.5893498087672844e-05,
"loss": 2.8632,
"step": 318000
},
{
"epoch": 17.16,
"grad_norm": 0.9284677505493164,
"learning_rate": 1.5599293909973522e-05,
"loss": 2.8627,
"step": 319000
},
{
"epoch": 17.21,
"grad_norm": 0.9292691349983215,
"learning_rate": 1.53056781406296e-05,
"loss": 2.8676,
"step": 320000
},
{
"epoch": 17.26,
"grad_norm": 0.8917170166969299,
"learning_rate": 1.5011473962930275e-05,
"loss": 2.8713,
"step": 321000
},
{
"epoch": 17.32,
"grad_norm": 0.964272141456604,
"learning_rate": 1.4717269785230949e-05,
"loss": 2.8679,
"step": 322000
},
{
"epoch": 17.37,
"grad_norm": 1.0006158351898193,
"learning_rate": 1.4423359811709328e-05,
"loss": 2.8746,
"step": 323000
},
{
"epoch": 17.42,
"grad_norm": 0.9144309163093567,
"learning_rate": 1.4129155634010003e-05,
"loss": 2.8703,
"step": 324000
},
{
"epoch": 17.48,
"grad_norm": 0.8891538381576538,
"learning_rate": 1.383524566048838e-05,
"loss": 2.8737,
"step": 325000
},
{
"epoch": 17.53,
"grad_norm": 0.9083254337310791,
"learning_rate": 1.3541041482789058e-05,
"loss": 2.8691,
"step": 326000
},
{
"epoch": 17.59,
"grad_norm": 0.8879551291465759,
"learning_rate": 1.3247131509267433e-05,
"loss": 2.875,
"step": 327000
},
{
"epoch": 17.64,
"grad_norm": 0.8899810314178467,
"learning_rate": 1.295292733156811e-05,
"loss": 2.8744,
"step": 328000
},
{
"epoch": 17.69,
"grad_norm": 0.9027150273323059,
"learning_rate": 1.2658723153868784e-05,
"loss": 2.8767,
"step": 329000
},
{
"epoch": 17.75,
"grad_norm": 0.8921009302139282,
"learning_rate": 1.2364518976169462e-05,
"loss": 2.8727,
"step": 330000
},
{
"epoch": 17.8,
"grad_norm": 0.9295068979263306,
"learning_rate": 1.207031479847014e-05,
"loss": 2.8767,
"step": 331000
},
{
"epoch": 17.85,
"grad_norm": 0.896858274936676,
"learning_rate": 1.1776404824948514e-05,
"loss": 2.8779,
"step": 332000
},
{
"epoch": 17.91,
"grad_norm": 0.9167591333389282,
"learning_rate": 1.1482200647249191e-05,
"loss": 2.8762,
"step": 333000
},
{
"epoch": 17.96,
"grad_norm": 0.9663364887237549,
"learning_rate": 1.1188290673727569e-05,
"loss": 2.8782,
"step": 334000
},
{
"epoch": 18.0,
"eval_accuracy": 0.40578731382212063,
"eval_loss": 3.407121181488037,
"eval_runtime": 155.0726,
"eval_samples_per_second": 373.509,
"eval_steps_per_second": 5.842,
"step": 334710
},
{
"epoch": 18.02,
"grad_norm": 0.9326697587966919,
"learning_rate": 1.0894086496028245e-05,
"loss": 2.8691,
"step": 335000
},
{
"epoch": 18.07,
"grad_norm": 0.922129213809967,
"learning_rate": 1.060017652250662e-05,
"loss": 2.8486,
"step": 336000
},
{
"epoch": 18.12,
"grad_norm": 0.9094411730766296,
"learning_rate": 1.0306266548984996e-05,
"loss": 2.8516,
"step": 337000
},
{
"epoch": 18.18,
"grad_norm": 0.9079362154006958,
"learning_rate": 1.0012062371285672e-05,
"loss": 2.854,
"step": 338000
},
{
"epoch": 18.23,
"grad_norm": 0.9157450199127197,
"learning_rate": 9.717858193586348e-06,
"loss": 2.8547,
"step": 339000
},
{
"epoch": 18.28,
"grad_norm": 0.9174096584320068,
"learning_rate": 9.423654015887026e-06,
"loss": 2.8564,
"step": 340000
},
{
"epoch": 18.34,
"grad_norm": 0.9197123050689697,
"learning_rate": 9.130038246543101e-06,
"loss": 2.8525,
"step": 341000
},
{
"epoch": 18.39,
"grad_norm": 0.9466710090637207,
"learning_rate": 8.835834068843777e-06,
"loss": 2.8571,
"step": 342000
},
{
"epoch": 18.45,
"grad_norm": 0.9464150667190552,
"learning_rate": 8.541629891144455e-06,
"loss": 2.8611,
"step": 343000
},
{
"epoch": 18.5,
"grad_norm": 0.9168482422828674,
"learning_rate": 8.247425713445131e-06,
"loss": 2.8564,
"step": 344000
},
{
"epoch": 18.55,
"grad_norm": 0.9207190871238708,
"learning_rate": 7.953221535745808e-06,
"loss": 2.8604,
"step": 345000
},
{
"epoch": 18.61,
"grad_norm": 0.9423150420188904,
"learning_rate": 7.659605766401884e-06,
"loss": 2.8611,
"step": 346000
},
{
"epoch": 18.66,
"grad_norm": 0.9121553301811218,
"learning_rate": 7.36540158870256e-06,
"loss": 2.8624,
"step": 347000
},
{
"epoch": 18.71,
"grad_norm": 0.9213626980781555,
"learning_rate": 7.0711974110032376e-06,
"loss": 2.8603,
"step": 348000
},
{
"epoch": 18.77,
"grad_norm": 0.9198666214942932,
"learning_rate": 6.776993233303913e-06,
"loss": 2.8627,
"step": 349000
},
{
"epoch": 18.82,
"grad_norm": 0.9301998615264893,
"learning_rate": 6.483083259782289e-06,
"loss": 2.8613,
"step": 350000
},
{
"epoch": 18.88,
"grad_norm": 0.899235725402832,
"learning_rate": 6.1888790820829655e-06,
"loss": 2.8622,
"step": 351000
},
{
"epoch": 18.93,
"grad_norm": 0.9574356079101562,
"learning_rate": 5.894969108561342e-06,
"loss": 2.8614,
"step": 352000
},
{
"epoch": 18.98,
"grad_norm": 0.9441936016082764,
"learning_rate": 5.6010591350397175e-06,
"loss": 2.8613,
"step": 353000
},
{
"epoch": 19.0,
"eval_accuracy": 0.4061827941278128,
"eval_loss": 3.4122915267944336,
"eval_runtime": 154.3443,
"eval_samples_per_second": 375.271,
"eval_steps_per_second": 5.87,
"step": 353305
},
{
"epoch": 19.04,
"grad_norm": 0.9131889939308167,
"learning_rate": 5.3068549573403945e-06,
"loss": 2.8451,
"step": 354000
},
{
"epoch": 19.09,
"grad_norm": 0.9485396146774292,
"learning_rate": 5.0126507796410715e-06,
"loss": 2.8459,
"step": 355000
},
{
"epoch": 19.14,
"grad_norm": 0.9703813791275024,
"learning_rate": 4.718446601941748e-06,
"loss": 2.8399,
"step": 356000
},
{
"epoch": 19.2,
"grad_norm": 0.9274348616600037,
"learning_rate": 4.424536628420123e-06,
"loss": 2.8446,
"step": 357000
},
{
"epoch": 19.25,
"grad_norm": 0.9653975367546082,
"learning_rate": 4.1303324507208e-06,
"loss": 2.8447,
"step": 358000
},
{
"epoch": 19.31,
"grad_norm": 0.9628462195396423,
"learning_rate": 3.8361282730214765e-06,
"loss": 2.8465,
"step": 359000
},
{
"epoch": 19.36,
"grad_norm": 0.9582130312919617,
"learning_rate": 3.5422182994998533e-06,
"loss": 2.8486,
"step": 360000
},
{
"epoch": 19.41,
"grad_norm": 0.9385794401168823,
"learning_rate": 3.24801412180053e-06,
"loss": 2.8457,
"step": 361000
},
{
"epoch": 19.47,
"grad_norm": 0.973229706287384,
"learning_rate": 2.954398352456605e-06,
"loss": 2.8481,
"step": 362000
},
{
"epoch": 19.52,
"grad_norm": 0.9264838099479675,
"learning_rate": 2.660194174757282e-06,
"loss": 2.8433,
"step": 363000
},
{
"epoch": 19.58,
"grad_norm": 0.9257712960243225,
"learning_rate": 2.3659899970579586e-06,
"loss": 2.8487,
"step": 364000
},
{
"epoch": 19.63,
"grad_norm": 0.898076057434082,
"learning_rate": 2.071785819358635e-06,
"loss": 2.8459,
"step": 365000
},
{
"epoch": 19.68,
"grad_norm": 0.9263227581977844,
"learning_rate": 1.777875845837011e-06,
"loss": 2.8443,
"step": 366000
},
{
"epoch": 19.74,
"grad_norm": 0.931954026222229,
"learning_rate": 1.4836716681376875e-06,
"loss": 2.8468,
"step": 367000
},
{
"epoch": 19.79,
"grad_norm": 0.9403001666069031,
"learning_rate": 1.1897616946160636e-06,
"loss": 2.8486,
"step": 368000
},
{
"epoch": 19.84,
"grad_norm": 0.9270106554031372,
"learning_rate": 8.955575169167403e-07,
"loss": 2.8456,
"step": 369000
},
{
"epoch": 19.9,
"grad_norm": 0.9441812634468079,
"learning_rate": 6.016475433951162e-07,
"loss": 2.8481,
"step": 370000
},
{
"epoch": 19.95,
"grad_norm": 0.9611223340034485,
"learning_rate": 3.0744336569579287e-07,
"loss": 2.8439,
"step": 371000
},
{
"epoch": 20.0,
"eval_accuracy": 0.40601854249753977,
"eval_loss": 3.4158718585968018,
"eval_runtime": 154.2832,
"eval_samples_per_second": 375.42,
"eval_steps_per_second": 5.872,
"step": 371900
},
{
"epoch": 20.0,
"step": 371900,
"total_flos": 1.5669257538816e+18,
"train_loss": 3.157264780781544,
"train_runtime": 81435.8042,
"train_samples_per_second": 146.136,
"train_steps_per_second": 4.567
}
],
"logging_steps": 1000,
"max_steps": 371900,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"total_flos": 1.5669257538816e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}