{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 371900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05377789728421619,
"grad_norm": 0.7084974050521851,
"learning_rate": 9.375e-06,
"loss": 6.8536,
"step": 1000
},
{
"epoch": 0.10755579456843238,
"grad_norm": 0.8629822731018066,
"learning_rate": 1.875e-05,
"loss": 5.3656,
"step": 2000
},
{
"epoch": 0.16133369185264856,
"grad_norm": 0.9347710013389587,
"learning_rate": 2.8125e-05,
"loss": 5.0405,
"step": 3000
},
{
"epoch": 0.21511158913686476,
"grad_norm": 0.9131508469581604,
"learning_rate": 3.75e-05,
"loss": 4.8207,
"step": 4000
},
{
"epoch": 0.26888948642108096,
"grad_norm": 1.0589386224746704,
"learning_rate": 4.6874999999999994e-05,
"loss": 4.6468,
"step": 5000
},
{
"epoch": 0.32266738370529713,
"grad_norm": 0.9457437992095947,
"learning_rate": 5.625e-05,
"loss": 4.4985,
"step": 6000
},
{
"epoch": 0.3764452809895133,
"grad_norm": 0.9490616917610168,
"learning_rate": 6.5625e-05,
"loss": 4.3901,
"step": 7000
},
{
"epoch": 0.4302231782737295,
"grad_norm": 0.9046244621276855,
"learning_rate": 7.5e-05,
"loss": 4.2829,
"step": 8000
},
{
"epoch": 0.4840010755579457,
"grad_norm": 0.9089182615280151,
"learning_rate": 8.437499999999999e-05,
"loss": 4.2055,
"step": 9000
},
{
"epoch": 0.5377789728421619,
"grad_norm": 0.8736773729324341,
"learning_rate": 9.374999999999999e-05,
"loss": 4.1359,
"step": 10000
},
{
"epoch": 0.591556870126378,
"grad_norm": 0.9014385342597961,
"learning_rate": 0.000103115625,
"loss": 4.07,
"step": 11000
},
{
"epoch": 0.6453347674105943,
"grad_norm": 0.8683918118476868,
"learning_rate": 0.000112490625,
"loss": 4.0086,
"step": 12000
},
{
"epoch": 0.6991126646948105,
"grad_norm": 0.818735659122467,
"learning_rate": 0.00012185624999999998,
"loss": 3.946,
"step": 13000
},
{
"epoch": 0.7528905619790266,
"grad_norm": 0.8135210275650024,
"learning_rate": 0.00013123125,
"loss": 3.8931,
"step": 14000
},
{
"epoch": 0.8066684592632428,
"grad_norm": 0.8030542731285095,
"learning_rate": 0.000140596875,
"loss": 3.8372,
"step": 15000
},
{
"epoch": 0.860446356547459,
"grad_norm": 0.7653594613075256,
"learning_rate": 0.0001499625,
"loss": 3.8021,
"step": 16000
},
{
"epoch": 0.9142242538316752,
"grad_norm": 0.749381422996521,
"learning_rate": 0.00015933749999999996,
"loss": 3.762,
"step": 17000
},
{
"epoch": 0.9680021511158914,
"grad_norm": 0.7983600497245789,
"learning_rate": 0.00016871249999999996,
"loss": 3.7356,
"step": 18000
},
{
"epoch": 1.0,
"eval_accuracy": 0.34747039389556295,
"eval_loss": 3.884793758392334,
"eval_runtime": 152.1058,
"eval_samples_per_second": 380.794,
"eval_steps_per_second": 5.956,
"step": 18595
},
{
"epoch": 1.0217800484001076,
"grad_norm": 0.7763124704360962,
"learning_rate": 0.000178078125,
"loss": 3.6984,
"step": 19000
},
{
"epoch": 1.0755579456843238,
"grad_norm": 0.7071164846420288,
"learning_rate": 0.00018745312499999998,
"loss": 3.6652,
"step": 20000
},
{
"epoch": 1.1293358429685398,
"grad_norm": 0.6947665214538574,
"learning_rate": 0.00019681874999999998,
"loss": 3.6519,
"step": 21000
},
{
"epoch": 1.183113740252756,
"grad_norm": 0.6985417008399963,
"learning_rate": 0.00020619374999999998,
"loss": 3.6337,
"step": 22000
},
{
"epoch": 1.2368916375369723,
"grad_norm": 0.6635871529579163,
"learning_rate": 0.00021556874999999998,
"loss": 3.6167,
"step": 23000
},
{
"epoch": 1.2906695348211885,
"grad_norm": 0.6359060406684875,
"learning_rate": 0.00022493437499999998,
"loss": 3.5919,
"step": 24000
},
{
"epoch": 1.3444474321054047,
"grad_norm": 0.6042994260787964,
"learning_rate": 0.00023430937499999997,
"loss": 3.5719,
"step": 25000
},
{
"epoch": 1.398225329389621,
"grad_norm": 0.6072087287902832,
"learning_rate": 0.00024368437499999997,
"loss": 3.563,
"step": 26000
},
{
"epoch": 1.452003226673837,
"grad_norm": 0.6790162920951843,
"learning_rate": 0.00025305,
"loss": 3.5498,
"step": 27000
},
{
"epoch": 1.5057811239580532,
"grad_norm": 0.5274273157119751,
"learning_rate": 0.000262425,
"loss": 3.5337,
"step": 28000
},
{
"epoch": 1.5595590212422694,
"grad_norm": 0.5580631494522095,
"learning_rate": 0.000271790625,
"loss": 3.5206,
"step": 29000
},
{
"epoch": 1.6133369185264856,
"grad_norm": 0.5512628555297852,
"learning_rate": 0.000281165625,
"loss": 3.5171,
"step": 30000
},
{
"epoch": 1.6671148158107019,
"grad_norm": 0.5888254642486572,
"learning_rate": 0.000290540625,
"loss": 3.5055,
"step": 31000
},
{
"epoch": 1.7208927130949179,
"grad_norm": 0.5367724895477295,
"learning_rate": 0.000299915625,
"loss": 3.4934,
"step": 32000
},
{
"epoch": 1.7746706103791343,
"grad_norm": 0.512359619140625,
"learning_rate": 0.00029912621359223297,
"loss": 3.4814,
"step": 33000
},
{
"epoch": 1.8284485076633503,
"grad_norm": 0.5001698732376099,
"learning_rate": 0.00029824360105913504,
"loss": 3.4694,
"step": 34000
},
{
"epoch": 1.8822264049475665,
"grad_norm": 0.48298901319503784,
"learning_rate": 0.00029736187113857014,
"loss": 3.4576,
"step": 35000
},
{
"epoch": 1.9360043022317828,
"grad_norm": 0.4670732021331787,
"learning_rate": 0.00029647925860547215,
"loss": 3.4434,
"step": 36000
},
{
"epoch": 1.9897821995159988,
"grad_norm": 0.4532361924648285,
"learning_rate": 0.0002955966460723742,
"loss": 3.4297,
"step": 37000
},
{
"epoch": 2.0,
"eval_accuracy": 0.3762663582363661,
"eval_loss": 3.621605634689331,
"eval_runtime": 154.0921,
"eval_samples_per_second": 375.886,
"eval_steps_per_second": 5.88,
"step": 37190
},
{
"epoch": 2.043560096800215,
"grad_norm": 0.45829683542251587,
"learning_rate": 0.0002947149161518093,
"loss": 3.3852,
"step": 38000
},
{
"epoch": 2.0973379940844312,
"grad_norm": 0.4475788474082947,
"learning_rate": 0.0002938323036187114,
"loss": 3.3735,
"step": 39000
},
{
"epoch": 2.1511158913686477,
"grad_norm": 0.45039886236190796,
"learning_rate": 0.00029294969108561335,
"loss": 3.3662,
"step": 40000
},
{
"epoch": 2.2048937886528637,
"grad_norm": 0.43961483240127563,
"learning_rate": 0.0002920679611650485,
"loss": 3.3651,
"step": 41000
},
{
"epoch": 2.2586716859370797,
"grad_norm": 0.4328303337097168,
"learning_rate": 0.0002911853486319506,
"loss": 3.3594,
"step": 42000
},
{
"epoch": 2.312449583221296,
"grad_norm": 0.42095446586608887,
"learning_rate": 0.00029030273609885254,
"loss": 3.3508,
"step": 43000
},
{
"epoch": 2.366227480505512,
"grad_norm": 0.41147682070732117,
"learning_rate": 0.0002894201235657546,
"loss": 3.3519,
"step": 44000
},
{
"epoch": 2.4200053777897286,
"grad_norm": 0.506058394908905,
"learning_rate": 0.00028853839364518976,
"loss": 3.3417,
"step": 45000
},
{
"epoch": 2.4737832750739446,
"grad_norm": 0.4321398138999939,
"learning_rate": 0.0002876557811120918,
"loss": 3.3402,
"step": 46000
},
{
"epoch": 2.527561172358161,
"grad_norm": 0.3769163191318512,
"learning_rate": 0.0002867731685789938,
"loss": 3.3323,
"step": 47000
},
{
"epoch": 2.581339069642377,
"grad_norm": 0.3871973156929016,
"learning_rate": 0.0002858905560458958,
"loss": 3.3219,
"step": 48000
},
{
"epoch": 2.635116966926593,
"grad_norm": 0.3995376527309418,
"learning_rate": 0.00028500970873786405,
"loss": 3.3231,
"step": 49000
},
{
"epoch": 2.6888948642108095,
"grad_norm": 0.4042655825614929,
"learning_rate": 0.0002841270962047661,
"loss": 3.3141,
"step": 50000
},
{
"epoch": 2.7426727614950255,
"grad_norm": 0.38159438967704773,
"learning_rate": 0.0002832453662842012,
"loss": 3.3062,
"step": 51000
},
{
"epoch": 2.796450658779242,
"grad_norm": 0.3883545994758606,
"learning_rate": 0.00028236275375110323,
"loss": 3.3035,
"step": 52000
},
{
"epoch": 2.850228556063458,
"grad_norm": 0.382907509803772,
"learning_rate": 0.00028148014121800525,
"loss": 3.297,
"step": 53000
},
{
"epoch": 2.904006453347674,
"grad_norm": 0.3734199106693268,
"learning_rate": 0.0002805984112974404,
"loss": 3.2939,
"step": 54000
},
{
"epoch": 2.9577843506318904,
"grad_norm": 0.3749130964279175,
"learning_rate": 0.0002797157987643424,
"loss": 3.2928,
"step": 55000
},
{
"epoch": 3.0,
"eval_accuracy": 0.3889436282358831,
"eval_loss": 3.500473737716675,
"eval_runtime": 153.9288,
"eval_samples_per_second": 376.284,
"eval_steps_per_second": 5.886,
"step": 55785
},
{
"epoch": 3.0115622479161064,
"grad_norm": 0.3949376344680786,
"learning_rate": 0.00027883318623124443,
"loss": 3.2765,
"step": 56000
},
{
"epoch": 3.065340145200323,
"grad_norm": 0.3761361241340637,
"learning_rate": 0.0002779505736981465,
"loss": 3.2289,
"step": 57000
},
{
"epoch": 3.119118042484539,
"grad_norm": 0.3742624521255493,
"learning_rate": 0.0002770688437775816,
"loss": 3.2292,
"step": 58000
},
{
"epoch": 3.172895939768755,
"grad_norm": 0.3935544490814209,
"learning_rate": 0.0002761862312444836,
"loss": 3.2269,
"step": 59000
},
{
"epoch": 3.2266738370529713,
"grad_norm": 0.3780879080295563,
"learning_rate": 0.00027530450132391877,
"loss": 3.2311,
"step": 60000
},
{
"epoch": 3.2804517343371873,
"grad_norm": 0.38651397824287415,
"learning_rate": 0.0002744218887908208,
"loss": 3.2284,
"step": 61000
},
{
"epoch": 3.3342296316214037,
"grad_norm": 0.36211293935775757,
"learning_rate": 0.000273541041482789,
"loss": 3.2297,
"step": 62000
},
{
"epoch": 3.3880075289056197,
"grad_norm": 0.35539260506629944,
"learning_rate": 0.0002726584289496911,
"loss": 3.2274,
"step": 63000
},
{
"epoch": 3.4417854261898357,
"grad_norm": 0.3886893391609192,
"learning_rate": 0.0002717758164165931,
"loss": 3.2248,
"step": 64000
},
{
"epoch": 3.495563323474052,
"grad_norm": 0.35611864924430847,
"learning_rate": 0.0002708940864960282,
"loss": 3.2225,
"step": 65000
},
{
"epoch": 3.549341220758268,
"grad_norm": 0.36730748414993286,
"learning_rate": 0.0002700114739629303,
"loss": 3.2154,
"step": 66000
},
{
"epoch": 3.6031191180424846,
"grad_norm": 0.3518161475658417,
"learning_rate": 0.0002691297440423654,
"loss": 3.2157,
"step": 67000
},
{
"epoch": 3.6568970153267006,
"grad_norm": 0.35682082176208496,
"learning_rate": 0.0002682471315092674,
"loss": 3.2173,
"step": 68000
},
{
"epoch": 3.7106749126109166,
"grad_norm": 0.35613182187080383,
"learning_rate": 0.00026736451897616946,
"loss": 3.2145,
"step": 69000
},
{
"epoch": 3.764452809895133,
"grad_norm": 0.36303049325942993,
"learning_rate": 0.0002664819064430715,
"loss": 3.217,
"step": 70000
},
{
"epoch": 3.8182307071793495,
"grad_norm": 0.3425980508327484,
"learning_rate": 0.0002656010591350397,
"loss": 3.2102,
"step": 71000
},
{
"epoch": 3.8720086044635655,
"grad_norm": 0.3505886197090149,
"learning_rate": 0.00026471844660194173,
"loss": 3.2077,
"step": 72000
},
{
"epoch": 3.9257865017477815,
"grad_norm": 0.3440117835998535,
"learning_rate": 0.00026383583406884375,
"loss": 3.2041,
"step": 73000
},
{
"epoch": 3.979564399031998,
"grad_norm": 0.353041410446167,
"learning_rate": 0.0002629532215357458,
"loss": 3.2003,
"step": 74000
},
{
"epoch": 4.0,
"eval_accuracy": 0.3951829061145071,
"eval_loss": 3.465585231781006,
"eval_runtime": 154.6505,
"eval_samples_per_second": 374.528,
"eval_steps_per_second": 5.858,
"step": 74380
},
{
"epoch": 4.033342296316214,
"grad_norm": 0.3720568120479584,
"learning_rate": 0.0002620706090026478,
"loss": 3.165,
"step": 75000
},
{
"epoch": 4.08712019360043,
"grad_norm": 0.36939120292663574,
"learning_rate": 0.00026118887908208293,
"loss": 3.1394,
"step": 76000
},
{
"epoch": 4.140898090884646,
"grad_norm": 0.37178149819374084,
"learning_rate": 0.0002603071491615181,
"loss": 3.1458,
"step": 77000
},
{
"epoch": 4.1946759881688624,
"grad_norm": 0.35939717292785645,
"learning_rate": 0.0002594245366284201,
"loss": 3.1472,
"step": 78000
},
{
"epoch": 4.2484538854530784,
"grad_norm": 0.39039725065231323,
"learning_rate": 0.0002585419240953221,
"loss": 3.1441,
"step": 79000
},
{
"epoch": 4.302231782737295,
"grad_norm": 0.35796868801116943,
"learning_rate": 0.00025765931156222413,
"loss": 3.1484,
"step": 80000
},
{
"epoch": 4.356009680021511,
"grad_norm": 0.3571794033050537,
"learning_rate": 0.0002567775816416593,
"loss": 3.1469,
"step": 81000
},
{
"epoch": 4.409787577305727,
"grad_norm": 0.34421437978744507,
"learning_rate": 0.00025589496910856136,
"loss": 3.1543,
"step": 82000
},
{
"epoch": 4.463565474589943,
"grad_norm": 0.3431869447231293,
"learning_rate": 0.0002550123565754633,
"loss": 3.151,
"step": 83000
},
{
"epoch": 4.517343371874159,
"grad_norm": 0.3607708215713501,
"learning_rate": 0.00025413062665489847,
"loss": 3.1488,
"step": 84000
},
{
"epoch": 4.571121269158376,
"grad_norm": 0.33604896068573,
"learning_rate": 0.0002532480141218005,
"loss": 3.1481,
"step": 85000
},
{
"epoch": 4.624899166442592,
"grad_norm": 0.3487938940525055,
"learning_rate": 0.00025236540158870256,
"loss": 3.1539,
"step": 86000
},
{
"epoch": 4.678677063726808,
"grad_norm": 0.33633244037628174,
"learning_rate": 0.00025148367166813766,
"loss": 3.1482,
"step": 87000
},
{
"epoch": 4.732454961011024,
"grad_norm": 0.3653378188610077,
"learning_rate": 0.0002506010591350397,
"loss": 3.1469,
"step": 88000
},
{
"epoch": 4.786232858295241,
"grad_norm": 0.34846368432044983,
"learning_rate": 0.00024971844660194174,
"loss": 3.1499,
"step": 89000
},
{
"epoch": 4.840010755579457,
"grad_norm": 0.34447020292282104,
"learning_rate": 0.00024883671668137684,
"loss": 3.1463,
"step": 90000
},
{
"epoch": 4.893788652863673,
"grad_norm": 0.34981223940849304,
"learning_rate": 0.00024795410414827886,
"loss": 3.1476,
"step": 91000
},
{
"epoch": 4.947566550147889,
"grad_norm": 0.33254608511924744,
"learning_rate": 0.00024707149161518093,
"loss": 3.1447,
"step": 92000
},
{
"epoch": 5.0,
"eval_accuracy": 0.39942771910512487,
"eval_loss": 3.406761407852173,
"eval_runtime": 154.9021,
"eval_samples_per_second": 373.92,
"eval_steps_per_second": 5.849,
"step": 92975
},
{
"epoch": 5.001344447432105,
"grad_norm": 0.35289084911346436,
"learning_rate": 0.00024618976169461603,
"loss": 3.1467,
"step": 93000
},
{
"epoch": 5.055122344716321,
"grad_norm": 0.3626822531223297,
"learning_rate": 0.00024530714916151804,
"loss": 3.0786,
"step": 94000
},
{
"epoch": 5.108900242000538,
"grad_norm": 0.3887752592563629,
"learning_rate": 0.0002444245366284201,
"loss": 3.0804,
"step": 95000
},
{
"epoch": 5.162678139284754,
"grad_norm": 0.3546692430973053,
"learning_rate": 0.00024354280670785524,
"loss": 3.0885,
"step": 96000
},
{
"epoch": 5.21645603656897,
"grad_norm": 0.3464365601539612,
"learning_rate": 0.00024266019417475728,
"loss": 3.092,
"step": 97000
},
{
"epoch": 5.270233933853186,
"grad_norm": 0.3478812873363495,
"learning_rate": 0.00024177758164165927,
"loss": 3.0923,
"step": 98000
},
{
"epoch": 5.324011831137403,
"grad_norm": 0.331265389919281,
"learning_rate": 0.00024089496910856132,
"loss": 3.0949,
"step": 99000
},
{
"epoch": 5.377789728421619,
"grad_norm": 0.343042254447937,
"learning_rate": 0.00024001412180052955,
"loss": 3.0939,
"step": 100000
},
{
"epoch": 5.431567625705835,
"grad_norm": 0.34835925698280334,
"learning_rate": 0.0002391315092674316,
"loss": 3.0965,
"step": 101000
},
{
"epoch": 5.485345522990051,
"grad_norm": 0.3566593825817108,
"learning_rate": 0.00023824889673433358,
"loss": 3.0956,
"step": 102000
},
{
"epoch": 5.539123420274267,
"grad_norm": 0.37776514887809753,
"learning_rate": 0.00023736716681376874,
"loss": 3.0957,
"step": 103000
},
{
"epoch": 5.592901317558484,
"grad_norm": 0.3815798759460449,
"learning_rate": 0.00023648455428067078,
"loss": 3.0974,
"step": 104000
},
{
"epoch": 5.6466792148427,
"grad_norm": 0.344208687543869,
"learning_rate": 0.0002356019417475728,
"loss": 3.1012,
"step": 105000
},
{
"epoch": 5.700457112126916,
"grad_norm": 0.3420184850692749,
"learning_rate": 0.00023471932921447481,
"loss": 3.1009,
"step": 106000
},
{
"epoch": 5.754235009411132,
"grad_norm": 0.3633731007575989,
"learning_rate": 0.00023383848190644305,
"loss": 3.0986,
"step": 107000
},
{
"epoch": 5.808012906695348,
"grad_norm": 0.3575678765773773,
"learning_rate": 0.0002329558693733451,
"loss": 3.1004,
"step": 108000
},
{
"epoch": 5.861790803979565,
"grad_norm": 0.35180947184562683,
"learning_rate": 0.00023207325684024714,
"loss": 3.1004,
"step": 109000
},
{
"epoch": 5.915568701263781,
"grad_norm": 0.33941227197647095,
"learning_rate": 0.00023119064430714912,
"loss": 3.0987,
"step": 110000
},
{
"epoch": 5.969346598547997,
"grad_norm": 0.33375173807144165,
"learning_rate": 0.00023030891438658428,
"loss": 3.0991,
"step": 111000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4022032021745438,
"eval_loss": 3.4298133850097656,
"eval_runtime": 153.945,
"eval_samples_per_second": 376.245,
"eval_steps_per_second": 5.885,
"step": 111570
},
{
"epoch": 6.023124495832213,
"grad_norm": 0.3521207869052887,
"learning_rate": 0.0002294263018534863,
"loss": 3.0669,
"step": 112000
},
{
"epoch": 6.076902393116429,
"grad_norm": 0.3600497245788574,
"learning_rate": 0.00022854457193292145,
"loss": 3.0351,
"step": 113000
},
{
"epoch": 6.130680290400646,
"grad_norm": 0.373585045337677,
"learning_rate": 0.00022766195939982344,
"loss": 3.034,
"step": 114000
},
{
"epoch": 6.184458187684862,
"grad_norm": 0.37141576409339905,
"learning_rate": 0.00022677934686672548,
"loss": 3.0454,
"step": 115000
},
{
"epoch": 6.238236084969078,
"grad_norm": 0.34734484553337097,
"learning_rate": 0.0002258984995586937,
"loss": 3.0489,
"step": 116000
},
{
"epoch": 6.292013982253294,
"grad_norm": 0.34972816705703735,
"learning_rate": 0.00022501588702559576,
"loss": 3.0488,
"step": 117000
},
{
"epoch": 6.34579187953751,
"grad_norm": 0.36381569504737854,
"learning_rate": 0.00022413327449249777,
"loss": 3.0513,
"step": 118000
},
{
"epoch": 6.399569776821727,
"grad_norm": 0.3417690396308899,
"learning_rate": 0.0002232515445719329,
"loss": 3.0494,
"step": 119000
},
{
"epoch": 6.453347674105943,
"grad_norm": 0.35012194514274597,
"learning_rate": 0.00022236893203883494,
"loss": 3.0557,
"step": 120000
},
{
"epoch": 6.507125571390159,
"grad_norm": 0.35446369647979736,
"learning_rate": 0.00022148631950573699,
"loss": 3.0584,
"step": 121000
},
{
"epoch": 6.560903468674375,
"grad_norm": 0.3539404273033142,
"learning_rate": 0.00022060370697263897,
"loss": 3.0593,
"step": 122000
},
{
"epoch": 6.614681365958591,
"grad_norm": 0.3495212197303772,
"learning_rate": 0.00021972197705207413,
"loss": 3.0562,
"step": 123000
},
{
"epoch": 6.6684592632428075,
"grad_norm": 0.34305086731910706,
"learning_rate": 0.00021883936451897614,
"loss": 3.0582,
"step": 124000
},
{
"epoch": 6.7222371605270235,
"grad_norm": 0.34190261363983154,
"learning_rate": 0.00021795675198587816,
"loss": 3.0585,
"step": 125000
},
{
"epoch": 6.7760150578112395,
"grad_norm": 0.3615049719810486,
"learning_rate": 0.0002170741394527802,
"loss": 3.06,
"step": 126000
},
{
"epoch": 6.8297929550954555,
"grad_norm": 0.3419729769229889,
"learning_rate": 0.00021619152691968225,
"loss": 3.0571,
"step": 127000
},
{
"epoch": 6.8835708523796715,
"grad_norm": 0.3504064381122589,
"learning_rate": 0.00021531067961165048,
"loss": 3.0587,
"step": 128000
},
{
"epoch": 6.937348749663888,
"grad_norm": 0.3564949333667755,
"learning_rate": 0.00021442806707855253,
"loss": 3.0596,
"step": 129000
},
{
"epoch": 6.991126646948104,
"grad_norm": 0.358786404132843,
"learning_rate": 0.00021354633715798762,
"loss": 3.0592,
"step": 130000
},
{
"epoch": 7.0,
"eval_accuracy": 0.4040891064758338,
"eval_loss": 3.398965358734131,
"eval_runtime": 154.0201,
"eval_samples_per_second": 376.061,
"eval_steps_per_second": 5.882,
"step": 130165
},
{
"epoch": 7.04490454423232,
"grad_norm": 0.36317363381385803,
"learning_rate": 0.00021266372462488964,
"loss": 3.0022,
"step": 131000
},
{
"epoch": 7.098682441516536,
"grad_norm": 0.36218351125717163,
"learning_rate": 0.0002117819947043248,
"loss": 2.995,
"step": 132000
},
{
"epoch": 7.152460338800753,
"grad_norm": 0.36327067017555237,
"learning_rate": 0.00021089938217122684,
"loss": 3.0054,
"step": 133000
},
{
"epoch": 7.206238236084969,
"grad_norm": 0.33495405316352844,
"learning_rate": 0.00021001676963812883,
"loss": 3.0095,
"step": 134000
},
{
"epoch": 7.260016133369185,
"grad_norm": 0.38024088740348816,
"learning_rate": 0.00020913415710503087,
"loss": 3.0094,
"step": 135000
},
{
"epoch": 7.313794030653401,
"grad_norm": 0.3585461676120758,
"learning_rate": 0.00020825242718446602,
"loss": 3.0138,
"step": 136000
},
{
"epoch": 7.367571927937617,
"grad_norm": 0.3688068687915802,
"learning_rate": 0.000207369814651368,
"loss": 3.0179,
"step": 137000
},
{
"epoch": 7.421349825221834,
"grad_norm": 0.35052135586738586,
"learning_rate": 0.00020648720211827005,
"loss": 3.0208,
"step": 138000
},
{
"epoch": 7.47512772250605,
"grad_norm": 0.35148343443870544,
"learning_rate": 0.00020560547219770518,
"loss": 3.0192,
"step": 139000
},
{
"epoch": 7.528905619790266,
"grad_norm": 0.3611489236354828,
"learning_rate": 0.00020472285966460722,
"loss": 3.0199,
"step": 140000
},
{
"epoch": 7.582683517074482,
"grad_norm": 0.3296034634113312,
"learning_rate": 0.00020384024713150924,
"loss": 3.0237,
"step": 141000
},
{
"epoch": 7.636461414358699,
"grad_norm": 0.35607197880744934,
"learning_rate": 0.00020295851721094437,
"loss": 3.0238,
"step": 142000
},
{
"epoch": 7.690239311642915,
"grad_norm": 0.35551297664642334,
"learning_rate": 0.0002020759046778464,
"loss": 3.0217,
"step": 143000
},
{
"epoch": 7.744017208927131,
"grad_norm": 0.3628598153591156,
"learning_rate": 0.00020119329214474845,
"loss": 3.0262,
"step": 144000
},
{
"epoch": 7.797795106211347,
"grad_norm": 0.350730836391449,
"learning_rate": 0.00020031156222418355,
"loss": 3.0256,
"step": 145000
},
{
"epoch": 7.851573003495563,
"grad_norm": 0.3436521291732788,
"learning_rate": 0.0001994289496910856,
"loss": 3.0269,
"step": 146000
},
{
"epoch": 7.90535090077978,
"grad_norm": 0.3377685248851776,
"learning_rate": 0.00019854633715798764,
"loss": 3.0278,
"step": 147000
},
{
"epoch": 7.959128798063996,
"grad_norm": 0.3418731689453125,
"learning_rate": 0.00019766372462488965,
"loss": 3.0279,
"step": 148000
},
{
"epoch": 8.0,
"eval_accuracy": 0.40574452106609454,
"eval_loss": 3.3796093463897705,
"eval_runtime": 154.5289,
"eval_samples_per_second": 374.823,
"eval_steps_per_second": 5.863,
"step": 148760
},
{
"epoch": 8.012906695348212,
"grad_norm": 0.34725406765937805,
"learning_rate": 0.00019678111209179167,
"loss": 3.0056,
"step": 149000
},
{
"epoch": 8.066684592632429,
"grad_norm": 0.34432175755500793,
"learning_rate": 0.0001959002647837599,
"loss": 2.962,
"step": 150000
},
{
"epoch": 8.120462489916644,
"grad_norm": 0.3772197961807251,
"learning_rate": 0.00019501765225066195,
"loss": 2.9645,
"step": 151000
},
{
"epoch": 8.17424038720086,
"grad_norm": 0.3569514751434326,
"learning_rate": 0.00019413503971756397,
"loss": 2.9748,
"step": 152000
},
{
"epoch": 8.228018284485076,
"grad_norm": 0.3487160801887512,
"learning_rate": 0.000193252427184466,
"loss": 2.9757,
"step": 153000
},
{
"epoch": 8.281796181769293,
"grad_norm": 0.35110118985176086,
"learning_rate": 0.00019237069726390113,
"loss": 2.9814,
"step": 154000
},
{
"epoch": 8.33557407905351,
"grad_norm": 0.36479270458221436,
"learning_rate": 0.00019148808473080318,
"loss": 2.9834,
"step": 155000
},
{
"epoch": 8.389351976337725,
"grad_norm": 0.36717069149017334,
"learning_rate": 0.00019060547219770517,
"loss": 2.9883,
"step": 156000
},
{
"epoch": 8.443129873621942,
"grad_norm": 0.35104259848594666,
"learning_rate": 0.00018972374227714032,
"loss": 2.9878,
"step": 157000
},
{
"epoch": 8.496907770906157,
"grad_norm": 0.3428269922733307,
"learning_rate": 0.00018884112974404236,
"loss": 2.9893,
"step": 158000
},
{
"epoch": 8.550685668190374,
"grad_norm": 0.3437536358833313,
"learning_rate": 0.0001879593998234775,
"loss": 2.9924,
"step": 159000
},
{
"epoch": 8.60446356547459,
"grad_norm": 0.34665447473526,
"learning_rate": 0.0001870767872903795,
"loss": 2.9911,
"step": 160000
},
{
"epoch": 8.658241462758806,
"grad_norm": 0.36713042855262756,
"learning_rate": 0.00018619417475728152,
"loss": 2.9901,
"step": 161000
},
{
"epoch": 8.712019360043023,
"grad_norm": 0.38735637068748474,
"learning_rate": 0.00018531244483671667,
"loss": 2.9959,
"step": 162000
},
{
"epoch": 8.765797257327238,
"grad_norm": 0.3698832094669342,
"learning_rate": 0.00018442983230361872,
"loss": 2.9985,
"step": 163000
},
{
"epoch": 8.819575154611455,
"grad_norm": 0.37609660625457764,
"learning_rate": 0.0001835472197705207,
"loss": 2.9956,
"step": 164000
},
{
"epoch": 8.873353051895672,
"grad_norm": 0.3600751757621765,
"learning_rate": 0.00018266460723742275,
"loss": 2.9981,
"step": 165000
},
{
"epoch": 8.927130949179887,
"grad_norm": 0.37786364555358887,
"learning_rate": 0.00018178287731685788,
"loss": 2.9975,
"step": 166000
},
{
"epoch": 8.980908846464104,
"grad_norm": 0.3594890832901001,
"learning_rate": 0.00018090114739629303,
"loss": 2.9978,
"step": 167000
},
{
"epoch": 9.0,
"eval_accuracy": 0.40631117239589115,
"eval_loss": 3.3757009506225586,
"eval_runtime": 153.8626,
"eval_samples_per_second": 376.446,
"eval_steps_per_second": 5.888,
"step": 167355
},
{
"epoch": 9.034686743748319,
"grad_norm": 0.36975300312042236,
"learning_rate": 0.00018001853486319502,
"loss": 2.9532,
"step": 168000
},
{
"epoch": 9.088464641032536,
"grad_norm": 0.4000973403453827,
"learning_rate": 0.00017913592233009706,
"loss": 2.9361,
"step": 169000
},
{
"epoch": 9.142242538316752,
"grad_norm": 0.3800918161869049,
"learning_rate": 0.0001782533097969991,
"loss": 2.942,
"step": 170000
},
{
"epoch": 9.196020435600968,
"grad_norm": 0.3795217275619507,
"learning_rate": 0.00017737157987643426,
"loss": 2.9452,
"step": 171000
},
{
"epoch": 9.249798332885184,
"grad_norm": 0.37562939524650574,
"learning_rate": 0.00017648984995586936,
"loss": 2.9491,
"step": 172000
},
{
"epoch": 9.3035762301694,
"grad_norm": 0.3602105379104614,
"learning_rate": 0.00017560723742277137,
"loss": 2.9543,
"step": 173000
},
{
"epoch": 9.357354127453616,
"grad_norm": 0.3644072115421295,
"learning_rate": 0.00017472550750220652,
"loss": 2.9583,
"step": 174000
},
{
"epoch": 9.411132024737833,
"grad_norm": 0.3795081675052643,
"learning_rate": 0.00017384289496910857,
"loss": 2.963,
"step": 175000
},
{
"epoch": 9.464909922022049,
"grad_norm": 0.37337377667427063,
"learning_rate": 0.00017296028243601056,
"loss": 2.9585,
"step": 176000
},
{
"epoch": 9.518687819306265,
"grad_norm": 0.35948848724365234,
"learning_rate": 0.0001720785525154457,
"loss": 2.9648,
"step": 177000
},
{
"epoch": 9.57246571659048,
"grad_norm": 0.3625440001487732,
"learning_rate": 0.00017119593998234775,
"loss": 2.9573,
"step": 178000
},
{
"epoch": 9.626243613874697,
"grad_norm": 0.35655486583709717,
"learning_rate": 0.00017031332744924974,
"loss": 2.9647,
"step": 179000
},
{
"epoch": 9.680021511158914,
"grad_norm": 0.3656957745552063,
"learning_rate": 0.0001694307149161518,
"loss": 2.9667,
"step": 180000
},
{
"epoch": 9.73379940844313,
"grad_norm": 0.3741768002510071,
"learning_rate": 0.0001685489849955869,
"loss": 2.9664,
"step": 181000
},
{
"epoch": 9.787577305727346,
"grad_norm": 0.37431761622428894,
"learning_rate": 0.00016766725507502206,
"loss": 2.966,
"step": 182000
},
{
"epoch": 9.841355203011563,
"grad_norm": 0.3461393117904663,
"learning_rate": 0.0001667846425419241,
"loss": 2.9733,
"step": 183000
},
{
"epoch": 9.895133100295778,
"grad_norm": 0.3577495217323303,
"learning_rate": 0.0001659029126213592,
"loss": 2.9728,
"step": 184000
},
{
"epoch": 9.948910997579995,
"grad_norm": 0.35284364223480225,
"learning_rate": 0.00016502030008826125,
"loss": 2.9761,
"step": 185000
},
{
"epoch": 10.0,
"eval_accuracy": 0.4067828332123111,
"eval_loss": 3.390697717666626,
"eval_runtime": 154.1665,
"eval_samples_per_second": 375.704,
"eval_steps_per_second": 5.877,
"step": 185950
},
{
"epoch": 10.00268889486421,
"grad_norm": 0.3526718020439148,
"learning_rate": 0.00016413768755516327,
"loss": 2.9713,
"step": 186000
},
{
"epoch": 10.056466792148427,
"grad_norm": 0.3849118649959564,
"learning_rate": 0.00016325507502206528,
"loss": 2.9052,
"step": 187000
},
{
"epoch": 10.110244689432642,
"grad_norm": 0.3799303472042084,
"learning_rate": 0.00016237246248896733,
"loss": 2.9176,
"step": 188000
},
{
"epoch": 10.16402258671686,
"grad_norm": 0.3599086105823517,
"learning_rate": 0.00016148984995586937,
"loss": 2.9221,
"step": 189000
},
{
"epoch": 10.217800484001076,
"grad_norm": 0.3713199198246002,
"learning_rate": 0.0001606081200353045,
"loss": 2.9216,
"step": 190000
},
{
"epoch": 10.271578381285291,
"grad_norm": 0.4002034664154053,
"learning_rate": 0.0001597255075022065,
"loss": 2.9235,
"step": 191000
},
{
"epoch": 10.325356278569508,
"grad_norm": 0.39608556032180786,
"learning_rate": 0.00015884466019417475,
"loss": 2.9293,
"step": 192000
},
{
"epoch": 10.379134175853725,
"grad_norm": 0.3750903904438019,
"learning_rate": 0.00015796204766107676,
"loss": 2.9305,
"step": 193000
},
{
"epoch": 10.43291207313794,
"grad_norm": 0.36680716276168823,
"learning_rate": 0.0001570794351279788,
"loss": 2.9354,
"step": 194000
},
{
"epoch": 10.486689970422157,
"grad_norm": 0.36688679456710815,
"learning_rate": 0.00015619682259488082,
"loss": 2.9362,
"step": 195000
},
{
"epoch": 10.540467867706372,
"grad_norm": 0.3786678910255432,
"learning_rate": 0.00015531509267431595,
"loss": 2.9394,
"step": 196000
},
{
"epoch": 10.594245764990589,
"grad_norm": 0.39312633872032166,
"learning_rate": 0.0001544333627537511,
"loss": 2.938,
"step": 197000
},
{
"epoch": 10.648023662274806,
"grad_norm": 0.3482972979545593,
"learning_rate": 0.00015355075022065312,
"loss": 2.9451,
"step": 198000
},
{
"epoch": 10.701801559559021,
"grad_norm": 0.3624250590801239,
"learning_rate": 0.00015266813768755513,
"loss": 2.9398,
"step": 199000
},
{
"epoch": 10.755579456843238,
"grad_norm": 0.3837498128414154,
"learning_rate": 0.00015178552515445718,
"loss": 2.9443,
"step": 200000
},
{
"epoch": 10.809357354127453,
"grad_norm": 0.37360501289367676,
"learning_rate": 0.0001509037952338923,
"loss": 2.949,
"step": 201000
},
{
"epoch": 10.86313525141167,
"grad_norm": 0.37839803099632263,
"learning_rate": 0.00015002206531332745,
"loss": 2.9452,
"step": 202000
},
{
"epoch": 10.916913148695887,
"grad_norm": 0.39486682415008545,
"learning_rate": 0.00014913945278022947,
"loss": 2.9498,
"step": 203000
},
{
"epoch": 10.970691045980102,
"grad_norm": 0.3875749409198761,
"learning_rate": 0.0001482568402471315,
"loss": 2.9464,
"step": 204000
},
{
"epoch": 11.0,
"eval_accuracy": 0.4072896956051168,
"eval_loss": 3.3881287574768066,
"eval_runtime": 154.0418,
"eval_samples_per_second": 376.008,
"eval_steps_per_second": 5.882,
"step": 204545
},
{
"epoch": 11.024468943264319,
"grad_norm": 0.40221425890922546,
"learning_rate": 0.00014737422771403353,
"loss": 2.9172,
"step": 205000
},
{
"epoch": 11.078246840548534,
"grad_norm": 0.384132981300354,
"learning_rate": 0.00014649249779346866,
"loss": 2.8888,
"step": 206000
},
{
"epoch": 11.13202473783275,
"grad_norm": 0.40102720260620117,
"learning_rate": 0.0001456098852603707,
"loss": 2.8918,
"step": 207000
},
{
"epoch": 11.185802635116968,
"grad_norm": 0.3817023038864136,
"learning_rate": 0.00014472727272727272,
"loss": 2.8962,
"step": 208000
},
{
"epoch": 11.239580532401183,
"grad_norm": 0.384730726480484,
"learning_rate": 0.00014384466019417473,
"loss": 2.9001,
"step": 209000
},
{
"epoch": 11.2933584296854,
"grad_norm": 0.41385701298713684,
"learning_rate": 0.00014296293027360986,
"loss": 2.9037,
"step": 210000
},
{
"epoch": 11.347136326969615,
"grad_norm": 0.4016377329826355,
"learning_rate": 0.000142081200353045,
"loss": 2.9088,
"step": 211000
},
{
"epoch": 11.400914224253832,
"grad_norm": 0.39662501215934753,
"learning_rate": 0.00014119858781994703,
"loss": 2.9112,
"step": 212000
},
{
"epoch": 11.454692121538049,
"grad_norm": 0.3822558522224426,
"learning_rate": 0.00014031597528684904,
"loss": 2.914,
"step": 213000
},
{
"epoch": 11.508470018822264,
"grad_norm": 0.3875982463359833,
"learning_rate": 0.0001394333627537511,
"loss": 2.917,
"step": 214000
},
{
"epoch": 11.56224791610648,
"grad_norm": 0.38756969571113586,
"learning_rate": 0.0001385507502206531,
"loss": 2.9182,
"step": 215000
},
{
"epoch": 11.616025813390696,
"grad_norm": 0.3970773220062256,
"learning_rate": 0.00013766902030008826,
"loss": 2.9191,
"step": 216000
},
{
"epoch": 11.669803710674913,
"grad_norm": 0.36975619196891785,
"learning_rate": 0.00013678640776699027,
"loss": 2.9223,
"step": 217000
},
{
"epoch": 11.72358160795913,
"grad_norm": 0.3606310188770294,
"learning_rate": 0.0001359046778464254,
"loss": 2.9189,
"step": 218000
},
{
"epoch": 11.777359505243345,
"grad_norm": 0.38579317927360535,
"learning_rate": 0.00013502206531332744,
"loss": 2.9166,
"step": 219000
},
{
"epoch": 11.831137402527562,
"grad_norm": 0.38102084398269653,
"learning_rate": 0.00013413945278022949,
"loss": 2.923,
"step": 220000
},
{
"epoch": 11.884915299811777,
"grad_norm": 0.39203277230262756,
"learning_rate": 0.0001332568402471315,
"loss": 2.9235,
"step": 221000
},
{
"epoch": 11.938693197095994,
"grad_norm": 0.40768030285835266,
"learning_rate": 0.00013237511032656663,
"loss": 2.9268,
"step": 222000
},
{
"epoch": 11.99247109438021,
"grad_norm": 0.3832620084285736,
"learning_rate": 0.00013149249779346864,
"loss": 2.9236,
"step": 223000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4080087079563691,
"eval_loss": 3.3905434608459473,
"eval_runtime": 153.9882,
"eval_samples_per_second": 376.139,
"eval_steps_per_second": 5.884,
"step": 223140
},
{
"epoch": 12.046248991664426,
"grad_norm": 0.39426013827323914,
"learning_rate": 0.0001306098852603707,
"loss": 2.867,
"step": 224000
},
{
"epoch": 12.100026888948642,
"grad_norm": 0.3989400267601013,
"learning_rate": 0.00012972727272727273,
"loss": 2.8675,
"step": 225000
},
{
"epoch": 12.153804786232858,
"grad_norm": 0.4114680290222168,
"learning_rate": 0.00012884554280670786,
"loss": 2.8738,
"step": 226000
},
{
"epoch": 12.207582683517074,
"grad_norm": 0.3789440393447876,
"learning_rate": 0.00012796293027360987,
"loss": 2.8802,
"step": 227000
},
{
"epoch": 12.261360580801291,
"grad_norm": 0.4167686700820923,
"learning_rate": 0.0001270803177405119,
"loss": 2.8806,
"step": 228000
},
{
"epoch": 12.315138478085506,
"grad_norm": 0.3888895511627197,
"learning_rate": 0.00012619770520741393,
"loss": 2.8876,
"step": 229000
},
{
"epoch": 12.368916375369723,
"grad_norm": 0.40582403540611267,
"learning_rate": 0.00012531597528684906,
"loss": 2.8857,
"step": 230000
},
{
"epoch": 12.422694272653938,
"grad_norm": 0.3923300504684448,
"learning_rate": 0.0001244333627537511,
"loss": 2.8883,
"step": 231000
},
{
"epoch": 12.476472169938155,
"grad_norm": 0.3777754604816437,
"learning_rate": 0.00012355163283318623,
"loss": 2.893,
"step": 232000
},
{
"epoch": 12.530250067222372,
"grad_norm": 0.4126594066619873,
"learning_rate": 0.00012266990291262135,
"loss": 2.8934,
"step": 233000
},
{
"epoch": 12.584027964506587,
"grad_norm": 0.3741176426410675,
"learning_rate": 0.00012178729037952337,
"loss": 2.8971,
"step": 234000
},
{
"epoch": 12.637805861790804,
"grad_norm": 0.41800355911254883,
"learning_rate": 0.00012090467784642541,
"loss": 2.895,
"step": 235000
},
{
"epoch": 12.69158375907502,
"grad_norm": 0.3904781937599182,
"learning_rate": 0.00012002294792586052,
"loss": 2.8958,
"step": 236000
},
{
"epoch": 12.745361656359236,
"grad_norm": 0.3890039324760437,
"learning_rate": 0.00011914033539276257,
"loss": 2.8997,
"step": 237000
},
{
"epoch": 12.799139553643453,
"grad_norm": 0.3933662474155426,
"learning_rate": 0.0001182577228596646,
"loss": 2.9018,
"step": 238000
},
{
"epoch": 12.852917450927668,
"grad_norm": 0.3928120732307434,
"learning_rate": 0.00011737511032656663,
"loss": 2.902,
"step": 239000
},
{
"epoch": 12.906695348211885,
"grad_norm": 0.37820199131965637,
"learning_rate": 0.00011649338040600175,
"loss": 2.905,
"step": 240000
},
{
"epoch": 12.9604732454961,
"grad_norm": 0.3890197277069092,
"learning_rate": 0.0001156107678729038,
"loss": 2.907,
"step": 241000
},
{
"epoch": 13.0,
"eval_accuracy": 0.4086836511585584,
"eval_loss": 3.3879897594451904,
"eval_runtime": 154.2919,
"eval_samples_per_second": 375.399,
"eval_steps_per_second": 5.872,
"step": 241735
},
{
"epoch": 13.014251142780317,
"grad_norm": 0.3926098942756653,
"learning_rate": 0.00011472903795233891,
"loss": 2.8894,
"step": 242000
},
{
"epoch": 13.068029040064534,
"grad_norm": 0.4247579872608185,
"learning_rate": 0.00011384642541924095,
"loss": 2.8512,
"step": 243000
},
{
"epoch": 13.12180693734875,
"grad_norm": 0.4137355387210846,
"learning_rate": 0.00011296469549867606,
"loss": 2.8545,
"step": 244000
},
{
"epoch": 13.175584834632966,
"grad_norm": 0.3952256739139557,
"learning_rate": 0.0001120829655781112,
"loss": 2.8564,
"step": 245000
},
{
"epoch": 13.229362731917181,
"grad_norm": 0.3921244740486145,
"learning_rate": 0.00011120035304501322,
"loss": 2.8581,
"step": 246000
},
{
"epoch": 13.283140629201398,
"grad_norm": 0.3818395137786865,
"learning_rate": 0.00011031774051191526,
"loss": 2.8626,
"step": 247000
},
{
"epoch": 13.336918526485615,
"grad_norm": 0.40112724900245667,
"learning_rate": 0.00010943601059135037,
"loss": 2.8613,
"step": 248000
},
{
"epoch": 13.39069642376983,
"grad_norm": 0.3878421187400818,
"learning_rate": 0.00010855339805825242,
"loss": 2.8637,
"step": 249000
},
{
"epoch": 13.444474321054047,
"grad_norm": 0.4171620011329651,
"learning_rate": 0.00010767078552515445,
"loss": 2.8673,
"step": 250000
},
{
"epoch": 13.498252218338264,
"grad_norm": 0.39591026306152344,
"learning_rate": 0.00010678905560458957,
"loss": 2.8703,
"step": 251000
},
{
"epoch": 13.552030115622479,
"grad_norm": 0.39251312613487244,
"learning_rate": 0.0001059064430714916,
"loss": 2.878,
"step": 252000
},
{
"epoch": 13.605808012906696,
"grad_norm": 0.3924385607242584,
"learning_rate": 0.00010502471315092674,
"loss": 2.8778,
"step": 253000
},
{
"epoch": 13.659585910190911,
"grad_norm": 0.40747836232185364,
"learning_rate": 0.00010414210061782876,
"loss": 2.8754,
"step": 254000
},
{
"epoch": 13.713363807475128,
"grad_norm": 0.3943057954311371,
"learning_rate": 0.0001032594880847308,
"loss": 2.8814,
"step": 255000
},
{
"epoch": 13.767141704759345,
"grad_norm": 0.39504125714302063,
"learning_rate": 0.00010237775816416591,
"loss": 2.8816,
"step": 256000
},
{
"epoch": 13.82091960204356,
"grad_norm": 0.3863300085067749,
"learning_rate": 0.00010149514563106796,
"loss": 2.885,
"step": 257000
},
{
"epoch": 13.874697499327777,
"grad_norm": 0.406428724527359,
"learning_rate": 0.00010061341571050307,
"loss": 2.8806,
"step": 258000
},
{
"epoch": 13.928475396611992,
"grad_norm": 0.4188634157180786,
"learning_rate": 9.973080317740511e-05,
"loss": 2.8808,
"step": 259000
},
{
"epoch": 13.982253293896209,
"grad_norm": 0.4093535542488098,
"learning_rate": 9.884819064430714e-05,
"loss": 2.8803,
"step": 260000
},
{
"epoch": 14.0,
"eval_accuracy": 0.40880511003280534,
"eval_loss": 3.3926610946655273,
"eval_runtime": 154.1123,
"eval_samples_per_second": 375.836,
"eval_steps_per_second": 5.879,
"step": 260330
},
{
"epoch": 14.036031191180426,
"grad_norm": 0.41151463985443115,
"learning_rate": 9.796557811120916e-05,
"loss": 2.8484,
"step": 261000
},
{
"epoch": 14.08980908846464,
"grad_norm": 0.4244749844074249,
"learning_rate": 9.70838481906443e-05,
"loss": 2.8302,
"step": 262000
},
{
"epoch": 14.143586985748858,
"grad_norm": 0.43255192041397095,
"learning_rate": 9.620123565754632e-05,
"loss": 2.8321,
"step": 263000
},
{
"epoch": 14.197364883033073,
"grad_norm": 0.40345045924186707,
"learning_rate": 9.531862312444836e-05,
"loss": 2.8367,
"step": 264000
},
{
"epoch": 14.25114278031729,
"grad_norm": 0.4035230875015259,
"learning_rate": 9.443601059135039e-05,
"loss": 2.8442,
"step": 265000
},
{
"epoch": 14.304920677601507,
"grad_norm": 0.40154018998146057,
"learning_rate": 9.355516328331861e-05,
"loss": 2.845,
"step": 266000
},
{
"epoch": 14.358698574885722,
"grad_norm": 0.39079976081848145,
"learning_rate": 9.267255075022065e-05,
"loss": 2.8454,
"step": 267000
},
{
"epoch": 14.412476472169939,
"grad_norm": 0.39859312772750854,
"learning_rate": 9.178993821712268e-05,
"loss": 2.8517,
"step": 268000
},
{
"epoch": 14.466254369454154,
"grad_norm": 0.42548927664756775,
"learning_rate": 9.09073256840247e-05,
"loss": 2.8499,
"step": 269000
},
{
"epoch": 14.52003226673837,
"grad_norm": 0.4101540148258209,
"learning_rate": 9.002647837599293e-05,
"loss": 2.8538,
"step": 270000
},
{
"epoch": 14.573810164022587,
"grad_norm": 0.4425499737262726,
"learning_rate": 8.914386584289496e-05,
"loss": 2.8565,
"step": 271000
},
{
"epoch": 14.627588061306803,
"grad_norm": 0.40012863278388977,
"learning_rate": 8.8261253309797e-05,
"loss": 2.8565,
"step": 272000
},
{
"epoch": 14.68136595859102,
"grad_norm": 0.39511892199516296,
"learning_rate": 8.737952338923212e-05,
"loss": 2.8588,
"step": 273000
},
{
"epoch": 14.735143855875235,
"grad_norm": 0.42598825693130493,
"learning_rate": 8.649691085613415e-05,
"loss": 2.8593,
"step": 274000
},
{
"epoch": 14.788921753159451,
"grad_norm": 0.4081316292285919,
"learning_rate": 8.561429832303618e-05,
"loss": 2.861,
"step": 275000
},
{
"epoch": 14.842699650443668,
"grad_norm": 0.4154646098613739,
"learning_rate": 8.473168578993821e-05,
"loss": 2.8624,
"step": 276000
},
{
"epoch": 14.896477547727883,
"grad_norm": 0.4205508828163147,
"learning_rate": 8.384995586937333e-05,
"loss": 2.8649,
"step": 277000
},
{
"epoch": 14.9502554450121,
"grad_norm": 0.4075837731361389,
"learning_rate": 8.296734333627537e-05,
"loss": 2.8614,
"step": 278000
},
{
"epoch": 15.0,
"eval_accuracy": 0.4090819529646479,
"eval_loss": 3.3923890590667725,
"eval_runtime": 153.9312,
"eval_samples_per_second": 376.279,
"eval_steps_per_second": 5.886,
"step": 278925
},
{
"epoch": 15.004033342296315,
"grad_norm": 0.42821255326271057,
"learning_rate": 8.20847308031774e-05,
"loss": 2.8598,
"step": 279000
},
{
"epoch": 15.057811239580532,
"grad_norm": 0.41713452339172363,
"learning_rate": 8.120300088261253e-05,
"loss": 2.8112,
"step": 280000
},
{
"epoch": 15.11158913686475,
"grad_norm": 0.4310871660709381,
"learning_rate": 8.032127096204765e-05,
"loss": 2.8187,
"step": 281000
},
{
"epoch": 15.165367034148964,
"grad_norm": 0.42941373586654663,
"learning_rate": 7.943865842894969e-05,
"loss": 2.82,
"step": 282000
},
{
"epoch": 15.219144931433181,
"grad_norm": 0.4279724657535553,
"learning_rate": 7.85560458958517e-05,
"loss": 2.824,
"step": 283000
},
{
"epoch": 15.272922828717396,
"grad_norm": 0.4205762445926666,
"learning_rate": 7.767343336275375e-05,
"loss": 2.8251,
"step": 284000
},
{
"epoch": 15.326700726001613,
"grad_norm": 0.40928196907043457,
"learning_rate": 7.679082082965578e-05,
"loss": 2.8295,
"step": 285000
},
{
"epoch": 15.38047862328583,
"grad_norm": 0.4192693829536438,
"learning_rate": 7.59090909090909e-05,
"loss": 2.8333,
"step": 286000
},
{
"epoch": 15.434256520570045,
"grad_norm": 0.40103474259376526,
"learning_rate": 7.502647837599294e-05,
"loss": 2.8327,
"step": 287000
},
{
"epoch": 15.488034417854262,
"grad_norm": 0.42553824186325073,
"learning_rate": 7.414386584289497e-05,
"loss": 2.8343,
"step": 288000
},
{
"epoch": 15.541812315138477,
"grad_norm": 0.4238561987876892,
"learning_rate": 7.3261253309797e-05,
"loss": 2.8346,
"step": 289000
},
{
"epoch": 15.595590212422694,
"grad_norm": 0.44222867488861084,
"learning_rate": 7.237952338923212e-05,
"loss": 2.837,
"step": 290000
},
{
"epoch": 15.649368109706911,
"grad_norm": 0.4435216784477234,
"learning_rate": 7.149691085613415e-05,
"loss": 2.8348,
"step": 291000
},
{
"epoch": 15.703146006991126,
"grad_norm": 0.434276282787323,
"learning_rate": 7.061518093556928e-05,
"loss": 2.8376,
"step": 292000
},
{
"epoch": 15.756923904275343,
"grad_norm": 0.4052608013153076,
"learning_rate": 6.97325684024713e-05,
"loss": 2.8421,
"step": 293000
},
{
"epoch": 15.81070180155956,
"grad_norm": 0.4162592589855194,
"learning_rate": 6.885083848190643e-05,
"loss": 2.8413,
"step": 294000
},
{
"epoch": 15.864479698843775,
"grad_norm": 0.4163593649864197,
"learning_rate": 6.796822594880847e-05,
"loss": 2.843,
"step": 295000
},
{
"epoch": 15.918257596127992,
"grad_norm": 0.4112990200519562,
"learning_rate": 6.70856134157105e-05,
"loss": 2.8427,
"step": 296000
},
{
"epoch": 15.972035493412207,
"grad_norm": 0.3960702121257782,
"learning_rate": 6.620300088261252e-05,
"loss": 2.8414,
"step": 297000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4098178943182842,
"eval_loss": 3.3970065116882324,
"eval_runtime": 154.1119,
"eval_samples_per_second": 375.837,
"eval_steps_per_second": 5.879,
"step": 297520
},
{
"epoch": 16.025813390696424,
"grad_norm": 0.4249773919582367,
"learning_rate": 6.532127096204766e-05,
"loss": 2.8211,
"step": 298000
},
{
"epoch": 16.07959128798064,
"grad_norm": 0.4380571246147156,
"learning_rate": 6.443865842894968e-05,
"loss": 2.8058,
"step": 299000
},
{
"epoch": 16.133369185264858,
"grad_norm": 0.43307915329933167,
"learning_rate": 6.355604589585172e-05,
"loss": 2.8082,
"step": 300000
},
{
"epoch": 16.18714708254907,
"grad_norm": 0.44826626777648926,
"learning_rate": 6.267431597528685e-05,
"loss": 2.8046,
"step": 301000
},
{
"epoch": 16.240924979833288,
"grad_norm": 0.4037844240665436,
"learning_rate": 6.179170344218888e-05,
"loss": 2.8096,
"step": 302000
},
{
"epoch": 16.294702877117505,
"grad_norm": 0.4280061721801758,
"learning_rate": 6.0909973521623994e-05,
"loss": 2.8112,
"step": 303000
},
{
"epoch": 16.34848077440172,
"grad_norm": 0.43137866258621216,
"learning_rate": 6.002736098852603e-05,
"loss": 2.8104,
"step": 304000
},
{
"epoch": 16.40225867168594,
"grad_norm": 0.41685307025909424,
"learning_rate": 5.914563106796116e-05,
"loss": 2.811,
"step": 305000
},
{
"epoch": 16.456036568970152,
"grad_norm": 0.4230118989944458,
"learning_rate": 5.8263018534863187e-05,
"loss": 2.8155,
"step": 306000
},
{
"epoch": 16.50981446625437,
"grad_norm": 0.4346941113471985,
"learning_rate": 5.738040600176522e-05,
"loss": 2.8177,
"step": 307000
},
{
"epoch": 16.563592363538586,
"grad_norm": 0.46773964166641235,
"learning_rate": 5.6497793468667254e-05,
"loss": 2.8171,
"step": 308000
},
{
"epoch": 16.617370260822803,
"grad_norm": 0.42793235182762146,
"learning_rate": 5.561606354810238e-05,
"loss": 2.815,
"step": 309000
},
{
"epoch": 16.67114815810702,
"grad_norm": 0.43302974104881287,
"learning_rate": 5.4734333627537504e-05,
"loss": 2.8221,
"step": 310000
},
{
"epoch": 16.724926055391233,
"grad_norm": 0.4403083622455597,
"learning_rate": 5.3851721094439534e-05,
"loss": 2.8215,
"step": 311000
},
{
"epoch": 16.77870395267545,
"grad_norm": 0.4399563670158386,
"learning_rate": 5.2969108561341564e-05,
"loss": 2.8208,
"step": 312000
},
{
"epoch": 16.832481849959667,
"grad_norm": 0.4378258287906647,
"learning_rate": 5.208737864077669e-05,
"loss": 2.8231,
"step": 313000
},
{
"epoch": 16.886259747243884,
"grad_norm": 0.41973891854286194,
"learning_rate": 5.1204766107678726e-05,
"loss": 2.8237,
"step": 314000
},
{
"epoch": 16.9400376445281,
"grad_norm": 0.4335421323776245,
"learning_rate": 5.0322153574580757e-05,
"loss": 2.8232,
"step": 315000
},
{
"epoch": 16.993815541812314,
"grad_norm": 0.4244571924209595,
"learning_rate": 4.944042365401588e-05,
"loss": 2.8226,
"step": 316000
},
{
"epoch": 17.0,
"eval_accuracy": 0.40916397801191307,
"eval_loss": 3.411121129989624,
"eval_runtime": 154.5975,
"eval_samples_per_second": 374.657,
"eval_steps_per_second": 5.86,
"step": 316115
},
{
"epoch": 17.04759343909653,
"grad_norm": 0.4443401098251343,
"learning_rate": 4.855781112091791e-05,
"loss": 2.789,
"step": 317000
},
{
"epoch": 17.101371336380748,
"grad_norm": 0.40948519110679626,
"learning_rate": 4.767519858781995e-05,
"loss": 2.7875,
"step": 318000
},
{
"epoch": 17.155149233664964,
"grad_norm": 0.44507384300231934,
"learning_rate": 4.6793468667255074e-05,
"loss": 2.7909,
"step": 319000
},
{
"epoch": 17.20892713094918,
"grad_norm": 0.44478124380111694,
"learning_rate": 4.5910856134157104e-05,
"loss": 2.7936,
"step": 320000
},
{
"epoch": 17.262705028233395,
"grad_norm": 0.44282448291778564,
"learning_rate": 4.502824360105913e-05,
"loss": 2.7925,
"step": 321000
},
{
"epoch": 17.31648292551761,
"grad_norm": 0.45248299837112427,
"learning_rate": 4.414651368049426e-05,
"loss": 2.7958,
"step": 322000
},
{
"epoch": 17.37026082280183,
"grad_norm": 0.4552863836288452,
"learning_rate": 4.3263901147396296e-05,
"loss": 2.7993,
"step": 323000
},
{
"epoch": 17.424038720086045,
"grad_norm": 0.46229588985443115,
"learning_rate": 4.238128861429832e-05,
"loss": 2.7969,
"step": 324000
},
{
"epoch": 17.477816617370262,
"grad_norm": 0.4318771958351135,
"learning_rate": 4.149955869373345e-05,
"loss": 2.7997,
"step": 325000
},
{
"epoch": 17.531594514654476,
"grad_norm": 0.45026683807373047,
"learning_rate": 4.0616946160635475e-05,
"loss": 2.7975,
"step": 326000
},
{
"epoch": 17.585372411938692,
"grad_norm": 0.4539460837841034,
"learning_rate": 3.9734333627537505e-05,
"loss": 2.8004,
"step": 327000
},
{
"epoch": 17.63915030922291,
"grad_norm": 0.44773465394973755,
"learning_rate": 3.885260370697263e-05,
"loss": 2.8014,
"step": 328000
},
{
"epoch": 17.692928206507126,
"grad_norm": 0.443488210439682,
"learning_rate": 3.796999117387467e-05,
"loss": 2.804,
"step": 329000
},
{
"epoch": 17.746706103791343,
"grad_norm": 0.4228520691394806,
"learning_rate": 3.70873786407767e-05,
"loss": 2.8023,
"step": 330000
},
{
"epoch": 17.800484001075557,
"grad_norm": 0.43788018822669983,
"learning_rate": 3.620564872021182e-05,
"loss": 2.8027,
"step": 331000
},
{
"epoch": 17.854261898359773,
"grad_norm": 0.4629572629928589,
"learning_rate": 3.532303618711385e-05,
"loss": 2.8044,
"step": 332000
},
{
"epoch": 17.90803979564399,
"grad_norm": 0.4508598744869232,
"learning_rate": 3.444042365401588e-05,
"loss": 2.8069,
"step": 333000
},
{
"epoch": 17.961817692928207,
"grad_norm": 0.4376344084739685,
"learning_rate": 3.355869373345101e-05,
"loss": 2.8063,
"step": 334000
},
{
"epoch": 18.0,
"eval_accuracy": 0.4091488628312131,
"eval_loss": 3.419936418533325,
"eval_runtime": 154.2734,
"eval_samples_per_second": 375.444,
"eval_steps_per_second": 5.873,
"step": 334710
},
{
"epoch": 18.015595590212424,
"grad_norm": 0.44690123200416565,
"learning_rate": 3.2676081200353045e-05,
"loss": 2.7994,
"step": 335000
},
{
"epoch": 18.069373487496637,
"grad_norm": 0.4568174183368683,
"learning_rate": 3.179435127978817e-05,
"loss": 2.7778,
"step": 336000
},
{
"epoch": 18.123151384780854,
"grad_norm": 0.4270166754722595,
"learning_rate": 3.09117387466902e-05,
"loss": 2.7778,
"step": 337000
},
{
"epoch": 18.17692928206507,
"grad_norm": 0.4290498197078705,
"learning_rate": 3.0030008826125325e-05,
"loss": 2.7754,
"step": 338000
},
{
"epoch": 18.230707179349288,
"grad_norm": 0.44628649950027466,
"learning_rate": 2.914739629302736e-05,
"loss": 2.7749,
"step": 339000
},
{
"epoch": 18.284485076633505,
"grad_norm": 0.4463174641132355,
"learning_rate": 2.8264783759929386e-05,
"loss": 2.7784,
"step": 340000
},
{
"epoch": 18.33826297391772,
"grad_norm": 0.46662789583206177,
"learning_rate": 2.7383936451897613e-05,
"loss": 2.7852,
"step": 341000
},
{
"epoch": 18.392040871201935,
"grad_norm": 0.4375401437282562,
"learning_rate": 2.6501323918799646e-05,
"loss": 2.7807,
"step": 342000
},
{
"epoch": 18.445818768486152,
"grad_norm": 0.4658895432949066,
"learning_rate": 2.5618711385701673e-05,
"loss": 2.7848,
"step": 343000
},
{
"epoch": 18.49959666577037,
"grad_norm": 0.47445955872535706,
"learning_rate": 2.47369814651368e-05,
"loss": 2.7869,
"step": 344000
},
{
"epoch": 18.553374563054586,
"grad_norm": 0.45969846844673157,
"learning_rate": 2.385436893203883e-05,
"loss": 2.7861,
"step": 345000
},
{
"epoch": 18.6071524603388,
"grad_norm": 0.4380541741847992,
"learning_rate": 2.2971756398940862e-05,
"loss": 2.7824,
"step": 346000
},
{
"epoch": 18.660930357623016,
"grad_norm": 0.45412492752075195,
"learning_rate": 2.2089143865842892e-05,
"loss": 2.7861,
"step": 347000
},
{
"epoch": 18.714708254907233,
"grad_norm": 0.47223806381225586,
"learning_rate": 2.120741394527802e-05,
"loss": 2.7849,
"step": 348000
},
{
"epoch": 18.76848615219145,
"grad_norm": 0.4475175440311432,
"learning_rate": 2.032568402471315e-05,
"loss": 2.7855,
"step": 349000
},
{
"epoch": 18.822264049475667,
"grad_norm": 0.44210994243621826,
"learning_rate": 1.944307149161518e-05,
"loss": 2.7863,
"step": 350000
},
{
"epoch": 18.87604194675988,
"grad_norm": 0.4564194977283478,
"learning_rate": 1.856045895851721e-05,
"loss": 2.7858,
"step": 351000
},
{
"epoch": 18.929819844044097,
"grad_norm": 0.43993762135505676,
"learning_rate": 1.7678729037952338e-05,
"loss": 2.7868,
"step": 352000
},
{
"epoch": 18.983597741328314,
"grad_norm": 0.42271167039871216,
"learning_rate": 1.6796116504854368e-05,
"loss": 2.7905,
"step": 353000
},
{
"epoch": 19.0,
"eval_accuracy": 0.4092819435999537,
"eval_loss": 3.423438310623169,
"eval_runtime": 154.0851,
"eval_samples_per_second": 375.903,
"eval_steps_per_second": 5.88,
"step": 353305
},
{
"epoch": 19.03737563861253,
"grad_norm": 0.4467901289463043,
"learning_rate": 1.5914386584289496e-05,
"loss": 2.7736,
"step": 354000
},
{
"epoch": 19.091153535896748,
"grad_norm": 0.45330286026000977,
"learning_rate": 1.5031774051191525e-05,
"loss": 2.7671,
"step": 355000
},
{
"epoch": 19.14493143318096,
"grad_norm": 0.451779305934906,
"learning_rate": 1.4149161518093555e-05,
"loss": 2.7674,
"step": 356000
},
{
"epoch": 19.198709330465178,
"grad_norm": 0.462492436170578,
"learning_rate": 1.3266548984995585e-05,
"loss": 2.7679,
"step": 357000
},
{
"epoch": 19.252487227749395,
"grad_norm": 0.47610774636268616,
"learning_rate": 1.2383936451897616e-05,
"loss": 2.7677,
"step": 358000
},
{
"epoch": 19.30626512503361,
"grad_norm": 0.47243231534957886,
"learning_rate": 1.1501323918799646e-05,
"loss": 2.7695,
"step": 359000
},
{
"epoch": 19.36004302231783,
"grad_norm": 0.47811663150787354,
"learning_rate": 1.0618711385701676e-05,
"loss": 2.7679,
"step": 360000
},
{
"epoch": 19.413820919602042,
"grad_norm": 0.4842175543308258,
"learning_rate": 9.737864077669903e-06,
"loss": 2.7683,
"step": 361000
},
{
"epoch": 19.46759881688626,
"grad_norm": 0.4452952444553375,
"learning_rate": 8.855251544571931e-06,
"loss": 2.7647,
"step": 362000
},
{
"epoch": 19.521376714170476,
"grad_norm": 0.47025254368782043,
"learning_rate": 7.972639011473961e-06,
"loss": 2.7692,
"step": 363000
},
{
"epoch": 19.575154611454693,
"grad_norm": 0.446979820728302,
"learning_rate": 7.09090909090909e-06,
"loss": 2.7708,
"step": 364000
},
{
"epoch": 19.62893250873891,
"grad_norm": 0.4704710841178894,
"learning_rate": 6.20829655781112e-06,
"loss": 2.7699,
"step": 365000
},
{
"epoch": 19.682710406023123,
"grad_norm": 0.4596230685710907,
"learning_rate": 5.32568402471315e-06,
"loss": 2.7718,
"step": 366000
},
{
"epoch": 19.73648830330734,
"grad_norm": 0.4677480459213257,
"learning_rate": 4.443954104148278e-06,
"loss": 2.7665,
"step": 367000
},
{
"epoch": 19.790266200591557,
"grad_norm": 0.45196905732154846,
"learning_rate": 3.5613415710503085e-06,
"loss": 2.7717,
"step": 368000
},
{
"epoch": 19.844044097875774,
"grad_norm": 0.44339102506637573,
"learning_rate": 2.6787290379523387e-06,
"loss": 2.7711,
"step": 369000
},
{
"epoch": 19.89782199515999,
"grad_norm": 0.46412837505340576,
"learning_rate": 1.7961165048543689e-06,
"loss": 2.7658,
"step": 370000
},
{
"epoch": 19.951599892444204,
"grad_norm": 0.4551531970500946,
"learning_rate": 9.143865842894968e-07,
"loss": 2.7753,
"step": 371000
},
{
"epoch": 20.0,
"eval_accuracy": 0.40922034084127873,
"eval_loss": 3.4287688732147217,
"eval_runtime": 154.0794,
"eval_samples_per_second": 375.917,
"eval_steps_per_second": 5.88,
"step": 371900
},
{
"epoch": 20.0,
"step": 371900,
"total_flos": 1.5669257538816e+18,
"train_loss": 3.0594762622313207,
"train_runtime": 80951.5095,
"train_samples_per_second": 147.01,
"train_steps_per_second": 4.594
}
],
"logging_steps": 1000,
"max_steps": 371900,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"total_flos": 1.5669257538816e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}