craa's picture
Upload folder using huggingface_hub
028718f verified
{
"best_global_step": 65000,
"best_metric": 3.5314111709594727,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_push_frequency_2128/checkpoint-40000",
"epoch": 29.120857359193895,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014561127613722406,
"grad_norm": 1.4858547449111938,
"learning_rate": 0.000294,
"loss": 8.4433,
"step": 50
},
{
"epoch": 0.029122255227444813,
"grad_norm": 0.7950296998023987,
"learning_rate": 0.0005939999999999999,
"loss": 6.7515,
"step": 100
},
{
"epoch": 0.04368338284116722,
"grad_norm": 0.46963977813720703,
"learning_rate": 0.0005998286713286713,
"loss": 6.3451,
"step": 150
},
{
"epoch": 0.058244510454889625,
"grad_norm": 0.5301603674888611,
"learning_rate": 0.0005996538461538461,
"loss": 6.1351,
"step": 200
},
{
"epoch": 0.07280563806861204,
"grad_norm": 0.39352092146873474,
"learning_rate": 0.0005994790209790209,
"loss": 5.9881,
"step": 250
},
{
"epoch": 0.08736676568233444,
"grad_norm": 0.40677252411842346,
"learning_rate": 0.0005993041958041958,
"loss": 5.8428,
"step": 300
},
{
"epoch": 0.10192789329605685,
"grad_norm": 0.4639081358909607,
"learning_rate": 0.0005991293706293705,
"loss": 5.724,
"step": 350
},
{
"epoch": 0.11648902090977925,
"grad_norm": 0.4001801609992981,
"learning_rate": 0.0005989545454545454,
"loss": 5.6175,
"step": 400
},
{
"epoch": 0.13105014852350166,
"grad_norm": 0.48260679841041565,
"learning_rate": 0.0005987797202797202,
"loss": 5.4975,
"step": 450
},
{
"epoch": 0.14561127613722408,
"grad_norm": 0.5032499432563782,
"learning_rate": 0.000598604895104895,
"loss": 5.3974,
"step": 500
},
{
"epoch": 0.16017240375094646,
"grad_norm": 0.45361870527267456,
"learning_rate": 0.0005984300699300698,
"loss": 5.3242,
"step": 550
},
{
"epoch": 0.17473353136466888,
"grad_norm": 0.42623206973075867,
"learning_rate": 0.0005982552447552447,
"loss": 5.2535,
"step": 600
},
{
"epoch": 0.1892946589783913,
"grad_norm": 0.4469488561153412,
"learning_rate": 0.0005980804195804195,
"loss": 5.1812,
"step": 650
},
{
"epoch": 0.2038557865921137,
"grad_norm": 0.4211214780807495,
"learning_rate": 0.0005979055944055943,
"loss": 5.1233,
"step": 700
},
{
"epoch": 0.2184169142058361,
"grad_norm": 0.4314533770084381,
"learning_rate": 0.0005977307692307691,
"loss": 5.0677,
"step": 750
},
{
"epoch": 0.2329780418195585,
"grad_norm": 0.5419679284095764,
"learning_rate": 0.000597555944055944,
"loss": 5.0121,
"step": 800
},
{
"epoch": 0.24753916943328091,
"grad_norm": 0.4582156836986542,
"learning_rate": 0.0005973811188811188,
"loss": 4.9482,
"step": 850
},
{
"epoch": 0.2621002970470033,
"grad_norm": 0.4464901387691498,
"learning_rate": 0.0005972062937062936,
"loss": 4.9097,
"step": 900
},
{
"epoch": 0.27666142466072574,
"grad_norm": 0.4611063301563263,
"learning_rate": 0.0005970314685314685,
"loss": 4.867,
"step": 950
},
{
"epoch": 0.29122255227444815,
"grad_norm": 0.638515830039978,
"learning_rate": 0.0005968566433566433,
"loss": 4.8082,
"step": 1000
},
{
"epoch": 0.29122255227444815,
"eval_accuracy": 0.25687335662170213,
"eval_loss": 4.738000869750977,
"eval_runtime": 179.5194,
"eval_samples_per_second": 92.725,
"eval_steps_per_second": 5.799,
"step": 1000
},
{
"epoch": 0.30578367988817057,
"grad_norm": 0.43320566415786743,
"learning_rate": 0.0005966818181818181,
"loss": 4.7777,
"step": 1050
},
{
"epoch": 0.3203448075018929,
"grad_norm": 0.5117963552474976,
"learning_rate": 0.0005965069930069929,
"loss": 4.7314,
"step": 1100
},
{
"epoch": 0.33490593511561534,
"grad_norm": 0.43931126594543457,
"learning_rate": 0.0005963321678321677,
"loss": 4.6915,
"step": 1150
},
{
"epoch": 0.34946706272933775,
"grad_norm": 0.47124797105789185,
"learning_rate": 0.0005961573426573425,
"loss": 4.6574,
"step": 1200
},
{
"epoch": 0.36402819034306017,
"grad_norm": 0.416259229183197,
"learning_rate": 0.0005959825174825174,
"loss": 4.6264,
"step": 1250
},
{
"epoch": 0.3785893179567826,
"grad_norm": 0.46466392278671265,
"learning_rate": 0.0005958076923076922,
"loss": 4.5968,
"step": 1300
},
{
"epoch": 0.393150445570505,
"grad_norm": 0.4708881378173828,
"learning_rate": 0.000595632867132867,
"loss": 4.57,
"step": 1350
},
{
"epoch": 0.4077115731842274,
"grad_norm": 0.48490315675735474,
"learning_rate": 0.0005954580419580418,
"loss": 4.5425,
"step": 1400
},
{
"epoch": 0.4222727007979498,
"grad_norm": 0.48324763774871826,
"learning_rate": 0.0005952832167832168,
"loss": 4.5181,
"step": 1450
},
{
"epoch": 0.4368338284116722,
"grad_norm": 0.39952364563941956,
"learning_rate": 0.0005951083916083916,
"loss": 4.5073,
"step": 1500
},
{
"epoch": 0.4513949560253946,
"grad_norm": 0.4076775908470154,
"learning_rate": 0.0005949335664335664,
"loss": 4.4717,
"step": 1550
},
{
"epoch": 0.465956083639117,
"grad_norm": 0.43624168634414673,
"learning_rate": 0.0005947587412587413,
"loss": 4.4605,
"step": 1600
},
{
"epoch": 0.4805172112528394,
"grad_norm": 0.41203925013542175,
"learning_rate": 0.0005945839160839161,
"loss": 4.4414,
"step": 1650
},
{
"epoch": 0.49507833886656183,
"grad_norm": 0.4499853849411011,
"learning_rate": 0.0005944090909090909,
"loss": 4.4138,
"step": 1700
},
{
"epoch": 0.5096394664802842,
"grad_norm": 0.4240889549255371,
"learning_rate": 0.0005942342657342657,
"loss": 4.4028,
"step": 1750
},
{
"epoch": 0.5242005940940067,
"grad_norm": 0.4965490698814392,
"learning_rate": 0.0005940594405594406,
"loss": 4.3865,
"step": 1800
},
{
"epoch": 0.5387617217077291,
"grad_norm": 0.4231831133365631,
"learning_rate": 0.0005938846153846153,
"loss": 4.3704,
"step": 1850
},
{
"epoch": 0.5533228493214515,
"grad_norm": 0.43335089087486267,
"learning_rate": 0.0005937097902097902,
"loss": 4.37,
"step": 1900
},
{
"epoch": 0.5678839769351739,
"grad_norm": 0.42372700572013855,
"learning_rate": 0.000593534965034965,
"loss": 4.3537,
"step": 1950
},
{
"epoch": 0.5824451045488963,
"grad_norm": 0.3962233364582062,
"learning_rate": 0.0005933601398601398,
"loss": 4.3388,
"step": 2000
},
{
"epoch": 0.5824451045488963,
"eval_accuracy": 0.3000544549997378,
"eval_loss": 4.2761712074279785,
"eval_runtime": 179.7222,
"eval_samples_per_second": 92.621,
"eval_steps_per_second": 5.792,
"step": 2000
},
{
"epoch": 0.5970062321626187,
"grad_norm": 0.3804718852043152,
"learning_rate": 0.0005931853146853146,
"loss": 4.3206,
"step": 2050
},
{
"epoch": 0.6115673597763411,
"grad_norm": 0.36061641573905945,
"learning_rate": 0.0005930104895104895,
"loss": 4.3124,
"step": 2100
},
{
"epoch": 0.6261284873900634,
"grad_norm": 0.44453078508377075,
"learning_rate": 0.0005928356643356643,
"loss": 4.2984,
"step": 2150
},
{
"epoch": 0.6406896150037859,
"grad_norm": 0.381816029548645,
"learning_rate": 0.0005926608391608391,
"loss": 4.2922,
"step": 2200
},
{
"epoch": 0.6552507426175083,
"grad_norm": 0.37145373225212097,
"learning_rate": 0.000592486013986014,
"loss": 4.2668,
"step": 2250
},
{
"epoch": 0.6698118702312307,
"grad_norm": 0.37593939900398254,
"learning_rate": 0.0005923111888111888,
"loss": 4.2766,
"step": 2300
},
{
"epoch": 0.6843729978449531,
"grad_norm": 0.40763673186302185,
"learning_rate": 0.0005921363636363636,
"loss": 4.2411,
"step": 2350
},
{
"epoch": 0.6989341254586755,
"grad_norm": 0.37425413727760315,
"learning_rate": 0.0005919615384615384,
"loss": 4.2362,
"step": 2400
},
{
"epoch": 0.7134952530723979,
"grad_norm": 0.46256929636001587,
"learning_rate": 0.0005917867132867133,
"loss": 4.2464,
"step": 2450
},
{
"epoch": 0.7280563806861203,
"grad_norm": 0.34601518511772156,
"learning_rate": 0.0005916118881118881,
"loss": 4.2394,
"step": 2500
},
{
"epoch": 0.7426175082998427,
"grad_norm": 0.4125564694404602,
"learning_rate": 0.0005914370629370629,
"loss": 4.2238,
"step": 2550
},
{
"epoch": 0.7571786359135652,
"grad_norm": 0.3776606321334839,
"learning_rate": 0.0005912622377622377,
"loss": 4.2097,
"step": 2600
},
{
"epoch": 0.7717397635272876,
"grad_norm": 0.3779846131801605,
"learning_rate": 0.0005910874125874125,
"loss": 4.1987,
"step": 2650
},
{
"epoch": 0.78630089114101,
"grad_norm": 0.3909547030925751,
"learning_rate": 0.0005909125874125873,
"loss": 4.19,
"step": 2700
},
{
"epoch": 0.8008620187547324,
"grad_norm": 0.3437758982181549,
"learning_rate": 0.0005907377622377622,
"loss": 4.1919,
"step": 2750
},
{
"epoch": 0.8154231463684548,
"grad_norm": 0.34142592549324036,
"learning_rate": 0.000590562937062937,
"loss": 4.175,
"step": 2800
},
{
"epoch": 0.8299842739821772,
"grad_norm": 0.362221360206604,
"learning_rate": 0.0005903881118881118,
"loss": 4.1635,
"step": 2850
},
{
"epoch": 0.8445454015958996,
"grad_norm": 0.37655583024024963,
"learning_rate": 0.0005902132867132867,
"loss": 4.1565,
"step": 2900
},
{
"epoch": 0.8591065292096219,
"grad_norm": 0.3475290536880493,
"learning_rate": 0.0005900384615384615,
"loss": 4.1429,
"step": 2950
},
{
"epoch": 0.8736676568233444,
"grad_norm": 0.3560430705547333,
"learning_rate": 0.0005898636363636363,
"loss": 4.1487,
"step": 3000
},
{
"epoch": 0.8736676568233444,
"eval_accuracy": 0.31562550478444545,
"eval_loss": 4.094728946685791,
"eval_runtime": 179.6318,
"eval_samples_per_second": 92.667,
"eval_steps_per_second": 5.795,
"step": 3000
},
{
"epoch": 0.8882287844370668,
"grad_norm": 0.3338482081890106,
"learning_rate": 0.0005896888111888111,
"loss": 4.1433,
"step": 3050
},
{
"epoch": 0.9027899120507892,
"grad_norm": 0.3649440407752991,
"learning_rate": 0.000589513986013986,
"loss": 4.1306,
"step": 3100
},
{
"epoch": 0.9173510396645116,
"grad_norm": 0.361857146024704,
"learning_rate": 0.0005893391608391608,
"loss": 4.1207,
"step": 3150
},
{
"epoch": 0.931912167278234,
"grad_norm": 0.34746089577674866,
"learning_rate": 0.0005891643356643356,
"loss": 4.1244,
"step": 3200
},
{
"epoch": 0.9464732948919564,
"grad_norm": 0.344913512468338,
"learning_rate": 0.0005889895104895104,
"loss": 4.1127,
"step": 3250
},
{
"epoch": 0.9610344225056788,
"grad_norm": 0.3733556568622589,
"learning_rate": 0.0005888146853146853,
"loss": 4.1038,
"step": 3300
},
{
"epoch": 0.9755955501194012,
"grad_norm": 0.38611236214637756,
"learning_rate": 0.00058863986013986,
"loss": 4.0889,
"step": 3350
},
{
"epoch": 0.9901566777331237,
"grad_norm": 0.33691319823265076,
"learning_rate": 0.0005884650349650349,
"loss": 4.0955,
"step": 3400
},
{
"epoch": 1.004659560836391,
"grad_norm": 0.3706021010875702,
"learning_rate": 0.0005882902097902097,
"loss": 4.0726,
"step": 3450
},
{
"epoch": 1.0192206884501136,
"grad_norm": 0.3402159810066223,
"learning_rate": 0.0005881153846153845,
"loss": 4.0144,
"step": 3500
},
{
"epoch": 1.033781816063836,
"grad_norm": 0.3579849898815155,
"learning_rate": 0.0005879405594405594,
"loss": 4.0103,
"step": 3550
},
{
"epoch": 1.0483429436775584,
"grad_norm": 0.33701223134994507,
"learning_rate": 0.0005877657342657342,
"loss": 4.0348,
"step": 3600
},
{
"epoch": 1.0629040712912807,
"grad_norm": 0.341237336397171,
"learning_rate": 0.000587590909090909,
"loss": 4.0054,
"step": 3650
},
{
"epoch": 1.0774651989050033,
"grad_norm": 0.3806729018688202,
"learning_rate": 0.0005874160839160838,
"loss": 4.0021,
"step": 3700
},
{
"epoch": 1.0920263265187256,
"grad_norm": 0.33116230368614197,
"learning_rate": 0.0005872412587412587,
"loss": 4.0159,
"step": 3750
},
{
"epoch": 1.106587454132448,
"grad_norm": 0.35582491755485535,
"learning_rate": 0.0005870664335664335,
"loss": 4.0044,
"step": 3800
},
{
"epoch": 1.1211485817461704,
"grad_norm": 0.32886144518852234,
"learning_rate": 0.0005868916083916083,
"loss": 4.0174,
"step": 3850
},
{
"epoch": 1.135709709359893,
"grad_norm": 0.3607575297355652,
"learning_rate": 0.0005867167832167831,
"loss": 3.9909,
"step": 3900
},
{
"epoch": 1.1502708369736152,
"grad_norm": 0.3437859117984772,
"learning_rate": 0.000586541958041958,
"loss": 3.9831,
"step": 3950
},
{
"epoch": 1.1648319645873377,
"grad_norm": 0.3273104727268219,
"learning_rate": 0.0005863671328671328,
"loss": 3.9789,
"step": 4000
},
{
"epoch": 1.1648319645873377,
"eval_accuracy": 0.324832420381312,
"eval_loss": 3.989461660385132,
"eval_runtime": 179.6199,
"eval_samples_per_second": 92.673,
"eval_steps_per_second": 5.796,
"step": 4000
},
{
"epoch": 1.17939309220106,
"grad_norm": 0.3365177810192108,
"learning_rate": 0.0005861923076923076,
"loss": 3.9821,
"step": 4050
},
{
"epoch": 1.1939542198147826,
"grad_norm": 0.34082263708114624,
"learning_rate": 0.0005860174825174824,
"loss": 3.9917,
"step": 4100
},
{
"epoch": 1.2085153474285049,
"grad_norm": 0.34475305676460266,
"learning_rate": 0.0005858426573426573,
"loss": 3.9865,
"step": 4150
},
{
"epoch": 1.2230764750422272,
"grad_norm": 0.31244170665740967,
"learning_rate": 0.000585667832167832,
"loss": 3.9812,
"step": 4200
},
{
"epoch": 1.2376376026559497,
"grad_norm": 0.36344829201698303,
"learning_rate": 0.000585493006993007,
"loss": 3.97,
"step": 4250
},
{
"epoch": 1.2521987302696722,
"grad_norm": 0.3585837781429291,
"learning_rate": 0.0005853181818181817,
"loss": 3.9773,
"step": 4300
},
{
"epoch": 1.2667598578833945,
"grad_norm": 0.37048792839050293,
"learning_rate": 0.0005851433566433565,
"loss": 3.9808,
"step": 4350
},
{
"epoch": 1.2813209854971168,
"grad_norm": 0.32924020290374756,
"learning_rate": 0.0005849685314685315,
"loss": 3.9735,
"step": 4400
},
{
"epoch": 1.2958821131108393,
"grad_norm": 0.5044884085655212,
"learning_rate": 0.0005847937062937063,
"loss": 3.9652,
"step": 4450
},
{
"epoch": 1.3104432407245616,
"grad_norm": 0.34231820702552795,
"learning_rate": 0.0005846188811188811,
"loss": 3.9638,
"step": 4500
},
{
"epoch": 1.3250043683382842,
"grad_norm": 0.34204453229904175,
"learning_rate": 0.0005844440559440559,
"loss": 3.9566,
"step": 4550
},
{
"epoch": 1.3395654959520065,
"grad_norm": 0.3231236934661865,
"learning_rate": 0.0005842692307692308,
"loss": 3.9454,
"step": 4600
},
{
"epoch": 1.354126623565729,
"grad_norm": 0.3487296402454376,
"learning_rate": 0.0005840944055944056,
"loss": 3.9472,
"step": 4650
},
{
"epoch": 1.3686877511794513,
"grad_norm": 0.3496778905391693,
"learning_rate": 0.0005839195804195804,
"loss": 3.9488,
"step": 4700
},
{
"epoch": 1.3832488787931738,
"grad_norm": 0.3419700860977173,
"learning_rate": 0.0005837447552447552,
"loss": 3.953,
"step": 4750
},
{
"epoch": 1.3978100064068961,
"grad_norm": 0.33043453097343445,
"learning_rate": 0.0005835699300699301,
"loss": 3.9475,
"step": 4800
},
{
"epoch": 1.4123711340206184,
"grad_norm": 0.316031277179718,
"learning_rate": 0.0005833951048951048,
"loss": 3.9505,
"step": 4850
},
{
"epoch": 1.426932261634341,
"grad_norm": 0.3300357162952423,
"learning_rate": 0.0005832202797202797,
"loss": 3.9385,
"step": 4900
},
{
"epoch": 1.4414933892480635,
"grad_norm": 0.35844576358795166,
"learning_rate": 0.0005830454545454546,
"loss": 3.9406,
"step": 4950
},
{
"epoch": 1.4560545168617858,
"grad_norm": 0.32424795627593994,
"learning_rate": 0.0005828706293706293,
"loss": 3.9446,
"step": 5000
},
{
"epoch": 1.4560545168617858,
"eval_accuracy": 0.3322647284197963,
"eval_loss": 3.9135546684265137,
"eval_runtime": 183.5994,
"eval_samples_per_second": 90.665,
"eval_steps_per_second": 5.67,
"step": 5000
},
{
"epoch": 1.470615644475508,
"grad_norm": 0.33048802614212036,
"learning_rate": 0.0005826958041958042,
"loss": 3.9274,
"step": 5050
},
{
"epoch": 1.4851767720892306,
"grad_norm": 0.33716776967048645,
"learning_rate": 0.000582520979020979,
"loss": 3.9214,
"step": 5100
},
{
"epoch": 1.4997378997029531,
"grad_norm": 0.31546398997306824,
"learning_rate": 0.0005823461538461538,
"loss": 3.9127,
"step": 5150
},
{
"epoch": 1.5142990273166754,
"grad_norm": 0.3368748426437378,
"learning_rate": 0.0005821713286713286,
"loss": 3.9187,
"step": 5200
},
{
"epoch": 1.5288601549303977,
"grad_norm": 0.32343748211860657,
"learning_rate": 0.0005819965034965035,
"loss": 3.9177,
"step": 5250
},
{
"epoch": 1.5434212825441203,
"grad_norm": 0.33815833926200867,
"learning_rate": 0.0005818216783216783,
"loss": 3.9081,
"step": 5300
},
{
"epoch": 1.5579824101578428,
"grad_norm": 0.32644879817962646,
"learning_rate": 0.0005816468531468531,
"loss": 3.9079,
"step": 5350
},
{
"epoch": 1.572543537771565,
"grad_norm": 0.32149451971054077,
"learning_rate": 0.0005814720279720279,
"loss": 3.8998,
"step": 5400
},
{
"epoch": 1.5871046653852874,
"grad_norm": 0.34723344445228577,
"learning_rate": 0.0005812972027972028,
"loss": 3.9069,
"step": 5450
},
{
"epoch": 1.6016657929990097,
"grad_norm": 0.32005998492240906,
"learning_rate": 0.0005811223776223776,
"loss": 3.9124,
"step": 5500
},
{
"epoch": 1.6162269206127322,
"grad_norm": 0.31391555070877075,
"learning_rate": 0.0005809475524475524,
"loss": 3.8972,
"step": 5550
},
{
"epoch": 1.6307880482264547,
"grad_norm": 0.3457600772380829,
"learning_rate": 0.0005807727272727272,
"loss": 3.8887,
"step": 5600
},
{
"epoch": 1.645349175840177,
"grad_norm": 0.34775686264038086,
"learning_rate": 0.0005805979020979021,
"loss": 3.9066,
"step": 5650
},
{
"epoch": 1.6599103034538993,
"grad_norm": 0.3384147584438324,
"learning_rate": 0.0005804230769230769,
"loss": 3.8885,
"step": 5700
},
{
"epoch": 1.6744714310676219,
"grad_norm": 0.33265241980552673,
"learning_rate": 0.0005802482517482517,
"loss": 3.8862,
"step": 5750
},
{
"epoch": 1.6890325586813444,
"grad_norm": 0.33996856212615967,
"learning_rate": 0.0005800734265734265,
"loss": 3.8993,
"step": 5800
},
{
"epoch": 1.7035936862950667,
"grad_norm": 0.3211154043674469,
"learning_rate": 0.0005798986013986013,
"loss": 3.8839,
"step": 5850
},
{
"epoch": 1.718154813908789,
"grad_norm": 0.3389524817466736,
"learning_rate": 0.0005797237762237762,
"loss": 3.8848,
"step": 5900
},
{
"epoch": 1.7327159415225115,
"grad_norm": 0.31527891755104065,
"learning_rate": 0.000579548951048951,
"loss": 3.8743,
"step": 5950
},
{
"epoch": 1.747277069136234,
"grad_norm": 0.33459725975990295,
"learning_rate": 0.0005793741258741258,
"loss": 3.8779,
"step": 6000
},
{
"epoch": 1.747277069136234,
"eval_accuracy": 0.33703706490372914,
"eval_loss": 3.856008291244507,
"eval_runtime": 181.6053,
"eval_samples_per_second": 91.66,
"eval_steps_per_second": 5.732,
"step": 6000
},
{
"epoch": 1.7618381967499563,
"grad_norm": 0.3163743317127228,
"learning_rate": 0.0005791993006993006,
"loss": 3.8718,
"step": 6050
},
{
"epoch": 1.7763993243636786,
"grad_norm": 0.3169962167739868,
"learning_rate": 0.0005790244755244755,
"loss": 3.871,
"step": 6100
},
{
"epoch": 1.7909604519774012,
"grad_norm": 0.3243774175643921,
"learning_rate": 0.0005788496503496503,
"loss": 3.8746,
"step": 6150
},
{
"epoch": 1.8055215795911237,
"grad_norm": 0.30815380811691284,
"learning_rate": 0.0005786748251748251,
"loss": 3.8719,
"step": 6200
},
{
"epoch": 1.820082707204846,
"grad_norm": 0.355295866727829,
"learning_rate": 0.0005784999999999999,
"loss": 3.8605,
"step": 6250
},
{
"epoch": 1.8346438348185683,
"grad_norm": 0.34578466415405273,
"learning_rate": 0.0005783251748251748,
"loss": 3.8651,
"step": 6300
},
{
"epoch": 1.8492049624322906,
"grad_norm": 0.3516864776611328,
"learning_rate": 0.0005781503496503496,
"loss": 3.8615,
"step": 6350
},
{
"epoch": 1.8637660900460131,
"grad_norm": 0.3126903772354126,
"learning_rate": 0.0005779755244755244,
"loss": 3.8609,
"step": 6400
},
{
"epoch": 1.8783272176597356,
"grad_norm": 0.31201526522636414,
"learning_rate": 0.0005778006993006993,
"loss": 3.8545,
"step": 6450
},
{
"epoch": 1.892888345273458,
"grad_norm": 0.33601483702659607,
"learning_rate": 0.000577625874125874,
"loss": 3.8421,
"step": 6500
},
{
"epoch": 1.9074494728871803,
"grad_norm": 0.3365996479988098,
"learning_rate": 0.0005774510489510489,
"loss": 3.851,
"step": 6550
},
{
"epoch": 1.9220106005009028,
"grad_norm": 0.30747294425964355,
"learning_rate": 0.0005772762237762237,
"loss": 3.8576,
"step": 6600
},
{
"epoch": 1.9365717281146253,
"grad_norm": 0.3154364228248596,
"learning_rate": 0.0005771013986013985,
"loss": 3.8576,
"step": 6650
},
{
"epoch": 1.9511328557283476,
"grad_norm": 0.32481271028518677,
"learning_rate": 0.0005769265734265733,
"loss": 3.8464,
"step": 6700
},
{
"epoch": 1.96569398334207,
"grad_norm": 0.3115486204624176,
"learning_rate": 0.0005767517482517482,
"loss": 3.8582,
"step": 6750
},
{
"epoch": 1.9802551109557924,
"grad_norm": 0.33827632665634155,
"learning_rate": 0.000576576923076923,
"loss": 3.8423,
"step": 6800
},
{
"epoch": 1.994816238569515,
"grad_norm": 0.31877878308296204,
"learning_rate": 0.0005764020979020978,
"loss": 3.8472,
"step": 6850
},
{
"epoch": 2.009319121672782,
"grad_norm": 0.31373608112335205,
"learning_rate": 0.0005762272727272726,
"loss": 3.7795,
"step": 6900
},
{
"epoch": 2.023880249286505,
"grad_norm": 0.3324246108531952,
"learning_rate": 0.0005760524475524475,
"loss": 3.7456,
"step": 6950
},
{
"epoch": 2.038441376900227,
"grad_norm": 0.33372506499290466,
"learning_rate": 0.0005758776223776223,
"loss": 3.7467,
"step": 7000
},
{
"epoch": 2.038441376900227,
"eval_accuracy": 0.3411688027400552,
"eval_loss": 3.816201686859131,
"eval_runtime": 179.7567,
"eval_samples_per_second": 92.603,
"eval_steps_per_second": 5.791,
"step": 7000
},
{
"epoch": 2.0530025045139495,
"grad_norm": 0.32919570803642273,
"learning_rate": 0.0005757027972027971,
"loss": 3.7547,
"step": 7050
},
{
"epoch": 2.067563632127672,
"grad_norm": 0.35541394352912903,
"learning_rate": 0.000575527972027972,
"loss": 3.7378,
"step": 7100
},
{
"epoch": 2.0821247597413945,
"grad_norm": 0.3367560803890228,
"learning_rate": 0.0005753531468531468,
"loss": 3.7486,
"step": 7150
},
{
"epoch": 2.096685887355117,
"grad_norm": 0.317826509475708,
"learning_rate": 0.0005751783216783216,
"loss": 3.7494,
"step": 7200
},
{
"epoch": 2.111247014968839,
"grad_norm": 0.31629154086112976,
"learning_rate": 0.0005750034965034964,
"loss": 3.7552,
"step": 7250
},
{
"epoch": 2.1258081425825615,
"grad_norm": 0.33633357286453247,
"learning_rate": 0.0005748286713286712,
"loss": 3.7447,
"step": 7300
},
{
"epoch": 2.140369270196284,
"grad_norm": 0.3292492628097534,
"learning_rate": 0.000574653846153846,
"loss": 3.746,
"step": 7350
},
{
"epoch": 2.1549303978100065,
"grad_norm": 0.3265497088432312,
"learning_rate": 0.000574479020979021,
"loss": 3.7502,
"step": 7400
},
{
"epoch": 2.169491525423729,
"grad_norm": 0.3585764467716217,
"learning_rate": 0.0005743041958041958,
"loss": 3.765,
"step": 7450
},
{
"epoch": 2.184052653037451,
"grad_norm": 0.3560110926628113,
"learning_rate": 0.0005741293706293706,
"loss": 3.7482,
"step": 7500
},
{
"epoch": 2.198613780651174,
"grad_norm": 0.3085196018218994,
"learning_rate": 0.0005739545454545454,
"loss": 3.7441,
"step": 7550
},
{
"epoch": 2.213174908264896,
"grad_norm": 0.30168864130973816,
"learning_rate": 0.0005737797202797203,
"loss": 3.7617,
"step": 7600
},
{
"epoch": 2.2277360358786185,
"grad_norm": 0.33070042729377747,
"learning_rate": 0.0005736048951048951,
"loss": 3.7502,
"step": 7650
},
{
"epoch": 2.2422971634923408,
"grad_norm": 0.3281424045562744,
"learning_rate": 0.0005734300699300699,
"loss": 3.7585,
"step": 7700
},
{
"epoch": 2.256858291106063,
"grad_norm": 0.311362087726593,
"learning_rate": 0.0005732552447552448,
"loss": 3.7633,
"step": 7750
},
{
"epoch": 2.271419418719786,
"grad_norm": 0.34092429280281067,
"learning_rate": 0.0005730804195804196,
"loss": 3.758,
"step": 7800
},
{
"epoch": 2.285980546333508,
"grad_norm": 0.3130812346935272,
"learning_rate": 0.0005729055944055944,
"loss": 3.7595,
"step": 7850
},
{
"epoch": 2.3005416739472304,
"grad_norm": 0.3428857922554016,
"learning_rate": 0.0005727307692307692,
"loss": 3.7469,
"step": 7900
},
{
"epoch": 2.3151028015609527,
"grad_norm": 0.3283872604370117,
"learning_rate": 0.0005725559440559441,
"loss": 3.7548,
"step": 7950
},
{
"epoch": 2.3296639291746755,
"grad_norm": 0.3140428066253662,
"learning_rate": 0.0005723811188811188,
"loss": 3.7515,
"step": 8000
},
{
"epoch": 2.3296639291746755,
"eval_accuracy": 0.34455049114130487,
"eval_loss": 3.783414125442505,
"eval_runtime": 180.5023,
"eval_samples_per_second": 92.22,
"eval_steps_per_second": 5.767,
"step": 8000
},
{
"epoch": 2.3442250567883978,
"grad_norm": 0.32429584860801697,
"learning_rate": 0.0005722062937062937,
"loss": 3.7638,
"step": 8050
},
{
"epoch": 2.35878618440212,
"grad_norm": 0.3218806982040405,
"learning_rate": 0.0005720314685314685,
"loss": 3.7444,
"step": 8100
},
{
"epoch": 2.3733473120158424,
"grad_norm": 0.33977949619293213,
"learning_rate": 0.0005718566433566433,
"loss": 3.7523,
"step": 8150
},
{
"epoch": 2.387908439629565,
"grad_norm": 0.3360040783882141,
"learning_rate": 0.0005716818181818181,
"loss": 3.7403,
"step": 8200
},
{
"epoch": 2.4024695672432874,
"grad_norm": 0.3237862288951874,
"learning_rate": 0.000571506993006993,
"loss": 3.7531,
"step": 8250
},
{
"epoch": 2.4170306948570097,
"grad_norm": 0.36249440908432007,
"learning_rate": 0.0005713321678321678,
"loss": 3.7537,
"step": 8300
},
{
"epoch": 2.431591822470732,
"grad_norm": 0.31824633479118347,
"learning_rate": 0.0005711573426573426,
"loss": 3.7521,
"step": 8350
},
{
"epoch": 2.4461529500844543,
"grad_norm": 0.31667229533195496,
"learning_rate": 0.0005709825174825175,
"loss": 3.7455,
"step": 8400
},
{
"epoch": 2.460714077698177,
"grad_norm": 0.3123161196708679,
"learning_rate": 0.0005708076923076923,
"loss": 3.7387,
"step": 8450
},
{
"epoch": 2.4752752053118994,
"grad_norm": 0.3177723288536072,
"learning_rate": 0.0005706328671328671,
"loss": 3.7467,
"step": 8500
},
{
"epoch": 2.4898363329256217,
"grad_norm": 0.3312056064605713,
"learning_rate": 0.0005704580419580419,
"loss": 3.7488,
"step": 8550
},
{
"epoch": 2.5043974605393444,
"grad_norm": 0.30859455466270447,
"learning_rate": 0.0005702832167832168,
"loss": 3.7394,
"step": 8600
},
{
"epoch": 2.5189585881530667,
"grad_norm": 0.30974218249320984,
"learning_rate": 0.0005701083916083916,
"loss": 3.7423,
"step": 8650
},
{
"epoch": 2.533519715766789,
"grad_norm": 0.3171643614768982,
"learning_rate": 0.0005699335664335664,
"loss": 3.7398,
"step": 8700
},
{
"epoch": 2.5480808433805113,
"grad_norm": 0.3243333101272583,
"learning_rate": 0.0005697587412587412,
"loss": 3.7445,
"step": 8750
},
{
"epoch": 2.5626419709942336,
"grad_norm": 0.3245783746242523,
"learning_rate": 0.000569583916083916,
"loss": 3.7514,
"step": 8800
},
{
"epoch": 2.5772030986079564,
"grad_norm": 0.31804442405700684,
"learning_rate": 0.0005694090909090908,
"loss": 3.7359,
"step": 8850
},
{
"epoch": 2.5917642262216787,
"grad_norm": 0.3267365097999573,
"learning_rate": 0.0005692342657342657,
"loss": 3.7395,
"step": 8900
},
{
"epoch": 2.606325353835401,
"grad_norm": 0.29884088039398193,
"learning_rate": 0.0005690594405594405,
"loss": 3.7334,
"step": 8950
},
{
"epoch": 2.6208864814491233,
"grad_norm": 0.3067515790462494,
"learning_rate": 0.0005688846153846153,
"loss": 3.7465,
"step": 9000
},
{
"epoch": 2.6208864814491233,
"eval_accuracy": 0.3470682119409281,
"eval_loss": 3.756100654602051,
"eval_runtime": 180.4789,
"eval_samples_per_second": 92.232,
"eval_steps_per_second": 5.768,
"step": 9000
},
{
"epoch": 2.6354476090628456,
"grad_norm": 0.33908265829086304,
"learning_rate": 0.0005687097902097901,
"loss": 3.7438,
"step": 9050
},
{
"epoch": 2.6500087366765683,
"grad_norm": 0.3071008026599884,
"learning_rate": 0.000568534965034965,
"loss": 3.7439,
"step": 9100
},
{
"epoch": 2.6645698642902906,
"grad_norm": 0.3263190686702728,
"learning_rate": 0.0005683601398601398,
"loss": 3.7297,
"step": 9150
},
{
"epoch": 2.679130991904013,
"grad_norm": 0.3221532702445984,
"learning_rate": 0.0005681853146853146,
"loss": 3.715,
"step": 9200
},
{
"epoch": 2.6936921195177357,
"grad_norm": 0.3336532711982727,
"learning_rate": 0.0005680104895104895,
"loss": 3.7365,
"step": 9250
},
{
"epoch": 2.708253247131458,
"grad_norm": 0.32632067799568176,
"learning_rate": 0.0005678356643356643,
"loss": 3.7242,
"step": 9300
},
{
"epoch": 2.7228143747451803,
"grad_norm": 0.3240657150745392,
"learning_rate": 0.0005676608391608391,
"loss": 3.7353,
"step": 9350
},
{
"epoch": 2.7373755023589026,
"grad_norm": 0.3053086996078491,
"learning_rate": 0.0005674860139860139,
"loss": 3.7285,
"step": 9400
},
{
"epoch": 2.751936629972625,
"grad_norm": 0.33227983117103577,
"learning_rate": 0.0005673111888111888,
"loss": 3.7225,
"step": 9450
},
{
"epoch": 2.7664977575863476,
"grad_norm": 0.3048139810562134,
"learning_rate": 0.0005671363636363635,
"loss": 3.7369,
"step": 9500
},
{
"epoch": 2.78105888520007,
"grad_norm": 0.3088410496711731,
"learning_rate": 0.0005669615384615384,
"loss": 3.7381,
"step": 9550
},
{
"epoch": 2.7956200128137922,
"grad_norm": 0.311073362827301,
"learning_rate": 0.0005667867132867132,
"loss": 3.7295,
"step": 9600
},
{
"epoch": 2.8101811404275145,
"grad_norm": 0.3153594136238098,
"learning_rate": 0.000566611888111888,
"loss": 3.7254,
"step": 9650
},
{
"epoch": 2.824742268041237,
"grad_norm": 0.31726452708244324,
"learning_rate": 0.0005664370629370628,
"loss": 3.7383,
"step": 9700
},
{
"epoch": 2.8393033956549596,
"grad_norm": 0.35246723890304565,
"learning_rate": 0.0005662622377622377,
"loss": 3.7415,
"step": 9750
},
{
"epoch": 2.853864523268682,
"grad_norm": 0.34164196252822876,
"learning_rate": 0.0005660874125874125,
"loss": 3.73,
"step": 9800
},
{
"epoch": 2.868425650882404,
"grad_norm": 0.34838706254959106,
"learning_rate": 0.0005659125874125873,
"loss": 3.7303,
"step": 9850
},
{
"epoch": 2.882986778496127,
"grad_norm": 0.338086873292923,
"learning_rate": 0.0005657377622377622,
"loss": 3.7274,
"step": 9900
},
{
"epoch": 2.8975479061098492,
"grad_norm": 0.3079865872859955,
"learning_rate": 0.000565562937062937,
"loss": 3.7255,
"step": 9950
},
{
"epoch": 2.9121090337235715,
"grad_norm": 0.3039040267467499,
"learning_rate": 0.0005653881118881118,
"loss": 3.7206,
"step": 10000
},
{
"epoch": 2.9121090337235715,
"eval_accuracy": 0.3496147355793591,
"eval_loss": 3.727715253829956,
"eval_runtime": 179.8629,
"eval_samples_per_second": 92.548,
"eval_steps_per_second": 5.788,
"step": 10000
},
{
"epoch": 2.926670161337294,
"grad_norm": 0.32474496960639954,
"learning_rate": 0.0005652132867132866,
"loss": 3.7148,
"step": 10050
},
{
"epoch": 2.941231288951016,
"grad_norm": 0.33152255415916443,
"learning_rate": 0.0005650384615384615,
"loss": 3.7307,
"step": 10100
},
{
"epoch": 2.955792416564739,
"grad_norm": 0.3443283438682556,
"learning_rate": 0.0005648636363636363,
"loss": 3.7031,
"step": 10150
},
{
"epoch": 2.970353544178461,
"grad_norm": 0.31178221106529236,
"learning_rate": 0.0005646888111888111,
"loss": 3.7225,
"step": 10200
},
{
"epoch": 2.9849146717921835,
"grad_norm": 0.2839028537273407,
"learning_rate": 0.000564513986013986,
"loss": 3.711,
"step": 10250
},
{
"epoch": 2.9994757994059063,
"grad_norm": 0.3071160912513733,
"learning_rate": 0.0005643391608391607,
"loss": 3.7179,
"step": 10300
},
{
"epoch": 3.0139786825091734,
"grad_norm": 0.2976529002189636,
"learning_rate": 0.0005641643356643355,
"loss": 3.6143,
"step": 10350
},
{
"epoch": 3.0285398101228957,
"grad_norm": 0.3535459339618683,
"learning_rate": 0.0005639895104895105,
"loss": 3.6151,
"step": 10400
},
{
"epoch": 3.0431009377366185,
"grad_norm": 0.3356330394744873,
"learning_rate": 0.0005638146853146853,
"loss": 3.6181,
"step": 10450
},
{
"epoch": 3.057662065350341,
"grad_norm": 0.3338814973831177,
"learning_rate": 0.0005636398601398601,
"loss": 3.6144,
"step": 10500
},
{
"epoch": 3.072223192964063,
"grad_norm": 0.30894315242767334,
"learning_rate": 0.000563465034965035,
"loss": 3.6221,
"step": 10550
},
{
"epoch": 3.0867843205777854,
"grad_norm": 0.3283049166202545,
"learning_rate": 0.0005632902097902098,
"loss": 3.6405,
"step": 10600
},
{
"epoch": 3.101345448191508,
"grad_norm": 0.31288209557533264,
"learning_rate": 0.0005631153846153846,
"loss": 3.6257,
"step": 10650
},
{
"epoch": 3.1159065758052304,
"grad_norm": 0.32523486018180847,
"learning_rate": 0.0005629405594405594,
"loss": 3.6256,
"step": 10700
},
{
"epoch": 3.1304677034189528,
"grad_norm": 0.32469362020492554,
"learning_rate": 0.0005627657342657343,
"loss": 3.6281,
"step": 10750
},
{
"epoch": 3.145028831032675,
"grad_norm": 0.3065453767776489,
"learning_rate": 0.0005625909090909091,
"loss": 3.6223,
"step": 10800
},
{
"epoch": 3.1595899586463974,
"grad_norm": 0.3205210864543915,
"learning_rate": 0.0005624160839160839,
"loss": 3.6287,
"step": 10850
},
{
"epoch": 3.17415108626012,
"grad_norm": 0.3646294176578522,
"learning_rate": 0.0005622412587412587,
"loss": 3.6388,
"step": 10900
},
{
"epoch": 3.1887122138738424,
"grad_norm": 0.31701645255088806,
"learning_rate": 0.0005620664335664336,
"loss": 3.6382,
"step": 10950
},
{
"epoch": 3.2032733414875647,
"grad_norm": 0.3214119076728821,
"learning_rate": 0.0005618916083916083,
"loss": 3.6411,
"step": 11000
},
{
"epoch": 3.2032733414875647,
"eval_accuracy": 0.35119712827467703,
"eval_loss": 3.7161929607391357,
"eval_runtime": 179.6418,
"eval_samples_per_second": 92.662,
"eval_steps_per_second": 5.795,
"step": 11000
},
{
"epoch": 3.217834469101287,
"grad_norm": 0.3275344967842102,
"learning_rate": 0.0005617167832167832,
"loss": 3.6293,
"step": 11050
},
{
"epoch": 3.2323955967150098,
"grad_norm": 0.3210190534591675,
"learning_rate": 0.000561541958041958,
"loss": 3.6476,
"step": 11100
},
{
"epoch": 3.246956724328732,
"grad_norm": 0.3466476798057556,
"learning_rate": 0.0005613671328671328,
"loss": 3.6433,
"step": 11150
},
{
"epoch": 3.2615178519424544,
"grad_norm": 0.3047979176044464,
"learning_rate": 0.0005611923076923077,
"loss": 3.6438,
"step": 11200
},
{
"epoch": 3.2760789795561767,
"grad_norm": 0.3035935163497925,
"learning_rate": 0.0005610174825174825,
"loss": 3.6351,
"step": 11250
},
{
"epoch": 3.2906401071698994,
"grad_norm": 0.3299195170402527,
"learning_rate": 0.0005608426573426573,
"loss": 3.6261,
"step": 11300
},
{
"epoch": 3.3052012347836217,
"grad_norm": 0.3068985044956207,
"learning_rate": 0.0005606678321678321,
"loss": 3.646,
"step": 11350
},
{
"epoch": 3.319762362397344,
"grad_norm": 0.31294307112693787,
"learning_rate": 0.000560493006993007,
"loss": 3.6391,
"step": 11400
},
{
"epoch": 3.3343234900110663,
"grad_norm": 0.3207429051399231,
"learning_rate": 0.0005603181818181818,
"loss": 3.6352,
"step": 11450
},
{
"epoch": 3.3488846176247886,
"grad_norm": 0.3064245581626892,
"learning_rate": 0.0005601433566433566,
"loss": 3.6414,
"step": 11500
},
{
"epoch": 3.3634457452385114,
"grad_norm": 0.32791194319725037,
"learning_rate": 0.0005599685314685314,
"loss": 3.6523,
"step": 11550
},
{
"epoch": 3.3780068728522337,
"grad_norm": 0.318341463804245,
"learning_rate": 0.0005597937062937063,
"loss": 3.6428,
"step": 11600
},
{
"epoch": 3.392568000465956,
"grad_norm": 0.30965712666511536,
"learning_rate": 0.0005596188811188811,
"loss": 3.6303,
"step": 11650
},
{
"epoch": 3.4071291280796787,
"grad_norm": 0.3249650001525879,
"learning_rate": 0.0005594440559440559,
"loss": 3.6413,
"step": 11700
},
{
"epoch": 3.421690255693401,
"grad_norm": 0.33683815598487854,
"learning_rate": 0.0005592692307692307,
"loss": 3.6245,
"step": 11750
},
{
"epoch": 3.4362513833071233,
"grad_norm": 0.32494112849235535,
"learning_rate": 0.0005590944055944055,
"loss": 3.6388,
"step": 11800
},
{
"epoch": 3.4508125109208456,
"grad_norm": 0.3111244738101959,
"learning_rate": 0.0005589195804195803,
"loss": 3.6472,
"step": 11850
},
{
"epoch": 3.465373638534568,
"grad_norm": 0.3108598589897156,
"learning_rate": 0.0005587447552447552,
"loss": 3.6419,
"step": 11900
},
{
"epoch": 3.4799347661482907,
"grad_norm": 0.32381555438041687,
"learning_rate": 0.00055856993006993,
"loss": 3.6358,
"step": 11950
},
{
"epoch": 3.494495893762013,
"grad_norm": 0.334005743265152,
"learning_rate": 0.0005583951048951048,
"loss": 3.6479,
"step": 12000
},
{
"epoch": 3.494495893762013,
"eval_accuracy": 0.3532355463240171,
"eval_loss": 3.6959714889526367,
"eval_runtime": 179.7113,
"eval_samples_per_second": 92.626,
"eval_steps_per_second": 5.793,
"step": 12000
},
{
"epoch": 3.5090570213757353,
"grad_norm": 0.3108784258365631,
"learning_rate": 0.0005582202797202797,
"loss": 3.6374,
"step": 12050
},
{
"epoch": 3.523618148989458,
"grad_norm": 0.31018486618995667,
"learning_rate": 0.0005580454545454545,
"loss": 3.6339,
"step": 12100
},
{
"epoch": 3.53817927660318,
"grad_norm": 0.3244931697845459,
"learning_rate": 0.0005578706293706293,
"loss": 3.6435,
"step": 12150
},
{
"epoch": 3.5527404042169026,
"grad_norm": 0.3162214457988739,
"learning_rate": 0.0005576958041958041,
"loss": 3.6443,
"step": 12200
},
{
"epoch": 3.567301531830625,
"grad_norm": 0.3021166920661926,
"learning_rate": 0.000557520979020979,
"loss": 3.653,
"step": 12250
},
{
"epoch": 3.5818626594443472,
"grad_norm": 0.31752142310142517,
"learning_rate": 0.0005573461538461538,
"loss": 3.6493,
"step": 12300
},
{
"epoch": 3.59642378705807,
"grad_norm": 0.3096977174282074,
"learning_rate": 0.0005571713286713286,
"loss": 3.647,
"step": 12350
},
{
"epoch": 3.6109849146717923,
"grad_norm": 0.33170512318611145,
"learning_rate": 0.0005569965034965034,
"loss": 3.641,
"step": 12400
},
{
"epoch": 3.6255460422855146,
"grad_norm": 0.324098140001297,
"learning_rate": 0.0005568216783216783,
"loss": 3.6424,
"step": 12450
},
{
"epoch": 3.640107169899237,
"grad_norm": 0.30960825085639954,
"learning_rate": 0.000556646853146853,
"loss": 3.6419,
"step": 12500
},
{
"epoch": 3.654668297512959,
"grad_norm": 0.3171733319759369,
"learning_rate": 0.0005564720279720279,
"loss": 3.6432,
"step": 12550
},
{
"epoch": 3.669229425126682,
"grad_norm": 0.3451572358608246,
"learning_rate": 0.0005562972027972027,
"loss": 3.639,
"step": 12600
},
{
"epoch": 3.6837905527404042,
"grad_norm": 0.32623323798179626,
"learning_rate": 0.0005561223776223775,
"loss": 3.652,
"step": 12650
},
{
"epoch": 3.6983516803541265,
"grad_norm": 0.3222995400428772,
"learning_rate": 0.0005559475524475524,
"loss": 3.6358,
"step": 12700
},
{
"epoch": 3.7129128079678493,
"grad_norm": 0.31685107946395874,
"learning_rate": 0.0005557727272727272,
"loss": 3.6335,
"step": 12750
},
{
"epoch": 3.7274739355815716,
"grad_norm": 0.296373188495636,
"learning_rate": 0.000555597902097902,
"loss": 3.6436,
"step": 12800
},
{
"epoch": 3.742035063195294,
"grad_norm": 0.3100621700286865,
"learning_rate": 0.0005554230769230768,
"loss": 3.627,
"step": 12850
},
{
"epoch": 3.756596190809016,
"grad_norm": 0.315127432346344,
"learning_rate": 0.0005552482517482517,
"loss": 3.6297,
"step": 12900
},
{
"epoch": 3.7711573184227385,
"grad_norm": 0.3003104329109192,
"learning_rate": 0.0005550734265734265,
"loss": 3.642,
"step": 12950
},
{
"epoch": 3.7857184460364612,
"grad_norm": 0.31278714537620544,
"learning_rate": 0.0005548986013986013,
"loss": 3.6408,
"step": 13000
},
{
"epoch": 3.7857184460364612,
"eval_accuracy": 0.3544306877906295,
"eval_loss": 3.6803815364837646,
"eval_runtime": 179.6757,
"eval_samples_per_second": 92.645,
"eval_steps_per_second": 5.794,
"step": 13000
},
{
"epoch": 3.8002795736501835,
"grad_norm": 0.30766820907592773,
"learning_rate": 0.0005547237762237761,
"loss": 3.6428,
"step": 13050
},
{
"epoch": 3.814840701263906,
"grad_norm": 0.31651240587234497,
"learning_rate": 0.000554548951048951,
"loss": 3.6435,
"step": 13100
},
{
"epoch": 3.829401828877628,
"grad_norm": 0.3394828736782074,
"learning_rate": 0.0005543741258741258,
"loss": 3.6463,
"step": 13150
},
{
"epoch": 3.8439629564913504,
"grad_norm": 0.32541000843048096,
"learning_rate": 0.0005541993006993006,
"loss": 3.64,
"step": 13200
},
{
"epoch": 3.858524084105073,
"grad_norm": 0.32382452487945557,
"learning_rate": 0.0005540244755244756,
"loss": 3.6398,
"step": 13250
},
{
"epoch": 3.8730852117187955,
"grad_norm": 0.3062750995159149,
"learning_rate": 0.0005538496503496502,
"loss": 3.6359,
"step": 13300
},
{
"epoch": 3.887646339332518,
"grad_norm": 0.3173046112060547,
"learning_rate": 0.0005536748251748252,
"loss": 3.6353,
"step": 13350
},
{
"epoch": 3.9022074669462405,
"grad_norm": 0.30907636880874634,
"learning_rate": 0.0005535,
"loss": 3.6324,
"step": 13400
},
{
"epoch": 3.916768594559963,
"grad_norm": 0.3197796642780304,
"learning_rate": 0.0005533251748251748,
"loss": 3.6239,
"step": 13450
},
{
"epoch": 3.931329722173685,
"grad_norm": 0.31007564067840576,
"learning_rate": 0.0005531503496503496,
"loss": 3.6527,
"step": 13500
},
{
"epoch": 3.9458908497874075,
"grad_norm": 0.3278939723968506,
"learning_rate": 0.0005529755244755245,
"loss": 3.6205,
"step": 13550
},
{
"epoch": 3.9604519774011298,
"grad_norm": 0.3057309091091156,
"learning_rate": 0.0005528006993006993,
"loss": 3.64,
"step": 13600
},
{
"epoch": 3.9750131050148525,
"grad_norm": 0.32244446873664856,
"learning_rate": 0.0005526258741258741,
"loss": 3.6376,
"step": 13650
},
{
"epoch": 3.989574232628575,
"grad_norm": 0.3175927400588989,
"learning_rate": 0.0005524510489510489,
"loss": 3.6282,
"step": 13700
},
{
"epoch": 4.004077115731842,
"grad_norm": 0.3246242105960846,
"learning_rate": 0.0005522762237762238,
"loss": 3.6123,
"step": 13750
},
{
"epoch": 4.018638243345564,
"grad_norm": 0.3010479211807251,
"learning_rate": 0.0005521013986013986,
"loss": 3.5191,
"step": 13800
},
{
"epoch": 4.033199370959287,
"grad_norm": 0.30843934416770935,
"learning_rate": 0.0005519265734265734,
"loss": 3.5335,
"step": 13850
},
{
"epoch": 4.04776049857301,
"grad_norm": 0.3538695275783539,
"learning_rate": 0.0005517517482517482,
"loss": 3.5185,
"step": 13900
},
{
"epoch": 4.062321626186732,
"grad_norm": 0.3226032257080078,
"learning_rate": 0.0005515769230769231,
"loss": 3.5432,
"step": 13950
},
{
"epoch": 4.076882753800454,
"grad_norm": 0.32624679803848267,
"learning_rate": 0.0005514020979020979,
"loss": 3.5399,
"step": 14000
},
{
"epoch": 4.076882753800454,
"eval_accuracy": 0.3558751795474921,
"eval_loss": 3.6733622550964355,
"eval_runtime": 179.6598,
"eval_samples_per_second": 92.653,
"eval_steps_per_second": 5.794,
"step": 14000
},
{
"epoch": 4.091443881414177,
"grad_norm": 0.3052377998828888,
"learning_rate": 0.0005512272727272727,
"loss": 3.5443,
"step": 14050
},
{
"epoch": 4.106005009027899,
"grad_norm": 0.32962766289711,
"learning_rate": 0.0005510524475524475,
"loss": 3.5366,
"step": 14100
},
{
"epoch": 4.120566136641622,
"grad_norm": 0.31939440965652466,
"learning_rate": 0.0005508776223776223,
"loss": 3.5529,
"step": 14150
},
{
"epoch": 4.135127264255344,
"grad_norm": 0.33249884843826294,
"learning_rate": 0.0005507027972027972,
"loss": 3.5555,
"step": 14200
},
{
"epoch": 4.149688391869066,
"grad_norm": 0.32969000935554504,
"learning_rate": 0.000550527972027972,
"loss": 3.5595,
"step": 14250
},
{
"epoch": 4.164249519482789,
"grad_norm": 0.31157979369163513,
"learning_rate": 0.0005503531468531468,
"loss": 3.5487,
"step": 14300
},
{
"epoch": 4.178810647096511,
"grad_norm": 0.31714004278182983,
"learning_rate": 0.0005501783216783216,
"loss": 3.5475,
"step": 14350
},
{
"epoch": 4.193371774710234,
"grad_norm": 0.31385311484336853,
"learning_rate": 0.0005500034965034965,
"loss": 3.5714,
"step": 14400
},
{
"epoch": 4.207932902323956,
"grad_norm": 0.318708598613739,
"learning_rate": 0.0005498286713286713,
"loss": 3.5619,
"step": 14450
},
{
"epoch": 4.222494029937678,
"grad_norm": 0.3174968957901001,
"learning_rate": 0.0005496538461538461,
"loss": 3.5536,
"step": 14500
},
{
"epoch": 4.237055157551401,
"grad_norm": 0.31125175952911377,
"learning_rate": 0.0005494790209790209,
"loss": 3.5567,
"step": 14550
},
{
"epoch": 4.251616285165123,
"grad_norm": 0.3577822744846344,
"learning_rate": 0.0005493041958041958,
"loss": 3.5653,
"step": 14600
},
{
"epoch": 4.266177412778846,
"grad_norm": 0.3287855386734009,
"learning_rate": 0.0005491293706293706,
"loss": 3.5722,
"step": 14650
},
{
"epoch": 4.280738540392568,
"grad_norm": 0.3440224528312683,
"learning_rate": 0.0005489545454545454,
"loss": 3.5661,
"step": 14700
},
{
"epoch": 4.29529966800629,
"grad_norm": 0.320300430059433,
"learning_rate": 0.0005487797202797203,
"loss": 3.5702,
"step": 14750
},
{
"epoch": 4.309860795620013,
"grad_norm": 0.31104516983032227,
"learning_rate": 0.000548604895104895,
"loss": 3.574,
"step": 14800
},
{
"epoch": 4.324421923233735,
"grad_norm": 0.31709253787994385,
"learning_rate": 0.0005484300699300699,
"loss": 3.5653,
"step": 14850
},
{
"epoch": 4.338983050847458,
"grad_norm": 0.32527729868888855,
"learning_rate": 0.0005482552447552447,
"loss": 3.5626,
"step": 14900
},
{
"epoch": 4.35354417846118,
"grad_norm": 0.3272615671157837,
"learning_rate": 0.0005480804195804195,
"loss": 3.5621,
"step": 14950
},
{
"epoch": 4.368105306074902,
"grad_norm": 0.31901460886001587,
"learning_rate": 0.0005479055944055943,
"loss": 3.5705,
"step": 15000
},
{
"epoch": 4.368105306074902,
"eval_accuracy": 0.3572903982151175,
"eval_loss": 3.659005641937256,
"eval_runtime": 179.666,
"eval_samples_per_second": 92.65,
"eval_steps_per_second": 5.794,
"step": 15000
},
{
"epoch": 4.382666433688625,
"grad_norm": 0.3244258463382721,
"learning_rate": 0.0005477307692307692,
"loss": 3.5795,
"step": 15050
},
{
"epoch": 4.397227561302348,
"grad_norm": 0.32407137751579285,
"learning_rate": 0.000547555944055944,
"loss": 3.583,
"step": 15100
},
{
"epoch": 4.41178868891607,
"grad_norm": 0.32865169644355774,
"learning_rate": 0.0005473811188811188,
"loss": 3.5689,
"step": 15150
},
{
"epoch": 4.426349816529792,
"grad_norm": 0.33799371123313904,
"learning_rate": 0.0005472062937062936,
"loss": 3.5609,
"step": 15200
},
{
"epoch": 4.440910944143514,
"grad_norm": 0.33456024527549744,
"learning_rate": 0.0005470314685314685,
"loss": 3.5751,
"step": 15250
},
{
"epoch": 4.455472071757237,
"grad_norm": 0.3036324977874756,
"learning_rate": 0.0005468566433566433,
"loss": 3.5639,
"step": 15300
},
{
"epoch": 4.47003319937096,
"grad_norm": 0.3154810667037964,
"learning_rate": 0.0005466818181818181,
"loss": 3.5711,
"step": 15350
},
{
"epoch": 4.4845943269846815,
"grad_norm": 0.33000850677490234,
"learning_rate": 0.000546506993006993,
"loss": 3.5648,
"step": 15400
},
{
"epoch": 4.499155454598404,
"grad_norm": 0.30540409684181213,
"learning_rate": 0.0005463321678321678,
"loss": 3.5704,
"step": 15450
},
{
"epoch": 4.513716582212126,
"grad_norm": 0.30891209840774536,
"learning_rate": 0.0005461573426573426,
"loss": 3.5838,
"step": 15500
},
{
"epoch": 4.528277709825849,
"grad_norm": 0.31514647603034973,
"learning_rate": 0.0005459825174825174,
"loss": 3.5842,
"step": 15550
},
{
"epoch": 4.542838837439572,
"grad_norm": 0.3225398361682892,
"learning_rate": 0.0005458076923076922,
"loss": 3.573,
"step": 15600
},
{
"epoch": 4.5573999650532935,
"grad_norm": 0.330959290266037,
"learning_rate": 0.000545632867132867,
"loss": 3.5666,
"step": 15650
},
{
"epoch": 4.571961092667016,
"grad_norm": 0.33557143807411194,
"learning_rate": 0.0005454580419580419,
"loss": 3.5644,
"step": 15700
},
{
"epoch": 4.586522220280738,
"grad_norm": 0.32927149534225464,
"learning_rate": 0.0005452832167832167,
"loss": 3.5739,
"step": 15750
},
{
"epoch": 4.601083347894461,
"grad_norm": 0.34957724809646606,
"learning_rate": 0.0005451083916083915,
"loss": 3.5869,
"step": 15800
},
{
"epoch": 4.615644475508184,
"grad_norm": 0.30984318256378174,
"learning_rate": 0.0005449335664335663,
"loss": 3.5735,
"step": 15850
},
{
"epoch": 4.630205603121905,
"grad_norm": 0.31692495942115784,
"learning_rate": 0.0005447587412587412,
"loss": 3.5705,
"step": 15900
},
{
"epoch": 4.644766730735628,
"grad_norm": 0.3213113248348236,
"learning_rate": 0.000544583916083916,
"loss": 3.5775,
"step": 15950
},
{
"epoch": 4.659327858349351,
"grad_norm": 0.31125399470329285,
"learning_rate": 0.0005444090909090908,
"loss": 3.5765,
"step": 16000
},
{
"epoch": 4.659327858349351,
"eval_accuracy": 0.35853856041765764,
"eval_loss": 3.6479623317718506,
"eval_runtime": 179.6168,
"eval_samples_per_second": 92.675,
"eval_steps_per_second": 5.796,
"step": 16000
},
{
"epoch": 4.673888985963073,
"grad_norm": 0.33686563372612,
"learning_rate": 0.0005442342657342657,
"loss": 3.5719,
"step": 16050
},
{
"epoch": 4.6884501135767955,
"grad_norm": 0.33824530243873596,
"learning_rate": 0.0005440594405594405,
"loss": 3.5811,
"step": 16100
},
{
"epoch": 4.703011241190518,
"grad_norm": 0.31893661618232727,
"learning_rate": 0.0005438846153846153,
"loss": 3.5637,
"step": 16150
},
{
"epoch": 4.71757236880424,
"grad_norm": 0.3194436728954315,
"learning_rate": 0.0005437097902097901,
"loss": 3.5643,
"step": 16200
},
{
"epoch": 4.732133496417963,
"grad_norm": 0.3170466721057892,
"learning_rate": 0.0005435349650349651,
"loss": 3.5724,
"step": 16250
},
{
"epoch": 4.746694624031685,
"grad_norm": 0.3126927614212036,
"learning_rate": 0.0005433601398601397,
"loss": 3.577,
"step": 16300
},
{
"epoch": 4.7612557516454075,
"grad_norm": 0.32292503118515015,
"learning_rate": 0.0005431853146853147,
"loss": 3.5742,
"step": 16350
},
{
"epoch": 4.77581687925913,
"grad_norm": 0.31847289204597473,
"learning_rate": 0.0005430104895104895,
"loss": 3.5728,
"step": 16400
},
{
"epoch": 4.790378006872852,
"grad_norm": 0.3244653642177582,
"learning_rate": 0.0005428356643356643,
"loss": 3.569,
"step": 16450
},
{
"epoch": 4.804939134486575,
"grad_norm": 0.3252134323120117,
"learning_rate": 0.0005426608391608391,
"loss": 3.578,
"step": 16500
},
{
"epoch": 4.819500262100297,
"grad_norm": 0.3335552215576172,
"learning_rate": 0.000542486013986014,
"loss": 3.5879,
"step": 16550
},
{
"epoch": 4.834061389714019,
"grad_norm": 0.32028138637542725,
"learning_rate": 0.0005423111888111888,
"loss": 3.5684,
"step": 16600
},
{
"epoch": 4.848622517327742,
"grad_norm": 0.35907861590385437,
"learning_rate": 0.0005421363636363636,
"loss": 3.5713,
"step": 16650
},
{
"epoch": 4.863183644941464,
"grad_norm": 0.3116554617881775,
"learning_rate": 0.0005419615384615385,
"loss": 3.569,
"step": 16700
},
{
"epoch": 4.877744772555187,
"grad_norm": 0.32443204522132874,
"learning_rate": 0.0005417867132867133,
"loss": 3.5691,
"step": 16750
},
{
"epoch": 4.892305900168909,
"grad_norm": 0.32515308260917664,
"learning_rate": 0.0005416118881118881,
"loss": 3.5714,
"step": 16800
},
{
"epoch": 4.906867027782631,
"grad_norm": 0.3074589669704437,
"learning_rate": 0.0005414370629370629,
"loss": 3.5773,
"step": 16850
},
{
"epoch": 4.921428155396354,
"grad_norm": 0.32288840413093567,
"learning_rate": 0.0005412622377622378,
"loss": 3.5677,
"step": 16900
},
{
"epoch": 4.935989283010076,
"grad_norm": 0.314647912979126,
"learning_rate": 0.0005410874125874126,
"loss": 3.5722,
"step": 16950
},
{
"epoch": 4.950550410623799,
"grad_norm": 0.33100083470344543,
"learning_rate": 0.0005409125874125874,
"loss": 3.5786,
"step": 17000
},
{
"epoch": 4.950550410623799,
"eval_accuracy": 0.35935820691630227,
"eval_loss": 3.6380977630615234,
"eval_runtime": 179.6744,
"eval_samples_per_second": 92.645,
"eval_steps_per_second": 5.794,
"step": 17000
},
{
"epoch": 4.9651115382375215,
"grad_norm": 0.30516111850738525,
"learning_rate": 0.0005407377622377622,
"loss": 3.5811,
"step": 17050
},
{
"epoch": 4.979672665851243,
"grad_norm": 0.32663649320602417,
"learning_rate": 0.000540562937062937,
"loss": 3.5675,
"step": 17100
},
{
"epoch": 4.994233793464966,
"grad_norm": 0.3119376599788666,
"learning_rate": 0.0005403881118881118,
"loss": 3.5709,
"step": 17150
},
{
"epoch": 5.008736676568233,
"grad_norm": 0.32153835892677307,
"learning_rate": 0.0005402132867132867,
"loss": 3.5135,
"step": 17200
},
{
"epoch": 5.023297804181956,
"grad_norm": 0.3149344325065613,
"learning_rate": 0.0005400384615384615,
"loss": 3.4688,
"step": 17250
},
{
"epoch": 5.037858931795678,
"grad_norm": 0.31834840774536133,
"learning_rate": 0.0005398636363636363,
"loss": 3.4763,
"step": 17300
},
{
"epoch": 5.052420059409401,
"grad_norm": 0.31698572635650635,
"learning_rate": 0.0005396888111888111,
"loss": 3.4748,
"step": 17350
},
{
"epoch": 5.066981187023123,
"grad_norm": 0.33012357354164124,
"learning_rate": 0.000539513986013986,
"loss": 3.4676,
"step": 17400
},
{
"epoch": 5.081542314636845,
"grad_norm": 0.3190668821334839,
"learning_rate": 0.0005393391608391608,
"loss": 3.4737,
"step": 17450
},
{
"epoch": 5.096103442250568,
"grad_norm": 0.3394056260585785,
"learning_rate": 0.0005391643356643356,
"loss": 3.4838,
"step": 17500
},
{
"epoch": 5.110664569864291,
"grad_norm": 0.33012598752975464,
"learning_rate": 0.0005389895104895105,
"loss": 3.4785,
"step": 17550
},
{
"epoch": 5.125225697478013,
"grad_norm": 0.32027074694633484,
"learning_rate": 0.0005388146853146853,
"loss": 3.4866,
"step": 17600
},
{
"epoch": 5.139786825091735,
"grad_norm": 0.34201139211654663,
"learning_rate": 0.0005386398601398601,
"loss": 3.4902,
"step": 17650
},
{
"epoch": 5.154347952705457,
"grad_norm": 0.3230908513069153,
"learning_rate": 0.0005384650349650349,
"loss": 3.4993,
"step": 17700
},
{
"epoch": 5.16890908031918,
"grad_norm": 0.34298035502433777,
"learning_rate": 0.0005382902097902098,
"loss": 3.4852,
"step": 17750
},
{
"epoch": 5.183470207932903,
"grad_norm": 0.32764288783073425,
"learning_rate": 0.0005381153846153845,
"loss": 3.492,
"step": 17800
},
{
"epoch": 5.1980313355466246,
"grad_norm": 0.3107599914073944,
"learning_rate": 0.0005379405594405594,
"loss": 3.5035,
"step": 17850
},
{
"epoch": 5.212592463160347,
"grad_norm": 0.3222023844718933,
"learning_rate": 0.0005377657342657342,
"loss": 3.5083,
"step": 17900
},
{
"epoch": 5.227153590774069,
"grad_norm": 0.328563928604126,
"learning_rate": 0.000537590909090909,
"loss": 3.5112,
"step": 17950
},
{
"epoch": 5.241714718387792,
"grad_norm": 0.31996095180511475,
"learning_rate": 0.0005374160839160838,
"loss": 3.4988,
"step": 18000
},
{
"epoch": 5.241714718387792,
"eval_accuracy": 0.3599712959137824,
"eval_loss": 3.6374216079711914,
"eval_runtime": 179.7436,
"eval_samples_per_second": 92.61,
"eval_steps_per_second": 5.792,
"step": 18000
},
{
"epoch": 5.256275846001515,
"grad_norm": 0.3201790750026703,
"learning_rate": 0.0005372412587412587,
"loss": 3.5086,
"step": 18050
},
{
"epoch": 5.2708369736152365,
"grad_norm": 0.33598074316978455,
"learning_rate": 0.0005370664335664335,
"loss": 3.5094,
"step": 18100
},
{
"epoch": 5.285398101228959,
"grad_norm": 0.3486185073852539,
"learning_rate": 0.0005368916083916083,
"loss": 3.5194,
"step": 18150
},
{
"epoch": 5.299959228842681,
"grad_norm": 0.3399152159690857,
"learning_rate": 0.0005367167832167832,
"loss": 3.5113,
"step": 18200
},
{
"epoch": 5.314520356456404,
"grad_norm": 0.3166239559650421,
"learning_rate": 0.000536541958041958,
"loss": 3.5098,
"step": 18250
},
{
"epoch": 5.329081484070127,
"grad_norm": 0.33888429403305054,
"learning_rate": 0.0005363671328671328,
"loss": 3.5138,
"step": 18300
},
{
"epoch": 5.3436426116838485,
"grad_norm": 0.3163507878780365,
"learning_rate": 0.0005361923076923076,
"loss": 3.5154,
"step": 18350
},
{
"epoch": 5.358203739297571,
"grad_norm": 0.3177511990070343,
"learning_rate": 0.0005360174825174825,
"loss": 3.5125,
"step": 18400
},
{
"epoch": 5.372764866911294,
"grad_norm": 0.30713725090026855,
"learning_rate": 0.0005358426573426573,
"loss": 3.5124,
"step": 18450
},
{
"epoch": 5.387325994525016,
"grad_norm": 0.3410167694091797,
"learning_rate": 0.0005356678321678321,
"loss": 3.5107,
"step": 18500
},
{
"epoch": 5.401887122138739,
"grad_norm": 0.32011181116104126,
"learning_rate": 0.0005354930069930069,
"loss": 3.5171,
"step": 18550
},
{
"epoch": 5.41644824975246,
"grad_norm": 0.33928465843200684,
"learning_rate": 0.0005353181818181817,
"loss": 3.5235,
"step": 18600
},
{
"epoch": 5.431009377366183,
"grad_norm": 0.3426041901111603,
"learning_rate": 0.0005351433566433565,
"loss": 3.5077,
"step": 18650
},
{
"epoch": 5.445570504979906,
"grad_norm": 0.3379696309566498,
"learning_rate": 0.0005349685314685314,
"loss": 3.515,
"step": 18700
},
{
"epoch": 5.460131632593628,
"grad_norm": 0.3430345356464386,
"learning_rate": 0.0005347937062937062,
"loss": 3.5124,
"step": 18750
},
{
"epoch": 5.4746927602073505,
"grad_norm": 0.3233330547809601,
"learning_rate": 0.000534618881118881,
"loss": 3.5201,
"step": 18800
},
{
"epoch": 5.489253887821073,
"grad_norm": 0.353807270526886,
"learning_rate": 0.0005344440559440559,
"loss": 3.5234,
"step": 18850
},
{
"epoch": 5.503815015434795,
"grad_norm": 0.3057951033115387,
"learning_rate": 0.0005342692307692307,
"loss": 3.5253,
"step": 18900
},
{
"epoch": 5.518376143048518,
"grad_norm": 0.3360867202281952,
"learning_rate": 0.0005340944055944055,
"loss": 3.5197,
"step": 18950
},
{
"epoch": 5.53293727066224,
"grad_norm": 0.31620731949806213,
"learning_rate": 0.0005339195804195803,
"loss": 3.513,
"step": 19000
},
{
"epoch": 5.53293727066224,
"eval_accuracy": 0.36111988258787275,
"eval_loss": 3.6275086402893066,
"eval_runtime": 179.6266,
"eval_samples_per_second": 92.67,
"eval_steps_per_second": 5.795,
"step": 19000
},
{
"epoch": 5.5474983982759625,
"grad_norm": 0.3172198235988617,
"learning_rate": 0.0005337447552447552,
"loss": 3.5143,
"step": 19050
},
{
"epoch": 5.562059525889685,
"grad_norm": 0.30855488777160645,
"learning_rate": 0.00053356993006993,
"loss": 3.5163,
"step": 19100
},
{
"epoch": 5.576620653503407,
"grad_norm": 0.320773720741272,
"learning_rate": 0.0005333951048951048,
"loss": 3.5108,
"step": 19150
},
{
"epoch": 5.59118178111713,
"grad_norm": 0.33892980217933655,
"learning_rate": 0.0005332202797202796,
"loss": 3.5252,
"step": 19200
},
{
"epoch": 5.605742908730852,
"grad_norm": 0.34134751558303833,
"learning_rate": 0.0005330454545454546,
"loss": 3.5172,
"step": 19250
},
{
"epoch": 5.620304036344574,
"grad_norm": 0.3078143000602722,
"learning_rate": 0.0005328706293706292,
"loss": 3.5308,
"step": 19300
},
{
"epoch": 5.634865163958297,
"grad_norm": 0.34207335114479065,
"learning_rate": 0.0005326958041958042,
"loss": 3.5397,
"step": 19350
},
{
"epoch": 5.649426291572019,
"grad_norm": 0.32775601744651794,
"learning_rate": 0.000532520979020979,
"loss": 3.5251,
"step": 19400
},
{
"epoch": 5.663987419185742,
"grad_norm": 0.30578893423080444,
"learning_rate": 0.0005323461538461538,
"loss": 3.5262,
"step": 19450
},
{
"epoch": 5.6785485467994645,
"grad_norm": 0.3056182563304901,
"learning_rate": 0.0005321713286713287,
"loss": 3.525,
"step": 19500
},
{
"epoch": 5.693109674413186,
"grad_norm": 0.35645192861557007,
"learning_rate": 0.0005319965034965035,
"loss": 3.5154,
"step": 19550
},
{
"epoch": 5.707670802026909,
"grad_norm": 0.32187339663505554,
"learning_rate": 0.0005318216783216783,
"loss": 3.5408,
"step": 19600
},
{
"epoch": 5.722231929640631,
"grad_norm": 0.33344683051109314,
"learning_rate": 0.0005316468531468531,
"loss": 3.5218,
"step": 19650
},
{
"epoch": 5.736793057254354,
"grad_norm": 0.30798256397247314,
"learning_rate": 0.000531472027972028,
"loss": 3.5235,
"step": 19700
},
{
"epoch": 5.7513541848680765,
"grad_norm": 0.34034493565559387,
"learning_rate": 0.0005312972027972028,
"loss": 3.5312,
"step": 19750
},
{
"epoch": 5.765915312481798,
"grad_norm": 0.335997611284256,
"learning_rate": 0.0005311223776223776,
"loss": 3.5245,
"step": 19800
},
{
"epoch": 5.780476440095521,
"grad_norm": 0.34608331322669983,
"learning_rate": 0.0005309475524475524,
"loss": 3.5313,
"step": 19850
},
{
"epoch": 5.795037567709244,
"grad_norm": 0.31513699889183044,
"learning_rate": 0.0005307727272727273,
"loss": 3.5271,
"step": 19900
},
{
"epoch": 5.809598695322966,
"grad_norm": 0.3229629695415497,
"learning_rate": 0.0005305979020979021,
"loss": 3.5301,
"step": 19950
},
{
"epoch": 5.824159822936688,
"grad_norm": 0.3298615515232086,
"learning_rate": 0.0005304230769230769,
"loss": 3.5213,
"step": 20000
},
{
"epoch": 5.824159822936688,
"eval_accuracy": 0.3618738115889927,
"eval_loss": 3.6149404048919678,
"eval_runtime": 179.7404,
"eval_samples_per_second": 92.611,
"eval_steps_per_second": 5.792,
"step": 20000
},
{
"epoch": 5.83872095055041,
"grad_norm": 0.33221063017845154,
"learning_rate": 0.0005302482517482517,
"loss": 3.5375,
"step": 20050
},
{
"epoch": 5.853282078164133,
"grad_norm": 0.31011486053466797,
"learning_rate": 0.0005300734265734265,
"loss": 3.5339,
"step": 20100
},
{
"epoch": 5.867843205777856,
"grad_norm": 0.31870928406715393,
"learning_rate": 0.0005298986013986013,
"loss": 3.5193,
"step": 20150
},
{
"epoch": 5.882404333391578,
"grad_norm": 0.31526702642440796,
"learning_rate": 0.0005297237762237762,
"loss": 3.526,
"step": 20200
},
{
"epoch": 5.8969654610053,
"grad_norm": 0.33031392097473145,
"learning_rate": 0.000529548951048951,
"loss": 3.5219,
"step": 20250
},
{
"epoch": 5.911526588619022,
"grad_norm": 0.29852011799812317,
"learning_rate": 0.0005293741258741258,
"loss": 3.5221,
"step": 20300
},
{
"epoch": 5.926087716232745,
"grad_norm": 0.32039979100227356,
"learning_rate": 0.0005291993006993007,
"loss": 3.5277,
"step": 20350
},
{
"epoch": 5.940648843846468,
"grad_norm": 0.3331441879272461,
"learning_rate": 0.0005290244755244755,
"loss": 3.5332,
"step": 20400
},
{
"epoch": 5.95520997146019,
"grad_norm": 0.3218717873096466,
"learning_rate": 0.0005288496503496503,
"loss": 3.5284,
"step": 20450
},
{
"epoch": 5.969771099073912,
"grad_norm": 0.322635680437088,
"learning_rate": 0.0005286748251748251,
"loss": 3.5293,
"step": 20500
},
{
"epoch": 5.984332226687634,
"grad_norm": 0.3240005075931549,
"learning_rate": 0.0005285,
"loss": 3.5176,
"step": 20550
},
{
"epoch": 5.998893354301357,
"grad_norm": 0.3016450107097626,
"learning_rate": 0.0005283251748251748,
"loss": 3.5365,
"step": 20600
},
{
"epoch": 6.013396237404625,
"grad_norm": 0.32817426323890686,
"learning_rate": 0.0005281503496503496,
"loss": 3.4283,
"step": 20650
},
{
"epoch": 6.027957365018347,
"grad_norm": 0.35756346583366394,
"learning_rate": 0.0005279755244755244,
"loss": 3.4236,
"step": 20700
},
{
"epoch": 6.04251849263207,
"grad_norm": 0.32357922196388245,
"learning_rate": 0.0005278006993006993,
"loss": 3.4285,
"step": 20750
},
{
"epoch": 6.0570796202457915,
"grad_norm": 0.3198466897010803,
"learning_rate": 0.000527625874125874,
"loss": 3.4119,
"step": 20800
},
{
"epoch": 6.071640747859514,
"grad_norm": 0.3190059959888458,
"learning_rate": 0.0005274510489510489,
"loss": 3.4369,
"step": 20850
},
{
"epoch": 6.086201875473237,
"grad_norm": 0.33216336369514465,
"learning_rate": 0.0005272762237762238,
"loss": 3.4332,
"step": 20900
},
{
"epoch": 6.100763003086959,
"grad_norm": 0.3313469886779785,
"learning_rate": 0.0005271013986013985,
"loss": 3.4451,
"step": 20950
},
{
"epoch": 6.115324130700682,
"grad_norm": 0.3283461630344391,
"learning_rate": 0.0005269265734265734,
"loss": 3.4434,
"step": 21000
},
{
"epoch": 6.115324130700682,
"eval_accuracy": 0.36220145857575725,
"eval_loss": 3.6175012588500977,
"eval_runtime": 179.6715,
"eval_samples_per_second": 92.647,
"eval_steps_per_second": 5.794,
"step": 21000
},
{
"epoch": 6.1298852583144035,
"grad_norm": 0.3373625576496124,
"learning_rate": 0.0005267517482517482,
"loss": 3.4382,
"step": 21050
},
{
"epoch": 6.144446385928126,
"grad_norm": 0.3465948700904846,
"learning_rate": 0.000526576923076923,
"loss": 3.4579,
"step": 21100
},
{
"epoch": 6.159007513541849,
"grad_norm": 0.32088997960090637,
"learning_rate": 0.0005264020979020978,
"loss": 3.4415,
"step": 21150
},
{
"epoch": 6.173568641155571,
"grad_norm": 0.34192368388175964,
"learning_rate": 0.0005262272727272727,
"loss": 3.4381,
"step": 21200
},
{
"epoch": 6.1881297687692935,
"grad_norm": 0.3320399224758148,
"learning_rate": 0.0005260524475524475,
"loss": 3.4543,
"step": 21250
},
{
"epoch": 6.202690896383016,
"grad_norm": 0.3265776038169861,
"learning_rate": 0.0005258776223776223,
"loss": 3.4382,
"step": 21300
},
{
"epoch": 6.217252023996738,
"grad_norm": 0.32715266942977905,
"learning_rate": 0.0005257027972027971,
"loss": 3.4466,
"step": 21350
},
{
"epoch": 6.231813151610461,
"grad_norm": 0.342515766620636,
"learning_rate": 0.000525527972027972,
"loss": 3.4544,
"step": 21400
},
{
"epoch": 6.246374279224183,
"grad_norm": 0.32371506094932556,
"learning_rate": 0.0005253531468531468,
"loss": 3.4531,
"step": 21450
},
{
"epoch": 6.2609354068379055,
"grad_norm": 0.3336334228515625,
"learning_rate": 0.0005251783216783216,
"loss": 3.448,
"step": 21500
},
{
"epoch": 6.275496534451628,
"grad_norm": 0.3309522867202759,
"learning_rate": 0.0005250034965034965,
"loss": 3.4578,
"step": 21550
},
{
"epoch": 6.29005766206535,
"grad_norm": 0.3392053544521332,
"learning_rate": 0.0005248286713286712,
"loss": 3.4488,
"step": 21600
},
{
"epoch": 6.304618789679073,
"grad_norm": 0.319116473197937,
"learning_rate": 0.0005246538461538461,
"loss": 3.4704,
"step": 21650
},
{
"epoch": 6.319179917292795,
"grad_norm": 0.34017646312713623,
"learning_rate": 0.0005244790209790209,
"loss": 3.4707,
"step": 21700
},
{
"epoch": 6.3337410449065175,
"grad_norm": 0.3570762872695923,
"learning_rate": 0.0005243041958041957,
"loss": 3.4818,
"step": 21750
},
{
"epoch": 6.34830217252024,
"grad_norm": 0.32639503479003906,
"learning_rate": 0.0005241293706293705,
"loss": 3.4707,
"step": 21800
},
{
"epoch": 6.362863300133962,
"grad_norm": 0.3118131458759308,
"learning_rate": 0.0005239545454545454,
"loss": 3.474,
"step": 21850
},
{
"epoch": 6.377424427747685,
"grad_norm": 0.334991991519928,
"learning_rate": 0.0005237797202797202,
"loss": 3.4688,
"step": 21900
},
{
"epoch": 6.391985555361408,
"grad_norm": 0.3370341956615448,
"learning_rate": 0.000523604895104895,
"loss": 3.4606,
"step": 21950
},
{
"epoch": 6.406546682975129,
"grad_norm": 0.30481281876564026,
"learning_rate": 0.0005234300699300698,
"loss": 3.4748,
"step": 22000
},
{
"epoch": 6.406546682975129,
"eval_accuracy": 0.3628913159558557,
"eval_loss": 3.6097097396850586,
"eval_runtime": 179.7178,
"eval_samples_per_second": 92.623,
"eval_steps_per_second": 5.792,
"step": 22000
},
{
"epoch": 6.421107810588852,
"grad_norm": 0.31929466128349304,
"learning_rate": 0.0005232552447552447,
"loss": 3.4819,
"step": 22050
},
{
"epoch": 6.435668938202574,
"grad_norm": 0.3431412875652313,
"learning_rate": 0.0005230804195804195,
"loss": 3.4821,
"step": 22100
},
{
"epoch": 6.450230065816297,
"grad_norm": 0.31847938895225525,
"learning_rate": 0.0005229055944055943,
"loss": 3.4796,
"step": 22150
},
{
"epoch": 6.4647911934300195,
"grad_norm": 0.33600088953971863,
"learning_rate": 0.0005227307692307691,
"loss": 3.4894,
"step": 22200
},
{
"epoch": 6.479352321043741,
"grad_norm": 0.3299783766269684,
"learning_rate": 0.0005225559440559441,
"loss": 3.4861,
"step": 22250
},
{
"epoch": 6.493913448657464,
"grad_norm": 0.3157522976398468,
"learning_rate": 0.0005223811188811189,
"loss": 3.4758,
"step": 22300
},
{
"epoch": 6.508474576271187,
"grad_norm": 0.3057360053062439,
"learning_rate": 0.0005222062937062937,
"loss": 3.4781,
"step": 22350
},
{
"epoch": 6.523035703884909,
"grad_norm": 0.32586047053337097,
"learning_rate": 0.0005220314685314686,
"loss": 3.4823,
"step": 22400
},
{
"epoch": 6.5375968314986315,
"grad_norm": 0.3262418210506439,
"learning_rate": 0.0005218566433566433,
"loss": 3.4934,
"step": 22450
},
{
"epoch": 6.552157959112353,
"grad_norm": 0.33487650752067566,
"learning_rate": 0.0005216818181818182,
"loss": 3.4761,
"step": 22500
},
{
"epoch": 6.566719086726076,
"grad_norm": 0.342220664024353,
"learning_rate": 0.000521506993006993,
"loss": 3.4971,
"step": 22550
},
{
"epoch": 6.581280214339799,
"grad_norm": 0.32479947805404663,
"learning_rate": 0.0005213321678321678,
"loss": 3.492,
"step": 22600
},
{
"epoch": 6.595841341953521,
"grad_norm": 0.3203052580356598,
"learning_rate": 0.0005211573426573426,
"loss": 3.4777,
"step": 22650
},
{
"epoch": 6.610402469567243,
"grad_norm": 0.3330882787704468,
"learning_rate": 0.0005209825174825175,
"loss": 3.4797,
"step": 22700
},
{
"epoch": 6.624963597180965,
"grad_norm": 0.3708348572254181,
"learning_rate": 0.0005208076923076923,
"loss": 3.4816,
"step": 22750
},
{
"epoch": 6.639524724794688,
"grad_norm": 0.3509019613265991,
"learning_rate": 0.0005206328671328671,
"loss": 3.4941,
"step": 22800
},
{
"epoch": 6.654085852408411,
"grad_norm": 0.3052864968776703,
"learning_rate": 0.0005204580419580419,
"loss": 3.4806,
"step": 22850
},
{
"epoch": 6.668646980022133,
"grad_norm": 0.309030681848526,
"learning_rate": 0.0005202832167832168,
"loss": 3.4757,
"step": 22900
},
{
"epoch": 6.683208107635855,
"grad_norm": 0.3289431035518646,
"learning_rate": 0.0005201083916083916,
"loss": 3.4787,
"step": 22950
},
{
"epoch": 6.697769235249577,
"grad_norm": 0.3410831391811371,
"learning_rate": 0.0005199335664335664,
"loss": 3.4856,
"step": 23000
},
{
"epoch": 6.697769235249577,
"eval_accuracy": 0.3638117136090239,
"eval_loss": 3.60054349899292,
"eval_runtime": 179.6205,
"eval_samples_per_second": 92.673,
"eval_steps_per_second": 5.796,
"step": 23000
},
{
"epoch": 6.7123303628633,
"grad_norm": 0.3080759346485138,
"learning_rate": 0.0005197587412587413,
"loss": 3.4842,
"step": 23050
},
{
"epoch": 6.726891490477023,
"grad_norm": 0.3141343295574188,
"learning_rate": 0.0005195839160839161,
"loss": 3.4951,
"step": 23100
},
{
"epoch": 6.741452618090745,
"grad_norm": 0.332527220249176,
"learning_rate": 0.0005194090909090909,
"loss": 3.4855,
"step": 23150
},
{
"epoch": 6.756013745704467,
"grad_norm": 0.32234665751457214,
"learning_rate": 0.0005192342657342657,
"loss": 3.4815,
"step": 23200
},
{
"epoch": 6.77057487331819,
"grad_norm": 0.33322620391845703,
"learning_rate": 0.0005190594405594405,
"loss": 3.4962,
"step": 23250
},
{
"epoch": 6.785136000931912,
"grad_norm": 0.337287575006485,
"learning_rate": 0.0005188846153846153,
"loss": 3.4865,
"step": 23300
},
{
"epoch": 6.799697128545635,
"grad_norm": 0.3224928379058838,
"learning_rate": 0.0005187097902097902,
"loss": 3.4865,
"step": 23350
},
{
"epoch": 6.814258256159357,
"grad_norm": 0.32035595178604126,
"learning_rate": 0.000518534965034965,
"loss": 3.4969,
"step": 23400
},
{
"epoch": 6.828819383773079,
"grad_norm": 0.343305379152298,
"learning_rate": 0.0005183601398601398,
"loss": 3.4836,
"step": 23450
},
{
"epoch": 6.843380511386802,
"grad_norm": 0.3211219012737274,
"learning_rate": 0.0005181853146853146,
"loss": 3.4926,
"step": 23500
},
{
"epoch": 6.857941639000524,
"grad_norm": 0.31823667883872986,
"learning_rate": 0.0005180104895104895,
"loss": 3.49,
"step": 23550
},
{
"epoch": 6.872502766614247,
"grad_norm": 0.31736093759536743,
"learning_rate": 0.0005178356643356643,
"loss": 3.497,
"step": 23600
},
{
"epoch": 6.887063894227969,
"grad_norm": 0.3398060202598572,
"learning_rate": 0.0005176608391608391,
"loss": 3.4832,
"step": 23650
},
{
"epoch": 6.901625021841691,
"grad_norm": 0.37513840198516846,
"learning_rate": 0.000517486013986014,
"loss": 3.4865,
"step": 23700
},
{
"epoch": 6.916186149455414,
"grad_norm": 0.3081456422805786,
"learning_rate": 0.0005173111888111888,
"loss": 3.4894,
"step": 23750
},
{
"epoch": 6.930747277069136,
"grad_norm": 0.31910234689712524,
"learning_rate": 0.0005171363636363636,
"loss": 3.4849,
"step": 23800
},
{
"epoch": 6.945308404682859,
"grad_norm": 0.3732234537601471,
"learning_rate": 0.0005169615384615384,
"loss": 3.4812,
"step": 23850
},
{
"epoch": 6.959869532296581,
"grad_norm": 0.31321918964385986,
"learning_rate": 0.0005167867132867133,
"loss": 3.4905,
"step": 23900
},
{
"epoch": 6.974430659910303,
"grad_norm": 0.3159354627132416,
"learning_rate": 0.000516611888111888,
"loss": 3.4908,
"step": 23950
},
{
"epoch": 6.988991787524026,
"grad_norm": 0.324131041765213,
"learning_rate": 0.0005164370629370629,
"loss": 3.499,
"step": 24000
},
{
"epoch": 6.988991787524026,
"eval_accuracy": 0.3645026290525888,
"eval_loss": 3.5911543369293213,
"eval_runtime": 179.6242,
"eval_samples_per_second": 92.671,
"eval_steps_per_second": 5.795,
"step": 24000
},
{
"epoch": 7.003494670627293,
"grad_norm": 0.3455987274646759,
"learning_rate": 0.0005162622377622377,
"loss": 3.468,
"step": 24050
},
{
"epoch": 7.018055798241016,
"grad_norm": 0.32421788573265076,
"learning_rate": 0.0005160874125874125,
"loss": 3.3858,
"step": 24100
},
{
"epoch": 7.032616925854738,
"grad_norm": 0.3512269854545593,
"learning_rate": 0.0005159125874125873,
"loss": 3.3879,
"step": 24150
},
{
"epoch": 7.0471780534684605,
"grad_norm": 0.3255019783973694,
"learning_rate": 0.0005157377622377622,
"loss": 3.3864,
"step": 24200
},
{
"epoch": 7.061739181082183,
"grad_norm": 0.3749047517776489,
"learning_rate": 0.000515562937062937,
"loss": 3.3933,
"step": 24250
},
{
"epoch": 7.076300308695905,
"grad_norm": 0.33762863278388977,
"learning_rate": 0.0005153881118881118,
"loss": 3.406,
"step": 24300
},
{
"epoch": 7.090861436309628,
"grad_norm": 0.34979817271232605,
"learning_rate": 0.0005152132867132867,
"loss": 3.4044,
"step": 24350
},
{
"epoch": 7.105422563923351,
"grad_norm": 0.3524860739707947,
"learning_rate": 0.0005150384615384615,
"loss": 3.3998,
"step": 24400
},
{
"epoch": 7.1199836915370724,
"grad_norm": 0.3533429503440857,
"learning_rate": 0.0005148636363636363,
"loss": 3.3967,
"step": 24450
},
{
"epoch": 7.134544819150795,
"grad_norm": 0.33228060603141785,
"learning_rate": 0.0005146888111888111,
"loss": 3.4,
"step": 24500
},
{
"epoch": 7.149105946764517,
"grad_norm": 0.33005383610725403,
"learning_rate": 0.000514513986013986,
"loss": 3.4111,
"step": 24550
},
{
"epoch": 7.16366707437824,
"grad_norm": 0.32785147428512573,
"learning_rate": 0.0005143391608391608,
"loss": 3.4146,
"step": 24600
},
{
"epoch": 7.1782282019919625,
"grad_norm": 0.34615424275398254,
"learning_rate": 0.0005141643356643356,
"loss": 3.4083,
"step": 24650
},
{
"epoch": 7.192789329605684,
"grad_norm": 0.31757858395576477,
"learning_rate": 0.0005139895104895104,
"loss": 3.4192,
"step": 24700
},
{
"epoch": 7.207350457219407,
"grad_norm": 0.3305107355117798,
"learning_rate": 0.0005138146853146852,
"loss": 3.4174,
"step": 24750
},
{
"epoch": 7.22191158483313,
"grad_norm": 0.3234144151210785,
"learning_rate": 0.00051363986013986,
"loss": 3.4124,
"step": 24800
},
{
"epoch": 7.236472712446852,
"grad_norm": 0.3411427438259125,
"learning_rate": 0.0005134650349650349,
"loss": 3.4177,
"step": 24850
},
{
"epoch": 7.2510338400605745,
"grad_norm": 0.3230850398540497,
"learning_rate": 0.0005132902097902097,
"loss": 3.4201,
"step": 24900
},
{
"epoch": 7.265594967674296,
"grad_norm": 0.3260183036327362,
"learning_rate": 0.0005131153846153845,
"loss": 3.4257,
"step": 24950
},
{
"epoch": 7.280156095288019,
"grad_norm": 0.3340539038181305,
"learning_rate": 0.0005129405594405594,
"loss": 3.4248,
"step": 25000
},
{
"epoch": 7.280156095288019,
"eval_accuracy": 0.3648761254562311,
"eval_loss": 3.597316026687622,
"eval_runtime": 179.6106,
"eval_samples_per_second": 92.678,
"eval_steps_per_second": 5.796,
"step": 25000
},
{
"epoch": 7.294717222901742,
"grad_norm": 0.3235202133655548,
"learning_rate": 0.0005127657342657342,
"loss": 3.4151,
"step": 25050
},
{
"epoch": 7.309278350515464,
"grad_norm": 0.3350401818752289,
"learning_rate": 0.000512590909090909,
"loss": 3.4264,
"step": 25100
},
{
"epoch": 7.3238394781291865,
"grad_norm": 0.3224312663078308,
"learning_rate": 0.0005124160839160838,
"loss": 3.4351,
"step": 25150
},
{
"epoch": 7.338400605742908,
"grad_norm": 0.34827619791030884,
"learning_rate": 0.0005122412587412588,
"loss": 3.419,
"step": 25200
},
{
"epoch": 7.352961733356631,
"grad_norm": 0.3302023708820343,
"learning_rate": 0.0005120664335664336,
"loss": 3.4288,
"step": 25250
},
{
"epoch": 7.367522860970354,
"grad_norm": 0.32455217838287354,
"learning_rate": 0.0005118916083916084,
"loss": 3.4304,
"step": 25300
},
{
"epoch": 7.382083988584076,
"grad_norm": 0.3269766867160797,
"learning_rate": 0.0005117167832167832,
"loss": 3.4261,
"step": 25350
},
{
"epoch": 7.396645116197798,
"grad_norm": 0.33805644512176514,
"learning_rate": 0.0005115419580419581,
"loss": 3.4302,
"step": 25400
},
{
"epoch": 7.411206243811521,
"grad_norm": 0.30504998564720154,
"learning_rate": 0.0005113671328671328,
"loss": 3.4436,
"step": 25450
},
{
"epoch": 7.425767371425243,
"grad_norm": 0.3210298717021942,
"learning_rate": 0.0005111923076923077,
"loss": 3.4379,
"step": 25500
},
{
"epoch": 7.440328499038966,
"grad_norm": 0.3328574597835541,
"learning_rate": 0.0005110174825174825,
"loss": 3.4364,
"step": 25550
},
{
"epoch": 7.454889626652688,
"grad_norm": 0.3316180408000946,
"learning_rate": 0.0005108426573426573,
"loss": 3.4421,
"step": 25600
},
{
"epoch": 7.46945075426641,
"grad_norm": 0.32503074407577515,
"learning_rate": 0.0005106678321678321,
"loss": 3.4373,
"step": 25650
},
{
"epoch": 7.484011881880133,
"grad_norm": 0.34754231572151184,
"learning_rate": 0.000510493006993007,
"loss": 3.4424,
"step": 25700
},
{
"epoch": 7.498573009493855,
"grad_norm": 0.3636908531188965,
"learning_rate": 0.0005103181818181818,
"loss": 3.4421,
"step": 25750
},
{
"epoch": 7.513134137107578,
"grad_norm": 0.32700809836387634,
"learning_rate": 0.0005101433566433566,
"loss": 3.4452,
"step": 25800
},
{
"epoch": 7.5276952647213005,
"grad_norm": 0.34218472242355347,
"learning_rate": 0.0005099685314685315,
"loss": 3.4455,
"step": 25850
},
{
"epoch": 7.542256392335022,
"grad_norm": 0.3296022117137909,
"learning_rate": 0.0005097937062937063,
"loss": 3.4512,
"step": 25900
},
{
"epoch": 7.556817519948745,
"grad_norm": 0.3350162208080292,
"learning_rate": 0.0005096188811188811,
"loss": 3.4474,
"step": 25950
},
{
"epoch": 7.571378647562467,
"grad_norm": 0.30475959181785583,
"learning_rate": 0.0005094440559440559,
"loss": 3.452,
"step": 26000
},
{
"epoch": 7.571378647562467,
"eval_accuracy": 0.36505987581156407,
"eval_loss": 3.592822790145874,
"eval_runtime": 179.6236,
"eval_samples_per_second": 92.672,
"eval_steps_per_second": 5.795,
"step": 26000
},
{
"epoch": 7.58593977517619,
"grad_norm": 0.34838974475860596,
"learning_rate": 0.0005092692307692308,
"loss": 3.4448,
"step": 26050
},
{
"epoch": 7.600500902789912,
"grad_norm": 0.3396695852279663,
"learning_rate": 0.0005090944055944056,
"loss": 3.4505,
"step": 26100
},
{
"epoch": 7.615062030403634,
"grad_norm": 0.3172091543674469,
"learning_rate": 0.0005089195804195804,
"loss": 3.4539,
"step": 26150
},
{
"epoch": 7.629623158017357,
"grad_norm": 0.3365660607814789,
"learning_rate": 0.0005087447552447552,
"loss": 3.4448,
"step": 26200
},
{
"epoch": 7.644184285631079,
"grad_norm": 0.3183286488056183,
"learning_rate": 0.00050856993006993,
"loss": 3.4447,
"step": 26250
},
{
"epoch": 7.658745413244802,
"grad_norm": 0.32615023851394653,
"learning_rate": 0.0005083951048951048,
"loss": 3.4541,
"step": 26300
},
{
"epoch": 7.673306540858524,
"grad_norm": 0.33578434586524963,
"learning_rate": 0.0005082202797202797,
"loss": 3.4546,
"step": 26350
},
{
"epoch": 7.687867668472246,
"grad_norm": 0.3528318405151367,
"learning_rate": 0.0005080454545454545,
"loss": 3.4597,
"step": 26400
},
{
"epoch": 7.702428796085969,
"grad_norm": 0.33206814527511597,
"learning_rate": 0.0005078706293706293,
"loss": 3.4576,
"step": 26450
},
{
"epoch": 7.716989923699691,
"grad_norm": 0.35905346274375916,
"learning_rate": 0.0005076958041958042,
"loss": 3.4657,
"step": 26500
},
{
"epoch": 7.731551051313414,
"grad_norm": 0.32397958636283875,
"learning_rate": 0.000507520979020979,
"loss": 3.4592,
"step": 26550
},
{
"epoch": 7.746112178927136,
"grad_norm": 0.3245496451854706,
"learning_rate": 0.0005073461538461538,
"loss": 3.4444,
"step": 26600
},
{
"epoch": 7.760673306540858,
"grad_norm": 0.3393577039241791,
"learning_rate": 0.0005071713286713286,
"loss": 3.455,
"step": 26650
},
{
"epoch": 7.775234434154581,
"grad_norm": 0.3460133373737335,
"learning_rate": 0.0005069965034965035,
"loss": 3.4519,
"step": 26700
},
{
"epoch": 7.789795561768304,
"grad_norm": 0.3299025893211365,
"learning_rate": 0.0005068216783216783,
"loss": 3.4521,
"step": 26750
},
{
"epoch": 7.8043566893820255,
"grad_norm": 0.32487180829048157,
"learning_rate": 0.0005066468531468531,
"loss": 3.4672,
"step": 26800
},
{
"epoch": 7.818917816995748,
"grad_norm": 0.3425562381744385,
"learning_rate": 0.0005064720279720279,
"loss": 3.4408,
"step": 26850
},
{
"epoch": 7.833478944609471,
"grad_norm": 0.2968789339065552,
"learning_rate": 0.0005062972027972028,
"loss": 3.4553,
"step": 26900
},
{
"epoch": 7.848040072223193,
"grad_norm": 0.35579219460487366,
"learning_rate": 0.0005061223776223775,
"loss": 3.4581,
"step": 26950
},
{
"epoch": 7.862601199836916,
"grad_norm": 0.32660984992980957,
"learning_rate": 0.0005059475524475524,
"loss": 3.4628,
"step": 27000
},
{
"epoch": 7.862601199836916,
"eval_accuracy": 0.36588081549888984,
"eval_loss": 3.5821452140808105,
"eval_runtime": 179.7196,
"eval_samples_per_second": 92.622,
"eval_steps_per_second": 5.792,
"step": 27000
},
{
"epoch": 7.8771623274506375,
"grad_norm": 0.34840381145477295,
"learning_rate": 0.0005057727272727272,
"loss": 3.4578,
"step": 27050
},
{
"epoch": 7.89172345506436,
"grad_norm": 0.3340885043144226,
"learning_rate": 0.000505597902097902,
"loss": 3.4542,
"step": 27100
},
{
"epoch": 7.906284582678083,
"grad_norm": 0.3486022651195526,
"learning_rate": 0.0005054230769230769,
"loss": 3.4558,
"step": 27150
},
{
"epoch": 7.920845710291805,
"grad_norm": 0.342074453830719,
"learning_rate": 0.0005052482517482517,
"loss": 3.4689,
"step": 27200
},
{
"epoch": 7.935406837905528,
"grad_norm": 0.32947641611099243,
"learning_rate": 0.0005050734265734265,
"loss": 3.4728,
"step": 27250
},
{
"epoch": 7.9499679655192494,
"grad_norm": 0.31653088331222534,
"learning_rate": 0.0005048986013986013,
"loss": 3.4685,
"step": 27300
},
{
"epoch": 7.964529093132972,
"grad_norm": 0.34901005029678345,
"learning_rate": 0.0005047237762237762,
"loss": 3.4647,
"step": 27350
},
{
"epoch": 7.979090220746695,
"grad_norm": 0.3230254352092743,
"learning_rate": 0.000504548951048951,
"loss": 3.4553,
"step": 27400
},
{
"epoch": 7.993651348360417,
"grad_norm": 0.3505324125289917,
"learning_rate": 0.0005043741258741258,
"loss": 3.4651,
"step": 27450
},
{
"epoch": 8.008154231463685,
"grad_norm": 0.33360299468040466,
"learning_rate": 0.0005041993006993006,
"loss": 3.4055,
"step": 27500
},
{
"epoch": 8.022715359077408,
"grad_norm": 0.34768131375312805,
"learning_rate": 0.0005040244755244755,
"loss": 3.3498,
"step": 27550
},
{
"epoch": 8.037276486691129,
"grad_norm": 0.3448757827281952,
"learning_rate": 0.0005038496503496503,
"loss": 3.3425,
"step": 27600
},
{
"epoch": 8.051837614304851,
"grad_norm": 0.3235391676425934,
"learning_rate": 0.0005036748251748251,
"loss": 3.3552,
"step": 27650
},
{
"epoch": 8.066398741918574,
"grad_norm": 0.3475419580936432,
"learning_rate": 0.0005034999999999999,
"loss": 3.3667,
"step": 27700
},
{
"epoch": 8.080959869532297,
"grad_norm": 0.3373063802719116,
"learning_rate": 0.0005033251748251747,
"loss": 3.3604,
"step": 27750
},
{
"epoch": 8.09552099714602,
"grad_norm": 0.33979612588882446,
"learning_rate": 0.0005031503496503496,
"loss": 3.3733,
"step": 27800
},
{
"epoch": 8.11008212475974,
"grad_norm": 0.3554824888706207,
"learning_rate": 0.0005029755244755244,
"loss": 3.3742,
"step": 27850
},
{
"epoch": 8.124643252373463,
"grad_norm": 0.3337261378765106,
"learning_rate": 0.0005028006993006992,
"loss": 3.3687,
"step": 27900
},
{
"epoch": 8.139204379987186,
"grad_norm": 0.3323470652103424,
"learning_rate": 0.000502625874125874,
"loss": 3.3727,
"step": 27950
},
{
"epoch": 8.153765507600909,
"grad_norm": 0.3721840977668762,
"learning_rate": 0.000502451048951049,
"loss": 3.3875,
"step": 28000
},
{
"epoch": 8.153765507600909,
"eval_accuracy": 0.36575702207332006,
"eval_loss": 3.587688684463501,
"eval_runtime": 179.6153,
"eval_samples_per_second": 92.676,
"eval_steps_per_second": 5.796,
"step": 28000
},
{
"epoch": 8.168326635214632,
"grad_norm": 0.32385772466659546,
"learning_rate": 0.0005022762237762237,
"loss": 3.377,
"step": 28050
},
{
"epoch": 8.182887762828354,
"grad_norm": 0.34891006350517273,
"learning_rate": 0.0005021013986013985,
"loss": 3.3799,
"step": 28100
},
{
"epoch": 8.197448890442075,
"grad_norm": 0.34678199887275696,
"learning_rate": 0.0005019265734265733,
"loss": 3.3868,
"step": 28150
},
{
"epoch": 8.212010018055798,
"grad_norm": 0.3331698477268219,
"learning_rate": 0.0005017517482517483,
"loss": 3.3834,
"step": 28200
},
{
"epoch": 8.22657114566952,
"grad_norm": 0.3234923779964447,
"learning_rate": 0.0005015769230769231,
"loss": 3.3933,
"step": 28250
},
{
"epoch": 8.241132273283243,
"grad_norm": 0.33203065395355225,
"learning_rate": 0.0005014020979020979,
"loss": 3.3947,
"step": 28300
},
{
"epoch": 8.255693400896966,
"grad_norm": 0.3614442050457001,
"learning_rate": 0.0005012272727272727,
"loss": 3.396,
"step": 28350
},
{
"epoch": 8.270254528510687,
"grad_norm": 0.3333202302455902,
"learning_rate": 0.0005010524475524476,
"loss": 3.3944,
"step": 28400
},
{
"epoch": 8.28481565612441,
"grad_norm": 0.36438530683517456,
"learning_rate": 0.0005008776223776223,
"loss": 3.3969,
"step": 28450
},
{
"epoch": 8.299376783738133,
"grad_norm": 0.3521031439304352,
"learning_rate": 0.0005007027972027972,
"loss": 3.4055,
"step": 28500
},
{
"epoch": 8.313937911351855,
"grad_norm": 0.3472055494785309,
"learning_rate": 0.000500527972027972,
"loss": 3.4046,
"step": 28550
},
{
"epoch": 8.328499038965578,
"grad_norm": 0.3367118239402771,
"learning_rate": 0.0005003531468531468,
"loss": 3.3965,
"step": 28600
},
{
"epoch": 8.3430601665793,
"grad_norm": 0.32049131393432617,
"learning_rate": 0.0005001783216783217,
"loss": 3.4007,
"step": 28650
},
{
"epoch": 8.357621294193022,
"grad_norm": 0.36562150716781616,
"learning_rate": 0.0005000034965034965,
"loss": 3.398,
"step": 28700
},
{
"epoch": 8.372182421806745,
"grad_norm": 0.3249284327030182,
"learning_rate": 0.0004998286713286713,
"loss": 3.3956,
"step": 28750
},
{
"epoch": 8.386743549420467,
"grad_norm": 0.34902429580688477,
"learning_rate": 0.0004996538461538461,
"loss": 3.4019,
"step": 28800
},
{
"epoch": 8.40130467703419,
"grad_norm": 0.35250189900398254,
"learning_rate": 0.000499479020979021,
"loss": 3.396,
"step": 28850
},
{
"epoch": 8.415865804647911,
"grad_norm": 0.3263583481311798,
"learning_rate": 0.0004993041958041958,
"loss": 3.4019,
"step": 28900
},
{
"epoch": 8.430426932261634,
"grad_norm": 0.3295762836933136,
"learning_rate": 0.0004991293706293706,
"loss": 3.4142,
"step": 28950
},
{
"epoch": 8.444988059875357,
"grad_norm": 0.3347003161907196,
"learning_rate": 0.0004989545454545454,
"loss": 3.4067,
"step": 29000
},
{
"epoch": 8.444988059875357,
"eval_accuracy": 0.36621951337074804,
"eval_loss": 3.5861732959747314,
"eval_runtime": 179.638,
"eval_samples_per_second": 92.664,
"eval_steps_per_second": 5.795,
"step": 29000
},
{
"epoch": 8.45954918748908,
"grad_norm": 0.3246140778064728,
"learning_rate": 0.0004987797202797203,
"loss": 3.4132,
"step": 29050
},
{
"epoch": 8.474110315102802,
"grad_norm": 0.3662111163139343,
"learning_rate": 0.0004986048951048951,
"loss": 3.4005,
"step": 29100
},
{
"epoch": 8.488671442716523,
"grad_norm": 0.3711738884449005,
"learning_rate": 0.0004984300699300699,
"loss": 3.4136,
"step": 29150
},
{
"epoch": 8.503232570330246,
"grad_norm": 0.3224983513355255,
"learning_rate": 0.0004982552447552448,
"loss": 3.4193,
"step": 29200
},
{
"epoch": 8.517793697943969,
"grad_norm": 0.34007972478866577,
"learning_rate": 0.0004980804195804195,
"loss": 3.4033,
"step": 29250
},
{
"epoch": 8.532354825557691,
"grad_norm": 0.34795621037483215,
"learning_rate": 0.0004979055944055944,
"loss": 3.4279,
"step": 29300
},
{
"epoch": 8.546915953171414,
"grad_norm": 0.3562271296977997,
"learning_rate": 0.0004977307692307692,
"loss": 3.4191,
"step": 29350
},
{
"epoch": 8.561477080785137,
"grad_norm": 0.33655011653900146,
"learning_rate": 0.000497555944055944,
"loss": 3.4124,
"step": 29400
},
{
"epoch": 8.576038208398858,
"grad_norm": 0.3349234461784363,
"learning_rate": 0.0004973811188811188,
"loss": 3.4197,
"step": 29450
},
{
"epoch": 8.59059933601258,
"grad_norm": 0.3217172622680664,
"learning_rate": 0.0004972062937062937,
"loss": 3.4254,
"step": 29500
},
{
"epoch": 8.605160463626303,
"grad_norm": 0.33423301577568054,
"learning_rate": 0.0004970314685314685,
"loss": 3.4127,
"step": 29550
},
{
"epoch": 8.619721591240026,
"grad_norm": 0.34116896986961365,
"learning_rate": 0.0004968566433566433,
"loss": 3.4258,
"step": 29600
},
{
"epoch": 8.634282718853749,
"grad_norm": 0.3287065029144287,
"learning_rate": 0.0004966818181818181,
"loss": 3.4198,
"step": 29650
},
{
"epoch": 8.64884384646747,
"grad_norm": 0.35336413979530334,
"learning_rate": 0.000496506993006993,
"loss": 3.4168,
"step": 29700
},
{
"epoch": 8.663404974081192,
"grad_norm": 0.34202465415000916,
"learning_rate": 0.0004963321678321678,
"loss": 3.4277,
"step": 29750
},
{
"epoch": 8.677966101694915,
"grad_norm": 0.3359808921813965,
"learning_rate": 0.0004961573426573426,
"loss": 3.428,
"step": 29800
},
{
"epoch": 8.692527229308638,
"grad_norm": 0.33416661620140076,
"learning_rate": 0.0004959825174825175,
"loss": 3.4254,
"step": 29850
},
{
"epoch": 8.70708835692236,
"grad_norm": 0.34198299050331116,
"learning_rate": 0.0004958076923076923,
"loss": 3.4263,
"step": 29900
},
{
"epoch": 8.721649484536082,
"grad_norm": 0.359495609998703,
"learning_rate": 0.0004956328671328671,
"loss": 3.4389,
"step": 29950
},
{
"epoch": 8.736210612149804,
"grad_norm": 0.3388700485229492,
"learning_rate": 0.0004954580419580419,
"loss": 3.4324,
"step": 30000
},
{
"epoch": 8.736210612149804,
"eval_accuracy": 0.3668087371589303,
"eval_loss": 3.5739517211914062,
"eval_runtime": 179.5999,
"eval_samples_per_second": 92.684,
"eval_steps_per_second": 5.796,
"step": 30000
},
{
"epoch": 8.750771739763527,
"grad_norm": 0.3278059959411621,
"learning_rate": 0.0004952832167832167,
"loss": 3.4248,
"step": 30050
},
{
"epoch": 8.76533286737725,
"grad_norm": 0.34186989068984985,
"learning_rate": 0.0004951083916083915,
"loss": 3.4251,
"step": 30100
},
{
"epoch": 8.779893994990973,
"grad_norm": 0.3263101577758789,
"learning_rate": 0.0004949335664335664,
"loss": 3.4302,
"step": 30150
},
{
"epoch": 8.794455122604695,
"grad_norm": 0.354638010263443,
"learning_rate": 0.0004947587412587412,
"loss": 3.416,
"step": 30200
},
{
"epoch": 8.809016250218416,
"grad_norm": 0.3320951759815216,
"learning_rate": 0.000494583916083916,
"loss": 3.4236,
"step": 30250
},
{
"epoch": 8.82357737783214,
"grad_norm": 0.3258265554904938,
"learning_rate": 0.0004944090909090908,
"loss": 3.4288,
"step": 30300
},
{
"epoch": 8.838138505445862,
"grad_norm": 0.3345946967601776,
"learning_rate": 0.0004942342657342657,
"loss": 3.4451,
"step": 30350
},
{
"epoch": 8.852699633059585,
"grad_norm": 0.32482603192329407,
"learning_rate": 0.0004940594405594405,
"loss": 3.4356,
"step": 30400
},
{
"epoch": 8.867260760673307,
"grad_norm": 0.3448339104652405,
"learning_rate": 0.0004938846153846153,
"loss": 3.4376,
"step": 30450
},
{
"epoch": 8.881821888287028,
"grad_norm": 0.33671805262565613,
"learning_rate": 0.0004937097902097901,
"loss": 3.4342,
"step": 30500
},
{
"epoch": 8.896383015900751,
"grad_norm": 0.33519789576530457,
"learning_rate": 0.000493534965034965,
"loss": 3.4395,
"step": 30550
},
{
"epoch": 8.910944143514474,
"grad_norm": 0.3589082658290863,
"learning_rate": 0.0004933601398601398,
"loss": 3.4191,
"step": 30600
},
{
"epoch": 8.925505271128197,
"grad_norm": 0.3367752730846405,
"learning_rate": 0.0004931853146853146,
"loss": 3.4361,
"step": 30650
},
{
"epoch": 8.94006639874192,
"grad_norm": 0.31563615798950195,
"learning_rate": 0.0004930104895104895,
"loss": 3.4309,
"step": 30700
},
{
"epoch": 8.95462752635564,
"grad_norm": 0.32066795229911804,
"learning_rate": 0.0004928356643356642,
"loss": 3.4346,
"step": 30750
},
{
"epoch": 8.969188653969363,
"grad_norm": 0.33618220686912537,
"learning_rate": 0.0004926608391608391,
"loss": 3.4448,
"step": 30800
},
{
"epoch": 8.983749781583086,
"grad_norm": 0.3660648465156555,
"learning_rate": 0.0004924860139860139,
"loss": 3.4283,
"step": 30850
},
{
"epoch": 8.998310909196809,
"grad_norm": 0.363942414522171,
"learning_rate": 0.0004923111888111887,
"loss": 3.442,
"step": 30900
},
{
"epoch": 9.012813792300076,
"grad_norm": 0.31260979175567627,
"learning_rate": 0.0004921363636363635,
"loss": 3.3326,
"step": 30950
},
{
"epoch": 9.027374919913798,
"grad_norm": 0.3426477909088135,
"learning_rate": 0.0004919615384615384,
"loss": 3.3223,
"step": 31000
},
{
"epoch": 9.027374919913798,
"eval_accuracy": 0.36718293893821685,
"eval_loss": 3.578636646270752,
"eval_runtime": 179.7856,
"eval_samples_per_second": 92.588,
"eval_steps_per_second": 5.79,
"step": 31000
},
{
"epoch": 9.041936047527521,
"grad_norm": 0.3265765309333801,
"learning_rate": 0.0004917867132867132,
"loss": 3.3151,
"step": 31050
},
{
"epoch": 9.056497175141242,
"grad_norm": 0.341594934463501,
"learning_rate": 0.000491611888111888,
"loss": 3.3359,
"step": 31100
},
{
"epoch": 9.071058302754965,
"grad_norm": 0.3292291462421417,
"learning_rate": 0.0004914370629370628,
"loss": 3.3368,
"step": 31150
},
{
"epoch": 9.085619430368688,
"grad_norm": 0.3459334671497345,
"learning_rate": 0.0004912622377622378,
"loss": 3.3496,
"step": 31200
},
{
"epoch": 9.10018055798241,
"grad_norm": 0.3272961974143982,
"learning_rate": 0.0004910874125874126,
"loss": 3.3405,
"step": 31250
},
{
"epoch": 9.114741685596133,
"grad_norm": 0.32643869519233704,
"learning_rate": 0.0004909125874125874,
"loss": 3.3479,
"step": 31300
},
{
"epoch": 9.129302813209854,
"grad_norm": 0.3543800413608551,
"learning_rate": 0.0004907377622377623,
"loss": 3.3437,
"step": 31350
},
{
"epoch": 9.143863940823577,
"grad_norm": 0.3529180586338043,
"learning_rate": 0.0004905629370629371,
"loss": 3.3537,
"step": 31400
},
{
"epoch": 9.1584250684373,
"grad_norm": 0.3241512179374695,
"learning_rate": 0.0004903881118881119,
"loss": 3.3549,
"step": 31450
},
{
"epoch": 9.172986196051022,
"grad_norm": 0.3228810429573059,
"learning_rate": 0.0004902132867132867,
"loss": 3.3541,
"step": 31500
},
{
"epoch": 9.187547323664745,
"grad_norm": 0.350665420293808,
"learning_rate": 0.0004900384615384615,
"loss": 3.3433,
"step": 31550
},
{
"epoch": 9.202108451278466,
"grad_norm": 0.3541180491447449,
"learning_rate": 0.0004898636363636363,
"loss": 3.3573,
"step": 31600
},
{
"epoch": 9.216669578892189,
"grad_norm": 0.33510008454322815,
"learning_rate": 0.0004896888111888112,
"loss": 3.363,
"step": 31650
},
{
"epoch": 9.231230706505912,
"grad_norm": 0.3581586182117462,
"learning_rate": 0.000489513986013986,
"loss": 3.363,
"step": 31700
},
{
"epoch": 9.245791834119634,
"grad_norm": 0.33881622552871704,
"learning_rate": 0.0004893391608391608,
"loss": 3.3719,
"step": 31750
},
{
"epoch": 9.260352961733357,
"grad_norm": 0.34674209356307983,
"learning_rate": 0.0004891643356643356,
"loss": 3.3701,
"step": 31800
},
{
"epoch": 9.27491408934708,
"grad_norm": 0.3241184949874878,
"learning_rate": 0.0004889895104895105,
"loss": 3.3741,
"step": 31850
},
{
"epoch": 9.2894752169608,
"grad_norm": 0.354744017124176,
"learning_rate": 0.0004888146853146853,
"loss": 3.3578,
"step": 31900
},
{
"epoch": 9.304036344574524,
"grad_norm": 0.31170573830604553,
"learning_rate": 0.0004886398601398601,
"loss": 3.37,
"step": 31950
},
{
"epoch": 9.318597472188246,
"grad_norm": 0.341860830783844,
"learning_rate": 0.000488465034965035,
"loss": 3.3788,
"step": 32000
},
{
"epoch": 9.318597472188246,
"eval_accuracy": 0.36685105969758663,
"eval_loss": 3.579946279525757,
"eval_runtime": 179.6871,
"eval_samples_per_second": 92.639,
"eval_steps_per_second": 5.793,
"step": 32000
},
{
"epoch": 9.333158599801969,
"grad_norm": 0.33674827218055725,
"learning_rate": 0.0004882902097902098,
"loss": 3.3761,
"step": 32050
},
{
"epoch": 9.347719727415692,
"grad_norm": 0.34504297375679016,
"learning_rate": 0.0004881153846153846,
"loss": 3.3931,
"step": 32100
},
{
"epoch": 9.362280855029413,
"grad_norm": 0.334306538105011,
"learning_rate": 0.0004879405594405594,
"loss": 3.3882,
"step": 32150
},
{
"epoch": 9.376841982643136,
"grad_norm": 0.3271053433418274,
"learning_rate": 0.00048776573426573424,
"loss": 3.3648,
"step": 32200
},
{
"epoch": 9.391403110256858,
"grad_norm": 0.3338295519351959,
"learning_rate": 0.00048759090909090904,
"loss": 3.3715,
"step": 32250
},
{
"epoch": 9.405964237870581,
"grad_norm": 0.34873339533805847,
"learning_rate": 0.0004874160839160839,
"loss": 3.3766,
"step": 32300
},
{
"epoch": 9.420525365484304,
"grad_norm": 0.36661872267723083,
"learning_rate": 0.0004872412587412587,
"loss": 3.3698,
"step": 32350
},
{
"epoch": 9.435086493098025,
"grad_norm": 0.3396371006965637,
"learning_rate": 0.00048706643356643354,
"loss": 3.3939,
"step": 32400
},
{
"epoch": 9.449647620711747,
"grad_norm": 0.34849968552589417,
"learning_rate": 0.00048689160839160834,
"loss": 3.3774,
"step": 32450
},
{
"epoch": 9.46420874832547,
"grad_norm": 0.3258074223995209,
"learning_rate": 0.0004867167832167832,
"loss": 3.3934,
"step": 32500
},
{
"epoch": 9.478769875939193,
"grad_norm": 0.35088443756103516,
"learning_rate": 0.00048654195804195794,
"loss": 3.379,
"step": 32550
},
{
"epoch": 9.493331003552916,
"grad_norm": 0.3309447765350342,
"learning_rate": 0.00048636713286713285,
"loss": 3.3924,
"step": 32600
},
{
"epoch": 9.507892131166638,
"grad_norm": 0.3345271944999695,
"learning_rate": 0.0004861923076923077,
"loss": 3.3948,
"step": 32650
},
{
"epoch": 9.52245325878036,
"grad_norm": 0.3319976031780243,
"learning_rate": 0.00048601748251748245,
"loss": 3.3951,
"step": 32700
},
{
"epoch": 9.537014386394082,
"grad_norm": 0.3754766583442688,
"learning_rate": 0.0004858426573426573,
"loss": 3.3805,
"step": 32750
},
{
"epoch": 9.551575514007805,
"grad_norm": 0.34478750824928284,
"learning_rate": 0.0004856678321678321,
"loss": 3.3952,
"step": 32800
},
{
"epoch": 9.566136641621528,
"grad_norm": 0.32059377431869507,
"learning_rate": 0.00048549300699300696,
"loss": 3.3938,
"step": 32850
},
{
"epoch": 9.58069776923525,
"grad_norm": 0.3344727158546448,
"learning_rate": 0.00048531818181818176,
"loss": 3.4108,
"step": 32900
},
{
"epoch": 9.595258896848971,
"grad_norm": 0.3274308443069458,
"learning_rate": 0.0004851433566433566,
"loss": 3.3958,
"step": 32950
},
{
"epoch": 9.609820024462694,
"grad_norm": 0.37887972593307495,
"learning_rate": 0.0004849685314685314,
"loss": 3.3937,
"step": 33000
},
{
"epoch": 9.609820024462694,
"eval_accuracy": 0.36769280796641846,
"eval_loss": 3.572748899459839,
"eval_runtime": 179.797,
"eval_samples_per_second": 92.582,
"eval_steps_per_second": 5.79,
"step": 33000
},
{
"epoch": 9.624381152076417,
"grad_norm": 0.32813072204589844,
"learning_rate": 0.00048479370629370627,
"loss": 3.3986,
"step": 33050
},
{
"epoch": 9.63894227969014,
"grad_norm": 0.3385777771472931,
"learning_rate": 0.00048461888111888106,
"loss": 3.4083,
"step": 33100
},
{
"epoch": 9.653503407303862,
"grad_norm": 0.34293919801712036,
"learning_rate": 0.0004844440559440559,
"loss": 3.4105,
"step": 33150
},
{
"epoch": 9.668064534917583,
"grad_norm": 0.34296914935112,
"learning_rate": 0.0004842692307692307,
"loss": 3.387,
"step": 33200
},
{
"epoch": 9.682625662531306,
"grad_norm": 0.3283473551273346,
"learning_rate": 0.00048409440559440557,
"loss": 3.3968,
"step": 33250
},
{
"epoch": 9.697186790145029,
"grad_norm": 0.3375408351421356,
"learning_rate": 0.0004839195804195803,
"loss": 3.4106,
"step": 33300
},
{
"epoch": 9.711747917758752,
"grad_norm": 0.38011592626571655,
"learning_rate": 0.0004837447552447552,
"loss": 3.3923,
"step": 33350
},
{
"epoch": 9.726309045372474,
"grad_norm": 0.3376014232635498,
"learning_rate": 0.0004835699300699301,
"loss": 3.4004,
"step": 33400
},
{
"epoch": 9.740870172986195,
"grad_norm": 0.34998080134391785,
"learning_rate": 0.0004833951048951048,
"loss": 3.415,
"step": 33450
},
{
"epoch": 9.755431300599918,
"grad_norm": 0.33272120356559753,
"learning_rate": 0.0004832202797202797,
"loss": 3.4076,
"step": 33500
},
{
"epoch": 9.76999242821364,
"grad_norm": 0.3352126181125641,
"learning_rate": 0.0004830454545454545,
"loss": 3.3951,
"step": 33550
},
{
"epoch": 9.784553555827364,
"grad_norm": 0.32453832030296326,
"learning_rate": 0.00048287062937062933,
"loss": 3.4116,
"step": 33600
},
{
"epoch": 9.799114683441086,
"grad_norm": 0.35308536887168884,
"learning_rate": 0.00048269580419580413,
"loss": 3.4049,
"step": 33650
},
{
"epoch": 9.813675811054807,
"grad_norm": 0.36469438672065735,
"learning_rate": 0.000482520979020979,
"loss": 3.4074,
"step": 33700
},
{
"epoch": 9.82823693866853,
"grad_norm": 0.312566339969635,
"learning_rate": 0.0004823461538461538,
"loss": 3.3985,
"step": 33750
},
{
"epoch": 9.842798066282253,
"grad_norm": 0.34336328506469727,
"learning_rate": 0.00048217132867132864,
"loss": 3.3979,
"step": 33800
},
{
"epoch": 9.857359193895975,
"grad_norm": 0.35954421758651733,
"learning_rate": 0.00048199650349650344,
"loss": 3.3987,
"step": 33850
},
{
"epoch": 9.871920321509698,
"grad_norm": 0.3485148847103119,
"learning_rate": 0.0004818216783216783,
"loss": 3.3936,
"step": 33900
},
{
"epoch": 9.88648144912342,
"grad_norm": 0.3321332633495331,
"learning_rate": 0.0004816468531468531,
"loss": 3.4155,
"step": 33950
},
{
"epoch": 9.901042576737142,
"grad_norm": 0.34463703632354736,
"learning_rate": 0.00048147202797202795,
"loss": 3.4155,
"step": 34000
},
{
"epoch": 9.901042576737142,
"eval_accuracy": 0.3683254123567235,
"eval_loss": 3.5605504512786865,
"eval_runtime": 179.6436,
"eval_samples_per_second": 92.661,
"eval_steps_per_second": 5.795,
"step": 34000
},
{
"epoch": 9.915603704350865,
"grad_norm": 0.33447834849357605,
"learning_rate": 0.0004812972027972028,
"loss": 3.407,
"step": 34050
},
{
"epoch": 9.930164831964587,
"grad_norm": 0.3506791889667511,
"learning_rate": 0.0004811223776223776,
"loss": 3.4097,
"step": 34100
},
{
"epoch": 9.94472595957831,
"grad_norm": 0.33796730637550354,
"learning_rate": 0.00048094755244755245,
"loss": 3.4044,
"step": 34150
},
{
"epoch": 9.959287087192033,
"grad_norm": 0.3389424979686737,
"learning_rate": 0.0004807727272727272,
"loss": 3.4017,
"step": 34200
},
{
"epoch": 9.973848214805754,
"grad_norm": 0.32537972927093506,
"learning_rate": 0.00048059790209790205,
"loss": 3.4185,
"step": 34250
},
{
"epoch": 9.988409342419477,
"grad_norm": 0.349602073431015,
"learning_rate": 0.00048042307692307685,
"loss": 3.4123,
"step": 34300
},
{
"epoch": 10.002912225522744,
"grad_norm": 0.3237372040748596,
"learning_rate": 0.0004802482517482517,
"loss": 3.3901,
"step": 34350
},
{
"epoch": 10.017473353136467,
"grad_norm": 0.35387957096099854,
"learning_rate": 0.0004800734265734265,
"loss": 3.2917,
"step": 34400
},
{
"epoch": 10.03203448075019,
"grad_norm": 0.35952070355415344,
"learning_rate": 0.00047989860139860136,
"loss": 3.2902,
"step": 34450
},
{
"epoch": 10.046595608363912,
"grad_norm": 0.3631179928779602,
"learning_rate": 0.00047972377622377616,
"loss": 3.2917,
"step": 34500
},
{
"epoch": 10.061156735977635,
"grad_norm": 0.38791096210479736,
"learning_rate": 0.000479548951048951,
"loss": 3.3143,
"step": 34550
},
{
"epoch": 10.075717863591356,
"grad_norm": 0.3530723750591278,
"learning_rate": 0.0004793741258741258,
"loss": 3.3145,
"step": 34600
},
{
"epoch": 10.090278991205079,
"grad_norm": 0.3433317542076111,
"learning_rate": 0.00047919930069930067,
"loss": 3.3156,
"step": 34650
},
{
"epoch": 10.104840118818801,
"grad_norm": 0.31986525654792786,
"learning_rate": 0.0004790244755244755,
"loss": 3.3162,
"step": 34700
},
{
"epoch": 10.119401246432524,
"grad_norm": 0.3420194089412689,
"learning_rate": 0.0004788496503496503,
"loss": 3.31,
"step": 34750
},
{
"epoch": 10.133962374046247,
"grad_norm": 0.35869932174682617,
"learning_rate": 0.0004786748251748252,
"loss": 3.3294,
"step": 34800
},
{
"epoch": 10.148523501659968,
"grad_norm": 0.3446190655231476,
"learning_rate": 0.0004785,
"loss": 3.3318,
"step": 34850
},
{
"epoch": 10.16308462927369,
"grad_norm": 0.3328004777431488,
"learning_rate": 0.00047832517482517483,
"loss": 3.3373,
"step": 34900
},
{
"epoch": 10.177645756887413,
"grad_norm": 0.37485411763191223,
"learning_rate": 0.0004781503496503496,
"loss": 3.3202,
"step": 34950
},
{
"epoch": 10.192206884501136,
"grad_norm": 0.36680352687835693,
"learning_rate": 0.00047797552447552443,
"loss": 3.34,
"step": 35000
},
{
"epoch": 10.192206884501136,
"eval_accuracy": 0.3679084177883511,
"eval_loss": 3.5739078521728516,
"eval_runtime": 231.8779,
"eval_samples_per_second": 71.788,
"eval_steps_per_second": 4.489,
"step": 35000
},
{
"epoch": 10.206768012114859,
"grad_norm": 0.3383858799934387,
"learning_rate": 0.00047780069930069923,
"loss": 3.3325,
"step": 35050
},
{
"epoch": 10.221329139728581,
"grad_norm": 0.3608876168727875,
"learning_rate": 0.0004776258741258741,
"loss": 3.3455,
"step": 35100
},
{
"epoch": 10.235890267342302,
"grad_norm": 0.38920077681541443,
"learning_rate": 0.0004774510489510489,
"loss": 3.3508,
"step": 35150
},
{
"epoch": 10.250451394956025,
"grad_norm": 0.3486644923686981,
"learning_rate": 0.00047727622377622374,
"loss": 3.3526,
"step": 35200
},
{
"epoch": 10.265012522569748,
"grad_norm": 0.3848312199115753,
"learning_rate": 0.00047710139860139854,
"loss": 3.3389,
"step": 35250
},
{
"epoch": 10.27957365018347,
"grad_norm": 0.34103095531463623,
"learning_rate": 0.0004769265734265734,
"loss": 3.336,
"step": 35300
},
{
"epoch": 10.294134777797193,
"grad_norm": 0.3553299605846405,
"learning_rate": 0.0004767517482517482,
"loss": 3.3438,
"step": 35350
},
{
"epoch": 10.308695905410914,
"grad_norm": 0.3677619695663452,
"learning_rate": 0.00047657692307692304,
"loss": 3.3505,
"step": 35400
},
{
"epoch": 10.323257033024637,
"grad_norm": 0.3474290370941162,
"learning_rate": 0.0004764020979020979,
"loss": 3.3535,
"step": 35450
},
{
"epoch": 10.33781816063836,
"grad_norm": 0.35205620527267456,
"learning_rate": 0.0004762272727272727,
"loss": 3.3448,
"step": 35500
},
{
"epoch": 10.352379288252083,
"grad_norm": 0.3427030146121979,
"learning_rate": 0.00047605244755244755,
"loss": 3.3546,
"step": 35550
},
{
"epoch": 10.366940415865805,
"grad_norm": 0.36262500286102295,
"learning_rate": 0.00047587762237762235,
"loss": 3.3639,
"step": 35600
},
{
"epoch": 10.381501543479526,
"grad_norm": 0.37175849080085754,
"learning_rate": 0.0004757027972027972,
"loss": 3.3623,
"step": 35650
},
{
"epoch": 10.396062671093249,
"grad_norm": 0.32053637504577637,
"learning_rate": 0.00047552797202797195,
"loss": 3.3639,
"step": 35700
},
{
"epoch": 10.410623798706972,
"grad_norm": 0.3394221365451813,
"learning_rate": 0.0004753531468531468,
"loss": 3.3564,
"step": 35750
},
{
"epoch": 10.425184926320695,
"grad_norm": 0.33870938420295715,
"learning_rate": 0.0004751783216783216,
"loss": 3.3586,
"step": 35800
},
{
"epoch": 10.439746053934417,
"grad_norm": 0.3561233878135681,
"learning_rate": 0.00047500349650349646,
"loss": 3.3674,
"step": 35850
},
{
"epoch": 10.454307181548138,
"grad_norm": 0.3519926071166992,
"learning_rate": 0.00047482867132867126,
"loss": 3.3557,
"step": 35900
},
{
"epoch": 10.468868309161861,
"grad_norm": 0.33235129714012146,
"learning_rate": 0.0004746538461538461,
"loss": 3.3529,
"step": 35950
},
{
"epoch": 10.483429436775584,
"grad_norm": 0.3245657682418823,
"learning_rate": 0.0004744790209790209,
"loss": 3.3755,
"step": 36000
},
{
"epoch": 10.483429436775584,
"eval_accuracy": 0.3681230871094247,
"eval_loss": 3.5692830085754395,
"eval_runtime": 179.6694,
"eval_samples_per_second": 92.648,
"eval_steps_per_second": 5.794,
"step": 36000
},
{
"epoch": 10.497990564389307,
"grad_norm": 0.33548834919929504,
"learning_rate": 0.00047430419580419576,
"loss": 3.3667,
"step": 36050
},
{
"epoch": 10.51255169200303,
"grad_norm": 0.3279496133327484,
"learning_rate": 0.0004741293706293706,
"loss": 3.3668,
"step": 36100
},
{
"epoch": 10.52711281961675,
"grad_norm": 0.3511890470981598,
"learning_rate": 0.0004739545454545454,
"loss": 3.3761,
"step": 36150
},
{
"epoch": 10.541673947230473,
"grad_norm": 0.34296634793281555,
"learning_rate": 0.00047377972027972027,
"loss": 3.3659,
"step": 36200
},
{
"epoch": 10.556235074844196,
"grad_norm": 0.39123594760894775,
"learning_rate": 0.00047360489510489507,
"loss": 3.3614,
"step": 36250
},
{
"epoch": 10.570796202457919,
"grad_norm": 0.3517671227455139,
"learning_rate": 0.0004734300699300699,
"loss": 3.3887,
"step": 36300
},
{
"epoch": 10.585357330071641,
"grad_norm": 0.31128349900245667,
"learning_rate": 0.0004732552447552447,
"loss": 3.3766,
"step": 36350
},
{
"epoch": 10.599918457685362,
"grad_norm": 0.3317906856536865,
"learning_rate": 0.0004730804195804196,
"loss": 3.3748,
"step": 36400
},
{
"epoch": 10.614479585299085,
"grad_norm": 0.33843693137168884,
"learning_rate": 0.0004729055944055943,
"loss": 3.3758,
"step": 36450
},
{
"epoch": 10.629040712912808,
"grad_norm": 0.3605981767177582,
"learning_rate": 0.0004727307692307692,
"loss": 3.3822,
"step": 36500
},
{
"epoch": 10.64360184052653,
"grad_norm": 0.36126452684402466,
"learning_rate": 0.000472555944055944,
"loss": 3.3727,
"step": 36550
},
{
"epoch": 10.658162968140253,
"grad_norm": 0.3546067476272583,
"learning_rate": 0.00047238111888111883,
"loss": 3.365,
"step": 36600
},
{
"epoch": 10.672724095753976,
"grad_norm": 0.322816401720047,
"learning_rate": 0.00047220629370629363,
"loss": 3.3828,
"step": 36650
},
{
"epoch": 10.687285223367697,
"grad_norm": 0.3517470061779022,
"learning_rate": 0.0004720314685314685,
"loss": 3.365,
"step": 36700
},
{
"epoch": 10.70184635098142,
"grad_norm": 0.3618316948413849,
"learning_rate": 0.0004718566433566433,
"loss": 3.3713,
"step": 36750
},
{
"epoch": 10.716407478595142,
"grad_norm": 0.3243914544582367,
"learning_rate": 0.00047168181818181814,
"loss": 3.3811,
"step": 36800
},
{
"epoch": 10.730968606208865,
"grad_norm": 0.3605787456035614,
"learning_rate": 0.000471506993006993,
"loss": 3.3727,
"step": 36850
},
{
"epoch": 10.745529733822588,
"grad_norm": 0.3196176588535309,
"learning_rate": 0.0004713321678321678,
"loss": 3.3806,
"step": 36900
},
{
"epoch": 10.760090861436309,
"grad_norm": 0.35002508759498596,
"learning_rate": 0.00047115734265734265,
"loss": 3.4017,
"step": 36950
},
{
"epoch": 10.774651989050032,
"grad_norm": 0.35589703917503357,
"learning_rate": 0.00047098251748251745,
"loss": 3.3982,
"step": 37000
},
{
"epoch": 10.774651989050032,
"eval_accuracy": 0.3692364050013014,
"eval_loss": 3.5608346462249756,
"eval_runtime": 179.5409,
"eval_samples_per_second": 92.714,
"eval_steps_per_second": 5.798,
"step": 37000
},
{
"epoch": 10.789213116663754,
"grad_norm": 0.34006816148757935,
"learning_rate": 0.0004708076923076923,
"loss": 3.3882,
"step": 37050
},
{
"epoch": 10.803774244277477,
"grad_norm": 0.34267741441726685,
"learning_rate": 0.0004706328671328671,
"loss": 3.3936,
"step": 37100
},
{
"epoch": 10.8183353718912,
"grad_norm": 0.3394986391067505,
"learning_rate": 0.00047045804195804195,
"loss": 3.3949,
"step": 37150
},
{
"epoch": 10.83289649950492,
"grad_norm": 0.35784146189689636,
"learning_rate": 0.0004702832167832167,
"loss": 3.386,
"step": 37200
},
{
"epoch": 10.847457627118644,
"grad_norm": 0.3268081247806549,
"learning_rate": 0.00047010839160839155,
"loss": 3.3931,
"step": 37250
},
{
"epoch": 10.862018754732366,
"grad_norm": 0.34974247217178345,
"learning_rate": 0.00046993356643356635,
"loss": 3.3942,
"step": 37300
},
{
"epoch": 10.876579882346089,
"grad_norm": 0.3425455093383789,
"learning_rate": 0.0004697587412587412,
"loss": 3.3862,
"step": 37350
},
{
"epoch": 10.891141009959812,
"grad_norm": 0.32915806770324707,
"learning_rate": 0.000469583916083916,
"loss": 3.3955,
"step": 37400
},
{
"epoch": 10.905702137573535,
"grad_norm": 0.33617880940437317,
"learning_rate": 0.00046940909090909086,
"loss": 3.3878,
"step": 37450
},
{
"epoch": 10.920263265187256,
"grad_norm": 0.3757038414478302,
"learning_rate": 0.0004692342657342657,
"loss": 3.3805,
"step": 37500
},
{
"epoch": 10.934824392800978,
"grad_norm": 0.34372201561927795,
"learning_rate": 0.0004690594405594405,
"loss": 3.387,
"step": 37550
},
{
"epoch": 10.949385520414701,
"grad_norm": 0.33192873001098633,
"learning_rate": 0.00046888461538461537,
"loss": 3.3873,
"step": 37600
},
{
"epoch": 10.963946648028424,
"grad_norm": 0.38427871465682983,
"learning_rate": 0.00046870979020979017,
"loss": 3.3847,
"step": 37650
},
{
"epoch": 10.978507775642147,
"grad_norm": 0.3522765338420868,
"learning_rate": 0.000468534965034965,
"loss": 3.3893,
"step": 37700
},
{
"epoch": 10.993068903255867,
"grad_norm": 0.34591537714004517,
"learning_rate": 0.0004683601398601398,
"loss": 3.3719,
"step": 37750
},
{
"epoch": 11.007571786359136,
"grad_norm": 0.3357653319835663,
"learning_rate": 0.0004681853146853147,
"loss": 3.3227,
"step": 37800
},
{
"epoch": 11.022132913972857,
"grad_norm": 0.3487904369831085,
"learning_rate": 0.0004680104895104895,
"loss": 3.2621,
"step": 37850
},
{
"epoch": 11.03669404158658,
"grad_norm": 0.34650513529777527,
"learning_rate": 0.00046783566433566433,
"loss": 3.2841,
"step": 37900
},
{
"epoch": 11.051255169200303,
"grad_norm": 0.3532591760158539,
"learning_rate": 0.0004676608391608391,
"loss": 3.3019,
"step": 37950
},
{
"epoch": 11.065816296814026,
"grad_norm": 0.3463428020477295,
"learning_rate": 0.00046748601398601393,
"loss": 3.2846,
"step": 38000
},
{
"epoch": 11.065816296814026,
"eval_accuracy": 0.3690252625584492,
"eval_loss": 3.5694565773010254,
"eval_runtime": 179.6506,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.795,
"step": 38000
},
{
"epoch": 11.080377424427748,
"grad_norm": 0.34090572595596313,
"learning_rate": 0.00046731118881118873,
"loss": 3.2824,
"step": 38050
},
{
"epoch": 11.09493855204147,
"grad_norm": 0.342952162027359,
"learning_rate": 0.0004671363636363636,
"loss": 3.2853,
"step": 38100
},
{
"epoch": 11.109499679655192,
"grad_norm": 0.34960615634918213,
"learning_rate": 0.00046696153846153844,
"loss": 3.306,
"step": 38150
},
{
"epoch": 11.124060807268915,
"grad_norm": 0.3609829545021057,
"learning_rate": 0.00046678671328671324,
"loss": 3.3002,
"step": 38200
},
{
"epoch": 11.138621934882638,
"grad_norm": 0.36214137077331543,
"learning_rate": 0.0004666118881118881,
"loss": 3.301,
"step": 38250
},
{
"epoch": 11.15318306249636,
"grad_norm": 0.34393003582954407,
"learning_rate": 0.0004664370629370629,
"loss": 3.2942,
"step": 38300
},
{
"epoch": 11.167744190110081,
"grad_norm": 0.34303128719329834,
"learning_rate": 0.00046626223776223774,
"loss": 3.3214,
"step": 38350
},
{
"epoch": 11.182305317723804,
"grad_norm": 0.328708291053772,
"learning_rate": 0.00046608741258741254,
"loss": 3.3219,
"step": 38400
},
{
"epoch": 11.196866445337527,
"grad_norm": 0.3446009159088135,
"learning_rate": 0.0004659125874125874,
"loss": 3.3167,
"step": 38450
},
{
"epoch": 11.21142757295125,
"grad_norm": 0.34211987257003784,
"learning_rate": 0.0004657377622377622,
"loss": 3.3174,
"step": 38500
},
{
"epoch": 11.225988700564972,
"grad_norm": 0.3242914378643036,
"learning_rate": 0.00046556293706293705,
"loss": 3.3241,
"step": 38550
},
{
"epoch": 11.240549828178693,
"grad_norm": 0.3906053304672241,
"learning_rate": 0.00046538811188811185,
"loss": 3.3242,
"step": 38600
},
{
"epoch": 11.255110955792416,
"grad_norm": 0.35929811000823975,
"learning_rate": 0.0004652132867132867,
"loss": 3.3162,
"step": 38650
},
{
"epoch": 11.269672083406139,
"grad_norm": 0.362967312335968,
"learning_rate": 0.00046503846153846145,
"loss": 3.3231,
"step": 38700
},
{
"epoch": 11.284233211019862,
"grad_norm": 0.3617202639579773,
"learning_rate": 0.0004648636363636363,
"loss": 3.3383,
"step": 38750
},
{
"epoch": 11.298794338633584,
"grad_norm": 0.3506316840648651,
"learning_rate": 0.0004646888111888111,
"loss": 3.3331,
"step": 38800
},
{
"epoch": 11.313355466247307,
"grad_norm": 0.35573744773864746,
"learning_rate": 0.00046451398601398596,
"loss": 3.3353,
"step": 38850
},
{
"epoch": 11.327916593861028,
"grad_norm": 0.3686698377132416,
"learning_rate": 0.0004643391608391608,
"loss": 3.331,
"step": 38900
},
{
"epoch": 11.34247772147475,
"grad_norm": 0.3425818681716919,
"learning_rate": 0.0004641643356643356,
"loss": 3.3356,
"step": 38950
},
{
"epoch": 11.357038849088473,
"grad_norm": 0.3510589003562927,
"learning_rate": 0.00046398951048951046,
"loss": 3.3348,
"step": 39000
},
{
"epoch": 11.357038849088473,
"eval_accuracy": 0.3690062174160538,
"eval_loss": 3.5675673484802246,
"eval_runtime": 179.7196,
"eval_samples_per_second": 92.622,
"eval_steps_per_second": 5.792,
"step": 39000
},
{
"epoch": 11.371599976702196,
"grad_norm": 0.33674928545951843,
"learning_rate": 0.00046381468531468526,
"loss": 3.3398,
"step": 39050
},
{
"epoch": 11.386161104315919,
"grad_norm": 0.3514574468135834,
"learning_rate": 0.0004636398601398601,
"loss": 3.3288,
"step": 39100
},
{
"epoch": 11.40072223192964,
"grad_norm": 0.3495193123817444,
"learning_rate": 0.0004634650349650349,
"loss": 3.3363,
"step": 39150
},
{
"epoch": 11.415283359543363,
"grad_norm": 0.32252052426338196,
"learning_rate": 0.00046329020979020977,
"loss": 3.339,
"step": 39200
},
{
"epoch": 11.429844487157085,
"grad_norm": 0.35050782561302185,
"learning_rate": 0.00046311538461538457,
"loss": 3.3347,
"step": 39250
},
{
"epoch": 11.444405614770808,
"grad_norm": 0.3583682179450989,
"learning_rate": 0.0004629405594405594,
"loss": 3.3401,
"step": 39300
},
{
"epoch": 11.458966742384531,
"grad_norm": 0.34238868951797485,
"learning_rate": 0.0004627657342657342,
"loss": 3.3355,
"step": 39350
},
{
"epoch": 11.473527869998252,
"grad_norm": 0.37667524814605713,
"learning_rate": 0.0004625909090909091,
"loss": 3.3388,
"step": 39400
},
{
"epoch": 11.488088997611975,
"grad_norm": 0.3632262349128723,
"learning_rate": 0.0004624160839160838,
"loss": 3.3425,
"step": 39450
},
{
"epoch": 11.502650125225697,
"grad_norm": 0.35607948899269104,
"learning_rate": 0.0004622412587412587,
"loss": 3.351,
"step": 39500
},
{
"epoch": 11.51721125283942,
"grad_norm": 0.37387025356292725,
"learning_rate": 0.00046206643356643353,
"loss": 3.3431,
"step": 39550
},
{
"epoch": 11.531772380453143,
"grad_norm": 0.33748266100883484,
"learning_rate": 0.00046189160839160833,
"loss": 3.3548,
"step": 39600
},
{
"epoch": 11.546333508066864,
"grad_norm": 0.35886332392692566,
"learning_rate": 0.0004617167832167832,
"loss": 3.3526,
"step": 39650
},
{
"epoch": 11.560894635680587,
"grad_norm": 0.34527456760406494,
"learning_rate": 0.000461541958041958,
"loss": 3.3471,
"step": 39700
},
{
"epoch": 11.57545576329431,
"grad_norm": 0.3368580639362335,
"learning_rate": 0.00046136713286713284,
"loss": 3.355,
"step": 39750
},
{
"epoch": 11.590016890908032,
"grad_norm": 0.3541625738143921,
"learning_rate": 0.00046119230769230764,
"loss": 3.3553,
"step": 39800
},
{
"epoch": 11.604578018521755,
"grad_norm": 0.3746795058250427,
"learning_rate": 0.0004610174825174825,
"loss": 3.3639,
"step": 39850
},
{
"epoch": 11.619139146135478,
"grad_norm": 0.3315316438674927,
"learning_rate": 0.0004608426573426573,
"loss": 3.3647,
"step": 39900
},
{
"epoch": 11.633700273749199,
"grad_norm": 0.3384944796562195,
"learning_rate": 0.00046066783216783215,
"loss": 3.3451,
"step": 39950
},
{
"epoch": 11.648261401362921,
"grad_norm": 0.3402671813964844,
"learning_rate": 0.00046049300699300695,
"loss": 3.3578,
"step": 40000
},
{
"epoch": 11.648261401362921,
"eval_accuracy": 0.3696004963963534,
"eval_loss": 3.5571908950805664,
"eval_runtime": 179.721,
"eval_samples_per_second": 92.621,
"eval_steps_per_second": 5.792,
"step": 40000
},
{
"epoch": 11.662822528976644,
"grad_norm": 0.3571811020374298,
"learning_rate": 0.0004603181818181818,
"loss": 3.3727,
"step": 40050
},
{
"epoch": 11.677383656590367,
"grad_norm": 0.3699859082698822,
"learning_rate": 0.0004601433566433566,
"loss": 3.3617,
"step": 40100
},
{
"epoch": 11.69194478420409,
"grad_norm": 0.3565984070301056,
"learning_rate": 0.00045996853146853145,
"loss": 3.3641,
"step": 40150
},
{
"epoch": 11.70650591181781,
"grad_norm": 0.33900022506713867,
"learning_rate": 0.0004597937062937062,
"loss": 3.3517,
"step": 40200
},
{
"epoch": 11.721067039431533,
"grad_norm": 0.3517002463340759,
"learning_rate": 0.00045961888111888105,
"loss": 3.3626,
"step": 40250
},
{
"epoch": 11.735628167045256,
"grad_norm": 0.3393325209617615,
"learning_rate": 0.0004594440559440559,
"loss": 3.3688,
"step": 40300
},
{
"epoch": 11.750189294658979,
"grad_norm": 0.35617125034332275,
"learning_rate": 0.0004592692307692307,
"loss": 3.3727,
"step": 40350
},
{
"epoch": 11.764750422272702,
"grad_norm": 0.3454066514968872,
"learning_rate": 0.00045909440559440556,
"loss": 3.3805,
"step": 40400
},
{
"epoch": 11.779311549886422,
"grad_norm": 0.3658194839954376,
"learning_rate": 0.00045891958041958036,
"loss": 3.3681,
"step": 40450
},
{
"epoch": 11.793872677500145,
"grad_norm": 0.3409833312034607,
"learning_rate": 0.0004587447552447552,
"loss": 3.3682,
"step": 40500
},
{
"epoch": 11.808433805113868,
"grad_norm": 0.33058348298072815,
"learning_rate": 0.00045856993006993,
"loss": 3.3632,
"step": 40550
},
{
"epoch": 11.82299493272759,
"grad_norm": 0.33089274168014526,
"learning_rate": 0.00045839510489510487,
"loss": 3.368,
"step": 40600
},
{
"epoch": 11.837556060341313,
"grad_norm": 0.3300808072090149,
"learning_rate": 0.00045822027972027967,
"loss": 3.3671,
"step": 40650
},
{
"epoch": 11.852117187955034,
"grad_norm": 0.3723061978816986,
"learning_rate": 0.0004580454545454545,
"loss": 3.3466,
"step": 40700
},
{
"epoch": 11.866678315568757,
"grad_norm": 0.34598809480667114,
"learning_rate": 0.0004578706293706293,
"loss": 3.3639,
"step": 40750
},
{
"epoch": 11.88123944318248,
"grad_norm": 0.37761250138282776,
"learning_rate": 0.0004576958041958042,
"loss": 3.3698,
"step": 40800
},
{
"epoch": 11.895800570796203,
"grad_norm": 0.3515342175960541,
"learning_rate": 0.000457520979020979,
"loss": 3.3629,
"step": 40850
},
{
"epoch": 11.910361698409925,
"grad_norm": 0.3487488031387329,
"learning_rate": 0.00045734615384615383,
"loss": 3.3621,
"step": 40900
},
{
"epoch": 11.924922826023646,
"grad_norm": 0.34570902585983276,
"learning_rate": 0.0004571713286713287,
"loss": 3.366,
"step": 40950
},
{
"epoch": 11.93948395363737,
"grad_norm": 0.36092135310173035,
"learning_rate": 0.00045699650349650343,
"loss": 3.3609,
"step": 41000
},
{
"epoch": 11.93948395363737,
"eval_accuracy": 0.37018983774714304,
"eval_loss": 3.549649238586426,
"eval_runtime": 179.668,
"eval_samples_per_second": 92.649,
"eval_steps_per_second": 5.794,
"step": 41000
},
{
"epoch": 11.954045081251092,
"grad_norm": 0.3425419330596924,
"learning_rate": 0.0004568216783216783,
"loss": 3.3825,
"step": 41050
},
{
"epoch": 11.968606208864815,
"grad_norm": 0.33592498302459717,
"learning_rate": 0.0004566468531468531,
"loss": 3.3594,
"step": 41100
},
{
"epoch": 11.983167336478537,
"grad_norm": 0.36570852994918823,
"learning_rate": 0.00045647202797202794,
"loss": 3.3751,
"step": 41150
},
{
"epoch": 11.99772846409226,
"grad_norm": 0.3489970564842224,
"learning_rate": 0.00045629720279720274,
"loss": 3.3637,
"step": 41200
},
{
"epoch": 12.012231347195527,
"grad_norm": 0.3311231732368469,
"learning_rate": 0.0004561223776223776,
"loss": 3.2709,
"step": 41250
},
{
"epoch": 12.02679247480925,
"grad_norm": 0.3381282091140747,
"learning_rate": 0.0004559475524475524,
"loss": 3.2648,
"step": 41300
},
{
"epoch": 12.041353602422971,
"grad_norm": 0.35453909635543823,
"learning_rate": 0.00045577272727272724,
"loss": 3.2673,
"step": 41350
},
{
"epoch": 12.055914730036694,
"grad_norm": 0.3546675443649292,
"learning_rate": 0.00045559790209790204,
"loss": 3.2658,
"step": 41400
},
{
"epoch": 12.070475857650417,
"grad_norm": 0.34255877137184143,
"learning_rate": 0.0004554230769230769,
"loss": 3.2736,
"step": 41450
},
{
"epoch": 12.08503698526414,
"grad_norm": 0.349418967962265,
"learning_rate": 0.0004552482517482517,
"loss": 3.2715,
"step": 41500
},
{
"epoch": 12.099598112877862,
"grad_norm": 0.35405322909355164,
"learning_rate": 0.00045507342657342655,
"loss": 3.2731,
"step": 41550
},
{
"epoch": 12.114159240491583,
"grad_norm": 0.3374113142490387,
"learning_rate": 0.00045489860139860135,
"loss": 3.2743,
"step": 41600
},
{
"epoch": 12.128720368105306,
"grad_norm": 0.3446289896965027,
"learning_rate": 0.0004547237762237762,
"loss": 3.2669,
"step": 41650
},
{
"epoch": 12.143281495719028,
"grad_norm": 0.3512953519821167,
"learning_rate": 0.00045454895104895106,
"loss": 3.285,
"step": 41700
},
{
"epoch": 12.157842623332751,
"grad_norm": 0.3549095094203949,
"learning_rate": 0.0004543741258741258,
"loss": 3.2933,
"step": 41750
},
{
"epoch": 12.172403750946474,
"grad_norm": 0.339769572019577,
"learning_rate": 0.00045419930069930066,
"loss": 3.2875,
"step": 41800
},
{
"epoch": 12.186964878560195,
"grad_norm": 0.3452531397342682,
"learning_rate": 0.00045402447552447546,
"loss": 3.302,
"step": 41850
},
{
"epoch": 12.201526006173918,
"grad_norm": 0.3680498003959656,
"learning_rate": 0.0004538496503496503,
"loss": 3.2993,
"step": 41900
},
{
"epoch": 12.21608713378764,
"grad_norm": 0.34865322709083557,
"learning_rate": 0.0004536748251748251,
"loss": 3.3167,
"step": 41950
},
{
"epoch": 12.230648261401363,
"grad_norm": 0.3419394791126251,
"learning_rate": 0.00045349999999999996,
"loss": 3.2985,
"step": 42000
},
{
"epoch": 12.230648261401363,
"eval_accuracy": 0.3693504407304588,
"eval_loss": 3.568716287612915,
"eval_runtime": 179.8504,
"eval_samples_per_second": 92.555,
"eval_steps_per_second": 5.788,
"step": 42000
},
{
"epoch": 12.245209389015086,
"grad_norm": 0.360034704208374,
"learning_rate": 0.00045332517482517476,
"loss": 3.301,
"step": 42050
},
{
"epoch": 12.259770516628807,
"grad_norm": 0.38103193044662476,
"learning_rate": 0.0004531503496503496,
"loss": 3.305,
"step": 42100
},
{
"epoch": 12.27433164424253,
"grad_norm": 0.3412448465824127,
"learning_rate": 0.0004529755244755244,
"loss": 3.3242,
"step": 42150
},
{
"epoch": 12.288892771856252,
"grad_norm": 0.3405698835849762,
"learning_rate": 0.00045280069930069927,
"loss": 3.3107,
"step": 42200
},
{
"epoch": 12.303453899469975,
"grad_norm": 0.3409527838230133,
"learning_rate": 0.00045262587412587407,
"loss": 3.2999,
"step": 42250
},
{
"epoch": 12.318015027083698,
"grad_norm": 0.38172680139541626,
"learning_rate": 0.0004524510489510489,
"loss": 3.3119,
"step": 42300
},
{
"epoch": 12.33257615469742,
"grad_norm": 0.3319685757160187,
"learning_rate": 0.0004522762237762238,
"loss": 3.3068,
"step": 42350
},
{
"epoch": 12.347137282311142,
"grad_norm": 0.35366857051849365,
"learning_rate": 0.0004521013986013986,
"loss": 3.3112,
"step": 42400
},
{
"epoch": 12.361698409924864,
"grad_norm": 0.3479348421096802,
"learning_rate": 0.00045192657342657343,
"loss": 3.3216,
"step": 42450
},
{
"epoch": 12.376259537538587,
"grad_norm": 0.345636248588562,
"learning_rate": 0.0004517517482517482,
"loss": 3.3199,
"step": 42500
},
{
"epoch": 12.39082066515231,
"grad_norm": 0.35920679569244385,
"learning_rate": 0.00045157692307692303,
"loss": 3.3191,
"step": 42550
},
{
"epoch": 12.405381792766033,
"grad_norm": 0.34712013602256775,
"learning_rate": 0.00045140209790209783,
"loss": 3.3194,
"step": 42600
},
{
"epoch": 12.419942920379754,
"grad_norm": 0.3547152876853943,
"learning_rate": 0.0004512272727272727,
"loss": 3.3144,
"step": 42650
},
{
"epoch": 12.434504047993476,
"grad_norm": 0.3760581910610199,
"learning_rate": 0.0004510524475524475,
"loss": 3.3071,
"step": 42700
},
{
"epoch": 12.449065175607199,
"grad_norm": 0.33818721771240234,
"learning_rate": 0.00045087762237762234,
"loss": 3.3231,
"step": 42750
},
{
"epoch": 12.463626303220922,
"grad_norm": 0.3385844826698303,
"learning_rate": 0.00045070279720279714,
"loss": 3.318,
"step": 42800
},
{
"epoch": 12.478187430834645,
"grad_norm": 0.3650315999984741,
"learning_rate": 0.000450527972027972,
"loss": 3.3269,
"step": 42850
},
{
"epoch": 12.492748558448366,
"grad_norm": 0.3568150997161865,
"learning_rate": 0.0004503531468531468,
"loss": 3.3456,
"step": 42900
},
{
"epoch": 12.507309686062088,
"grad_norm": 0.3506946861743927,
"learning_rate": 0.00045017832167832165,
"loss": 3.3289,
"step": 42950
},
{
"epoch": 12.521870813675811,
"grad_norm": 0.35339799523353577,
"learning_rate": 0.0004500034965034965,
"loss": 3.3214,
"step": 43000
},
{
"epoch": 12.521870813675811,
"eval_accuracy": 0.37011318692713213,
"eval_loss": 3.5573697090148926,
"eval_runtime": 181.2705,
"eval_samples_per_second": 91.83,
"eval_steps_per_second": 5.743,
"step": 43000
},
{
"epoch": 12.536431941289534,
"grad_norm": 0.39414504170417786,
"learning_rate": 0.0004498286713286713,
"loss": 3.3347,
"step": 43050
},
{
"epoch": 12.550993068903256,
"grad_norm": 0.3470751941204071,
"learning_rate": 0.00044965384615384615,
"loss": 3.337,
"step": 43100
},
{
"epoch": 12.565554196516977,
"grad_norm": 0.3801160156726837,
"learning_rate": 0.00044947902097902095,
"loss": 3.336,
"step": 43150
},
{
"epoch": 12.5801153241307,
"grad_norm": 0.3630496561527252,
"learning_rate": 0.0004493041958041958,
"loss": 3.3382,
"step": 43200
},
{
"epoch": 12.594676451744423,
"grad_norm": 0.35550111532211304,
"learning_rate": 0.00044912937062937055,
"loss": 3.3348,
"step": 43250
},
{
"epoch": 12.609237579358146,
"grad_norm": 0.4416738748550415,
"learning_rate": 0.0004489545454545454,
"loss": 3.3484,
"step": 43300
},
{
"epoch": 12.623798706971868,
"grad_norm": 0.32709190249443054,
"learning_rate": 0.0004487797202797202,
"loss": 3.3303,
"step": 43350
},
{
"epoch": 12.63835983458559,
"grad_norm": 0.3459291458129883,
"learning_rate": 0.00044860489510489506,
"loss": 3.3335,
"step": 43400
},
{
"epoch": 12.652920962199312,
"grad_norm": 0.3465927243232727,
"learning_rate": 0.00044843006993006986,
"loss": 3.3361,
"step": 43450
},
{
"epoch": 12.667482089813035,
"grad_norm": 0.3654097616672516,
"learning_rate": 0.0004482552447552447,
"loss": 3.3372,
"step": 43500
},
{
"epoch": 12.682043217426758,
"grad_norm": 0.38142114877700806,
"learning_rate": 0.0004480804195804195,
"loss": 3.3387,
"step": 43550
},
{
"epoch": 12.69660434504048,
"grad_norm": 0.3935697674751282,
"learning_rate": 0.00044790559440559437,
"loss": 3.3418,
"step": 43600
},
{
"epoch": 12.711165472654203,
"grad_norm": 0.34622472524642944,
"learning_rate": 0.00044773076923076917,
"loss": 3.3506,
"step": 43650
},
{
"epoch": 12.725726600267924,
"grad_norm": 0.32980939745903015,
"learning_rate": 0.000447555944055944,
"loss": 3.3469,
"step": 43700
},
{
"epoch": 12.740287727881647,
"grad_norm": 0.33688464760780334,
"learning_rate": 0.0004473811188811189,
"loss": 3.3566,
"step": 43750
},
{
"epoch": 12.75484885549537,
"grad_norm": 0.34542152285575867,
"learning_rate": 0.0004472062937062937,
"loss": 3.3478,
"step": 43800
},
{
"epoch": 12.769409983109092,
"grad_norm": 0.34685632586479187,
"learning_rate": 0.00044703146853146853,
"loss": 3.3384,
"step": 43850
},
{
"epoch": 12.783971110722815,
"grad_norm": 0.3635971248149872,
"learning_rate": 0.00044685664335664333,
"loss": 3.3546,
"step": 43900
},
{
"epoch": 12.798532238336536,
"grad_norm": 0.3320182263851166,
"learning_rate": 0.0004466818181818182,
"loss": 3.3477,
"step": 43950
},
{
"epoch": 12.813093365950259,
"grad_norm": 0.3641332983970642,
"learning_rate": 0.00044650699300699293,
"loss": 3.362,
"step": 44000
},
{
"epoch": 12.813093365950259,
"eval_accuracy": 0.3707261583620049,
"eval_loss": 3.5470356941223145,
"eval_runtime": 179.6682,
"eval_samples_per_second": 92.649,
"eval_steps_per_second": 5.794,
"step": 44000
},
{
"epoch": 12.827654493563982,
"grad_norm": 0.38189804553985596,
"learning_rate": 0.0004463321678321678,
"loss": 3.3465,
"step": 44050
},
{
"epoch": 12.842215621177704,
"grad_norm": 0.35613173246383667,
"learning_rate": 0.0004461573426573426,
"loss": 3.3421,
"step": 44100
},
{
"epoch": 12.856776748791427,
"grad_norm": 0.3412075936794281,
"learning_rate": 0.00044598251748251744,
"loss": 3.3595,
"step": 44150
},
{
"epoch": 12.871337876405148,
"grad_norm": 0.36046281456947327,
"learning_rate": 0.00044580769230769224,
"loss": 3.3585,
"step": 44200
},
{
"epoch": 12.88589900401887,
"grad_norm": 0.32819634675979614,
"learning_rate": 0.0004456328671328671,
"loss": 3.3489,
"step": 44250
},
{
"epoch": 12.900460131632594,
"grad_norm": 0.33128878474235535,
"learning_rate": 0.0004454580419580419,
"loss": 3.3635,
"step": 44300
},
{
"epoch": 12.915021259246316,
"grad_norm": 0.33165040612220764,
"learning_rate": 0.00044528321678321674,
"loss": 3.3472,
"step": 44350
},
{
"epoch": 12.929582386860039,
"grad_norm": 0.3606609106063843,
"learning_rate": 0.0004451083916083916,
"loss": 3.3523,
"step": 44400
},
{
"epoch": 12.944143514473762,
"grad_norm": 0.3361433148384094,
"learning_rate": 0.0004449335664335664,
"loss": 3.343,
"step": 44450
},
{
"epoch": 12.958704642087483,
"grad_norm": 0.3337194621562958,
"learning_rate": 0.00044475874125874125,
"loss": 3.3552,
"step": 44500
},
{
"epoch": 12.973265769701205,
"grad_norm": 0.3324238955974579,
"learning_rate": 0.00044458391608391605,
"loss": 3.3403,
"step": 44550
},
{
"epoch": 12.987826897314928,
"grad_norm": 0.386767715215683,
"learning_rate": 0.0004444090909090909,
"loss": 3.3609,
"step": 44600
},
{
"epoch": 13.002329780418195,
"grad_norm": 0.36827847361564636,
"learning_rate": 0.0004442342657342657,
"loss": 3.3221,
"step": 44650
},
{
"epoch": 13.016890908031918,
"grad_norm": 0.3624404966831207,
"learning_rate": 0.00044405944055944056,
"loss": 3.2319,
"step": 44700
},
{
"epoch": 13.031452035645641,
"grad_norm": 0.3504217565059662,
"learning_rate": 0.0004438846153846153,
"loss": 3.2535,
"step": 44750
},
{
"epoch": 13.046013163259364,
"grad_norm": 0.331437349319458,
"learning_rate": 0.00044370979020979016,
"loss": 3.2517,
"step": 44800
},
{
"epoch": 13.060574290873085,
"grad_norm": 0.35558682680130005,
"learning_rate": 0.00044353496503496496,
"loss": 3.2519,
"step": 44850
},
{
"epoch": 13.075135418486807,
"grad_norm": 0.3366740643978119,
"learning_rate": 0.0004433601398601398,
"loss": 3.2485,
"step": 44900
},
{
"epoch": 13.08969654610053,
"grad_norm": 0.3887079358100891,
"learning_rate": 0.0004431853146853146,
"loss": 3.2614,
"step": 44950
},
{
"epoch": 13.104257673714253,
"grad_norm": 0.38102632761001587,
"learning_rate": 0.00044301048951048946,
"loss": 3.2706,
"step": 45000
},
{
"epoch": 13.104257673714253,
"eval_accuracy": 0.3700127884604307,
"eval_loss": 3.5635054111480713,
"eval_runtime": 179.6934,
"eval_samples_per_second": 92.636,
"eval_steps_per_second": 5.793,
"step": 45000
},
{
"epoch": 13.118818801327976,
"grad_norm": 0.4008774757385254,
"learning_rate": 0.00044283566433566426,
"loss": 3.2684,
"step": 45050
},
{
"epoch": 13.133379928941697,
"grad_norm": 0.38614171743392944,
"learning_rate": 0.0004426608391608391,
"loss": 3.2609,
"step": 45100
},
{
"epoch": 13.14794105655542,
"grad_norm": 0.3764742314815521,
"learning_rate": 0.00044248601398601397,
"loss": 3.2744,
"step": 45150
},
{
"epoch": 13.162502184169142,
"grad_norm": 0.351336807012558,
"learning_rate": 0.00044231118881118877,
"loss": 3.2772,
"step": 45200
},
{
"epoch": 13.177063311782865,
"grad_norm": 0.35065558552742004,
"learning_rate": 0.0004421363636363636,
"loss": 3.2886,
"step": 45250
},
{
"epoch": 13.191624439396588,
"grad_norm": 0.3613697588443756,
"learning_rate": 0.0004419615384615384,
"loss": 3.274,
"step": 45300
},
{
"epoch": 13.206185567010309,
"grad_norm": 0.3541945219039917,
"learning_rate": 0.0004417867132867133,
"loss": 3.2891,
"step": 45350
},
{
"epoch": 13.220746694624031,
"grad_norm": 0.3597337007522583,
"learning_rate": 0.0004416118881118881,
"loss": 3.2757,
"step": 45400
},
{
"epoch": 13.235307822237754,
"grad_norm": 0.3505752384662628,
"learning_rate": 0.00044143706293706293,
"loss": 3.2805,
"step": 45450
},
{
"epoch": 13.249868949851477,
"grad_norm": 0.3716984987258911,
"learning_rate": 0.0004412622377622377,
"loss": 3.2992,
"step": 45500
},
{
"epoch": 13.2644300774652,
"grad_norm": 0.35030633211135864,
"learning_rate": 0.00044108741258741253,
"loss": 3.2865,
"step": 45550
},
{
"epoch": 13.27899120507892,
"grad_norm": 0.3532033860683441,
"learning_rate": 0.00044091258741258733,
"loss": 3.2949,
"step": 45600
},
{
"epoch": 13.293552332692643,
"grad_norm": 0.35529381036758423,
"learning_rate": 0.0004407377622377622,
"loss": 3.2903,
"step": 45650
},
{
"epoch": 13.308113460306366,
"grad_norm": 0.3565613925457001,
"learning_rate": 0.000440562937062937,
"loss": 3.295,
"step": 45700
},
{
"epoch": 13.322674587920089,
"grad_norm": 0.3754475712776184,
"learning_rate": 0.00044038811188811184,
"loss": 3.2902,
"step": 45750
},
{
"epoch": 13.337235715533811,
"grad_norm": 0.3901439309120178,
"learning_rate": 0.0004402132867132867,
"loss": 3.2799,
"step": 45800
},
{
"epoch": 13.351796843147532,
"grad_norm": 0.36209163069725037,
"learning_rate": 0.0004400384615384615,
"loss": 3.2899,
"step": 45850
},
{
"epoch": 13.366357970761255,
"grad_norm": 0.3730039596557617,
"learning_rate": 0.00043986363636363635,
"loss": 3.2934,
"step": 45900
},
{
"epoch": 13.380919098374978,
"grad_norm": 0.3754393458366394,
"learning_rate": 0.00043968881118881115,
"loss": 3.3128,
"step": 45950
},
{
"epoch": 13.3954802259887,
"grad_norm": 0.3539136052131653,
"learning_rate": 0.000439513986013986,
"loss": 3.3045,
"step": 46000
},
{
"epoch": 13.3954802259887,
"eval_accuracy": 0.3705192481730183,
"eval_loss": 3.5570271015167236,
"eval_runtime": 179.814,
"eval_samples_per_second": 92.573,
"eval_steps_per_second": 5.789,
"step": 46000
},
{
"epoch": 13.410041353602423,
"grad_norm": 0.37800097465515137,
"learning_rate": 0.0004393391608391608,
"loss": 3.3057,
"step": 46050
},
{
"epoch": 13.424602481216146,
"grad_norm": 0.356802761554718,
"learning_rate": 0.00043916433566433565,
"loss": 3.311,
"step": 46100
},
{
"epoch": 13.439163608829867,
"grad_norm": 0.3797909915447235,
"learning_rate": 0.00043898951048951045,
"loss": 3.301,
"step": 46150
},
{
"epoch": 13.45372473644359,
"grad_norm": 0.38472598791122437,
"learning_rate": 0.0004388146853146853,
"loss": 3.3151,
"step": 46200
},
{
"epoch": 13.468285864057313,
"grad_norm": 0.33033671975135803,
"learning_rate": 0.00043863986013986005,
"loss": 3.3022,
"step": 46250
},
{
"epoch": 13.482846991671035,
"grad_norm": 0.37687408924102783,
"learning_rate": 0.0004384650349650349,
"loss": 3.3081,
"step": 46300
},
{
"epoch": 13.497408119284758,
"grad_norm": 0.3729974329471588,
"learning_rate": 0.0004382902097902097,
"loss": 3.3001,
"step": 46350
},
{
"epoch": 13.51196924689848,
"grad_norm": 0.3623746335506439,
"learning_rate": 0.00043811538461538456,
"loss": 3.3085,
"step": 46400
},
{
"epoch": 13.526530374512202,
"grad_norm": 0.3384764790534973,
"learning_rate": 0.0004379405594405594,
"loss": 3.3195,
"step": 46450
},
{
"epoch": 13.541091502125925,
"grad_norm": 0.3783145844936371,
"learning_rate": 0.0004377657342657342,
"loss": 3.316,
"step": 46500
},
{
"epoch": 13.555652629739647,
"grad_norm": 0.3664775788784027,
"learning_rate": 0.00043759090909090907,
"loss": 3.3281,
"step": 46550
},
{
"epoch": 13.57021375735337,
"grad_norm": 0.3659818470478058,
"learning_rate": 0.00043741608391608387,
"loss": 3.3121,
"step": 46600
},
{
"epoch": 13.584774884967091,
"grad_norm": 0.3645237386226654,
"learning_rate": 0.0004372412587412587,
"loss": 3.3266,
"step": 46650
},
{
"epoch": 13.599336012580814,
"grad_norm": 0.3517119586467743,
"learning_rate": 0.0004370664335664335,
"loss": 3.3054,
"step": 46700
},
{
"epoch": 13.613897140194537,
"grad_norm": 0.3515457808971405,
"learning_rate": 0.0004368916083916084,
"loss": 3.3194,
"step": 46750
},
{
"epoch": 13.62845826780826,
"grad_norm": 0.3286823332309723,
"learning_rate": 0.0004367167832167832,
"loss": 3.3175,
"step": 46800
},
{
"epoch": 13.643019395421982,
"grad_norm": 0.37633106112480164,
"learning_rate": 0.00043654195804195803,
"loss": 3.3182,
"step": 46850
},
{
"epoch": 13.657580523035705,
"grad_norm": 0.38671764731407166,
"learning_rate": 0.00043636713286713283,
"loss": 3.3334,
"step": 46900
},
{
"epoch": 13.672141650649426,
"grad_norm": 0.355014830827713,
"learning_rate": 0.0004361923076923077,
"loss": 3.3246,
"step": 46950
},
{
"epoch": 13.686702778263149,
"grad_norm": 0.3474278748035431,
"learning_rate": 0.00043601748251748243,
"loss": 3.3143,
"step": 47000
},
{
"epoch": 13.686702778263149,
"eval_accuracy": 0.37050690409924353,
"eval_loss": 3.5496487617492676,
"eval_runtime": 179.8313,
"eval_samples_per_second": 92.565,
"eval_steps_per_second": 5.789,
"step": 47000
},
{
"epoch": 13.701263905876871,
"grad_norm": 0.3466740548610687,
"learning_rate": 0.00043584265734265734,
"loss": 3.3342,
"step": 47050
},
{
"epoch": 13.715825033490594,
"grad_norm": 0.3670768439769745,
"learning_rate": 0.0004356678321678321,
"loss": 3.3233,
"step": 47100
},
{
"epoch": 13.730386161104317,
"grad_norm": 0.351852148771286,
"learning_rate": 0.00043549300699300694,
"loss": 3.3229,
"step": 47150
},
{
"epoch": 13.744947288718038,
"grad_norm": 0.3562369644641876,
"learning_rate": 0.0004353181818181818,
"loss": 3.3275,
"step": 47200
},
{
"epoch": 13.75950841633176,
"grad_norm": 0.3771800100803375,
"learning_rate": 0.0004351433566433566,
"loss": 3.3352,
"step": 47250
},
{
"epoch": 13.774069543945483,
"grad_norm": 0.35783255100250244,
"learning_rate": 0.00043496853146853144,
"loss": 3.3258,
"step": 47300
},
{
"epoch": 13.788630671559206,
"grad_norm": 0.3556366562843323,
"learning_rate": 0.00043479370629370624,
"loss": 3.3299,
"step": 47350
},
{
"epoch": 13.803191799172929,
"grad_norm": 0.34562426805496216,
"learning_rate": 0.0004346188811188811,
"loss": 3.3333,
"step": 47400
},
{
"epoch": 13.81775292678665,
"grad_norm": 0.3431943356990814,
"learning_rate": 0.0004344440559440559,
"loss": 3.3247,
"step": 47450
},
{
"epoch": 13.832314054400372,
"grad_norm": 0.33565446734428406,
"learning_rate": 0.00043426923076923075,
"loss": 3.3416,
"step": 47500
},
{
"epoch": 13.846875182014095,
"grad_norm": 0.3674251139163971,
"learning_rate": 0.00043409440559440555,
"loss": 3.3392,
"step": 47550
},
{
"epoch": 13.861436309627818,
"grad_norm": 0.3683677613735199,
"learning_rate": 0.0004339195804195804,
"loss": 3.3446,
"step": 47600
},
{
"epoch": 13.87599743724154,
"grad_norm": 0.3581762909889221,
"learning_rate": 0.0004337447552447552,
"loss": 3.3347,
"step": 47650
},
{
"epoch": 13.890558564855262,
"grad_norm": 0.384446382522583,
"learning_rate": 0.00043356993006993006,
"loss": 3.3301,
"step": 47700
},
{
"epoch": 13.905119692468984,
"grad_norm": 0.3708100914955139,
"learning_rate": 0.0004333951048951048,
"loss": 3.3372,
"step": 47750
},
{
"epoch": 13.919680820082707,
"grad_norm": 0.3896830976009369,
"learning_rate": 0.0004332202797202797,
"loss": 3.3412,
"step": 47800
},
{
"epoch": 13.93424194769643,
"grad_norm": 0.34177786111831665,
"learning_rate": 0.00043304545454545456,
"loss": 3.332,
"step": 47850
},
{
"epoch": 13.948803075310153,
"grad_norm": 0.3624970614910126,
"learning_rate": 0.0004328706293706293,
"loss": 3.3425,
"step": 47900
},
{
"epoch": 13.963364202923874,
"grad_norm": 0.3618220388889313,
"learning_rate": 0.00043269580419580416,
"loss": 3.3454,
"step": 47950
},
{
"epoch": 13.977925330537596,
"grad_norm": 0.3680497109889984,
"learning_rate": 0.00043252097902097896,
"loss": 3.3416,
"step": 48000
},
{
"epoch": 13.977925330537596,
"eval_accuracy": 0.3713867426528661,
"eval_loss": 3.5407259464263916,
"eval_runtime": 179.73,
"eval_samples_per_second": 92.617,
"eval_steps_per_second": 5.792,
"step": 48000
},
{
"epoch": 13.992486458151319,
"grad_norm": 0.3602592647075653,
"learning_rate": 0.0004323461538461538,
"loss": 3.3363,
"step": 48050
},
{
"epoch": 14.006989341254586,
"grad_norm": 0.355772465467453,
"learning_rate": 0.0004321713286713286,
"loss": 3.29,
"step": 48100
},
{
"epoch": 14.021550468868309,
"grad_norm": 0.3709073066711426,
"learning_rate": 0.00043199650349650347,
"loss": 3.2099,
"step": 48150
},
{
"epoch": 14.036111596482032,
"grad_norm": 0.37267613410949707,
"learning_rate": 0.00043182167832167827,
"loss": 3.2284,
"step": 48200
},
{
"epoch": 14.050672724095755,
"grad_norm": 0.3682970702648163,
"learning_rate": 0.0004316468531468531,
"loss": 3.228,
"step": 48250
},
{
"epoch": 14.065233851709475,
"grad_norm": 0.37280622124671936,
"learning_rate": 0.0004314720279720279,
"loss": 3.2376,
"step": 48300
},
{
"epoch": 14.079794979323198,
"grad_norm": 0.3666398525238037,
"learning_rate": 0.0004312972027972028,
"loss": 3.2405,
"step": 48350
},
{
"epoch": 14.094356106936921,
"grad_norm": 0.35189032554626465,
"learning_rate": 0.0004311223776223776,
"loss": 3.2435,
"step": 48400
},
{
"epoch": 14.108917234550644,
"grad_norm": 0.37200215458869934,
"learning_rate": 0.00043094755244755243,
"loss": 3.2499,
"step": 48450
},
{
"epoch": 14.123478362164366,
"grad_norm": 0.34880998730659485,
"learning_rate": 0.0004307727272727272,
"loss": 3.2626,
"step": 48500
},
{
"epoch": 14.13803948977809,
"grad_norm": 0.36202293634414673,
"learning_rate": 0.0004305979020979021,
"loss": 3.2548,
"step": 48550
},
{
"epoch": 14.15260061739181,
"grad_norm": 0.3718937635421753,
"learning_rate": 0.00043042307692307694,
"loss": 3.2417,
"step": 48600
},
{
"epoch": 14.167161745005533,
"grad_norm": 0.35780712962150574,
"learning_rate": 0.0004302482517482517,
"loss": 3.2628,
"step": 48650
},
{
"epoch": 14.181722872619256,
"grad_norm": 0.3607184588909149,
"learning_rate": 0.00043007342657342654,
"loss": 3.264,
"step": 48700
},
{
"epoch": 14.196284000232978,
"grad_norm": 0.37095871567726135,
"learning_rate": 0.00042989860139860134,
"loss": 3.2635,
"step": 48750
},
{
"epoch": 14.210845127846701,
"grad_norm": 0.35249850153923035,
"learning_rate": 0.0004297237762237762,
"loss": 3.2666,
"step": 48800
},
{
"epoch": 14.225406255460422,
"grad_norm": 0.3709149658679962,
"learning_rate": 0.000429548951048951,
"loss": 3.2794,
"step": 48850
},
{
"epoch": 14.239967383074145,
"grad_norm": 0.3637993037700653,
"learning_rate": 0.00042937412587412585,
"loss": 3.259,
"step": 48900
},
{
"epoch": 14.254528510687868,
"grad_norm": 0.3605745732784271,
"learning_rate": 0.00042919930069930065,
"loss": 3.2736,
"step": 48950
},
{
"epoch": 14.26908963830159,
"grad_norm": 0.35469844937324524,
"learning_rate": 0.0004290244755244755,
"loss": 3.2753,
"step": 49000
},
{
"epoch": 14.26908963830159,
"eval_accuracy": 0.3706431591611955,
"eval_loss": 3.5600173473358154,
"eval_runtime": 179.715,
"eval_samples_per_second": 92.624,
"eval_steps_per_second": 5.793,
"step": 49000
},
{
"epoch": 14.283650765915313,
"grad_norm": 0.39860716462135315,
"learning_rate": 0.0004288496503496503,
"loss": 3.2797,
"step": 49050
},
{
"epoch": 14.298211893529034,
"grad_norm": 0.35852912068367004,
"learning_rate": 0.00042867482517482515,
"loss": 3.2746,
"step": 49100
},
{
"epoch": 14.312773021142757,
"grad_norm": 0.37002283334732056,
"learning_rate": 0.00042849999999999995,
"loss": 3.2834,
"step": 49150
},
{
"epoch": 14.32733414875648,
"grad_norm": 0.37728050351142883,
"learning_rate": 0.0004283251748251748,
"loss": 3.2807,
"step": 49200
},
{
"epoch": 14.341895276370202,
"grad_norm": 0.36190709471702576,
"learning_rate": 0.00042815034965034966,
"loss": 3.2826,
"step": 49250
},
{
"epoch": 14.356456403983925,
"grad_norm": 0.37520870566368103,
"learning_rate": 0.00042797552447552446,
"loss": 3.2863,
"step": 49300
},
{
"epoch": 14.371017531597648,
"grad_norm": 0.346926748752594,
"learning_rate": 0.0004278006993006993,
"loss": 3.2886,
"step": 49350
},
{
"epoch": 14.385578659211369,
"grad_norm": 0.3690744936466217,
"learning_rate": 0.00042762587412587406,
"loss": 3.2712,
"step": 49400
},
{
"epoch": 14.400139786825092,
"grad_norm": 0.3768012225627899,
"learning_rate": 0.0004274510489510489,
"loss": 3.2886,
"step": 49450
},
{
"epoch": 14.414700914438814,
"grad_norm": 0.3553199768066406,
"learning_rate": 0.0004272762237762237,
"loss": 3.2898,
"step": 49500
},
{
"epoch": 14.429262042052537,
"grad_norm": 0.37578845024108887,
"learning_rate": 0.00042710139860139857,
"loss": 3.2897,
"step": 49550
},
{
"epoch": 14.44382316966626,
"grad_norm": 0.37218427658081055,
"learning_rate": 0.00042692657342657337,
"loss": 3.2884,
"step": 49600
},
{
"epoch": 14.45838429727998,
"grad_norm": 0.33228814601898193,
"learning_rate": 0.0004267517482517482,
"loss": 3.2952,
"step": 49650
},
{
"epoch": 14.472945424893704,
"grad_norm": 0.35473567247390747,
"learning_rate": 0.000426576923076923,
"loss": 3.2968,
"step": 49700
},
{
"epoch": 14.487506552507426,
"grad_norm": 0.372232049703598,
"learning_rate": 0.0004264020979020979,
"loss": 3.3083,
"step": 49750
},
{
"epoch": 14.502067680121149,
"grad_norm": 0.3498397171497345,
"learning_rate": 0.0004262272727272727,
"loss": 3.2939,
"step": 49800
},
{
"epoch": 14.516628807734872,
"grad_norm": 0.38301196694374084,
"learning_rate": 0.00042605244755244753,
"loss": 3.2994,
"step": 49850
},
{
"epoch": 14.531189935348593,
"grad_norm": 0.35607635974884033,
"learning_rate": 0.00042587762237762233,
"loss": 3.3046,
"step": 49900
},
{
"epoch": 14.545751062962315,
"grad_norm": 0.3492361903190613,
"learning_rate": 0.0004257027972027972,
"loss": 3.2998,
"step": 49950
},
{
"epoch": 14.560312190576038,
"grad_norm": 0.35411879420280457,
"learning_rate": 0.00042552797202797204,
"loss": 3.3121,
"step": 50000
},
{
"epoch": 14.560312190576038,
"eval_accuracy": 0.37108778094230194,
"eval_loss": 3.551468849182129,
"eval_runtime": 179.6249,
"eval_samples_per_second": 92.671,
"eval_steps_per_second": 5.795,
"step": 50000
},
{
"epoch": 14.574873318189761,
"grad_norm": 0.3809182345867157,
"learning_rate": 0.00042535314685314684,
"loss": 3.2999,
"step": 50050
},
{
"epoch": 14.589434445803484,
"grad_norm": 0.35519906878471375,
"learning_rate": 0.0004251783216783217,
"loss": 3.2908,
"step": 50100
},
{
"epoch": 14.603995573417205,
"grad_norm": 0.3606894016265869,
"learning_rate": 0.00042500349650349643,
"loss": 3.3015,
"step": 50150
},
{
"epoch": 14.618556701030927,
"grad_norm": 0.3618135154247284,
"learning_rate": 0.0004248286713286713,
"loss": 3.3088,
"step": 50200
},
{
"epoch": 14.63311782864465,
"grad_norm": 0.3675486743450165,
"learning_rate": 0.0004246538461538461,
"loss": 3.3063,
"step": 50250
},
{
"epoch": 14.647678956258373,
"grad_norm": 0.37515851855278015,
"learning_rate": 0.00042447902097902094,
"loss": 3.3183,
"step": 50300
},
{
"epoch": 14.662240083872096,
"grad_norm": 0.3448165953159332,
"learning_rate": 0.00042430419580419574,
"loss": 3.304,
"step": 50350
},
{
"epoch": 14.676801211485817,
"grad_norm": 0.37604591250419617,
"learning_rate": 0.0004241293706293706,
"loss": 3.3114,
"step": 50400
},
{
"epoch": 14.69136233909954,
"grad_norm": 0.3427177369594574,
"learning_rate": 0.0004239545454545454,
"loss": 3.3145,
"step": 50450
},
{
"epoch": 14.705923466713262,
"grad_norm": 0.36776697635650635,
"learning_rate": 0.00042377972027972025,
"loss": 3.3216,
"step": 50500
},
{
"epoch": 14.720484594326985,
"grad_norm": 0.35328057408332825,
"learning_rate": 0.00042360489510489505,
"loss": 3.3124,
"step": 50550
},
{
"epoch": 14.735045721940708,
"grad_norm": 0.37412354350090027,
"learning_rate": 0.0004234300699300699,
"loss": 3.3076,
"step": 50600
},
{
"epoch": 14.749606849554429,
"grad_norm": 0.3240453004837036,
"learning_rate": 0.00042325524475524476,
"loss": 3.3164,
"step": 50650
},
{
"epoch": 14.764167977168151,
"grad_norm": 0.3579900860786438,
"learning_rate": 0.00042308041958041956,
"loss": 3.3072,
"step": 50700
},
{
"epoch": 14.778729104781874,
"grad_norm": 0.36066341400146484,
"learning_rate": 0.0004229055944055944,
"loss": 3.3254,
"step": 50750
},
{
"epoch": 14.793290232395597,
"grad_norm": 0.3647271692752838,
"learning_rate": 0.0004227307692307692,
"loss": 3.3134,
"step": 50800
},
{
"epoch": 14.80785136000932,
"grad_norm": 0.36275023221969604,
"learning_rate": 0.00042255594405594406,
"loss": 3.3145,
"step": 50850
},
{
"epoch": 14.822412487623042,
"grad_norm": 0.3940187394618988,
"learning_rate": 0.0004223811188811188,
"loss": 3.3063,
"step": 50900
},
{
"epoch": 14.836973615236763,
"grad_norm": 0.3709195554256439,
"learning_rate": 0.00042220629370629366,
"loss": 3.3114,
"step": 50950
},
{
"epoch": 14.851534742850486,
"grad_norm": 0.318628191947937,
"learning_rate": 0.00042203146853146846,
"loss": 3.3199,
"step": 51000
},
{
"epoch": 14.851534742850486,
"eval_accuracy": 0.371846412447717,
"eval_loss": 3.541372537612915,
"eval_runtime": 200.5321,
"eval_samples_per_second": 83.009,
"eval_steps_per_second": 5.191,
"step": 51000
},
{
"epoch": 14.866095870464209,
"grad_norm": 0.3727494180202484,
"learning_rate": 0.0004218566433566433,
"loss": 3.3183,
"step": 51050
},
{
"epoch": 14.880656998077932,
"grad_norm": 0.3709549903869629,
"learning_rate": 0.0004216818181818181,
"loss": 3.3156,
"step": 51100
},
{
"epoch": 14.895218125691654,
"grad_norm": 0.3711429834365845,
"learning_rate": 0.00042150699300699297,
"loss": 3.3241,
"step": 51150
},
{
"epoch": 14.909779253305375,
"grad_norm": 0.367157518863678,
"learning_rate": 0.00042133216783216777,
"loss": 3.3154,
"step": 51200
},
{
"epoch": 14.924340380919098,
"grad_norm": 0.35663118958473206,
"learning_rate": 0.0004211573426573426,
"loss": 3.3259,
"step": 51250
},
{
"epoch": 14.93890150853282,
"grad_norm": 0.33772292733192444,
"learning_rate": 0.0004209825174825175,
"loss": 3.3343,
"step": 51300
},
{
"epoch": 14.953462636146543,
"grad_norm": 0.3367787301540375,
"learning_rate": 0.0004208076923076923,
"loss": 3.3245,
"step": 51350
},
{
"epoch": 14.968023763760266,
"grad_norm": 0.3641185462474823,
"learning_rate": 0.00042063286713286713,
"loss": 3.318,
"step": 51400
},
{
"epoch": 14.982584891373987,
"grad_norm": 0.4175531566143036,
"learning_rate": 0.00042045804195804193,
"loss": 3.3194,
"step": 51450
},
{
"epoch": 14.99714601898771,
"grad_norm": 0.35317283868789673,
"learning_rate": 0.0004202832167832168,
"loss": 3.3351,
"step": 51500
},
{
"epoch": 15.011648902090977,
"grad_norm": 0.347444087266922,
"learning_rate": 0.0004201083916083916,
"loss": 3.229,
"step": 51550
},
{
"epoch": 15.0262100297047,
"grad_norm": 0.366161972284317,
"learning_rate": 0.00041993356643356644,
"loss": 3.2158,
"step": 51600
},
{
"epoch": 15.040771157318423,
"grad_norm": 0.3661559820175171,
"learning_rate": 0.0004197587412587412,
"loss": 3.2267,
"step": 51650
},
{
"epoch": 15.055332284932145,
"grad_norm": 0.3468046486377716,
"learning_rate": 0.00041958391608391604,
"loss": 3.2284,
"step": 51700
},
{
"epoch": 15.069893412545868,
"grad_norm": 0.3720075786113739,
"learning_rate": 0.00041940909090909084,
"loss": 3.2371,
"step": 51750
},
{
"epoch": 15.084454540159589,
"grad_norm": 0.3633178770542145,
"learning_rate": 0.0004192342657342657,
"loss": 3.22,
"step": 51800
},
{
"epoch": 15.099015667773312,
"grad_norm": 0.3946298360824585,
"learning_rate": 0.0004190594405594405,
"loss": 3.2385,
"step": 51850
},
{
"epoch": 15.113576795387035,
"grad_norm": 0.368947297334671,
"learning_rate": 0.00041888461538461535,
"loss": 3.2462,
"step": 51900
},
{
"epoch": 15.128137923000757,
"grad_norm": 0.363775372505188,
"learning_rate": 0.00041870979020979015,
"loss": 3.2498,
"step": 51950
},
{
"epoch": 15.14269905061448,
"grad_norm": 0.36256226897239685,
"learning_rate": 0.000418534965034965,
"loss": 3.2368,
"step": 52000
},
{
"epoch": 15.14269905061448,
"eval_accuracy": 0.370988793226889,
"eval_loss": 3.5600876808166504,
"eval_runtime": 179.687,
"eval_samples_per_second": 92.639,
"eval_steps_per_second": 5.793,
"step": 52000
},
{
"epoch": 15.157260178228203,
"grad_norm": 0.401760071516037,
"learning_rate": 0.00041836013986013985,
"loss": 3.2442,
"step": 52050
},
{
"epoch": 15.171821305841924,
"grad_norm": 0.3602779805660248,
"learning_rate": 0.00041818531468531465,
"loss": 3.2474,
"step": 52100
},
{
"epoch": 15.186382433455647,
"grad_norm": 0.34138384461402893,
"learning_rate": 0.0004180104895104895,
"loss": 3.2486,
"step": 52150
},
{
"epoch": 15.20094356106937,
"grad_norm": 0.35512077808380127,
"learning_rate": 0.0004178356643356643,
"loss": 3.2549,
"step": 52200
},
{
"epoch": 15.215504688683092,
"grad_norm": 0.37063032388687134,
"learning_rate": 0.00041766083916083916,
"loss": 3.2446,
"step": 52250
},
{
"epoch": 15.230065816296815,
"grad_norm": 0.3616124093532562,
"learning_rate": 0.00041748601398601396,
"loss": 3.2578,
"step": 52300
},
{
"epoch": 15.244626943910536,
"grad_norm": 0.36814188957214355,
"learning_rate": 0.0004173111888111888,
"loss": 3.2614,
"step": 52350
},
{
"epoch": 15.259188071524258,
"grad_norm": 0.4011984169483185,
"learning_rate": 0.00041713636363636356,
"loss": 3.261,
"step": 52400
},
{
"epoch": 15.273749199137981,
"grad_norm": 0.3679613769054413,
"learning_rate": 0.0004169615384615384,
"loss": 3.2555,
"step": 52450
},
{
"epoch": 15.288310326751704,
"grad_norm": 0.4017728865146637,
"learning_rate": 0.0004167867132867132,
"loss": 3.2608,
"step": 52500
},
{
"epoch": 15.302871454365427,
"grad_norm": 0.3769571781158447,
"learning_rate": 0.00041661188811188807,
"loss": 3.2751,
"step": 52550
},
{
"epoch": 15.317432581979148,
"grad_norm": 0.3748333752155304,
"learning_rate": 0.00041643706293706287,
"loss": 3.2633,
"step": 52600
},
{
"epoch": 15.33199370959287,
"grad_norm": 0.3775624632835388,
"learning_rate": 0.0004162622377622377,
"loss": 3.2559,
"step": 52650
},
{
"epoch": 15.346554837206593,
"grad_norm": 0.36379382014274597,
"learning_rate": 0.0004160874125874126,
"loss": 3.2603,
"step": 52700
},
{
"epoch": 15.361115964820316,
"grad_norm": 0.3573223352432251,
"learning_rate": 0.0004159125874125874,
"loss": 3.2752,
"step": 52750
},
{
"epoch": 15.375677092434039,
"grad_norm": 0.34874892234802246,
"learning_rate": 0.00041573776223776223,
"loss": 3.2645,
"step": 52800
},
{
"epoch": 15.39023822004776,
"grad_norm": 0.3864137828350067,
"learning_rate": 0.00041556293706293703,
"loss": 3.2675,
"step": 52850
},
{
"epoch": 15.404799347661482,
"grad_norm": 0.3531895875930786,
"learning_rate": 0.0004153881118881119,
"loss": 3.281,
"step": 52900
},
{
"epoch": 15.419360475275205,
"grad_norm": 0.37176671624183655,
"learning_rate": 0.0004152132867132867,
"loss": 3.2719,
"step": 52950
},
{
"epoch": 15.433921602888928,
"grad_norm": 0.37971508502960205,
"learning_rate": 0.00041503846153846154,
"loss": 3.2748,
"step": 53000
},
{
"epoch": 15.433921602888928,
"eval_accuracy": 0.3718487636998645,
"eval_loss": 3.550957679748535,
"eval_runtime": 179.6741,
"eval_samples_per_second": 92.646,
"eval_steps_per_second": 5.794,
"step": 53000
},
{
"epoch": 15.44848273050265,
"grad_norm": 0.35884761810302734,
"learning_rate": 0.00041486363636363634,
"loss": 3.2723,
"step": 53050
},
{
"epoch": 15.463043858116373,
"grad_norm": 0.3565036654472351,
"learning_rate": 0.0004146888111888112,
"loss": 3.2723,
"step": 53100
},
{
"epoch": 15.477604985730094,
"grad_norm": 0.38016510009765625,
"learning_rate": 0.00041451398601398593,
"loss": 3.2801,
"step": 53150
},
{
"epoch": 15.492166113343817,
"grad_norm": 0.3596195876598358,
"learning_rate": 0.0004143391608391608,
"loss": 3.2973,
"step": 53200
},
{
"epoch": 15.50672724095754,
"grad_norm": 0.36341792345046997,
"learning_rate": 0.0004141643356643356,
"loss": 3.2902,
"step": 53250
},
{
"epoch": 15.521288368571263,
"grad_norm": 0.3592833876609802,
"learning_rate": 0.00041398951048951044,
"loss": 3.2797,
"step": 53300
},
{
"epoch": 15.535849496184985,
"grad_norm": 0.378161758184433,
"learning_rate": 0.00041381468531468524,
"loss": 3.2824,
"step": 53350
},
{
"epoch": 15.550410623798706,
"grad_norm": 0.3580067753791809,
"learning_rate": 0.0004136398601398601,
"loss": 3.2861,
"step": 53400
},
{
"epoch": 15.564971751412429,
"grad_norm": 0.37102365493774414,
"learning_rate": 0.00041346503496503495,
"loss": 3.2941,
"step": 53450
},
{
"epoch": 15.579532879026152,
"grad_norm": 0.37696951627731323,
"learning_rate": 0.00041329020979020975,
"loss": 3.2929,
"step": 53500
},
{
"epoch": 15.594094006639875,
"grad_norm": 0.36684298515319824,
"learning_rate": 0.0004131153846153846,
"loss": 3.2921,
"step": 53550
},
{
"epoch": 15.608655134253597,
"grad_norm": 0.37633803486824036,
"learning_rate": 0.0004129405594405594,
"loss": 3.2902,
"step": 53600
},
{
"epoch": 15.623216261867318,
"grad_norm": 0.33005252480506897,
"learning_rate": 0.00041276573426573426,
"loss": 3.2933,
"step": 53650
},
{
"epoch": 15.637777389481041,
"grad_norm": 0.37316614389419556,
"learning_rate": 0.00041259090909090906,
"loss": 3.2866,
"step": 53700
},
{
"epoch": 15.652338517094764,
"grad_norm": 0.3721347451210022,
"learning_rate": 0.0004124160839160839,
"loss": 3.284,
"step": 53750
},
{
"epoch": 15.666899644708487,
"grad_norm": 0.3940325379371643,
"learning_rate": 0.0004122412587412587,
"loss": 3.2911,
"step": 53800
},
{
"epoch": 15.68146077232221,
"grad_norm": 0.3668735921382904,
"learning_rate": 0.00041206643356643356,
"loss": 3.2957,
"step": 53850
},
{
"epoch": 15.69602189993593,
"grad_norm": 0.35475608706474304,
"learning_rate": 0.0004118916083916083,
"loss": 3.2891,
"step": 53900
},
{
"epoch": 15.710583027549653,
"grad_norm": 0.36943408846855164,
"learning_rate": 0.00041171678321678316,
"loss": 3.3051,
"step": 53950
},
{
"epoch": 15.725144155163376,
"grad_norm": 0.41088443994522095,
"learning_rate": 0.00041154195804195796,
"loss": 3.2951,
"step": 54000
},
{
"epoch": 15.725144155163376,
"eval_accuracy": 0.3721188050090135,
"eval_loss": 3.5433807373046875,
"eval_runtime": 249.9993,
"eval_samples_per_second": 66.584,
"eval_steps_per_second": 4.164,
"step": 54000
},
{
"epoch": 15.739705282777098,
"grad_norm": 0.3841875195503235,
"learning_rate": 0.0004113671328671328,
"loss": 3.3085,
"step": 54050
},
{
"epoch": 15.754266410390821,
"grad_norm": 0.37713882327079773,
"learning_rate": 0.00041119230769230767,
"loss": 3.2896,
"step": 54100
},
{
"epoch": 15.768827538004544,
"grad_norm": 0.3698156476020813,
"learning_rate": 0.00041101748251748247,
"loss": 3.3038,
"step": 54150
},
{
"epoch": 15.783388665618265,
"grad_norm": 0.3522474467754364,
"learning_rate": 0.0004108426573426573,
"loss": 3.3029,
"step": 54200
},
{
"epoch": 15.797949793231988,
"grad_norm": 0.3552002012729645,
"learning_rate": 0.0004106678321678321,
"loss": 3.2929,
"step": 54250
},
{
"epoch": 15.81251092084571,
"grad_norm": 0.38907817006111145,
"learning_rate": 0.000410493006993007,
"loss": 3.3145,
"step": 54300
},
{
"epoch": 15.827072048459433,
"grad_norm": 0.35792553424835205,
"learning_rate": 0.0004103181818181818,
"loss": 3.2966,
"step": 54350
},
{
"epoch": 15.841633176073156,
"grad_norm": 0.35903868079185486,
"learning_rate": 0.00041014335664335663,
"loss": 3.2997,
"step": 54400
},
{
"epoch": 15.856194303686877,
"grad_norm": 0.3578985929489136,
"learning_rate": 0.00040996853146853143,
"loss": 3.3062,
"step": 54450
},
{
"epoch": 15.8707554313006,
"grad_norm": 0.35918229818344116,
"learning_rate": 0.0004097937062937063,
"loss": 3.2963,
"step": 54500
},
{
"epoch": 15.885316558914322,
"grad_norm": 0.3494114875793457,
"learning_rate": 0.0004096188811188811,
"loss": 3.3158,
"step": 54550
},
{
"epoch": 15.899877686528045,
"grad_norm": 0.3552234172821045,
"learning_rate": 0.00040944405594405594,
"loss": 3.3068,
"step": 54600
},
{
"epoch": 15.914438814141768,
"grad_norm": 0.3404771685600281,
"learning_rate": 0.0004092692307692307,
"loss": 3.3117,
"step": 54650
},
{
"epoch": 15.928999941755489,
"grad_norm": 0.35783398151397705,
"learning_rate": 0.00040909440559440554,
"loss": 3.3102,
"step": 54700
},
{
"epoch": 15.943561069369212,
"grad_norm": 0.3604854345321655,
"learning_rate": 0.00040891958041958034,
"loss": 3.3007,
"step": 54750
},
{
"epoch": 15.958122196982934,
"grad_norm": 0.3782384693622589,
"learning_rate": 0.0004087447552447552,
"loss": 3.3086,
"step": 54800
},
{
"epoch": 15.972683324596657,
"grad_norm": 0.3438422381877899,
"learning_rate": 0.00040856993006993005,
"loss": 3.3063,
"step": 54850
},
{
"epoch": 15.98724445221038,
"grad_norm": 0.38098853826522827,
"learning_rate": 0.00040839510489510485,
"loss": 3.3099,
"step": 54900
},
{
"epoch": 16.001747335313645,
"grad_norm": 0.41185808181762695,
"learning_rate": 0.0004082202797202797,
"loss": 3.2985,
"step": 54950
},
{
"epoch": 16.01630846292737,
"grad_norm": 0.3694097697734833,
"learning_rate": 0.0004080454545454545,
"loss": 3.1991,
"step": 55000
},
{
"epoch": 16.01630846292737,
"eval_accuracy": 0.3716150492363956,
"eval_loss": 3.5482091903686523,
"eval_runtime": 197.4314,
"eval_samples_per_second": 84.313,
"eval_steps_per_second": 5.273,
"step": 55000
},
{
"epoch": 16.03086959054109,
"grad_norm": 0.3613179922103882,
"learning_rate": 0.00040787062937062935,
"loss": 3.1937,
"step": 55050
},
{
"epoch": 16.045430718154815,
"grad_norm": 0.3634468913078308,
"learning_rate": 0.00040769580419580415,
"loss": 3.2083,
"step": 55100
},
{
"epoch": 16.059991845768536,
"grad_norm": 0.3805219233036041,
"learning_rate": 0.000407520979020979,
"loss": 3.2066,
"step": 55150
},
{
"epoch": 16.074552973382257,
"grad_norm": 0.34996068477630615,
"learning_rate": 0.0004073461538461538,
"loss": 3.2163,
"step": 55200
},
{
"epoch": 16.08911410099598,
"grad_norm": 0.39141207933425903,
"learning_rate": 0.00040717132867132866,
"loss": 3.222,
"step": 55250
},
{
"epoch": 16.103675228609703,
"grad_norm": 0.37998223304748535,
"learning_rate": 0.00040699650349650346,
"loss": 3.2199,
"step": 55300
},
{
"epoch": 16.118236356223427,
"grad_norm": 0.38038671016693115,
"learning_rate": 0.0004068216783216783,
"loss": 3.2289,
"step": 55350
},
{
"epoch": 16.132797483837148,
"grad_norm": 0.3913242220878601,
"learning_rate": 0.00040664685314685306,
"loss": 3.2256,
"step": 55400
},
{
"epoch": 16.14735861145087,
"grad_norm": 0.3965040445327759,
"learning_rate": 0.0004064720279720279,
"loss": 3.231,
"step": 55450
},
{
"epoch": 16.161919739064594,
"grad_norm": 0.3698939085006714,
"learning_rate": 0.00040629720279720277,
"loss": 3.2351,
"step": 55500
},
{
"epoch": 16.176480866678315,
"grad_norm": 0.3591434061527252,
"learning_rate": 0.00040612237762237757,
"loss": 3.2359,
"step": 55550
},
{
"epoch": 16.19104199429204,
"grad_norm": 0.39180421829223633,
"learning_rate": 0.0004059475524475524,
"loss": 3.2335,
"step": 55600
},
{
"epoch": 16.20560312190576,
"grad_norm": 0.40741053223609924,
"learning_rate": 0.0004057727272727272,
"loss": 3.2431,
"step": 55650
},
{
"epoch": 16.22016424951948,
"grad_norm": 0.3898237347602844,
"learning_rate": 0.0004055979020979021,
"loss": 3.245,
"step": 55700
},
{
"epoch": 16.234725377133206,
"grad_norm": 0.37869152426719666,
"learning_rate": 0.0004054230769230769,
"loss": 3.247,
"step": 55750
},
{
"epoch": 16.249286504746927,
"grad_norm": 0.35681819915771484,
"learning_rate": 0.00040524825174825173,
"loss": 3.247,
"step": 55800
},
{
"epoch": 16.26384763236065,
"grad_norm": 0.3998211920261383,
"learning_rate": 0.00040507342657342653,
"loss": 3.26,
"step": 55850
},
{
"epoch": 16.278408759974372,
"grad_norm": 0.3638876974582672,
"learning_rate": 0.0004048986013986014,
"loss": 3.2559,
"step": 55900
},
{
"epoch": 16.292969887588093,
"grad_norm": 0.3709234893321991,
"learning_rate": 0.0004047237762237762,
"loss": 3.2477,
"step": 55950
},
{
"epoch": 16.307531015201818,
"grad_norm": 0.3915596902370453,
"learning_rate": 0.00040454895104895104,
"loss": 3.2351,
"step": 56000
},
{
"epoch": 16.307531015201818,
"eval_accuracy": 0.37186792640486727,
"eval_loss": 3.552690267562866,
"eval_runtime": 179.6303,
"eval_samples_per_second": 92.668,
"eval_steps_per_second": 5.795,
"step": 56000
},
{
"epoch": 16.32209214281554,
"grad_norm": 0.40091753005981445,
"learning_rate": 0.00040437412587412583,
"loss": 3.2482,
"step": 56050
},
{
"epoch": 16.336653270429263,
"grad_norm": 0.3732578158378601,
"learning_rate": 0.0004041993006993007,
"loss": 3.246,
"step": 56100
},
{
"epoch": 16.351214398042984,
"grad_norm": 0.39505109190940857,
"learning_rate": 0.00040402447552447554,
"loss": 3.2391,
"step": 56150
},
{
"epoch": 16.36577552565671,
"grad_norm": 0.413886159658432,
"learning_rate": 0.0004038496503496503,
"loss": 3.262,
"step": 56200
},
{
"epoch": 16.38033665327043,
"grad_norm": 0.3627515137195587,
"learning_rate": 0.00040367482517482514,
"loss": 3.258,
"step": 56250
},
{
"epoch": 16.39489778088415,
"grad_norm": 0.37059104442596436,
"learning_rate": 0.00040349999999999994,
"loss": 3.2625,
"step": 56300
},
{
"epoch": 16.409458908497875,
"grad_norm": 0.35158687829971313,
"learning_rate": 0.0004033251748251748,
"loss": 3.2768,
"step": 56350
},
{
"epoch": 16.424020036111596,
"grad_norm": 0.3694722652435303,
"learning_rate": 0.0004031503496503496,
"loss": 3.2663,
"step": 56400
},
{
"epoch": 16.43858116372532,
"grad_norm": 0.3753471076488495,
"learning_rate": 0.00040297552447552445,
"loss": 3.2689,
"step": 56450
},
{
"epoch": 16.45314229133904,
"grad_norm": 0.36508360505104065,
"learning_rate": 0.00040280069930069925,
"loss": 3.2685,
"step": 56500
},
{
"epoch": 16.467703418952762,
"grad_norm": 0.3872244954109192,
"learning_rate": 0.0004026258741258741,
"loss": 3.2705,
"step": 56550
},
{
"epoch": 16.482264546566487,
"grad_norm": 0.421705037355423,
"learning_rate": 0.0004024510489510489,
"loss": 3.2588,
"step": 56600
},
{
"epoch": 16.496825674180208,
"grad_norm": 0.4006613790988922,
"learning_rate": 0.00040227622377622376,
"loss": 3.2674,
"step": 56650
},
{
"epoch": 16.511386801793932,
"grad_norm": 0.36601510643959045,
"learning_rate": 0.00040210139860139856,
"loss": 3.264,
"step": 56700
},
{
"epoch": 16.525947929407653,
"grad_norm": 0.3723083436489105,
"learning_rate": 0.0004019265734265734,
"loss": 3.2703,
"step": 56750
},
{
"epoch": 16.540509057021374,
"grad_norm": 0.3596556484699249,
"learning_rate": 0.0004017517482517482,
"loss": 3.2809,
"step": 56800
},
{
"epoch": 16.5550701846351,
"grad_norm": 0.38851556181907654,
"learning_rate": 0.00040157692307692306,
"loss": 3.2817,
"step": 56850
},
{
"epoch": 16.56963131224882,
"grad_norm": 0.39849522709846497,
"learning_rate": 0.0004014020979020979,
"loss": 3.2581,
"step": 56900
},
{
"epoch": 16.584192439862544,
"grad_norm": 0.34800028800964355,
"learning_rate": 0.00040122727272727266,
"loss": 3.2751,
"step": 56950
},
{
"epoch": 16.598753567476265,
"grad_norm": 0.3724021017551422,
"learning_rate": 0.0004010524475524475,
"loss": 3.2696,
"step": 57000
},
{
"epoch": 16.598753567476265,
"eval_accuracy": 0.37231513456333604,
"eval_loss": 3.5443036556243896,
"eval_runtime": 179.584,
"eval_samples_per_second": 92.692,
"eval_steps_per_second": 5.797,
"step": 57000
},
{
"epoch": 16.613314695089986,
"grad_norm": 0.39707282185554504,
"learning_rate": 0.0004008776223776223,
"loss": 3.2868,
"step": 57050
},
{
"epoch": 16.62787582270371,
"grad_norm": 0.3560337424278259,
"learning_rate": 0.00040070279720279717,
"loss": 3.2795,
"step": 57100
},
{
"epoch": 16.642436950317432,
"grad_norm": 0.36631080508232117,
"learning_rate": 0.00040052797202797197,
"loss": 3.2798,
"step": 57150
},
{
"epoch": 16.656998077931156,
"grad_norm": 0.34885546565055847,
"learning_rate": 0.0004003531468531468,
"loss": 3.2781,
"step": 57200
},
{
"epoch": 16.671559205544877,
"grad_norm": 0.389565646648407,
"learning_rate": 0.0004001783216783216,
"loss": 3.2789,
"step": 57250
},
{
"epoch": 16.6861203331586,
"grad_norm": 0.3716311454772949,
"learning_rate": 0.0004000034965034965,
"loss": 3.288,
"step": 57300
},
{
"epoch": 16.700681460772323,
"grad_norm": 0.37248024344444275,
"learning_rate": 0.0003998286713286713,
"loss": 3.2819,
"step": 57350
},
{
"epoch": 16.715242588386044,
"grad_norm": 0.3983492851257324,
"learning_rate": 0.00039965384615384613,
"loss": 3.2976,
"step": 57400
},
{
"epoch": 16.72980371599977,
"grad_norm": 0.39866718649864197,
"learning_rate": 0.00039947902097902093,
"loss": 3.2803,
"step": 57450
},
{
"epoch": 16.74436484361349,
"grad_norm": 0.38010174036026,
"learning_rate": 0.0003993041958041958,
"loss": 3.2934,
"step": 57500
},
{
"epoch": 16.75892597122721,
"grad_norm": 0.3775840103626251,
"learning_rate": 0.00039912937062937064,
"loss": 3.2927,
"step": 57550
},
{
"epoch": 16.773487098840935,
"grad_norm": 0.37588465213775635,
"learning_rate": 0.00039895454545454544,
"loss": 3.2897,
"step": 57600
},
{
"epoch": 16.788048226454656,
"grad_norm": 0.3820333778858185,
"learning_rate": 0.0003987797202797203,
"loss": 3.2909,
"step": 57650
},
{
"epoch": 16.80260935406838,
"grad_norm": 0.3661067485809326,
"learning_rate": 0.00039860489510489504,
"loss": 3.287,
"step": 57700
},
{
"epoch": 16.8171704816821,
"grad_norm": 0.3876943588256836,
"learning_rate": 0.0003984300699300699,
"loss": 3.2808,
"step": 57750
},
{
"epoch": 16.831731609295822,
"grad_norm": 0.3884466588497162,
"learning_rate": 0.0003982552447552447,
"loss": 3.2898,
"step": 57800
},
{
"epoch": 16.846292736909547,
"grad_norm": 0.35857149958610535,
"learning_rate": 0.00039808041958041955,
"loss": 3.2899,
"step": 57850
},
{
"epoch": 16.860853864523268,
"grad_norm": 0.3881604075431824,
"learning_rate": 0.00039790559440559435,
"loss": 3.2848,
"step": 57900
},
{
"epoch": 16.875414992136992,
"grad_norm": 0.3821488916873932,
"learning_rate": 0.0003977307692307692,
"loss": 3.2922,
"step": 57950
},
{
"epoch": 16.889976119750713,
"grad_norm": 0.3618803918361664,
"learning_rate": 0.000397555944055944,
"loss": 3.2829,
"step": 58000
},
{
"epoch": 16.889976119750713,
"eval_accuracy": 0.3727345979464634,
"eval_loss": 3.5385096073150635,
"eval_runtime": 179.6339,
"eval_samples_per_second": 92.666,
"eval_steps_per_second": 5.795,
"step": 58000
},
{
"epoch": 16.904537247364434,
"grad_norm": 0.3813997507095337,
"learning_rate": 0.00039738111888111885,
"loss": 3.3026,
"step": 58050
},
{
"epoch": 16.91909837497816,
"grad_norm": 0.3762351870536804,
"learning_rate": 0.00039720629370629365,
"loss": 3.2899,
"step": 58100
},
{
"epoch": 16.93365950259188,
"grad_norm": 0.3354097008705139,
"learning_rate": 0.0003970314685314685,
"loss": 3.3009,
"step": 58150
},
{
"epoch": 16.948220630205604,
"grad_norm": 0.3629801869392395,
"learning_rate": 0.0003968566433566433,
"loss": 3.2948,
"step": 58200
},
{
"epoch": 16.962781757819325,
"grad_norm": 0.34712672233581543,
"learning_rate": 0.00039668181818181816,
"loss": 3.2939,
"step": 58250
},
{
"epoch": 16.977342885433046,
"grad_norm": 0.3528999388217926,
"learning_rate": 0.000396506993006993,
"loss": 3.2935,
"step": 58300
},
{
"epoch": 16.99190401304677,
"grad_norm": 0.3971739709377289,
"learning_rate": 0.0003963321678321678,
"loss": 3.2863,
"step": 58350
},
{
"epoch": 17.006406896150036,
"grad_norm": 0.3811483681201935,
"learning_rate": 0.00039615734265734267,
"loss": 3.2487,
"step": 58400
},
{
"epoch": 17.02096802376376,
"grad_norm": 0.3732599914073944,
"learning_rate": 0.0003959825174825174,
"loss": 3.1863,
"step": 58450
},
{
"epoch": 17.03552915137748,
"grad_norm": 0.3752318322658539,
"learning_rate": 0.00039580769230769227,
"loss": 3.178,
"step": 58500
},
{
"epoch": 17.050090278991206,
"grad_norm": 0.39765167236328125,
"learning_rate": 0.00039563286713286707,
"loss": 3.1956,
"step": 58550
},
{
"epoch": 17.064651406604927,
"grad_norm": 0.3749917149543762,
"learning_rate": 0.0003954580419580419,
"loss": 3.2071,
"step": 58600
},
{
"epoch": 17.07921253421865,
"grad_norm": 0.3832402229309082,
"learning_rate": 0.0003952832167832167,
"loss": 3.2076,
"step": 58650
},
{
"epoch": 17.093773661832373,
"grad_norm": 0.3718985915184021,
"learning_rate": 0.0003951083916083916,
"loss": 3.2011,
"step": 58700
},
{
"epoch": 17.108334789446094,
"grad_norm": 0.3974931836128235,
"learning_rate": 0.0003949335664335664,
"loss": 3.2032,
"step": 58750
},
{
"epoch": 17.122895917059818,
"grad_norm": 0.3815936744213104,
"learning_rate": 0.00039475874125874123,
"loss": 3.2111,
"step": 58800
},
{
"epoch": 17.13745704467354,
"grad_norm": 0.39021918177604675,
"learning_rate": 0.00039458391608391603,
"loss": 3.2276,
"step": 58850
},
{
"epoch": 17.152018172287264,
"grad_norm": 0.36819562315940857,
"learning_rate": 0.0003944090909090909,
"loss": 3.2292,
"step": 58900
},
{
"epoch": 17.166579299900985,
"grad_norm": 0.36677491664886475,
"learning_rate": 0.00039423426573426573,
"loss": 3.2164,
"step": 58950
},
{
"epoch": 17.181140427514705,
"grad_norm": 0.3470790684223175,
"learning_rate": 0.00039405944055944053,
"loss": 3.229,
"step": 59000
},
{
"epoch": 17.181140427514705,
"eval_accuracy": 0.3720724853417063,
"eval_loss": 3.55291485786438,
"eval_runtime": 179.7336,
"eval_samples_per_second": 92.615,
"eval_steps_per_second": 5.792,
"step": 59000
},
{
"epoch": 17.19570155512843,
"grad_norm": 0.3945004940032959,
"learning_rate": 0.0003938846153846154,
"loss": 3.2196,
"step": 59050
},
{
"epoch": 17.21026268274215,
"grad_norm": 0.38693010807037354,
"learning_rate": 0.0003937097902097902,
"loss": 3.2238,
"step": 59100
},
{
"epoch": 17.224823810355876,
"grad_norm": 0.3574848473072052,
"learning_rate": 0.00039353496503496504,
"loss": 3.2328,
"step": 59150
},
{
"epoch": 17.239384937969596,
"grad_norm": 0.38169723749160767,
"learning_rate": 0.0003933601398601398,
"loss": 3.2294,
"step": 59200
},
{
"epoch": 17.253946065583317,
"grad_norm": 0.3648778796195984,
"learning_rate": 0.00039318531468531464,
"loss": 3.2355,
"step": 59250
},
{
"epoch": 17.268507193197042,
"grad_norm": 0.3846014142036438,
"learning_rate": 0.00039301048951048944,
"loss": 3.2347,
"step": 59300
},
{
"epoch": 17.283068320810763,
"grad_norm": 0.4090299904346466,
"learning_rate": 0.0003928356643356643,
"loss": 3.2331,
"step": 59350
},
{
"epoch": 17.297629448424487,
"grad_norm": 0.3630618751049042,
"learning_rate": 0.0003926608391608391,
"loss": 3.2456,
"step": 59400
},
{
"epoch": 17.31219057603821,
"grad_norm": 0.36168116331100464,
"learning_rate": 0.00039248601398601395,
"loss": 3.2326,
"step": 59450
},
{
"epoch": 17.32675170365193,
"grad_norm": 0.3843954801559448,
"learning_rate": 0.00039231118881118875,
"loss": 3.235,
"step": 59500
},
{
"epoch": 17.341312831265654,
"grad_norm": 0.3814232051372528,
"learning_rate": 0.0003921363636363636,
"loss": 3.2351,
"step": 59550
},
{
"epoch": 17.355873958879375,
"grad_norm": 0.39595067501068115,
"learning_rate": 0.00039196153846153846,
"loss": 3.2375,
"step": 59600
},
{
"epoch": 17.3704350864931,
"grad_norm": 0.3715277314186096,
"learning_rate": 0.00039178671328671326,
"loss": 3.2504,
"step": 59650
},
{
"epoch": 17.38499621410682,
"grad_norm": 0.39446181058883667,
"learning_rate": 0.0003916118881118881,
"loss": 3.2451,
"step": 59700
},
{
"epoch": 17.39955734172054,
"grad_norm": 0.37456580996513367,
"learning_rate": 0.0003914370629370629,
"loss": 3.2519,
"step": 59750
},
{
"epoch": 17.414118469334266,
"grad_norm": 0.3798275589942932,
"learning_rate": 0.00039126223776223776,
"loss": 3.2422,
"step": 59800
},
{
"epoch": 17.428679596947987,
"grad_norm": 0.37132883071899414,
"learning_rate": 0.00039108741258741256,
"loss": 3.2471,
"step": 59850
},
{
"epoch": 17.44324072456171,
"grad_norm": 0.3520642817020416,
"learning_rate": 0.0003909125874125874,
"loss": 3.2602,
"step": 59900
},
{
"epoch": 17.457801852175432,
"grad_norm": 0.36127546429634094,
"learning_rate": 0.00039073776223776216,
"loss": 3.2495,
"step": 59950
},
{
"epoch": 17.472362979789153,
"grad_norm": 0.35555124282836914,
"learning_rate": 0.000390562937062937,
"loss": 3.2599,
"step": 60000
},
{
"epoch": 17.472362979789153,
"eval_accuracy": 0.37272707393959115,
"eval_loss": 3.5451500415802,
"eval_runtime": 179.6675,
"eval_samples_per_second": 92.649,
"eval_steps_per_second": 5.794,
"step": 60000
},
{
"epoch": 17.486924107402878,
"grad_norm": 0.37541213631629944,
"learning_rate": 0.0003903881118881118,
"loss": 3.261,
"step": 60050
},
{
"epoch": 17.5014852350166,
"grad_norm": 0.35101550817489624,
"learning_rate": 0.00039021328671328667,
"loss": 3.2555,
"step": 60100
},
{
"epoch": 17.516046362630323,
"grad_norm": 0.3492276966571808,
"learning_rate": 0.00039003846153846147,
"loss": 3.2538,
"step": 60150
},
{
"epoch": 17.530607490244044,
"grad_norm": 0.3945137560367584,
"learning_rate": 0.0003898636363636363,
"loss": 3.2566,
"step": 60200
},
{
"epoch": 17.545168617857765,
"grad_norm": 0.38326242566108704,
"learning_rate": 0.0003896888111888111,
"loss": 3.2586,
"step": 60250
},
{
"epoch": 17.55972974547149,
"grad_norm": 0.38606488704681396,
"learning_rate": 0.000389513986013986,
"loss": 3.2724,
"step": 60300
},
{
"epoch": 17.57429087308521,
"grad_norm": 0.43482670187950134,
"learning_rate": 0.00038933916083916083,
"loss": 3.2605,
"step": 60350
},
{
"epoch": 17.588852000698935,
"grad_norm": 0.41528668999671936,
"learning_rate": 0.00038916433566433563,
"loss": 3.2734,
"step": 60400
},
{
"epoch": 17.603413128312656,
"grad_norm": 0.39981091022491455,
"learning_rate": 0.0003889895104895105,
"loss": 3.2685,
"step": 60450
},
{
"epoch": 17.617974255926377,
"grad_norm": 0.3848313093185425,
"learning_rate": 0.0003888146853146853,
"loss": 3.269,
"step": 60500
},
{
"epoch": 17.6325353835401,
"grad_norm": 0.38773873448371887,
"learning_rate": 0.00038863986013986014,
"loss": 3.2803,
"step": 60550
},
{
"epoch": 17.647096511153823,
"grad_norm": 0.3747732639312744,
"learning_rate": 0.00038846503496503494,
"loss": 3.2612,
"step": 60600
},
{
"epoch": 17.661657638767547,
"grad_norm": 0.380312442779541,
"learning_rate": 0.0003882902097902098,
"loss": 3.2803,
"step": 60650
},
{
"epoch": 17.676218766381268,
"grad_norm": 0.41522619128227234,
"learning_rate": 0.00038811538461538454,
"loss": 3.2678,
"step": 60700
},
{
"epoch": 17.690779893994993,
"grad_norm": 0.4010021984577179,
"learning_rate": 0.0003879405594405594,
"loss": 3.2548,
"step": 60750
},
{
"epoch": 17.705341021608714,
"grad_norm": 0.356964647769928,
"learning_rate": 0.0003877657342657342,
"loss": 3.2744,
"step": 60800
},
{
"epoch": 17.719902149222435,
"grad_norm": 0.3881632685661316,
"learning_rate": 0.00038759090909090905,
"loss": 3.2672,
"step": 60850
},
{
"epoch": 17.73446327683616,
"grad_norm": 0.39139822125434875,
"learning_rate": 0.00038741608391608384,
"loss": 3.2709,
"step": 60900
},
{
"epoch": 17.74902440444988,
"grad_norm": 0.3625246286392212,
"learning_rate": 0.0003872412587412587,
"loss": 3.2738,
"step": 60950
},
{
"epoch": 17.763585532063605,
"grad_norm": 0.3550924062728882,
"learning_rate": 0.00038706643356643355,
"loss": 3.2611,
"step": 61000
},
{
"epoch": 17.763585532063605,
"eval_accuracy": 0.3728541591181676,
"eval_loss": 3.5407886505126953,
"eval_runtime": 179.6406,
"eval_samples_per_second": 92.663,
"eval_steps_per_second": 5.795,
"step": 61000
},
{
"epoch": 17.778146659677326,
"grad_norm": 0.36368635296821594,
"learning_rate": 0.00038689160839160835,
"loss": 3.2716,
"step": 61050
},
{
"epoch": 17.792707787291047,
"grad_norm": 0.3799475431442261,
"learning_rate": 0.0003867167832167832,
"loss": 3.2733,
"step": 61100
},
{
"epoch": 17.80726891490477,
"grad_norm": 0.3661085069179535,
"learning_rate": 0.000386541958041958,
"loss": 3.282,
"step": 61150
},
{
"epoch": 17.821830042518492,
"grad_norm": 0.3987153470516205,
"learning_rate": 0.00038636713286713286,
"loss": 3.285,
"step": 61200
},
{
"epoch": 17.836391170132217,
"grad_norm": 0.39087626338005066,
"learning_rate": 0.00038619230769230766,
"loss": 3.2701,
"step": 61250
},
{
"epoch": 17.850952297745938,
"grad_norm": 0.36785832047462463,
"learning_rate": 0.0003860174825174825,
"loss": 3.275,
"step": 61300
},
{
"epoch": 17.86551342535966,
"grad_norm": 0.37993091344833374,
"learning_rate": 0.0003858426573426573,
"loss": 3.2701,
"step": 61350
},
{
"epoch": 17.880074552973383,
"grad_norm": 0.38080283999443054,
"learning_rate": 0.00038566783216783217,
"loss": 3.2976,
"step": 61400
},
{
"epoch": 17.894635680587104,
"grad_norm": 0.3847339153289795,
"learning_rate": 0.0003854930069930069,
"loss": 3.2779,
"step": 61450
},
{
"epoch": 17.90919680820083,
"grad_norm": 0.37311482429504395,
"learning_rate": 0.00038531818181818177,
"loss": 3.2877,
"step": 61500
},
{
"epoch": 17.92375793581455,
"grad_norm": 0.34330421686172485,
"learning_rate": 0.00038514335664335657,
"loss": 3.2893,
"step": 61550
},
{
"epoch": 17.93831906342827,
"grad_norm": 0.3885880708694458,
"learning_rate": 0.0003849685314685314,
"loss": 3.2828,
"step": 61600
},
{
"epoch": 17.952880191041995,
"grad_norm": 0.3785800635814667,
"learning_rate": 0.0003847937062937062,
"loss": 3.2855,
"step": 61650
},
{
"epoch": 17.967441318655716,
"grad_norm": 0.3802858293056488,
"learning_rate": 0.0003846188811188811,
"loss": 3.28,
"step": 61700
},
{
"epoch": 17.98200244626944,
"grad_norm": 0.3743467628955841,
"learning_rate": 0.00038444405594405593,
"loss": 3.2796,
"step": 61750
},
{
"epoch": 17.99656357388316,
"grad_norm": 0.37794119119644165,
"learning_rate": 0.00038426923076923073,
"loss": 3.2935,
"step": 61800
},
{
"epoch": 18.01106645698643,
"grad_norm": 0.3793052136898041,
"learning_rate": 0.0003840944055944056,
"loss": 3.1983,
"step": 61850
},
{
"epoch": 18.02562758460015,
"grad_norm": 0.3796166777610779,
"learning_rate": 0.0003839195804195804,
"loss": 3.1601,
"step": 61900
},
{
"epoch": 18.040188712213872,
"grad_norm": 0.37116897106170654,
"learning_rate": 0.00038374475524475523,
"loss": 3.1751,
"step": 61950
},
{
"epoch": 18.054749839827597,
"grad_norm": 0.3894861340522766,
"learning_rate": 0.00038356993006993003,
"loss": 3.1826,
"step": 62000
},
{
"epoch": 18.054749839827597,
"eval_accuracy": 0.37210540287177235,
"eval_loss": 3.5559823513031006,
"eval_runtime": 179.6722,
"eval_samples_per_second": 92.647,
"eval_steps_per_second": 5.794,
"step": 62000
},
{
"epoch": 18.069310967441318,
"grad_norm": 0.3755824863910675,
"learning_rate": 0.0003833951048951049,
"loss": 3.193,
"step": 62050
},
{
"epoch": 18.083872095055042,
"grad_norm": 0.3850991725921631,
"learning_rate": 0.0003832202797202797,
"loss": 3.1883,
"step": 62100
},
{
"epoch": 18.098433222668763,
"grad_norm": 0.4030168950557709,
"learning_rate": 0.00038304545454545454,
"loss": 3.2046,
"step": 62150
},
{
"epoch": 18.112994350282484,
"grad_norm": 0.3849511742591858,
"learning_rate": 0.0003828706293706293,
"loss": 3.1879,
"step": 62200
},
{
"epoch": 18.12755547789621,
"grad_norm": 0.3749910593032837,
"learning_rate": 0.00038269580419580414,
"loss": 3.2002,
"step": 62250
},
{
"epoch": 18.14211660550993,
"grad_norm": 0.4127649962902069,
"learning_rate": 0.00038252097902097894,
"loss": 3.2016,
"step": 62300
},
{
"epoch": 18.156677733123654,
"grad_norm": 0.38885459303855896,
"learning_rate": 0.0003823461538461538,
"loss": 3.199,
"step": 62350
},
{
"epoch": 18.171238860737375,
"grad_norm": 0.35722509026527405,
"learning_rate": 0.00038217132867132865,
"loss": 3.2083,
"step": 62400
},
{
"epoch": 18.185799988351096,
"grad_norm": 0.4163343012332916,
"learning_rate": 0.00038199650349650345,
"loss": 3.2205,
"step": 62450
},
{
"epoch": 18.20036111596482,
"grad_norm": 0.3997627794742584,
"learning_rate": 0.0003818216783216783,
"loss": 3.1985,
"step": 62500
},
{
"epoch": 18.214922243578542,
"grad_norm": 0.37067654728889465,
"learning_rate": 0.0003816468531468531,
"loss": 3.2073,
"step": 62550
},
{
"epoch": 18.229483371192266,
"grad_norm": 0.37254971265792847,
"learning_rate": 0.00038147202797202796,
"loss": 3.2324,
"step": 62600
},
{
"epoch": 18.244044498805987,
"grad_norm": 0.3757394552230835,
"learning_rate": 0.00038129720279720276,
"loss": 3.2225,
"step": 62650
},
{
"epoch": 18.25860562641971,
"grad_norm": 0.39608684182167053,
"learning_rate": 0.0003811223776223776,
"loss": 3.2245,
"step": 62700
},
{
"epoch": 18.273166754033433,
"grad_norm": 0.3775257468223572,
"learning_rate": 0.0003809475524475524,
"loss": 3.2271,
"step": 62750
},
{
"epoch": 18.287727881647154,
"grad_norm": 0.3755112290382385,
"learning_rate": 0.00038077272727272726,
"loss": 3.2322,
"step": 62800
},
{
"epoch": 18.30228900926088,
"grad_norm": 0.3830290138721466,
"learning_rate": 0.00038059790209790206,
"loss": 3.2307,
"step": 62850
},
{
"epoch": 18.3168501368746,
"grad_norm": 0.3844941258430481,
"learning_rate": 0.0003804230769230769,
"loss": 3.228,
"step": 62900
},
{
"epoch": 18.33141126448832,
"grad_norm": 0.3822082579135895,
"learning_rate": 0.00038024825174825166,
"loss": 3.2339,
"step": 62950
},
{
"epoch": 18.345972392102045,
"grad_norm": 0.3925999402999878,
"learning_rate": 0.0003800734265734265,
"loss": 3.2269,
"step": 63000
},
{
"epoch": 18.345972392102045,
"eval_accuracy": 0.3722970299217997,
"eval_loss": 3.5539674758911133,
"eval_runtime": 179.5711,
"eval_samples_per_second": 92.699,
"eval_steps_per_second": 5.797,
"step": 63000
},
{
"epoch": 18.360533519715766,
"grad_norm": 0.37486016750335693,
"learning_rate": 0.0003798986013986013,
"loss": 3.239,
"step": 63050
},
{
"epoch": 18.37509464732949,
"grad_norm": 0.36448609828948975,
"learning_rate": 0.00037972377622377617,
"loss": 3.2318,
"step": 63100
},
{
"epoch": 18.38965577494321,
"grad_norm": 0.4011726677417755,
"learning_rate": 0.000379548951048951,
"loss": 3.2378,
"step": 63150
},
{
"epoch": 18.404216902556932,
"grad_norm": 0.3729187846183777,
"learning_rate": 0.0003793741258741258,
"loss": 3.2354,
"step": 63200
},
{
"epoch": 18.418778030170657,
"grad_norm": 0.3895740807056427,
"learning_rate": 0.0003791993006993007,
"loss": 3.2388,
"step": 63250
},
{
"epoch": 18.433339157784378,
"grad_norm": 0.38216131925582886,
"learning_rate": 0.0003790244755244755,
"loss": 3.2369,
"step": 63300
},
{
"epoch": 18.447900285398102,
"grad_norm": 0.4059195816516876,
"learning_rate": 0.00037884965034965033,
"loss": 3.2533,
"step": 63350
},
{
"epoch": 18.462461413011823,
"grad_norm": 0.4038628041744232,
"learning_rate": 0.00037867482517482513,
"loss": 3.2431,
"step": 63400
},
{
"epoch": 18.477022540625548,
"grad_norm": 0.3941684067249298,
"learning_rate": 0.0003785,
"loss": 3.2319,
"step": 63450
},
{
"epoch": 18.49158366823927,
"grad_norm": 0.3561994433403015,
"learning_rate": 0.0003783251748251748,
"loss": 3.2426,
"step": 63500
},
{
"epoch": 18.50614479585299,
"grad_norm": 0.3846251666545868,
"learning_rate": 0.00037815034965034964,
"loss": 3.2478,
"step": 63550
},
{
"epoch": 18.520705923466714,
"grad_norm": 0.39818981289863586,
"learning_rate": 0.00037797552447552444,
"loss": 3.2479,
"step": 63600
},
{
"epoch": 18.535267051080435,
"grad_norm": 0.406084269285202,
"learning_rate": 0.0003778006993006993,
"loss": 3.2647,
"step": 63650
},
{
"epoch": 18.54982817869416,
"grad_norm": 0.37038683891296387,
"learning_rate": 0.00037762587412587404,
"loss": 3.2626,
"step": 63700
},
{
"epoch": 18.56438930630788,
"grad_norm": 0.37109628319740295,
"learning_rate": 0.0003774510489510489,
"loss": 3.2504,
"step": 63750
},
{
"epoch": 18.5789504339216,
"grad_norm": 0.38222911953926086,
"learning_rate": 0.0003772762237762238,
"loss": 3.2477,
"step": 63800
},
{
"epoch": 18.593511561535326,
"grad_norm": 0.37439829111099243,
"learning_rate": 0.00037710139860139854,
"loss": 3.2621,
"step": 63850
},
{
"epoch": 18.608072689149047,
"grad_norm": 0.3829028606414795,
"learning_rate": 0.0003769265734265734,
"loss": 3.2523,
"step": 63900
},
{
"epoch": 18.62263381676277,
"grad_norm": 0.3815508782863617,
"learning_rate": 0.0003767517482517482,
"loss": 3.2567,
"step": 63950
},
{
"epoch": 18.637194944376493,
"grad_norm": 0.39528024196624756,
"learning_rate": 0.00037657692307692305,
"loss": 3.2488,
"step": 64000
},
{
"epoch": 18.637194944376493,
"eval_accuracy": 0.3730161603911355,
"eval_loss": 3.540642261505127,
"eval_runtime": 179.6505,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.795,
"step": 64000
},
{
"epoch": 18.651756071990214,
"grad_norm": 0.3818016052246094,
"learning_rate": 0.00037640209790209785,
"loss": 3.2776,
"step": 64050
},
{
"epoch": 18.666317199603938,
"grad_norm": 0.3714297413825989,
"learning_rate": 0.0003762272727272727,
"loss": 3.2655,
"step": 64100
},
{
"epoch": 18.68087832721766,
"grad_norm": 0.38805392384529114,
"learning_rate": 0.0003760524475524475,
"loss": 3.2512,
"step": 64150
},
{
"epoch": 18.695439454831384,
"grad_norm": 0.352531373500824,
"learning_rate": 0.00037587762237762236,
"loss": 3.2509,
"step": 64200
},
{
"epoch": 18.710000582445105,
"grad_norm": 0.38840219378471375,
"learning_rate": 0.00037570279720279716,
"loss": 3.2707,
"step": 64250
},
{
"epoch": 18.724561710058826,
"grad_norm": 0.3757995665073395,
"learning_rate": 0.000375527972027972,
"loss": 3.2662,
"step": 64300
},
{
"epoch": 18.73912283767255,
"grad_norm": 0.38328617811203003,
"learning_rate": 0.0003753531468531468,
"loss": 3.2576,
"step": 64350
},
{
"epoch": 18.75368396528627,
"grad_norm": 0.3754730820655823,
"learning_rate": 0.00037517832167832167,
"loss": 3.2666,
"step": 64400
},
{
"epoch": 18.768245092899996,
"grad_norm": 0.39446189999580383,
"learning_rate": 0.0003750034965034965,
"loss": 3.2602,
"step": 64450
},
{
"epoch": 18.782806220513717,
"grad_norm": 0.3993825316429138,
"learning_rate": 0.00037482867132867127,
"loss": 3.2561,
"step": 64500
},
{
"epoch": 18.797367348127437,
"grad_norm": 0.39360371232032776,
"learning_rate": 0.0003746538461538462,
"loss": 3.2614,
"step": 64550
},
{
"epoch": 18.811928475741162,
"grad_norm": 0.376064270734787,
"learning_rate": 0.0003744790209790209,
"loss": 3.2637,
"step": 64600
},
{
"epoch": 18.826489603354883,
"grad_norm": 0.41290155053138733,
"learning_rate": 0.0003743041958041958,
"loss": 3.2659,
"step": 64650
},
{
"epoch": 18.841050730968607,
"grad_norm": 0.3746908903121948,
"learning_rate": 0.0003741293706293706,
"loss": 3.2676,
"step": 64700
},
{
"epoch": 18.85561185858233,
"grad_norm": 0.5334339141845703,
"learning_rate": 0.0003739545454545454,
"loss": 3.2664,
"step": 64750
},
{
"epoch": 18.87017298619605,
"grad_norm": 0.345940500497818,
"learning_rate": 0.0003737797202797202,
"loss": 3.2723,
"step": 64800
},
{
"epoch": 18.884734113809774,
"grad_norm": 0.37264400720596313,
"learning_rate": 0.0003736048951048951,
"loss": 3.2669,
"step": 64850
},
{
"epoch": 18.899295241423495,
"grad_norm": 0.3550002872943878,
"learning_rate": 0.0003734300699300699,
"loss": 3.2732,
"step": 64900
},
{
"epoch": 18.91385636903722,
"grad_norm": 0.3796321153640747,
"learning_rate": 0.00037325524475524473,
"loss": 3.2682,
"step": 64950
},
{
"epoch": 18.92841749665094,
"grad_norm": 0.3669205904006958,
"learning_rate": 0.00037308041958041953,
"loss": 3.274,
"step": 65000
},
{
"epoch": 18.92841749665094,
"eval_accuracy": 0.3733553285134232,
"eval_loss": 3.5314111709594727,
"eval_runtime": 179.6545,
"eval_samples_per_second": 92.656,
"eval_steps_per_second": 5.794,
"step": 65000
},
{
"epoch": 18.94297862426466,
"grad_norm": 0.37981703877449036,
"learning_rate": 0.0003729055944055944,
"loss": 3.2619,
"step": 65050
},
{
"epoch": 18.957539751878386,
"grad_norm": 0.39228788018226624,
"learning_rate": 0.0003727307692307692,
"loss": 3.2697,
"step": 65100
},
{
"epoch": 18.972100879492107,
"grad_norm": 0.36584481596946716,
"learning_rate": 0.00037255594405594404,
"loss": 3.2772,
"step": 65150
},
{
"epoch": 18.98666200710583,
"grad_norm": 0.4020899832248688,
"learning_rate": 0.0003723811188811189,
"loss": 3.2603,
"step": 65200
},
{
"epoch": 19.001164890209097,
"grad_norm": 0.41100603342056274,
"learning_rate": 0.00037220629370629364,
"loss": 3.268,
"step": 65250
},
{
"epoch": 19.01572601782282,
"grad_norm": 0.40301769971847534,
"learning_rate": 0.00037203146853146855,
"loss": 3.1662,
"step": 65300
},
{
"epoch": 19.030287145436542,
"grad_norm": 0.37851014733314514,
"learning_rate": 0.0003718566433566433,
"loss": 3.1681,
"step": 65350
},
{
"epoch": 19.044848273050263,
"grad_norm": 0.38446855545043945,
"learning_rate": 0.00037168181818181815,
"loss": 3.1762,
"step": 65400
},
{
"epoch": 19.059409400663988,
"grad_norm": 0.39558929204940796,
"learning_rate": 0.00037150699300699295,
"loss": 3.1821,
"step": 65450
},
{
"epoch": 19.07397052827771,
"grad_norm": 0.403755247592926,
"learning_rate": 0.0003713321678321678,
"loss": 3.1692,
"step": 65500
},
{
"epoch": 19.088531655891433,
"grad_norm": 0.3988681733608246,
"learning_rate": 0.0003711573426573426,
"loss": 3.1838,
"step": 65550
},
{
"epoch": 19.103092783505154,
"grad_norm": 0.3952530324459076,
"learning_rate": 0.00037098251748251746,
"loss": 3.1991,
"step": 65600
},
{
"epoch": 19.11765391111888,
"grad_norm": 0.413113534450531,
"learning_rate": 0.00037080769230769226,
"loss": 3.1934,
"step": 65650
},
{
"epoch": 19.1322150387326,
"grad_norm": 0.3580508232116699,
"learning_rate": 0.0003706328671328671,
"loss": 3.1843,
"step": 65700
},
{
"epoch": 19.14677616634632,
"grad_norm": 0.36587846279144287,
"learning_rate": 0.0003704580419580419,
"loss": 3.1922,
"step": 65750
},
{
"epoch": 19.161337293960045,
"grad_norm": 0.3886634409427643,
"learning_rate": 0.00037028321678321676,
"loss": 3.199,
"step": 65800
},
{
"epoch": 19.175898421573766,
"grad_norm": 0.37114614248275757,
"learning_rate": 0.0003701083916083916,
"loss": 3.203,
"step": 65850
},
{
"epoch": 19.19045954918749,
"grad_norm": 0.37493696808815,
"learning_rate": 0.0003699335664335664,
"loss": 3.194,
"step": 65900
},
{
"epoch": 19.20502067680121,
"grad_norm": 0.37686535716056824,
"learning_rate": 0.00036975874125874127,
"loss": 3.1928,
"step": 65950
},
{
"epoch": 19.219581804414933,
"grad_norm": 0.39175036549568176,
"learning_rate": 0.00036958391608391607,
"loss": 3.2044,
"step": 66000
},
{
"epoch": 19.219581804414933,
"eval_accuracy": 0.3724803100267032,
"eval_loss": 3.5525450706481934,
"eval_runtime": 179.6933,
"eval_samples_per_second": 92.636,
"eval_steps_per_second": 5.793,
"step": 66000
},
{
"epoch": 19.234142932028657,
"grad_norm": 0.41886278986930847,
"learning_rate": 0.0003694090909090909,
"loss": 3.2055,
"step": 66050
},
{
"epoch": 19.248704059642378,
"grad_norm": 0.3687989115715027,
"learning_rate": 0.00036923426573426567,
"loss": 3.2055,
"step": 66100
},
{
"epoch": 19.263265187256103,
"grad_norm": 0.37460798025131226,
"learning_rate": 0.0003690594405594405,
"loss": 3.2179,
"step": 66150
},
{
"epoch": 19.277826314869824,
"grad_norm": 0.3773028552532196,
"learning_rate": 0.0003688846153846153,
"loss": 3.2105,
"step": 66200
},
{
"epoch": 19.292387442483545,
"grad_norm": 0.40406858921051025,
"learning_rate": 0.0003687097902097902,
"loss": 3.2131,
"step": 66250
},
{
"epoch": 19.30694857009727,
"grad_norm": 0.42478838562965393,
"learning_rate": 0.000368534965034965,
"loss": 3.223,
"step": 66300
},
{
"epoch": 19.32150969771099,
"grad_norm": 0.3828580379486084,
"learning_rate": 0.00036836013986013983,
"loss": 3.2216,
"step": 66350
},
{
"epoch": 19.336070825324715,
"grad_norm": 0.3864606022834778,
"learning_rate": 0.00036818531468531463,
"loss": 3.2219,
"step": 66400
},
{
"epoch": 19.350631952938436,
"grad_norm": 0.3833235502243042,
"learning_rate": 0.0003680104895104895,
"loss": 3.2349,
"step": 66450
},
{
"epoch": 19.365193080552157,
"grad_norm": 0.37136217951774597,
"learning_rate": 0.0003678356643356643,
"loss": 3.2145,
"step": 66500
},
{
"epoch": 19.37975420816588,
"grad_norm": 0.39863675832748413,
"learning_rate": 0.00036766083916083914,
"loss": 3.234,
"step": 66550
},
{
"epoch": 19.394315335779602,
"grad_norm": 0.4460127353668213,
"learning_rate": 0.000367486013986014,
"loss": 3.2275,
"step": 66600
},
{
"epoch": 19.408876463393327,
"grad_norm": 0.3718584179878235,
"learning_rate": 0.0003673111888111888,
"loss": 3.2298,
"step": 66650
},
{
"epoch": 19.423437591007048,
"grad_norm": 0.4181348383426666,
"learning_rate": 0.00036713636363636365,
"loss": 3.2253,
"step": 66700
},
{
"epoch": 19.43799871862077,
"grad_norm": 0.42995330691337585,
"learning_rate": 0.00036696153846153844,
"loss": 3.2321,
"step": 66750
},
{
"epoch": 19.452559846234493,
"grad_norm": 0.6013868451118469,
"learning_rate": 0.0003667867132867133,
"loss": 3.2433,
"step": 66800
},
{
"epoch": 19.467120973848214,
"grad_norm": 0.40891793370246887,
"learning_rate": 0.00036661188811188804,
"loss": 3.2243,
"step": 66850
},
{
"epoch": 19.48168210146194,
"grad_norm": 0.38710060715675354,
"learning_rate": 0.0003664370629370629,
"loss": 3.2262,
"step": 66900
},
{
"epoch": 19.49624322907566,
"grad_norm": 0.39001980423927307,
"learning_rate": 0.0003662622377622377,
"loss": 3.2335,
"step": 66950
},
{
"epoch": 19.51080435668938,
"grad_norm": 0.3877623379230499,
"learning_rate": 0.00036608741258741255,
"loss": 3.2513,
"step": 67000
},
{
"epoch": 19.51080435668938,
"eval_accuracy": 0.37284933905126505,
"eval_loss": 3.5439090728759766,
"eval_runtime": 179.7018,
"eval_samples_per_second": 92.631,
"eval_steps_per_second": 5.793,
"step": 67000
},
{
"epoch": 19.525365484303105,
"grad_norm": 0.4291560649871826,
"learning_rate": 0.00036591258741258735,
"loss": 3.2392,
"step": 67050
},
{
"epoch": 19.539926611916826,
"grad_norm": 0.3970586657524109,
"learning_rate": 0.0003657377622377622,
"loss": 3.24,
"step": 67100
},
{
"epoch": 19.55448773953055,
"grad_norm": 0.41603973507881165,
"learning_rate": 0.000365562937062937,
"loss": 3.2329,
"step": 67150
},
{
"epoch": 19.56904886714427,
"grad_norm": 0.3834895193576813,
"learning_rate": 0.00036538811188811186,
"loss": 3.2372,
"step": 67200
},
{
"epoch": 19.583609994757992,
"grad_norm": 0.3993031680583954,
"learning_rate": 0.0003652132867132867,
"loss": 3.24,
"step": 67250
},
{
"epoch": 19.598171122371717,
"grad_norm": 0.4024580419063568,
"learning_rate": 0.0003650384615384615,
"loss": 3.2366,
"step": 67300
},
{
"epoch": 19.612732249985438,
"grad_norm": 0.38864803314208984,
"learning_rate": 0.00036486363636363637,
"loss": 3.2278,
"step": 67350
},
{
"epoch": 19.627293377599162,
"grad_norm": 0.41746726632118225,
"learning_rate": 0.00036468881118881117,
"loss": 3.2398,
"step": 67400
},
{
"epoch": 19.641854505212883,
"grad_norm": 0.4189150035381317,
"learning_rate": 0.000364513986013986,
"loss": 3.2485,
"step": 67450
},
{
"epoch": 19.656415632826604,
"grad_norm": 0.39212143421173096,
"learning_rate": 0.0003643391608391608,
"loss": 3.237,
"step": 67500
},
{
"epoch": 19.67097676044033,
"grad_norm": 0.3836154043674469,
"learning_rate": 0.0003641643356643357,
"loss": 3.2504,
"step": 67550
},
{
"epoch": 19.68553788805405,
"grad_norm": 0.40508440136909485,
"learning_rate": 0.0003639895104895104,
"loss": 3.244,
"step": 67600
},
{
"epoch": 19.700099015667774,
"grad_norm": 0.42780637741088867,
"learning_rate": 0.0003638146853146853,
"loss": 3.2576,
"step": 67650
},
{
"epoch": 19.714660143281495,
"grad_norm": 0.3892393112182617,
"learning_rate": 0.00036363986013986007,
"loss": 3.2437,
"step": 67700
},
{
"epoch": 19.729221270895216,
"grad_norm": 0.39174842834472656,
"learning_rate": 0.0003634650349650349,
"loss": 3.2477,
"step": 67750
},
{
"epoch": 19.74378239850894,
"grad_norm": 0.37506580352783203,
"learning_rate": 0.0003632902097902097,
"loss": 3.2437,
"step": 67800
},
{
"epoch": 19.758343526122662,
"grad_norm": 0.40734153985977173,
"learning_rate": 0.0003631153846153846,
"loss": 3.2476,
"step": 67850
},
{
"epoch": 19.772904653736386,
"grad_norm": 0.38541653752326965,
"learning_rate": 0.00036294055944055943,
"loss": 3.2599,
"step": 67900
},
{
"epoch": 19.787465781350107,
"grad_norm": 0.3902021050453186,
"learning_rate": 0.00036276573426573423,
"loss": 3.2518,
"step": 67950
},
{
"epoch": 19.802026908963832,
"grad_norm": 0.36354759335517883,
"learning_rate": 0.0003625909090909091,
"loss": 3.2637,
"step": 68000
},
{
"epoch": 19.802026908963832,
"eval_accuracy": 0.37342245676223645,
"eval_loss": 3.5373194217681885,
"eval_runtime": 179.7108,
"eval_samples_per_second": 92.627,
"eval_steps_per_second": 5.793,
"step": 68000
},
{
"epoch": 19.816588036577553,
"grad_norm": 0.4016396403312683,
"learning_rate": 0.0003624160839160839,
"loss": 3.2542,
"step": 68050
},
{
"epoch": 19.831149164191274,
"grad_norm": 0.3677242398262024,
"learning_rate": 0.00036224125874125874,
"loss": 3.2584,
"step": 68100
},
{
"epoch": 19.845710291805,
"grad_norm": 0.39884600043296814,
"learning_rate": 0.00036206643356643354,
"loss": 3.2544,
"step": 68150
},
{
"epoch": 19.86027141941872,
"grad_norm": 0.38445019721984863,
"learning_rate": 0.0003618916083916084,
"loss": 3.2553,
"step": 68200
},
{
"epoch": 19.874832547032444,
"grad_norm": 0.42044463753700256,
"learning_rate": 0.0003617167832167832,
"loss": 3.2556,
"step": 68250
},
{
"epoch": 19.889393674646165,
"grad_norm": 0.3916245400905609,
"learning_rate": 0.00036154195804195805,
"loss": 3.2487,
"step": 68300
},
{
"epoch": 19.903954802259886,
"grad_norm": 0.3802781105041504,
"learning_rate": 0.0003613671328671328,
"loss": 3.2621,
"step": 68350
},
{
"epoch": 19.91851592987361,
"grad_norm": 0.3673400282859802,
"learning_rate": 0.00036119230769230765,
"loss": 3.2533,
"step": 68400
},
{
"epoch": 19.93307705748733,
"grad_norm": 0.3766266703605652,
"learning_rate": 0.00036101748251748245,
"loss": 3.2592,
"step": 68450
},
{
"epoch": 19.947638185101056,
"grad_norm": 0.3740299642086029,
"learning_rate": 0.0003608426573426573,
"loss": 3.2632,
"step": 68500
},
{
"epoch": 19.962199312714777,
"grad_norm": 0.4291253387928009,
"learning_rate": 0.0003606678321678321,
"loss": 3.2554,
"step": 68550
},
{
"epoch": 19.976760440328498,
"grad_norm": 0.4110807478427887,
"learning_rate": 0.00036049300699300696,
"loss": 3.2576,
"step": 68600
},
{
"epoch": 19.991321567942222,
"grad_norm": 0.37780332565307617,
"learning_rate": 0.0003603181818181818,
"loss": 3.2641,
"step": 68650
},
{
"epoch": 20.005824451045488,
"grad_norm": 0.392168253660202,
"learning_rate": 0.0003601433566433566,
"loss": 3.216,
"step": 68700
},
{
"epoch": 20.020385578659212,
"grad_norm": 0.3862382471561432,
"learning_rate": 0.00035996853146853146,
"loss": 3.1516,
"step": 68750
},
{
"epoch": 20.034946706272933,
"grad_norm": 0.39903658628463745,
"learning_rate": 0.00035979370629370626,
"loss": 3.1669,
"step": 68800
},
{
"epoch": 20.049507833886658,
"grad_norm": 0.378799706697464,
"learning_rate": 0.0003596188811188811,
"loss": 3.1626,
"step": 68850
},
{
"epoch": 20.06406896150038,
"grad_norm": 0.408006489276886,
"learning_rate": 0.0003594440559440559,
"loss": 3.1738,
"step": 68900
},
{
"epoch": 20.0786300891141,
"grad_norm": 0.3917410373687744,
"learning_rate": 0.00035926923076923077,
"loss": 3.1682,
"step": 68950
},
{
"epoch": 20.093191216727824,
"grad_norm": 0.3857029378414154,
"learning_rate": 0.00035909440559440557,
"loss": 3.1733,
"step": 69000
},
{
"epoch": 20.093191216727824,
"eval_accuracy": 0.3726052790783468,
"eval_loss": 3.5531675815582275,
"eval_runtime": 179.7349,
"eval_samples_per_second": 92.614,
"eval_steps_per_second": 5.792,
"step": 69000
},
{
"epoch": 20.107752344341545,
"grad_norm": 0.4075562059879303,
"learning_rate": 0.0003589195804195804,
"loss": 3.1648,
"step": 69050
},
{
"epoch": 20.12231347195527,
"grad_norm": 0.3684654235839844,
"learning_rate": 0.00035874475524475517,
"loss": 3.1868,
"step": 69100
},
{
"epoch": 20.13687459956899,
"grad_norm": 0.3954562246799469,
"learning_rate": 0.00035856993006993,
"loss": 3.1835,
"step": 69150
},
{
"epoch": 20.15143572718271,
"grad_norm": 0.40872836112976074,
"learning_rate": 0.0003583951048951048,
"loss": 3.186,
"step": 69200
},
{
"epoch": 20.165996854796436,
"grad_norm": 0.3897246718406677,
"learning_rate": 0.0003582202797202797,
"loss": 3.177,
"step": 69250
},
{
"epoch": 20.180557982410157,
"grad_norm": 0.40773889422416687,
"learning_rate": 0.00035804545454545453,
"loss": 3.1861,
"step": 69300
},
{
"epoch": 20.19511911002388,
"grad_norm": 0.40160179138183594,
"learning_rate": 0.00035787062937062933,
"loss": 3.199,
"step": 69350
},
{
"epoch": 20.209680237637603,
"grad_norm": 0.3950497508049011,
"learning_rate": 0.0003576958041958042,
"loss": 3.193,
"step": 69400
},
{
"epoch": 20.224241365251324,
"grad_norm": 0.389279842376709,
"learning_rate": 0.000357520979020979,
"loss": 3.1968,
"step": 69450
},
{
"epoch": 20.238802492865048,
"grad_norm": 0.39602160453796387,
"learning_rate": 0.00035734615384615384,
"loss": 3.1951,
"step": 69500
},
{
"epoch": 20.25336362047877,
"grad_norm": 0.3904061019420624,
"learning_rate": 0.00035717132867132864,
"loss": 3.2021,
"step": 69550
},
{
"epoch": 20.267924748092494,
"grad_norm": 0.38812994956970215,
"learning_rate": 0.0003569965034965035,
"loss": 3.1965,
"step": 69600
},
{
"epoch": 20.282485875706215,
"grad_norm": 0.389804482460022,
"learning_rate": 0.0003568216783216783,
"loss": 3.2068,
"step": 69650
},
{
"epoch": 20.297047003319935,
"grad_norm": 0.37710636854171753,
"learning_rate": 0.00035664685314685314,
"loss": 3.2118,
"step": 69700
},
{
"epoch": 20.31160813093366,
"grad_norm": 0.37913864850997925,
"learning_rate": 0.00035647202797202794,
"loss": 3.2204,
"step": 69750
},
{
"epoch": 20.32616925854738,
"grad_norm": 0.37935346364974976,
"learning_rate": 0.0003562972027972028,
"loss": 3.2166,
"step": 69800
},
{
"epoch": 20.340730386161106,
"grad_norm": 0.36167511343955994,
"learning_rate": 0.00035612237762237754,
"loss": 3.2164,
"step": 69850
},
{
"epoch": 20.355291513774826,
"grad_norm": 0.37107759714126587,
"learning_rate": 0.0003559475524475524,
"loss": 3.2067,
"step": 69900
},
{
"epoch": 20.369852641388547,
"grad_norm": 0.42005228996276855,
"learning_rate": 0.0003557727272727272,
"loss": 3.2016,
"step": 69950
},
{
"epoch": 20.384413769002272,
"grad_norm": 0.35222989320755005,
"learning_rate": 0.00035559790209790205,
"loss": 3.2166,
"step": 70000
},
{
"epoch": 20.384413769002272,
"eval_accuracy": 0.3728020788830988,
"eval_loss": 3.5458731651306152,
"eval_runtime": 179.7192,
"eval_samples_per_second": 92.622,
"eval_steps_per_second": 5.792,
"step": 70000
},
{
"epoch": 20.398974896615993,
"grad_norm": 0.40135788917541504,
"learning_rate": 0.0003554230769230769,
"loss": 3.2174,
"step": 70050
},
{
"epoch": 20.413536024229717,
"grad_norm": 0.3890798091888428,
"learning_rate": 0.0003552482517482517,
"loss": 3.2196,
"step": 70100
},
{
"epoch": 20.42809715184344,
"grad_norm": 0.41307833790779114,
"learning_rate": 0.00035507342657342656,
"loss": 3.2289,
"step": 70150
},
{
"epoch": 20.442658279457163,
"grad_norm": 0.4063875079154968,
"learning_rate": 0.00035489860139860136,
"loss": 3.2129,
"step": 70200
},
{
"epoch": 20.457219407070884,
"grad_norm": 0.4029952883720398,
"learning_rate": 0.0003547237762237762,
"loss": 3.2108,
"step": 70250
},
{
"epoch": 20.471780534684605,
"grad_norm": 0.38970711827278137,
"learning_rate": 0.000354548951048951,
"loss": 3.2282,
"step": 70300
},
{
"epoch": 20.48634166229833,
"grad_norm": 0.40323925018310547,
"learning_rate": 0.00035437412587412587,
"loss": 3.2214,
"step": 70350
},
{
"epoch": 20.50090278991205,
"grad_norm": 0.3895185887813568,
"learning_rate": 0.00035419930069930067,
"loss": 3.2082,
"step": 70400
},
{
"epoch": 20.51546391752577,
"grad_norm": 0.4097273051738739,
"learning_rate": 0.0003540244755244755,
"loss": 3.2183,
"step": 70450
},
{
"epoch": 20.530025045139496,
"grad_norm": 0.4041067361831665,
"learning_rate": 0.0003538496503496503,
"loss": 3.2263,
"step": 70500
},
{
"epoch": 20.544586172753217,
"grad_norm": 0.36330464482307434,
"learning_rate": 0.0003536748251748252,
"loss": 3.2326,
"step": 70550
},
{
"epoch": 20.55914730036694,
"grad_norm": 0.400479257106781,
"learning_rate": 0.0003534999999999999,
"loss": 3.2113,
"step": 70600
},
{
"epoch": 20.573708427980662,
"grad_norm": 0.38448238372802734,
"learning_rate": 0.00035332517482517477,
"loss": 3.2169,
"step": 70650
},
{
"epoch": 20.588269555594387,
"grad_norm": 0.4327026307582855,
"learning_rate": 0.0003531503496503496,
"loss": 3.2317,
"step": 70700
},
{
"epoch": 20.602830683208108,
"grad_norm": 0.4079606831073761,
"learning_rate": 0.0003529755244755244,
"loss": 3.2411,
"step": 70750
},
{
"epoch": 20.61739181082183,
"grad_norm": 0.43886488676071167,
"learning_rate": 0.0003528006993006993,
"loss": 3.2325,
"step": 70800
},
{
"epoch": 20.631952938435553,
"grad_norm": 0.41194280982017517,
"learning_rate": 0.0003526258741258741,
"loss": 3.2272,
"step": 70850
},
{
"epoch": 20.646514066049274,
"grad_norm": 0.3900597393512726,
"learning_rate": 0.00035245104895104893,
"loss": 3.2366,
"step": 70900
},
{
"epoch": 20.661075193663,
"grad_norm": 0.38845738768577576,
"learning_rate": 0.00035227622377622373,
"loss": 3.2368,
"step": 70950
},
{
"epoch": 20.67563632127672,
"grad_norm": 0.39005860686302185,
"learning_rate": 0.0003521013986013986,
"loss": 3.2374,
"step": 71000
},
{
"epoch": 20.67563632127672,
"eval_accuracy": 0.3733951822373246,
"eval_loss": 3.5416312217712402,
"eval_runtime": 179.5842,
"eval_samples_per_second": 92.692,
"eval_steps_per_second": 5.797,
"step": 71000
},
{
"epoch": 20.69019744889044,
"grad_norm": 0.40957584977149963,
"learning_rate": 0.0003519265734265734,
"loss": 3.2389,
"step": 71050
},
{
"epoch": 20.704758576504165,
"grad_norm": 0.38271039724349976,
"learning_rate": 0.00035175174825174824,
"loss": 3.243,
"step": 71100
},
{
"epoch": 20.719319704117886,
"grad_norm": 0.3969852328300476,
"learning_rate": 0.00035157692307692304,
"loss": 3.2359,
"step": 71150
},
{
"epoch": 20.73388083173161,
"grad_norm": 0.4403655230998993,
"learning_rate": 0.0003514020979020979,
"loss": 3.242,
"step": 71200
},
{
"epoch": 20.74844195934533,
"grad_norm": 0.391052782535553,
"learning_rate": 0.0003512272727272727,
"loss": 3.2308,
"step": 71250
},
{
"epoch": 20.763003086959053,
"grad_norm": 0.4325944483280182,
"learning_rate": 0.00035105244755244755,
"loss": 3.2394,
"step": 71300
},
{
"epoch": 20.777564214572777,
"grad_norm": 0.3717031478881836,
"learning_rate": 0.0003508776223776223,
"loss": 3.2403,
"step": 71350
},
{
"epoch": 20.792125342186498,
"grad_norm": 0.3885386884212494,
"learning_rate": 0.00035070279720279715,
"loss": 3.2422,
"step": 71400
},
{
"epoch": 20.806686469800223,
"grad_norm": 0.37559714913368225,
"learning_rate": 0.000350527972027972,
"loss": 3.2487,
"step": 71450
},
{
"epoch": 20.821247597413944,
"grad_norm": 0.40500393509864807,
"learning_rate": 0.0003503531468531468,
"loss": 3.231,
"step": 71500
},
{
"epoch": 20.835808725027665,
"grad_norm": 0.41687142848968506,
"learning_rate": 0.00035017832167832166,
"loss": 3.2431,
"step": 71550
},
{
"epoch": 20.85036985264139,
"grad_norm": 0.3623868227005005,
"learning_rate": 0.00035000349650349645,
"loss": 3.252,
"step": 71600
},
{
"epoch": 20.86493098025511,
"grad_norm": 0.39037078619003296,
"learning_rate": 0.0003498286713286713,
"loss": 3.2456,
"step": 71650
},
{
"epoch": 20.879492107868835,
"grad_norm": 0.41543281078338623,
"learning_rate": 0.0003496538461538461,
"loss": 3.2495,
"step": 71700
},
{
"epoch": 20.894053235482556,
"grad_norm": 0.3910142183303833,
"learning_rate": 0.00034947902097902096,
"loss": 3.2402,
"step": 71750
},
{
"epoch": 20.908614363096277,
"grad_norm": 0.4314521253108978,
"learning_rate": 0.00034930419580419576,
"loss": 3.2467,
"step": 71800
},
{
"epoch": 20.92317549071,
"grad_norm": 0.3962632417678833,
"learning_rate": 0.0003491293706293706,
"loss": 3.2502,
"step": 71850
},
{
"epoch": 20.937736618323722,
"grad_norm": 0.38694316148757935,
"learning_rate": 0.0003489545454545454,
"loss": 3.2508,
"step": 71900
},
{
"epoch": 20.952297745937447,
"grad_norm": 0.39612144231796265,
"learning_rate": 0.00034877972027972027,
"loss": 3.2507,
"step": 71950
},
{
"epoch": 20.966858873551168,
"grad_norm": 0.39431706070899963,
"learning_rate": 0.00034860489510489507,
"loss": 3.2554,
"step": 72000
},
{
"epoch": 20.966858873551168,
"eval_accuracy": 0.3737660922636045,
"eval_loss": 3.531592607498169,
"eval_runtime": 179.7191,
"eval_samples_per_second": 92.622,
"eval_steps_per_second": 5.792,
"step": 72000
},
{
"epoch": 20.98142000116489,
"grad_norm": 0.3855423927307129,
"learning_rate": 0.0003484300699300699,
"loss": 3.2645,
"step": 72050
},
{
"epoch": 20.995981128778613,
"grad_norm": 0.3567679524421692,
"learning_rate": 0.0003482552447552448,
"loss": 3.2477,
"step": 72100
},
{
"epoch": 21.01048401188188,
"grad_norm": 0.4059353172779083,
"learning_rate": 0.0003480804195804195,
"loss": 3.1889,
"step": 72150
},
{
"epoch": 21.025045139495603,
"grad_norm": 0.40449172258377075,
"learning_rate": 0.0003479055944055944,
"loss": 3.1533,
"step": 72200
},
{
"epoch": 21.039606267109324,
"grad_norm": 0.3914928436279297,
"learning_rate": 0.0003477307692307692,
"loss": 3.1555,
"step": 72250
},
{
"epoch": 21.05416739472305,
"grad_norm": 0.7120218873023987,
"learning_rate": 0.00034755594405594403,
"loss": 3.1556,
"step": 72300
},
{
"epoch": 21.06872852233677,
"grad_norm": 0.4098522961139679,
"learning_rate": 0.00034738111888111883,
"loss": 3.1564,
"step": 72350
},
{
"epoch": 21.08328964995049,
"grad_norm": 0.39420628547668457,
"learning_rate": 0.0003472062937062937,
"loss": 3.1529,
"step": 72400
},
{
"epoch": 21.097850777564215,
"grad_norm": 0.4091591536998749,
"learning_rate": 0.0003470314685314685,
"loss": 3.1703,
"step": 72450
},
{
"epoch": 21.112411905177936,
"grad_norm": 0.39323386549949646,
"learning_rate": 0.00034685664335664334,
"loss": 3.1647,
"step": 72500
},
{
"epoch": 21.12697303279166,
"grad_norm": 0.3856316804885864,
"learning_rate": 0.00034668181818181814,
"loss": 3.1613,
"step": 72550
},
{
"epoch": 21.14153416040538,
"grad_norm": 0.40396738052368164,
"learning_rate": 0.000346506993006993,
"loss": 3.1678,
"step": 72600
},
{
"epoch": 21.156095288019102,
"grad_norm": 0.39266011118888855,
"learning_rate": 0.0003463321678321678,
"loss": 3.1705,
"step": 72650
},
{
"epoch": 21.170656415632827,
"grad_norm": 0.4100908935070038,
"learning_rate": 0.00034615734265734264,
"loss": 3.18,
"step": 72700
},
{
"epoch": 21.185217543246548,
"grad_norm": 0.38292568922042847,
"learning_rate": 0.0003459825174825175,
"loss": 3.1768,
"step": 72750
},
{
"epoch": 21.199778670860272,
"grad_norm": 0.40505489706993103,
"learning_rate": 0.0003458076923076923,
"loss": 3.1972,
"step": 72800
},
{
"epoch": 21.214339798473993,
"grad_norm": 0.4033255875110626,
"learning_rate": 0.00034563286713286715,
"loss": 3.1717,
"step": 72850
},
{
"epoch": 21.228900926087718,
"grad_norm": 0.39897650480270386,
"learning_rate": 0.0003454580419580419,
"loss": 3.1847,
"step": 72900
},
{
"epoch": 21.24346205370144,
"grad_norm": 0.3876497149467468,
"learning_rate": 0.00034528321678321675,
"loss": 3.1874,
"step": 72950
},
{
"epoch": 21.25802318131516,
"grad_norm": 0.4150758683681488,
"learning_rate": 0.00034510839160839155,
"loss": 3.1957,
"step": 73000
},
{
"epoch": 21.25802318131516,
"eval_accuracy": 0.3731560598939162,
"eval_loss": 3.550673723220825,
"eval_runtime": 179.8147,
"eval_samples_per_second": 92.573,
"eval_steps_per_second": 5.789,
"step": 73000
},
{
"epoch": 21.272584308928884,
"grad_norm": 0.4000180661678314,
"learning_rate": 0.0003449335664335664,
"loss": 3.1891,
"step": 73050
},
{
"epoch": 21.287145436542605,
"grad_norm": 0.3856852352619171,
"learning_rate": 0.0003447587412587412,
"loss": 3.1842,
"step": 73100
},
{
"epoch": 21.30170656415633,
"grad_norm": 0.4193423390388489,
"learning_rate": 0.00034458391608391606,
"loss": 3.2006,
"step": 73150
},
{
"epoch": 21.31626769177005,
"grad_norm": 0.41471514105796814,
"learning_rate": 0.00034440909090909086,
"loss": 3.1932,
"step": 73200
},
{
"epoch": 21.330828819383772,
"grad_norm": 0.40419620275497437,
"learning_rate": 0.0003442342657342657,
"loss": 3.202,
"step": 73250
},
{
"epoch": 21.345389946997496,
"grad_norm": 0.40216565132141113,
"learning_rate": 0.0003440594405594405,
"loss": 3.195,
"step": 73300
},
{
"epoch": 21.359951074611217,
"grad_norm": 0.36628732085227966,
"learning_rate": 0.00034388461538461537,
"loss": 3.2072,
"step": 73350
},
{
"epoch": 21.374512202224942,
"grad_norm": 0.40025609731674194,
"learning_rate": 0.00034370979020979017,
"loss": 3.2068,
"step": 73400
},
{
"epoch": 21.389073329838663,
"grad_norm": 0.39362242817878723,
"learning_rate": 0.000343534965034965,
"loss": 3.2108,
"step": 73450
},
{
"epoch": 21.403634457452384,
"grad_norm": 0.4039856493473053,
"learning_rate": 0.0003433601398601399,
"loss": 3.2048,
"step": 73500
},
{
"epoch": 21.41819558506611,
"grad_norm": 0.4046418368816376,
"learning_rate": 0.0003431853146853147,
"loss": 3.2083,
"step": 73550
},
{
"epoch": 21.43275671267983,
"grad_norm": 0.3927595615386963,
"learning_rate": 0.0003430104895104895,
"loss": 3.2069,
"step": 73600
},
{
"epoch": 21.447317840293554,
"grad_norm": 0.3955070674419403,
"learning_rate": 0.00034283566433566427,
"loss": 3.2076,
"step": 73650
},
{
"epoch": 21.461878967907275,
"grad_norm": 0.39903560280799866,
"learning_rate": 0.0003426608391608391,
"loss": 3.2096,
"step": 73700
},
{
"epoch": 21.476440095520996,
"grad_norm": 0.44239842891693115,
"learning_rate": 0.0003424860139860139,
"loss": 3.2168,
"step": 73750
},
{
"epoch": 21.49100122313472,
"grad_norm": 0.3976947069168091,
"learning_rate": 0.0003423111888111888,
"loss": 3.2251,
"step": 73800
},
{
"epoch": 21.50556235074844,
"grad_norm": 0.3877549469470978,
"learning_rate": 0.0003421363636363636,
"loss": 3.2118,
"step": 73850
},
{
"epoch": 21.520123478362166,
"grad_norm": 0.4139254689216614,
"learning_rate": 0.00034196153846153843,
"loss": 3.2257,
"step": 73900
},
{
"epoch": 21.534684605975887,
"grad_norm": 0.4532964825630188,
"learning_rate": 0.00034178671328671323,
"loss": 3.2198,
"step": 73950
},
{
"epoch": 21.549245733589608,
"grad_norm": 0.42118948698043823,
"learning_rate": 0.0003416118881118881,
"loss": 3.2102,
"step": 74000
},
{
"epoch": 21.549245733589608,
"eval_accuracy": 0.3733187665425284,
"eval_loss": 3.5405354499816895,
"eval_runtime": 179.8258,
"eval_samples_per_second": 92.567,
"eval_steps_per_second": 5.789,
"step": 74000
},
{
"epoch": 21.563806861203332,
"grad_norm": 0.4034072458744049,
"learning_rate": 0.0003414370629370629,
"loss": 3.2256,
"step": 74050
},
{
"epoch": 21.578367988817053,
"grad_norm": 0.39188823103904724,
"learning_rate": 0.00034126223776223774,
"loss": 3.2066,
"step": 74100
},
{
"epoch": 21.592929116430778,
"grad_norm": 0.4021751582622528,
"learning_rate": 0.0003410874125874126,
"loss": 3.2211,
"step": 74150
},
{
"epoch": 21.6074902440445,
"grad_norm": 0.3793504536151886,
"learning_rate": 0.0003409125874125874,
"loss": 3.2253,
"step": 74200
},
{
"epoch": 21.62205137165822,
"grad_norm": 0.4196092486381531,
"learning_rate": 0.00034073776223776225,
"loss": 3.2312,
"step": 74250
},
{
"epoch": 21.636612499271944,
"grad_norm": 0.40465742349624634,
"learning_rate": 0.00034056293706293705,
"loss": 3.2181,
"step": 74300
},
{
"epoch": 21.651173626885665,
"grad_norm": 0.4095724821090698,
"learning_rate": 0.0003403881118881119,
"loss": 3.2266,
"step": 74350
},
{
"epoch": 21.66573475449939,
"grad_norm": 0.3888077437877655,
"learning_rate": 0.00034021328671328665,
"loss": 3.2216,
"step": 74400
},
{
"epoch": 21.68029588211311,
"grad_norm": 0.4043755829334259,
"learning_rate": 0.0003400384615384615,
"loss": 3.2288,
"step": 74450
},
{
"epoch": 21.69485700972683,
"grad_norm": 0.38790950179100037,
"learning_rate": 0.0003398636363636363,
"loss": 3.2154,
"step": 74500
},
{
"epoch": 21.709418137340556,
"grad_norm": 0.3935188949108124,
"learning_rate": 0.00033968881118881115,
"loss": 3.219,
"step": 74550
},
{
"epoch": 21.723979264954277,
"grad_norm": 0.4363575279712677,
"learning_rate": 0.00033951398601398595,
"loss": 3.2314,
"step": 74600
},
{
"epoch": 21.738540392568,
"grad_norm": 0.41048911213874817,
"learning_rate": 0.0003393391608391608,
"loss": 3.2328,
"step": 74650
},
{
"epoch": 21.753101520181723,
"grad_norm": 0.38812801241874695,
"learning_rate": 0.0003391643356643356,
"loss": 3.2266,
"step": 74700
},
{
"epoch": 21.767662647795444,
"grad_norm": 0.38845062255859375,
"learning_rate": 0.00033898951048951046,
"loss": 3.2302,
"step": 74750
},
{
"epoch": 21.782223775409168,
"grad_norm": 0.3952963650226593,
"learning_rate": 0.00033881468531468526,
"loss": 3.2299,
"step": 74800
},
{
"epoch": 21.79678490302289,
"grad_norm": 0.4123411178588867,
"learning_rate": 0.0003386398601398601,
"loss": 3.2384,
"step": 74850
},
{
"epoch": 21.811346030636614,
"grad_norm": 0.3929203152656555,
"learning_rate": 0.00033846503496503497,
"loss": 3.2281,
"step": 74900
},
{
"epoch": 21.825907158250335,
"grad_norm": 0.43496498465538025,
"learning_rate": 0.00033829020979020977,
"loss": 3.2365,
"step": 74950
},
{
"epoch": 21.840468285864056,
"grad_norm": 0.40218108892440796,
"learning_rate": 0.0003381153846153846,
"loss": 3.233,
"step": 75000
},
{
"epoch": 21.840468285864056,
"eval_accuracy": 0.37406411347330965,
"eval_loss": 3.5364959239959717,
"eval_runtime": 180.0061,
"eval_samples_per_second": 92.475,
"eval_steps_per_second": 5.783,
"step": 75000
},
{
"epoch": 21.85502941347778,
"grad_norm": 0.394771009683609,
"learning_rate": 0.0003379405594405594,
"loss": 3.2279,
"step": 75050
},
{
"epoch": 21.8695905410915,
"grad_norm": 0.38714203238487244,
"learning_rate": 0.0003377657342657343,
"loss": 3.2447,
"step": 75100
},
{
"epoch": 21.884151668705226,
"grad_norm": 0.4088663160800934,
"learning_rate": 0.000337590909090909,
"loss": 3.2378,
"step": 75150
},
{
"epoch": 21.898712796318947,
"grad_norm": 0.38015908002853394,
"learning_rate": 0.0003374160839160839,
"loss": 3.2498,
"step": 75200
},
{
"epoch": 21.91327392393267,
"grad_norm": 0.4377588927745819,
"learning_rate": 0.0003372412587412587,
"loss": 3.2369,
"step": 75250
},
{
"epoch": 21.927835051546392,
"grad_norm": 0.38883963227272034,
"learning_rate": 0.00033706643356643353,
"loss": 3.238,
"step": 75300
},
{
"epoch": 21.942396179160113,
"grad_norm": 0.3904089033603668,
"learning_rate": 0.00033689160839160833,
"loss": 3.2277,
"step": 75350
},
{
"epoch": 21.956957306773838,
"grad_norm": 0.37466031312942505,
"learning_rate": 0.0003367167832167832,
"loss": 3.2465,
"step": 75400
},
{
"epoch": 21.97151843438756,
"grad_norm": 0.41088932752609253,
"learning_rate": 0.000336541958041958,
"loss": 3.2396,
"step": 75450
},
{
"epoch": 21.986079562001283,
"grad_norm": 0.41808900237083435,
"learning_rate": 0.00033636713286713284,
"loss": 3.2411,
"step": 75500
},
{
"epoch": 22.00058244510455,
"grad_norm": 0.4222349524497986,
"learning_rate": 0.0003361923076923077,
"loss": 3.2277,
"step": 75550
},
{
"epoch": 22.015143572718273,
"grad_norm": 0.3944070339202881,
"learning_rate": 0.0003360174825174825,
"loss": 3.1384,
"step": 75600
},
{
"epoch": 22.029704700331994,
"grad_norm": 0.3932644724845886,
"learning_rate": 0.00033584265734265734,
"loss": 3.1394,
"step": 75650
},
{
"epoch": 22.044265827945715,
"grad_norm": 0.41474151611328125,
"learning_rate": 0.00033566783216783214,
"loss": 3.1455,
"step": 75700
},
{
"epoch": 22.05882695555944,
"grad_norm": 0.3940243721008301,
"learning_rate": 0.000335493006993007,
"loss": 3.145,
"step": 75750
},
{
"epoch": 22.07338808317316,
"grad_norm": 0.44909387826919556,
"learning_rate": 0.0003353181818181818,
"loss": 3.1456,
"step": 75800
},
{
"epoch": 22.087949210786885,
"grad_norm": 0.4262406527996063,
"learning_rate": 0.00033514335664335665,
"loss": 3.1601,
"step": 75850
},
{
"epoch": 22.102510338400606,
"grad_norm": 0.40372106432914734,
"learning_rate": 0.0003349685314685314,
"loss": 3.1587,
"step": 75900
},
{
"epoch": 22.117071466014327,
"grad_norm": 0.3838382065296173,
"learning_rate": 0.00033479370629370625,
"loss": 3.1586,
"step": 75950
},
{
"epoch": 22.13163259362805,
"grad_norm": 0.39858493208885193,
"learning_rate": 0.00033461888111888105,
"loss": 3.1569,
"step": 76000
},
{
"epoch": 22.13163259362805,
"eval_accuracy": 0.373211549444599,
"eval_loss": 3.5540387630462646,
"eval_runtime": 179.8336,
"eval_samples_per_second": 92.563,
"eval_steps_per_second": 5.789,
"step": 76000
},
{
"epoch": 22.146193721241772,
"grad_norm": 0.3894960582256317,
"learning_rate": 0.0003344440559440559,
"loss": 3.1707,
"step": 76050
},
{
"epoch": 22.160754848855497,
"grad_norm": 0.4287494719028473,
"learning_rate": 0.0003342692307692307,
"loss": 3.1714,
"step": 76100
},
{
"epoch": 22.175315976469218,
"grad_norm": 0.4149375557899475,
"learning_rate": 0.00033409440559440556,
"loss": 3.1688,
"step": 76150
},
{
"epoch": 22.18987710408294,
"grad_norm": 0.42831283807754517,
"learning_rate": 0.00033391958041958036,
"loss": 3.1748,
"step": 76200
},
{
"epoch": 22.204438231696663,
"grad_norm": 0.3864456117153168,
"learning_rate": 0.0003337447552447552,
"loss": 3.1734,
"step": 76250
},
{
"epoch": 22.218999359310384,
"grad_norm": 0.4284059405326843,
"learning_rate": 0.00033356993006993007,
"loss": 3.1737,
"step": 76300
},
{
"epoch": 22.23356048692411,
"grad_norm": 0.4283577501773834,
"learning_rate": 0.00033339510489510487,
"loss": 3.1684,
"step": 76350
},
{
"epoch": 22.24812161453783,
"grad_norm": 0.40983396768569946,
"learning_rate": 0.0003332202797202797,
"loss": 3.164,
"step": 76400
},
{
"epoch": 22.26268274215155,
"grad_norm": 0.3836318850517273,
"learning_rate": 0.0003330454545454545,
"loss": 3.1722,
"step": 76450
},
{
"epoch": 22.277243869765275,
"grad_norm": 0.39988741278648376,
"learning_rate": 0.0003328706293706294,
"loss": 3.1842,
"step": 76500
},
{
"epoch": 22.291804997378996,
"grad_norm": 0.40669435262680054,
"learning_rate": 0.00033269580419580417,
"loss": 3.1787,
"step": 76550
},
{
"epoch": 22.30636612499272,
"grad_norm": 0.3948037624359131,
"learning_rate": 0.000332520979020979,
"loss": 3.1898,
"step": 76600
},
{
"epoch": 22.32092725260644,
"grad_norm": 0.39549776911735535,
"learning_rate": 0.00033234615384615377,
"loss": 3.1864,
"step": 76650
},
{
"epoch": 22.335488380220163,
"grad_norm": 0.4276168644428253,
"learning_rate": 0.0003321713286713286,
"loss": 3.1899,
"step": 76700
},
{
"epoch": 22.350049507833887,
"grad_norm": 0.3946962058544159,
"learning_rate": 0.0003319965034965034,
"loss": 3.1924,
"step": 76750
},
{
"epoch": 22.364610635447608,
"grad_norm": 0.3978945016860962,
"learning_rate": 0.0003318216783216783,
"loss": 3.2019,
"step": 76800
},
{
"epoch": 22.379171763061333,
"grad_norm": 0.41653361916542053,
"learning_rate": 0.0003316468531468531,
"loss": 3.1912,
"step": 76850
},
{
"epoch": 22.393732890675054,
"grad_norm": 0.3889034390449524,
"learning_rate": 0.00033147202797202793,
"loss": 3.1958,
"step": 76900
},
{
"epoch": 22.408294018288775,
"grad_norm": 0.3997187912464142,
"learning_rate": 0.0003312972027972028,
"loss": 3.1976,
"step": 76950
},
{
"epoch": 22.4228551459025,
"grad_norm": 0.40132373571395874,
"learning_rate": 0.0003311223776223776,
"loss": 3.1916,
"step": 77000
},
{
"epoch": 22.4228551459025,
"eval_accuracy": 0.3737350557352565,
"eval_loss": 3.5441718101501465,
"eval_runtime": 179.8222,
"eval_samples_per_second": 92.569,
"eval_steps_per_second": 5.789,
"step": 77000
},
{
"epoch": 22.43741627351622,
"grad_norm": 0.4320622682571411,
"learning_rate": 0.00033094755244755244,
"loss": 3.1979,
"step": 77050
},
{
"epoch": 22.451977401129945,
"grad_norm": 0.389909029006958,
"learning_rate": 0.00033077272727272724,
"loss": 3.2031,
"step": 77100
},
{
"epoch": 22.466538528743666,
"grad_norm": 0.4020571708679199,
"learning_rate": 0.0003305979020979021,
"loss": 3.1921,
"step": 77150
},
{
"epoch": 22.481099656357387,
"grad_norm": 0.40053069591522217,
"learning_rate": 0.0003304230769230769,
"loss": 3.1998,
"step": 77200
},
{
"epoch": 22.49566078397111,
"grad_norm": 0.38186731934547424,
"learning_rate": 0.00033024825174825175,
"loss": 3.1899,
"step": 77250
},
{
"epoch": 22.510221911584832,
"grad_norm": 0.4473145008087158,
"learning_rate": 0.00033007342657342655,
"loss": 3.1922,
"step": 77300
},
{
"epoch": 22.524783039198557,
"grad_norm": 0.3939569592475891,
"learning_rate": 0.0003298986013986014,
"loss": 3.2047,
"step": 77350
},
{
"epoch": 22.539344166812278,
"grad_norm": 0.4264968931674957,
"learning_rate": 0.00032972377622377615,
"loss": 3.2149,
"step": 77400
},
{
"epoch": 22.553905294426002,
"grad_norm": 0.3924943208694458,
"learning_rate": 0.000329548951048951,
"loss": 3.2066,
"step": 77450
},
{
"epoch": 22.568466422039723,
"grad_norm": 0.4023171067237854,
"learning_rate": 0.0003293741258741258,
"loss": 3.2094,
"step": 77500
},
{
"epoch": 22.583027549653444,
"grad_norm": 0.3770167827606201,
"learning_rate": 0.00032919930069930065,
"loss": 3.2141,
"step": 77550
},
{
"epoch": 22.59758867726717,
"grad_norm": 0.4017462432384491,
"learning_rate": 0.0003290244755244755,
"loss": 3.2058,
"step": 77600
},
{
"epoch": 22.61214980488089,
"grad_norm": 0.41972553730010986,
"learning_rate": 0.0003288496503496503,
"loss": 3.2036,
"step": 77650
},
{
"epoch": 22.626710932494614,
"grad_norm": 0.4151745140552521,
"learning_rate": 0.00032867482517482516,
"loss": 3.2057,
"step": 77700
},
{
"epoch": 22.641272060108335,
"grad_norm": 0.3936101794242859,
"learning_rate": 0.00032849999999999996,
"loss": 3.2112,
"step": 77750
},
{
"epoch": 22.655833187722056,
"grad_norm": 0.39264073967933655,
"learning_rate": 0.0003283251748251748,
"loss": 3.2176,
"step": 77800
},
{
"epoch": 22.67039431533578,
"grad_norm": 0.38945305347442627,
"learning_rate": 0.0003281503496503496,
"loss": 3.2162,
"step": 77850
},
{
"epoch": 22.6849554429495,
"grad_norm": 0.41507482528686523,
"learning_rate": 0.00032797552447552447,
"loss": 3.2144,
"step": 77900
},
{
"epoch": 22.699516570563226,
"grad_norm": 0.382941871881485,
"learning_rate": 0.00032780069930069927,
"loss": 3.2255,
"step": 77950
},
{
"epoch": 22.714077698176947,
"grad_norm": 0.400842547416687,
"learning_rate": 0.0003276258741258741,
"loss": 3.2222,
"step": 78000
},
{
"epoch": 22.714077698176947,
"eval_accuracy": 0.3742671440962527,
"eval_loss": 3.5415430068969727,
"eval_runtime": 179.7265,
"eval_samples_per_second": 92.619,
"eval_steps_per_second": 5.792,
"step": 78000
},
{
"epoch": 22.728638825790668,
"grad_norm": 0.37490975856781006,
"learning_rate": 0.0003274510489510489,
"loss": 3.2217,
"step": 78050
},
{
"epoch": 22.743199953404392,
"grad_norm": 0.4152382016181946,
"learning_rate": 0.0003272762237762238,
"loss": 3.2128,
"step": 78100
},
{
"epoch": 22.757761081018113,
"grad_norm": 0.41579964756965637,
"learning_rate": 0.0003271013986013985,
"loss": 3.2198,
"step": 78150
},
{
"epoch": 22.772322208631838,
"grad_norm": 0.395577996969223,
"learning_rate": 0.0003269265734265734,
"loss": 3.2269,
"step": 78200
},
{
"epoch": 22.78688333624556,
"grad_norm": 0.4251713156700134,
"learning_rate": 0.0003267517482517482,
"loss": 3.2236,
"step": 78250
},
{
"epoch": 22.80144446385928,
"grad_norm": 0.39792853593826294,
"learning_rate": 0.00032657692307692303,
"loss": 3.2098,
"step": 78300
},
{
"epoch": 22.816005591473004,
"grad_norm": 0.38918763399124146,
"learning_rate": 0.0003264020979020979,
"loss": 3.2152,
"step": 78350
},
{
"epoch": 22.830566719086725,
"grad_norm": 0.4394005835056305,
"learning_rate": 0.0003262272727272727,
"loss": 3.2299,
"step": 78400
},
{
"epoch": 22.84512784670045,
"grad_norm": 0.3849785625934601,
"learning_rate": 0.00032605244755244754,
"loss": 3.2214,
"step": 78450
},
{
"epoch": 22.85968897431417,
"grad_norm": 0.4107806980609894,
"learning_rate": 0.00032587762237762234,
"loss": 3.2318,
"step": 78500
},
{
"epoch": 22.874250101927892,
"grad_norm": 0.39863264560699463,
"learning_rate": 0.0003257027972027972,
"loss": 3.2306,
"step": 78550
},
{
"epoch": 22.888811229541616,
"grad_norm": 0.38673919439315796,
"learning_rate": 0.000325527972027972,
"loss": 3.2328,
"step": 78600
},
{
"epoch": 22.903372357155337,
"grad_norm": 0.3986453115940094,
"learning_rate": 0.00032535314685314684,
"loss": 3.2246,
"step": 78650
},
{
"epoch": 22.917933484769062,
"grad_norm": 0.39048075675964355,
"learning_rate": 0.00032517832167832164,
"loss": 3.2393,
"step": 78700
},
{
"epoch": 22.932494612382783,
"grad_norm": 0.40996915102005005,
"learning_rate": 0.0003250034965034965,
"loss": 3.2255,
"step": 78750
},
{
"epoch": 22.947055739996504,
"grad_norm": 0.4182490408420563,
"learning_rate": 0.0003248286713286713,
"loss": 3.2214,
"step": 78800
},
{
"epoch": 22.96161686761023,
"grad_norm": 0.3786882758140564,
"learning_rate": 0.00032465384615384615,
"loss": 3.228,
"step": 78850
},
{
"epoch": 22.97617799522395,
"grad_norm": 0.4005744457244873,
"learning_rate": 0.0003244790209790209,
"loss": 3.2486,
"step": 78900
},
{
"epoch": 22.990739122837674,
"grad_norm": 0.40672826766967773,
"learning_rate": 0.00032430419580419575,
"loss": 3.2536,
"step": 78950
},
{
"epoch": 23.00524200594094,
"grad_norm": 0.3979138433933258,
"learning_rate": 0.00032412937062937066,
"loss": 3.1881,
"step": 79000
},
{
"epoch": 23.00524200594094,
"eval_accuracy": 0.373619726817418,
"eval_loss": 3.546339511871338,
"eval_runtime": 179.9537,
"eval_samples_per_second": 92.502,
"eval_steps_per_second": 5.785,
"step": 79000
},
{
"epoch": 23.019803133554664,
"grad_norm": 0.41906410455703735,
"learning_rate": 0.0003239545454545454,
"loss": 3.135,
"step": 79050
},
{
"epoch": 23.034364261168385,
"grad_norm": 0.39772552251815796,
"learning_rate": 0.00032377972027972026,
"loss": 3.1365,
"step": 79100
},
{
"epoch": 23.048925388782106,
"grad_norm": 0.4002935290336609,
"learning_rate": 0.00032360489510489506,
"loss": 3.1301,
"step": 79150
},
{
"epoch": 23.06348651639583,
"grad_norm": 0.46575191617012024,
"learning_rate": 0.0003234300699300699,
"loss": 3.1407,
"step": 79200
},
{
"epoch": 23.07804764400955,
"grad_norm": 0.3918403387069702,
"learning_rate": 0.0003232552447552447,
"loss": 3.1477,
"step": 79250
},
{
"epoch": 23.092608771623276,
"grad_norm": 0.4178999662399292,
"learning_rate": 0.00032308041958041957,
"loss": 3.1305,
"step": 79300
},
{
"epoch": 23.107169899236997,
"grad_norm": 0.39694491028785706,
"learning_rate": 0.00032290559440559437,
"loss": 3.1488,
"step": 79350
},
{
"epoch": 23.121731026850718,
"grad_norm": 0.4120016396045685,
"learning_rate": 0.0003227307692307692,
"loss": 3.1543,
"step": 79400
},
{
"epoch": 23.136292154464442,
"grad_norm": 0.41624969244003296,
"learning_rate": 0.000322555944055944,
"loss": 3.1436,
"step": 79450
},
{
"epoch": 23.150853282078163,
"grad_norm": 0.41706162691116333,
"learning_rate": 0.00032238111888111887,
"loss": 3.1529,
"step": 79500
},
{
"epoch": 23.165414409691888,
"grad_norm": 0.3970998227596283,
"learning_rate": 0.00032220629370629367,
"loss": 3.1575,
"step": 79550
},
{
"epoch": 23.17997553730561,
"grad_norm": 0.46410492062568665,
"learning_rate": 0.0003220314685314685,
"loss": 3.1709,
"step": 79600
},
{
"epoch": 23.19453666491933,
"grad_norm": 0.42136386036872864,
"learning_rate": 0.00032185664335664327,
"loss": 3.1718,
"step": 79650
},
{
"epoch": 23.209097792533054,
"grad_norm": 0.4282018840312958,
"learning_rate": 0.0003216818181818181,
"loss": 3.1653,
"step": 79700
},
{
"epoch": 23.223658920146775,
"grad_norm": 0.40414661169052124,
"learning_rate": 0.00032150699300699303,
"loss": 3.1692,
"step": 79750
},
{
"epoch": 23.2382200477605,
"grad_norm": 0.3950580060482025,
"learning_rate": 0.0003213321678321678,
"loss": 3.1618,
"step": 79800
},
{
"epoch": 23.25278117537422,
"grad_norm": 0.4253195822238922,
"learning_rate": 0.00032115734265734263,
"loss": 3.1673,
"step": 79850
},
{
"epoch": 23.26734230298794,
"grad_norm": 0.416155070066452,
"learning_rate": 0.00032098251748251743,
"loss": 3.1776,
"step": 79900
},
{
"epoch": 23.281903430601666,
"grad_norm": 0.42353808879852295,
"learning_rate": 0.0003208076923076923,
"loss": 3.1801,
"step": 79950
},
{
"epoch": 23.296464558215387,
"grad_norm": 0.4196244776248932,
"learning_rate": 0.0003206328671328671,
"loss": 3.1804,
"step": 80000
},
{
"epoch": 23.296464558215387,
"eval_accuracy": 0.3738718986102454,
"eval_loss": 3.547682523727417,
"eval_runtime": 179.7001,
"eval_samples_per_second": 92.632,
"eval_steps_per_second": 5.793,
"step": 80000
},
{
"epoch": 23.31102568582911,
"grad_norm": 0.40260496735572815,
"learning_rate": 0.00032045804195804194,
"loss": 3.1384,
"step": 80050
},
{
"epoch": 23.325586813442833,
"grad_norm": 0.406258761882782,
"learning_rate": 0.00032028321678321674,
"loss": 3.1362,
"step": 80100
},
{
"epoch": 23.340147941056557,
"grad_norm": 0.41720008850097656,
"learning_rate": 0.0003201083916083916,
"loss": 3.1423,
"step": 80150
},
{
"epoch": 23.354709068670278,
"grad_norm": 0.41342195868492126,
"learning_rate": 0.0003199335664335664,
"loss": 3.1448,
"step": 80200
},
{
"epoch": 23.369270196284,
"grad_norm": 0.41920602321624756,
"learning_rate": 0.00031975874125874125,
"loss": 3.148,
"step": 80250
},
{
"epoch": 23.383831323897724,
"grad_norm": 0.44642165303230286,
"learning_rate": 0.00031958391608391605,
"loss": 3.1507,
"step": 80300
},
{
"epoch": 23.398392451511445,
"grad_norm": 0.4066300392150879,
"learning_rate": 0.0003194090909090909,
"loss": 3.1393,
"step": 80350
},
{
"epoch": 23.41295357912517,
"grad_norm": 0.39508238434791565,
"learning_rate": 0.00031923426573426576,
"loss": 3.1531,
"step": 80400
},
{
"epoch": 23.42751470673889,
"grad_norm": 0.42066729068756104,
"learning_rate": 0.0003190594405594405,
"loss": 3.1488,
"step": 80450
},
{
"epoch": 23.44207583435261,
"grad_norm": 0.40843942761421204,
"learning_rate": 0.0003188846153846154,
"loss": 3.1657,
"step": 80500
},
{
"epoch": 23.456636961966336,
"grad_norm": 0.40340083837509155,
"learning_rate": 0.00031870979020979015,
"loss": 3.1621,
"step": 80550
},
{
"epoch": 23.471198089580056,
"grad_norm": 0.40068215131759644,
"learning_rate": 0.000318534965034965,
"loss": 3.1627,
"step": 80600
},
{
"epoch": 23.48575921719378,
"grad_norm": 0.42831701040267944,
"learning_rate": 0.0003183601398601398,
"loss": 3.1638,
"step": 80650
},
{
"epoch": 23.500320344807502,
"grad_norm": 0.4072822332382202,
"learning_rate": 0.00031818531468531466,
"loss": 3.1618,
"step": 80700
},
{
"epoch": 23.514881472421223,
"grad_norm": 0.38190311193466187,
"learning_rate": 0.00031801048951048946,
"loss": 3.167,
"step": 80750
},
{
"epoch": 23.529442600034947,
"grad_norm": 0.4031204283237457,
"learning_rate": 0.0003178356643356643,
"loss": 3.1701,
"step": 80800
},
{
"epoch": 23.54400372764867,
"grad_norm": 0.39064645767211914,
"learning_rate": 0.0003176608391608391,
"loss": 3.1613,
"step": 80850
},
{
"epoch": 23.558564855262393,
"grad_norm": 0.4251323938369751,
"learning_rate": 0.00031748601398601397,
"loss": 3.1794,
"step": 80900
},
{
"epoch": 23.573125982876114,
"grad_norm": 0.41958099603652954,
"learning_rate": 0.00031731118881118877,
"loss": 3.1859,
"step": 80950
},
{
"epoch": 23.587687110489835,
"grad_norm": 0.44959893822669983,
"learning_rate": 0.0003171363636363636,
"loss": 3.1784,
"step": 81000
},
{
"epoch": 23.587687110489835,
"eval_accuracy": 0.3735306143610249,
"eval_loss": 3.5550436973571777,
"eval_runtime": 179.1883,
"eval_samples_per_second": 92.897,
"eval_steps_per_second": 5.81,
"step": 81000
},
{
"epoch": 23.60224823810356,
"grad_norm": 0.41412636637687683,
"learning_rate": 0.0003169615384615385,
"loss": 3.1808,
"step": 81050
},
{
"epoch": 23.61680936571728,
"grad_norm": 0.4038364291191101,
"learning_rate": 0.0003167867132867133,
"loss": 3.1787,
"step": 81100
},
{
"epoch": 23.631370493331005,
"grad_norm": 0.4096704125404358,
"learning_rate": 0.00031661188811188813,
"loss": 3.1722,
"step": 81150
},
{
"epoch": 23.645931620944726,
"grad_norm": 0.41439685225486755,
"learning_rate": 0.0003164370629370629,
"loss": 3.1848,
"step": 81200
},
{
"epoch": 23.660492748558447,
"grad_norm": 0.4395192861557007,
"learning_rate": 0.0003162622377622378,
"loss": 3.1754,
"step": 81250
},
{
"epoch": 23.67505387617217,
"grad_norm": 0.4596894383430481,
"learning_rate": 0.00031608741258741253,
"loss": 3.1898,
"step": 81300
},
{
"epoch": 23.689615003785892,
"grad_norm": 0.4107434153556824,
"learning_rate": 0.0003159125874125874,
"loss": 3.1825,
"step": 81350
},
{
"epoch": 23.704176131399617,
"grad_norm": 0.432025283575058,
"learning_rate": 0.0003157377622377622,
"loss": 3.1727,
"step": 81400
},
{
"epoch": 23.718737259013338,
"grad_norm": 0.410547137260437,
"learning_rate": 0.00031556293706293704,
"loss": 3.1878,
"step": 81450
},
{
"epoch": 23.73329838662706,
"grad_norm": 0.41193047165870667,
"learning_rate": 0.00031538811188811184,
"loss": 3.1934,
"step": 81500
},
{
"epoch": 23.747859514240783,
"grad_norm": 0.39965835213661194,
"learning_rate": 0.0003152132867132867,
"loss": 3.196,
"step": 81550
},
{
"epoch": 23.762420641854504,
"grad_norm": 0.40422728657722473,
"learning_rate": 0.0003150384615384615,
"loss": 3.1873,
"step": 81600
},
{
"epoch": 23.77698176946823,
"grad_norm": 0.44458886981010437,
"learning_rate": 0.00031486363636363634,
"loss": 3.1894,
"step": 81650
},
{
"epoch": 23.79154289708195,
"grad_norm": 0.44601744413375854,
"learning_rate": 0.00031468881118881114,
"loss": 3.2071,
"step": 81700
},
{
"epoch": 23.80610402469567,
"grad_norm": 0.39317721128463745,
"learning_rate": 0.000314513986013986,
"loss": 3.1947,
"step": 81750
},
{
"epoch": 23.820665152309395,
"grad_norm": 0.4235857427120209,
"learning_rate": 0.00031433916083916085,
"loss": 3.1835,
"step": 81800
},
{
"epoch": 23.835226279923116,
"grad_norm": 0.47576215863227844,
"learning_rate": 0.00031416433566433565,
"loss": 3.1952,
"step": 81850
},
{
"epoch": 23.84978740753684,
"grad_norm": 0.389379620552063,
"learning_rate": 0.0003139895104895105,
"loss": 3.1848,
"step": 81900
},
{
"epoch": 23.86434853515056,
"grad_norm": 0.39700204133987427,
"learning_rate": 0.00031381468531468525,
"loss": 3.1985,
"step": 81950
},
{
"epoch": 23.878909662764286,
"grad_norm": 0.44006362557411194,
"learning_rate": 0.00031363986013986016,
"loss": 3.2003,
"step": 82000
},
{
"epoch": 23.878909662764286,
"eval_accuracy": 0.373745753932528,
"eval_loss": 3.5439348220825195,
"eval_runtime": 179.5744,
"eval_samples_per_second": 92.697,
"eval_steps_per_second": 5.797,
"step": 82000
},
{
"epoch": 23.893470790378007,
"grad_norm": 0.4344028830528259,
"learning_rate": 0.0003134650349650349,
"loss": 3.1971,
"step": 82050
},
{
"epoch": 23.908031917991728,
"grad_norm": 0.43484529852867126,
"learning_rate": 0.00031329020979020976,
"loss": 3.1974,
"step": 82100
},
{
"epoch": 23.922593045605453,
"grad_norm": 0.3977762460708618,
"learning_rate": 0.00031311538461538456,
"loss": 3.1998,
"step": 82150
},
{
"epoch": 23.937154173219174,
"grad_norm": 0.44011130928993225,
"learning_rate": 0.0003129405594405594,
"loss": 3.2011,
"step": 82200
},
{
"epoch": 23.951715300832895,
"grad_norm": 0.3970146179199219,
"learning_rate": 0.0003127657342657342,
"loss": 3.2042,
"step": 82250
},
{
"epoch": 23.96627642844662,
"grad_norm": 0.4210617244243622,
"learning_rate": 0.00031259090909090907,
"loss": 3.1969,
"step": 82300
},
{
"epoch": 23.98083755606034,
"grad_norm": 0.39233553409576416,
"learning_rate": 0.00031241608391608386,
"loss": 3.1973,
"step": 82350
},
{
"epoch": 23.995398683674065,
"grad_norm": 0.41381213068962097,
"learning_rate": 0.0003122412587412587,
"loss": 3.2042,
"step": 82400
},
{
"epoch": 24.010192789329604,
"grad_norm": 0.4154031574726105,
"learning_rate": 0.00031206643356643357,
"loss": 3.2096,
"step": 82450
},
{
"epoch": 24.02475391694333,
"grad_norm": 0.39306122064590454,
"learning_rate": 0.00031189160839160837,
"loss": 3.1272,
"step": 82500
},
{
"epoch": 24.03931504455705,
"grad_norm": 0.4373951554298401,
"learning_rate": 0.0003117167832167832,
"loss": 3.1189,
"step": 82550
},
{
"epoch": 24.053876172170774,
"grad_norm": 0.4158374071121216,
"learning_rate": 0.000311541958041958,
"loss": 3.1326,
"step": 82600
},
{
"epoch": 24.068437299784495,
"grad_norm": 0.39844948053359985,
"learning_rate": 0.0003113671328671329,
"loss": 3.1355,
"step": 82650
},
{
"epoch": 24.082998427398216,
"grad_norm": 0.3967423439025879,
"learning_rate": 0.0003111923076923076,
"loss": 3.1339,
"step": 82700
},
{
"epoch": 24.09755955501194,
"grad_norm": 0.42140015959739685,
"learning_rate": 0.00031101748251748253,
"loss": 3.1283,
"step": 82750
},
{
"epoch": 24.11212068262566,
"grad_norm": 0.3859243094921112,
"learning_rate": 0.0003108426573426573,
"loss": 3.1429,
"step": 82800
},
{
"epoch": 24.126681810239386,
"grad_norm": 0.4281948208808899,
"learning_rate": 0.00031066783216783213,
"loss": 3.1468,
"step": 82850
},
{
"epoch": 24.141242937853107,
"grad_norm": 0.4105588495731354,
"learning_rate": 0.00031049300699300693,
"loss": 3.1553,
"step": 82900
},
{
"epoch": 24.15580406546683,
"grad_norm": 0.4336562156677246,
"learning_rate": 0.0003103181818181818,
"loss": 3.1577,
"step": 82950
},
{
"epoch": 24.170365193080553,
"grad_norm": 0.4350588619709015,
"learning_rate": 0.0003101433566433566,
"loss": 3.1491,
"step": 83000
},
{
"epoch": 24.170365193080553,
"eval_accuracy": 0.37365452534920207,
"eval_loss": 3.552607536315918,
"eval_runtime": 179.534,
"eval_samples_per_second": 92.718,
"eval_steps_per_second": 5.798,
"step": 83000
},
{
"epoch": 24.184926320694274,
"grad_norm": 0.38933441042900085,
"learning_rate": 0.00030996853146853144,
"loss": 3.1537,
"step": 83050
},
{
"epoch": 24.199487448308,
"grad_norm": 0.42273107171058655,
"learning_rate": 0.00030979370629370624,
"loss": 3.1601,
"step": 83100
},
{
"epoch": 24.21404857592172,
"grad_norm": 0.4331780672073364,
"learning_rate": 0.0003096188811188811,
"loss": 3.153,
"step": 83150
},
{
"epoch": 24.22860970353544,
"grad_norm": 0.40830710530281067,
"learning_rate": 0.00030944405594405595,
"loss": 3.1608,
"step": 83200
},
{
"epoch": 24.243170831149165,
"grad_norm": 0.3968210220336914,
"learning_rate": 0.00030926923076923075,
"loss": 3.1623,
"step": 83250
},
{
"epoch": 24.257731958762886,
"grad_norm": 0.40018799901008606,
"learning_rate": 0.0003090944055944056,
"loss": 3.1495,
"step": 83300
},
{
"epoch": 24.27229308637661,
"grad_norm": 0.4155309796333313,
"learning_rate": 0.0003089195804195804,
"loss": 3.1773,
"step": 83350
},
{
"epoch": 24.28685421399033,
"grad_norm": 0.4509030282497406,
"learning_rate": 0.00030874475524475525,
"loss": 3.1644,
"step": 83400
},
{
"epoch": 24.301415341604052,
"grad_norm": 0.4153228998184204,
"learning_rate": 0.00030856993006993,
"loss": 3.1714,
"step": 83450
},
{
"epoch": 24.315976469217777,
"grad_norm": 0.4305969178676605,
"learning_rate": 0.0003083951048951049,
"loss": 3.1673,
"step": 83500
},
{
"epoch": 24.330537596831498,
"grad_norm": 0.4283483326435089,
"learning_rate": 0.00030822027972027965,
"loss": 3.1767,
"step": 83550
},
{
"epoch": 24.345098724445222,
"grad_norm": 0.4154931902885437,
"learning_rate": 0.0003080454545454545,
"loss": 3.1723,
"step": 83600
},
{
"epoch": 24.359659852058943,
"grad_norm": 0.41499650478363037,
"learning_rate": 0.0003078706293706293,
"loss": 3.1668,
"step": 83650
},
{
"epoch": 24.374220979672664,
"grad_norm": 0.4085869789123535,
"learning_rate": 0.00030769580419580416,
"loss": 3.1747,
"step": 83700
},
{
"epoch": 24.38878210728639,
"grad_norm": 0.43307942152023315,
"learning_rate": 0.00030752097902097896,
"loss": 3.1783,
"step": 83750
},
{
"epoch": 24.40334323490011,
"grad_norm": 0.42285290360450745,
"learning_rate": 0.0003073461538461538,
"loss": 3.1874,
"step": 83800
},
{
"epoch": 24.417904362513834,
"grad_norm": 0.4364725947380066,
"learning_rate": 0.00030717132867132867,
"loss": 3.1865,
"step": 83850
},
{
"epoch": 24.432465490127555,
"grad_norm": 0.41130372881889343,
"learning_rate": 0.00030699650349650347,
"loss": 3.1957,
"step": 83900
},
{
"epoch": 24.44702661774128,
"grad_norm": 0.43359464406967163,
"learning_rate": 0.0003068216783216783,
"loss": 3.185,
"step": 83950
},
{
"epoch": 24.461587745355,
"grad_norm": 0.419915109872818,
"learning_rate": 0.0003066468531468531,
"loss": 3.1797,
"step": 84000
},
{
"epoch": 24.461587745355,
"eval_accuracy": 0.3736964952000363,
"eval_loss": 3.5468900203704834,
"eval_runtime": 179.5844,
"eval_samples_per_second": 92.692,
"eval_steps_per_second": 5.797,
"step": 84000
},
{
"epoch": 24.47614887296872,
"grad_norm": 0.4072420299053192,
"learning_rate": 0.000306472027972028,
"loss": 3.191,
"step": 84050
},
{
"epoch": 24.490710000582446,
"grad_norm": 0.4290916919708252,
"learning_rate": 0.0003062972027972028,
"loss": 3.1807,
"step": 84100
},
{
"epoch": 24.505271128196167,
"grad_norm": 0.43152981996536255,
"learning_rate": 0.00030612237762237763,
"loss": 3.1817,
"step": 84150
},
{
"epoch": 24.51983225580989,
"grad_norm": 0.4400935471057892,
"learning_rate": 0.0003059475524475524,
"loss": 3.2011,
"step": 84200
},
{
"epoch": 24.534393383423613,
"grad_norm": 0.4061969816684723,
"learning_rate": 0.0003057727272727273,
"loss": 3.1836,
"step": 84250
},
{
"epoch": 24.548954511037334,
"grad_norm": 0.4131897985935211,
"learning_rate": 0.00030559790209790203,
"loss": 3.2057,
"step": 84300
},
{
"epoch": 24.563515638651058,
"grad_norm": 0.4729417562484741,
"learning_rate": 0.0003054230769230769,
"loss": 3.1933,
"step": 84350
},
{
"epoch": 24.57807676626478,
"grad_norm": 0.39170345664024353,
"learning_rate": 0.0003052482517482517,
"loss": 3.2049,
"step": 84400
},
{
"epoch": 24.592637893878504,
"grad_norm": 0.41034242510795593,
"learning_rate": 0.00030507342657342654,
"loss": 3.1882,
"step": 84450
},
{
"epoch": 24.607199021492224,
"grad_norm": 0.4086054563522339,
"learning_rate": 0.00030489860139860134,
"loss": 3.1912,
"step": 84500
},
{
"epoch": 24.621760149105945,
"grad_norm": 0.4149019420146942,
"learning_rate": 0.0003047237762237762,
"loss": 3.2091,
"step": 84550
},
{
"epoch": 24.63632127671967,
"grad_norm": 0.4383767545223236,
"learning_rate": 0.00030454895104895104,
"loss": 3.2013,
"step": 84600
},
{
"epoch": 24.65088240433339,
"grad_norm": 0.4240492880344391,
"learning_rate": 0.00030437412587412584,
"loss": 3.1967,
"step": 84650
},
{
"epoch": 24.665443531947115,
"grad_norm": 0.4269134998321533,
"learning_rate": 0.0003041993006993007,
"loss": 3.1905,
"step": 84700
},
{
"epoch": 24.680004659560836,
"grad_norm": 0.41195279359817505,
"learning_rate": 0.0003040244755244755,
"loss": 3.1993,
"step": 84750
},
{
"epoch": 24.694565787174557,
"grad_norm": 0.42535364627838135,
"learning_rate": 0.00030384965034965035,
"loss": 3.2005,
"step": 84800
},
{
"epoch": 24.709126914788282,
"grad_norm": 0.42742469906806946,
"learning_rate": 0.00030367482517482515,
"loss": 3.2101,
"step": 84850
},
{
"epoch": 24.723688042402003,
"grad_norm": 0.4245951473712921,
"learning_rate": 0.0003035,
"loss": 3.1844,
"step": 84900
},
{
"epoch": 24.738249170015727,
"grad_norm": 0.4056941568851471,
"learning_rate": 0.00030332517482517475,
"loss": 3.2026,
"step": 84950
},
{
"epoch": 24.75281029762945,
"grad_norm": 0.41141870617866516,
"learning_rate": 0.00030315034965034966,
"loss": 3.1931,
"step": 85000
},
{
"epoch": 24.75281029762945,
"eval_accuracy": 0.37420977354385193,
"eval_loss": 3.5391323566436768,
"eval_runtime": 179.6699,
"eval_samples_per_second": 92.648,
"eval_steps_per_second": 5.794,
"step": 85000
},
{
"epoch": 24.76737142524317,
"grad_norm": 0.42172756791114807,
"learning_rate": 0.0003029755244755244,
"loss": 3.2005,
"step": 85050
},
{
"epoch": 24.781932552856894,
"grad_norm": 0.4049641489982605,
"learning_rate": 0.00030280069930069926,
"loss": 3.1973,
"step": 85100
},
{
"epoch": 24.796493680470615,
"grad_norm": 0.4251534938812256,
"learning_rate": 0.00030262587412587406,
"loss": 3.204,
"step": 85150
},
{
"epoch": 24.81105480808434,
"grad_norm": 0.41703763604164124,
"learning_rate": 0.0003024510489510489,
"loss": 3.2046,
"step": 85200
},
{
"epoch": 24.82561593569806,
"grad_norm": 0.42161500453948975,
"learning_rate": 0.00030227622377622377,
"loss": 3.2215,
"step": 85250
},
{
"epoch": 24.84017706331178,
"grad_norm": 0.3901550769805908,
"learning_rate": 0.00030210139860139856,
"loss": 3.1957,
"step": 85300
},
{
"epoch": 24.854738190925506,
"grad_norm": 0.3804318606853485,
"learning_rate": 0.0003019265734265734,
"loss": 3.1989,
"step": 85350
},
{
"epoch": 24.869299318539227,
"grad_norm": 0.417339563369751,
"learning_rate": 0.0003017517482517482,
"loss": 3.2125,
"step": 85400
},
{
"epoch": 24.88386044615295,
"grad_norm": 0.4014167785644531,
"learning_rate": 0.00030157692307692307,
"loss": 3.2093,
"step": 85450
},
{
"epoch": 24.898421573766672,
"grad_norm": 0.42887428402900696,
"learning_rate": 0.00030140209790209787,
"loss": 3.2139,
"step": 85500
},
{
"epoch": 24.912982701380393,
"grad_norm": 0.39529547095298767,
"learning_rate": 0.0003012272727272727,
"loss": 3.2057,
"step": 85550
},
{
"epoch": 24.927543828994118,
"grad_norm": 0.4156612753868103,
"learning_rate": 0.0003010524475524475,
"loss": 3.223,
"step": 85600
},
{
"epoch": 24.94210495660784,
"grad_norm": 0.41448283195495605,
"learning_rate": 0.0003008776223776224,
"loss": 3.2285,
"step": 85650
},
{
"epoch": 24.956666084221563,
"grad_norm": 0.4078405797481537,
"learning_rate": 0.0003007027972027972,
"loss": 3.218,
"step": 85700
},
{
"epoch": 24.971227211835284,
"grad_norm": 0.3926677405834198,
"learning_rate": 0.00030052797202797203,
"loss": 3.2141,
"step": 85750
},
{
"epoch": 24.985788339449005,
"grad_norm": 0.38652199506759644,
"learning_rate": 0.0003003531468531468,
"loss": 3.2265,
"step": 85800
},
{
"epoch": 25.000291222552274,
"grad_norm": 0.43078088760375977,
"learning_rate": 0.00030017832167832163,
"loss": 3.2164,
"step": 85850
},
{
"epoch": 25.014852350165995,
"grad_norm": 0.41077038645744324,
"learning_rate": 0.0003000034965034965,
"loss": 3.1137,
"step": 85900
},
{
"epoch": 25.02941347777972,
"grad_norm": 0.41429752111434937,
"learning_rate": 0.0002998286713286713,
"loss": 3.096,
"step": 85950
},
{
"epoch": 25.04397460539344,
"grad_norm": 0.4240957796573639,
"learning_rate": 0.00029965384615384614,
"loss": 3.1166,
"step": 86000
},
{
"epoch": 25.04397460539344,
"eval_accuracy": 0.3734610172974567,
"eval_loss": 3.5545589923858643,
"eval_runtime": 179.5671,
"eval_samples_per_second": 92.701,
"eval_steps_per_second": 5.797,
"step": 86000
},
{
"epoch": 25.058535733007165,
"grad_norm": 0.4221683144569397,
"learning_rate": 0.00029947902097902094,
"loss": 3.1228,
"step": 86050
},
{
"epoch": 25.073096860620886,
"grad_norm": 0.40716591477394104,
"learning_rate": 0.0002993041958041958,
"loss": 3.1302,
"step": 86100
},
{
"epoch": 25.087657988234607,
"grad_norm": 0.49493148922920227,
"learning_rate": 0.0002991293706293706,
"loss": 3.1236,
"step": 86150
},
{
"epoch": 25.10221911584833,
"grad_norm": 0.40609848499298096,
"learning_rate": 0.0002989545454545454,
"loss": 3.1331,
"step": 86200
},
{
"epoch": 25.116780243462053,
"grad_norm": 0.4115280508995056,
"learning_rate": 0.00029877972027972025,
"loss": 3.1269,
"step": 86250
},
{
"epoch": 25.131341371075777,
"grad_norm": 0.45201724767684937,
"learning_rate": 0.0002986048951048951,
"loss": 3.1332,
"step": 86300
},
{
"epoch": 25.145902498689498,
"grad_norm": 0.4339931905269623,
"learning_rate": 0.0002984300699300699,
"loss": 3.1337,
"step": 86350
},
{
"epoch": 25.160463626303223,
"grad_norm": 0.45109128952026367,
"learning_rate": 0.00029825524475524475,
"loss": 3.1415,
"step": 86400
},
{
"epoch": 25.175024753916944,
"grad_norm": 0.4096085727214813,
"learning_rate": 0.00029808041958041955,
"loss": 3.1476,
"step": 86450
},
{
"epoch": 25.189585881530665,
"grad_norm": 0.4265762269496918,
"learning_rate": 0.0002979055944055944,
"loss": 3.1591,
"step": 86500
},
{
"epoch": 25.20414700914439,
"grad_norm": 0.41000792384147644,
"learning_rate": 0.0002977307692307692,
"loss": 3.1476,
"step": 86550
},
{
"epoch": 25.21870813675811,
"grad_norm": 0.43935540318489075,
"learning_rate": 0.000297555944055944,
"loss": 3.1401,
"step": 86600
},
{
"epoch": 25.233269264371835,
"grad_norm": 0.4473186731338501,
"learning_rate": 0.00029738111888111886,
"loss": 3.1485,
"step": 86650
},
{
"epoch": 25.247830391985556,
"grad_norm": 0.43369588255882263,
"learning_rate": 0.00029720629370629366,
"loss": 3.1469,
"step": 86700
},
{
"epoch": 25.262391519599277,
"grad_norm": 0.42044442892074585,
"learning_rate": 0.0002970314685314685,
"loss": 3.155,
"step": 86750
},
{
"epoch": 25.276952647213,
"grad_norm": 0.4263746738433838,
"learning_rate": 0.0002968566433566433,
"loss": 3.1673,
"step": 86800
},
{
"epoch": 25.291513774826722,
"grad_norm": 0.39871883392333984,
"learning_rate": 0.00029668181818181817,
"loss": 3.1636,
"step": 86850
},
{
"epoch": 25.306074902440447,
"grad_norm": 0.4151465594768524,
"learning_rate": 0.00029650699300699297,
"loss": 3.1559,
"step": 86900
},
{
"epoch": 25.320636030054168,
"grad_norm": 0.4401465356349945,
"learning_rate": 0.0002963321678321678,
"loss": 3.1607,
"step": 86950
},
{
"epoch": 25.33519715766789,
"grad_norm": 0.44773170351982117,
"learning_rate": 0.0002961573426573426,
"loss": 3.1681,
"step": 87000
},
{
"epoch": 25.33519715766789,
"eval_accuracy": 0.37373070591878355,
"eval_loss": 3.5502004623413086,
"eval_runtime": 179.4013,
"eval_samples_per_second": 92.786,
"eval_steps_per_second": 5.803,
"step": 87000
},
{
"epoch": 25.349758285281613,
"grad_norm": 0.4073939919471741,
"learning_rate": 0.0002959825174825175,
"loss": 3.1745,
"step": 87050
},
{
"epoch": 25.364319412895334,
"grad_norm": 0.42033010721206665,
"learning_rate": 0.0002958076923076923,
"loss": 3.1593,
"step": 87100
},
{
"epoch": 25.37888054050906,
"grad_norm": 0.42100638151168823,
"learning_rate": 0.00029563286713286713,
"loss": 3.1704,
"step": 87150
},
{
"epoch": 25.39344166812278,
"grad_norm": 0.39759254455566406,
"learning_rate": 0.00029545804195804193,
"loss": 3.1726,
"step": 87200
},
{
"epoch": 25.4080027957365,
"grad_norm": 0.4013213813304901,
"learning_rate": 0.0002952832167832168,
"loss": 3.1691,
"step": 87250
},
{
"epoch": 25.422563923350225,
"grad_norm": 0.41608861088752747,
"learning_rate": 0.0002951083916083916,
"loss": 3.1637,
"step": 87300
},
{
"epoch": 25.437125050963946,
"grad_norm": 0.40248188376426697,
"learning_rate": 0.0002949335664335664,
"loss": 3.1755,
"step": 87350
},
{
"epoch": 25.45168617857767,
"grad_norm": 0.4516514241695404,
"learning_rate": 0.00029475874125874124,
"loss": 3.1584,
"step": 87400
},
{
"epoch": 25.46624730619139,
"grad_norm": 0.4402170479297638,
"learning_rate": 0.00029458391608391604,
"loss": 3.1646,
"step": 87450
},
{
"epoch": 25.480808433805112,
"grad_norm": 0.42294979095458984,
"learning_rate": 0.0002944090909090909,
"loss": 3.195,
"step": 87500
},
{
"epoch": 25.495369561418837,
"grad_norm": 0.40274524688720703,
"learning_rate": 0.0002942342657342657,
"loss": 3.175,
"step": 87550
},
{
"epoch": 25.509930689032558,
"grad_norm": 0.417464017868042,
"learning_rate": 0.00029405944055944054,
"loss": 3.1744,
"step": 87600
},
{
"epoch": 25.524491816646282,
"grad_norm": 0.40466922521591187,
"learning_rate": 0.0002938846153846154,
"loss": 3.1796,
"step": 87650
},
{
"epoch": 25.539052944260003,
"grad_norm": 0.42506036162376404,
"learning_rate": 0.0002937097902097902,
"loss": 3.1843,
"step": 87700
},
{
"epoch": 25.553614071873724,
"grad_norm": 0.4386059045791626,
"learning_rate": 0.000293534965034965,
"loss": 3.1923,
"step": 87750
},
{
"epoch": 25.56817519948745,
"grad_norm": 0.4146467447280884,
"learning_rate": 0.00029336013986013985,
"loss": 3.1858,
"step": 87800
},
{
"epoch": 25.58273632710117,
"grad_norm": 0.43993905186653137,
"learning_rate": 0.00029318531468531465,
"loss": 3.196,
"step": 87850
},
{
"epoch": 25.597297454714894,
"grad_norm": 0.4224095046520233,
"learning_rate": 0.0002930104895104895,
"loss": 3.1821,
"step": 87900
},
{
"epoch": 25.611858582328615,
"grad_norm": 0.4316778779029846,
"learning_rate": 0.0002928356643356643,
"loss": 3.1887,
"step": 87950
},
{
"epoch": 25.626419709942336,
"grad_norm": 0.41799435019493103,
"learning_rate": 0.00029266083916083916,
"loss": 3.1803,
"step": 88000
},
{
"epoch": 25.626419709942336,
"eval_accuracy": 0.374554349546079,
"eval_loss": 3.536559820175171,
"eval_runtime": 179.4759,
"eval_samples_per_second": 92.748,
"eval_steps_per_second": 5.8,
"step": 88000
},
{
"epoch": 25.64098083755606,
"grad_norm": 0.4062333405017853,
"learning_rate": 0.00029248601398601396,
"loss": 3.1928,
"step": 88050
},
{
"epoch": 25.655541965169782,
"grad_norm": 0.419531911611557,
"learning_rate": 0.00029231118881118876,
"loss": 3.1956,
"step": 88100
},
{
"epoch": 25.670103092783506,
"grad_norm": 0.42531606554985046,
"learning_rate": 0.0002921363636363636,
"loss": 3.1854,
"step": 88150
},
{
"epoch": 25.684664220397227,
"grad_norm": 0.43867820501327515,
"learning_rate": 0.0002919615384615384,
"loss": 3.1958,
"step": 88200
},
{
"epoch": 25.69922534801095,
"grad_norm": 0.42020124197006226,
"learning_rate": 0.00029178671328671326,
"loss": 3.1814,
"step": 88250
},
{
"epoch": 25.713786475624673,
"grad_norm": 0.4607504904270172,
"learning_rate": 0.00029161188811188806,
"loss": 3.1824,
"step": 88300
},
{
"epoch": 25.728347603238394,
"grad_norm": 0.4196507930755615,
"learning_rate": 0.0002914370629370629,
"loss": 3.1873,
"step": 88350
},
{
"epoch": 25.74290873085212,
"grad_norm": 0.441723495721817,
"learning_rate": 0.00029126223776223777,
"loss": 3.19,
"step": 88400
},
{
"epoch": 25.75746985846584,
"grad_norm": 0.43082764744758606,
"learning_rate": 0.00029108741258741257,
"loss": 3.1919,
"step": 88450
},
{
"epoch": 25.772030986079564,
"grad_norm": 0.40166640281677246,
"learning_rate": 0.00029091258741258737,
"loss": 3.1974,
"step": 88500
},
{
"epoch": 25.786592113693285,
"grad_norm": 0.42622798681259155,
"learning_rate": 0.0002907377622377622,
"loss": 3.1984,
"step": 88550
},
{
"epoch": 25.801153241307006,
"grad_norm": 0.4091523289680481,
"learning_rate": 0.000290562937062937,
"loss": 3.1943,
"step": 88600
},
{
"epoch": 25.81571436892073,
"grad_norm": 0.4160587191581726,
"learning_rate": 0.0002903881118881119,
"loss": 3.1949,
"step": 88650
},
{
"epoch": 25.83027549653445,
"grad_norm": 0.4708373546600342,
"learning_rate": 0.0002902132867132867,
"loss": 3.1895,
"step": 88700
},
{
"epoch": 25.844836624148176,
"grad_norm": 0.39775756001472473,
"learning_rate": 0.00029003846153846153,
"loss": 3.2102,
"step": 88750
},
{
"epoch": 25.859397751761897,
"grad_norm": 0.41929927468299866,
"learning_rate": 0.00028986363636363633,
"loss": 3.2036,
"step": 88800
},
{
"epoch": 25.873958879375618,
"grad_norm": 0.41182512044906616,
"learning_rate": 0.00028968881118881113,
"loss": 3.1945,
"step": 88850
},
{
"epoch": 25.888520006989342,
"grad_norm": 0.46520110964775085,
"learning_rate": 0.000289513986013986,
"loss": 3.1894,
"step": 88900
},
{
"epoch": 25.903081134603063,
"grad_norm": 0.4061237573623657,
"learning_rate": 0.0002893391608391608,
"loss": 3.1986,
"step": 88950
},
{
"epoch": 25.917642262216788,
"grad_norm": 0.3934573531150818,
"learning_rate": 0.00028916433566433564,
"loss": 3.2024,
"step": 89000
},
{
"epoch": 25.917642262216788,
"eval_accuracy": 0.3748935176683667,
"eval_loss": 3.5322020053863525,
"eval_runtime": 179.4097,
"eval_samples_per_second": 92.782,
"eval_steps_per_second": 5.802,
"step": 89000
},
{
"epoch": 25.93220338983051,
"grad_norm": 0.4177358150482178,
"learning_rate": 0.0002889895104895105,
"loss": 3.1961,
"step": 89050
},
{
"epoch": 25.94676451744423,
"grad_norm": 0.43535518646240234,
"learning_rate": 0.0002888146853146853,
"loss": 3.1913,
"step": 89100
},
{
"epoch": 25.961325645057954,
"grad_norm": 0.46203741431236267,
"learning_rate": 0.00028863986013986015,
"loss": 3.1868,
"step": 89150
},
{
"epoch": 25.975886772671675,
"grad_norm": 0.4000639021396637,
"learning_rate": 0.00028846503496503495,
"loss": 3.1947,
"step": 89200
},
{
"epoch": 25.9904479002854,
"grad_norm": 0.40938976407051086,
"learning_rate": 0.00028829020979020975,
"loss": 3.1956,
"step": 89250
},
{
"epoch": 26.004950783388665,
"grad_norm": 0.39724087715148926,
"learning_rate": 0.0002881153846153846,
"loss": 3.1687,
"step": 89300
},
{
"epoch": 26.01951191100239,
"grad_norm": 0.4286766052246094,
"learning_rate": 0.0002879405594405594,
"loss": 3.0999,
"step": 89350
},
{
"epoch": 26.03407303861611,
"grad_norm": 0.4051412045955658,
"learning_rate": 0.00028776573426573425,
"loss": 3.1078,
"step": 89400
},
{
"epoch": 26.04863416622983,
"grad_norm": 0.41459497809410095,
"learning_rate": 0.00028759090909090905,
"loss": 3.1089,
"step": 89450
},
{
"epoch": 26.063195293843556,
"grad_norm": 0.4035181999206543,
"learning_rate": 0.0002874160839160839,
"loss": 3.1077,
"step": 89500
},
{
"epoch": 26.077756421457277,
"grad_norm": 0.4261769652366638,
"learning_rate": 0.0002872412587412587,
"loss": 3.1224,
"step": 89550
},
{
"epoch": 26.092317549071,
"grad_norm": 0.48869553208351135,
"learning_rate": 0.0002870664335664335,
"loss": 3.123,
"step": 89600
},
{
"epoch": 26.106878676684723,
"grad_norm": 0.42526131868362427,
"learning_rate": 0.00028689160839160836,
"loss": 3.1323,
"step": 89650
},
{
"epoch": 26.121439804298443,
"grad_norm": 0.44049084186553955,
"learning_rate": 0.0002867167832167832,
"loss": 3.1283,
"step": 89700
},
{
"epoch": 26.136000931912168,
"grad_norm": 0.4529089331626892,
"learning_rate": 0.000286541958041958,
"loss": 3.1439,
"step": 89750
},
{
"epoch": 26.15056205952589,
"grad_norm": 0.4227697551250458,
"learning_rate": 0.00028636713286713287,
"loss": 3.136,
"step": 89800
},
{
"epoch": 26.165123187139613,
"grad_norm": 0.43322882056236267,
"learning_rate": 0.00028619230769230767,
"loss": 3.128,
"step": 89850
},
{
"epoch": 26.179684314753334,
"grad_norm": 0.41136738657951355,
"learning_rate": 0.0002860174825174825,
"loss": 3.1288,
"step": 89900
},
{
"epoch": 26.194245442367055,
"grad_norm": 0.42501312494277954,
"learning_rate": 0.0002858426573426573,
"loss": 3.147,
"step": 89950
},
{
"epoch": 26.20880656998078,
"grad_norm": 0.4537741243839264,
"learning_rate": 0.0002856678321678321,
"loss": 3.135,
"step": 90000
},
{
"epoch": 26.20880656998078,
"eval_accuracy": 0.37393232579043806,
"eval_loss": 3.550401449203491,
"eval_runtime": 179.4726,
"eval_samples_per_second": 92.75,
"eval_steps_per_second": 5.8,
"step": 90000
},
{
"epoch": 26.2233676975945,
"grad_norm": 0.42622241377830505,
"learning_rate": 0.000285493006993007,
"loss": 3.1271,
"step": 90050
},
{
"epoch": 26.237928825208225,
"grad_norm": 0.4380102753639221,
"learning_rate": 0.0002853181818181818,
"loss": 3.1392,
"step": 90100
},
{
"epoch": 26.252489952821946,
"grad_norm": 0.43311333656311035,
"learning_rate": 0.00028514335664335663,
"loss": 3.1523,
"step": 90150
},
{
"epoch": 26.267051080435667,
"grad_norm": 0.41497403383255005,
"learning_rate": 0.00028496853146853143,
"loss": 3.1481,
"step": 90200
},
{
"epoch": 26.281612208049392,
"grad_norm": 0.4311578571796417,
"learning_rate": 0.0002847937062937063,
"loss": 3.1517,
"step": 90250
},
{
"epoch": 26.296173335663113,
"grad_norm": 0.44830796122550964,
"learning_rate": 0.0002846188811188811,
"loss": 3.1489,
"step": 90300
},
{
"epoch": 26.310734463276837,
"grad_norm": 0.4175836145877838,
"learning_rate": 0.0002844440559440559,
"loss": 3.1483,
"step": 90350
},
{
"epoch": 26.32529559089056,
"grad_norm": 0.40580078959465027,
"learning_rate": 0.00028426923076923074,
"loss": 3.1461,
"step": 90400
},
{
"epoch": 26.33985671850428,
"grad_norm": 0.4338361620903015,
"learning_rate": 0.0002840944055944056,
"loss": 3.1598,
"step": 90450
},
{
"epoch": 26.354417846118004,
"grad_norm": 0.39846161007881165,
"learning_rate": 0.0002839195804195804,
"loss": 3.151,
"step": 90500
},
{
"epoch": 26.368978973731725,
"grad_norm": 0.42146289348602295,
"learning_rate": 0.00028374475524475524,
"loss": 3.1622,
"step": 90550
},
{
"epoch": 26.38354010134545,
"grad_norm": 0.43501943349838257,
"learning_rate": 0.00028356993006993004,
"loss": 3.1506,
"step": 90600
},
{
"epoch": 26.39810122895917,
"grad_norm": 0.42838984727859497,
"learning_rate": 0.0002833951048951049,
"loss": 3.1577,
"step": 90650
},
{
"epoch": 26.41266235657289,
"grad_norm": 0.4120590090751648,
"learning_rate": 0.0002832202797202797,
"loss": 3.1554,
"step": 90700
},
{
"epoch": 26.427223484186616,
"grad_norm": 0.49454623460769653,
"learning_rate": 0.0002830454545454545,
"loss": 3.1677,
"step": 90750
},
{
"epoch": 26.441784611800337,
"grad_norm": 0.43354135751724243,
"learning_rate": 0.00028287062937062935,
"loss": 3.1539,
"step": 90800
},
{
"epoch": 26.45634573941406,
"grad_norm": 0.43948525190353394,
"learning_rate": 0.00028269580419580415,
"loss": 3.1679,
"step": 90850
},
{
"epoch": 26.470906867027782,
"grad_norm": 0.4159654676914215,
"learning_rate": 0.000282520979020979,
"loss": 3.1556,
"step": 90900
},
{
"epoch": 26.485467994641503,
"grad_norm": 0.4314415156841278,
"learning_rate": 0.0002823461538461538,
"loss": 3.1691,
"step": 90950
},
{
"epoch": 26.500029122255228,
"grad_norm": 0.4420700967311859,
"learning_rate": 0.00028217132867132866,
"loss": 3.1721,
"step": 91000
},
{
"epoch": 26.500029122255228,
"eval_accuracy": 0.3742696129110077,
"eval_loss": 3.544522762298584,
"eval_runtime": 179.9049,
"eval_samples_per_second": 92.527,
"eval_steps_per_second": 5.786,
"step": 91000
},
{
"epoch": 26.51459024986895,
"grad_norm": 0.43976011872291565,
"learning_rate": 0.00028199650349650346,
"loss": 3.1713,
"step": 91050
},
{
"epoch": 26.529151377482673,
"grad_norm": 0.42499929666519165,
"learning_rate": 0.0002818216783216783,
"loss": 3.1779,
"step": 91100
},
{
"epoch": 26.543712505096394,
"grad_norm": 0.4342220425605774,
"learning_rate": 0.0002816468531468531,
"loss": 3.1629,
"step": 91150
},
{
"epoch": 26.55827363271012,
"grad_norm": 0.4181844890117645,
"learning_rate": 0.00028147202797202796,
"loss": 3.1721,
"step": 91200
},
{
"epoch": 26.57283476032384,
"grad_norm": 0.4265999495983124,
"learning_rate": 0.00028129720279720276,
"loss": 3.186,
"step": 91250
},
{
"epoch": 26.58739588793756,
"grad_norm": 0.4465186297893524,
"learning_rate": 0.0002811223776223776,
"loss": 3.1735,
"step": 91300
},
{
"epoch": 26.601957015551285,
"grad_norm": 0.4092189371585846,
"learning_rate": 0.0002809475524475524,
"loss": 3.1739,
"step": 91350
},
{
"epoch": 26.616518143165006,
"grad_norm": 0.4340270459651947,
"learning_rate": 0.00028077272727272727,
"loss": 3.1693,
"step": 91400
},
{
"epoch": 26.63107927077873,
"grad_norm": 0.48418542742729187,
"learning_rate": 0.00028059790209790207,
"loss": 3.1732,
"step": 91450
},
{
"epoch": 26.64564039839245,
"grad_norm": 0.4325987696647644,
"learning_rate": 0.00028042307692307687,
"loss": 3.1749,
"step": 91500
},
{
"epoch": 26.660201526006173,
"grad_norm": 0.3916352391242981,
"learning_rate": 0.0002802482517482517,
"loss": 3.1674,
"step": 91550
},
{
"epoch": 26.674762653619897,
"grad_norm": 0.4213244318962097,
"learning_rate": 0.0002800734265734265,
"loss": 3.1871,
"step": 91600
},
{
"epoch": 26.689323781233618,
"grad_norm": 0.4683827757835388,
"learning_rate": 0.0002798986013986014,
"loss": 3.1804,
"step": 91650
},
{
"epoch": 26.703884908847343,
"grad_norm": 0.44533535838127136,
"learning_rate": 0.0002797237762237762,
"loss": 3.1719,
"step": 91700
},
{
"epoch": 26.718446036461064,
"grad_norm": 0.43670111894607544,
"learning_rate": 0.00027954895104895103,
"loss": 3.1858,
"step": 91750
},
{
"epoch": 26.733007164074785,
"grad_norm": 0.43646425008773804,
"learning_rate": 0.0002793741258741259,
"loss": 3.1873,
"step": 91800
},
{
"epoch": 26.74756829168851,
"grad_norm": 0.4080584645271301,
"learning_rate": 0.0002791993006993007,
"loss": 3.1912,
"step": 91850
},
{
"epoch": 26.76212941930223,
"grad_norm": 0.46796929836273193,
"learning_rate": 0.0002790244755244755,
"loss": 3.1798,
"step": 91900
},
{
"epoch": 26.776690546915955,
"grad_norm": 0.4078224003314972,
"learning_rate": 0.00027884965034965034,
"loss": 3.1904,
"step": 91950
},
{
"epoch": 26.791251674529676,
"grad_norm": 0.402871698141098,
"learning_rate": 0.00027867482517482514,
"loss": 3.1848,
"step": 92000
},
{
"epoch": 26.791251674529676,
"eval_accuracy": 0.3749665240475489,
"eval_loss": 3.5339274406433105,
"eval_runtime": 179.9101,
"eval_samples_per_second": 92.524,
"eval_steps_per_second": 5.786,
"step": 92000
},
{
"epoch": 26.805812802143397,
"grad_norm": 0.4266301393508911,
"learning_rate": 0.0002785,
"loss": 3.1884,
"step": 92050
},
{
"epoch": 26.82037392975712,
"grad_norm": 0.41707316040992737,
"learning_rate": 0.0002783251748251748,
"loss": 3.1793,
"step": 92100
},
{
"epoch": 26.834935057370842,
"grad_norm": 0.4078686237335205,
"learning_rate": 0.00027815034965034965,
"loss": 3.1732,
"step": 92150
},
{
"epoch": 26.849496184984567,
"grad_norm": 0.4338757395744324,
"learning_rate": 0.00027797552447552445,
"loss": 3.1861,
"step": 92200
},
{
"epoch": 26.864057312598288,
"grad_norm": 0.4215943515300751,
"learning_rate": 0.00027780069930069925,
"loss": 3.1934,
"step": 92250
},
{
"epoch": 26.87861844021201,
"grad_norm": 0.4491233825683594,
"learning_rate": 0.0002776258741258741,
"loss": 3.1784,
"step": 92300
},
{
"epoch": 26.893179567825733,
"grad_norm": 0.4336360692977905,
"learning_rate": 0.0002774510489510489,
"loss": 3.178,
"step": 92350
},
{
"epoch": 26.907740695439454,
"grad_norm": 0.43273472785949707,
"learning_rate": 0.00027727622377622375,
"loss": 3.197,
"step": 92400
},
{
"epoch": 26.92230182305318,
"grad_norm": 0.421914666891098,
"learning_rate": 0.00027710139860139855,
"loss": 3.1916,
"step": 92450
},
{
"epoch": 26.9368629506669,
"grad_norm": 0.4407094120979309,
"learning_rate": 0.0002769265734265734,
"loss": 3.1919,
"step": 92500
},
{
"epoch": 26.95142407828062,
"grad_norm": 0.4097945988178253,
"learning_rate": 0.00027675174825174826,
"loss": 3.1854,
"step": 92550
},
{
"epoch": 26.965985205894345,
"grad_norm": 0.4385409951210022,
"learning_rate": 0.00027657692307692306,
"loss": 3.1991,
"step": 92600
},
{
"epoch": 26.980546333508066,
"grad_norm": 0.427116721868515,
"learning_rate": 0.00027640209790209786,
"loss": 3.1969,
"step": 92650
},
{
"epoch": 26.99510746112179,
"grad_norm": 0.41636037826538086,
"learning_rate": 0.0002762272727272727,
"loss": 3.1876,
"step": 92700
},
{
"epoch": 27.009610344225056,
"grad_norm": 0.4315323531627655,
"learning_rate": 0.0002760524475524475,
"loss": 3.1355,
"step": 92750
},
{
"epoch": 27.02417147183878,
"grad_norm": 0.4142589569091797,
"learning_rate": 0.00027587762237762237,
"loss": 3.0923,
"step": 92800
},
{
"epoch": 27.0387325994525,
"grad_norm": 0.4673878848552704,
"learning_rate": 0.00027570279720279717,
"loss": 3.0964,
"step": 92850
},
{
"epoch": 27.053293727066222,
"grad_norm": 0.46009519696235657,
"learning_rate": 0.000275527972027972,
"loss": 3.1095,
"step": 92900
},
{
"epoch": 27.067854854679947,
"grad_norm": 0.4197070002555847,
"learning_rate": 0.0002753531468531468,
"loss": 3.1225,
"step": 92950
},
{
"epoch": 27.082415982293668,
"grad_norm": 0.42871737480163574,
"learning_rate": 0.0002751783216783216,
"loss": 3.1138,
"step": 93000
},
{
"epoch": 27.082415982293668,
"eval_accuracy": 0.3740296676293477,
"eval_loss": 3.5543622970581055,
"eval_runtime": 179.714,
"eval_samples_per_second": 92.625,
"eval_steps_per_second": 5.793,
"step": 93000
},
{
"epoch": 27.096977109907392,
"grad_norm": 0.46484479308128357,
"learning_rate": 0.0002750034965034965,
"loss": 3.1031,
"step": 93050
},
{
"epoch": 27.111538237521113,
"grad_norm": 0.41515466570854187,
"learning_rate": 0.0002748286713286713,
"loss": 3.1183,
"step": 93100
},
{
"epoch": 27.126099365134834,
"grad_norm": 0.43212100863456726,
"learning_rate": 0.00027465384615384613,
"loss": 3.1177,
"step": 93150
},
{
"epoch": 27.14066049274856,
"grad_norm": 0.4330689013004303,
"learning_rate": 0.000274479020979021,
"loss": 3.1191,
"step": 93200
},
{
"epoch": 27.15522162036228,
"grad_norm": 0.46141743659973145,
"learning_rate": 0.0002743041958041958,
"loss": 3.1164,
"step": 93250
},
{
"epoch": 27.169782747976004,
"grad_norm": 0.42743605375289917,
"learning_rate": 0.00027412937062937064,
"loss": 3.1285,
"step": 93300
},
{
"epoch": 27.184343875589725,
"grad_norm": 0.45486384630203247,
"learning_rate": 0.00027395454545454544,
"loss": 3.1362,
"step": 93350
},
{
"epoch": 27.19890500320345,
"grad_norm": 0.45941078662872314,
"learning_rate": 0.00027377972027972024,
"loss": 3.1198,
"step": 93400
},
{
"epoch": 27.21346613081717,
"grad_norm": 0.44040271639823914,
"learning_rate": 0.0002736048951048951,
"loss": 3.1319,
"step": 93450
},
{
"epoch": 27.228027258430892,
"grad_norm": 0.4241819679737091,
"learning_rate": 0.0002734300699300699,
"loss": 3.1243,
"step": 93500
},
{
"epoch": 27.242588386044616,
"grad_norm": 0.46160954236984253,
"learning_rate": 0.00027325524475524474,
"loss": 3.142,
"step": 93550
},
{
"epoch": 27.257149513658337,
"grad_norm": 0.4334246814250946,
"learning_rate": 0.00027308041958041954,
"loss": 3.1297,
"step": 93600
},
{
"epoch": 27.271710641272062,
"grad_norm": 0.4573898911476135,
"learning_rate": 0.0002729055944055944,
"loss": 3.1421,
"step": 93650
},
{
"epoch": 27.286271768885783,
"grad_norm": 0.4379344880580902,
"learning_rate": 0.0002727307692307692,
"loss": 3.1389,
"step": 93700
},
{
"epoch": 27.300832896499504,
"grad_norm": 0.4442446529865265,
"learning_rate": 0.000272555944055944,
"loss": 3.1505,
"step": 93750
},
{
"epoch": 27.31539402411323,
"grad_norm": 0.4473417103290558,
"learning_rate": 0.00027238111888111885,
"loss": 3.1305,
"step": 93800
},
{
"epoch": 27.32995515172695,
"grad_norm": 0.4124809503555298,
"learning_rate": 0.0002722062937062937,
"loss": 3.1383,
"step": 93850
},
{
"epoch": 27.344516279340674,
"grad_norm": 0.44610223174095154,
"learning_rate": 0.0002720314685314685,
"loss": 3.1296,
"step": 93900
},
{
"epoch": 27.359077406954395,
"grad_norm": 0.4158053994178772,
"learning_rate": 0.00027185664335664336,
"loss": 3.1463,
"step": 93950
},
{
"epoch": 27.373638534568116,
"grad_norm": 0.4296194911003113,
"learning_rate": 0.00027168181818181816,
"loss": 3.1423,
"step": 94000
},
{
"epoch": 27.373638534568116,
"eval_accuracy": 0.37445030663854884,
"eval_loss": 3.54975962638855,
"eval_runtime": 179.4681,
"eval_samples_per_second": 92.752,
"eval_steps_per_second": 5.8,
"step": 94000
},
{
"epoch": 27.38819966218184,
"grad_norm": 0.45299991965293884,
"learning_rate": 0.000271506993006993,
"loss": 3.1461,
"step": 94050
},
{
"epoch": 27.40276078979556,
"grad_norm": 0.43401360511779785,
"learning_rate": 0.0002713321678321678,
"loss": 3.1442,
"step": 94100
},
{
"epoch": 27.417321917409286,
"grad_norm": 0.45293885469436646,
"learning_rate": 0.0002711573426573426,
"loss": 3.1452,
"step": 94150
},
{
"epoch": 27.431883045023007,
"grad_norm": 0.4478358030319214,
"learning_rate": 0.00027098251748251746,
"loss": 3.155,
"step": 94200
},
{
"epoch": 27.446444172636728,
"grad_norm": 0.415910542011261,
"learning_rate": 0.00027080769230769226,
"loss": 3.1386,
"step": 94250
},
{
"epoch": 27.461005300250452,
"grad_norm": 0.4332504868507385,
"learning_rate": 0.0002706328671328671,
"loss": 3.1429,
"step": 94300
},
{
"epoch": 27.475566427864173,
"grad_norm": 0.4292246699333191,
"learning_rate": 0.0002704580419580419,
"loss": 3.151,
"step": 94350
},
{
"epoch": 27.490127555477898,
"grad_norm": 0.44255971908569336,
"learning_rate": 0.00027028321678321677,
"loss": 3.1511,
"step": 94400
},
{
"epoch": 27.50468868309162,
"grad_norm": 0.41606923937797546,
"learning_rate": 0.00027010839160839157,
"loss": 3.1535,
"step": 94450
},
{
"epoch": 27.51924981070534,
"grad_norm": 0.4303070902824402,
"learning_rate": 0.00026993356643356637,
"loss": 3.1551,
"step": 94500
},
{
"epoch": 27.533810938319064,
"grad_norm": 0.4394899010658264,
"learning_rate": 0.0002697587412587412,
"loss": 3.1537,
"step": 94550
},
{
"epoch": 27.548372065932785,
"grad_norm": 0.46788570284843445,
"learning_rate": 0.0002695839160839161,
"loss": 3.1601,
"step": 94600
},
{
"epoch": 27.56293319354651,
"grad_norm": 0.4206959903240204,
"learning_rate": 0.0002694090909090909,
"loss": 3.164,
"step": 94650
},
{
"epoch": 27.57749432116023,
"grad_norm": 0.4316728711128235,
"learning_rate": 0.00026923426573426573,
"loss": 3.1676,
"step": 94700
},
{
"epoch": 27.59205544877395,
"grad_norm": 0.44622600078582764,
"learning_rate": 0.00026905944055944053,
"loss": 3.1731,
"step": 94750
},
{
"epoch": 27.606616576387676,
"grad_norm": 0.4393576383590698,
"learning_rate": 0.0002688846153846154,
"loss": 3.1578,
"step": 94800
},
{
"epoch": 27.621177704001397,
"grad_norm": 0.42997148633003235,
"learning_rate": 0.0002687097902097902,
"loss": 3.1693,
"step": 94850
},
{
"epoch": 27.63573883161512,
"grad_norm": 0.4639645218849182,
"learning_rate": 0.000268534965034965,
"loss": 3.1624,
"step": 94900
},
{
"epoch": 27.650299959228843,
"grad_norm": 0.42244911193847656,
"learning_rate": 0.00026836013986013984,
"loss": 3.1642,
"step": 94950
},
{
"epoch": 27.664861086842564,
"grad_norm": 0.41115155816078186,
"learning_rate": 0.00026818531468531464,
"loss": 3.1693,
"step": 95000
},
{
"epoch": 27.664861086842564,
"eval_accuracy": 0.3748031120232924,
"eval_loss": 3.5409553050994873,
"eval_runtime": 179.4828,
"eval_samples_per_second": 92.744,
"eval_steps_per_second": 5.8,
"step": 95000
},
{
"epoch": 27.679422214456288,
"grad_norm": 0.40346843004226685,
"learning_rate": 0.0002680104895104895,
"loss": 3.1738,
"step": 95050
},
{
"epoch": 27.69398334207001,
"grad_norm": 0.43340983986854553,
"learning_rate": 0.0002678356643356643,
"loss": 3.1798,
"step": 95100
},
{
"epoch": 27.708544469683734,
"grad_norm": 0.42665207386016846,
"learning_rate": 0.00026766083916083915,
"loss": 3.1673,
"step": 95150
},
{
"epoch": 27.723105597297454,
"grad_norm": 0.4190896153450012,
"learning_rate": 0.00026748601398601395,
"loss": 3.178,
"step": 95200
},
{
"epoch": 27.737666724911175,
"grad_norm": 0.4432176649570465,
"learning_rate": 0.0002673111888111888,
"loss": 3.1849,
"step": 95250
},
{
"epoch": 27.7522278525249,
"grad_norm": 0.415159672498703,
"learning_rate": 0.0002671363636363636,
"loss": 3.1811,
"step": 95300
},
{
"epoch": 27.76678898013862,
"grad_norm": 0.41852837800979614,
"learning_rate": 0.00026696153846153845,
"loss": 3.1709,
"step": 95350
},
{
"epoch": 27.781350107752345,
"grad_norm": 0.44601112604141235,
"learning_rate": 0.00026678671328671325,
"loss": 3.1783,
"step": 95400
},
{
"epoch": 27.795911235366066,
"grad_norm": 0.4142734706401825,
"learning_rate": 0.0002666118881118881,
"loss": 3.1712,
"step": 95450
},
{
"epoch": 27.810472362979787,
"grad_norm": 0.4157761335372925,
"learning_rate": 0.0002664370629370629,
"loss": 3.1946,
"step": 95500
},
{
"epoch": 27.825033490593512,
"grad_norm": 0.43406689167022705,
"learning_rate": 0.00026626223776223776,
"loss": 3.1675,
"step": 95550
},
{
"epoch": 27.839594618207233,
"grad_norm": 0.4497780501842499,
"learning_rate": 0.00026608741258741256,
"loss": 3.1911,
"step": 95600
},
{
"epoch": 27.854155745820957,
"grad_norm": 0.4544839560985565,
"learning_rate": 0.00026591258741258736,
"loss": 3.1824,
"step": 95650
},
{
"epoch": 27.86871687343468,
"grad_norm": 0.4285454750061035,
"learning_rate": 0.0002657377622377622,
"loss": 3.1875,
"step": 95700
},
{
"epoch": 27.883278001048403,
"grad_norm": 0.4224247634410858,
"learning_rate": 0.000265562937062937,
"loss": 3.1819,
"step": 95750
},
{
"epoch": 27.897839128662124,
"grad_norm": 0.4537641108036041,
"learning_rate": 0.00026538811188811187,
"loss": 3.1797,
"step": 95800
},
{
"epoch": 27.912400256275845,
"grad_norm": 0.4123370051383972,
"learning_rate": 0.00026521328671328667,
"loss": 3.1787,
"step": 95850
},
{
"epoch": 27.92696138388957,
"grad_norm": 0.5115052461624146,
"learning_rate": 0.0002650384615384615,
"loss": 3.1819,
"step": 95900
},
{
"epoch": 27.94152251150329,
"grad_norm": 0.4419485926628113,
"learning_rate": 0.0002648636363636364,
"loss": 3.1809,
"step": 95950
},
{
"epoch": 27.956083639117015,
"grad_norm": 0.48116227984428406,
"learning_rate": 0.0002646888111888112,
"loss": 3.1911,
"step": 96000
},
{
"epoch": 27.956083639117015,
"eval_accuracy": 0.3750263634147047,
"eval_loss": 3.536466598510742,
"eval_runtime": 179.45,
"eval_samples_per_second": 92.761,
"eval_steps_per_second": 5.801,
"step": 96000
},
{
"epoch": 27.970644766730736,
"grad_norm": 0.4137166440486908,
"learning_rate": 0.000264513986013986,
"loss": 3.1921,
"step": 96050
},
{
"epoch": 27.985205894344457,
"grad_norm": 0.4321348965167999,
"learning_rate": 0.00026433916083916083,
"loss": 3.1882,
"step": 96100
},
{
"epoch": 27.99976702195818,
"grad_norm": 0.4369487762451172,
"learning_rate": 0.00026416433566433563,
"loss": 3.1977,
"step": 96150
},
{
"epoch": 28.014269905061447,
"grad_norm": 0.43086695671081543,
"learning_rate": 0.0002639895104895105,
"loss": 3.0824,
"step": 96200
},
{
"epoch": 28.02883103267517,
"grad_norm": 0.41935721039772034,
"learning_rate": 0.0002638146853146853,
"loss": 3.0929,
"step": 96250
},
{
"epoch": 28.043392160288892,
"grad_norm": 0.4585091173648834,
"learning_rate": 0.00026363986013986014,
"loss": 3.0923,
"step": 96300
},
{
"epoch": 28.057953287902617,
"grad_norm": 0.4404480457305908,
"learning_rate": 0.00026346503496503494,
"loss": 3.0984,
"step": 96350
},
{
"epoch": 28.072514415516338,
"grad_norm": 0.4495074152946472,
"learning_rate": 0.00026329020979020974,
"loss": 3.0999,
"step": 96400
},
{
"epoch": 28.08707554313006,
"grad_norm": 0.4308162331581116,
"learning_rate": 0.0002631153846153846,
"loss": 3.0954,
"step": 96450
},
{
"epoch": 28.101636670743783,
"grad_norm": 0.44255316257476807,
"learning_rate": 0.0002629405594405594,
"loss": 3.103,
"step": 96500
},
{
"epoch": 28.116197798357504,
"grad_norm": 0.44353339076042175,
"learning_rate": 0.00026276573426573424,
"loss": 3.0974,
"step": 96550
},
{
"epoch": 28.13075892597123,
"grad_norm": 0.4401057958602905,
"learning_rate": 0.00026259090909090904,
"loss": 3.1141,
"step": 96600
},
{
"epoch": 28.14532005358495,
"grad_norm": 0.45138344168663025,
"learning_rate": 0.0002624160839160839,
"loss": 3.103,
"step": 96650
},
{
"epoch": 28.15988118119867,
"grad_norm": 0.4253145158290863,
"learning_rate": 0.00026224125874125875,
"loss": 3.1303,
"step": 96700
},
{
"epoch": 28.174442308812395,
"grad_norm": 0.4274202287197113,
"learning_rate": 0.00026206643356643355,
"loss": 3.1122,
"step": 96750
},
{
"epoch": 28.189003436426116,
"grad_norm": 0.4482264816761017,
"learning_rate": 0.00026189160839160835,
"loss": 3.1273,
"step": 96800
},
{
"epoch": 28.20356456403984,
"grad_norm": 0.41307345032691956,
"learning_rate": 0.0002617167832167832,
"loss": 3.134,
"step": 96850
},
{
"epoch": 28.21812569165356,
"grad_norm": 0.45169275999069214,
"learning_rate": 0.000261541958041958,
"loss": 3.1229,
"step": 96900
},
{
"epoch": 28.232686819267283,
"grad_norm": 0.44852888584136963,
"learning_rate": 0.00026136713286713286,
"loss": 3.1208,
"step": 96950
},
{
"epoch": 28.247247946881007,
"grad_norm": 0.4608646333217621,
"learning_rate": 0.00026119230769230766,
"loss": 3.1122,
"step": 97000
},
{
"epoch": 28.247247946881007,
"eval_accuracy": 0.37415828112182004,
"eval_loss": 3.55517840385437,
"eval_runtime": 179.5739,
"eval_samples_per_second": 92.697,
"eval_steps_per_second": 5.797,
"step": 97000
},
{
"epoch": 28.261809074494728,
"grad_norm": 0.44030946493148804,
"learning_rate": 0.0002610174825174825,
"loss": 3.1255,
"step": 97050
},
{
"epoch": 28.276370202108453,
"grad_norm": 0.44799578189849854,
"learning_rate": 0.0002608426573426573,
"loss": 3.1196,
"step": 97100
},
{
"epoch": 28.290931329722174,
"grad_norm": 0.4262312054634094,
"learning_rate": 0.0002606678321678321,
"loss": 3.1272,
"step": 97150
},
{
"epoch": 28.305492457335895,
"grad_norm": 0.4538463056087494,
"learning_rate": 0.00026049300699300696,
"loss": 3.1364,
"step": 97200
},
{
"epoch": 28.32005358494962,
"grad_norm": 0.46358099579811096,
"learning_rate": 0.00026031818181818176,
"loss": 3.1404,
"step": 97250
},
{
"epoch": 28.33461471256334,
"grad_norm": 0.44027483463287354,
"learning_rate": 0.0002601433566433566,
"loss": 3.1262,
"step": 97300
},
{
"epoch": 28.349175840177065,
"grad_norm": 0.45905429124832153,
"learning_rate": 0.00025996853146853147,
"loss": 3.144,
"step": 97350
},
{
"epoch": 28.363736967790786,
"grad_norm": 0.47435009479522705,
"learning_rate": 0.00025979370629370627,
"loss": 3.1333,
"step": 97400
},
{
"epoch": 28.378298095404507,
"grad_norm": 0.4567977488040924,
"learning_rate": 0.0002596188811188811,
"loss": 3.1458,
"step": 97450
},
{
"epoch": 28.39285922301823,
"grad_norm": 0.4301726222038269,
"learning_rate": 0.0002594440559440559,
"loss": 3.1454,
"step": 97500
},
{
"epoch": 28.407420350631952,
"grad_norm": 0.4424678683280945,
"learning_rate": 0.0002592692307692307,
"loss": 3.1348,
"step": 97550
},
{
"epoch": 28.421981478245677,
"grad_norm": 0.42886584997177124,
"learning_rate": 0.0002590944055944056,
"loss": 3.1415,
"step": 97600
},
{
"epoch": 28.436542605859398,
"grad_norm": 0.43304717540740967,
"learning_rate": 0.0002589195804195804,
"loss": 3.1392,
"step": 97650
},
{
"epoch": 28.45110373347312,
"grad_norm": 0.44938918948173523,
"learning_rate": 0.00025874475524475523,
"loss": 3.1554,
"step": 97700
},
{
"epoch": 28.465664861086843,
"grad_norm": 0.43526050448417664,
"learning_rate": 0.00025856993006993003,
"loss": 3.1389,
"step": 97750
},
{
"epoch": 28.480225988700564,
"grad_norm": 0.48536691069602966,
"learning_rate": 0.0002583951048951049,
"loss": 3.1482,
"step": 97800
},
{
"epoch": 28.49478711631429,
"grad_norm": 0.4353760778903961,
"learning_rate": 0.0002582202797202797,
"loss": 3.1515,
"step": 97850
},
{
"epoch": 28.50934824392801,
"grad_norm": 0.44917377829551697,
"learning_rate": 0.0002580454545454545,
"loss": 3.1557,
"step": 97900
},
{
"epoch": 28.523909371541734,
"grad_norm": 0.4558669626712799,
"learning_rate": 0.00025787062937062934,
"loss": 3.146,
"step": 97950
},
{
"epoch": 28.538470499155455,
"grad_norm": 0.4887709319591522,
"learning_rate": 0.0002576958041958042,
"loss": 3.1469,
"step": 98000
},
{
"epoch": 28.538470499155455,
"eval_accuracy": 0.3742345792540088,
"eval_loss": 3.5468759536743164,
"eval_runtime": 179.236,
"eval_samples_per_second": 92.872,
"eval_steps_per_second": 5.808,
"step": 98000
},
{
"epoch": 28.553031626769176,
"grad_norm": 0.45244932174682617,
"learning_rate": 0.000257520979020979,
"loss": 3.1466,
"step": 98050
},
{
"epoch": 28.5675927543829,
"grad_norm": 0.45052096247673035,
"learning_rate": 0.00025734615384615385,
"loss": 3.1547,
"step": 98100
},
{
"epoch": 28.58215388199662,
"grad_norm": 0.4334174394607544,
"learning_rate": 0.00025717132867132865,
"loss": 3.1472,
"step": 98150
},
{
"epoch": 28.596715009610342,
"grad_norm": 0.460929811000824,
"learning_rate": 0.0002569965034965035,
"loss": 3.1516,
"step": 98200
},
{
"epoch": 28.611276137224067,
"grad_norm": 0.4317983388900757,
"learning_rate": 0.0002568216783216783,
"loss": 3.1695,
"step": 98250
},
{
"epoch": 28.625837264837788,
"grad_norm": 0.45057880878448486,
"learning_rate": 0.0002566468531468531,
"loss": 3.1666,
"step": 98300
},
{
"epoch": 28.640398392451512,
"grad_norm": 0.44278305768966675,
"learning_rate": 0.00025647202797202795,
"loss": 3.1755,
"step": 98350
},
{
"epoch": 28.654959520065233,
"grad_norm": 0.43399128317832947,
"learning_rate": 0.00025629720279720275,
"loss": 3.1689,
"step": 98400
},
{
"epoch": 28.669520647678958,
"grad_norm": 0.4680730104446411,
"learning_rate": 0.0002561223776223776,
"loss": 3.1478,
"step": 98450
},
{
"epoch": 28.68408177529268,
"grad_norm": 0.4823758602142334,
"learning_rate": 0.0002559475524475524,
"loss": 3.1582,
"step": 98500
},
{
"epoch": 28.6986429029064,
"grad_norm": 0.44013476371765137,
"learning_rate": 0.00025577272727272726,
"loss": 3.1636,
"step": 98550
},
{
"epoch": 28.713204030520124,
"grad_norm": 0.4259727895259857,
"learning_rate": 0.00025559790209790206,
"loss": 3.158,
"step": 98600
},
{
"epoch": 28.727765158133845,
"grad_norm": 0.44608891010284424,
"learning_rate": 0.00025542307692307686,
"loss": 3.1723,
"step": 98650
},
{
"epoch": 28.74232628574757,
"grad_norm": 0.4473435580730438,
"learning_rate": 0.00025524825174825177,
"loss": 3.1615,
"step": 98700
},
{
"epoch": 28.75688741336129,
"grad_norm": 0.43744006752967834,
"learning_rate": 0.00025507342657342657,
"loss": 3.1664,
"step": 98750
},
{
"epoch": 28.771448540975012,
"grad_norm": 0.4511035084724426,
"learning_rate": 0.00025489860139860137,
"loss": 3.1674,
"step": 98800
},
{
"epoch": 28.786009668588736,
"grad_norm": 0.454174280166626,
"learning_rate": 0.0002547237762237762,
"loss": 3.1589,
"step": 98850
},
{
"epoch": 28.800570796202457,
"grad_norm": 0.42523393034935,
"learning_rate": 0.000254548951048951,
"loss": 3.1804,
"step": 98900
},
{
"epoch": 28.815131923816182,
"grad_norm": 0.419761598110199,
"learning_rate": 0.0002543741258741259,
"loss": 3.1715,
"step": 98950
},
{
"epoch": 28.829693051429903,
"grad_norm": 0.4770107567310333,
"learning_rate": 0.0002541993006993007,
"loss": 3.1669,
"step": 99000
},
{
"epoch": 28.829693051429903,
"eval_accuracy": 0.3751707302965658,
"eval_loss": 3.538731098175049,
"eval_runtime": 179.2899,
"eval_samples_per_second": 92.844,
"eval_steps_per_second": 5.806,
"step": 99000
},
{
"epoch": 28.844254179043624,
"grad_norm": 0.4531731903553009,
"learning_rate": 0.0002540244755244755,
"loss": 3.1592,
"step": 99050
},
{
"epoch": 28.85881530665735,
"grad_norm": 0.46171900629997253,
"learning_rate": 0.00025384965034965033,
"loss": 3.1737,
"step": 99100
},
{
"epoch": 28.87337643427107,
"grad_norm": 0.4452742636203766,
"learning_rate": 0.00025367482517482513,
"loss": 3.1732,
"step": 99150
},
{
"epoch": 28.887937561884794,
"grad_norm": 0.47405216097831726,
"learning_rate": 0.0002535,
"loss": 3.1608,
"step": 99200
},
{
"epoch": 28.902498689498515,
"grad_norm": 0.45681318640708923,
"learning_rate": 0.0002533251748251748,
"loss": 3.1769,
"step": 99250
},
{
"epoch": 28.917059817112236,
"grad_norm": 0.44045165181159973,
"learning_rate": 0.00025315034965034964,
"loss": 3.1743,
"step": 99300
},
{
"epoch": 28.93162094472596,
"grad_norm": 0.42967620491981506,
"learning_rate": 0.00025297552447552444,
"loss": 3.1762,
"step": 99350
},
{
"epoch": 28.94618207233968,
"grad_norm": 0.40536534786224365,
"learning_rate": 0.0002528006993006993,
"loss": 3.1726,
"step": 99400
},
{
"epoch": 28.960743199953406,
"grad_norm": 0.43178820610046387,
"learning_rate": 0.00025262587412587414,
"loss": 3.1692,
"step": 99450
},
{
"epoch": 28.975304327567127,
"grad_norm": 0.4307922124862671,
"learning_rate": 0.00025245104895104894,
"loss": 3.1837,
"step": 99500
},
{
"epoch": 28.989865455180848,
"grad_norm": 0.43028488755226135,
"learning_rate": 0.00025227622377622374,
"loss": 3.1809,
"step": 99550
},
{
"epoch": 29.004368338284117,
"grad_norm": 0.42854586243629456,
"learning_rate": 0.0002521013986013986,
"loss": 3.1529,
"step": 99600
},
{
"epoch": 29.018929465897838,
"grad_norm": 0.4413714110851288,
"learning_rate": 0.0002519265734265734,
"loss": 3.0869,
"step": 99650
},
{
"epoch": 29.033490593511562,
"grad_norm": 0.43991145491600037,
"learning_rate": 0.00025175174825174825,
"loss": 3.0813,
"step": 99700
},
{
"epoch": 29.048051721125283,
"grad_norm": 0.46903711557388306,
"learning_rate": 0.00025157692307692305,
"loss": 3.0853,
"step": 99750
},
{
"epoch": 29.062612848739008,
"grad_norm": 0.4625730514526367,
"learning_rate": 0.0002514020979020979,
"loss": 3.0856,
"step": 99800
},
{
"epoch": 29.07717397635273,
"grad_norm": 0.4436502158641815,
"learning_rate": 0.0002512272727272727,
"loss": 3.0929,
"step": 99850
},
{
"epoch": 29.09173510396645,
"grad_norm": 0.4613840878009796,
"learning_rate": 0.0002510524475524475,
"loss": 3.1013,
"step": 99900
},
{
"epoch": 29.106296231580174,
"grad_norm": 0.43023690581321716,
"learning_rate": 0.00025087762237762236,
"loss": 3.1029,
"step": 99950
},
{
"epoch": 29.120857359193895,
"grad_norm": 0.4351300001144409,
"learning_rate": 0.00025070279720279716,
"loss": 3.0958,
"step": 100000
},
{
"epoch": 29.120857359193895,
"eval_accuracy": 0.3741589864974643,
"eval_loss": 3.556124210357666,
"eval_runtime": 179.3086,
"eval_samples_per_second": 92.834,
"eval_steps_per_second": 5.806,
"step": 100000
},
{
"epoch": 29.120857359193895,
"step": 100000,
"total_flos": 2.090205609984e+18,
"train_loss": 0.632617908782959,
"train_runtime": 39806.2336,
"train_samples_per_second": 345.046,
"train_steps_per_second": 4.313
}
],
"logging_steps": 50,
"max_steps": 171700,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.090205609984e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}