gpt2-xl-lora-multi-shared-512-top / trainer_state.json
MHGanainy's picture
MHGanainy/gpt2-xl-lora-multi-shared-512-top
dae7b79 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 87356,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011447410595723248,
"grad_norm": 0.08282724022865295,
"learning_rate": 2.8332665560070978e-08,
"loss": 2.747,
"step": 100
},
{
"epoch": 0.0022894821191446497,
"grad_norm": 0.09139522910118103,
"learning_rate": 5.6665331120141957e-08,
"loss": 2.7153,
"step": 200
},
{
"epoch": 0.003434223178716974,
"grad_norm": 0.14882275462150574,
"learning_rate": 8.528418522122374e-08,
"loss": 2.7207,
"step": 300
},
{
"epoch": 0.004578964238289299,
"grad_norm": 0.15320613980293274,
"learning_rate": 1.1390303932230555e-07,
"loss": 2.7451,
"step": 400
},
{
"epoch": 0.005723705297861623,
"grad_norm": 0.10282430797815323,
"learning_rate": 1.4223570488237652e-07,
"loss": 2.7131,
"step": 500
},
{
"epoch": 0.006868446357433948,
"grad_norm": 0.14666467905044556,
"learning_rate": 1.7085455898345834e-07,
"loss": 2.7244,
"step": 600
},
{
"epoch": 0.008013187417006273,
"grad_norm": 0.13914579153060913,
"learning_rate": 1.994734130845401e-07,
"loss": 2.7248,
"step": 700
},
{
"epoch": 0.009157928476578599,
"grad_norm": 0.13069643080234528,
"learning_rate": 2.278060786446111e-07,
"loss": 2.7237,
"step": 800
},
{
"epoch": 0.010302669536150923,
"grad_norm": 0.11404985189437866,
"learning_rate": 2.564249327456929e-07,
"loss": 2.7372,
"step": 900
},
{
"epoch": 0.011447410595723247,
"grad_norm": 0.2278125137090683,
"learning_rate": 2.8504378684677464e-07,
"loss": 2.7491,
"step": 1000
},
{
"epoch": 0.012592151655295572,
"grad_norm": 0.19849324226379395,
"learning_rate": 3.1366264094785646e-07,
"loss": 2.7059,
"step": 1100
},
{
"epoch": 0.013736892714867896,
"grad_norm": 0.21892410516738892,
"learning_rate": 3.422814950489383e-07,
"loss": 2.7291,
"step": 1200
},
{
"epoch": 0.014881633774440222,
"grad_norm": 0.09724584966897964,
"learning_rate": 3.709003491500201e-07,
"loss": 2.7131,
"step": 1300
},
{
"epoch": 0.016026374834012546,
"grad_norm": 0.13547281920909882,
"learning_rate": 3.9923301471009105e-07,
"loss": 2.6872,
"step": 1400
},
{
"epoch": 0.01717111589358487,
"grad_norm": 0.21319937705993652,
"learning_rate": 4.278518688111728e-07,
"loss": 2.7306,
"step": 1500
},
{
"epoch": 0.018315856953157197,
"grad_norm": 0.15332601964473724,
"learning_rate": 4.564707229122546e-07,
"loss": 2.6786,
"step": 1600
},
{
"epoch": 0.01946059801272952,
"grad_norm": 0.13409346342086792,
"learning_rate": 4.850895770133364e-07,
"loss": 2.7021,
"step": 1700
},
{
"epoch": 0.020605339072301845,
"grad_norm": 0.27683642506599426,
"learning_rate": 5.137084311144182e-07,
"loss": 2.7098,
"step": 1800
},
{
"epoch": 0.02175008013187417,
"grad_norm": 0.2598477900028229,
"learning_rate": 5.423272852155001e-07,
"loss": 2.6628,
"step": 1900
},
{
"epoch": 0.022894821191446493,
"grad_norm": 0.2136494219303131,
"learning_rate": 5.709461393165818e-07,
"loss": 2.6882,
"step": 2000
},
{
"epoch": 0.02403956225101882,
"grad_norm": 0.1683170348405838,
"learning_rate": 5.995649934176636e-07,
"loss": 2.6787,
"step": 2100
},
{
"epoch": 0.025184303310591145,
"grad_norm": 0.12898527085781097,
"learning_rate": 6.281838475187455e-07,
"loss": 2.6507,
"step": 2200
},
{
"epoch": 0.02632904437016347,
"grad_norm": 0.25819605588912964,
"learning_rate": 6.568027016198272e-07,
"loss": 2.6485,
"step": 2300
},
{
"epoch": 0.027473785429735793,
"grad_norm": 0.26418036222457886,
"learning_rate": 6.85421555720909e-07,
"loss": 2.665,
"step": 2400
},
{
"epoch": 0.02861852648930812,
"grad_norm": 0.24633832275867462,
"learning_rate": 7.140404098219908e-07,
"loss": 2.6122,
"step": 2500
},
{
"epoch": 0.029763267548880444,
"grad_norm": 0.11409879475831985,
"learning_rate": 7.426592639230726e-07,
"loss": 2.61,
"step": 2600
},
{
"epoch": 0.030908008608452768,
"grad_norm": 0.18866164982318878,
"learning_rate": 7.712781180241544e-07,
"loss": 2.6109,
"step": 2700
},
{
"epoch": 0.03205274966802509,
"grad_norm": 0.2056051343679428,
"learning_rate": 7.998969721252361e-07,
"loss": 2.6274,
"step": 2800
},
{
"epoch": 0.033197490727597416,
"grad_norm": 0.18269231915473938,
"learning_rate": 8.285158262263179e-07,
"loss": 2.6012,
"step": 2900
},
{
"epoch": 0.03434223178716974,
"grad_norm": 0.17360204458236694,
"learning_rate": 8.571346803273998e-07,
"loss": 2.6238,
"step": 3000
},
{
"epoch": 0.035486972846742064,
"grad_norm": 0.1858297735452652,
"learning_rate": 8.857535344284815e-07,
"loss": 2.5836,
"step": 3100
},
{
"epoch": 0.036631713906314395,
"grad_norm": 0.18415091931819916,
"learning_rate": 9.143723885295633e-07,
"loss": 2.5919,
"step": 3200
},
{
"epoch": 0.03777645496588672,
"grad_norm": 0.19720780849456787,
"learning_rate": 9.429912426306452e-07,
"loss": 2.5742,
"step": 3300
},
{
"epoch": 0.03892119602545904,
"grad_norm": 0.21820122003555298,
"learning_rate": 9.71323908190716e-07,
"loss": 2.5653,
"step": 3400
},
{
"epoch": 0.04006593708503137,
"grad_norm": 0.2850651741027832,
"learning_rate": 9.999427622917978e-07,
"loss": 2.5936,
"step": 3500
},
{
"epoch": 0.04121067814460369,
"grad_norm": 0.19742882251739502,
"learning_rate": 1.028275427851869e-06,
"loss": 2.5751,
"step": 3600
},
{
"epoch": 0.042355419204176015,
"grad_norm": 0.1881546527147293,
"learning_rate": 1.0568942819529507e-06,
"loss": 2.569,
"step": 3700
},
{
"epoch": 0.04350016026374834,
"grad_norm": 0.16125181317329407,
"learning_rate": 1.0855131360540325e-06,
"loss": 2.5522,
"step": 3800
},
{
"epoch": 0.04464490132332066,
"grad_norm": 0.17413508892059326,
"learning_rate": 1.1141319901551142e-06,
"loss": 2.5577,
"step": 3900
},
{
"epoch": 0.04578964238289299,
"grad_norm": 0.36808159947395325,
"learning_rate": 1.142750844256196e-06,
"loss": 2.541,
"step": 4000
},
{
"epoch": 0.04693438344246532,
"grad_norm": 0.2387821078300476,
"learning_rate": 1.1713696983572778e-06,
"loss": 2.5481,
"step": 4100
},
{
"epoch": 0.04807912450203764,
"grad_norm": 0.28203991055488586,
"learning_rate": 1.1999885524583595e-06,
"loss": 2.5099,
"step": 4200
},
{
"epoch": 0.049223865561609965,
"grad_norm": 0.36418649554252625,
"learning_rate": 1.2286074065594415e-06,
"loss": 2.5158,
"step": 4300
},
{
"epoch": 0.05036860662118229,
"grad_norm": 0.24819067120552063,
"learning_rate": 1.2572262606605233e-06,
"loss": 2.5189,
"step": 4400
},
{
"epoch": 0.05151334768075461,
"grad_norm": 0.24554955959320068,
"learning_rate": 1.285845114761605e-06,
"loss": 2.5275,
"step": 4500
},
{
"epoch": 0.05265808874032694,
"grad_norm": 0.3186652660369873,
"learning_rate": 1.3144639688626868e-06,
"loss": 2.5156,
"step": 4600
},
{
"epoch": 0.05380282979989926,
"grad_norm": 0.2992977797985077,
"learning_rate": 1.3430828229637685e-06,
"loss": 2.5009,
"step": 4700
},
{
"epoch": 0.054947570859471585,
"grad_norm": 0.3505355715751648,
"learning_rate": 1.3717016770648503e-06,
"loss": 2.4923,
"step": 4800
},
{
"epoch": 0.05609231191904391,
"grad_norm": 0.17197385430335999,
"learning_rate": 1.4003205311659323e-06,
"loss": 2.4825,
"step": 4900
},
{
"epoch": 0.05723705297861624,
"grad_norm": 0.21533966064453125,
"learning_rate": 1.4289393852670142e-06,
"loss": 2.484,
"step": 5000
},
{
"epoch": 0.058381794038188564,
"grad_norm": 0.17859208583831787,
"learning_rate": 1.457558239368096e-06,
"loss": 2.4734,
"step": 5100
},
{
"epoch": 0.05952653509776089,
"grad_norm": 0.36880823969841003,
"learning_rate": 1.4861770934691778e-06,
"loss": 2.4747,
"step": 5200
},
{
"epoch": 0.06067127615733321,
"grad_norm": 0.26152077317237854,
"learning_rate": 1.5147959475702595e-06,
"loss": 2.4696,
"step": 5300
},
{
"epoch": 0.061816017216905536,
"grad_norm": 0.37325313687324524,
"learning_rate": 1.5434148016713413e-06,
"loss": 2.4908,
"step": 5400
},
{
"epoch": 0.06296075827647786,
"grad_norm": 0.20671696960926056,
"learning_rate": 1.572033655772423e-06,
"loss": 2.4929,
"step": 5500
},
{
"epoch": 0.06410549933605018,
"grad_norm": 0.17978721857070923,
"learning_rate": 1.6006525098735048e-06,
"loss": 2.4557,
"step": 5600
},
{
"epoch": 0.06525024039562251,
"grad_norm": 0.17210538685321808,
"learning_rate": 1.6292713639745866e-06,
"loss": 2.4646,
"step": 5700
},
{
"epoch": 0.06639498145519483,
"grad_norm": 0.17783130705356598,
"learning_rate": 1.6578902180756683e-06,
"loss": 2.4563,
"step": 5800
},
{
"epoch": 0.06753972251476716,
"grad_norm": 0.16413679718971252,
"learning_rate": 1.68650907217675e-06,
"loss": 2.4823,
"step": 5900
},
{
"epoch": 0.06868446357433948,
"grad_norm": 0.1958412379026413,
"learning_rate": 1.715127926277832e-06,
"loss": 2.4733,
"step": 6000
},
{
"epoch": 0.0698292046339118,
"grad_norm": 0.20609253644943237,
"learning_rate": 1.7437467803789138e-06,
"loss": 2.4613,
"step": 6100
},
{
"epoch": 0.07097394569348413,
"grad_norm": 0.292453795671463,
"learning_rate": 1.7723656344799956e-06,
"loss": 2.4585,
"step": 6200
},
{
"epoch": 0.07211868675305645,
"grad_norm": 0.22299447655677795,
"learning_rate": 1.8009844885810774e-06,
"loss": 2.4572,
"step": 6300
},
{
"epoch": 0.07326342781262879,
"grad_norm": 0.18281777203083038,
"learning_rate": 1.8296033426821591e-06,
"loss": 2.4511,
"step": 6400
},
{
"epoch": 0.07440816887220111,
"grad_norm": 0.19673572480678558,
"learning_rate": 1.8582221967832409e-06,
"loss": 2.4377,
"step": 6500
},
{
"epoch": 0.07555290993177344,
"grad_norm": 0.2068740874528885,
"learning_rate": 1.8868410508843226e-06,
"loss": 2.4483,
"step": 6600
},
{
"epoch": 0.07669765099134576,
"grad_norm": 0.22345593571662903,
"learning_rate": 1.9154599049854046e-06,
"loss": 2.4708,
"step": 6700
},
{
"epoch": 0.07784239205091809,
"grad_norm": 0.37216469645500183,
"learning_rate": 1.944078759086486e-06,
"loss": 2.453,
"step": 6800
},
{
"epoch": 0.07898713311049041,
"grad_norm": 0.20201215147972107,
"learning_rate": 1.972697613187568e-06,
"loss": 2.4536,
"step": 6900
},
{
"epoch": 0.08013187417006273,
"grad_norm": 0.32299327850341797,
"learning_rate": 2.0013164672886497e-06,
"loss": 2.4464,
"step": 7000
},
{
"epoch": 0.08127661522963506,
"grad_norm": 0.2131340205669403,
"learning_rate": 2.0299353213897317e-06,
"loss": 2.4699,
"step": 7100
},
{
"epoch": 0.08242135628920738,
"grad_norm": 0.2809932231903076,
"learning_rate": 2.0585541754908132e-06,
"loss": 2.4494,
"step": 7200
},
{
"epoch": 0.0835660973487797,
"grad_norm": 0.19964805245399475,
"learning_rate": 2.0871730295918956e-06,
"loss": 2.4575,
"step": 7300
},
{
"epoch": 0.08471083840835203,
"grad_norm": 0.2731245160102844,
"learning_rate": 2.115791883692977e-06,
"loss": 2.4513,
"step": 7400
},
{
"epoch": 0.08585557946792435,
"grad_norm": 0.22060319781303406,
"learning_rate": 2.144410737794059e-06,
"loss": 2.4459,
"step": 7500
},
{
"epoch": 0.08700032052749668,
"grad_norm": 0.23072516918182373,
"learning_rate": 2.1730295918951407e-06,
"loss": 2.4469,
"step": 7600
},
{
"epoch": 0.088145061587069,
"grad_norm": 0.2869388163089752,
"learning_rate": 2.2016484459962227e-06,
"loss": 2.426,
"step": 7700
},
{
"epoch": 0.08928980264664133,
"grad_norm": 0.4073362648487091,
"learning_rate": 2.230267300097304e-06,
"loss": 2.4399,
"step": 7800
},
{
"epoch": 0.09043454370621365,
"grad_norm": 0.3338267207145691,
"learning_rate": 2.258886154198386e-06,
"loss": 2.4442,
"step": 7900
},
{
"epoch": 0.09157928476578597,
"grad_norm": 0.32599982619285583,
"learning_rate": 2.2875050082994677e-06,
"loss": 2.448,
"step": 8000
},
{
"epoch": 0.0927240258253583,
"grad_norm": 0.38292455673217773,
"learning_rate": 2.3161238624005497e-06,
"loss": 2.4501,
"step": 8100
},
{
"epoch": 0.09386876688493064,
"grad_norm": 0.2262214571237564,
"learning_rate": 2.3447427165016317e-06,
"loss": 2.4306,
"step": 8200
},
{
"epoch": 0.09501350794450296,
"grad_norm": 0.23351378738880157,
"learning_rate": 2.3733615706027132e-06,
"loss": 2.4542,
"step": 8300
},
{
"epoch": 0.09615824900407528,
"grad_norm": 0.24008594453334808,
"learning_rate": 2.401980424703795e-06,
"loss": 2.4491,
"step": 8400
},
{
"epoch": 0.09730299006364761,
"grad_norm": 0.3131836950778961,
"learning_rate": 2.4305992788048767e-06,
"loss": 2.4402,
"step": 8500
},
{
"epoch": 0.09844773112321993,
"grad_norm": 0.3088011145591736,
"learning_rate": 2.4592181329059587e-06,
"loss": 2.4236,
"step": 8600
},
{
"epoch": 0.09959247218279225,
"grad_norm": 0.31873244047164917,
"learning_rate": 2.4878369870070403e-06,
"loss": 2.4425,
"step": 8700
},
{
"epoch": 0.10073721324236458,
"grad_norm": 0.19671718776226044,
"learning_rate": 2.5164558411081227e-06,
"loss": 2.4277,
"step": 8800
},
{
"epoch": 0.1018819543019369,
"grad_norm": 0.3244757056236267,
"learning_rate": 2.5450746952092042e-06,
"loss": 2.4328,
"step": 8900
},
{
"epoch": 0.10302669536150923,
"grad_norm": 0.32051828503608704,
"learning_rate": 2.573693549310286e-06,
"loss": 2.4303,
"step": 9000
},
{
"epoch": 0.10417143642108155,
"grad_norm": 0.2557377517223358,
"learning_rate": 2.6023124034113677e-06,
"loss": 2.4426,
"step": 9100
},
{
"epoch": 0.10531617748065387,
"grad_norm": 0.29920217394828796,
"learning_rate": 2.6309312575124497e-06,
"loss": 2.4247,
"step": 9200
},
{
"epoch": 0.1064609185402262,
"grad_norm": 0.2528887987136841,
"learning_rate": 2.6595501116135313e-06,
"loss": 2.4337,
"step": 9300
},
{
"epoch": 0.10760565959979852,
"grad_norm": 0.2672332525253296,
"learning_rate": 2.6881689657146132e-06,
"loss": 2.4327,
"step": 9400
},
{
"epoch": 0.10875040065937085,
"grad_norm": 0.26545771956443787,
"learning_rate": 2.7167878198156948e-06,
"loss": 2.411,
"step": 9500
},
{
"epoch": 0.10989514171894317,
"grad_norm": 0.2251209020614624,
"learning_rate": 2.7454066739167768e-06,
"loss": 2.4346,
"step": 9600
},
{
"epoch": 0.1110398827785155,
"grad_norm": 0.25982141494750977,
"learning_rate": 2.7740255280178583e-06,
"loss": 2.4155,
"step": 9700
},
{
"epoch": 0.11218462383808782,
"grad_norm": 0.2948269844055176,
"learning_rate": 2.8026443821189403e-06,
"loss": 2.4056,
"step": 9800
},
{
"epoch": 0.11332936489766014,
"grad_norm": 0.2271622121334076,
"learning_rate": 2.8312632362200223e-06,
"loss": 2.4266,
"step": 9900
},
{
"epoch": 0.11447410595723248,
"grad_norm": 0.25277993083000183,
"learning_rate": 2.859882090321104e-06,
"loss": 2.4267,
"step": 10000
},
{
"epoch": 0.1156188470168048,
"grad_norm": 0.2651556730270386,
"learning_rate": 2.8885009444221858e-06,
"loss": 2.4423,
"step": 10100
},
{
"epoch": 0.11676358807637713,
"grad_norm": 0.22295983135700226,
"learning_rate": 2.9171197985232673e-06,
"loss": 2.4227,
"step": 10200
},
{
"epoch": 0.11790832913594945,
"grad_norm": 0.2351614385843277,
"learning_rate": 2.9457386526243493e-06,
"loss": 2.4237,
"step": 10300
},
{
"epoch": 0.11905307019552178,
"grad_norm": 0.327232301235199,
"learning_rate": 2.974357506725431e-06,
"loss": 2.4195,
"step": 10400
},
{
"epoch": 0.1201978112550941,
"grad_norm": 0.234052836894989,
"learning_rate": 3.002976360826513e-06,
"loss": 2.4265,
"step": 10500
},
{
"epoch": 0.12134255231466642,
"grad_norm": 0.29197776317596436,
"learning_rate": 3.0315952149275944e-06,
"loss": 2.4175,
"step": 10600
},
{
"epoch": 0.12248729337423875,
"grad_norm": 0.3510327935218811,
"learning_rate": 3.0602140690286763e-06,
"loss": 2.4018,
"step": 10700
},
{
"epoch": 0.12363203443381107,
"grad_norm": 0.24532395601272583,
"learning_rate": 3.0888329231297583e-06,
"loss": 2.4081,
"step": 10800
},
{
"epoch": 0.1247767754933834,
"grad_norm": 0.29377228021621704,
"learning_rate": 3.11745177723084e-06,
"loss": 2.4152,
"step": 10900
},
{
"epoch": 0.12592151655295572,
"grad_norm": 0.3314598798751831,
"learning_rate": 3.146070631331922e-06,
"loss": 2.4224,
"step": 11000
},
{
"epoch": 0.12706625761252804,
"grad_norm": 0.31806275248527527,
"learning_rate": 3.1746894854330034e-06,
"loss": 2.4087,
"step": 11100
},
{
"epoch": 0.12821099867210037,
"grad_norm": 0.29323023557662964,
"learning_rate": 3.2033083395340854e-06,
"loss": 2.3941,
"step": 11200
},
{
"epoch": 0.1293557397316727,
"grad_norm": 0.230011448264122,
"learning_rate": 3.231927193635167e-06,
"loss": 2.4154,
"step": 11300
},
{
"epoch": 0.13050048079124502,
"grad_norm": 0.36185070872306824,
"learning_rate": 3.260546047736249e-06,
"loss": 2.4163,
"step": 11400
},
{
"epoch": 0.13164522185081734,
"grad_norm": 0.35968175530433655,
"learning_rate": 3.2891649018373304e-06,
"loss": 2.4148,
"step": 11500
},
{
"epoch": 0.13278996291038966,
"grad_norm": 0.3145340383052826,
"learning_rate": 3.3177837559384124e-06,
"loss": 2.4102,
"step": 11600
},
{
"epoch": 0.133934703969962,
"grad_norm": 0.26297980546951294,
"learning_rate": 3.3464026100394944e-06,
"loss": 2.4047,
"step": 11700
},
{
"epoch": 0.1350794450295343,
"grad_norm": 0.24281686544418335,
"learning_rate": 3.375021464140576e-06,
"loss": 2.3916,
"step": 11800
},
{
"epoch": 0.13622418608910664,
"grad_norm": 0.2922670245170593,
"learning_rate": 3.403640318241658e-06,
"loss": 2.4226,
"step": 11900
},
{
"epoch": 0.13736892714867896,
"grad_norm": 0.28737780451774597,
"learning_rate": 3.4322591723427395e-06,
"loss": 2.4099,
"step": 12000
},
{
"epoch": 0.13851366820825128,
"grad_norm": 0.7136600613594055,
"learning_rate": 3.4608780264438214e-06,
"loss": 2.4025,
"step": 12100
},
{
"epoch": 0.1396584092678236,
"grad_norm": 0.2455575317144394,
"learning_rate": 3.489496880544903e-06,
"loss": 2.403,
"step": 12200
},
{
"epoch": 0.14080315032739593,
"grad_norm": 0.24030736088752747,
"learning_rate": 3.518115734645985e-06,
"loss": 2.3824,
"step": 12300
},
{
"epoch": 0.14194789138696826,
"grad_norm": 0.28610554337501526,
"learning_rate": 3.5467345887470665e-06,
"loss": 2.414,
"step": 12400
},
{
"epoch": 0.14309263244654058,
"grad_norm": 0.28286024928092957,
"learning_rate": 3.5753534428481485e-06,
"loss": 2.403,
"step": 12500
},
{
"epoch": 0.1442373735061129,
"grad_norm": 0.27423399686813354,
"learning_rate": 3.60397229694923e-06,
"loss": 2.3912,
"step": 12600
},
{
"epoch": 0.14538211456568526,
"grad_norm": 0.3900642991065979,
"learning_rate": 3.632591151050312e-06,
"loss": 2.391,
"step": 12700
},
{
"epoch": 0.14652685562525758,
"grad_norm": 0.3418841063976288,
"learning_rate": 3.661210005151394e-06,
"loss": 2.3931,
"step": 12800
},
{
"epoch": 0.1476715966848299,
"grad_norm": 0.22310563921928406,
"learning_rate": 3.6898288592524755e-06,
"loss": 2.4003,
"step": 12900
},
{
"epoch": 0.14881633774440223,
"grad_norm": 0.3633168935775757,
"learning_rate": 3.7184477133535575e-06,
"loss": 2.388,
"step": 13000
},
{
"epoch": 0.14996107880397455,
"grad_norm": 0.24249403178691864,
"learning_rate": 3.747066567454639e-06,
"loss": 2.3798,
"step": 13100
},
{
"epoch": 0.15110581986354688,
"grad_norm": 0.3075302243232727,
"learning_rate": 3.7756854215557214e-06,
"loss": 2.379,
"step": 13200
},
{
"epoch": 0.1522505609231192,
"grad_norm": 0.2638431787490845,
"learning_rate": 3.8043042756568034e-06,
"loss": 2.3826,
"step": 13300
},
{
"epoch": 0.15339530198269152,
"grad_norm": 0.26029646396636963,
"learning_rate": 3.832923129757885e-06,
"loss": 2.3876,
"step": 13400
},
{
"epoch": 0.15454004304226385,
"grad_norm": 0.297446072101593,
"learning_rate": 3.8615419838589665e-06,
"loss": 2.3791,
"step": 13500
},
{
"epoch": 0.15568478410183617,
"grad_norm": 0.34628915786743164,
"learning_rate": 3.890160837960049e-06,
"loss": 2.4032,
"step": 13600
},
{
"epoch": 0.1568295251614085,
"grad_norm": 0.2669197916984558,
"learning_rate": 3.9187796920611305e-06,
"loss": 2.4,
"step": 13700
},
{
"epoch": 0.15797426622098082,
"grad_norm": 0.3261224627494812,
"learning_rate": 3.947398546162212e-06,
"loss": 2.3816,
"step": 13800
},
{
"epoch": 0.15911900728055314,
"grad_norm": 0.3039107024669647,
"learning_rate": 3.9760174002632936e-06,
"loss": 2.3783,
"step": 13900
},
{
"epoch": 0.16026374834012547,
"grad_norm": 0.26345106959342957,
"learning_rate": 4.004636254364376e-06,
"loss": 2.382,
"step": 14000
},
{
"epoch": 0.1614084893996978,
"grad_norm": 0.3465179204940796,
"learning_rate": 4.0332551084654575e-06,
"loss": 2.3873,
"step": 14100
},
{
"epoch": 0.16255323045927011,
"grad_norm": 0.2453349232673645,
"learning_rate": 4.061873962566539e-06,
"loss": 2.3881,
"step": 14200
},
{
"epoch": 0.16369797151884244,
"grad_norm": 0.24121074378490448,
"learning_rate": 4.0904928166676215e-06,
"loss": 2.3895,
"step": 14300
},
{
"epoch": 0.16484271257841476,
"grad_norm": 0.3282526135444641,
"learning_rate": 4.119111670768703e-06,
"loss": 2.391,
"step": 14400
},
{
"epoch": 0.1659874536379871,
"grad_norm": 0.32719773054122925,
"learning_rate": 4.1477305248697846e-06,
"loss": 2.3729,
"step": 14500
},
{
"epoch": 0.1671321946975594,
"grad_norm": 0.2743726074695587,
"learning_rate": 4.176349378970866e-06,
"loss": 2.3967,
"step": 14600
},
{
"epoch": 0.16827693575713173,
"grad_norm": 0.2472705990076065,
"learning_rate": 4.2049682330719485e-06,
"loss": 2.3871,
"step": 14700
},
{
"epoch": 0.16942167681670406,
"grad_norm": 0.23346185684204102,
"learning_rate": 4.23358708717303e-06,
"loss": 2.3762,
"step": 14800
},
{
"epoch": 0.17056641787627638,
"grad_norm": 0.261417031288147,
"learning_rate": 4.262205941274112e-06,
"loss": 2.3947,
"step": 14900
},
{
"epoch": 0.1717111589358487,
"grad_norm": 0.3324854373931885,
"learning_rate": 4.290824795375194e-06,
"loss": 2.381,
"step": 15000
},
{
"epoch": 0.17285589999542103,
"grad_norm": 0.3383265733718872,
"learning_rate": 4.3194436494762755e-06,
"loss": 2.3716,
"step": 15100
},
{
"epoch": 0.17400064105499335,
"grad_norm": 0.34629401564598083,
"learning_rate": 4.348062503577357e-06,
"loss": 2.369,
"step": 15200
},
{
"epoch": 0.17514538211456568,
"grad_norm": 0.24686865508556366,
"learning_rate": 4.376681357678439e-06,
"loss": 2.3883,
"step": 15300
},
{
"epoch": 0.176290123174138,
"grad_norm": 0.38007158041000366,
"learning_rate": 4.405300211779521e-06,
"loss": 2.3869,
"step": 15400
},
{
"epoch": 0.17743486423371033,
"grad_norm": 0.313494473695755,
"learning_rate": 4.433919065880603e-06,
"loss": 2.3859,
"step": 15500
},
{
"epoch": 0.17857960529328265,
"grad_norm": 0.2624611556529999,
"learning_rate": 4.462537919981684e-06,
"loss": 2.3865,
"step": 15600
},
{
"epoch": 0.17972434635285497,
"grad_norm": 0.2802521884441376,
"learning_rate": 4.491156774082766e-06,
"loss": 2.3939,
"step": 15700
},
{
"epoch": 0.1808690874124273,
"grad_norm": 0.3011086881160736,
"learning_rate": 4.519775628183848e-06,
"loss": 2.385,
"step": 15800
},
{
"epoch": 0.18201382847199962,
"grad_norm": 0.28954678773880005,
"learning_rate": 4.54839448228493e-06,
"loss": 2.3897,
"step": 15900
},
{
"epoch": 0.18315856953157195,
"grad_norm": 0.2933329939842224,
"learning_rate": 4.577013336386011e-06,
"loss": 2.3761,
"step": 16000
},
{
"epoch": 0.18430331059114427,
"grad_norm": 0.2496791034936905,
"learning_rate": 4.605632190487094e-06,
"loss": 2.3801,
"step": 16100
},
{
"epoch": 0.1854480516507166,
"grad_norm": 0.28926581144332886,
"learning_rate": 4.634251044588175e-06,
"loss": 2.3754,
"step": 16200
},
{
"epoch": 0.18659279271028895,
"grad_norm": 0.3181098997592926,
"learning_rate": 4.662869898689257e-06,
"loss": 2.3625,
"step": 16300
},
{
"epoch": 0.18773753376986127,
"grad_norm": 0.3494364321231842,
"learning_rate": 4.691488752790338e-06,
"loss": 2.3688,
"step": 16400
},
{
"epoch": 0.1888822748294336,
"grad_norm": 0.4044504463672638,
"learning_rate": 4.720107606891421e-06,
"loss": 2.379,
"step": 16500
},
{
"epoch": 0.19002701588900592,
"grad_norm": 0.34538954496383667,
"learning_rate": 4.748726460992502e-06,
"loss": 2.3746,
"step": 16600
},
{
"epoch": 0.19117175694857824,
"grad_norm": 0.2930959165096283,
"learning_rate": 4.777345315093584e-06,
"loss": 2.3628,
"step": 16700
},
{
"epoch": 0.19231649800815057,
"grad_norm": 0.28704413771629333,
"learning_rate": 4.805964169194665e-06,
"loss": 2.3803,
"step": 16800
},
{
"epoch": 0.1934612390677229,
"grad_norm": 0.34928804636001587,
"learning_rate": 4.834583023295748e-06,
"loss": 2.3745,
"step": 16900
},
{
"epoch": 0.19460598012729521,
"grad_norm": 0.4369732439517975,
"learning_rate": 4.863201877396829e-06,
"loss": 2.3732,
"step": 17000
},
{
"epoch": 0.19575072118686754,
"grad_norm": 0.31494560837745667,
"learning_rate": 4.891820731497911e-06,
"loss": 2.3747,
"step": 17100
},
{
"epoch": 0.19689546224643986,
"grad_norm": 0.6197758913040161,
"learning_rate": 4.920439585598993e-06,
"loss": 2.3824,
"step": 17200
},
{
"epoch": 0.19804020330601219,
"grad_norm": 0.4105755388736725,
"learning_rate": 4.949058439700075e-06,
"loss": 2.3705,
"step": 17300
},
{
"epoch": 0.1991849443655845,
"grad_norm": 0.49081698060035706,
"learning_rate": 4.977677293801156e-06,
"loss": 2.3591,
"step": 17400
},
{
"epoch": 0.20032968542515683,
"grad_norm": 0.2819342017173767,
"learning_rate": 5.006296147902239e-06,
"loss": 2.3844,
"step": 17500
},
{
"epoch": 0.20147442648472916,
"grad_norm": 0.30305904150009155,
"learning_rate": 5.03491500200332e-06,
"loss": 2.3749,
"step": 17600
},
{
"epoch": 0.20261916754430148,
"grad_norm": 0.36479583382606506,
"learning_rate": 5.063533856104403e-06,
"loss": 2.3765,
"step": 17700
},
{
"epoch": 0.2037639086038738,
"grad_norm": 0.3018398880958557,
"learning_rate": 5.092152710205483e-06,
"loss": 2.3673,
"step": 17800
},
{
"epoch": 0.20490864966344613,
"grad_norm": 0.32054489850997925,
"learning_rate": 5.120771564306566e-06,
"loss": 2.3658,
"step": 17900
},
{
"epoch": 0.20605339072301845,
"grad_norm": 0.3198222219944,
"learning_rate": 5.149390418407647e-06,
"loss": 2.3693,
"step": 18000
},
{
"epoch": 0.20719813178259078,
"grad_norm": 0.405718594789505,
"learning_rate": 5.17800927250873e-06,
"loss": 2.3586,
"step": 18100
},
{
"epoch": 0.2083428728421631,
"grad_norm": 0.2990506887435913,
"learning_rate": 5.20662812660981e-06,
"loss": 2.3493,
"step": 18200
},
{
"epoch": 0.20948761390173543,
"grad_norm": 0.3617069721221924,
"learning_rate": 5.235246980710893e-06,
"loss": 2.3556,
"step": 18300
},
{
"epoch": 0.21063235496130775,
"grad_norm": 0.3361080586910248,
"learning_rate": 5.263865834811974e-06,
"loss": 2.3538,
"step": 18400
},
{
"epoch": 0.21177709602088007,
"grad_norm": 0.37158912420272827,
"learning_rate": 5.292484688913057e-06,
"loss": 2.3538,
"step": 18500
},
{
"epoch": 0.2129218370804524,
"grad_norm": 0.3937898576259613,
"learning_rate": 5.321103543014137e-06,
"loss": 2.368,
"step": 18600
},
{
"epoch": 0.21406657814002472,
"grad_norm": 0.42322373390197754,
"learning_rate": 5.34972239711522e-06,
"loss": 2.3646,
"step": 18700
},
{
"epoch": 0.21521131919959705,
"grad_norm": 0.411285400390625,
"learning_rate": 5.378341251216301e-06,
"loss": 2.3604,
"step": 18800
},
{
"epoch": 0.21635606025916937,
"grad_norm": 0.30224665999412537,
"learning_rate": 5.406960105317384e-06,
"loss": 2.3579,
"step": 18900
},
{
"epoch": 0.2175008013187417,
"grad_norm": 0.339236319065094,
"learning_rate": 5.435578959418465e-06,
"loss": 2.3741,
"step": 19000
},
{
"epoch": 0.21864554237831402,
"grad_norm": 0.37957480549812317,
"learning_rate": 5.464197813519547e-06,
"loss": 2.3549,
"step": 19100
},
{
"epoch": 0.21979028343788634,
"grad_norm": 0.2912836968898773,
"learning_rate": 5.492816667620628e-06,
"loss": 2.352,
"step": 19200
},
{
"epoch": 0.22093502449745867,
"grad_norm": 0.2871094346046448,
"learning_rate": 5.521435521721711e-06,
"loss": 2.3602,
"step": 19300
},
{
"epoch": 0.222079765557031,
"grad_norm": 0.33945441246032715,
"learning_rate": 5.550054375822792e-06,
"loss": 2.3715,
"step": 19400
},
{
"epoch": 0.2232245066166033,
"grad_norm": 0.36321476101875305,
"learning_rate": 5.578673229923875e-06,
"loss": 2.3601,
"step": 19500
},
{
"epoch": 0.22436924767617564,
"grad_norm": 0.27080637216567993,
"learning_rate": 5.6072920840249555e-06,
"loss": 2.3533,
"step": 19600
},
{
"epoch": 0.22551398873574796,
"grad_norm": 0.2903904318809509,
"learning_rate": 5.635910938126038e-06,
"loss": 2.3658,
"step": 19700
},
{
"epoch": 0.22665872979532029,
"grad_norm": 0.3127708435058594,
"learning_rate": 5.664529792227119e-06,
"loss": 2.3484,
"step": 19800
},
{
"epoch": 0.22780347085489264,
"grad_norm": 0.37779003381729126,
"learning_rate": 5.693148646328202e-06,
"loss": 2.3635,
"step": 19900
},
{
"epoch": 0.22894821191446496,
"grad_norm": 0.28905001282691956,
"learning_rate": 5.7217675004292825e-06,
"loss": 2.3645,
"step": 20000
},
{
"epoch": 0.23009295297403728,
"grad_norm": 0.3213961720466614,
"learning_rate": 5.750386354530365e-06,
"loss": 2.3584,
"step": 20100
},
{
"epoch": 0.2312376940336096,
"grad_norm": 0.28322041034698486,
"learning_rate": 5.7790052086314464e-06,
"loss": 2.3616,
"step": 20200
},
{
"epoch": 0.23238243509318193,
"grad_norm": 0.3427826762199402,
"learning_rate": 5.807624062732529e-06,
"loss": 2.366,
"step": 20300
},
{
"epoch": 0.23352717615275426,
"grad_norm": 0.35720208287239075,
"learning_rate": 5.836242916833611e-06,
"loss": 2.3604,
"step": 20400
},
{
"epoch": 0.23467191721232658,
"grad_norm": 0.3247853219509125,
"learning_rate": 5.864861770934692e-06,
"loss": 2.3528,
"step": 20500
},
{
"epoch": 0.2358166582718989,
"grad_norm": 0.31638818979263306,
"learning_rate": 5.893480625035774e-06,
"loss": 2.3659,
"step": 20600
},
{
"epoch": 0.23696139933147123,
"grad_norm": 0.3150993287563324,
"learning_rate": 5.922099479136856e-06,
"loss": 2.3542,
"step": 20700
},
{
"epoch": 0.23810614039104355,
"grad_norm": 0.28134357929229736,
"learning_rate": 5.950718333237938e-06,
"loss": 2.3645,
"step": 20800
},
{
"epoch": 0.23925088145061588,
"grad_norm": 0.344279408454895,
"learning_rate": 5.979337187339019e-06,
"loss": 2.3553,
"step": 20900
},
{
"epoch": 0.2403956225101882,
"grad_norm": 0.3156017065048218,
"learning_rate": 6.007956041440101e-06,
"loss": 2.3399,
"step": 21000
},
{
"epoch": 0.24154036356976052,
"grad_norm": 0.31625500321388245,
"learning_rate": 6.036574895541183e-06,
"loss": 2.3458,
"step": 21100
},
{
"epoch": 0.24268510462933285,
"grad_norm": 0.3604189455509186,
"learning_rate": 6.065193749642265e-06,
"loss": 2.3426,
"step": 21200
},
{
"epoch": 0.24382984568890517,
"grad_norm": 0.3052213490009308,
"learning_rate": 6.093526415202336e-06,
"loss": 2.3498,
"step": 21300
},
{
"epoch": 0.2449745867484775,
"grad_norm": 0.40419507026672363,
"learning_rate": 6.122145269303418e-06,
"loss": 2.3524,
"step": 21400
},
{
"epoch": 0.24611932780804982,
"grad_norm": 0.3688088357448578,
"learning_rate": 6.150764123404499e-06,
"loss": 2.3434,
"step": 21500
},
{
"epoch": 0.24726406886762214,
"grad_norm": 0.3363403379917145,
"learning_rate": 6.179382977505581e-06,
"loss": 2.3524,
"step": 21600
},
{
"epoch": 0.24840880992719447,
"grad_norm": 0.516409695148468,
"learning_rate": 6.208001831606663e-06,
"loss": 2.3424,
"step": 21700
},
{
"epoch": 0.2495535509867668,
"grad_norm": 0.3187144100666046,
"learning_rate": 6.236620685707745e-06,
"loss": 2.3427,
"step": 21800
},
{
"epoch": 0.25069829204633914,
"grad_norm": 0.3366081416606903,
"learning_rate": 6.265239539808826e-06,
"loss": 2.3461,
"step": 21900
},
{
"epoch": 0.25184303310591144,
"grad_norm": 0.3357242941856384,
"learning_rate": 6.293858393909908e-06,
"loss": 2.3423,
"step": 22000
},
{
"epoch": 0.2529877741654838,
"grad_norm": 0.3461867570877075,
"learning_rate": 6.32247724801099e-06,
"loss": 2.3448,
"step": 22100
},
{
"epoch": 0.2541325152250561,
"grad_norm": 0.3421408236026764,
"learning_rate": 6.351096102112072e-06,
"loss": 2.3513,
"step": 22200
},
{
"epoch": 0.25527725628462844,
"grad_norm": 0.3435458242893219,
"learning_rate": 6.379714956213153e-06,
"loss": 2.359,
"step": 22300
},
{
"epoch": 0.25642199734420074,
"grad_norm": 0.31256601214408875,
"learning_rate": 6.408333810314235e-06,
"loss": 2.3417,
"step": 22400
},
{
"epoch": 0.2575667384037731,
"grad_norm": 0.4700869023799896,
"learning_rate": 6.436952664415317e-06,
"loss": 2.3609,
"step": 22500
},
{
"epoch": 0.2587114794633454,
"grad_norm": 0.3140374422073364,
"learning_rate": 6.465571518516399e-06,
"loss": 2.3749,
"step": 22600
},
{
"epoch": 0.25985622052291774,
"grad_norm": 0.35399436950683594,
"learning_rate": 6.49419037261748e-06,
"loss": 2.35,
"step": 22700
},
{
"epoch": 0.26100096158249003,
"grad_norm": 0.3978697657585144,
"learning_rate": 6.5228092267185625e-06,
"loss": 2.3486,
"step": 22800
},
{
"epoch": 0.2621457026420624,
"grad_norm": 0.4412658214569092,
"learning_rate": 6.551428080819644e-06,
"loss": 2.3555,
"step": 22900
},
{
"epoch": 0.2632904437016347,
"grad_norm": 0.3189774453639984,
"learning_rate": 6.580046934920726e-06,
"loss": 2.3482,
"step": 23000
},
{
"epoch": 0.26443518476120703,
"grad_norm": 0.40347644686698914,
"learning_rate": 6.608665789021808e-06,
"loss": 2.3528,
"step": 23100
},
{
"epoch": 0.26557992582077933,
"grad_norm": 0.3658533990383148,
"learning_rate": 6.6372846431228895e-06,
"loss": 2.3328,
"step": 23200
},
{
"epoch": 0.2667246668803517,
"grad_norm": 0.31631380319595337,
"learning_rate": 6.665903497223972e-06,
"loss": 2.3554,
"step": 23300
},
{
"epoch": 0.267869407939924,
"grad_norm": 0.3406871259212494,
"learning_rate": 6.6945223513250534e-06,
"loss": 2.358,
"step": 23400
},
{
"epoch": 0.2690141489994963,
"grad_norm": 0.34881141781806946,
"learning_rate": 6.723141205426136e-06,
"loss": 2.343,
"step": 23500
},
{
"epoch": 0.2701588900590686,
"grad_norm": 0.5699728727340698,
"learning_rate": 6.751760059527217e-06,
"loss": 2.353,
"step": 23600
},
{
"epoch": 0.271303631118641,
"grad_norm": 0.36337733268737793,
"learning_rate": 6.780378913628299e-06,
"loss": 2.3463,
"step": 23700
},
{
"epoch": 0.27244837217821327,
"grad_norm": 0.34283825755119324,
"learning_rate": 6.8089977677293805e-06,
"loss": 2.3343,
"step": 23800
},
{
"epoch": 0.2735931132377856,
"grad_norm": 0.3663531243801117,
"learning_rate": 6.837616621830463e-06,
"loss": 2.3477,
"step": 23900
},
{
"epoch": 0.2747378542973579,
"grad_norm": 0.4608674943447113,
"learning_rate": 6.8662354759315444e-06,
"loss": 2.3396,
"step": 24000
},
{
"epoch": 0.27588259535693027,
"grad_norm": 0.32811376452445984,
"learning_rate": 6.894854330032627e-06,
"loss": 2.3409,
"step": 24100
},
{
"epoch": 0.27702733641650257,
"grad_norm": 0.3832899034023285,
"learning_rate": 6.9231869955926965e-06,
"loss": 2.3573,
"step": 24200
},
{
"epoch": 0.2781720774760749,
"grad_norm": 0.3628441095352173,
"learning_rate": 6.951805849693779e-06,
"loss": 2.3515,
"step": 24300
},
{
"epoch": 0.2793168185356472,
"grad_norm": 0.35530924797058105,
"learning_rate": 6.98042470379486e-06,
"loss": 2.3608,
"step": 24400
},
{
"epoch": 0.28046155959521957,
"grad_norm": 0.33606886863708496,
"learning_rate": 7.009043557895943e-06,
"loss": 2.3445,
"step": 24500
},
{
"epoch": 0.28160630065479186,
"grad_norm": 0.3475317358970642,
"learning_rate": 7.0376624119970235e-06,
"loss": 2.3527,
"step": 24600
},
{
"epoch": 0.2827510417143642,
"grad_norm": 0.3616209030151367,
"learning_rate": 7.066281266098106e-06,
"loss": 2.3456,
"step": 24700
},
{
"epoch": 0.2838957827739365,
"grad_norm": 0.4273635745048523,
"learning_rate": 7.0949001201991875e-06,
"loss": 2.3356,
"step": 24800
},
{
"epoch": 0.28504052383350886,
"grad_norm": 0.32688596844673157,
"learning_rate": 7.12351897430027e-06,
"loss": 2.3435,
"step": 24900
},
{
"epoch": 0.28618526489308116,
"grad_norm": 0.3584359288215637,
"learning_rate": 7.1521378284013506e-06,
"loss": 2.3431,
"step": 25000
},
{
"epoch": 0.2873300059526535,
"grad_norm": 0.35065096616744995,
"learning_rate": 7.180756682502433e-06,
"loss": 2.3316,
"step": 25100
},
{
"epoch": 0.2884747470122258,
"grad_norm": 0.3658662736415863,
"learning_rate": 7.2093755366035145e-06,
"loss": 2.342,
"step": 25200
},
{
"epoch": 0.28961948807179816,
"grad_norm": 0.3615448772907257,
"learning_rate": 7.237994390704597e-06,
"loss": 2.3413,
"step": 25300
},
{
"epoch": 0.2907642291313705,
"grad_norm": 0.37493348121643066,
"learning_rate": 7.2666132448056785e-06,
"loss": 2.3261,
"step": 25400
},
{
"epoch": 0.2919089701909428,
"grad_norm": 0.43712013959884644,
"learning_rate": 7.29523209890676e-06,
"loss": 2.3367,
"step": 25500
},
{
"epoch": 0.29305371125051516,
"grad_norm": 0.37490740418434143,
"learning_rate": 7.3238509530078416e-06,
"loss": 2.323,
"step": 25600
},
{
"epoch": 0.29419845231008745,
"grad_norm": 0.33274149894714355,
"learning_rate": 7.352469807108924e-06,
"loss": 2.3333,
"step": 25700
},
{
"epoch": 0.2953431933696598,
"grad_norm": 0.41356754302978516,
"learning_rate": 7.3810886612100055e-06,
"loss": 2.3253,
"step": 25800
},
{
"epoch": 0.2964879344292321,
"grad_norm": 0.3717619478702545,
"learning_rate": 7.409707515311088e-06,
"loss": 2.3483,
"step": 25900
},
{
"epoch": 0.29763267548880445,
"grad_norm": 0.3571007251739502,
"learning_rate": 7.438326369412169e-06,
"loss": 2.3429,
"step": 26000
},
{
"epoch": 0.29877741654837675,
"grad_norm": 0.33724385499954224,
"learning_rate": 7.466945223513251e-06,
"loss": 2.3387,
"step": 26100
},
{
"epoch": 0.2999221576079491,
"grad_norm": 0.4527655243873596,
"learning_rate": 7.495564077614333e-06,
"loss": 2.3387,
"step": 26200
},
{
"epoch": 0.3010668986675214,
"grad_norm": 0.5135347247123718,
"learning_rate": 7.524182931715415e-06,
"loss": 2.3476,
"step": 26300
},
{
"epoch": 0.30221163972709375,
"grad_norm": 0.36313098669052124,
"learning_rate": 7.552801785816497e-06,
"loss": 2.3419,
"step": 26400
},
{
"epoch": 0.30335638078666605,
"grad_norm": 0.34982365369796753,
"learning_rate": 7.581420639917578e-06,
"loss": 2.3333,
"step": 26500
},
{
"epoch": 0.3045011218462384,
"grad_norm": 0.5387922525405884,
"learning_rate": 7.6100394940186604e-06,
"loss": 2.3267,
"step": 26600
},
{
"epoch": 0.3056458629058107,
"grad_norm": 0.3917737305164337,
"learning_rate": 7.638658348119742e-06,
"loss": 2.3441,
"step": 26700
},
{
"epoch": 0.30679060396538305,
"grad_norm": 0.362425297498703,
"learning_rate": 7.667277202220824e-06,
"loss": 2.3279,
"step": 26800
},
{
"epoch": 0.30793534502495534,
"grad_norm": 0.3922022581100464,
"learning_rate": 7.695896056321905e-06,
"loss": 2.328,
"step": 26900
},
{
"epoch": 0.3090800860845277,
"grad_norm": 0.3867265582084656,
"learning_rate": 7.724514910422987e-06,
"loss": 2.3343,
"step": 27000
},
{
"epoch": 0.3102248271441,
"grad_norm": 0.3767886757850647,
"learning_rate": 7.753133764524068e-06,
"loss": 2.3209,
"step": 27100
},
{
"epoch": 0.31136956820367234,
"grad_norm": 0.3708842694759369,
"learning_rate": 7.78175261862515e-06,
"loss": 2.3406,
"step": 27200
},
{
"epoch": 0.31251430926324464,
"grad_norm": 0.387917697429657,
"learning_rate": 7.810085284185221e-06,
"loss": 2.3314,
"step": 27300
},
{
"epoch": 0.313659050322817,
"grad_norm": 0.38667747378349304,
"learning_rate": 7.838704138286303e-06,
"loss": 2.3534,
"step": 27400
},
{
"epoch": 0.3148037913823893,
"grad_norm": 0.3695323169231415,
"learning_rate": 7.867322992387386e-06,
"loss": 2.3217,
"step": 27500
},
{
"epoch": 0.31594853244196164,
"grad_norm": 0.4034270644187927,
"learning_rate": 7.895941846488467e-06,
"loss": 2.3202,
"step": 27600
},
{
"epoch": 0.31709327350153393,
"grad_norm": 0.3806785047054291,
"learning_rate": 7.924560700589549e-06,
"loss": 2.3181,
"step": 27700
},
{
"epoch": 0.3182380145611063,
"grad_norm": 0.4768499433994293,
"learning_rate": 7.953179554690631e-06,
"loss": 2.3294,
"step": 27800
},
{
"epoch": 0.3193827556206786,
"grad_norm": 0.35020050406455994,
"learning_rate": 7.981798408791712e-06,
"loss": 2.348,
"step": 27900
},
{
"epoch": 0.32052749668025093,
"grad_norm": 0.37575003504753113,
"learning_rate": 8.010417262892794e-06,
"loss": 2.3327,
"step": 28000
},
{
"epoch": 0.32167223773982323,
"grad_norm": 0.38812920451164246,
"learning_rate": 8.039036116993875e-06,
"loss": 2.3488,
"step": 28100
},
{
"epoch": 0.3228169787993956,
"grad_norm": 0.4457553029060364,
"learning_rate": 8.067654971094958e-06,
"loss": 2.3239,
"step": 28200
},
{
"epoch": 0.3239617198589679,
"grad_norm": 0.4105226695537567,
"learning_rate": 8.09627382519604e-06,
"loss": 2.337,
"step": 28300
},
{
"epoch": 0.32510646091854023,
"grad_norm": 0.3988581597805023,
"learning_rate": 8.124892679297122e-06,
"loss": 2.3405,
"step": 28400
},
{
"epoch": 0.3262512019781125,
"grad_norm": 0.5257512927055359,
"learning_rate": 8.153511533398203e-06,
"loss": 2.3093,
"step": 28500
},
{
"epoch": 0.3273959430376849,
"grad_norm": 0.4554663896560669,
"learning_rate": 8.182130387499285e-06,
"loss": 2.3376,
"step": 28600
},
{
"epoch": 0.3285406840972572,
"grad_norm": 0.5744247436523438,
"learning_rate": 8.210749241600366e-06,
"loss": 2.3282,
"step": 28700
},
{
"epoch": 0.3296854251568295,
"grad_norm": 0.4724181592464447,
"learning_rate": 8.239368095701449e-06,
"loss": 2.3158,
"step": 28800
},
{
"epoch": 0.3308301662164019,
"grad_norm": 0.37950775027275085,
"learning_rate": 8.267986949802531e-06,
"loss": 2.33,
"step": 28900
},
{
"epoch": 0.3319749072759742,
"grad_norm": 0.3992998003959656,
"learning_rate": 8.296605803903612e-06,
"loss": 2.3445,
"step": 29000
},
{
"epoch": 0.3331196483355465,
"grad_norm": 0.3856017291545868,
"learning_rate": 8.325224658004694e-06,
"loss": 2.3205,
"step": 29100
},
{
"epoch": 0.3342643893951188,
"grad_norm": 0.4017845690250397,
"learning_rate": 8.353843512105776e-06,
"loss": 2.3236,
"step": 29200
},
{
"epoch": 0.3354091304546912,
"grad_norm": 0.557650625705719,
"learning_rate": 8.382462366206859e-06,
"loss": 2.3188,
"step": 29300
},
{
"epoch": 0.33655387151426347,
"grad_norm": 0.5162211060523987,
"learning_rate": 8.410795031766928e-06,
"loss": 2.317,
"step": 29400
},
{
"epoch": 0.3376986125738358,
"grad_norm": 0.4619830846786499,
"learning_rate": 8.43941388586801e-06,
"loss": 2.325,
"step": 29500
},
{
"epoch": 0.3388433536334081,
"grad_norm": 0.36212870478630066,
"learning_rate": 8.468032739969092e-06,
"loss": 2.3266,
"step": 29600
},
{
"epoch": 0.33998809469298047,
"grad_norm": 0.3987511098384857,
"learning_rate": 8.496651594070175e-06,
"loss": 2.3352,
"step": 29700
},
{
"epoch": 0.34113283575255277,
"grad_norm": 0.49319741129875183,
"learning_rate": 8.525270448171256e-06,
"loss": 2.331,
"step": 29800
},
{
"epoch": 0.3422775768121251,
"grad_norm": 0.3750767111778259,
"learning_rate": 8.553889302272338e-06,
"loss": 2.3197,
"step": 29900
},
{
"epoch": 0.3434223178716974,
"grad_norm": 0.42684081196784973,
"learning_rate": 8.582508156373419e-06,
"loss": 2.3196,
"step": 30000
},
{
"epoch": 0.34456705893126977,
"grad_norm": 0.4298862814903259,
"learning_rate": 8.611127010474501e-06,
"loss": 2.309,
"step": 30100
},
{
"epoch": 0.34571179999084206,
"grad_norm": 0.4002019762992859,
"learning_rate": 8.639745864575583e-06,
"loss": 2.312,
"step": 30200
},
{
"epoch": 0.3468565410504144,
"grad_norm": 0.3674704134464264,
"learning_rate": 8.668364718676666e-06,
"loss": 2.3147,
"step": 30300
},
{
"epoch": 0.3480012821099867,
"grad_norm": 0.5854533314704895,
"learning_rate": 8.696983572777747e-06,
"loss": 2.3236,
"step": 30400
},
{
"epoch": 0.34914602316955906,
"grad_norm": 0.4132590889930725,
"learning_rate": 8.725602426878829e-06,
"loss": 2.3359,
"step": 30500
},
{
"epoch": 0.35029076422913136,
"grad_norm": 0.39070025086402893,
"learning_rate": 8.75422128097991e-06,
"loss": 2.3223,
"step": 30600
},
{
"epoch": 0.3514355052887037,
"grad_norm": 0.40703412890434265,
"learning_rate": 8.782840135080992e-06,
"loss": 2.3243,
"step": 30700
},
{
"epoch": 0.352580246348276,
"grad_norm": 0.3701010048389435,
"learning_rate": 8.811458989182073e-06,
"loss": 2.3297,
"step": 30800
},
{
"epoch": 0.35372498740784836,
"grad_norm": 0.5442121028900146,
"learning_rate": 8.840077843283155e-06,
"loss": 2.3256,
"step": 30900
},
{
"epoch": 0.35486972846742065,
"grad_norm": 0.4204414486885071,
"learning_rate": 8.868696697384238e-06,
"loss": 2.3178,
"step": 31000
},
{
"epoch": 0.356014469526993,
"grad_norm": 0.4551771283149719,
"learning_rate": 8.89731555148532e-06,
"loss": 2.3299,
"step": 31100
},
{
"epoch": 0.3571592105865653,
"grad_norm": 0.4303077757358551,
"learning_rate": 8.9259344055864e-06,
"loss": 2.3251,
"step": 31200
},
{
"epoch": 0.35830395164613765,
"grad_norm": 0.41671931743621826,
"learning_rate": 8.954553259687483e-06,
"loss": 2.3279,
"step": 31300
},
{
"epoch": 0.35944869270570995,
"grad_norm": 0.4018367528915405,
"learning_rate": 8.983172113788564e-06,
"loss": 2.3069,
"step": 31400
},
{
"epoch": 0.3605934337652823,
"grad_norm": 0.5130965113639832,
"learning_rate": 9.011790967889646e-06,
"loss": 2.3315,
"step": 31500
},
{
"epoch": 0.3617381748248546,
"grad_norm": 0.4209829270839691,
"learning_rate": 9.040409821990729e-06,
"loss": 2.324,
"step": 31600
},
{
"epoch": 0.36288291588442695,
"grad_norm": 0.41159483790397644,
"learning_rate": 9.06902867609181e-06,
"loss": 2.321,
"step": 31700
},
{
"epoch": 0.36402765694399924,
"grad_norm": 0.46957677602767944,
"learning_rate": 9.097647530192892e-06,
"loss": 2.3249,
"step": 31800
},
{
"epoch": 0.3651723980035716,
"grad_norm": 0.39695438742637634,
"learning_rate": 9.126266384293974e-06,
"loss": 2.3112,
"step": 31900
},
{
"epoch": 0.3663171390631439,
"grad_norm": 0.3984281122684479,
"learning_rate": 9.154885238395056e-06,
"loss": 2.3269,
"step": 32000
},
{
"epoch": 0.36746188012271624,
"grad_norm": 0.40746673941612244,
"learning_rate": 9.183504092496137e-06,
"loss": 2.333,
"step": 32100
},
{
"epoch": 0.36860662118228854,
"grad_norm": 0.39766183495521545,
"learning_rate": 9.21212294659722e-06,
"loss": 2.3354,
"step": 32200
},
{
"epoch": 0.3697513622418609,
"grad_norm": 0.42040807008743286,
"learning_rate": 9.2407418006983e-06,
"loss": 2.3165,
"step": 32300
},
{
"epoch": 0.3708961033014332,
"grad_norm": 0.43911924958229065,
"learning_rate": 9.269360654799383e-06,
"loss": 2.3386,
"step": 32400
},
{
"epoch": 0.37204084436100554,
"grad_norm": 0.46791312098503113,
"learning_rate": 9.297979508900465e-06,
"loss": 2.3116,
"step": 32500
},
{
"epoch": 0.3731855854205779,
"grad_norm": 0.41716283559799194,
"learning_rate": 9.326598363001547e-06,
"loss": 2.3298,
"step": 32600
},
{
"epoch": 0.3743303264801502,
"grad_norm": 0.38628315925598145,
"learning_rate": 9.355217217102628e-06,
"loss": 2.3093,
"step": 32700
},
{
"epoch": 0.37547506753972254,
"grad_norm": 0.4715399146080017,
"learning_rate": 9.38383607120371e-06,
"loss": 2.3353,
"step": 32800
},
{
"epoch": 0.37661980859929484,
"grad_norm": 0.4824056923389435,
"learning_rate": 9.412454925304791e-06,
"loss": 2.3165,
"step": 32900
},
{
"epoch": 0.3777645496588672,
"grad_norm": 0.3745131194591522,
"learning_rate": 9.441073779405874e-06,
"loss": 2.31,
"step": 33000
},
{
"epoch": 0.3789092907184395,
"grad_norm": 0.4431547522544861,
"learning_rate": 9.469406444965944e-06,
"loss": 2.3158,
"step": 33100
},
{
"epoch": 0.38005403177801184,
"grad_norm": 0.43701136112213135,
"learning_rate": 9.498025299067026e-06,
"loss": 2.3184,
"step": 33200
},
{
"epoch": 0.38119877283758413,
"grad_norm": 0.40076711773872375,
"learning_rate": 9.526644153168107e-06,
"loss": 2.3247,
"step": 33300
},
{
"epoch": 0.3823435138971565,
"grad_norm": null,
"learning_rate": 9.55497681872818e-06,
"loss": 2.3038,
"step": 33400
},
{
"epoch": 0.3834882549567288,
"grad_norm": 0.4818211793899536,
"learning_rate": 9.583595672829262e-06,
"loss": 2.3125,
"step": 33500
},
{
"epoch": 0.38463299601630113,
"grad_norm": 0.44447603821754456,
"learning_rate": 9.612214526930342e-06,
"loss": 2.3056,
"step": 33600
},
{
"epoch": 0.38577773707587343,
"grad_norm": 0.42127880454063416,
"learning_rate": 9.640833381031425e-06,
"loss": 2.318,
"step": 33700
},
{
"epoch": 0.3869224781354458,
"grad_norm": 0.42268356680870056,
"learning_rate": 9.669452235132506e-06,
"loss": 2.3247,
"step": 33800
},
{
"epoch": 0.3880672191950181,
"grad_norm": 0.42822974920272827,
"learning_rate": 9.698071089233588e-06,
"loss": 2.3164,
"step": 33900
},
{
"epoch": 0.38921196025459043,
"grad_norm": 0.4107573926448822,
"learning_rate": 9.72668994333467e-06,
"loss": 2.3187,
"step": 34000
},
{
"epoch": 0.3903567013141627,
"grad_norm": 0.40011221170425415,
"learning_rate": 9.755308797435751e-06,
"loss": 2.3197,
"step": 34100
},
{
"epoch": 0.3915014423737351,
"grad_norm": 0.4174409508705139,
"learning_rate": 9.783927651536833e-06,
"loss": 2.3082,
"step": 34200
},
{
"epoch": 0.39264618343330737,
"grad_norm": 0.4117099344730377,
"learning_rate": 9.812546505637916e-06,
"loss": 2.3197,
"step": 34300
},
{
"epoch": 0.3937909244928797,
"grad_norm": 0.4379769563674927,
"learning_rate": 9.841165359738997e-06,
"loss": 2.3151,
"step": 34400
},
{
"epoch": 0.394935665552452,
"grad_norm": 0.5240621566772461,
"learning_rate": 9.869784213840079e-06,
"loss": 2.309,
"step": 34500
},
{
"epoch": 0.39608040661202437,
"grad_norm": 0.38917648792266846,
"learning_rate": 9.89840306794116e-06,
"loss": 2.3122,
"step": 34600
},
{
"epoch": 0.39722514767159667,
"grad_norm": 0.47040241956710815,
"learning_rate": 9.927021922042242e-06,
"loss": 2.3053,
"step": 34700
},
{
"epoch": 0.398369888731169,
"grad_norm": 0.42958155274391174,
"learning_rate": 9.955640776143324e-06,
"loss": 2.3038,
"step": 34800
},
{
"epoch": 0.3995146297907413,
"grad_norm": 0.4274247884750366,
"learning_rate": 9.984259630244407e-06,
"loss": 2.3104,
"step": 34900
},
{
"epoch": 0.40065937085031367,
"grad_norm": 0.4401596188545227,
"learning_rate": 1.0012878484345488e-05,
"loss": 2.2919,
"step": 35000
},
{
"epoch": 0.40180411190988596,
"grad_norm": 0.4685971438884735,
"learning_rate": 1.0041497338446568e-05,
"loss": 2.323,
"step": 35100
},
{
"epoch": 0.4029488529694583,
"grad_norm": 0.4304906725883484,
"learning_rate": 1.0070116192547652e-05,
"loss": 2.3182,
"step": 35200
},
{
"epoch": 0.4040935940290306,
"grad_norm": 0.44299623370170593,
"learning_rate": 1.0098735046648733e-05,
"loss": 2.3041,
"step": 35300
},
{
"epoch": 0.40523833508860296,
"grad_norm": 0.42946410179138184,
"learning_rate": 1.0127353900749814e-05,
"loss": 2.3094,
"step": 35400
},
{
"epoch": 0.40638307614817526,
"grad_norm": 0.4753871262073517,
"learning_rate": 1.0155972754850896e-05,
"loss": 2.3141,
"step": 35500
},
{
"epoch": 0.4075278172077476,
"grad_norm": 0.4177212417125702,
"learning_rate": 1.0184591608951979e-05,
"loss": 2.3255,
"step": 35600
},
{
"epoch": 0.4086725582673199,
"grad_norm": 0.4225813150405884,
"learning_rate": 1.0213210463053061e-05,
"loss": 2.3238,
"step": 35700
},
{
"epoch": 0.40981729932689226,
"grad_norm": 0.39542898535728455,
"learning_rate": 1.0241829317154142e-05,
"loss": 2.3013,
"step": 35800
},
{
"epoch": 0.41096204038646456,
"grad_norm": 0.44745051860809326,
"learning_rate": 1.0270448171255222e-05,
"loss": 2.3142,
"step": 35900
},
{
"epoch": 0.4121067814460369,
"grad_norm": 0.46079352498054504,
"learning_rate": 1.0299067025356306e-05,
"loss": 2.3153,
"step": 36000
},
{
"epoch": 0.41325152250560926,
"grad_norm": 0.4723173975944519,
"learning_rate": 1.0327685879457387e-05,
"loss": 2.3053,
"step": 36100
},
{
"epoch": 0.41439626356518156,
"grad_norm": 0.4553997218608856,
"learning_rate": 1.035630473355847e-05,
"loss": 2.3143,
"step": 36200
},
{
"epoch": 0.4155410046247539,
"grad_norm": 0.43542203307151794,
"learning_rate": 1.038492358765955e-05,
"loss": 2.32,
"step": 36300
},
{
"epoch": 0.4166857456843262,
"grad_norm": 0.5056464076042175,
"learning_rate": 1.0413542441760633e-05,
"loss": 2.3055,
"step": 36400
},
{
"epoch": 0.41783048674389855,
"grad_norm": 0.424907922744751,
"learning_rate": 1.0442161295861715e-05,
"loss": 2.312,
"step": 36500
},
{
"epoch": 0.41897522780347085,
"grad_norm": 0.4522388279438019,
"learning_rate": 1.0470780149962796e-05,
"loss": 2.304,
"step": 36600
},
{
"epoch": 0.4201199688630432,
"grad_norm": 0.42723220586776733,
"learning_rate": 1.0499399004063876e-05,
"loss": 2.3128,
"step": 36700
},
{
"epoch": 0.4212647099226155,
"grad_norm": 0.4533430337905884,
"learning_rate": 1.052801785816496e-05,
"loss": 2.3273,
"step": 36800
},
{
"epoch": 0.42240945098218785,
"grad_norm": 0.4739341735839844,
"learning_rate": 1.0556636712266041e-05,
"loss": 2.3114,
"step": 36900
},
{
"epoch": 0.42355419204176015,
"grad_norm": 0.43692803382873535,
"learning_rate": 1.0585255566367124e-05,
"loss": 2.3093,
"step": 37000
},
{
"epoch": 0.4246989331013325,
"grad_norm": 0.437219500541687,
"learning_rate": 1.0613874420468204e-05,
"loss": 2.2975,
"step": 37100
},
{
"epoch": 0.4258436741609048,
"grad_norm": 0.5384771227836609,
"learning_rate": 1.0642493274569288e-05,
"loss": 2.3272,
"step": 37200
},
{
"epoch": 0.42698841522047715,
"grad_norm": 0.3731122314929962,
"learning_rate": 1.0671112128670369e-05,
"loss": 2.3239,
"step": 37300
},
{
"epoch": 0.42813315628004944,
"grad_norm": 0.4134446978569031,
"learning_rate": 1.069973098277145e-05,
"loss": 2.2992,
"step": 37400
},
{
"epoch": 0.4292778973396218,
"grad_norm": 0.3970800042152405,
"learning_rate": 1.0728349836872532e-05,
"loss": 2.3166,
"step": 37500
},
{
"epoch": 0.4304226383991941,
"grad_norm": 0.4207904040813446,
"learning_rate": 1.0756968690973615e-05,
"loss": 2.2956,
"step": 37600
},
{
"epoch": 0.43156737945876644,
"grad_norm": 0.43248116970062256,
"learning_rate": 1.0785587545074695e-05,
"loss": 2.2988,
"step": 37700
},
{
"epoch": 0.43271212051833874,
"grad_norm": 0.5825778245925903,
"learning_rate": 1.0814206399175778e-05,
"loss": 2.3124,
"step": 37800
},
{
"epoch": 0.4338568615779111,
"grad_norm": 0.4129347503185272,
"learning_rate": 1.084282525327686e-05,
"loss": 2.3045,
"step": 37900
},
{
"epoch": 0.4350016026374834,
"grad_norm": 0.5687834024429321,
"learning_rate": 1.0871444107377943e-05,
"loss": 2.3107,
"step": 38000
},
{
"epoch": 0.43614634369705574,
"grad_norm": 0.4236217737197876,
"learning_rate": 1.0900062961479023e-05,
"loss": 2.3019,
"step": 38100
},
{
"epoch": 0.43729108475662803,
"grad_norm": 0.4377936124801636,
"learning_rate": 1.0928681815580104e-05,
"loss": 2.3093,
"step": 38200
},
{
"epoch": 0.4384358258162004,
"grad_norm": 0.7092427611351013,
"learning_rate": 1.0957300669681188e-05,
"loss": 2.3122,
"step": 38300
},
{
"epoch": 0.4395805668757727,
"grad_norm": 0.5125077366828918,
"learning_rate": 1.0985919523782269e-05,
"loss": 2.3101,
"step": 38400
},
{
"epoch": 0.44072530793534503,
"grad_norm": 0.5460866093635559,
"learning_rate": 1.1014252189342341e-05,
"loss": 2.3059,
"step": 38500
},
{
"epoch": 0.44187004899491733,
"grad_norm": 0.5048667788505554,
"learning_rate": 1.1042871043443422e-05,
"loss": 2.2934,
"step": 38600
},
{
"epoch": 0.4430147900544897,
"grad_norm": 0.505024790763855,
"learning_rate": 1.1071489897544502e-05,
"loss": 2.3106,
"step": 38700
},
{
"epoch": 0.444159531114062,
"grad_norm": 0.4877238869667053,
"learning_rate": 1.1100108751645585e-05,
"loss": 2.3108,
"step": 38800
},
{
"epoch": 0.44530427217363433,
"grad_norm": 0.43280166387557983,
"learning_rate": 1.1128727605746667e-05,
"loss": 2.2922,
"step": 38900
},
{
"epoch": 0.4464490132332066,
"grad_norm": 0.47969332337379456,
"learning_rate": 1.115734645984775e-05,
"loss": 2.3093,
"step": 39000
},
{
"epoch": 0.447593754292779,
"grad_norm": 0.45905813574790955,
"learning_rate": 1.118596531394883e-05,
"loss": 2.302,
"step": 39100
},
{
"epoch": 0.4487384953523513,
"grad_norm": 0.4151560962200165,
"learning_rate": 1.1214584168049911e-05,
"loss": 2.3278,
"step": 39200
},
{
"epoch": 0.4498832364119236,
"grad_norm": 0.47377634048461914,
"learning_rate": 1.1243203022150995e-05,
"loss": 2.3144,
"step": 39300
},
{
"epoch": 0.4510279774714959,
"grad_norm": 0.5002289414405823,
"learning_rate": 1.1271821876252076e-05,
"loss": 2.3065,
"step": 39400
},
{
"epoch": 0.4521727185310683,
"grad_norm": 0.48427700996398926,
"learning_rate": 1.1300440730353156e-05,
"loss": 2.3175,
"step": 39500
},
{
"epoch": 0.45331745959064057,
"grad_norm": 0.41482630372047424,
"learning_rate": 1.1329059584454239e-05,
"loss": 2.3115,
"step": 39600
},
{
"epoch": 0.4544622006502129,
"grad_norm": 0.45701828598976135,
"learning_rate": 1.1357678438555321e-05,
"loss": 2.3166,
"step": 39700
},
{
"epoch": 0.4556069417097853,
"grad_norm": 0.4916311502456665,
"learning_rate": 1.1386297292656404e-05,
"loss": 2.3017,
"step": 39800
},
{
"epoch": 0.45675168276935757,
"grad_norm": 0.4516671299934387,
"learning_rate": 1.1414916146757484e-05,
"loss": 2.2997,
"step": 39900
},
{
"epoch": 0.4578964238289299,
"grad_norm": 0.46147215366363525,
"learning_rate": 1.1443535000858565e-05,
"loss": 2.3052,
"step": 40000
},
{
"epoch": 0.4590411648885022,
"grad_norm": 0.48540788888931274,
"learning_rate": 1.1472153854959649e-05,
"loss": 2.3032,
"step": 40100
},
{
"epoch": 0.46018590594807457,
"grad_norm": 0.47240906953811646,
"learning_rate": 1.150077270906073e-05,
"loss": 2.2986,
"step": 40200
},
{
"epoch": 0.46133064700764687,
"grad_norm": 0.5069533586502075,
"learning_rate": 1.1529391563161812e-05,
"loss": 2.3019,
"step": 40300
},
{
"epoch": 0.4624753880672192,
"grad_norm": 0.5581790208816528,
"learning_rate": 1.1558010417262893e-05,
"loss": 2.297,
"step": 40400
},
{
"epoch": 0.4636201291267915,
"grad_norm": 0.42001545429229736,
"learning_rate": 1.1586629271363975e-05,
"loss": 2.3023,
"step": 40500
},
{
"epoch": 0.46476487018636387,
"grad_norm": 0.43199682235717773,
"learning_rate": 1.1615248125465058e-05,
"loss": 2.307,
"step": 40600
},
{
"epoch": 0.46590961124593616,
"grad_norm": 0.39316660165786743,
"learning_rate": 1.1643580791025128e-05,
"loss": 2.3006,
"step": 40700
},
{
"epoch": 0.4670543523055085,
"grad_norm": 0.4300936162471771,
"learning_rate": 1.167219964512621e-05,
"loss": 2.2991,
"step": 40800
},
{
"epoch": 0.4681990933650808,
"grad_norm": 0.3927803933620453,
"learning_rate": 1.1700818499227291e-05,
"loss": 2.302,
"step": 40900
},
{
"epoch": 0.46934383442465316,
"grad_norm": 0.4072405993938446,
"learning_rate": 1.1729437353328375e-05,
"loss": 2.3233,
"step": 41000
},
{
"epoch": 0.47048857548422546,
"grad_norm": 0.5310955047607422,
"learning_rate": 1.1758056207429456e-05,
"loss": 2.2919,
"step": 41100
},
{
"epoch": 0.4716333165437978,
"grad_norm": 0.48148131370544434,
"learning_rate": 1.1786675061530537e-05,
"loss": 2.2854,
"step": 41200
},
{
"epoch": 0.4727780576033701,
"grad_norm": 0.4595455527305603,
"learning_rate": 1.1815293915631617e-05,
"loss": 2.2966,
"step": 41300
},
{
"epoch": 0.47392279866294246,
"grad_norm": 0.41068974137306213,
"learning_rate": 1.1843912769732702e-05,
"loss": 2.3041,
"step": 41400
},
{
"epoch": 0.47506753972251475,
"grad_norm": 0.48817092180252075,
"learning_rate": 1.1872531623833782e-05,
"loss": 2.3031,
"step": 41500
},
{
"epoch": 0.4762122807820871,
"grad_norm": 0.4430725872516632,
"learning_rate": 1.1901150477934865e-05,
"loss": 2.306,
"step": 41600
},
{
"epoch": 0.4773570218416594,
"grad_norm": 0.41381534934043884,
"learning_rate": 1.1929769332035945e-05,
"loss": 2.2938,
"step": 41700
},
{
"epoch": 0.47850176290123175,
"grad_norm": 0.5231103897094727,
"learning_rate": 1.195838818613703e-05,
"loss": 2.287,
"step": 41800
},
{
"epoch": 0.47964650396080405,
"grad_norm": 0.45451679825782776,
"learning_rate": 1.198700704023811e-05,
"loss": 2.3014,
"step": 41900
},
{
"epoch": 0.4807912450203764,
"grad_norm": 0.4158308804035187,
"learning_rate": 1.2015625894339191e-05,
"loss": 2.2863,
"step": 42000
},
{
"epoch": 0.4819359860799487,
"grad_norm": 0.4745527505874634,
"learning_rate": 1.2044244748440273e-05,
"loss": 2.3069,
"step": 42100
},
{
"epoch": 0.48308072713952105,
"grad_norm": 0.4103641211986542,
"learning_rate": 1.2072863602541356e-05,
"loss": 2.3,
"step": 42200
},
{
"epoch": 0.48422546819909335,
"grad_norm": 0.4857043921947479,
"learning_rate": 1.2101482456642436e-05,
"loss": 2.2829,
"step": 42300
},
{
"epoch": 0.4853702092586657,
"grad_norm": 0.4224902093410492,
"learning_rate": 1.2130101310743519e-05,
"loss": 2.3012,
"step": 42400
},
{
"epoch": 0.486514950318238,
"grad_norm": 0.4603799283504486,
"learning_rate": 1.21587201648446e-05,
"loss": 2.29,
"step": 42500
},
{
"epoch": 0.48765969137781034,
"grad_norm": 0.43657830357551575,
"learning_rate": 1.2187339018945684e-05,
"loss": 2.3105,
"step": 42600
},
{
"epoch": 0.48880443243738264,
"grad_norm": 0.411188006401062,
"learning_rate": 1.2215957873046764e-05,
"loss": 2.2911,
"step": 42700
},
{
"epoch": 0.489949173496955,
"grad_norm": 0.4367277920246124,
"learning_rate": 1.2244576727147845e-05,
"loss": 2.3071,
"step": 42800
},
{
"epoch": 0.4910939145565273,
"grad_norm": 0.4958134889602661,
"learning_rate": 1.2272909392707917e-05,
"loss": 2.3065,
"step": 42900
},
{
"epoch": 0.49223865561609964,
"grad_norm": 0.4951634705066681,
"learning_rate": 1.2301528246808998e-05,
"loss": 2.2884,
"step": 43000
},
{
"epoch": 0.49338339667567194,
"grad_norm": 0.44968706369400024,
"learning_rate": 1.2330147100910082e-05,
"loss": 2.2957,
"step": 43100
},
{
"epoch": 0.4945281377352443,
"grad_norm": 0.45405635237693787,
"learning_rate": 1.2358765955011163e-05,
"loss": 2.2915,
"step": 43200
},
{
"epoch": 0.4956728787948166,
"grad_norm": 0.5005086660385132,
"learning_rate": 1.2387384809112243e-05,
"loss": 2.3055,
"step": 43300
},
{
"epoch": 0.49681761985438894,
"grad_norm": 0.5079677104949951,
"learning_rate": 1.2416003663213326e-05,
"loss": 2.3047,
"step": 43400
},
{
"epoch": 0.4979623609139613,
"grad_norm": 0.47394371032714844,
"learning_rate": 1.2444336328773396e-05,
"loss": 2.2976,
"step": 43500
},
{
"epoch": 0.4991071019735336,
"grad_norm": 0.5010650157928467,
"learning_rate": 1.2472955182874477e-05,
"loss": 2.2843,
"step": 43600
},
{
"epoch": 0.5002518430331059,
"grad_norm": 0.443935751914978,
"learning_rate": 1.2501574036975561e-05,
"loss": 2.3022,
"step": 43700
},
{
"epoch": 0.5013965840926783,
"grad_norm": 0.44586101174354553,
"learning_rate": 1.2530192891076642e-05,
"loss": 2.3024,
"step": 43800
},
{
"epoch": 0.5025413251522506,
"grad_norm": 0.4664213955402374,
"learning_rate": 1.2558811745177724e-05,
"loss": 2.2865,
"step": 43900
},
{
"epoch": 0.5036860662118229,
"grad_norm": 0.4366970956325531,
"learning_rate": 1.2587430599278805e-05,
"loss": 2.3052,
"step": 44000
},
{
"epoch": 0.5048308072713952,
"grad_norm": 0.44286203384399414,
"learning_rate": 1.2616049453379889e-05,
"loss": 2.3006,
"step": 44100
},
{
"epoch": 0.5059755483309676,
"grad_norm": 0.4843718707561493,
"learning_rate": 1.264466830748097e-05,
"loss": 2.2882,
"step": 44200
},
{
"epoch": 0.5071202893905399,
"grad_norm": 0.4327425956726074,
"learning_rate": 1.267328716158205e-05,
"loss": 2.3146,
"step": 44300
},
{
"epoch": 0.5082650304501122,
"grad_norm": 0.45917651057243347,
"learning_rate": 1.2701906015683134e-05,
"loss": 2.2763,
"step": 44400
},
{
"epoch": 0.5094097715096845,
"grad_norm": 0.4044801890850067,
"learning_rate": 1.2730524869784215e-05,
"loss": 2.2919,
"step": 44500
},
{
"epoch": 0.5105545125692569,
"grad_norm": 0.43837985396385193,
"learning_rate": 1.2759143723885296e-05,
"loss": 2.3,
"step": 44600
},
{
"epoch": 0.5116992536288292,
"grad_norm": 0.5559438467025757,
"learning_rate": 1.2787762577986378e-05,
"loss": 2.2841,
"step": 44700
},
{
"epoch": 0.5128439946884015,
"grad_norm": 0.5006335377693176,
"learning_rate": 1.281638143208746e-05,
"loss": 2.3024,
"step": 44800
},
{
"epoch": 0.5139887357479738,
"grad_norm": 0.5347406268119812,
"learning_rate": 1.2845000286188543e-05,
"loss": 2.2915,
"step": 44900
},
{
"epoch": 0.5151334768075462,
"grad_norm": 0.49963897466659546,
"learning_rate": 1.2873619140289624e-05,
"loss": 2.3094,
"step": 45000
},
{
"epoch": 0.5162782178671185,
"grad_norm": 0.4746800661087036,
"learning_rate": 1.2902237994390704e-05,
"loss": 2.288,
"step": 45100
},
{
"epoch": 0.5174229589266908,
"grad_norm": 0.44089171290397644,
"learning_rate": 1.2930856848491788e-05,
"loss": 2.2791,
"step": 45200
},
{
"epoch": 0.5185676999862631,
"grad_norm": 0.46968016028404236,
"learning_rate": 1.295947570259287e-05,
"loss": 2.3088,
"step": 45300
},
{
"epoch": 0.5197124410458355,
"grad_norm": 0.45375433564186096,
"learning_rate": 1.2988094556693952e-05,
"loss": 2.3083,
"step": 45400
},
{
"epoch": 0.5208571821054078,
"grad_norm": 0.5065542459487915,
"learning_rate": 1.3016713410795032e-05,
"loss": 2.2694,
"step": 45500
},
{
"epoch": 0.5220019231649801,
"grad_norm": 0.5144473910331726,
"learning_rate": 1.3045332264896116e-05,
"loss": 2.3044,
"step": 45600
},
{
"epoch": 0.5231466642245524,
"grad_norm": 0.5982611179351807,
"learning_rate": 1.3073951118997197e-05,
"loss": 2.2964,
"step": 45700
},
{
"epoch": 0.5242914052841248,
"grad_norm": 0.42570099234580994,
"learning_rate": 1.3102569973098278e-05,
"loss": 2.2898,
"step": 45800
},
{
"epoch": 0.5254361463436971,
"grad_norm": 0.5816085934638977,
"learning_rate": 1.3131188827199358e-05,
"loss": 2.2974,
"step": 45900
},
{
"epoch": 0.5265808874032694,
"grad_norm": 0.5254452228546143,
"learning_rate": 1.3159807681300443e-05,
"loss": 2.3149,
"step": 46000
},
{
"epoch": 0.5277256284628417,
"grad_norm": 0.43442779779434204,
"learning_rate": 1.3188426535401523e-05,
"loss": 2.2951,
"step": 46100
},
{
"epoch": 0.5288703695224141,
"grad_norm": 0.4493260085582733,
"learning_rate": 1.3217045389502606e-05,
"loss": 2.2768,
"step": 46200
},
{
"epoch": 0.5300151105819864,
"grad_norm": 0.4513915777206421,
"learning_rate": 1.3245378055062676e-05,
"loss": 2.3075,
"step": 46300
},
{
"epoch": 0.5311598516415587,
"grad_norm": 0.45114508271217346,
"learning_rate": 1.3273996909163757e-05,
"loss": 2.2895,
"step": 46400
},
{
"epoch": 0.532304592701131,
"grad_norm": 0.43823984265327454,
"learning_rate": 1.330261576326484e-05,
"loss": 2.3087,
"step": 46500
},
{
"epoch": 0.5334493337607034,
"grad_norm": 0.453106164932251,
"learning_rate": 1.3331234617365922e-05,
"loss": 2.2851,
"step": 46600
},
{
"epoch": 0.5345940748202757,
"grad_norm": 0.46690353751182556,
"learning_rate": 1.3359853471467004e-05,
"loss": 2.2914,
"step": 46700
},
{
"epoch": 0.535738815879848,
"grad_norm": 0.46535834670066833,
"learning_rate": 1.3388472325568085e-05,
"loss": 2.2938,
"step": 46800
},
{
"epoch": 0.5368835569394202,
"grad_norm": 0.45468568801879883,
"learning_rate": 1.3417091179669165e-05,
"loss": 2.2974,
"step": 46900
},
{
"epoch": 0.5380282979989927,
"grad_norm": 0.4835493862628937,
"learning_rate": 1.344571003377025e-05,
"loss": 2.2882,
"step": 47000
},
{
"epoch": 0.539173039058565,
"grad_norm": 0.45315301418304443,
"learning_rate": 1.347432888787133e-05,
"loss": 2.2875,
"step": 47100
},
{
"epoch": 0.5403177801181372,
"grad_norm": 0.5241557359695435,
"learning_rate": 1.3502947741972413e-05,
"loss": 2.2927,
"step": 47200
},
{
"epoch": 0.5414625211777097,
"grad_norm": 0.4486404061317444,
"learning_rate": 1.3531566596073495e-05,
"loss": 2.2898,
"step": 47300
},
{
"epoch": 0.542607262237282,
"grad_norm": 0.4849669933319092,
"learning_rate": 1.3560185450174577e-05,
"loss": 2.283,
"step": 47400
},
{
"epoch": 0.5437520032968542,
"grad_norm": 0.6526544690132141,
"learning_rate": 1.3588804304275658e-05,
"loss": 2.287,
"step": 47500
},
{
"epoch": 0.5448967443564265,
"grad_norm": 0.4628201723098755,
"learning_rate": 1.3617423158376739e-05,
"loss": 2.3099,
"step": 47600
},
{
"epoch": 0.546041485415999,
"grad_norm": 0.5132496356964111,
"learning_rate": 1.3646042012477823e-05,
"loss": 2.2969,
"step": 47700
},
{
"epoch": 0.5471862264755712,
"grad_norm": 0.545789897441864,
"learning_rate": 1.3674660866578904e-05,
"loss": 2.2948,
"step": 47800
},
{
"epoch": 0.5483309675351435,
"grad_norm": 0.5407856106758118,
"learning_rate": 1.3703279720679984e-05,
"loss": 2.2861,
"step": 47900
},
{
"epoch": 0.5494757085947158,
"grad_norm": 0.494488000869751,
"learning_rate": 1.3731898574781067e-05,
"loss": 2.2687,
"step": 48000
},
{
"epoch": 0.5506204496542882,
"grad_norm": 0.44262704253196716,
"learning_rate": 1.3760517428882149e-05,
"loss": 2.2879,
"step": 48100
},
{
"epoch": 0.5517651907138605,
"grad_norm": 0.4556616544723511,
"learning_rate": 1.3789136282983232e-05,
"loss": 2.2733,
"step": 48200
},
{
"epoch": 0.5529099317734328,
"grad_norm": 0.5023077130317688,
"learning_rate": 1.3817755137084312e-05,
"loss": 2.284,
"step": 48300
},
{
"epoch": 0.5540546728330051,
"grad_norm": 0.44959184527397156,
"learning_rate": 1.3846373991185393e-05,
"loss": 2.2821,
"step": 48400
},
{
"epoch": 0.5551994138925775,
"grad_norm": 0.6102599501609802,
"learning_rate": 1.3874992845286477e-05,
"loss": 2.2889,
"step": 48500
},
{
"epoch": 0.5563441549521498,
"grad_norm": 0.508794367313385,
"learning_rate": 1.3903611699387558e-05,
"loss": 2.2766,
"step": 48600
},
{
"epoch": 0.5574888960117221,
"grad_norm": 0.5081732869148254,
"learning_rate": 1.3932230553488638e-05,
"loss": 2.2843,
"step": 48700
},
{
"epoch": 0.5586336370712944,
"grad_norm": 0.4801699221134186,
"learning_rate": 1.396084940758972e-05,
"loss": 2.2749,
"step": 48800
},
{
"epoch": 0.5597783781308668,
"grad_norm": 0.5260947346687317,
"learning_rate": 1.3989468261690803e-05,
"loss": 2.2883,
"step": 48900
},
{
"epoch": 0.5609231191904391,
"grad_norm": 0.44729700684547424,
"learning_rate": 1.4018087115791886e-05,
"loss": 2.2889,
"step": 49000
},
{
"epoch": 0.5620678602500114,
"grad_norm": 0.4468446671962738,
"learning_rate": 1.4046705969892966e-05,
"loss": 2.2892,
"step": 49100
},
{
"epoch": 0.5632126013095837,
"grad_norm": 0.4718739688396454,
"learning_rate": 1.4075324823994047e-05,
"loss": 2.2894,
"step": 49200
},
{
"epoch": 0.5643573423691561,
"grad_norm": 0.5241585373878479,
"learning_rate": 1.4103943678095131e-05,
"loss": 2.2922,
"step": 49300
},
{
"epoch": 0.5655020834287284,
"grad_norm": 0.43622729182243347,
"learning_rate": 1.4132562532196212e-05,
"loss": 2.2816,
"step": 49400
},
{
"epoch": 0.5666468244883007,
"grad_norm": 0.4169420599937439,
"learning_rate": 1.4161181386297294e-05,
"loss": 2.3186,
"step": 49500
},
{
"epoch": 0.567791565547873,
"grad_norm": 0.4381948411464691,
"learning_rate": 1.4189800240398375e-05,
"loss": 2.2903,
"step": 49600
},
{
"epoch": 0.5689363066074454,
"grad_norm": 0.5486495494842529,
"learning_rate": 1.4218419094499459e-05,
"loss": 2.2964,
"step": 49700
},
{
"epoch": 0.5700810476670177,
"grad_norm": 0.42850059270858765,
"learning_rate": 1.424703794860054e-05,
"loss": 2.29,
"step": 49800
},
{
"epoch": 0.57122578872659,
"grad_norm": 0.4907155930995941,
"learning_rate": 1.427565680270162e-05,
"loss": 2.3085,
"step": 49900
},
{
"epoch": 0.5723705297861623,
"grad_norm": 0.43422576785087585,
"learning_rate": 1.4304275656802701e-05,
"loss": 2.2787,
"step": 50000
},
{
"epoch": 0.5735152708457347,
"grad_norm": 0.4992702007293701,
"learning_rate": 1.4332894510903785e-05,
"loss": 2.287,
"step": 50100
},
{
"epoch": 0.574660011905307,
"grad_norm": 0.4858098030090332,
"learning_rate": 1.4361227176463857e-05,
"loss": 2.2761,
"step": 50200
},
{
"epoch": 0.5758047529648793,
"grad_norm": 0.48108112812042236,
"learning_rate": 1.4389846030564938e-05,
"loss": 2.2883,
"step": 50300
},
{
"epoch": 0.5769494940244516,
"grad_norm": 0.38939031958580017,
"learning_rate": 1.4418464884666019e-05,
"loss": 2.2837,
"step": 50400
},
{
"epoch": 0.578094235084024,
"grad_norm": 0.488679438829422,
"learning_rate": 1.44470837387671e-05,
"loss": 2.298,
"step": 50500
},
{
"epoch": 0.5792389761435963,
"grad_norm": 0.5358524918556213,
"learning_rate": 1.4475702592868184e-05,
"loss": 2.2897,
"step": 50600
},
{
"epoch": 0.5803837172031686,
"grad_norm": 0.48244425654411316,
"learning_rate": 1.4504321446969264e-05,
"loss": 2.2748,
"step": 50700
},
{
"epoch": 0.581528458262741,
"grad_norm": 0.49125197529792786,
"learning_rate": 1.4532940301070347e-05,
"loss": 2.2666,
"step": 50800
},
{
"epoch": 0.5826731993223133,
"grad_norm": 0.5043622851371765,
"learning_rate": 1.4561559155171427e-05,
"loss": 2.2948,
"step": 50900
},
{
"epoch": 0.5838179403818856,
"grad_norm": 0.4543743431568146,
"learning_rate": 1.4590178009272512e-05,
"loss": 2.2801,
"step": 51000
},
{
"epoch": 0.5849626814414579,
"grad_norm": 0.45934557914733887,
"learning_rate": 1.4618796863373592e-05,
"loss": 2.2897,
"step": 51100
},
{
"epoch": 0.5861074225010303,
"grad_norm": 0.48373672366142273,
"learning_rate": 1.4647415717474673e-05,
"loss": 2.2808,
"step": 51200
},
{
"epoch": 0.5872521635606026,
"grad_norm": 0.42684435844421387,
"learning_rate": 1.4676034571575755e-05,
"loss": 2.2707,
"step": 51300
},
{
"epoch": 0.5883969046201749,
"grad_norm": 0.439179927110672,
"learning_rate": 1.4704653425676838e-05,
"loss": 2.2887,
"step": 51400
},
{
"epoch": 0.5895416456797472,
"grad_norm": 0.48160520195961,
"learning_rate": 1.473327227977792e-05,
"loss": 2.282,
"step": 51500
},
{
"epoch": 0.5906863867393196,
"grad_norm": 0.48224136233329773,
"learning_rate": 1.4761891133879e-05,
"loss": 2.2766,
"step": 51600
},
{
"epoch": 0.5918311277988919,
"grad_norm": 0.46199363470077515,
"learning_rate": 1.4790509987980082e-05,
"loss": 2.2836,
"step": 51700
},
{
"epoch": 0.5929758688584642,
"grad_norm": 0.4785059690475464,
"learning_rate": 1.4819128842081166e-05,
"loss": 2.2865,
"step": 51800
},
{
"epoch": 0.5941206099180365,
"grad_norm": 0.43915683031082153,
"learning_rate": 1.4847747696182246e-05,
"loss": 2.2785,
"step": 51900
},
{
"epoch": 0.5952653509776089,
"grad_norm": 0.5053157806396484,
"learning_rate": 1.4876366550283327e-05,
"loss": 2.2903,
"step": 52000
},
{
"epoch": 0.5964100920371812,
"grad_norm": 0.4726928174495697,
"learning_rate": 1.490498540438441e-05,
"loss": 2.2818,
"step": 52100
},
{
"epoch": 0.5975548330967535,
"grad_norm": 0.430034875869751,
"learning_rate": 1.4933604258485492e-05,
"loss": 2.29,
"step": 52200
},
{
"epoch": 0.5986995741563258,
"grad_norm": 0.4643426835536957,
"learning_rate": 1.4962223112586574e-05,
"loss": 2.2897,
"step": 52300
},
{
"epoch": 0.5998443152158982,
"grad_norm": 0.5476269125938416,
"learning_rate": 1.4990841966687655e-05,
"loss": 2.2786,
"step": 52400
},
{
"epoch": 0.6009890562754705,
"grad_norm": 0.4216204285621643,
"learning_rate": 1.5019174632247725e-05,
"loss": 2.2814,
"step": 52500
},
{
"epoch": 0.6021337973350428,
"grad_norm": 0.4980791211128235,
"learning_rate": 1.5047793486348808e-05,
"loss": 2.2648,
"step": 52600
},
{
"epoch": 0.6032785383946151,
"grad_norm": 0.48100781440734863,
"learning_rate": 1.5076412340449888e-05,
"loss": 2.2881,
"step": 52700
},
{
"epoch": 0.6044232794541875,
"grad_norm": 0.5112878084182739,
"learning_rate": 1.5105031194550973e-05,
"loss": 2.2761,
"step": 52800
},
{
"epoch": 0.6055680205137598,
"grad_norm": 0.4899493455886841,
"learning_rate": 1.5133650048652053e-05,
"loss": 2.2852,
"step": 52900
},
{
"epoch": 0.6067127615733321,
"grad_norm": 0.486299067735672,
"learning_rate": 1.5162268902753134e-05,
"loss": 2.2958,
"step": 53000
},
{
"epoch": 0.6078575026329044,
"grad_norm": 0.580345630645752,
"learning_rate": 1.5190887756854218e-05,
"loss": 2.2716,
"step": 53100
},
{
"epoch": 0.6090022436924768,
"grad_norm": 0.4456554055213928,
"learning_rate": 1.5219506610955299e-05,
"loss": 2.285,
"step": 53200
},
{
"epoch": 0.6101469847520491,
"grad_norm": 0.4706750512123108,
"learning_rate": 1.5248125465056381e-05,
"loss": 2.2968,
"step": 53300
},
{
"epoch": 0.6112917258116214,
"grad_norm": 0.47107580304145813,
"learning_rate": 1.527674431915746e-05,
"loss": 2.2733,
"step": 53400
},
{
"epoch": 0.6124364668711937,
"grad_norm": 0.45870354771614075,
"learning_rate": 1.5305363173258546e-05,
"loss": 2.2962,
"step": 53500
},
{
"epoch": 0.6135812079307661,
"grad_norm": 0.526592493057251,
"learning_rate": 1.5333982027359625e-05,
"loss": 2.28,
"step": 53600
},
{
"epoch": 0.6147259489903384,
"grad_norm": 0.4595036506652832,
"learning_rate": 1.5362600881460707e-05,
"loss": 2.2751,
"step": 53700
},
{
"epoch": 0.6158706900499107,
"grad_norm": 0.47698622941970825,
"learning_rate": 1.539121973556179e-05,
"loss": 2.2764,
"step": 53800
},
{
"epoch": 0.617015431109483,
"grad_norm": 0.47543615102767944,
"learning_rate": 1.5419838589662872e-05,
"loss": 2.269,
"step": 53900
},
{
"epoch": 0.6181601721690554,
"grad_norm": 0.46203306317329407,
"learning_rate": 1.5448457443763955e-05,
"loss": 2.3127,
"step": 54000
},
{
"epoch": 0.6193049132286277,
"grad_norm": 0.4622338116168976,
"learning_rate": 1.5477076297865034e-05,
"loss": 2.2795,
"step": 54100
},
{
"epoch": 0.6204496542882,
"grad_norm": 0.43615639209747314,
"learning_rate": 1.5505695151966116e-05,
"loss": 2.2751,
"step": 54200
},
{
"epoch": 0.6215943953477724,
"grad_norm": 0.4956182837486267,
"learning_rate": 1.55343140060672e-05,
"loss": 2.2888,
"step": 54300
},
{
"epoch": 0.6227391364073447,
"grad_norm": 0.44354909658432007,
"learning_rate": 1.556293286016828e-05,
"loss": 2.278,
"step": 54400
},
{
"epoch": 0.623883877466917,
"grad_norm": 0.46796587109565735,
"learning_rate": 1.5591551714269363e-05,
"loss": 2.2843,
"step": 54500
},
{
"epoch": 0.6250286185264893,
"grad_norm": 0.43353140354156494,
"learning_rate": 1.5620170568370442e-05,
"loss": 2.2814,
"step": 54600
},
{
"epoch": 0.6261733595860617,
"grad_norm": 0.47816458344459534,
"learning_rate": 1.5648789422471528e-05,
"loss": 2.2708,
"step": 54700
},
{
"epoch": 0.627318100645634,
"grad_norm": 0.4949074387550354,
"learning_rate": 1.5677408276572607e-05,
"loss": 2.2785,
"step": 54800
},
{
"epoch": 0.6284628417052063,
"grad_norm": 0.41498610377311707,
"learning_rate": 1.570602713067369e-05,
"loss": 2.2752,
"step": 54900
},
{
"epoch": 0.6296075827647786,
"grad_norm": 0.41272154450416565,
"learning_rate": 1.5734645984774772e-05,
"loss": 2.2679,
"step": 55000
},
{
"epoch": 0.630752323824351,
"grad_norm": 0.47275310754776,
"learning_rate": 1.5763264838875854e-05,
"loss": 2.2838,
"step": 55100
},
{
"epoch": 0.6318970648839233,
"grad_norm": 0.41480526328086853,
"learning_rate": 1.5791883692976933e-05,
"loss": 2.2905,
"step": 55200
},
{
"epoch": 0.6330418059434956,
"grad_norm": 0.45607683062553406,
"learning_rate": 1.5820502547078016e-05,
"loss": 2.2793,
"step": 55300
},
{
"epoch": 0.6341865470030679,
"grad_norm": 0.4298737645149231,
"learning_rate": 1.5849121401179098e-05,
"loss": 2.2864,
"step": 55400
},
{
"epoch": 0.6353312880626403,
"grad_norm": 0.45687663555145264,
"learning_rate": 1.587774025528018e-05,
"loss": 2.2761,
"step": 55500
},
{
"epoch": 0.6364760291222126,
"grad_norm": 0.4270581901073456,
"learning_rate": 1.5906359109381263e-05,
"loss": 2.2859,
"step": 55600
},
{
"epoch": 0.6376207701817849,
"grad_norm": 0.4622785449028015,
"learning_rate": 1.5934977963482342e-05,
"loss": 2.2751,
"step": 55700
},
{
"epoch": 0.6387655112413572,
"grad_norm": 0.4890844523906708,
"learning_rate": 1.5963596817583424e-05,
"loss": 2.2908,
"step": 55800
},
{
"epoch": 0.6399102523009296,
"grad_norm": 0.4259001910686493,
"learning_rate": 1.5992215671684507e-05,
"loss": 2.2724,
"step": 55900
},
{
"epoch": 0.6410549933605019,
"grad_norm": 0.5524899363517761,
"learning_rate": 1.602083452578559e-05,
"loss": 2.2707,
"step": 56000
},
{
"epoch": 0.6421997344200742,
"grad_norm": 0.4600765109062195,
"learning_rate": 1.604945337988667e-05,
"loss": 2.2828,
"step": 56100
},
{
"epoch": 0.6433444754796465,
"grad_norm": 0.4435892403125763,
"learning_rate": 1.607807223398775e-05,
"loss": 2.2781,
"step": 56200
},
{
"epoch": 0.6444892165392189,
"grad_norm": 0.47321733832359314,
"learning_rate": 1.6106691088088836e-05,
"loss": 2.2494,
"step": 56300
},
{
"epoch": 0.6456339575987912,
"grad_norm": 0.530928373336792,
"learning_rate": 1.6135309942189915e-05,
"loss": 2.275,
"step": 56400
},
{
"epoch": 0.6467786986583635,
"grad_norm": 0.5238829851150513,
"learning_rate": 1.6163928796290998e-05,
"loss": 2.2823,
"step": 56500
},
{
"epoch": 0.6479234397179358,
"grad_norm": 0.5112258791923523,
"learning_rate": 1.619254765039208e-05,
"loss": 2.2724,
"step": 56600
},
{
"epoch": 0.6490681807775082,
"grad_norm": 0.5023364424705505,
"learning_rate": 1.6221166504493162e-05,
"loss": 2.2655,
"step": 56700
},
{
"epoch": 0.6502129218370805,
"grad_norm": 0.45399200916290283,
"learning_rate": 1.6249785358594245e-05,
"loss": 2.2697,
"step": 56800
},
{
"epoch": 0.6513576628966528,
"grad_norm": 0.4420014023780823,
"learning_rate": 1.6278404212695324e-05,
"loss": 2.2772,
"step": 56900
},
{
"epoch": 0.652502403956225,
"grad_norm": 0.4523197412490845,
"learning_rate": 1.6307023066796406e-05,
"loss": 2.2743,
"step": 57000
},
{
"epoch": 0.6536471450157975,
"grad_norm": 0.5409209728240967,
"learning_rate": 1.633564192089749e-05,
"loss": 2.2727,
"step": 57100
},
{
"epoch": 0.6547918860753698,
"grad_norm": 0.5374095439910889,
"learning_rate": 1.636426077499857e-05,
"loss": 2.261,
"step": 57200
},
{
"epoch": 0.655936627134942,
"grad_norm": 0.5544825196266174,
"learning_rate": 1.6392879629099653e-05,
"loss": 2.272,
"step": 57300
},
{
"epoch": 0.6570813681945143,
"grad_norm": 0.4391005337238312,
"learning_rate": 1.6421498483200732e-05,
"loss": 2.2752,
"step": 57400
},
{
"epoch": 0.6582261092540868,
"grad_norm": 0.524519145488739,
"learning_rate": 1.6450117337301815e-05,
"loss": 2.2798,
"step": 57500
},
{
"epoch": 0.659370850313659,
"grad_norm": 0.4723650813102722,
"learning_rate": 1.6478736191402897e-05,
"loss": 2.2566,
"step": 57600
},
{
"epoch": 0.6605155913732313,
"grad_norm": 0.4081030786037445,
"learning_rate": 1.6507068856962968e-05,
"loss": 2.2812,
"step": 57700
},
{
"epoch": 0.6616603324328038,
"grad_norm": 0.4323836863040924,
"learning_rate": 1.653568771106405e-05,
"loss": 2.2651,
"step": 57800
},
{
"epoch": 0.662805073492376,
"grad_norm": 0.5252947807312012,
"learning_rate": 1.6564306565165132e-05,
"loss": 2.2754,
"step": 57900
},
{
"epoch": 0.6639498145519483,
"grad_norm": 0.4679439663887024,
"learning_rate": 1.6592925419266215e-05,
"loss": 2.2735,
"step": 58000
},
{
"epoch": 0.6650945556115206,
"grad_norm": 0.4603148102760315,
"learning_rate": 1.6621544273367297e-05,
"loss": 2.2741,
"step": 58100
},
{
"epoch": 0.666239296671093,
"grad_norm": 0.44324785470962524,
"learning_rate": 1.6650163127468376e-05,
"loss": 2.2858,
"step": 58200
},
{
"epoch": 0.6673840377306653,
"grad_norm": 0.45351341366767883,
"learning_rate": 1.667878198156946e-05,
"loss": 2.2761,
"step": 58300
},
{
"epoch": 0.6685287787902376,
"grad_norm": 0.49566417932510376,
"learning_rate": 1.670740083567054e-05,
"loss": 2.2684,
"step": 58400
},
{
"epoch": 0.6696735198498099,
"grad_norm": 0.48700281977653503,
"learning_rate": 1.6736019689771623e-05,
"loss": 2.2683,
"step": 58500
},
{
"epoch": 0.6708182609093823,
"grad_norm": 0.47343066334724426,
"learning_rate": 1.6764638543872706e-05,
"loss": 2.2727,
"step": 58600
},
{
"epoch": 0.6719630019689546,
"grad_norm": 0.4507409334182739,
"learning_rate": 1.6793257397973785e-05,
"loss": 2.2774,
"step": 58700
},
{
"epoch": 0.6731077430285269,
"grad_norm": 0.5125613808631897,
"learning_rate": 1.682187625207487e-05,
"loss": 2.2724,
"step": 58800
},
{
"epoch": 0.6742524840880992,
"grad_norm": 0.4266802966594696,
"learning_rate": 1.685049510617595e-05,
"loss": 2.264,
"step": 58900
},
{
"epoch": 0.6753972251476716,
"grad_norm": 0.4939129650592804,
"learning_rate": 1.687882777173602e-05,
"loss": 2.2736,
"step": 59000
},
{
"epoch": 0.6765419662072439,
"grad_norm": 0.4348323345184326,
"learning_rate": 1.6907446625837102e-05,
"loss": 2.2647,
"step": 59100
},
{
"epoch": 0.6776867072668162,
"grad_norm": 0.4148264527320862,
"learning_rate": 1.6936065479938185e-05,
"loss": 2.2731,
"step": 59200
},
{
"epoch": 0.6788314483263885,
"grad_norm": 0.5018272399902344,
"learning_rate": 1.6964684334039267e-05,
"loss": 2.2798,
"step": 59300
},
{
"epoch": 0.6799761893859609,
"grad_norm": 0.405222624540329,
"learning_rate": 1.699330318814035e-05,
"loss": 2.2716,
"step": 59400
},
{
"epoch": 0.6811209304455332,
"grad_norm": 0.43340057134628296,
"learning_rate": 1.702192204224143e-05,
"loss": 2.2616,
"step": 59500
},
{
"epoch": 0.6822656715051055,
"grad_norm": 0.4897302985191345,
"learning_rate": 1.705054089634251e-05,
"loss": 2.2937,
"step": 59600
},
{
"epoch": 0.6834104125646778,
"grad_norm": 0.45297375321388245,
"learning_rate": 1.7079159750443593e-05,
"loss": 2.2646,
"step": 59700
},
{
"epoch": 0.6845551536242502,
"grad_norm": 0.5536375641822815,
"learning_rate": 1.7107778604544676e-05,
"loss": 2.2762,
"step": 59800
},
{
"epoch": 0.6856998946838225,
"grad_norm": 0.42413586378097534,
"learning_rate": 1.7136397458645758e-05,
"loss": 2.2578,
"step": 59900
},
{
"epoch": 0.6868446357433948,
"grad_norm": 0.45568087697029114,
"learning_rate": 1.7165016312746837e-05,
"loss": 2.2875,
"step": 60000
},
{
"epoch": 0.6879893768029671,
"grad_norm": 0.45397791266441345,
"learning_rate": 1.7193635166847923e-05,
"loss": 2.28,
"step": 60100
},
{
"epoch": 0.6891341178625395,
"grad_norm": 0.4058510661125183,
"learning_rate": 1.7222254020949002e-05,
"loss": 2.2845,
"step": 60200
},
{
"epoch": 0.6902788589221118,
"grad_norm": 0.4380168318748474,
"learning_rate": 1.7250872875050084e-05,
"loss": 2.2574,
"step": 60300
},
{
"epoch": 0.6914235999816841,
"grad_norm": 0.4758777320384979,
"learning_rate": 1.7279491729151167e-05,
"loss": 2.2597,
"step": 60400
},
{
"epoch": 0.6925683410412564,
"grad_norm": 0.45504075288772583,
"learning_rate": 1.730811058325225e-05,
"loss": 2.2763,
"step": 60500
},
{
"epoch": 0.6937130821008288,
"grad_norm": 0.4878067374229431,
"learning_rate": 1.733672943735333e-05,
"loss": 2.2811,
"step": 60600
},
{
"epoch": 0.6948578231604011,
"grad_norm": 0.49453550577163696,
"learning_rate": 1.736534829145441e-05,
"loss": 2.2545,
"step": 60700
},
{
"epoch": 0.6960025642199734,
"grad_norm": 0.43168744444847107,
"learning_rate": 1.7393967145555493e-05,
"loss": 2.2584,
"step": 60800
},
{
"epoch": 0.6971473052795457,
"grad_norm": 0.5180889368057251,
"learning_rate": 1.7422585999656575e-05,
"loss": 2.274,
"step": 60900
},
{
"epoch": 0.6982920463391181,
"grad_norm": 0.4250308573246002,
"learning_rate": 1.7451204853757658e-05,
"loss": 2.2531,
"step": 61000
},
{
"epoch": 0.6994367873986904,
"grad_norm": 0.40109291672706604,
"learning_rate": 1.7479823707858737e-05,
"loss": 2.2771,
"step": 61100
},
{
"epoch": 0.7005815284582627,
"grad_norm": 0.4581041932106018,
"learning_rate": 1.750844256195982e-05,
"loss": 2.2713,
"step": 61200
},
{
"epoch": 0.701726269517835,
"grad_norm": 0.44385623931884766,
"learning_rate": 1.75370614160609e-05,
"loss": 2.2559,
"step": 61300
},
{
"epoch": 0.7028710105774074,
"grad_norm": 0.44979363679885864,
"learning_rate": 1.7565680270161984e-05,
"loss": 2.2533,
"step": 61400
},
{
"epoch": 0.7040157516369797,
"grad_norm": 0.44290637969970703,
"learning_rate": 1.7594299124263066e-05,
"loss": 2.2726,
"step": 61500
},
{
"epoch": 0.705160492696552,
"grad_norm": 0.4479668438434601,
"learning_rate": 1.7622917978364145e-05,
"loss": 2.2673,
"step": 61600
},
{
"epoch": 0.7063052337561244,
"grad_norm": 0.416456401348114,
"learning_rate": 1.765153683246523e-05,
"loss": 2.266,
"step": 61700
},
{
"epoch": 0.7074499748156967,
"grad_norm": 0.45117634534835815,
"learning_rate": 1.768015568656631e-05,
"loss": 2.2791,
"step": 61800
},
{
"epoch": 0.708594715875269,
"grad_norm": 0.5188822150230408,
"learning_rate": 1.7708774540667393e-05,
"loss": 2.266,
"step": 61900
},
{
"epoch": 0.7097394569348413,
"grad_norm": 0.47650662064552307,
"learning_rate": 1.7737393394768475e-05,
"loss": 2.2695,
"step": 62000
},
{
"epoch": 0.7108841979944137,
"grad_norm": 0.5149694681167603,
"learning_rate": 1.7766012248869557e-05,
"loss": 2.2775,
"step": 62100
},
{
"epoch": 0.712028939053986,
"grad_norm": 0.4305098354816437,
"learning_rate": 1.779463110297064e-05,
"loss": 2.2722,
"step": 62200
},
{
"epoch": 0.7131736801135583,
"grad_norm": 0.48085054755210876,
"learning_rate": 1.782324995707172e-05,
"loss": 2.2712,
"step": 62300
},
{
"epoch": 0.7143184211731306,
"grad_norm": 0.4287306070327759,
"learning_rate": 1.78518688111728e-05,
"loss": 2.2629,
"step": 62400
},
{
"epoch": 0.715463162232703,
"grad_norm": 0.45178937911987305,
"learning_rate": 1.7880487665273884e-05,
"loss": 2.2787,
"step": 62500
},
{
"epoch": 0.7166079032922753,
"grad_norm": 0.5393545031547546,
"learning_rate": 1.7909106519374966e-05,
"loss": 2.2803,
"step": 62600
},
{
"epoch": 0.7177526443518476,
"grad_norm": 0.4494490325450897,
"learning_rate": 1.793772537347605e-05,
"loss": 2.2714,
"step": 62700
},
{
"epoch": 0.7188973854114199,
"grad_norm": 0.43690425157546997,
"learning_rate": 1.7966344227577127e-05,
"loss": 2.2639,
"step": 62800
},
{
"epoch": 0.7200421264709923,
"grad_norm": 0.5241349339485168,
"learning_rate": 1.7994963081678213e-05,
"loss": 2.2639,
"step": 62900
},
{
"epoch": 0.7211868675305646,
"grad_norm": 0.5191497206687927,
"learning_rate": 1.8023581935779292e-05,
"loss": 2.269,
"step": 63000
},
{
"epoch": 0.7223316085901369,
"grad_norm": 0.4875340461730957,
"learning_rate": 1.8052200789880375e-05,
"loss": 2.262,
"step": 63100
},
{
"epoch": 0.7234763496497092,
"grad_norm": 0.47728395462036133,
"learning_rate": 1.8080533455440445e-05,
"loss": 2.2653,
"step": 63200
},
{
"epoch": 0.7246210907092816,
"grad_norm": 0.517727792263031,
"learning_rate": 1.8109152309541528e-05,
"loss": 2.2505,
"step": 63300
},
{
"epoch": 0.7257658317688539,
"grad_norm": 0.5039493441581726,
"learning_rate": 1.813777116364261e-05,
"loss": 2.2858,
"step": 63400
},
{
"epoch": 0.7269105728284262,
"grad_norm": 0.5291385054588318,
"learning_rate": 1.8166390017743692e-05,
"loss": 2.2559,
"step": 63500
},
{
"epoch": 0.7280553138879985,
"grad_norm": 0.4564548432826996,
"learning_rate": 1.819500887184477e-05,
"loss": 2.2834,
"step": 63600
},
{
"epoch": 0.7292000549475709,
"grad_norm": 0.4725103974342346,
"learning_rate": 1.8223627725945854e-05,
"loss": 2.2542,
"step": 63700
},
{
"epoch": 0.7303447960071432,
"grad_norm": 0.475724995136261,
"learning_rate": 1.8252246580046936e-05,
"loss": 2.2672,
"step": 63800
},
{
"epoch": 0.7314895370667155,
"grad_norm": 0.46552959084510803,
"learning_rate": 1.828086543414802e-05,
"loss": 2.276,
"step": 63900
},
{
"epoch": 0.7326342781262878,
"grad_norm": 0.4661727845668793,
"learning_rate": 1.83094842882491e-05,
"loss": 2.2629,
"step": 64000
},
{
"epoch": 0.7337790191858602,
"grad_norm": 0.5402230620384216,
"learning_rate": 1.833810314235018e-05,
"loss": 2.2578,
"step": 64100
},
{
"epoch": 0.7349237602454325,
"grad_norm": 0.5088352560997009,
"learning_rate": 1.8366721996451266e-05,
"loss": 2.2584,
"step": 64200
},
{
"epoch": 0.7360685013050048,
"grad_norm": 0.4990089535713196,
"learning_rate": 1.8395340850552345e-05,
"loss": 2.2565,
"step": 64300
},
{
"epoch": 0.7372132423645771,
"grad_norm": 0.4477214813232422,
"learning_rate": 1.8423959704653427e-05,
"loss": 2.2702,
"step": 64400
},
{
"epoch": 0.7383579834241495,
"grad_norm": 0.43167996406555176,
"learning_rate": 1.845257855875451e-05,
"loss": 2.2744,
"step": 64500
},
{
"epoch": 0.7395027244837218,
"grad_norm": 0.4676847457885742,
"learning_rate": 1.8481197412855592e-05,
"loss": 2.2624,
"step": 64600
},
{
"epoch": 0.7406474655432941,
"grad_norm": 0.5216718912124634,
"learning_rate": 1.8509816266956674e-05,
"loss": 2.2801,
"step": 64700
},
{
"epoch": 0.7417922066028664,
"grad_norm": 0.4484277665615082,
"learning_rate": 1.8538435121057753e-05,
"loss": 2.2567,
"step": 64800
},
{
"epoch": 0.7429369476624388,
"grad_norm": 0.47648686170578003,
"learning_rate": 1.8567053975158836e-05,
"loss": 2.2605,
"step": 64900
},
{
"epoch": 0.7440816887220111,
"grad_norm": 0.46372881531715393,
"learning_rate": 1.8595672829259918e-05,
"loss": 2.2673,
"step": 65000
},
{
"epoch": 0.7452264297815834,
"grad_norm": 0.43808409571647644,
"learning_rate": 1.8624291683361e-05,
"loss": 2.263,
"step": 65100
},
{
"epoch": 0.7463711708411558,
"grad_norm": 0.4345923364162445,
"learning_rate": 1.865291053746208e-05,
"loss": 2.2638,
"step": 65200
},
{
"epoch": 0.7475159119007281,
"grad_norm": 0.49368196725845337,
"learning_rate": 1.8681529391563162e-05,
"loss": 2.271,
"step": 65300
},
{
"epoch": 0.7486606529603004,
"grad_norm": 0.46279481053352356,
"learning_rate": 1.8710148245664244e-05,
"loss": 2.2721,
"step": 65400
},
{
"epoch": 0.7498053940198727,
"grad_norm": 0.423225998878479,
"learning_rate": 1.8738767099765327e-05,
"loss": 2.2634,
"step": 65500
},
{
"epoch": 0.7509501350794451,
"grad_norm": 0.48171648383140564,
"learning_rate": 1.876738595386641e-05,
"loss": 2.2656,
"step": 65600
},
{
"epoch": 0.7520948761390174,
"grad_norm": 0.41427454352378845,
"learning_rate": 1.879571861942648e-05,
"loss": 2.2663,
"step": 65700
},
{
"epoch": 0.7532396171985897,
"grad_norm": 0.5507281422615051,
"learning_rate": 1.8824337473527562e-05,
"loss": 2.2572,
"step": 65800
},
{
"epoch": 0.754384358258162,
"grad_norm": 0.44962623715400696,
"learning_rate": 1.8852956327628644e-05,
"loss": 2.2745,
"step": 65900
},
{
"epoch": 0.7555290993177344,
"grad_norm": 0.5174722671508789,
"learning_rate": 1.8881575181729727e-05,
"loss": 2.2387,
"step": 66000
},
{
"epoch": 0.7566738403773067,
"grad_norm": 0.5622259974479675,
"learning_rate": 1.8910194035830806e-05,
"loss": 2.2602,
"step": 66100
},
{
"epoch": 0.757818581436879,
"grad_norm": 0.5210707187652588,
"learning_rate": 1.8938812889931888e-05,
"loss": 2.2643,
"step": 66200
},
{
"epoch": 0.7589633224964513,
"grad_norm": 0.486509770154953,
"learning_rate": 1.896743174403297e-05,
"loss": 2.2707,
"step": 66300
},
{
"epoch": 0.7601080635560237,
"grad_norm": 0.47192618250846863,
"learning_rate": 1.8996050598134053e-05,
"loss": 2.2613,
"step": 66400
},
{
"epoch": 0.761252804615596,
"grad_norm": 0.5344927906990051,
"learning_rate": 1.9024669452235135e-05,
"loss": 2.2561,
"step": 66500
},
{
"epoch": 0.7623975456751683,
"grad_norm": 0.5391865372657776,
"learning_rate": 1.9053288306336214e-05,
"loss": 2.2763,
"step": 66600
},
{
"epoch": 0.7635422867347406,
"grad_norm": 0.45489776134490967,
"learning_rate": 1.90819071604373e-05,
"loss": 2.2611,
"step": 66700
},
{
"epoch": 0.764687027794313,
"grad_norm": 0.38119086623191833,
"learning_rate": 1.911052601453838e-05,
"loss": 2.2602,
"step": 66800
},
{
"epoch": 0.7658317688538853,
"grad_norm": 0.49369150400161743,
"learning_rate": 1.913914486863946e-05,
"loss": 2.2613,
"step": 66900
},
{
"epoch": 0.7669765099134576,
"grad_norm": 0.6532511711120605,
"learning_rate": 1.916776372274054e-05,
"loss": 2.268,
"step": 67000
},
{
"epoch": 0.7681212509730299,
"grad_norm": 0.4440617263317108,
"learning_rate": 1.9196382576841626e-05,
"loss": 2.2776,
"step": 67100
},
{
"epoch": 0.7692659920326023,
"grad_norm": 0.46082597970962524,
"learning_rate": 1.9225001430942705e-05,
"loss": 2.255,
"step": 67200
},
{
"epoch": 0.7704107330921746,
"grad_norm": 0.40288957953453064,
"learning_rate": 1.9253620285043788e-05,
"loss": 2.2652,
"step": 67300
},
{
"epoch": 0.7715554741517469,
"grad_norm": 0.51495760679245,
"learning_rate": 1.928223913914487e-05,
"loss": 2.2651,
"step": 67400
},
{
"epoch": 0.7727002152113192,
"grad_norm": 0.5247004628181458,
"learning_rate": 1.9310857993245953e-05,
"loss": 2.2534,
"step": 67500
},
{
"epoch": 0.7738449562708916,
"grad_norm": 0.4820224344730377,
"learning_rate": 1.9339476847347035e-05,
"loss": 2.2811,
"step": 67600
},
{
"epoch": 0.7749896973304639,
"grad_norm": 0.43811190128326416,
"learning_rate": 1.9368095701448114e-05,
"loss": 2.2638,
"step": 67700
},
{
"epoch": 0.7761344383900362,
"grad_norm": 0.4226974546909332,
"learning_rate": 1.9396714555549196e-05,
"loss": 2.2605,
"step": 67800
},
{
"epoch": 0.7772791794496084,
"grad_norm": 0.4748658835887909,
"learning_rate": 1.9425047221109267e-05,
"loss": 2.2617,
"step": 67900
},
{
"epoch": 0.7784239205091809,
"grad_norm": 0.4761633574962616,
"learning_rate": 1.9453666075210353e-05,
"loss": 2.2614,
"step": 68000
},
{
"epoch": 0.7795686615687532,
"grad_norm": 0.48032355308532715,
"learning_rate": 1.948228492931143e-05,
"loss": 2.2443,
"step": 68100
},
{
"epoch": 0.7807134026283254,
"grad_norm": 0.4990929961204529,
"learning_rate": 1.9510903783412514e-05,
"loss": 2.2702,
"step": 68200
},
{
"epoch": 0.7818581436878977,
"grad_norm": 0.40114450454711914,
"learning_rate": 1.9539522637513596e-05,
"loss": 2.2482,
"step": 68300
},
{
"epoch": 0.7830028847474702,
"grad_norm": 0.4980379343032837,
"learning_rate": 1.9567855303073667e-05,
"loss": 2.2706,
"step": 68400
},
{
"epoch": 0.7841476258070424,
"grad_norm": 0.42115017771720886,
"learning_rate": 1.9596474157174746e-05,
"loss": 2.2741,
"step": 68500
},
{
"epoch": 0.7852923668666147,
"grad_norm": 0.4727267622947693,
"learning_rate": 1.9625093011275832e-05,
"loss": 2.2649,
"step": 68600
},
{
"epoch": 0.7864371079261872,
"grad_norm": 0.47092244029045105,
"learning_rate": 1.965371186537691e-05,
"loss": 2.2482,
"step": 68700
},
{
"epoch": 0.7875818489857594,
"grad_norm": 0.4399222433567047,
"learning_rate": 1.9682330719477993e-05,
"loss": 2.2601,
"step": 68800
},
{
"epoch": 0.7887265900453317,
"grad_norm": 0.44569170475006104,
"learning_rate": 1.9710949573579076e-05,
"loss": 2.2679,
"step": 68900
},
{
"epoch": 0.789871331104904,
"grad_norm": 0.43348217010498047,
"learning_rate": 1.9739568427680158e-05,
"loss": 2.2529,
"step": 69000
},
{
"epoch": 0.7910160721644764,
"grad_norm": 0.4533138573169708,
"learning_rate": 1.976818728178124e-05,
"loss": 2.2634,
"step": 69100
},
{
"epoch": 0.7921608132240487,
"grad_norm": 0.5940411686897278,
"learning_rate": 1.979680613588232e-05,
"loss": 2.2526,
"step": 69200
},
{
"epoch": 0.793305554283621,
"grad_norm": 0.4553944766521454,
"learning_rate": 1.9825424989983402e-05,
"loss": 2.2514,
"step": 69300
},
{
"epoch": 0.7944502953431933,
"grad_norm": 0.5016659498214722,
"learning_rate": 1.9854043844084484e-05,
"loss": 2.2452,
"step": 69400
},
{
"epoch": 0.7955950364027657,
"grad_norm": 0.45411109924316406,
"learning_rate": 1.9882662698185567e-05,
"loss": 2.2576,
"step": 69500
},
{
"epoch": 0.796739777462338,
"grad_norm": 0.3970607817173004,
"learning_rate": 1.991128155228665e-05,
"loss": 2.2403,
"step": 69600
},
{
"epoch": 0.7978845185219103,
"grad_norm": 0.47627323865890503,
"learning_rate": 1.9939900406387728e-05,
"loss": 2.2614,
"step": 69700
},
{
"epoch": 0.7990292595814826,
"grad_norm": 0.4684958755970001,
"learning_rate": 1.9968519260488814e-05,
"loss": 2.2614,
"step": 69800
},
{
"epoch": 0.800174000641055,
"grad_norm": 0.418066143989563,
"learning_rate": 1.9997138114589893e-05,
"loss": 2.2627,
"step": 69900
},
{
"epoch": 0.8013187417006273,
"grad_norm": 0.43731051683425903,
"learning_rate": 1.9998690637047088e-05,
"loss": 2.2473,
"step": 70000
},
{
"epoch": 0.8024634827601996,
"grad_norm": 0.43731534481048584,
"learning_rate": 1.999416488433588e-05,
"loss": 2.2512,
"step": 70100
},
{
"epoch": 0.8036082238197719,
"grad_norm": 0.46741268038749695,
"learning_rate": 1.99864080397093e-05,
"loss": 2.2373,
"step": 70200
},
{
"epoch": 0.8047529648793443,
"grad_norm": 0.4025750160217285,
"learning_rate": 1.9975422610938463e-05,
"loss": 2.2711,
"step": 70300
},
{
"epoch": 0.8058977059389166,
"grad_norm": 0.5251903533935547,
"learning_rate": 1.996121214958875e-05,
"loss": 2.2627,
"step": 70400
},
{
"epoch": 0.8070424469984889,
"grad_norm": 0.5502318143844604,
"learning_rate": 1.9943781249871618e-05,
"loss": 2.2615,
"step": 70500
},
{
"epoch": 0.8081871880580612,
"grad_norm": 0.4485688805580139,
"learning_rate": 1.992313554715929e-05,
"loss": 2.2486,
"step": 70600
},
{
"epoch": 0.8093319291176336,
"grad_norm": 0.42768144607543945,
"learning_rate": 1.9899281716162846e-05,
"loss": 2.2602,
"step": 70700
},
{
"epoch": 0.8104766701772059,
"grad_norm": 0.446748286485672,
"learning_rate": 1.987222746877431e-05,
"loss": 2.2705,
"step": 70800
},
{
"epoch": 0.8116214112367782,
"grad_norm": 0.4175347685813904,
"learning_rate": 1.9841981551573424e-05,
"loss": 2.2437,
"step": 70900
},
{
"epoch": 0.8127661522963505,
"grad_norm": 0.4600895047187805,
"learning_rate": 1.9808553742999863e-05,
"loss": 2.2592,
"step": 71000
},
{
"epoch": 0.8139108933559229,
"grad_norm": 0.4427413046360016,
"learning_rate": 1.9771954850191927e-05,
"loss": 2.2508,
"step": 71100
},
{
"epoch": 0.8150556344154952,
"grad_norm": 0.41499242186546326,
"learning_rate": 1.973219670549259e-05,
"loss": 2.2677,
"step": 71200
},
{
"epoch": 0.8162003754750675,
"grad_norm": 0.48436206579208374,
"learning_rate": 1.9689292162624135e-05,
"loss": 2.2569,
"step": 71300
},
{
"epoch": 0.8173451165346398,
"grad_norm": 0.6114481687545776,
"learning_rate": 1.9643255092532582e-05,
"loss": 2.262,
"step": 71400
},
{
"epoch": 0.8184898575942122,
"grad_norm": 0.489258348941803,
"learning_rate": 1.959410037890323e-05,
"loss": 2.2376,
"step": 71500
},
{
"epoch": 0.8196345986537845,
"grad_norm": 0.5034488439559937,
"learning_rate": 1.9541843913348804e-05,
"loss": 2.2542,
"step": 71600
},
{
"epoch": 0.8207793397133568,
"grad_norm": 0.4198042154312134,
"learning_rate": 1.948650259027172e-05,
"loss": 2.2509,
"step": 71700
},
{
"epoch": 0.8219240807729291,
"grad_norm": 0.462247759103775,
"learning_rate": 1.9428094301402164e-05,
"loss": 2.2493,
"step": 71800
},
{
"epoch": 0.8230688218325015,
"grad_norm": 0.4464458227157593,
"learning_rate": 1.936663793001374e-05,
"loss": 2.2645,
"step": 71900
},
{
"epoch": 0.8242135628920738,
"grad_norm": 0.5643649697303772,
"learning_rate": 1.930215334481855e-05,
"loss": 2.2455,
"step": 72000
},
{
"epoch": 0.8253583039516461,
"grad_norm": 0.48486921191215515,
"learning_rate": 1.9234661393543668e-05,
"loss": 2.256,
"step": 72100
},
{
"epoch": 0.8265030450112185,
"grad_norm": 0.4897540807723999,
"learning_rate": 1.9164183896191133e-05,
"loss": 2.2453,
"step": 72200
},
{
"epoch": 0.8276477860707908,
"grad_norm": 0.47083780169487,
"learning_rate": 1.9090743637983577e-05,
"loss": 2.239,
"step": 72300
},
{
"epoch": 0.8287925271303631,
"grad_norm": 0.42039257287979126,
"learning_rate": 1.9014364361997813e-05,
"loss": 2.2483,
"step": 72400
},
{
"epoch": 0.8299372681899354,
"grad_norm": 0.5170241594314575,
"learning_rate": 1.8935070761488754e-05,
"loss": 2.2686,
"step": 72500
},
{
"epoch": 0.8310820092495078,
"grad_norm": 0.46110501885414124,
"learning_rate": 1.885288847190614e-05,
"loss": 2.2444,
"step": 72600
},
{
"epoch": 0.8322267503090801,
"grad_norm": 0.48973456025123596,
"learning_rate": 1.876784406260664e-05,
"loss": 2.2475,
"step": 72700
},
{
"epoch": 0.8333714913686524,
"grad_norm": 0.45546847581863403,
"learning_rate": 1.8679965028264055e-05,
"loss": 2.2519,
"step": 72800
},
{
"epoch": 0.8345162324282247,
"grad_norm": 0.49703338742256165,
"learning_rate": 1.859020042770291e-05,
"loss": 2.2518,
"step": 72900
},
{
"epoch": 0.8356609734877971,
"grad_norm": 0.494150310754776,
"learning_rate": 1.8496765904697226e-05,
"loss": 2.2682,
"step": 73000
},
{
"epoch": 0.8368057145473694,
"grad_norm": 0.46213245391845703,
"learning_rate": 1.840058439563126e-05,
"loss": 2.2499,
"step": 73100
},
{
"epoch": 0.8379504556069417,
"grad_norm": 0.40609198808670044,
"learning_rate": 1.830168699577909e-05,
"loss": 2.2706,
"step": 73200
},
{
"epoch": 0.839095196666514,
"grad_norm": 0.6065017580986023,
"learning_rate": 1.820010567845644e-05,
"loss": 2.2553,
"step": 73300
},
{
"epoch": 0.8402399377260864,
"grad_norm": 0.44308215379714966,
"learning_rate": 1.809587328468373e-05,
"loss": 2.2503,
"step": 73400
},
{
"epoch": 0.8413846787856587,
"grad_norm": 0.41164156794548035,
"learning_rate": 1.7989023512568686e-05,
"loss": 2.2491,
"step": 73500
},
{
"epoch": 0.842529419845231,
"grad_norm": 0.4938010573387146,
"learning_rate": 1.7879590906411786e-05,
"loss": 2.2581,
"step": 73600
},
{
"epoch": 0.8436741609048033,
"grad_norm": 0.4634738862514496,
"learning_rate": 1.7767610845538178e-05,
"loss": 2.2665,
"step": 73700
},
{
"epoch": 0.8448189019643757,
"grad_norm": 0.43750569224357605,
"learning_rate": 1.7653119532859626e-05,
"loss": 2.2497,
"step": 73800
},
{
"epoch": 0.845963643023948,
"grad_norm": 0.5143136978149414,
"learning_rate": 1.7536153983170157e-05,
"loss": 2.2318,
"step": 73900
},
{
"epoch": 0.8471083840835203,
"grad_norm": 0.5141619443893433,
"learning_rate": 1.7416752011179294e-05,
"loss": 2.2418,
"step": 74000
},
{
"epoch": 0.8482531251430926,
"grad_norm": 0.46615302562713623,
"learning_rate": 1.7296181957754126e-05,
"loss": 2.2496,
"step": 74100
},
{
"epoch": 0.849397866202665,
"grad_norm": 0.4517097473144531,
"learning_rate": 1.71720471105587e-05,
"loss": 2.2501,
"step": 74200
},
{
"epoch": 0.8505426072622373,
"grad_norm": 0.4446674883365631,
"learning_rate": 1.7045593556027164e-05,
"loss": 2.2487,
"step": 74300
},
{
"epoch": 0.8516873483218096,
"grad_norm": 0.5222780108451843,
"learning_rate": 1.691686217632051e-05,
"loss": 2.2495,
"step": 74400
},
{
"epoch": 0.8528320893813819,
"grad_norm": 0.4754522442817688,
"learning_rate": 1.678589459001567e-05,
"loss": 2.251,
"step": 74500
},
{
"epoch": 0.8539768304409543,
"grad_norm": 0.46791428327560425,
"learning_rate": 1.6652733138650367e-05,
"loss": 2.2502,
"step": 74600
},
{
"epoch": 0.8551215715005266,
"grad_norm": 0.5582058429718018,
"learning_rate": 1.651742087303412e-05,
"loss": 2.249,
"step": 74700
},
{
"epoch": 0.8562663125600989,
"grad_norm": 0.5812872648239136,
"learning_rate": 1.6380001539330088e-05,
"loss": 2.2402,
"step": 74800
},
{
"epoch": 0.8574110536196712,
"grad_norm": 0.4505755305290222,
"learning_rate": 1.624051956491196e-05,
"loss": 2.2445,
"step": 74900
},
{
"epoch": 0.8585557946792436,
"grad_norm": 0.46958789229393005,
"learning_rate": 1.609902004400073e-05,
"loss": 2.2729,
"step": 75000
},
{
"epoch": 0.8597005357388159,
"grad_norm": 0.44104379415512085,
"learning_rate": 1.5955548723085804e-05,
"loss": 2.2429,
"step": 75100
},
{
"epoch": 0.8608452767983882,
"grad_norm": 0.4598877429962158,
"learning_rate": 1.581015198613528e-05,
"loss": 2.2471,
"step": 75200
},
{
"epoch": 0.8619900178579605,
"grad_norm": 0.43592801690101624,
"learning_rate": 1.5662876839600084e-05,
"loss": 2.2537,
"step": 75300
},
{
"epoch": 0.8631347589175329,
"grad_norm": 0.46111100912094116,
"learning_rate": 1.551377089721692e-05,
"loss": 2.2262,
"step": 75400
},
{
"epoch": 0.8642794999771052,
"grad_norm": 0.4764077961444855,
"learning_rate": 1.5362882364614825e-05,
"loss": 2.2556,
"step": 75500
},
{
"epoch": 0.8654242410366775,
"grad_norm": 0.5170374512672424,
"learning_rate": 1.5210260023730402e-05,
"loss": 2.2654,
"step": 75600
},
{
"epoch": 0.8665689820962498,
"grad_norm": 0.4534991383552551,
"learning_rate": 1.5055953217036735e-05,
"loss": 2.2647,
"step": 75700
},
{
"epoch": 0.8677137231558222,
"grad_norm": 0.42923828959465027,
"learning_rate": 1.490001183159105e-05,
"loss": 2.2366,
"step": 75800
},
{
"epoch": 0.8688584642153945,
"grad_norm": 0.5356833934783936,
"learning_rate": 1.474248628290637e-05,
"loss": 2.2459,
"step": 75900
},
{
"epoch": 0.8700032052749668,
"grad_norm": 0.5153579115867615,
"learning_rate": 1.4583427498652252e-05,
"loss": 2.2416,
"step": 76000
},
{
"epoch": 0.8711479463345392,
"grad_norm": 0.48512566089630127,
"learning_rate": 1.4422886902190014e-05,
"loss": 2.2519,
"step": 76100
},
{
"epoch": 0.8722926873941115,
"grad_norm": 0.5230854153633118,
"learning_rate": 1.4260916395947657e-05,
"loss": 2.2429,
"step": 76200
},
{
"epoch": 0.8734374284536838,
"grad_norm": 0.4530618190765381,
"learning_rate": 1.4097568344639916e-05,
"loss": 2.2675,
"step": 76300
},
{
"epoch": 0.8745821695132561,
"grad_norm": 0.4915354251861572,
"learning_rate": 1.3932895558338879e-05,
"loss": 2.2443,
"step": 76400
},
{
"epoch": 0.8757269105728285,
"grad_norm": 0.437199205160141,
"learning_rate": 1.3766951275400596e-05,
"loss": 2.2549,
"step": 76500
},
{
"epoch": 0.8768716516324008,
"grad_norm": 0.48987364768981934,
"learning_rate": 1.3599789145253226e-05,
"loss": 2.2294,
"step": 76600
},
{
"epoch": 0.8780163926919731,
"grad_norm": 0.40682971477508545,
"learning_rate": 1.34314632110523e-05,
"loss": 2.2404,
"step": 76700
},
{
"epoch": 0.8791611337515454,
"grad_norm": 0.4797169268131256,
"learning_rate": 1.3262027892208696e-05,
"loss": 2.2613,
"step": 76800
},
{
"epoch": 0.8803058748111178,
"grad_norm": 0.5203677415847778,
"learning_rate": 1.3091537966794933e-05,
"loss": 2.2509,
"step": 76900
},
{
"epoch": 0.8814506158706901,
"grad_norm": 0.519087553024292,
"learning_rate": 1.2920048553835574e-05,
"loss": 2.2625,
"step": 77000
},
{
"epoch": 0.8825953569302624,
"grad_norm": 0.4839503765106201,
"learning_rate": 1.2747615095487331e-05,
"loss": 2.2487,
"step": 77100
},
{
"epoch": 0.8837400979898347,
"grad_norm": 0.4150638282299042,
"learning_rate": 1.2574293339114757e-05,
"loss": 2.254,
"step": 77200
},
{
"epoch": 0.8848848390494071,
"grad_norm": 0.5131925940513611,
"learning_rate": 1.240013931926724e-05,
"loss": 2.238,
"step": 77300
},
{
"epoch": 0.8860295801089794,
"grad_norm": 0.5299201607704163,
"learning_rate": 1.2225209339563144e-05,
"loss": 2.2517,
"step": 77400
},
{
"epoch": 0.8871743211685517,
"grad_norm": 0.4446033239364624,
"learning_rate": 1.204955995448699e-05,
"loss": 2.2521,
"step": 77500
},
{
"epoch": 0.888319062228124,
"grad_norm": 0.4393691122531891,
"learning_rate": 1.1873247951105489e-05,
"loss": 2.2634,
"step": 77600
},
{
"epoch": 0.8894638032876964,
"grad_norm": 0.48134180903434753,
"learning_rate": 1.1696330330708421e-05,
"loss": 2.2478,
"step": 77700
},
{
"epoch": 0.8906085443472687,
"grad_norm": 0.4115225076675415,
"learning_rate": 1.1518864290380249e-05,
"loss": 2.2197,
"step": 77800
},
{
"epoch": 0.891753285406841,
"grad_norm": 0.47142502665519714,
"learning_rate": 1.1340907204508403e-05,
"loss": 2.2344,
"step": 77900
},
{
"epoch": 0.8928980264664133,
"grad_norm": 0.5209864377975464,
"learning_rate": 1.1162516606234276e-05,
"loss": 2.2486,
"step": 78000
},
{
"epoch": 0.8940427675259857,
"grad_norm": 0.4119342267513275,
"learning_rate": 1.09837501688529e-05,
"loss": 2.2425,
"step": 78100
},
{
"epoch": 0.895187508585558,
"grad_norm": 0.4754474461078644,
"learning_rate": 1.0806457916272542e-05,
"loss": 2.246,
"step": 78200
},
{
"epoch": 0.8963322496451303,
"grad_norm": 0.511458694934845,
"learning_rate": 1.0627115602490508e-05,
"loss": 2.2453,
"step": 78300
},
{
"epoch": 0.8974769907047025,
"grad_norm": 0.5435287356376648,
"learning_rate": 1.0447570543589034e-05,
"loss": 2.2452,
"step": 78400
},
{
"epoch": 0.898621731764275,
"grad_norm": 0.4102800190448761,
"learning_rate": 1.0267880786097762e-05,
"loss": 2.2459,
"step": 78500
},
{
"epoch": 0.8997664728238473,
"grad_norm": 0.3983183801174164,
"learning_rate": 1.0088104423327082e-05,
"loss": 2.2274,
"step": 78600
},
{
"epoch": 0.9009112138834195,
"grad_norm": 0.44764354825019836,
"learning_rate": 9.908299576586684e-06,
"loss": 2.2512,
"step": 78700
},
{
"epoch": 0.9020559549429918,
"grad_norm": 0.5015277862548828,
"learning_rate": 9.728524376395068e-06,
"loss": 2.244,
"step": 78800
},
{
"epoch": 0.9032006960025643,
"grad_norm": 0.4522726237773895,
"learning_rate": 9.548836943686055e-06,
"loss": 2.2443,
"step": 78900
},
{
"epoch": 0.9043454370621365,
"grad_norm": 0.4915413558483124,
"learning_rate": 9.369295371018442e-06,
"loss": 2.246,
"step": 79000
},
{
"epoch": 0.9054901781217088,
"grad_norm": 0.45682036876678467,
"learning_rate": 9.18995770379478e-06,
"loss": 2.2353,
"step": 79100
},
{
"epoch": 0.9066349191812811,
"grad_norm": 0.41946882009506226,
"learning_rate": 9.010881921495438e-06,
"loss": 2.2468,
"step": 79200
},
{
"epoch": 0.9077796602408535,
"grad_norm": 0.4232065975666046,
"learning_rate": 8.832125918933955e-06,
"loss": 2.2575,
"step": 79300
},
{
"epoch": 0.9089244013004258,
"grad_norm": 0.4463779926300049,
"learning_rate": 8.653747487539764e-06,
"loss": 2.2379,
"step": 79400
},
{
"epoch": 0.9100691423599981,
"grad_norm": 0.43841028213500977,
"learning_rate": 8.47580429667436e-06,
"loss": 2.2414,
"step": 79500
},
{
"epoch": 0.9112138834195705,
"grad_norm": 0.43061941862106323,
"learning_rate": 8.29835387498692e-06,
"loss": 2.2475,
"step": 79600
},
{
"epoch": 0.9123586244791428,
"grad_norm": 0.41848114132881165,
"learning_rate": 8.121453591815401e-06,
"loss": 2.2535,
"step": 79700
},
{
"epoch": 0.9135033655387151,
"grad_norm": 0.4669823944568634,
"learning_rate": 7.94516063863917e-06,
"loss": 2.2551,
"step": 79800
},
{
"epoch": 0.9146481065982874,
"grad_norm": 0.4634804129600525,
"learning_rate": 7.769532010589123e-06,
"loss": 2.2393,
"step": 79900
},
{
"epoch": 0.9157928476578598,
"grad_norm": 0.43346285820007324,
"learning_rate": 7.594624488021274e-06,
"loss": 2.2607,
"step": 80000
},
{
"epoch": 0.9169375887174321,
"grad_norm": 0.43006187677383423,
"learning_rate": 7.420494618159791e-06,
"loss": 2.2408,
"step": 80100
},
{
"epoch": 0.9180823297770044,
"grad_norm": 0.577170729637146,
"learning_rate": 7.2471986968154075e-06,
"loss": 2.2421,
"step": 80200
},
{
"epoch": 0.9192270708365767,
"grad_norm": 0.43540096282958984,
"learning_rate": 7.074792750185093e-06,
"loss": 2.2577,
"step": 80300
},
{
"epoch": 0.9203718118961491,
"grad_norm": 0.41239428520202637,
"learning_rate": 6.905042255517393e-06,
"loss": 2.2246,
"step": 80400
},
{
"epoch": 0.9215165529557214,
"grad_norm": 0.4757671654224396,
"learning_rate": 6.734572883175328e-06,
"loss": 2.243,
"step": 80500
},
{
"epoch": 0.9226612940152937,
"grad_norm": 0.5074242353439331,
"learning_rate": 6.5651592163653885e-06,
"loss": 2.243,
"step": 80600
},
{
"epoch": 0.923806035074866,
"grad_norm": 0.4626462459564209,
"learning_rate": 6.3968560261600545e-06,
"loss": 2.2369,
"step": 80700
},
{
"epoch": 0.9249507761344384,
"grad_norm": 0.4515824317932129,
"learning_rate": 6.229717724617108e-06,
"loss": 2.2479,
"step": 80800
},
{
"epoch": 0.9260955171940107,
"grad_norm": 0.43820834159851074,
"learning_rate": 6.063798347188343e-06,
"loss": 2.2437,
"step": 80900
},
{
"epoch": 0.927240258253583,
"grad_norm": 0.47702258825302124,
"learning_rate": 5.899151535250031e-06,
"loss": 2.2382,
"step": 81000
},
{
"epoch": 0.9283849993131553,
"grad_norm": 0.4041842520236969,
"learning_rate": 5.735830518760757e-06,
"loss": 2.2412,
"step": 81100
},
{
"epoch": 0.9295297403727277,
"grad_norm": 0.5503548979759216,
"learning_rate": 5.573888099052307e-06,
"loss": 2.2515,
"step": 81200
},
{
"epoch": 0.9306744814323,
"grad_norm": 0.4641313850879669,
"learning_rate": 5.413376631759115e-06,
"loss": 2.2299,
"step": 81300
},
{
"epoch": 0.9318192224918723,
"grad_norm": 0.4651617109775543,
"learning_rate": 5.254348009891777e-06,
"loss": 2.2394,
"step": 81400
},
{
"epoch": 0.9329639635514446,
"grad_norm": 0.5168155431747437,
"learning_rate": 5.096853647060169e-06,
"loss": 2.2464,
"step": 81500
},
{
"epoch": 0.934108704611017,
"grad_norm": 0.48371872305870056,
"learning_rate": 4.940944460851545e-06,
"loss": 2.2488,
"step": 81600
},
{
"epoch": 0.9352534456705893,
"grad_norm": 0.4888920187950134,
"learning_rate": 4.7866708563689654e-06,
"loss": 2.2428,
"step": 81700
},
{
"epoch": 0.9363981867301616,
"grad_norm": 0.4613405764102936,
"learning_rate": 4.635600085943046e-06,
"loss": 2.2641,
"step": 81800
},
{
"epoch": 0.9375429277897339,
"grad_norm": 0.39950716495513916,
"learning_rate": 4.484729138719958e-06,
"loss": 2.2513,
"step": 81900
},
{
"epoch": 0.9386876688493063,
"grad_norm": 0.44666656851768494,
"learning_rate": 4.335641266650937e-06,
"loss": 2.2352,
"step": 82000
},
{
"epoch": 0.9398324099088786,
"grad_norm": 0.6108360886573792,
"learning_rate": 4.188384669522936e-06,
"loss": 2.2526,
"step": 82100
},
{
"epoch": 0.9409771509684509,
"grad_norm": 0.4861396551132202,
"learning_rate": 4.043006955075667e-06,
"loss": 2.2421,
"step": 82200
},
{
"epoch": 0.9421218920280232,
"grad_norm": 0.4066333472728729,
"learning_rate": 3.899555123610131e-06,
"loss": 2.2425,
"step": 82300
},
{
"epoch": 0.9432666330875956,
"grad_norm": 0.47433343529701233,
"learning_rate": 3.7580755527935232e-06,
"loss": 2.2357,
"step": 82400
},
{
"epoch": 0.9444113741471679,
"grad_norm": 0.447336882352829,
"learning_rate": 3.6186139826654253e-06,
"loss": 2.2458,
"step": 82500
},
{
"epoch": 0.9455561152067402,
"grad_norm": 0.48044124245643616,
"learning_rate": 3.4812155008501692e-06,
"loss": 2.2451,
"step": 82600
},
{
"epoch": 0.9467008562663125,
"grad_norm": 0.43815353512763977,
"learning_rate": 3.3459245279800846e-06,
"loss": 2.2457,
"step": 82700
},
{
"epoch": 0.9478455973258849,
"grad_norm": 0.4744986295700073,
"learning_rate": 3.2127848033344124e-06,
"loss": 2.2303,
"step": 82800
},
{
"epoch": 0.9489903383854572,
"grad_norm": 0.46926042437553406,
"learning_rate": 3.0818393706984906e-06,
"loss": 2.2346,
"step": 82900
},
{
"epoch": 0.9501350794450295,
"grad_norm": 0.423065721988678,
"learning_rate": 2.9531305644477883e-06,
"loss": 2.2323,
"step": 83000
},
{
"epoch": 0.9512798205046019,
"grad_norm": 0.4213075339794159,
"learning_rate": 2.8266999958613017e-06,
"loss": 2.2362,
"step": 83100
},
{
"epoch": 0.9524245615641742,
"grad_norm": 0.45430952310562134,
"learning_rate": 2.7025885396687145e-06,
"loss": 2.2441,
"step": 83200
},
{
"epoch": 0.9535693026237465,
"grad_norm": 0.48012691736221313,
"learning_rate": 2.5808363208356746e-06,
"loss": 2.2623,
"step": 83300
},
{
"epoch": 0.9547140436833188,
"grad_norm": 0.4386986494064331,
"learning_rate": 2.461482701591493e-06,
"loss": 2.2329,
"step": 83400
},
{
"epoch": 0.9558587847428912,
"grad_norm": 0.4368671476840973,
"learning_rate": 2.3445662687034143e-06,
"loss": 2.2391,
"step": 83500
},
{
"epoch": 0.9570035258024635,
"grad_norm": 0.4472872316837311,
"learning_rate": 2.2301248210016024e-06,
"loss": 2.2378,
"step": 83600
},
{
"epoch": 0.9581482668620358,
"grad_norm": 0.4506527781486511,
"learning_rate": 2.1181953571588596e-06,
"loss": 2.2362,
"step": 83700
},
{
"epoch": 0.9592930079216081,
"grad_norm": 0.4752218425273895,
"learning_rate": 2.0088140637290265e-06,
"loss": 2.2397,
"step": 83800
},
{
"epoch": 0.9604377489811805,
"grad_norm": 0.4612346589565277,
"learning_rate": 1.9020163034479567e-06,
"loss": 2.2447,
"step": 83900
},
{
"epoch": 0.9615824900407528,
"grad_norm": 0.49490582942962646,
"learning_rate": 1.7978366038008234e-06,
"loss": 2.243,
"step": 84000
},
{
"epoch": 0.9627272311003251,
"grad_norm": 0.4314175844192505,
"learning_rate": 1.696308645859447e-06,
"loss": 2.2398,
"step": 84100
},
{
"epoch": 0.9638719721598974,
"grad_norm": 0.4729178249835968,
"learning_rate": 1.5974652533932833e-06,
"loss": 2.2528,
"step": 84200
},
{
"epoch": 0.9650167132194698,
"grad_norm": 0.45053282380104065,
"learning_rate": 1.5013383822575766e-06,
"loss": 2.2393,
"step": 84300
},
{
"epoch": 0.9661614542790421,
"grad_norm": 0.3972727358341217,
"learning_rate": 1.4079591100620837e-06,
"loss": 2.2551,
"step": 84400
},
{
"epoch": 0.9673061953386144,
"grad_norm": 0.4374849200248718,
"learning_rate": 1.317357626123772e-06,
"loss": 2.2434,
"step": 84500
},
{
"epoch": 0.9684509363981867,
"grad_norm": 0.44703078269958496,
"learning_rate": 1.2295632217066567e-06,
"loss": 2.2446,
"step": 84600
},
{
"epoch": 0.9695956774577591,
"grad_norm": 0.49270525574684143,
"learning_rate": 1.1446042805520098e-06,
"loss": 2.2554,
"step": 84700
},
{
"epoch": 0.9707404185173314,
"grad_norm": 0.45350950956344604,
"learning_rate": 1.062508269701963e-06,
"loss": 2.2649,
"step": 84800
},
{
"epoch": 0.9718851595769037,
"grad_norm": 0.45871463418006897,
"learning_rate": 9.833017306194558e-07,
"loss": 2.2386,
"step": 84900
},
{
"epoch": 0.973029900636476,
"grad_norm": 0.4371441900730133,
"learning_rate": 9.07010270607459e-07,
"loss": 2.2543,
"step": 85000
},
{
"epoch": 0.9741746416960484,
"grad_norm": 0.47834667563438416,
"learning_rate": 8.33658554530169e-07,
"loss": 2.2385,
"step": 85100
},
{
"epoch": 0.9753193827556207,
"grad_norm": 0.5165786743164062,
"learning_rate": 7.632702968389205e-07,
"loss": 2.2423,
"step": 85200
},
{
"epoch": 0.976464123815193,
"grad_norm": 0.4625995457172394,
"learning_rate": 6.958682539053563e-07,
"loss": 2.2618,
"step": 85300
},
{
"epoch": 0.9776088648747653,
"grad_norm": 0.5300964713096619,
"learning_rate": 6.314742166643406e-07,
"loss": 2.2383,
"step": 85400
},
{
"epoch": 0.9787536059343377,
"grad_norm": 0.49308186769485474,
"learning_rate": 5.701090035689949e-07,
"loss": 2.2657,
"step": 85500
},
{
"epoch": 0.97989834699391,
"grad_norm": 0.4634435176849365,
"learning_rate": 5.117924538601371e-07,
"loss": 2.2347,
"step": 85600
},
{
"epoch": 0.9810430880534823,
"grad_norm": 0.4959612488746643,
"learning_rate": 4.565434211522979e-07,
"loss": 2.2421,
"step": 85700
},
{
"epoch": 0.9821878291130546,
"grad_norm": 0.43508780002593994,
"learning_rate": 4.0437976733838757e-07,
"loss": 2.2601,
"step": 85800
},
{
"epoch": 0.983332570172627,
"grad_norm": 0.4364226460456848,
"learning_rate": 3.557935619007491e-07,
"loss": 2.2362,
"step": 85900
},
{
"epoch": 0.9844773112321993,
"grad_norm": 0.4956010580062866,
"learning_rate": 3.09818999860273e-07,
"loss": 2.2456,
"step": 86000
},
{
"epoch": 0.9856220522917716,
"grad_norm": 0.45080068707466125,
"learning_rate": 2.6697725240214076e-07,
"loss": 2.2333,
"step": 86100
},
{
"epoch": 0.9867667933513439,
"grad_norm": 0.47665461897850037,
"learning_rate": 2.2728217017075395e-07,
"loss": 2.2434,
"step": 86200
},
{
"epoch": 0.9879115344109163,
"grad_norm": 0.48589542508125305,
"learning_rate": 1.9074658650043764e-07,
"loss": 2.2461,
"step": 86300
},
{
"epoch": 0.9890562754704886,
"grad_norm": 0.47914570569992065,
"learning_rate": 1.5738231326645758e-07,
"loss": 2.2246,
"step": 86400
},
{
"epoch": 0.9902010165300609,
"grad_norm": 0.5974397659301758,
"learning_rate": 1.2720013706627122e-07,
"loss": 2.227,
"step": 86500
},
{
"epoch": 0.9913457575896332,
"grad_norm": 0.5031090378761292,
"learning_rate": 1.002098157322362e-07,
"loss": 2.2415,
"step": 86600
},
{
"epoch": 0.9924904986492056,
"grad_norm": 0.470460444688797,
"learning_rate": 7.642007517693062e-08,
"loss": 2.2416,
"step": 86700
},
{
"epoch": 0.9936352397087779,
"grad_norm": 0.4445749521255493,
"learning_rate": 5.5838606572078404e-08,
"loss": 2.228,
"step": 86800
},
{
"epoch": 0.9947799807683502,
"grad_norm": 0.45365962386131287,
"learning_rate": 3.847206386201507e-08,
"loss": 2.2488,
"step": 86900
},
{
"epoch": 0.9959247218279226,
"grad_norm": 0.4715825319290161,
"learning_rate": 2.4326061612479633e-08,
"loss": 2.2225,
"step": 87000
},
{
"epoch": 0.9970694628874949,
"grad_norm": 0.4650324583053589,
"learning_rate": 1.340517319543877e-08,
"loss": 2.2315,
"step": 87100
},
{
"epoch": 0.9982142039470672,
"grad_norm": 0.4570050537586212,
"learning_rate": 5.712929310521809e-09,
"loss": 2.2593,
"step": 87200
},
{
"epoch": 0.9993589450066395,
"grad_norm": 0.47539758682250977,
"learning_rate": 1.2518168435593502e-09,
"loss": 2.2317,
"step": 87300
},
{
"epoch": 1.0,
"step": 87356,
"total_flos": 1.2729340395184456e+19,
"train_loss": 2.3255439768606525,
"train_runtime": 52634.826,
"train_samples_per_second": 26.555,
"train_steps_per_second": 1.66
}
],
"logging_steps": 100,
"max_steps": 87356,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2729340395184456e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}