{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 184080,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04074315514993481,
"grad_norm": 1.712476134300232,
"learning_rate": 0.00029918513689700127,
"loss": 1.0726,
"step": 500
},
{
"epoch": 0.08148631029986962,
"grad_norm": 1.8538917303085327,
"learning_rate": 0.0002983702737940026,
"loss": 0.8568,
"step": 1000
},
{
"epoch": 0.12222946544980444,
"grad_norm": 3.2072715759277344,
"learning_rate": 0.0002975554106910039,
"loss": 0.7747,
"step": 1500
},
{
"epoch": 0.16297262059973924,
"grad_norm": 2.7058048248291016,
"learning_rate": 0.0002967405475880052,
"loss": 0.7254,
"step": 2000
},
{
"epoch": 0.20371577574967406,
"grad_norm": 2.862517833709717,
"learning_rate": 0.0002959256844850065,
"loss": 0.6899,
"step": 2500
},
{
"epoch": 0.24445893089960888,
"grad_norm": 1.9533528089523315,
"learning_rate": 0.0002951108213820078,
"loss": 0.6615,
"step": 3000
},
{
"epoch": 0.28520208604954367,
"grad_norm": 1.9216587543487549,
"learning_rate": 0.0002942959582790091,
"loss": 0.6518,
"step": 3500
},
{
"epoch": 0.3259452411994785,
"grad_norm": 2.019871711730957,
"learning_rate": 0.0002934810951760104,
"loss": 0.6282,
"step": 4000
},
{
"epoch": 0.3666883963494133,
"grad_norm": 2.8539257049560547,
"learning_rate": 0.0002926662320730117,
"loss": 0.6356,
"step": 4500
},
{
"epoch": 0.4074315514993481,
"grad_norm": 2.5572264194488525,
"learning_rate": 0.000291851368970013,
"loss": 0.6119,
"step": 5000
},
{
"epoch": 0.44817470664928294,
"grad_norm": 2.307138681411743,
"learning_rate": 0.0002910365058670143,
"loss": 0.6074,
"step": 5500
},
{
"epoch": 0.48891786179921776,
"grad_norm": 1.5693212747573853,
"learning_rate": 0.0002902216427640156,
"loss": 0.5995,
"step": 6000
},
{
"epoch": 0.5296610169491526,
"grad_norm": 1.5355511903762817,
"learning_rate": 0.00028940677966101693,
"loss": 0.5984,
"step": 6500
},
{
"epoch": 0.5704041720990873,
"grad_norm": 2.0896735191345215,
"learning_rate": 0.0002885919165580182,
"loss": 0.5802,
"step": 7000
},
{
"epoch": 0.6111473272490222,
"grad_norm": 2.17629075050354,
"learning_rate": 0.00028777705345501956,
"loss": 0.5848,
"step": 7500
},
{
"epoch": 0.651890482398957,
"grad_norm": 1.2360671758651733,
"learning_rate": 0.00028696219035202086,
"loss": 0.5867,
"step": 8000
},
{
"epoch": 0.6926336375488917,
"grad_norm": 1.9337974786758423,
"learning_rate": 0.00028614732724902215,
"loss": 0.5769,
"step": 8500
},
{
"epoch": 0.7333767926988266,
"grad_norm": 1.8343297243118286,
"learning_rate": 0.00028533246414602344,
"loss": 0.5785,
"step": 9000
},
{
"epoch": 0.7741199478487614,
"grad_norm": 3.1703989505767822,
"learning_rate": 0.00028451760104302473,
"loss": 0.5726,
"step": 9500
},
{
"epoch": 0.8148631029986962,
"grad_norm": 2.3858680725097656,
"learning_rate": 0.000283702737940026,
"loss": 0.562,
"step": 10000
},
{
"epoch": 0.855606258148631,
"grad_norm": 2.681757688522339,
"learning_rate": 0.00028288787483702737,
"loss": 0.5628,
"step": 10500
},
{
"epoch": 0.8963494132985659,
"grad_norm": 1.7328672409057617,
"learning_rate": 0.00028207301173402866,
"loss": 0.5584,
"step": 11000
},
{
"epoch": 0.9370925684485006,
"grad_norm": 2.7428441047668457,
"learning_rate": 0.00028125814863102995,
"loss": 0.5577,
"step": 11500
},
{
"epoch": 0.9778357235984355,
"grad_norm": 2.219202995300293,
"learning_rate": 0.00028044328552803124,
"loss": 0.554,
"step": 12000
},
{
"epoch": 1.0,
"eval_accuracy": 0.7979919910430908,
"eval_loss": 0.5014437437057495,
"eval_runtime": 3.8836,
"eval_samples_per_second": 641.161,
"eval_steps_per_second": 80.338,
"step": 12272
},
{
"epoch": 1.0185788787483703,
"grad_norm": 1.668083667755127,
"learning_rate": 0.00027962842242503253,
"loss": 0.5461,
"step": 12500
},
{
"epoch": 1.0593220338983051,
"grad_norm": 2.8752121925354004,
"learning_rate": 0.0002788135593220339,
"loss": 0.5399,
"step": 13000
},
{
"epoch": 1.1000651890482398,
"grad_norm": 1.8229767084121704,
"learning_rate": 0.00027799869621903517,
"loss": 0.5415,
"step": 13500
},
{
"epoch": 1.1408083441981747,
"grad_norm": 1.681829571723938,
"learning_rate": 0.00027718383311603646,
"loss": 0.535,
"step": 14000
},
{
"epoch": 1.1815514993481095,
"grad_norm": 1.5809299945831299,
"learning_rate": 0.0002763689700130378,
"loss": 0.5485,
"step": 14500
},
{
"epoch": 1.2222946544980444,
"grad_norm": 1.6424498558044434,
"learning_rate": 0.0002755541069100391,
"loss": 0.5353,
"step": 15000
},
{
"epoch": 1.263037809647979,
"grad_norm": 2.1335270404815674,
"learning_rate": 0.0002747392438070404,
"loss": 0.5356,
"step": 15500
},
{
"epoch": 1.303780964797914,
"grad_norm": 3.0127382278442383,
"learning_rate": 0.0002739243807040417,
"loss": 0.53,
"step": 16000
},
{
"epoch": 1.3445241199478488,
"grad_norm": 1.603621006011963,
"learning_rate": 0.000273109517601043,
"loss": 0.5298,
"step": 16500
},
{
"epoch": 1.3852672750977835,
"grad_norm": 3.251509428024292,
"learning_rate": 0.0002722946544980443,
"loss": 0.5326,
"step": 17000
},
{
"epoch": 1.4260104302477183,
"grad_norm": 1.379150390625,
"learning_rate": 0.0002714797913950456,
"loss": 0.5316,
"step": 17500
},
{
"epoch": 1.4667535853976532,
"grad_norm": 1.778817892074585,
"learning_rate": 0.0002706649282920469,
"loss": 0.5233,
"step": 18000
},
{
"epoch": 1.5074967405475879,
"grad_norm": 1.4545488357543945,
"learning_rate": 0.00026985006518904825,
"loss": 0.5292,
"step": 18500
},
{
"epoch": 1.548239895697523,
"grad_norm": 1.7037944793701172,
"learning_rate": 0.00026903520208604954,
"loss": 0.5168,
"step": 19000
},
{
"epoch": 1.5889830508474576,
"grad_norm": 2.0902042388916016,
"learning_rate": 0.00026822033898305083,
"loss": 0.53,
"step": 19500
},
{
"epoch": 1.6297262059973925,
"grad_norm": 1.4677634239196777,
"learning_rate": 0.0002674054758800521,
"loss": 0.5258,
"step": 20000
},
{
"epoch": 1.6704693611473274,
"grad_norm": 1.562299132347107,
"learning_rate": 0.0002665906127770534,
"loss": 0.5232,
"step": 20500
},
{
"epoch": 1.711212516297262,
"grad_norm": 2.906895637512207,
"learning_rate": 0.00026577574967405476,
"loss": 0.5153,
"step": 21000
},
{
"epoch": 1.7519556714471969,
"grad_norm": 3.625960350036621,
"learning_rate": 0.00026496088657105605,
"loss": 0.5137,
"step": 21500
},
{
"epoch": 1.7926988265971318,
"grad_norm": 1.401186466217041,
"learning_rate": 0.00026414602346805735,
"loss": 0.5155,
"step": 22000
},
{
"epoch": 1.8334419817470664,
"grad_norm": 4.507143974304199,
"learning_rate": 0.00026333116036505864,
"loss": 0.5239,
"step": 22500
},
{
"epoch": 1.8741851368970013,
"grad_norm": 2.317826986312866,
"learning_rate": 0.00026251629726205993,
"loss": 0.5059,
"step": 23000
},
{
"epoch": 1.9149282920469362,
"grad_norm": 1.684119701385498,
"learning_rate": 0.0002617014341590612,
"loss": 0.5206,
"step": 23500
},
{
"epoch": 1.9556714471968708,
"grad_norm": 3.68534255027771,
"learning_rate": 0.00026088657105606257,
"loss": 0.5088,
"step": 24000
},
{
"epoch": 1.996414602346806,
"grad_norm": 1.978193759918213,
"learning_rate": 0.00026007170795306386,
"loss": 0.5077,
"step": 24500
},
{
"epoch": 2.0,
"eval_accuracy": 0.8108433485031128,
"eval_loss": 0.48567625880241394,
"eval_runtime": 3.8076,
"eval_samples_per_second": 653.949,
"eval_steps_per_second": 81.941,
"step": 24544
},
{
"epoch": 2.0371577574967406,
"grad_norm": 2.52195143699646,
"learning_rate": 0.0002592568448500652,
"loss": 0.4994,
"step": 25000
},
{
"epoch": 2.077900912646675,
"grad_norm": 2.3119261264801025,
"learning_rate": 0.0002584419817470665,
"loss": 0.4989,
"step": 25500
},
{
"epoch": 2.1186440677966103,
"grad_norm": 1.8698070049285889,
"learning_rate": 0.0002576271186440678,
"loss": 0.505,
"step": 26000
},
{
"epoch": 2.159387222946545,
"grad_norm": 1.762832760810852,
"learning_rate": 0.0002568122555410691,
"loss": 0.5061,
"step": 26500
},
{
"epoch": 2.2001303780964796,
"grad_norm": 1.8201305866241455,
"learning_rate": 0.00025599739243807037,
"loss": 0.5009,
"step": 27000
},
{
"epoch": 2.2408735332464147,
"grad_norm": 2.5493578910827637,
"learning_rate": 0.00025518252933507166,
"loss": 0.4942,
"step": 27500
},
{
"epoch": 2.2816166883963493,
"grad_norm": 1.689488172531128,
"learning_rate": 0.000254367666232073,
"loss": 0.505,
"step": 28000
},
{
"epoch": 2.322359843546284,
"grad_norm": 1.5578385591506958,
"learning_rate": 0.0002535528031290743,
"loss": 0.4827,
"step": 28500
},
{
"epoch": 2.363102998696219,
"grad_norm": 1.9981471300125122,
"learning_rate": 0.0002527379400260756,
"loss": 0.4901,
"step": 29000
},
{
"epoch": 2.4038461538461537,
"grad_norm": 3.95417857170105,
"learning_rate": 0.0002519230769230769,
"loss": 0.4954,
"step": 29500
},
{
"epoch": 2.444589308996089,
"grad_norm": 2.437424898147583,
"learning_rate": 0.0002511082138200782,
"loss": 0.5051,
"step": 30000
},
{
"epoch": 2.4853324641460235,
"grad_norm": 2.264540910720825,
"learning_rate": 0.0002502933507170795,
"loss": 0.4995,
"step": 30500
},
{
"epoch": 2.526075619295958,
"grad_norm": 1.7900969982147217,
"learning_rate": 0.0002494784876140808,
"loss": 0.5043,
"step": 31000
},
{
"epoch": 2.5668187744458932,
"grad_norm": 2.914376735687256,
"learning_rate": 0.0002486636245110821,
"loss": 0.501,
"step": 31500
},
{
"epoch": 2.607561929595828,
"grad_norm": 2.3340747356414795,
"learning_rate": 0.00024784876140808345,
"loss": 0.4856,
"step": 32000
},
{
"epoch": 2.648305084745763,
"grad_norm": 1.6408894062042236,
"learning_rate": 0.00024703389830508474,
"loss": 0.4925,
"step": 32500
},
{
"epoch": 2.6890482398956976,
"grad_norm": 2.726527690887451,
"learning_rate": 0.00024621903520208603,
"loss": 0.4925,
"step": 33000
},
{
"epoch": 2.7297913950456323,
"grad_norm": 1.7461756467819214,
"learning_rate": 0.0002454041720990873,
"loss": 0.4949,
"step": 33500
},
{
"epoch": 2.770534550195567,
"grad_norm": 1.021328330039978,
"learning_rate": 0.0002445893089960886,
"loss": 0.5003,
"step": 34000
},
{
"epoch": 2.811277705345502,
"grad_norm": 1.8558237552642822,
"learning_rate": 0.00024377444589308996,
"loss": 0.4969,
"step": 34500
},
{
"epoch": 2.8520208604954367,
"grad_norm": 1.888190507888794,
"learning_rate": 0.00024295958279009125,
"loss": 0.4763,
"step": 35000
},
{
"epoch": 2.8927640156453718,
"grad_norm": 1.9714635610580444,
"learning_rate": 0.00024214471968709255,
"loss": 0.4823,
"step": 35500
},
{
"epoch": 2.9335071707953064,
"grad_norm": 2.248117446899414,
"learning_rate": 0.00024132985658409386,
"loss": 0.4971,
"step": 36000
},
{
"epoch": 2.974250325945241,
"grad_norm": 2.586214065551758,
"learning_rate": 0.00024051499348109516,
"loss": 0.4793,
"step": 36500
},
{
"epoch": 3.0,
"eval_accuracy": 0.7767068147659302,
"eval_loss": 0.5534041523933411,
"eval_runtime": 3.8006,
"eval_samples_per_second": 655.161,
"eval_steps_per_second": 82.092,
"step": 36816
},
{
"epoch": 3.014993481095176,
"grad_norm": 1.693083643913269,
"learning_rate": 0.00023970013037809645,
"loss": 0.4938,
"step": 37000
},
{
"epoch": 3.055736636245111,
"grad_norm": 1.6453471183776855,
"learning_rate": 0.00023888526727509777,
"loss": 0.479,
"step": 37500
},
{
"epoch": 3.0964797913950455,
"grad_norm": 2.535121202468872,
"learning_rate": 0.00023807040417209906,
"loss": 0.4721,
"step": 38000
},
{
"epoch": 3.1372229465449806,
"grad_norm": 2.6568410396575928,
"learning_rate": 0.00023725554106910038,
"loss": 0.4825,
"step": 38500
},
{
"epoch": 3.1779661016949152,
"grad_norm": 1.9132291078567505,
"learning_rate": 0.00023644067796610167,
"loss": 0.4706,
"step": 39000
},
{
"epoch": 3.21870925684485,
"grad_norm": 1.7388460636138916,
"learning_rate": 0.00023562581486310296,
"loss": 0.4722,
"step": 39500
},
{
"epoch": 3.259452411994785,
"grad_norm": 2.760587692260742,
"learning_rate": 0.0002348109517601043,
"loss": 0.4776,
"step": 40000
},
{
"epoch": 3.3001955671447196,
"grad_norm": 1.4599848985671997,
"learning_rate": 0.0002339960886571056,
"loss": 0.4784,
"step": 40500
},
{
"epoch": 3.3409387222946547,
"grad_norm": 1.7198021411895752,
"learning_rate": 0.0002331812255541069,
"loss": 0.4727,
"step": 41000
},
{
"epoch": 3.3816818774445894,
"grad_norm": 1.6919358968734741,
"learning_rate": 0.0002323663624511082,
"loss": 0.4851,
"step": 41500
},
{
"epoch": 3.422425032594524,
"grad_norm": 2.1125240325927734,
"learning_rate": 0.0002315514993481095,
"loss": 0.4593,
"step": 42000
},
{
"epoch": 3.463168187744459,
"grad_norm": 1.8193351030349731,
"learning_rate": 0.00023073663624511082,
"loss": 0.4753,
"step": 42500
},
{
"epoch": 3.5039113428943938,
"grad_norm": 1.8467501401901245,
"learning_rate": 0.0002299217731421121,
"loss": 0.4769,
"step": 43000
},
{
"epoch": 3.5446544980443284,
"grad_norm": 2.4582698345184326,
"learning_rate": 0.0002291069100391134,
"loss": 0.4725,
"step": 43500
},
{
"epoch": 3.5853976531942635,
"grad_norm": 2.6381258964538574,
"learning_rate": 0.00022829204693611472,
"loss": 0.471,
"step": 44000
},
{
"epoch": 3.626140808344198,
"grad_norm": 3.26521635055542,
"learning_rate": 0.000227477183833116,
"loss": 0.47,
"step": 44500
},
{
"epoch": 3.666883963494133,
"grad_norm": 3.6461341381073,
"learning_rate": 0.0002266623207301173,
"loss": 0.4836,
"step": 45000
},
{
"epoch": 3.707627118644068,
"grad_norm": 4.348172664642334,
"learning_rate": 0.00022584745762711862,
"loss": 0.478,
"step": 45500
},
{
"epoch": 3.7483702737940026,
"grad_norm": 2.3575916290283203,
"learning_rate": 0.0002250325945241199,
"loss": 0.468,
"step": 46000
},
{
"epoch": 3.7891134289439377,
"grad_norm": 1.4927334785461426,
"learning_rate": 0.0002242177314211212,
"loss": 0.4611,
"step": 46500
},
{
"epoch": 3.8298565840938723,
"grad_norm": 2.9326066970825195,
"learning_rate": 0.00022340286831812255,
"loss": 0.476,
"step": 47000
},
{
"epoch": 3.870599739243807,
"grad_norm": 1.4343719482421875,
"learning_rate": 0.00022258800521512384,
"loss": 0.4792,
"step": 47500
},
{
"epoch": 3.9113428943937416,
"grad_norm": 2.0450692176818848,
"learning_rate": 0.00022177314211212516,
"loss": 0.4822,
"step": 48000
},
{
"epoch": 3.9520860495436767,
"grad_norm": 1.492274284362793,
"learning_rate": 0.00022095827900912645,
"loss": 0.4602,
"step": 48500
},
{
"epoch": 3.9928292046936114,
"grad_norm": 2.1512324810028076,
"learning_rate": 0.00022014341590612774,
"loss": 0.4791,
"step": 49000
},
{
"epoch": 4.0,
"eval_accuracy": 0.8224899768829346,
"eval_loss": 0.4442506432533264,
"eval_runtime": 3.8847,
"eval_samples_per_second": 640.977,
"eval_steps_per_second": 80.315,
"step": 49088
},
{
"epoch": 4.0335723598435465,
"grad_norm": 1.4296700954437256,
"learning_rate": 0.00021932855280312906,
"loss": 0.4649,
"step": 49500
},
{
"epoch": 4.074315514993481,
"grad_norm": 3.133362293243408,
"learning_rate": 0.00021851368970013035,
"loss": 0.4501,
"step": 50000
},
{
"epoch": 4.115058670143416,
"grad_norm": 2.441312551498413,
"learning_rate": 0.00021769882659713165,
"loss": 0.4592,
"step": 50500
},
{
"epoch": 4.15580182529335,
"grad_norm": 2.3577542304992676,
"learning_rate": 0.00021688396349413296,
"loss": 0.4532,
"step": 51000
},
{
"epoch": 4.196544980443286,
"grad_norm": 2.5034992694854736,
"learning_rate": 0.00021606910039113426,
"loss": 0.4537,
"step": 51500
},
{
"epoch": 4.237288135593221,
"grad_norm": 2.869847297668457,
"learning_rate": 0.0002152542372881356,
"loss": 0.4558,
"step": 52000
},
{
"epoch": 4.278031290743155,
"grad_norm": 2.8850553035736084,
"learning_rate": 0.0002144393741851369,
"loss": 0.4559,
"step": 52500
},
{
"epoch": 4.31877444589309,
"grad_norm": 2.0999245643615723,
"learning_rate": 0.00021362451108213819,
"loss": 0.4651,
"step": 53000
},
{
"epoch": 4.3595176010430245,
"grad_norm": 2.392293930053711,
"learning_rate": 0.0002128096479791395,
"loss": 0.4621,
"step": 53500
},
{
"epoch": 4.400260756192959,
"grad_norm": 1.9911226034164429,
"learning_rate": 0.0002119947848761408,
"loss": 0.4623,
"step": 54000
},
{
"epoch": 4.441003911342895,
"grad_norm": 2.728233575820923,
"learning_rate": 0.0002111799217731421,
"loss": 0.4589,
"step": 54500
},
{
"epoch": 4.481747066492829,
"grad_norm": 1.9613022804260254,
"learning_rate": 0.0002103650586701434,
"loss": 0.4588,
"step": 55000
},
{
"epoch": 4.522490221642764,
"grad_norm": 2.742570161819458,
"learning_rate": 0.0002095501955671447,
"loss": 0.4541,
"step": 55500
},
{
"epoch": 4.563233376792699,
"grad_norm": 1.8285661935806274,
"learning_rate": 0.00020873533246414602,
"loss": 0.4563,
"step": 56000
},
{
"epoch": 4.603976531942633,
"grad_norm": 2.059859037399292,
"learning_rate": 0.0002079204693611473,
"loss": 0.459,
"step": 56500
},
{
"epoch": 4.644719687092568,
"grad_norm": 2.426276445388794,
"learning_rate": 0.0002071056062581486,
"loss": 0.465,
"step": 57000
},
{
"epoch": 4.6854628422425035,
"grad_norm": 2.499319553375244,
"learning_rate": 0.00020629074315514992,
"loss": 0.463,
"step": 57500
},
{
"epoch": 4.726205997392438,
"grad_norm": 2.2192931175231934,
"learning_rate": 0.0002054758800521512,
"loss": 0.4556,
"step": 58000
},
{
"epoch": 4.766949152542373,
"grad_norm": 3.1158642768859863,
"learning_rate": 0.0002046610169491525,
"loss": 0.457,
"step": 58500
},
{
"epoch": 4.8076923076923075,
"grad_norm": 1.7781977653503418,
"learning_rate": 0.00020384615384615385,
"loss": 0.4664,
"step": 59000
},
{
"epoch": 4.848435462842242,
"grad_norm": 1.9802038669586182,
"learning_rate": 0.00020303129074315514,
"loss": 0.4565,
"step": 59500
},
{
"epoch": 4.889178617992178,
"grad_norm": 1.7128177881240845,
"learning_rate": 0.00020221642764015643,
"loss": 0.4573,
"step": 60000
},
{
"epoch": 4.929921773142112,
"grad_norm": 3.5915613174438477,
"learning_rate": 0.00020140156453715775,
"loss": 0.4603,
"step": 60500
},
{
"epoch": 4.970664928292047,
"grad_norm": 2.1721646785736084,
"learning_rate": 0.00020058670143415904,
"loss": 0.4541,
"step": 61000
},
{
"epoch": 5.0,
"eval_accuracy": 0.8257027864456177,
"eval_loss": 0.44336947798728943,
"eval_runtime": 3.8541,
"eval_samples_per_second": 646.057,
"eval_steps_per_second": 80.952,
"step": 61360
},
{
"epoch": 5.011408083441982,
"grad_norm": 4.467598915100098,
"learning_rate": 0.00019977183833116036,
"loss": 0.462,
"step": 61500
},
{
"epoch": 5.052151238591916,
"grad_norm": 3.215426445007324,
"learning_rate": 0.00019895697522816165,
"loss": 0.4433,
"step": 62000
},
{
"epoch": 5.092894393741851,
"grad_norm": 3.5593578815460205,
"learning_rate": 0.00019814211212516294,
"loss": 0.4412,
"step": 62500
},
{
"epoch": 5.1336375488917865,
"grad_norm": 1.5453704595565796,
"learning_rate": 0.00019732724902216426,
"loss": 0.4392,
"step": 63000
},
{
"epoch": 5.174380704041721,
"grad_norm": 3.003392219543457,
"learning_rate": 0.00019651238591916555,
"loss": 0.4434,
"step": 63500
},
{
"epoch": 5.215123859191656,
"grad_norm": 1.394499659538269,
"learning_rate": 0.00019569752281616684,
"loss": 0.4377,
"step": 64000
},
{
"epoch": 5.25586701434159,
"grad_norm": 1.8028594255447388,
"learning_rate": 0.00019488265971316816,
"loss": 0.4457,
"step": 64500
},
{
"epoch": 5.296610169491525,
"grad_norm": 2.3185994625091553,
"learning_rate": 0.00019406779661016945,
"loss": 0.4396,
"step": 65000
},
{
"epoch": 5.337353324641461,
"grad_norm": 2.245239019393921,
"learning_rate": 0.0001932529335071708,
"loss": 0.4548,
"step": 65500
},
{
"epoch": 5.378096479791395,
"grad_norm": 1.6174944639205933,
"learning_rate": 0.0001924380704041721,
"loss": 0.4431,
"step": 66000
},
{
"epoch": 5.41883963494133,
"grad_norm": 2.5241715908050537,
"learning_rate": 0.00019162320730117338,
"loss": 0.4382,
"step": 66500
},
{
"epoch": 5.459582790091265,
"grad_norm": 3.4499781131744385,
"learning_rate": 0.0001908083441981747,
"loss": 0.4552,
"step": 67000
},
{
"epoch": 5.500325945241199,
"grad_norm": 2.127242088317871,
"learning_rate": 0.000189993481095176,
"loss": 0.4511,
"step": 67500
},
{
"epoch": 5.541069100391134,
"grad_norm": 2.030122756958008,
"learning_rate": 0.00018917861799217729,
"loss": 0.4553,
"step": 68000
},
{
"epoch": 5.581812255541069,
"grad_norm": 2.0190391540527344,
"learning_rate": 0.0001883637548891786,
"loss": 0.4471,
"step": 68500
},
{
"epoch": 5.622555410691004,
"grad_norm": 3.642808198928833,
"learning_rate": 0.0001875488917861799,
"loss": 0.4494,
"step": 69000
},
{
"epoch": 5.663298565840939,
"grad_norm": 2.013524055480957,
"learning_rate": 0.00018673402868318121,
"loss": 0.4425,
"step": 69500
},
{
"epoch": 5.704041720990873,
"grad_norm": 3.961810350418091,
"learning_rate": 0.0001859191655801825,
"loss": 0.4438,
"step": 70000
},
{
"epoch": 5.744784876140808,
"grad_norm": 1.9334365129470825,
"learning_rate": 0.0001851043024771838,
"loss": 0.4477,
"step": 70500
},
{
"epoch": 5.7855280312907436,
"grad_norm": 2.67224383354187,
"learning_rate": 0.00018428943937418514,
"loss": 0.4522,
"step": 71000
},
{
"epoch": 5.826271186440678,
"grad_norm": 2.349132776260376,
"learning_rate": 0.00018347457627118644,
"loss": 0.4467,
"step": 71500
},
{
"epoch": 5.867014341590613,
"grad_norm": 2.8674731254577637,
"learning_rate": 0.00018265971316818773,
"loss": 0.4379,
"step": 72000
},
{
"epoch": 5.9077574967405475,
"grad_norm": 3.5750834941864014,
"learning_rate": 0.00018184485006518905,
"loss": 0.4445,
"step": 72500
},
{
"epoch": 5.948500651890482,
"grad_norm": 2.297048330307007,
"learning_rate": 0.00018102998696219034,
"loss": 0.4415,
"step": 73000
},
{
"epoch": 5.989243807040417,
"grad_norm": 2.4230237007141113,
"learning_rate": 0.00018021512385919163,
"loss": 0.4405,
"step": 73500
},
{
"epoch": 6.0,
"eval_accuracy": 0.83253014087677,
"eval_loss": 0.42729276418685913,
"eval_runtime": 3.8615,
"eval_samples_per_second": 644.821,
"eval_steps_per_second": 80.797,
"step": 73632
},
{
"epoch": 6.029986962190352,
"grad_norm": 1.7936686277389526,
"learning_rate": 0.00017940026075619295,
"loss": 0.437,
"step": 74000
},
{
"epoch": 6.070730117340287,
"grad_norm": 2.2741811275482178,
"learning_rate": 0.00017858539765319424,
"loss": 0.434,
"step": 74500
},
{
"epoch": 6.111473272490222,
"grad_norm": 2.475470542907715,
"learning_rate": 0.00017777053455019556,
"loss": 0.4298,
"step": 75000
},
{
"epoch": 6.152216427640156,
"grad_norm": 1.6614030599594116,
"learning_rate": 0.00017695567144719685,
"loss": 0.43,
"step": 75500
},
{
"epoch": 6.192959582790091,
"grad_norm": 2.708757162094116,
"learning_rate": 0.00017614080834419814,
"loss": 0.4335,
"step": 76000
},
{
"epoch": 6.2337027379400265,
"grad_norm": 2.0995919704437256,
"learning_rate": 0.00017532594524119946,
"loss": 0.4378,
"step": 76500
},
{
"epoch": 6.274445893089961,
"grad_norm": 2.3114981651306152,
"learning_rate": 0.00017451108213820075,
"loss": 0.445,
"step": 77000
},
{
"epoch": 6.315189048239896,
"grad_norm": 2.9290952682495117,
"learning_rate": 0.00017369621903520204,
"loss": 0.4257,
"step": 77500
},
{
"epoch": 6.3559322033898304,
"grad_norm": 3.3918747901916504,
"learning_rate": 0.0001728813559322034,
"loss": 0.4375,
"step": 78000
},
{
"epoch": 6.396675358539765,
"grad_norm": 2.7095093727111816,
"learning_rate": 0.00017206649282920468,
"loss": 0.4464,
"step": 78500
},
{
"epoch": 6.4374185136897,
"grad_norm": 1.904373049736023,
"learning_rate": 0.000171251629726206,
"loss": 0.4328,
"step": 79000
},
{
"epoch": 6.478161668839635,
"grad_norm": 2.086244583129883,
"learning_rate": 0.0001704367666232073,
"loss": 0.4379,
"step": 79500
},
{
"epoch": 6.51890482398957,
"grad_norm": 2.2188174724578857,
"learning_rate": 0.00016962190352020858,
"loss": 0.432,
"step": 80000
},
{
"epoch": 6.559647979139505,
"grad_norm": 2.3787903785705566,
"learning_rate": 0.0001688070404172099,
"loss": 0.4368,
"step": 80500
},
{
"epoch": 6.600391134289439,
"grad_norm": 2.9371328353881836,
"learning_rate": 0.0001679921773142112,
"loss": 0.4312,
"step": 81000
},
{
"epoch": 6.641134289439374,
"grad_norm": 2.65150785446167,
"learning_rate": 0.00016717731421121248,
"loss": 0.4376,
"step": 81500
},
{
"epoch": 6.681877444589309,
"grad_norm": 2.1882104873657227,
"learning_rate": 0.0001663624511082138,
"loss": 0.4365,
"step": 82000
},
{
"epoch": 6.722620599739244,
"grad_norm": 2.618929862976074,
"learning_rate": 0.0001655475880052151,
"loss": 0.427,
"step": 82500
},
{
"epoch": 6.763363754889179,
"grad_norm": 2.831859588623047,
"learning_rate": 0.00016473272490221644,
"loss": 0.4322,
"step": 83000
},
{
"epoch": 6.804106910039113,
"grad_norm": 2.4017553329467773,
"learning_rate": 0.00016391786179921773,
"loss": 0.4425,
"step": 83500
},
{
"epoch": 6.844850065189048,
"grad_norm": 2.4107367992401123,
"learning_rate": 0.00016310299869621902,
"loss": 0.4261,
"step": 84000
},
{
"epoch": 6.885593220338983,
"grad_norm": 1.5284911394119263,
"learning_rate": 0.00016228813559322034,
"loss": 0.428,
"step": 84500
},
{
"epoch": 6.926336375488918,
"grad_norm": 1.875391960144043,
"learning_rate": 0.00016147327249022163,
"loss": 0.4248,
"step": 85000
},
{
"epoch": 6.967079530638853,
"grad_norm": 2.9802966117858887,
"learning_rate": 0.00016065840938722293,
"loss": 0.4208,
"step": 85500
},
{
"epoch": 7.0,
"eval_accuracy": 0.8313252925872803,
"eval_loss": 0.44049832224845886,
"eval_runtime": 3.8652,
"eval_samples_per_second": 644.21,
"eval_steps_per_second": 80.72,
"step": 85904
},
{
"epoch": 7.0078226857887875,
"grad_norm": 4.224608898162842,
"learning_rate": 0.00015984354628422424,
"loss": 0.4366,
"step": 86000
},
{
"epoch": 7.048565840938722,
"grad_norm": 2.473148822784424,
"learning_rate": 0.00015902868318122554,
"loss": 0.4106,
"step": 86500
},
{
"epoch": 7.089308996088657,
"grad_norm": 2.758864164352417,
"learning_rate": 0.00015821382007822685,
"loss": 0.4249,
"step": 87000
},
{
"epoch": 7.130052151238592,
"grad_norm": 2.041701555252075,
"learning_rate": 0.00015739895697522815,
"loss": 0.4187,
"step": 87500
},
{
"epoch": 7.170795306388527,
"grad_norm": 2.219381093978882,
"learning_rate": 0.00015658409387222944,
"loss": 0.4239,
"step": 88000
},
{
"epoch": 7.211538461538462,
"grad_norm": 2.239011526107788,
"learning_rate": 0.00015576923076923076,
"loss": 0.4256,
"step": 88500
},
{
"epoch": 7.252281616688396,
"grad_norm": 1.8178561925888062,
"learning_rate": 0.00015495436766623205,
"loss": 0.4273,
"step": 89000
},
{
"epoch": 7.293024771838331,
"grad_norm": 3.339010238647461,
"learning_rate": 0.00015413950456323334,
"loss": 0.4273,
"step": 89500
},
{
"epoch": 7.333767926988266,
"grad_norm": 1.5560699701309204,
"learning_rate": 0.00015332464146023469,
"loss": 0.4254,
"step": 90000
},
{
"epoch": 7.374511082138201,
"grad_norm": 2.376141309738159,
"learning_rate": 0.00015250977835723598,
"loss": 0.4231,
"step": 90500
},
{
"epoch": 7.415254237288136,
"grad_norm": 2.4076344966888428,
"learning_rate": 0.00015169491525423727,
"loss": 0.4126,
"step": 91000
},
{
"epoch": 7.4559973924380705,
"grad_norm": 1.661089301109314,
"learning_rate": 0.0001508800521512386,
"loss": 0.4224,
"step": 91500
},
{
"epoch": 7.496740547588005,
"grad_norm": 2.04123854637146,
"learning_rate": 0.00015006518904823988,
"loss": 0.4272,
"step": 92000
},
{
"epoch": 7.53748370273794,
"grad_norm": 1.8965297937393188,
"learning_rate": 0.00014925032594524117,
"loss": 0.4211,
"step": 92500
},
{
"epoch": 7.578226857887875,
"grad_norm": 2.6887612342834473,
"learning_rate": 0.0001484354628422425,
"loss": 0.4249,
"step": 93000
},
{
"epoch": 7.61897001303781,
"grad_norm": 2.034926414489746,
"learning_rate": 0.0001476205997392438,
"loss": 0.4289,
"step": 93500
},
{
"epoch": 7.659713168187745,
"grad_norm": 3.313100814819336,
"learning_rate": 0.0001468057366362451,
"loss": 0.4127,
"step": 94000
},
{
"epoch": 7.700456323337679,
"grad_norm": 2.1167919635772705,
"learning_rate": 0.0001459908735332464,
"loss": 0.4264,
"step": 94500
},
{
"epoch": 7.741199478487614,
"grad_norm": 2.0670166015625,
"learning_rate": 0.0001451760104302477,
"loss": 0.4317,
"step": 95000
},
{
"epoch": 7.781942633637549,
"grad_norm": 3.6086575984954834,
"learning_rate": 0.00014436114732724903,
"loss": 0.4232,
"step": 95500
},
{
"epoch": 7.822685788787483,
"grad_norm": 2.8180601596832275,
"learning_rate": 0.00014354628422425032,
"loss": 0.424,
"step": 96000
},
{
"epoch": 7.863428943937419,
"grad_norm": 2.9117753505706787,
"learning_rate": 0.0001427314211212516,
"loss": 0.4225,
"step": 96500
},
{
"epoch": 7.904172099087353,
"grad_norm": 2.2281785011291504,
"learning_rate": 0.00014191655801825293,
"loss": 0.4236,
"step": 97000
},
{
"epoch": 7.944915254237288,
"grad_norm": 3.034166097640991,
"learning_rate": 0.00014110169491525422,
"loss": 0.4283,
"step": 97500
},
{
"epoch": 7.985658409387223,
"grad_norm": 2.297738552093506,
"learning_rate": 0.00014028683181225551,
"loss": 0.4278,
"step": 98000
},
{
"epoch": 8.0,
"eval_accuracy": 0.8297188878059387,
"eval_loss": 0.43755677342414856,
"eval_runtime": 3.7829,
"eval_samples_per_second": 658.227,
"eval_steps_per_second": 82.477,
"step": 98176
},
{
"epoch": 8.026401564537158,
"grad_norm": 2.4886224269866943,
"learning_rate": 0.00013947196870925683,
"loss": 0.4122,
"step": 98500
},
{
"epoch": 8.067144719687093,
"grad_norm": 1.8089336156845093,
"learning_rate": 0.00013865710560625815,
"loss": 0.403,
"step": 99000
},
{
"epoch": 8.107887874837028,
"grad_norm": 3.1478216648101807,
"learning_rate": 0.00013784224250325944,
"loss": 0.4078,
"step": 99500
},
{
"epoch": 8.148631029986962,
"grad_norm": 3.5064048767089844,
"learning_rate": 0.00013702737940026073,
"loss": 0.4212,
"step": 100000
},
{
"epoch": 8.189374185136897,
"grad_norm": 3.1338980197906494,
"learning_rate": 0.00013621251629726205,
"loss": 0.4102,
"step": 100500
},
{
"epoch": 8.230117340286832,
"grad_norm": 2.583284616470337,
"learning_rate": 0.00013539765319426334,
"loss": 0.4113,
"step": 101000
},
{
"epoch": 8.270860495436766,
"grad_norm": 3.1805083751678467,
"learning_rate": 0.00013458279009126466,
"loss": 0.4144,
"step": 101500
},
{
"epoch": 8.3116036505867,
"grad_norm": 2.0652964115142822,
"learning_rate": 0.00013376792698826596,
"loss": 0.4077,
"step": 102000
},
{
"epoch": 8.352346805736635,
"grad_norm": 1.8258506059646606,
"learning_rate": 0.00013295306388526727,
"loss": 0.4151,
"step": 102500
},
{
"epoch": 8.393089960886572,
"grad_norm": 2.371155261993408,
"learning_rate": 0.00013213820078226857,
"loss": 0.4225,
"step": 103000
},
{
"epoch": 8.433833116036507,
"grad_norm": 3.617539405822754,
"learning_rate": 0.00013132333767926986,
"loss": 0.4158,
"step": 103500
},
{
"epoch": 8.474576271186441,
"grad_norm": 2.629683017730713,
"learning_rate": 0.00013050847457627118,
"loss": 0.4099,
"step": 104000
},
{
"epoch": 8.515319426336376,
"grad_norm": 2.359873056411743,
"learning_rate": 0.00012969361147327247,
"loss": 0.4171,
"step": 104500
},
{
"epoch": 8.55606258148631,
"grad_norm": 2.8503479957580566,
"learning_rate": 0.00012887874837027379,
"loss": 0.4194,
"step": 105000
},
{
"epoch": 8.596805736636245,
"grad_norm": 2.8921594619750977,
"learning_rate": 0.00012806388526727508,
"loss": 0.4125,
"step": 105500
},
{
"epoch": 8.63754889178618,
"grad_norm": 1.8355835676193237,
"learning_rate": 0.0001272490221642764,
"loss": 0.412,
"step": 106000
},
{
"epoch": 8.678292046936114,
"grad_norm": 3.0607216358184814,
"learning_rate": 0.0001264341590612777,
"loss": 0.4265,
"step": 106500
},
{
"epoch": 8.719035202086049,
"grad_norm": 2.338379144668579,
"learning_rate": 0.000125619295958279,
"loss": 0.4142,
"step": 107000
},
{
"epoch": 8.759778357235984,
"grad_norm": 2.316218137741089,
"learning_rate": 0.0001248044328552803,
"loss": 0.4099,
"step": 107500
},
{
"epoch": 8.800521512385918,
"grad_norm": 2.4564082622528076,
"learning_rate": 0.0001239895697522816,
"loss": 0.4194,
"step": 108000
},
{
"epoch": 8.841264667535853,
"grad_norm": 1.8719547986984253,
"learning_rate": 0.0001231747066492829,
"loss": 0.4132,
"step": 108500
},
{
"epoch": 8.88200782268579,
"grad_norm": 3.052569627761841,
"learning_rate": 0.00012235984354628423,
"loss": 0.4194,
"step": 109000
},
{
"epoch": 8.922750977835724,
"grad_norm": 2.181389808654785,
"learning_rate": 0.0001215449804432855,
"loss": 0.417,
"step": 109500
},
{
"epoch": 8.963494132985659,
"grad_norm": 3.063595771789551,
"learning_rate": 0.00012073011734028682,
"loss": 0.422,
"step": 110000
},
{
"epoch": 9.0,
"eval_accuracy": 0.833734929561615,
"eval_loss": 0.43286681175231934,
"eval_runtime": 3.8739,
"eval_samples_per_second": 642.755,
"eval_steps_per_second": 80.538,
"step": 110448
},
{
"epoch": 9.004237288135593,
"grad_norm": 2.0804457664489746,
"learning_rate": 0.00011991525423728813,
"loss": 0.4072,
"step": 110500
},
{
"epoch": 9.044980443285528,
"grad_norm": 1.9648699760437012,
"learning_rate": 0.00011910039113428943,
"loss": 0.4031,
"step": 111000
},
{
"epoch": 9.085723598435463,
"grad_norm": 3.1396656036376953,
"learning_rate": 0.00011828552803129073,
"loss": 0.3942,
"step": 111500
},
{
"epoch": 9.126466753585397,
"grad_norm": 2.4233107566833496,
"learning_rate": 0.00011747066492829203,
"loss": 0.4171,
"step": 112000
},
{
"epoch": 9.167209908735332,
"grad_norm": 1.7238380908966064,
"learning_rate": 0.00011665580182529335,
"loss": 0.4056,
"step": 112500
},
{
"epoch": 9.207953063885267,
"grad_norm": 2.2021853923797607,
"learning_rate": 0.00011584093872229466,
"loss": 0.4089,
"step": 113000
},
{
"epoch": 9.248696219035201,
"grad_norm": 2.9419503211975098,
"learning_rate": 0.00011502607561929595,
"loss": 0.399,
"step": 113500
},
{
"epoch": 9.289439374185136,
"grad_norm": 2.092937469482422,
"learning_rate": 0.00011421121251629725,
"loss": 0.4113,
"step": 114000
},
{
"epoch": 9.330182529335072,
"grad_norm": 2.0860626697540283,
"learning_rate": 0.00011339634941329856,
"loss": 0.4108,
"step": 114500
},
{
"epoch": 9.370925684485007,
"grad_norm": 1.9479416608810425,
"learning_rate": 0.00011258148631029986,
"loss": 0.4068,
"step": 115000
},
{
"epoch": 9.411668839634942,
"grad_norm": 2.6916277408599854,
"learning_rate": 0.00011176662320730115,
"loss": 0.4061,
"step": 115500
},
{
"epoch": 9.452411994784876,
"grad_norm": 1.601837158203125,
"learning_rate": 0.00011095176010430247,
"loss": 0.4074,
"step": 116000
},
{
"epoch": 9.493155149934811,
"grad_norm": 2.331357002258301,
"learning_rate": 0.00011013689700130378,
"loss": 0.4118,
"step": 116500
},
{
"epoch": 9.533898305084746,
"grad_norm": 2.559669256210327,
"learning_rate": 0.00010932203389830507,
"loss": 0.4142,
"step": 117000
},
{
"epoch": 9.57464146023468,
"grad_norm": 2.3902297019958496,
"learning_rate": 0.00010850717079530637,
"loss": 0.3977,
"step": 117500
},
{
"epoch": 9.615384615384615,
"grad_norm": 2.525848388671875,
"learning_rate": 0.00010769230769230768,
"loss": 0.4037,
"step": 118000
},
{
"epoch": 9.65612777053455,
"grad_norm": 3.530219316482544,
"learning_rate": 0.00010687744458930898,
"loss": 0.4019,
"step": 118500
},
{
"epoch": 9.696870925684484,
"grad_norm": 2.741429090499878,
"learning_rate": 0.00010606258148631028,
"loss": 0.4096,
"step": 119000
},
{
"epoch": 9.737614080834419,
"grad_norm": 3.318553924560547,
"learning_rate": 0.0001052477183833116,
"loss": 0.4056,
"step": 119500
},
{
"epoch": 9.778357235984355,
"grad_norm": 2.6523921489715576,
"learning_rate": 0.0001044328552803129,
"loss": 0.4007,
"step": 120000
},
{
"epoch": 9.81910039113429,
"grad_norm": 3.7088372707366943,
"learning_rate": 0.0001036179921773142,
"loss": 0.4016,
"step": 120500
},
{
"epoch": 9.859843546284225,
"grad_norm": 2.5519940853118896,
"learning_rate": 0.0001028031290743155,
"loss": 0.4143,
"step": 121000
},
{
"epoch": 9.90058670143416,
"grad_norm": 2.149285316467285,
"learning_rate": 0.0001019882659713168,
"loss": 0.4083,
"step": 121500
},
{
"epoch": 9.941329856584094,
"grad_norm": 4.22469425201416,
"learning_rate": 0.00010117340286831812,
"loss": 0.404,
"step": 122000
},
{
"epoch": 9.982073011734029,
"grad_norm": 2.2363908290863037,
"learning_rate": 0.00010035853976531943,
"loss": 0.4085,
"step": 122500
},
{
"epoch": 10.0,
"eval_accuracy": 0.8401606678962708,
"eval_loss": 0.42958390712738037,
"eval_runtime": 3.786,
"eval_samples_per_second": 657.679,
"eval_steps_per_second": 82.408,
"step": 122720
},
{
"epoch": 10.022816166883963,
"grad_norm": 3.384526491165161,
"learning_rate": 9.954367666232072e-05,
"loss": 0.3999,
"step": 123000
},
{
"epoch": 10.063559322033898,
"grad_norm": 3.051342725753784,
"learning_rate": 9.872881355932202e-05,
"loss": 0.3996,
"step": 123500
},
{
"epoch": 10.104302477183833,
"grad_norm": 3.707674026489258,
"learning_rate": 9.791395045632333e-05,
"loss": 0.4043,
"step": 124000
},
{
"epoch": 10.145045632333767,
"grad_norm": 1.7124032974243164,
"learning_rate": 9.709908735332463e-05,
"loss": 0.3918,
"step": 124500
},
{
"epoch": 10.185788787483702,
"grad_norm": 2.3350818157196045,
"learning_rate": 9.628422425032592e-05,
"loss": 0.396,
"step": 125000
},
{
"epoch": 10.226531942633638,
"grad_norm": 1.8520114421844482,
"learning_rate": 9.546936114732724e-05,
"loss": 0.3906,
"step": 125500
},
{
"epoch": 10.267275097783573,
"grad_norm": 2.7649943828582764,
"learning_rate": 9.465449804432855e-05,
"loss": 0.3902,
"step": 126000
},
{
"epoch": 10.308018252933508,
"grad_norm": 3.0913712978363037,
"learning_rate": 9.383963494132985e-05,
"loss": 0.4061,
"step": 126500
},
{
"epoch": 10.348761408083442,
"grad_norm": 3.6730563640594482,
"learning_rate": 9.302477183833115e-05,
"loss": 0.4036,
"step": 127000
},
{
"epoch": 10.389504563233377,
"grad_norm": 2.8968472480773926,
"learning_rate": 9.220990873533245e-05,
"loss": 0.3948,
"step": 127500
},
{
"epoch": 10.430247718383312,
"grad_norm": 1.9545537233352661,
"learning_rate": 9.139504563233377e-05,
"loss": 0.4039,
"step": 128000
},
{
"epoch": 10.470990873533246,
"grad_norm": 2.1482009887695312,
"learning_rate": 9.058018252933507e-05,
"loss": 0.4032,
"step": 128500
},
{
"epoch": 10.51173402868318,
"grad_norm": 4.479248046875,
"learning_rate": 8.976531942633637e-05,
"loss": 0.3827,
"step": 129000
},
{
"epoch": 10.552477183833116,
"grad_norm": 2.6518211364746094,
"learning_rate": 8.895045632333767e-05,
"loss": 0.3963,
"step": 129500
},
{
"epoch": 10.59322033898305,
"grad_norm": 2.565751314163208,
"learning_rate": 8.813559322033898e-05,
"loss": 0.4135,
"step": 130000
},
{
"epoch": 10.633963494132985,
"grad_norm": 3.31779146194458,
"learning_rate": 8.732073011734028e-05,
"loss": 0.4073,
"step": 130500
},
{
"epoch": 10.674706649282921,
"grad_norm": 1.9514780044555664,
"learning_rate": 8.650586701434157e-05,
"loss": 0.401,
"step": 131000
},
{
"epoch": 10.715449804432856,
"grad_norm": 4.615423679351807,
"learning_rate": 8.569100391134289e-05,
"loss": 0.3972,
"step": 131500
},
{
"epoch": 10.75619295958279,
"grad_norm": 3.4876339435577393,
"learning_rate": 8.48761408083442e-05,
"loss": 0.3958,
"step": 132000
},
{
"epoch": 10.796936114732725,
"grad_norm": 1.969255805015564,
"learning_rate": 8.406127770534549e-05,
"loss": 0.3997,
"step": 132500
},
{
"epoch": 10.83767926988266,
"grad_norm": 3.2826197147369385,
"learning_rate": 8.32464146023468e-05,
"loss": 0.4014,
"step": 133000
},
{
"epoch": 10.878422425032594,
"grad_norm": 2.9294662475585938,
"learning_rate": 8.24315514993481e-05,
"loss": 0.3942,
"step": 133500
},
{
"epoch": 10.91916558018253,
"grad_norm": 2.2191972732543945,
"learning_rate": 8.161668839634942e-05,
"loss": 0.3821,
"step": 134000
},
{
"epoch": 10.959908735332464,
"grad_norm": 2.2126150131225586,
"learning_rate": 8.080182529335071e-05,
"loss": 0.396,
"step": 134500
},
{
"epoch": 11.0,
"eval_accuracy": 0.8333333134651184,
"eval_loss": 0.4348280727863312,
"eval_runtime": 3.8732,
"eval_samples_per_second": 642.875,
"eval_steps_per_second": 80.553,
"step": 134992
},
{
"epoch": 11.000651890482398,
"grad_norm": 1.9272228479385376,
"learning_rate": 7.998696219035201e-05,
"loss": 0.409,
"step": 135000
},
{
"epoch": 11.041395045632333,
"grad_norm": 3.325286865234375,
"learning_rate": 7.917209908735332e-05,
"loss": 0.3948,
"step": 135500
},
{
"epoch": 11.082138200782268,
"grad_norm": 2.996323585510254,
"learning_rate": 7.835723598435462e-05,
"loss": 0.3884,
"step": 136000
},
{
"epoch": 11.122881355932204,
"grad_norm": 2.5405139923095703,
"learning_rate": 7.754237288135592e-05,
"loss": 0.3932,
"step": 136500
},
{
"epoch": 11.163624511082139,
"grad_norm": 2.4877593517303467,
"learning_rate": 7.672750977835722e-05,
"loss": 0.3908,
"step": 137000
},
{
"epoch": 11.204367666232073,
"grad_norm": 2.917015552520752,
"learning_rate": 7.591264667535854e-05,
"loss": 0.3827,
"step": 137500
},
{
"epoch": 11.245110821382008,
"grad_norm": 2.060572624206543,
"learning_rate": 7.509778357235985e-05,
"loss": 0.3938,
"step": 138000
},
{
"epoch": 11.285853976531943,
"grad_norm": 3.6868770122528076,
"learning_rate": 7.428292046936114e-05,
"loss": 0.3943,
"step": 138500
},
{
"epoch": 11.326597131681877,
"grad_norm": 2.118516683578491,
"learning_rate": 7.346805736636244e-05,
"loss": 0.3871,
"step": 139000
},
{
"epoch": 11.367340286831812,
"grad_norm": 2.2013978958129883,
"learning_rate": 7.265319426336375e-05,
"loss": 0.3875,
"step": 139500
},
{
"epoch": 11.408083441981747,
"grad_norm": 2.284522533416748,
"learning_rate": 7.183833116036505e-05,
"loss": 0.3937,
"step": 140000
},
{
"epoch": 11.448826597131681,
"grad_norm": 1.935478925704956,
"learning_rate": 7.102346805736636e-05,
"loss": 0.3933,
"step": 140500
},
{
"epoch": 11.489569752281616,
"grad_norm": 3.882283926010132,
"learning_rate": 7.020860495436766e-05,
"loss": 0.3882,
"step": 141000
},
{
"epoch": 11.53031290743155,
"grad_norm": 2.2980778217315674,
"learning_rate": 6.939374185136897e-05,
"loss": 0.3994,
"step": 141500
},
{
"epoch": 11.571056062581487,
"grad_norm": 3.7042973041534424,
"learning_rate": 6.857887874837027e-05,
"loss": 0.3894,
"step": 142000
},
{
"epoch": 11.611799217731422,
"grad_norm": 2.877511739730835,
"learning_rate": 6.776401564537158e-05,
"loss": 0.4033,
"step": 142500
},
{
"epoch": 11.652542372881356,
"grad_norm": 3.1929280757904053,
"learning_rate": 6.694915254237287e-05,
"loss": 0.3913,
"step": 143000
},
{
"epoch": 11.693285528031291,
"grad_norm": 2.0072107315063477,
"learning_rate": 6.613428943937419e-05,
"loss": 0.3971,
"step": 143500
},
{
"epoch": 11.734028683181226,
"grad_norm": 1.9861186742782593,
"learning_rate": 6.531942633637548e-05,
"loss": 0.4003,
"step": 144000
},
{
"epoch": 11.77477183833116,
"grad_norm": 2.227025032043457,
"learning_rate": 6.450456323337679e-05,
"loss": 0.4003,
"step": 144500
},
{
"epoch": 11.815514993481095,
"grad_norm": 2.0405077934265137,
"learning_rate": 6.368970013037809e-05,
"loss": 0.3931,
"step": 145000
},
{
"epoch": 11.85625814863103,
"grad_norm": 3.3660271167755127,
"learning_rate": 6.28748370273794e-05,
"loss": 0.3934,
"step": 145500
},
{
"epoch": 11.897001303780964,
"grad_norm": 2.728158473968506,
"learning_rate": 6.20599739243807e-05,
"loss": 0.3843,
"step": 146000
},
{
"epoch": 11.937744458930899,
"grad_norm": 2.6212921142578125,
"learning_rate": 6.124511082138199e-05,
"loss": 0.3852,
"step": 146500
},
{
"epoch": 11.978487614080834,
"grad_norm": 2.473024368286133,
"learning_rate": 6.0430247718383304e-05,
"loss": 0.3909,
"step": 147000
},
{
"epoch": 12.0,
"eval_accuracy": 0.8357429504394531,
"eval_loss": 0.42502352595329285,
"eval_runtime": 3.8531,
"eval_samples_per_second": 646.227,
"eval_steps_per_second": 80.973,
"step": 147264
},
{
"epoch": 12.01923076923077,
"grad_norm": 3.567250967025757,
"learning_rate": 5.961538461538461e-05,
"loss": 0.3983,
"step": 147500
},
{
"epoch": 12.059973924380705,
"grad_norm": 1.7462067604064941,
"learning_rate": 5.8800521512385915e-05,
"loss": 0.3794,
"step": 148000
},
{
"epoch": 12.10071707953064,
"grad_norm": 2.288787364959717,
"learning_rate": 5.798565840938721e-05,
"loss": 0.3925,
"step": 148500
},
{
"epoch": 12.141460234680574,
"grad_norm": 3.1145968437194824,
"learning_rate": 5.7170795306388525e-05,
"loss": 0.3793,
"step": 149000
},
{
"epoch": 12.182203389830509,
"grad_norm": 2.16363525390625,
"learning_rate": 5.6355932203389824e-05,
"loss": 0.3804,
"step": 149500
},
{
"epoch": 12.222946544980443,
"grad_norm": 2.6342670917510986,
"learning_rate": 5.554106910039113e-05,
"loss": 0.3861,
"step": 150000
},
{
"epoch": 12.263689700130378,
"grad_norm": 2.9809041023254395,
"learning_rate": 5.4726205997392434e-05,
"loss": 0.3913,
"step": 150500
},
{
"epoch": 12.304432855280313,
"grad_norm": 3.3812155723571777,
"learning_rate": 5.391134289439374e-05,
"loss": 0.3857,
"step": 151000
},
{
"epoch": 12.345176010430247,
"grad_norm": 2.890817165374756,
"learning_rate": 5.309647979139504e-05,
"loss": 0.393,
"step": 151500
},
{
"epoch": 12.385919165580182,
"grad_norm": 3.3339462280273438,
"learning_rate": 5.228161668839635e-05,
"loss": 0.3873,
"step": 152000
},
{
"epoch": 12.426662320730117,
"grad_norm": 2.7341129779815674,
"learning_rate": 5.146675358539765e-05,
"loss": 0.3876,
"step": 152500
},
{
"epoch": 12.467405475880053,
"grad_norm": 3.096959114074707,
"learning_rate": 5.065189048239895e-05,
"loss": 0.3908,
"step": 153000
},
{
"epoch": 12.508148631029988,
"grad_norm": 1.7687112092971802,
"learning_rate": 4.983702737940025e-05,
"loss": 0.394,
"step": 153500
},
{
"epoch": 12.548891786179922,
"grad_norm": 2.028165102005005,
"learning_rate": 4.902216427640156e-05,
"loss": 0.3883,
"step": 154000
},
{
"epoch": 12.589634941329857,
"grad_norm": 2.485379934310913,
"learning_rate": 4.820730117340286e-05,
"loss": 0.3855,
"step": 154500
},
{
"epoch": 12.630378096479792,
"grad_norm": 1.7456655502319336,
"learning_rate": 4.7392438070404173e-05,
"loss": 0.3968,
"step": 155000
},
{
"epoch": 12.671121251629726,
"grad_norm": 2.4976985454559326,
"learning_rate": 4.657757496740547e-05,
"loss": 0.3798,
"step": 155500
},
{
"epoch": 12.711864406779661,
"grad_norm": 3.9520442485809326,
"learning_rate": 4.576271186440678e-05,
"loss": 0.3894,
"step": 156000
},
{
"epoch": 12.752607561929596,
"grad_norm": 2.648386240005493,
"learning_rate": 4.4947848761408075e-05,
"loss": 0.3851,
"step": 156500
},
{
"epoch": 12.79335071707953,
"grad_norm": 2.492152690887451,
"learning_rate": 4.413298565840939e-05,
"loss": 0.3822,
"step": 157000
},
{
"epoch": 12.834093872229465,
"grad_norm": 2.9962518215179443,
"learning_rate": 4.3318122555410686e-05,
"loss": 0.3858,
"step": 157500
},
{
"epoch": 12.8748370273794,
"grad_norm": 2.332040309906006,
"learning_rate": 4.2503259452412e-05,
"loss": 0.3925,
"step": 158000
},
{
"epoch": 12.915580182529336,
"grad_norm": 2.6551926136016846,
"learning_rate": 4.1688396349413296e-05,
"loss": 0.3899,
"step": 158500
},
{
"epoch": 12.95632333767927,
"grad_norm": 1.7805209159851074,
"learning_rate": 4.08735332464146e-05,
"loss": 0.3885,
"step": 159000
},
{
"epoch": 12.997066492829205,
"grad_norm": 2.9438204765319824,
"learning_rate": 4.00586701434159e-05,
"loss": 0.3956,
"step": 159500
},
{
"epoch": 13.0,
"eval_accuracy": 0.8381525874137878,
"eval_loss": 0.4289119243621826,
"eval_runtime": 3.8072,
"eval_samples_per_second": 654.032,
"eval_steps_per_second": 81.951,
"step": 159536
},
{
"epoch": 13.03780964797914,
"grad_norm": 2.8224751949310303,
"learning_rate": 3.924380704041721e-05,
"loss": 0.3818,
"step": 160000
},
{
"epoch": 13.078552803129075,
"grad_norm": 2.0187859535217285,
"learning_rate": 3.842894393741851e-05,
"loss": 0.3791,
"step": 160500
},
{
"epoch": 13.11929595827901,
"grad_norm": 3.1369576454162598,
"learning_rate": 3.761408083441981e-05,
"loss": 0.3799,
"step": 161000
},
{
"epoch": 13.160039113428944,
"grad_norm": 2.373286485671997,
"learning_rate": 3.679921773142112e-05,
"loss": 0.3733,
"step": 161500
},
{
"epoch": 13.200782268578878,
"grad_norm": 2.583207130432129,
"learning_rate": 3.5984354628422425e-05,
"loss": 0.3958,
"step": 162000
},
{
"epoch": 13.241525423728813,
"grad_norm": 2.5118906497955322,
"learning_rate": 3.5169491525423724e-05,
"loss": 0.3915,
"step": 162500
},
{
"epoch": 13.282268578878748,
"grad_norm": 5.202625751495361,
"learning_rate": 3.435462842242503e-05,
"loss": 0.3879,
"step": 163000
},
{
"epoch": 13.323011734028682,
"grad_norm": 2.9979419708251953,
"learning_rate": 3.3539765319426334e-05,
"loss": 0.3737,
"step": 163500
},
{
"epoch": 13.363754889178619,
"grad_norm": 2.0817720890045166,
"learning_rate": 3.272490221642764e-05,
"loss": 0.3912,
"step": 164000
},
{
"epoch": 13.404498044328554,
"grad_norm": 2.691849946975708,
"learning_rate": 3.1910039113428944e-05,
"loss": 0.3689,
"step": 164500
},
{
"epoch": 13.445241199478488,
"grad_norm": 2.358008861541748,
"learning_rate": 3.109517601043025e-05,
"loss": 0.3793,
"step": 165000
},
{
"epoch": 13.485984354628423,
"grad_norm": 2.514547109603882,
"learning_rate": 3.028031290743155e-05,
"loss": 0.3868,
"step": 165500
},
{
"epoch": 13.526727509778357,
"grad_norm": 2.5108165740966797,
"learning_rate": 2.9465449804432853e-05,
"loss": 0.385,
"step": 166000
},
{
"epoch": 13.567470664928292,
"grad_norm": 3.075470447540283,
"learning_rate": 2.8650586701434158e-05,
"loss": 0.3774,
"step": 166500
},
{
"epoch": 13.608213820078227,
"grad_norm": 4.978045463562012,
"learning_rate": 2.7835723598435463e-05,
"loss": 0.3787,
"step": 167000
},
{
"epoch": 13.648956975228161,
"grad_norm": 2.699185609817505,
"learning_rate": 2.7020860495436762e-05,
"loss": 0.3772,
"step": 167500
},
{
"epoch": 13.689700130378096,
"grad_norm": 1.7423195838928223,
"learning_rate": 2.6205997392438067e-05,
"loss": 0.3838,
"step": 168000
},
{
"epoch": 13.73044328552803,
"grad_norm": 1.603785753250122,
"learning_rate": 2.539113428943937e-05,
"loss": 0.3804,
"step": 168500
},
{
"epoch": 13.771186440677965,
"grad_norm": 2.6994235515594482,
"learning_rate": 2.4576271186440674e-05,
"loss": 0.3803,
"step": 169000
},
{
"epoch": 13.811929595827902,
"grad_norm": 3.038980484008789,
"learning_rate": 2.376140808344198e-05,
"loss": 0.3823,
"step": 169500
},
{
"epoch": 13.852672750977836,
"grad_norm": 3.170668840408325,
"learning_rate": 2.294654498044328e-05,
"loss": 0.3803,
"step": 170000
},
{
"epoch": 13.893415906127771,
"grad_norm": 2.6691057682037354,
"learning_rate": 2.2131681877444586e-05,
"loss": 0.3809,
"step": 170500
},
{
"epoch": 13.934159061277706,
"grad_norm": 1.789117455482483,
"learning_rate": 2.131681877444589e-05,
"loss": 0.3917,
"step": 171000
},
{
"epoch": 13.97490221642764,
"grad_norm": 2.110405445098877,
"learning_rate": 2.0501955671447193e-05,
"loss": 0.3792,
"step": 171500
},
{
"epoch": 14.0,
"eval_accuracy": 0.8365461826324463,
"eval_loss": 0.4235801696777344,
"eval_runtime": 3.8552,
"eval_samples_per_second": 645.875,
"eval_steps_per_second": 80.929,
"step": 171808
},
{
"epoch": 14.015645371577575,
"grad_norm": 3.1774420738220215,
"learning_rate": 1.9687092568448498e-05,
"loss": 0.3883,
"step": 172000
},
{
"epoch": 14.05638852672751,
"grad_norm": 3.016127109527588,
"learning_rate": 1.8872229465449803e-05,
"loss": 0.3798,
"step": 172500
},
{
"epoch": 14.097131681877444,
"grad_norm": 1.6009718179702759,
"learning_rate": 1.8057366362451105e-05,
"loss": 0.3712,
"step": 173000
},
{
"epoch": 14.137874837027379,
"grad_norm": 3.2171220779418945,
"learning_rate": 1.724250325945241e-05,
"loss": 0.3752,
"step": 173500
},
{
"epoch": 14.178617992177314,
"grad_norm": 2.144103765487671,
"learning_rate": 1.6427640156453715e-05,
"loss": 0.3752,
"step": 174000
},
{
"epoch": 14.219361147327248,
"grad_norm": 1.7222505807876587,
"learning_rate": 1.5612777053455017e-05,
"loss": 0.389,
"step": 174500
},
{
"epoch": 14.260104302477185,
"grad_norm": 1.7213879823684692,
"learning_rate": 1.4797913950456322e-05,
"loss": 0.3752,
"step": 175000
},
{
"epoch": 14.30084745762712,
"grad_norm": 3.261892080307007,
"learning_rate": 1.3983050847457626e-05,
"loss": 0.3813,
"step": 175500
},
{
"epoch": 14.341590612777054,
"grad_norm": 3.9659616947174072,
"learning_rate": 1.3168187744458931e-05,
"loss": 0.3846,
"step": 176000
},
{
"epoch": 14.382333767926989,
"grad_norm": 2.451526403427124,
"learning_rate": 1.2353324641460234e-05,
"loss": 0.3762,
"step": 176500
},
{
"epoch": 14.423076923076923,
"grad_norm": 3.402191638946533,
"learning_rate": 1.1538461538461538e-05,
"loss": 0.3783,
"step": 177000
},
{
"epoch": 14.463820078226858,
"grad_norm": 1.925841212272644,
"learning_rate": 1.0723598435462841e-05,
"loss": 0.3673,
"step": 177500
},
{
"epoch": 14.504563233376793,
"grad_norm": 2.2188963890075684,
"learning_rate": 9.908735332464146e-06,
"loss": 0.3754,
"step": 178000
},
{
"epoch": 14.545306388526727,
"grad_norm": 2.604687213897705,
"learning_rate": 9.093872229465448e-06,
"loss": 0.3789,
"step": 178500
},
{
"epoch": 14.586049543676662,
"grad_norm": 3.5460221767425537,
"learning_rate": 8.279009126466753e-06,
"loss": 0.3684,
"step": 179000
},
{
"epoch": 14.626792698826597,
"grad_norm": 1.9674248695373535,
"learning_rate": 7.464146023468057e-06,
"loss": 0.3732,
"step": 179500
},
{
"epoch": 14.667535853976531,
"grad_norm": 2.8252646923065186,
"learning_rate": 6.649282920469361e-06,
"loss": 0.3735,
"step": 180000
},
{
"epoch": 14.708279009126468,
"grad_norm": 3.43896746635437,
"learning_rate": 5.834419817470664e-06,
"loss": 0.383,
"step": 180500
},
{
"epoch": 14.749022164276402,
"grad_norm": 2.6553750038146973,
"learning_rate": 5.019556714471968e-06,
"loss": 0.3877,
"step": 181000
},
{
"epoch": 14.789765319426337,
"grad_norm": 3.002777576446533,
"learning_rate": 4.2046936114732716e-06,
"loss": 0.3757,
"step": 181500
},
{
"epoch": 14.830508474576272,
"grad_norm": 3.0359461307525635,
"learning_rate": 3.389830508474576e-06,
"loss": 0.3903,
"step": 182000
},
{
"epoch": 14.871251629726206,
"grad_norm": 2.365903615951538,
"learning_rate": 2.5749674054758798e-06,
"loss": 0.3818,
"step": 182500
},
{
"epoch": 14.911994784876141,
"grad_norm": 3.1819570064544678,
"learning_rate": 1.7601043024771837e-06,
"loss": 0.3835,
"step": 183000
},
{
"epoch": 14.952737940026076,
"grad_norm": 2.5495858192443848,
"learning_rate": 9.452411994784876e-07,
"loss": 0.3751,
"step": 183500
},
{
"epoch": 14.99348109517601,
"grad_norm": 2.4265575408935547,
"learning_rate": 1.303780964797914e-07,
"loss": 0.3759,
"step": 184000
},
{
"epoch": 15.0,
"eval_accuracy": 0.8361445665359497,
"eval_loss": 0.4246699810028076,
"eval_runtime": 3.8083,
"eval_samples_per_second": 653.84,
"eval_steps_per_second": 81.927,
"step": 184080
},
{
"epoch": 15.0,
"step": 184080,
"total_flos": 3.921542539724851e+17,
"train_loss": 0.4436988375695671,
"train_runtime": 13089.0372,
"train_samples_per_second": 450.035,
"train_steps_per_second": 14.064
}
],
"logging_steps": 500,
"max_steps": 184080,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.921542539724851e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}