End of training

Browse files

Files changed (4) hide show

all_results.json +16 -0
eval_results.json +10 -0
train_results.json +9 -0
trainer_state.json +1847 -0

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "avg_token_length": 41.888755020080325,
+    "epoch": 10.0,
+    "eval_accuracy": 0.8449799418449402,
+    "eval_loss": 0.4318901598453522,
+    "eval_runtime": 1.7906,
+    "eval_samples": 2490,
+    "eval_samples_per_second": 1390.588,
+    "eval_steps_per_second": 174.242,
+    "total_flos": 2.6187994846190592e+17,
+    "train_loss": 0.44300876322750005,
+    "train_runtime": 3819.0977,
+    "train_samples": 392702,
+    "train_samples_per_second": 1028.259,
+    "train_steps_per_second": 32.133
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "avg_token_length": 41.888755020080325,
+    "epoch": 10.0,
+    "eval_accuracy": 0.8449799418449402,
+    "eval_loss": 0.4318901598453522,
+    "eval_runtime": 1.7906,
+    "eval_samples": 2490,
+    "eval_samples_per_second": 1390.588,
+    "eval_steps_per_second": 174.242
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 10.0,
+    "total_flos": 2.6187994846190592e+17,
+    "train_loss": 0.44300876322750005,
+    "train_runtime": 3819.0977,
+    "train_samples": 392702,
+    "train_samples_per_second": 1028.259,
+    "train_steps_per_second": 32.133
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1847 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 122720,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04074315514993481,
+      "grad_norm": 3.771338939666748,
+      "learning_rate": 0.0002987777053455019,
+      "loss": 0.9611,
+      "step": 500
+    },
+    {
+      "epoch": 0.08148631029986962,
+      "grad_norm": 3.4215829372406006,
+      "learning_rate": 0.0002975554106910039,
+      "loss": 0.7577,
+      "step": 1000
+    },
+    {
+      "epoch": 0.12222946544980444,
+      "grad_norm": 3.9583778381347656,
+      "learning_rate": 0.00029633311603650584,
+      "loss": 0.6919,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16297262059973924,
+      "grad_norm": 4.326165676116943,
+      "learning_rate": 0.0002951108213820078,
+      "loss": 0.6589,
+      "step": 2000
+    },
+    {
+      "epoch": 0.20371577574967406,
+      "grad_norm": 4.510104179382324,
+      "learning_rate": 0.00029388852672750977,
+      "loss": 0.6394,
+      "step": 2500
+    },
+    {
+      "epoch": 0.24445893089960888,
+      "grad_norm": 4.456064224243164,
+      "learning_rate": 0.0002926662320730117,
+      "loss": 0.6127,
+      "step": 3000
+    },
+    {
+      "epoch": 0.28520208604954367,
+      "grad_norm": 3.8408055305480957,
+      "learning_rate": 0.00029144393741851364,
+      "loss": 0.6057,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3259452411994785,
+      "grad_norm": 4.424233913421631,
+      "learning_rate": 0.0002902216427640156,
+      "loss": 0.5903,
+      "step": 4000
+    },
+    {
+      "epoch": 0.3666883963494133,
+      "grad_norm": 4.306754112243652,
+      "learning_rate": 0.0002889993481095176,
+      "loss": 0.5925,
+      "step": 4500
+    },
+    {
+      "epoch": 0.4074315514993481,
+      "grad_norm": 4.172561168670654,
+      "learning_rate": 0.00028777705345501956,
+      "loss": 0.5808,
+      "step": 5000
+    },
+    {
+      "epoch": 0.44817470664928294,
+      "grad_norm": 3.9224565029144287,
+      "learning_rate": 0.0002865547588005215,
+      "loss": 0.5701,
+      "step": 5500
+    },
+    {
+      "epoch": 0.48891786179921776,
+      "grad_norm": 3.4274065494537354,
+      "learning_rate": 0.00028533246414602344,
+      "loss": 0.5717,
+      "step": 6000
+    },
+    {
+      "epoch": 0.5296610169491526,
+      "grad_norm": 2.3418281078338623,
+      "learning_rate": 0.00028411016949152543,
+      "loss": 0.5702,
+      "step": 6500
+    },
+    {
+      "epoch": 0.5704041720990873,
+      "grad_norm": 3.590977191925049,
+      "learning_rate": 0.00028288787483702737,
+      "loss": 0.5557,
+      "step": 7000
+    },
+    {
+      "epoch": 0.6111473272490222,
+      "grad_norm": 5.837788105010986,
+      "learning_rate": 0.0002816655801825293,
+      "loss": 0.5578,
+      "step": 7500
+    },
+    {
+      "epoch": 0.651890482398957,
+      "grad_norm": 3.356754779815674,
+      "learning_rate": 0.00028044328552803124,
+      "loss": 0.5604,
+      "step": 8000
+    },
+    {
+      "epoch": 0.6926336375488917,
+      "grad_norm": 3.8409948348999023,
+      "learning_rate": 0.00027922099087353323,
+      "loss": 0.5547,
+      "step": 8500
+    },
+    {
+      "epoch": 0.7333767926988266,
+      "grad_norm": 3.8805811405181885,
+      "learning_rate": 0.00027799869621903517,
+      "loss": 0.561,
+      "step": 9000
+    },
+    {
+      "epoch": 0.7741199478487614,
+      "grad_norm": 4.566688060760498,
+      "learning_rate": 0.0002767764015645371,
+      "loss": 0.5529,
+      "step": 9500
+    },
+    {
+      "epoch": 0.8148631029986962,
+      "grad_norm": 4.487757682800293,
+      "learning_rate": 0.0002755541069100391,
+      "loss": 0.5346,
+      "step": 10000
+    },
+    {
+      "epoch": 0.855606258148631,
+      "grad_norm": 5.617890357971191,
+      "learning_rate": 0.00027433181225554104,
+      "loss": 0.5421,
+      "step": 10500
+    },
+    {
+      "epoch": 0.8963494132985659,
+      "grad_norm": 3.5767805576324463,
+      "learning_rate": 0.000273109517601043,
+      "loss": 0.5449,
+      "step": 11000
+    },
+    {
+      "epoch": 0.9370925684485006,
+      "grad_norm": 3.796847343444824,
+      "learning_rate": 0.00027188722294654497,
+      "loss": 0.5373,
+      "step": 11500
+    },
+    {
+      "epoch": 0.9778357235984355,
+      "grad_norm": 5.0998334884643555,
+      "learning_rate": 0.0002706649282920469,
+      "loss": 0.5318,
+      "step": 12000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.8032128810882568,
+      "eval_loss": 0.48824790120124817,
+      "eval_runtime": 1.7924,
+      "eval_samples_per_second": 1389.179,
+      "eval_steps_per_second": 174.066,
+      "step": 12272
+    },
+    {
+      "epoch": 1.0185788787483703,
+      "grad_norm": 3.083867073059082,
+      "learning_rate": 0.00026944263363754884,
+      "loss": 0.5339,
+      "step": 12500
+    },
+    {
+      "epoch": 1.0593220338983051,
+      "grad_norm": 4.44858980178833,
+      "learning_rate": 0.00026822033898305083,
+      "loss": 0.5111,
+      "step": 13000
+    },
+    {
+      "epoch": 1.1000651890482398,
+      "grad_norm": 4.110743045806885,
+      "learning_rate": 0.00026699804432855277,
+      "loss": 0.5207,
+      "step": 13500
+    },
+    {
+      "epoch": 1.1408083441981747,
+      "grad_norm": 3.78286075592041,
+      "learning_rate": 0.00026577574967405476,
+      "loss": 0.5204,
+      "step": 14000
+    },
+    {
+      "epoch": 1.1815514993481095,
+      "grad_norm": 3.7647550106048584,
+      "learning_rate": 0.0002645534550195567,
+      "loss": 0.5195,
+      "step": 14500
+    },
+    {
+      "epoch": 1.2222946544980444,
+      "grad_norm": 3.9962730407714844,
+      "learning_rate": 0.00026333116036505864,
+      "loss": 0.5103,
+      "step": 15000
+    },
+    {
+      "epoch": 1.263037809647979,
+      "grad_norm": 7.238346099853516,
+      "learning_rate": 0.00026210886571056063,
+      "loss": 0.5204,
+      "step": 15500
+    },
+    {
+      "epoch": 1.303780964797914,
+      "grad_norm": 5.9857916831970215,
+      "learning_rate": 0.00026088657105606257,
+      "loss": 0.5137,
+      "step": 16000
+    },
+    {
+      "epoch": 1.3445241199478488,
+      "grad_norm": 3.4517877101898193,
+      "learning_rate": 0.0002596642764015645,
+      "loss": 0.5152,
+      "step": 16500
+    },
+    {
+      "epoch": 1.3852672750977835,
+      "grad_norm": 4.7614850997924805,
+      "learning_rate": 0.0002584419817470665,
+      "loss": 0.5201,
+      "step": 17000
+    },
+    {
+      "epoch": 1.4260104302477183,
+      "grad_norm": 5.093620300292969,
+      "learning_rate": 0.00025721968709256843,
+      "loss": 0.5115,
+      "step": 17500
+    },
+    {
+      "epoch": 1.4667535853976532,
+      "grad_norm": 3.9020328521728516,
+      "learning_rate": 0.00025599739243807037,
+      "loss": 0.5111,
+      "step": 18000
+    },
+    {
+      "epoch": 1.5074967405475879,
+      "grad_norm": 4.338925361633301,
+      "learning_rate": 0.0002547750977835723,
+      "loss": 0.5124,
+      "step": 18500
+    },
+    {
+      "epoch": 1.548239895697523,
+      "grad_norm": 3.3417322635650635,
+      "learning_rate": 0.0002535528031290743,
+      "loss": 0.5017,
+      "step": 19000
+    },
+    {
+      "epoch": 1.5889830508474576,
+      "grad_norm": 4.39253568649292,
+      "learning_rate": 0.00025233050847457624,
+      "loss": 0.5117,
+      "step": 19500
+    },
+    {
+      "epoch": 1.6297262059973925,
+      "grad_norm": 2.9685988426208496,
+      "learning_rate": 0.0002511082138200782,
+      "loss": 0.5118,
+      "step": 20000
+    },
+    {
+      "epoch": 1.6704693611473274,
+      "grad_norm": 4.1028733253479,
+      "learning_rate": 0.00024988591916558017,
+      "loss": 0.5034,
+      "step": 20500
+    },
+    {
+      "epoch": 1.711212516297262,
+      "grad_norm": 3.8489856719970703,
+      "learning_rate": 0.0002486636245110821,
+      "loss": 0.505,
+      "step": 21000
+    },
+    {
+      "epoch": 1.7519556714471969,
+      "grad_norm": 4.785423278808594,
+      "learning_rate": 0.00024744132985658404,
+      "loss": 0.5014,
+      "step": 21500
+    },
+    {
+      "epoch": 1.7926988265971318,
+      "grad_norm": 2.9591927528381348,
+      "learning_rate": 0.00024621903520208603,
+      "loss": 0.5034,
+      "step": 22000
+    },
+    {
+      "epoch": 1.8334419817470664,
+      "grad_norm": 5.5640482902526855,
+      "learning_rate": 0.000244996740547588,
+      "loss": 0.5118,
+      "step": 22500
+    },
+    {
+      "epoch": 1.8741851368970013,
+      "grad_norm": 5.140857696533203,
+      "learning_rate": 0.00024377444589308996,
+      "loss": 0.4922,
+      "step": 23000
+    },
+    {
+      "epoch": 1.9149282920469362,
+      "grad_norm": 2.477415084838867,
+      "learning_rate": 0.0002425521512385919,
+      "loss": 0.5016,
+      "step": 23500
+    },
+    {
+      "epoch": 1.9556714471968708,
+      "grad_norm": 3.9377715587615967,
+      "learning_rate": 0.00024132985658409386,
+      "loss": 0.4996,
+      "step": 24000
+    },
+    {
+      "epoch": 1.996414602346806,
+      "grad_norm": 3.377265691757202,
+      "learning_rate": 0.00024010756192959583,
+      "loss": 0.4972,
+      "step": 24500
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.8277108669281006,
+      "eval_loss": 0.45558586716651917,
+      "eval_runtime": 1.8005,
+      "eval_samples_per_second": 1382.923,
+      "eval_steps_per_second": 173.282,
+      "step": 24544
+    },
+    {
+      "epoch": 2.0371577574967406,
+      "grad_norm": 4.437685489654541,
+      "learning_rate": 0.00023888526727509777,
+      "loss": 0.4829,
+      "step": 25000
+    },
+    {
+      "epoch": 2.077900912646675,
+      "grad_norm": 4.157956600189209,
+      "learning_rate": 0.00023766297262059973,
+      "loss": 0.4749,
+      "step": 25500
+    },
+    {
+      "epoch": 2.1186440677966103,
+      "grad_norm": 4.1620025634765625,
+      "learning_rate": 0.00023644067796610167,
+      "loss": 0.4858,
+      "step": 26000
+    },
+    {
+      "epoch": 2.159387222946545,
+      "grad_norm": 3.910991907119751,
+      "learning_rate": 0.00023521838331160363,
+      "loss": 0.4871,
+      "step": 26500
+    },
+    {
+      "epoch": 2.2001303780964796,
+      "grad_norm": 4.1535115242004395,
+      "learning_rate": 0.0002339960886571056,
+      "loss": 0.4824,
+      "step": 27000
+    },
+    {
+      "epoch": 2.2408735332464147,
+      "grad_norm": 5.351711273193359,
+      "learning_rate": 0.00023277379400260753,
+      "loss": 0.4783,
+      "step": 27500
+    },
+    {
+      "epoch": 2.2816166883963493,
+      "grad_norm": 4.426401138305664,
+      "learning_rate": 0.0002315514993481095,
+      "loss": 0.4889,
+      "step": 28000
+    },
+    {
+      "epoch": 2.322359843546284,
+      "grad_norm": 3.755659580230713,
+      "learning_rate": 0.00023032920469361144,
+      "loss": 0.4665,
+      "step": 28500
+    },
+    {
+      "epoch": 2.363102998696219,
+      "grad_norm": 7.201287746429443,
+      "learning_rate": 0.0002291069100391134,
+      "loss": 0.4783,
+      "step": 29000
+    },
+    {
+      "epoch": 2.4038461538461537,
+      "grad_norm": 5.291903018951416,
+      "learning_rate": 0.00022788461538461537,
+      "loss": 0.4766,
+      "step": 29500
+    },
+    {
+      "epoch": 2.444589308996089,
+      "grad_norm": 4.227901935577393,
+      "learning_rate": 0.0002266623207301173,
+      "loss": 0.4813,
+      "step": 30000
+    },
+    {
+      "epoch": 2.4853324641460235,
+      "grad_norm": 6.243842601776123,
+      "learning_rate": 0.00022544002607561927,
+      "loss": 0.4852,
+      "step": 30500
+    },
+    {
+      "epoch": 2.526075619295958,
+      "grad_norm": 5.371899604797363,
+      "learning_rate": 0.0002242177314211212,
+      "loss": 0.489,
+      "step": 31000
+    },
+    {
+      "epoch": 2.5668187744458932,
+      "grad_norm": 5.8773651123046875,
+      "learning_rate": 0.0002229954367666232,
+      "loss": 0.4817,
+      "step": 31500
+    },
+    {
+      "epoch": 2.607561929595828,
+      "grad_norm": 4.5435791015625,
+      "learning_rate": 0.00022177314211212516,
+      "loss": 0.4704,
+      "step": 32000
+    },
+    {
+      "epoch": 2.648305084745763,
+      "grad_norm": 3.9447853565216064,
+      "learning_rate": 0.0002205508474576271,
+      "loss": 0.4771,
+      "step": 32500
+    },
+    {
+      "epoch": 2.6890482398956976,
+      "grad_norm": 4.153613567352295,
+      "learning_rate": 0.00021932855280312906,
+      "loss": 0.4791,
+      "step": 33000
+    },
+    {
+      "epoch": 2.7297913950456323,
+      "grad_norm": 2.882225513458252,
+      "learning_rate": 0.00021810625814863103,
+      "loss": 0.4792,
+      "step": 33500
+    },
+    {
+      "epoch": 2.770534550195567,
+      "grad_norm": 3.0806946754455566,
+      "learning_rate": 0.00021688396349413296,
+      "loss": 0.4855,
+      "step": 34000
+    },
+    {
+      "epoch": 2.811277705345502,
+      "grad_norm": 3.908801317214966,
+      "learning_rate": 0.00021566166883963493,
+      "loss": 0.486,
+      "step": 34500
+    },
+    {
+      "epoch": 2.8520208604954367,
+      "grad_norm": 4.365808010101318,
+      "learning_rate": 0.0002144393741851369,
+      "loss": 0.4645,
+      "step": 35000
+    },
+    {
+      "epoch": 2.8927640156453718,
+      "grad_norm": 6.386261940002441,
+      "learning_rate": 0.00021321707953063883,
+      "loss": 0.4659,
+      "step": 35500
+    },
+    {
+      "epoch": 2.9335071707953064,
+      "grad_norm": 3.9260733127593994,
+      "learning_rate": 0.0002119947848761408,
+      "loss": 0.4775,
+      "step": 36000
+    },
+    {
+      "epoch": 2.974250325945241,
+      "grad_norm": 4.963658809661865,
+      "learning_rate": 0.00021077249022164273,
+      "loss": 0.4683,
+      "step": 36500
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.7927711009979248,
+      "eval_loss": 0.5098682641983032,
+      "eval_runtime": 1.7828,
+      "eval_samples_per_second": 1396.693,
+      "eval_steps_per_second": 175.007,
+      "step": 36816
+    },
+    {
+      "epoch": 3.014993481095176,
+      "grad_norm": 4.122433662414551,
+      "learning_rate": 0.0002095501955671447,
+      "loss": 0.473,
+      "step": 37000
+    },
+    {
+      "epoch": 3.055736636245111,
+      "grad_norm": 4.342956066131592,
+      "learning_rate": 0.00020832790091264666,
+      "loss": 0.4556,
+      "step": 37500
+    },
+    {
+      "epoch": 3.0964797913950455,
+      "grad_norm": 5.436068058013916,
+      "learning_rate": 0.0002071056062581486,
+      "loss": 0.4478,
+      "step": 38000
+    },
+    {
+      "epoch": 3.1372229465449806,
+      "grad_norm": 5.401863098144531,
+      "learning_rate": 0.00020588331160365056,
+      "loss": 0.4584,
+      "step": 38500
+    },
+    {
+      "epoch": 3.1779661016949152,
+      "grad_norm": 4.654440879821777,
+      "learning_rate": 0.0002046610169491525,
+      "loss": 0.4517,
+      "step": 39000
+    },
+    {
+      "epoch": 3.21870925684485,
+      "grad_norm": 7.957165241241455,
+      "learning_rate": 0.00020343872229465447,
+      "loss": 0.4485,
+      "step": 39500
+    },
+    {
+      "epoch": 3.259452411994785,
+      "grad_norm": 6.018537521362305,
+      "learning_rate": 0.00020221642764015643,
+      "loss": 0.46,
+      "step": 40000
+    },
+    {
+      "epoch": 3.3001955671447196,
+      "grad_norm": 2.580153703689575,
+      "learning_rate": 0.0002009941329856584,
+      "loss": 0.4546,
+      "step": 40500
+    },
+    {
+      "epoch": 3.3409387222946547,
+      "grad_norm": 4.212210178375244,
+      "learning_rate": 0.00019977183833116036,
+      "loss": 0.4495,
+      "step": 41000
+    },
+    {
+      "epoch": 3.3816818774445894,
+      "grad_norm": 4.371641159057617,
+      "learning_rate": 0.00019854954367666232,
+      "loss": 0.4592,
+      "step": 41500
+    },
+    {
+      "epoch": 3.422425032594524,
+      "grad_norm": 4.2390007972717285,
+      "learning_rate": 0.00019732724902216426,
+      "loss": 0.4478,
+      "step": 42000
+    },
+    {
+      "epoch": 3.463168187744459,
+      "grad_norm": 4.006130695343018,
+      "learning_rate": 0.00019610495436766623,
+      "loss": 0.4553,
+      "step": 42500
+    },
+    {
+      "epoch": 3.5039113428943938,
+      "grad_norm": 5.3540802001953125,
+      "learning_rate": 0.00019488265971316816,
+      "loss": 0.457,
+      "step": 43000
+    },
+    {
+      "epoch": 3.5446544980443284,
+      "grad_norm": 5.2962212562561035,
+      "learning_rate": 0.00019366036505867013,
+      "loss": 0.4598,
+      "step": 43500
+    },
+    {
+      "epoch": 3.5853976531942635,
+      "grad_norm": 6.111700057983398,
+      "learning_rate": 0.0001924380704041721,
+      "loss": 0.4488,
+      "step": 44000
+    },
+    {
+      "epoch": 3.626140808344198,
+      "grad_norm": 7.8704071044921875,
+      "learning_rate": 0.00019121577574967403,
+      "loss": 0.4517,
+      "step": 44500
+    },
+    {
+      "epoch": 3.666883963494133,
+      "grad_norm": 6.922665596008301,
+      "learning_rate": 0.000189993481095176,
+      "loss": 0.46,
+      "step": 45000
+    },
+    {
+      "epoch": 3.707627118644068,
+      "grad_norm": 5.465078830718994,
+      "learning_rate": 0.00018877118644067796,
+      "loss": 0.4564,
+      "step": 45500
+    },
+    {
+      "epoch": 3.7483702737940026,
+      "grad_norm": 4.081344127655029,
+      "learning_rate": 0.0001875488917861799,
+      "loss": 0.4514,
+      "step": 46000
+    },
+    {
+      "epoch": 3.7891134289439377,
+      "grad_norm": 3.888843059539795,
+      "learning_rate": 0.00018632659713168186,
+      "loss": 0.4468,
+      "step": 46500
+    },
+    {
+      "epoch": 3.8298565840938723,
+      "grad_norm": 6.119438171386719,
+      "learning_rate": 0.0001851043024771838,
+      "loss": 0.4572,
+      "step": 47000
+    },
+    {
+      "epoch": 3.870599739243807,
+      "grad_norm": 3.99289870262146,
+      "learning_rate": 0.00018388200782268576,
+      "loss": 0.4582,
+      "step": 47500
+    },
+    {
+      "epoch": 3.9113428943937416,
+      "grad_norm": 5.044804096221924,
+      "learning_rate": 0.00018265971316818773,
+      "loss": 0.4702,
+      "step": 48000
+    },
+    {
+      "epoch": 3.9520860495436767,
+      "grad_norm": 3.187396764755249,
+      "learning_rate": 0.00018143741851368966,
+      "loss": 0.4435,
+      "step": 48500
+    },
+    {
+      "epoch": 3.9928292046936114,
+      "grad_norm": 5.24553108215332,
+      "learning_rate": 0.00018021512385919163,
+      "loss": 0.4599,
+      "step": 49000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.83253014087677,
+      "eval_loss": 0.4357285499572754,
+      "eval_runtime": 1.7926,
+      "eval_samples_per_second": 1389.018,
+      "eval_steps_per_second": 174.046,
+      "step": 49088
+    },
+    {
+      "epoch": 4.0335723598435465,
+      "grad_norm": 4.205733299255371,
+      "learning_rate": 0.00017899282920469362,
+      "loss": 0.4365,
+      "step": 49500
+    },
+    {
+      "epoch": 4.074315514993481,
+      "grad_norm": 7.088078498840332,
+      "learning_rate": 0.00017777053455019556,
+      "loss": 0.4278,
+      "step": 50000
+    },
+    {
+      "epoch": 4.115058670143416,
+      "grad_norm": 3.6376285552978516,
+      "learning_rate": 0.00017654823989569752,
+      "loss": 0.4338,
+      "step": 50500
+    },
+    {
+      "epoch": 4.15580182529335,
+      "grad_norm": 4.326455116271973,
+      "learning_rate": 0.00017532594524119946,
+      "loss": 0.4268,
+      "step": 51000
+    },
+    {
+      "epoch": 4.196544980443286,
+      "grad_norm": 4.494229316711426,
+      "learning_rate": 0.00017410365058670142,
+      "loss": 0.4296,
+      "step": 51500
+    },
+    {
+      "epoch": 4.237288135593221,
+      "grad_norm": 5.18319034576416,
+      "learning_rate": 0.0001728813559322034,
+      "loss": 0.4326,
+      "step": 52000
+    },
+    {
+      "epoch": 4.278031290743155,
+      "grad_norm": 5.478448867797852,
+      "learning_rate": 0.00017165906127770533,
+      "loss": 0.431,
+      "step": 52500
+    },
+    {
+      "epoch": 4.31877444589309,
+      "grad_norm": 6.5846123695373535,
+      "learning_rate": 0.0001704367666232073,
+      "loss": 0.4402,
+      "step": 53000
+    },
+    {
+      "epoch": 4.3595176010430245,
+      "grad_norm": 4.9706130027771,
+      "learning_rate": 0.00016921447196870926,
+      "loss": 0.4397,
+      "step": 53500
+    },
+    {
+      "epoch": 4.400260756192959,
+      "grad_norm": 3.205167531967163,
+      "learning_rate": 0.0001679921773142112,
+      "loss": 0.4401,
+      "step": 54000
+    },
+    {
+      "epoch": 4.441003911342895,
+      "grad_norm": 5.135682106018066,
+      "learning_rate": 0.00016676988265971316,
+      "loss": 0.4399,
+      "step": 54500
+    },
+    {
+      "epoch": 4.481747066492829,
+      "grad_norm": 4.552931785583496,
+      "learning_rate": 0.0001655475880052151,
+      "loss": 0.4369,
+      "step": 55000
+    },
+    {
+      "epoch": 4.522490221642764,
+      "grad_norm": 5.677199840545654,
+      "learning_rate": 0.00016432529335071706,
+      "loss": 0.4355,
+      "step": 55500
+    },
+    {
+      "epoch": 4.563233376792699,
+      "grad_norm": 2.8678014278411865,
+      "learning_rate": 0.00016310299869621902,
+      "loss": 0.4316,
+      "step": 56000
+    },
+    {
+      "epoch": 4.603976531942633,
+      "grad_norm": 5.2582688331604,
+      "learning_rate": 0.00016188070404172096,
+      "loss": 0.4402,
+      "step": 56500
+    },
+    {
+      "epoch": 4.644719687092568,
+      "grad_norm": 6.264452934265137,
+      "learning_rate": 0.00016065840938722293,
+      "loss": 0.4463,
+      "step": 57000
+    },
+    {
+      "epoch": 4.6854628422425035,
+      "grad_norm": 4.346625804901123,
+      "learning_rate": 0.00015943611473272486,
+      "loss": 0.4408,
+      "step": 57500
+    },
+    {
+      "epoch": 4.726205997392438,
+      "grad_norm": 4.8868727684021,
+      "learning_rate": 0.00015821382007822685,
+      "loss": 0.4391,
+      "step": 58000
+    },
+    {
+      "epoch": 4.766949152542373,
+      "grad_norm": 5.607365608215332,
+      "learning_rate": 0.00015699152542372882,
+      "loss": 0.4394,
+      "step": 58500
+    },
+    {
+      "epoch": 4.8076923076923075,
+      "grad_norm": 4.039600849151611,
+      "learning_rate": 0.00015576923076923076,
+      "loss": 0.4459,
+      "step": 59000
+    },
+    {
+      "epoch": 4.848435462842242,
+      "grad_norm": 4.866095542907715,
+      "learning_rate": 0.00015454693611473272,
+      "loss": 0.4376,
+      "step": 59500
+    },
+    {
+      "epoch": 4.889178617992178,
+      "grad_norm": 5.535662651062012,
+      "learning_rate": 0.00015332464146023469,
+      "loss": 0.4336,
+      "step": 60000
+    },
+    {
+      "epoch": 4.929921773142112,
+      "grad_norm": 6.601640701293945,
+      "learning_rate": 0.00015210234680573662,
+      "loss": 0.4397,
+      "step": 60500
+    },
+    {
+      "epoch": 4.970664928292047,
+      "grad_norm": 5.474071502685547,
+      "learning_rate": 0.0001508800521512386,
+      "loss": 0.4332,
+      "step": 61000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.8401606678962708,
+      "eval_loss": 0.42502373456954956,
+      "eval_runtime": 1.7703,
+      "eval_samples_per_second": 1406.534,
+      "eval_steps_per_second": 176.24,
+      "step": 61360
+    },
+    {
+      "epoch": 5.011408083441982,
+      "grad_norm": 5.534514427185059,
+      "learning_rate": 0.00014965775749674052,
+      "loss": 0.4352,
+      "step": 61500
+    },
+    {
+      "epoch": 5.052151238591916,
+      "grad_norm": 3.912092924118042,
+      "learning_rate": 0.0001484354628422425,
+      "loss": 0.4175,
+      "step": 62000
+    },
+    {
+      "epoch": 5.092894393741851,
+      "grad_norm": 5.1398606300354,
+      "learning_rate": 0.00014721316818774445,
+      "loss": 0.4106,
+      "step": 62500
+    },
+    {
+      "epoch": 5.1336375488917865,
+      "grad_norm": 2.5312321186065674,
+      "learning_rate": 0.0001459908735332464,
+      "loss": 0.419,
+      "step": 63000
+    },
+    {
+      "epoch": 5.174380704041721,
+      "grad_norm": 5.211716175079346,
+      "learning_rate": 0.00014476857887874836,
+      "loss": 0.4237,
+      "step": 63500
+    },
+    {
+      "epoch": 5.215123859191656,
+      "grad_norm": 3.6106162071228027,
+      "learning_rate": 0.00014354628422425032,
+      "loss": 0.4098,
+      "step": 64000
+    },
+    {
+      "epoch": 5.25586701434159,
+      "grad_norm": 4.845898628234863,
+      "learning_rate": 0.00014232398956975226,
+      "loss": 0.4185,
+      "step": 64500
+    },
+    {
+      "epoch": 5.296610169491525,
+      "grad_norm": 5.072364330291748,
+      "learning_rate": 0.00014110169491525422,
+      "loss": 0.4178,
+      "step": 65000
+    },
+    {
+      "epoch": 5.337353324641461,
+      "grad_norm": 4.90826416015625,
+      "learning_rate": 0.0001398794002607562,
+      "loss": 0.4227,
+      "step": 65500
+    },
+    {
+      "epoch": 5.378096479791395,
+      "grad_norm": 4.06459379196167,
+      "learning_rate": 0.00013865710560625815,
+      "loss": 0.4203,
+      "step": 66000
+    },
+    {
+      "epoch": 5.41883963494133,
+      "grad_norm": 5.159337520599365,
+      "learning_rate": 0.0001374348109517601,
+      "loss": 0.4182,
+      "step": 66500
+    },
+    {
+      "epoch": 5.459582790091265,
+      "grad_norm": 7.072538375854492,
+      "learning_rate": 0.00013621251629726205,
+      "loss": 0.4272,
+      "step": 67000
+    },
+    {
+      "epoch": 5.500325945241199,
+      "grad_norm": 3.3313496112823486,
+      "learning_rate": 0.000134990221642764,
+      "loss": 0.4239,
+      "step": 67500
+    },
+    {
+      "epoch": 5.541069100391134,
+      "grad_norm": 3.258646011352539,
+      "learning_rate": 0.00013376792698826596,
+      "loss": 0.4271,
+      "step": 68000
+    },
+    {
+      "epoch": 5.581812255541069,
+      "grad_norm": 5.835603713989258,
+      "learning_rate": 0.00013254563233376792,
+      "loss": 0.4198,
+      "step": 68500
+    },
+    {
+      "epoch": 5.622555410691004,
+      "grad_norm": 7.842447757720947,
+      "learning_rate": 0.00013132333767926986,
+      "loss": 0.4154,
+      "step": 69000
+    },
+    {
+      "epoch": 5.663298565840939,
+      "grad_norm": 3.8904993534088135,
+      "learning_rate": 0.00013010104302477182,
+      "loss": 0.417,
+      "step": 69500
+    },
+    {
+      "epoch": 5.704041720990873,
+      "grad_norm": 5.669804573059082,
+      "learning_rate": 0.00012887874837027379,
+      "loss": 0.4234,
+      "step": 70000
+    },
+    {
+      "epoch": 5.744784876140808,
+      "grad_norm": 5.677347183227539,
+      "learning_rate": 0.00012765645371577575,
+      "loss": 0.4298,
+      "step": 70500
+    },
+    {
+      "epoch": 5.7855280312907436,
+      "grad_norm": 6.545490264892578,
+      "learning_rate": 0.0001264341590612777,
+      "loss": 0.4227,
+      "step": 71000
+    },
+    {
+      "epoch": 5.826271186440678,
+      "grad_norm": 3.859914779663086,
+      "learning_rate": 0.00012521186440677965,
+      "loss": 0.4242,
+      "step": 71500
+    },
+    {
+      "epoch": 5.867014341590613,
+      "grad_norm": 5.925098419189453,
+      "learning_rate": 0.0001239895697522816,
+      "loss": 0.4136,
+      "step": 72000
+    },
+    {
+      "epoch": 5.9077574967405475,
+      "grad_norm": 6.217051029205322,
+      "learning_rate": 0.00012276727509778355,
+      "loss": 0.4176,
+      "step": 72500
+    },
+    {
+      "epoch": 5.948500651890482,
+      "grad_norm": 4.789456367492676,
+      "learning_rate": 0.0001215449804432855,
+      "loss": 0.4213,
+      "step": 73000
+    },
+    {
+      "epoch": 5.989243807040417,
+      "grad_norm": 5.505807399749756,
+      "learning_rate": 0.00012032268578878747,
+      "loss": 0.4159,
+      "step": 73500
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.8333333134651184,
+      "eval_loss": 0.42933139204978943,
+      "eval_runtime": 1.775,
+      "eval_samples_per_second": 1402.807,
+      "eval_steps_per_second": 175.773,
+      "step": 73632
+    },
+    {
+      "epoch": 6.029986962190352,
+      "grad_norm": 6.6407599449157715,
+      "learning_rate": 0.00011910039113428943,
+      "loss": 0.4039,
+      "step": 74000
+    },
+    {
+      "epoch": 6.070730117340287,
+      "grad_norm": 3.8927536010742188,
+      "learning_rate": 0.00011787809647979139,
+      "loss": 0.4017,
+      "step": 74500
+    },
+    {
+      "epoch": 6.111473272490222,
+      "grad_norm": 4.711552143096924,
+      "learning_rate": 0.00011665580182529335,
+      "loss": 0.3975,
+      "step": 75000
+    },
+    {
+      "epoch": 6.152216427640156,
+      "grad_norm": 4.727341175079346,
+      "learning_rate": 0.0001154335071707953,
+      "loss": 0.3996,
+      "step": 75500
+    },
+    {
+      "epoch": 6.192959582790091,
+      "grad_norm": 8.397231101989746,
+      "learning_rate": 0.00011421121251629725,
+      "loss": 0.4016,
+      "step": 76000
+    },
+    {
+      "epoch": 6.2337027379400265,
+      "grad_norm": 5.553770542144775,
+      "learning_rate": 0.0001129889178617992,
+      "loss": 0.4101,
+      "step": 76500
+    },
+    {
+      "epoch": 6.274445893089961,
+      "grad_norm": 4.467512607574463,
+      "learning_rate": 0.00011176662320730115,
+      "loss": 0.4082,
+      "step": 77000
+    },
+    {
+      "epoch": 6.315189048239896,
+      "grad_norm": 5.879927635192871,
+      "learning_rate": 0.00011054432855280312,
+      "loss": 0.3972,
+      "step": 77500
+    },
+    {
+      "epoch": 6.3559322033898304,
+      "grad_norm": 10.20730972290039,
+      "learning_rate": 0.00010932203389830507,
+      "loss": 0.4061,
+      "step": 78000
+    },
+    {
+      "epoch": 6.396675358539765,
+      "grad_norm": 6.049427032470703,
+      "learning_rate": 0.00010809973924380703,
+      "loss": 0.4085,
+      "step": 78500
+    },
+    {
+      "epoch": 6.4374185136897,
+      "grad_norm": 4.389947891235352,
+      "learning_rate": 0.00010687744458930898,
+      "loss": 0.4028,
+      "step": 79000
+    },
+    {
+      "epoch": 6.478161668839635,
+      "grad_norm": 4.622443199157715,
+      "learning_rate": 0.00010565514993481095,
+      "loss": 0.4065,
+      "step": 79500
+    },
+    {
+      "epoch": 6.51890482398957,
+      "grad_norm": 5.906533241271973,
+      "learning_rate": 0.0001044328552803129,
+      "loss": 0.4,
+      "step": 80000
+    },
+    {
+      "epoch": 6.559647979139505,
+      "grad_norm": 5.73863410949707,
+      "learning_rate": 0.00010321056062581485,
+      "loss": 0.4098,
+      "step": 80500
+    },
+    {
+      "epoch": 6.600391134289439,
+      "grad_norm": 7.226048469543457,
+      "learning_rate": 0.0001019882659713168,
+      "loss": 0.4026,
+      "step": 81000
+    },
+    {
+      "epoch": 6.641134289439374,
+      "grad_norm": 4.637491226196289,
+      "learning_rate": 0.00010076597131681877,
+      "loss": 0.4089,
+      "step": 81500
+    },
+    {
+      "epoch": 6.681877444589309,
+      "grad_norm": 5.565640926361084,
+      "learning_rate": 9.954367666232072e-05,
+      "loss": 0.4048,
+      "step": 82000
+    },
+    {
+      "epoch": 6.722620599739244,
+      "grad_norm": 6.45740270614624,
+      "learning_rate": 9.832138200782268e-05,
+      "loss": 0.4033,
+      "step": 82500
+    },
+    {
+      "epoch": 6.763363754889179,
+      "grad_norm": 6.09902811050415,
+      "learning_rate": 9.709908735332463e-05,
+      "loss": 0.4003,
+      "step": 83000
+    },
+    {
+      "epoch": 6.804106910039113,
+      "grad_norm": 6.923085689544678,
+      "learning_rate": 9.58767926988266e-05,
+      "loss": 0.4087,
+      "step": 83500
+    },
+    {
+      "epoch": 6.844850065189048,
+      "grad_norm": 4.642100811004639,
+      "learning_rate": 9.465449804432855e-05,
+      "loss": 0.3934,
+      "step": 84000
+    },
+    {
+      "epoch": 6.885593220338983,
+      "grad_norm": 2.7084505558013916,
+      "learning_rate": 9.34322033898305e-05,
+      "loss": 0.3974,
+      "step": 84500
+    },
+    {
+      "epoch": 6.926336375488918,
+      "grad_norm": 4.693856716156006,
+      "learning_rate": 9.220990873533245e-05,
+      "loss": 0.401,
+      "step": 85000
+    },
+    {
+      "epoch": 6.967079530638853,
+      "grad_norm": 5.789867877960205,
+      "learning_rate": 9.098761408083442e-05,
+      "loss": 0.3916,
+      "step": 85500
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.8381525874137878,
+      "eval_loss": 0.427287757396698,
+      "eval_runtime": 1.7782,
+      "eval_samples_per_second": 1400.285,
+      "eval_steps_per_second": 175.457,
+      "step": 85904
+    },
+    {
+      "epoch": 7.0078226857887875,
+      "grad_norm": 7.188635349273682,
+      "learning_rate": 8.976531942633637e-05,
+      "loss": 0.3956,
+      "step": 86000
+    },
+    {
+      "epoch": 7.048565840938722,
+      "grad_norm": 5.446046352386475,
+      "learning_rate": 8.854302477183832e-05,
+      "loss": 0.3756,
+      "step": 86500
+    },
+    {
+      "epoch": 7.089308996088657,
+      "grad_norm": 7.191761493682861,
+      "learning_rate": 8.732073011734028e-05,
+      "loss": 0.3885,
+      "step": 87000
+    },
+    {
+      "epoch": 7.130052151238592,
+      "grad_norm": 5.897053241729736,
+      "learning_rate": 8.609843546284225e-05,
+      "loss": 0.3825,
+      "step": 87500
+    },
+    {
+      "epoch": 7.170795306388527,
+      "grad_norm": 5.92161226272583,
+      "learning_rate": 8.48761408083442e-05,
+      "loss": 0.3865,
+      "step": 88000
+    },
+    {
+      "epoch": 7.211538461538462,
+      "grad_norm": 5.343398571014404,
+      "learning_rate": 8.365384615384615e-05,
+      "loss": 0.3904,
+      "step": 88500
+    },
+    {
+      "epoch": 7.252281616688396,
+      "grad_norm": 4.425090789794922,
+      "learning_rate": 8.24315514993481e-05,
+      "loss": 0.3899,
+      "step": 89000
+    },
+    {
+      "epoch": 7.293024771838331,
+      "grad_norm": 8.485005378723145,
+      "learning_rate": 8.120925684485006e-05,
+      "loss": 0.3921,
+      "step": 89500
+    },
+    {
+      "epoch": 7.333767926988266,
+      "grad_norm": 4.123074054718018,
+      "learning_rate": 7.998696219035201e-05,
+      "loss": 0.3938,
+      "step": 90000
+    },
+    {
+      "epoch": 7.374511082138201,
+      "grad_norm": 5.574413299560547,
+      "learning_rate": 7.876466753585397e-05,
+      "loss": 0.3912,
+      "step": 90500
+    },
+    {
+      "epoch": 7.415254237288136,
+      "grad_norm": 7.376166820526123,
+      "learning_rate": 7.754237288135592e-05,
+      "loss": 0.3806,
+      "step": 91000
+    },
+    {
+      "epoch": 7.4559973924380705,
+      "grad_norm": 4.350152015686035,
+      "learning_rate": 7.63200782268579e-05,
+      "loss": 0.3927,
+      "step": 91500
+    },
+    {
+      "epoch": 7.496740547588005,
+      "grad_norm": 5.158257484436035,
+      "learning_rate": 7.509778357235985e-05,
+      "loss": 0.3906,
+      "step": 92000
+    },
+    {
+      "epoch": 7.53748370273794,
+      "grad_norm": 4.428380489349365,
+      "learning_rate": 7.387548891786178e-05,
+      "loss": 0.3868,
+      "step": 92500
+    },
+    {
+      "epoch": 7.578226857887875,
+      "grad_norm": 5.342788219451904,
+      "learning_rate": 7.265319426336375e-05,
+      "loss": 0.3929,
+      "step": 93000
+    },
+    {
+      "epoch": 7.61897001303781,
+      "grad_norm": 6.24040412902832,
+      "learning_rate": 7.14308996088657e-05,
+      "loss": 0.3941,
+      "step": 93500
+    },
+    {
+      "epoch": 7.659713168187745,
+      "grad_norm": 9.805680274963379,
+      "learning_rate": 7.020860495436766e-05,
+      "loss": 0.3792,
+      "step": 94000
+    },
+    {
+      "epoch": 7.700456323337679,
+      "grad_norm": 4.802555084228516,
+      "learning_rate": 6.898631029986961e-05,
+      "loss": 0.3931,
+      "step": 94500
+    },
+    {
+      "epoch": 7.741199478487614,
+      "grad_norm": 5.951593399047852,
+      "learning_rate": 6.776401564537158e-05,
+      "loss": 0.3977,
+      "step": 95000
+    },
+    {
+      "epoch": 7.781942633637549,
+      "grad_norm": 6.838685989379883,
+      "learning_rate": 6.654172099087353e-05,
+      "loss": 0.391,
+      "step": 95500
+    },
+    {
+      "epoch": 7.822685788787483,
+      "grad_norm": 6.688655853271484,
+      "learning_rate": 6.531942633637548e-05,
+      "loss": 0.3875,
+      "step": 96000
+    },
+    {
+      "epoch": 7.863428943937419,
+      "grad_norm": 4.0376763343811035,
+      "learning_rate": 6.409713168187743e-05,
+      "loss": 0.3877,
+      "step": 96500
+    },
+    {
+      "epoch": 7.904172099087353,
+      "grad_norm": 6.482248306274414,
+      "learning_rate": 6.28748370273794e-05,
+      "loss": 0.3843,
+      "step": 97000
+    },
+    {
+      "epoch": 7.944915254237288,
+      "grad_norm": 5.90971040725708,
+      "learning_rate": 6.165254237288135e-05,
+      "loss": 0.3937,
+      "step": 97500
+    },
+    {
+      "epoch": 7.985658409387223,
+      "grad_norm": 5.311962127685547,
+      "learning_rate": 6.0430247718383304e-05,
+      "loss": 0.3895,
+      "step": 98000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.8409638404846191,
+      "eval_loss": 0.42223989963531494,
+      "eval_runtime": 1.7761,
+      "eval_samples_per_second": 1401.969,
+      "eval_steps_per_second": 175.668,
+      "step": 98176
+    },
+    {
+      "epoch": 8.026401564537158,
+      "grad_norm": 5.411905288696289,
+      "learning_rate": 5.920795306388526e-05,
+      "loss": 0.3813,
+      "step": 98500
+    },
+    {
+      "epoch": 8.067144719687093,
+      "grad_norm": 3.557790994644165,
+      "learning_rate": 5.798565840938721e-05,
+      "loss": 0.3684,
+      "step": 99000
+    },
+    {
+      "epoch": 8.107887874837028,
+      "grad_norm": 4.9928131103515625,
+      "learning_rate": 5.676336375488918e-05,
+      "loss": 0.3706,
+      "step": 99500
+    },
+    {
+      "epoch": 8.148631029986962,
+      "grad_norm": 5.919467926025391,
+      "learning_rate": 5.554106910039113e-05,
+      "loss": 0.3812,
+      "step": 100000
+    },
+    {
+      "epoch": 8.189374185136897,
+      "grad_norm": 8.409658432006836,
+      "learning_rate": 5.4318774445893086e-05,
+      "loss": 0.3731,
+      "step": 100500
+    },
+    {
+      "epoch": 8.230117340286832,
+      "grad_norm": 4.893514633178711,
+      "learning_rate": 5.309647979139504e-05,
+      "loss": 0.3744,
+      "step": 101000
+    },
+    {
+      "epoch": 8.270860495436766,
+      "grad_norm": 5.366138458251953,
+      "learning_rate": 5.1874185136897e-05,
+      "loss": 0.3813,
+      "step": 101500
+    },
+    {
+      "epoch": 8.3116036505867,
+      "grad_norm": 4.139840602874756,
+      "learning_rate": 5.065189048239895e-05,
+      "loss": 0.3674,
+      "step": 102000
+    },
+    {
+      "epoch": 8.352346805736635,
+      "grad_norm": 3.822544813156128,
+      "learning_rate": 4.942959582790091e-05,
+      "loss": 0.3765,
+      "step": 102500
+    },
+    {
+      "epoch": 8.393089960886572,
+      "grad_norm": 3.482146739959717,
+      "learning_rate": 4.820730117340286e-05,
+      "loss": 0.38,
+      "step": 103000
+    },
+    {
+      "epoch": 8.433833116036507,
+      "grad_norm": 5.679060935974121,
+      "learning_rate": 4.698500651890482e-05,
+      "loss": 0.3725,
+      "step": 103500
+    },
+    {
+      "epoch": 8.474576271186441,
+      "grad_norm": 5.182998180389404,
+      "learning_rate": 4.576271186440678e-05,
+      "loss": 0.3741,
+      "step": 104000
+    },
+    {
+      "epoch": 8.515319426336376,
+      "grad_norm": 6.0669941902160645,
+      "learning_rate": 4.4540417209908735e-05,
+      "loss": 0.3834,
+      "step": 104500
+    },
+    {
+      "epoch": 8.55606258148631,
+      "grad_norm": 6.93448543548584,
+      "learning_rate": 4.3318122555410686e-05,
+      "loss": 0.3735,
+      "step": 105000
+    },
+    {
+      "epoch": 8.596805736636245,
+      "grad_norm": 6.46242094039917,
+      "learning_rate": 4.2095827900912643e-05,
+      "loss": 0.3719,
+      "step": 105500
+    },
+    {
+      "epoch": 8.63754889178618,
+      "grad_norm": 6.7451019287109375,
+      "learning_rate": 4.08735332464146e-05,
+      "loss": 0.3687,
+      "step": 106000
+    },
+    {
+      "epoch": 8.678292046936114,
+      "grad_norm": 6.002355575561523,
+      "learning_rate": 3.965123859191656e-05,
+      "loss": 0.3857,
+      "step": 106500
+    },
+    {
+      "epoch": 8.719035202086049,
+      "grad_norm": 3.5218327045440674,
+      "learning_rate": 3.842894393741851e-05,
+      "loss": 0.3795,
+      "step": 107000
+    },
+    {
+      "epoch": 8.759778357235984,
+      "grad_norm": 6.2277703285217285,
+      "learning_rate": 3.720664928292047e-05,
+      "loss": 0.3662,
+      "step": 107500
+    },
+    {
+      "epoch": 8.800521512385918,
+      "grad_norm": 5.253796100616455,
+      "learning_rate": 3.5984354628422425e-05,
+      "loss": 0.3806,
+      "step": 108000
+    },
+    {
+      "epoch": 8.841264667535853,
+      "grad_norm": 2.701765775680542,
+      "learning_rate": 3.4762059973924376e-05,
+      "loss": 0.37,
+      "step": 108500
+    },
+    {
+      "epoch": 8.88200782268579,
+      "grad_norm": 5.994917392730713,
+      "learning_rate": 3.3539765319426334e-05,
+      "loss": 0.3709,
+      "step": 109000
+    },
+    {
+      "epoch": 8.922750977835724,
+      "grad_norm": 4.478329658508301,
+      "learning_rate": 3.2317470664928285e-05,
+      "loss": 0.3831,
+      "step": 109500
+    },
+    {
+      "epoch": 8.963494132985659,
+      "grad_norm": 5.174659729003906,
+      "learning_rate": 3.109517601043025e-05,
+      "loss": 0.3813,
+      "step": 110000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.845381498336792,
+      "eval_loss": 0.4268535077571869,
+      "eval_runtime": 1.775,
+      "eval_samples_per_second": 1402.849,
+      "eval_steps_per_second": 175.779,
+      "step": 110448
+    },
+    {
+      "epoch": 9.004237288135593,
+      "grad_norm": 4.46301794052124,
+      "learning_rate": 2.98728813559322e-05,
+      "loss": 0.3641,
+      "step": 110500
+    },
+    {
+      "epoch": 9.044980443285528,
+      "grad_norm": 7.094146728515625,
+      "learning_rate": 2.8650586701434158e-05,
+      "loss": 0.3654,
+      "step": 111000
+    },
+    {
+      "epoch": 9.085723598435463,
+      "grad_norm": 9.200092315673828,
+      "learning_rate": 2.7428292046936113e-05,
+      "loss": 0.3586,
+      "step": 111500
+    },
+    {
+      "epoch": 9.126466753585397,
+      "grad_norm": 8.71438980102539,
+      "learning_rate": 2.6205997392438067e-05,
+      "loss": 0.3749,
+      "step": 112000
+    },
+    {
+      "epoch": 9.167209908735332,
+      "grad_norm": 4.38647985458374,
+      "learning_rate": 2.4983702737940025e-05,
+      "loss": 0.3672,
+      "step": 112500
+    },
+    {
+      "epoch": 9.207953063885267,
+      "grad_norm": 4.057114124298096,
+      "learning_rate": 2.376140808344198e-05,
+      "loss": 0.3621,
+      "step": 113000
+    },
+    {
+      "epoch": 9.248696219035201,
+      "grad_norm": 7.707796096801758,
+      "learning_rate": 2.2539113428943937e-05,
+      "loss": 0.3659,
+      "step": 113500
+    },
+    {
+      "epoch": 9.289439374185136,
+      "grad_norm": 6.152095317840576,
+      "learning_rate": 2.131681877444589e-05,
+      "loss": 0.3645,
+      "step": 114000
+    },
+    {
+      "epoch": 9.330182529335072,
+      "grad_norm": 3.31390643119812,
+      "learning_rate": 2.009452411994785e-05,
+      "loss": 0.3765,
+      "step": 114500
+    },
+    {
+      "epoch": 9.370925684485007,
+      "grad_norm": 4.196287155151367,
+      "learning_rate": 1.8872229465449803e-05,
+      "loss": 0.363,
+      "step": 115000
+    },
+    {
+      "epoch": 9.411668839634942,
+      "grad_norm": 5.799023628234863,
+      "learning_rate": 1.7649934810951758e-05,
+      "loss": 0.3625,
+      "step": 115500
+    },
+    {
+      "epoch": 9.452411994784876,
+      "grad_norm": 2.445194721221924,
+      "learning_rate": 1.6427640156453715e-05,
+      "loss": 0.3656,
+      "step": 116000
+    },
+    {
+      "epoch": 9.493155149934811,
+      "grad_norm": 6.226719856262207,
+      "learning_rate": 1.520534550195567e-05,
+      "loss": 0.369,
+      "step": 116500
+    },
+    {
+      "epoch": 9.533898305084746,
+      "grad_norm": 5.221319675445557,
+      "learning_rate": 1.3983050847457626e-05,
+      "loss": 0.3691,
+      "step": 117000
+    },
+    {
+      "epoch": 9.57464146023468,
+      "grad_norm": 7.645327091217041,
+      "learning_rate": 1.2760756192959582e-05,
+      "loss": 0.3584,
+      "step": 117500
+    },
+    {
+      "epoch": 9.615384615384615,
+      "grad_norm": 5.081396102905273,
+      "learning_rate": 1.1538461538461538e-05,
+      "loss": 0.359,
+      "step": 118000
+    },
+    {
+      "epoch": 9.65612777053455,
+      "grad_norm": 7.702281951904297,
+      "learning_rate": 1.0316166883963494e-05,
+      "loss": 0.3543,
+      "step": 118500
+    },
+    {
+      "epoch": 9.696870925684484,
+      "grad_norm": 5.906215190887451,
+      "learning_rate": 9.093872229465448e-06,
+      "loss": 0.3679,
+      "step": 119000
+    },
+    {
+      "epoch": 9.737614080834419,
+      "grad_norm": 3.9540443420410156,
+      "learning_rate": 7.871577574967404e-06,
+      "loss": 0.3629,
+      "step": 119500
+    },
+    {
+      "epoch": 9.778357235984355,
+      "grad_norm": 4.546215057373047,
+      "learning_rate": 6.649282920469361e-06,
+      "loss": 0.3541,
+      "step": 120000
+    },
+    {
+      "epoch": 9.81910039113429,
+      "grad_norm": 3.7983944416046143,
+      "learning_rate": 5.426988265971316e-06,
+      "loss": 0.3571,
+      "step": 120500
+    },
+    {
+      "epoch": 9.859843546284225,
+      "grad_norm": 5.201981544494629,
+      "learning_rate": 4.2046936114732716e-06,
+      "loss": 0.3719,
+      "step": 121000
+    },
+    {
+      "epoch": 9.90058670143416,
+      "grad_norm": 5.931910991668701,
+      "learning_rate": 2.982398956975228e-06,
+      "loss": 0.3635,
+      "step": 121500
+    },
+    {
+      "epoch": 9.941329856584094,
+      "grad_norm": 7.897836208343506,
+      "learning_rate": 1.7601043024771837e-06,
+      "loss": 0.3625,
+      "step": 122000
+    },
+    {
+      "epoch": 9.982073011734029,
+      "grad_norm": 6.08315896987915,
+      "learning_rate": 5.378096479791394e-07,
+      "loss": 0.3603,
+      "step": 122500
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.8449799418449402,
+      "eval_loss": 0.4318901598453522,
+      "eval_runtime": 1.8054,
+      "eval_samples_per_second": 1379.208,
+      "eval_steps_per_second": 172.816,
+      "step": 122720
+    },
+    {
+      "epoch": 10.0,
+      "step": 122720,
+      "total_flos": 2.6187994846190592e+17,
+      "train_loss": 0.44300876322750005,
+      "train_runtime": 3819.0977,
+      "train_samples_per_second": 1028.259,
+      "train_steps_per_second": 32.133
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 122720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.6187994846190592e+17,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}