🍻 cheers

Browse files

Files changed (6) hide show

README.md +6 -5
all_results.json +13 -0
eval_results.json +8 -0
runs/May09_14-30-54_ed3f40b4fac8/events.out.tfevents.1715268878.ed3f40b4fac8.34.1 +3 -0
train_results.json +8 -0
trainer_state.json +3727 -0

README.md CHANGED Viewed

@@ -2,6 +2,7 @@
 license: apache-2.0
 base_model: microsoft/swinv2-tiny-patch4-window8-256
 tags:
 - generated_from_trainer
 datasets:
 - imagefolder
@@ -14,7 +15,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: imagefolder
       type: imagefolder
       config: default
       split: train
@@ -22,7 +23,7 @@ model-index:
     metrics:
     - name: Accuracy
       type: accuracy
-      value: 0.9415760869565217
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
 # microsoft_swinv2-tiny-patch4-window8-256-batch_16_epoch_4_classes_24_final_withAug
-This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the imagefolder dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2491
-- Accuracy: 0.9416
 ## Model description

 license: apache-2.0
 base_model: microsoft/swinv2-tiny-patch4-window8-256
 tags:
+- image-classification
 - generated_from_trainer
 datasets:
 - imagefolder
       name: Image Classification
       type: image-classification
     dataset:
+      name: bengali_food_images
       type: imagefolder
       config: default
       split: train
     metrics:
     - name: Accuracy
       type: accuracy
+      value: 0.9456521739130435
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # microsoft_swinv2-tiny-patch4-window8-256-batch_16_epoch_4_classes_24_final_withAug
+This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the bengali_food_images dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.2321
+- Accuracy: 0.9457
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9456521739130435,
+    "eval_loss": 0.2321375608444214,
+    "eval_runtime": 21.9373,
+    "eval_samples_per_second": 33.55,
+    "eval_steps_per_second": 4.194,
+    "total_flos": 1.870424802038661e+18,
+    "train_loss": 0.5094418521630693,
+    "train_runtime": 3760.9881,
+    "train_samples_per_second": 19.953,
+    "train_steps_per_second": 1.248
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9456521739130435,
+    "eval_loss": 0.2321375608444214,
+    "eval_runtime": 21.9373,
+    "eval_samples_per_second": 33.55,
+    "eval_steps_per_second": 4.194
+}

runs/May09_14-30-54_ed3f40b4fac8/events.out.tfevents.1715268878.ed3f40b4fac8.34.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc912e12f32a3672091165624bbf81548153c987aa1d6d9682bea375879ef575
+size 411

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "total_flos": 1.870424802038661e+18,
+    "train_loss": 0.5094418521630693,
+    "train_runtime": 3760.9881,
+    "train_samples_per_second": 19.953,
+    "train_steps_per_second": 1.248
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3727 @@

+{
+  "best_metric": 0.2321375608444214,
+  "best_model_checkpoint": "/kaggle/working/Model/microsoft_swinv2-tiny-patch4-window8-256-batch_16_epoch_4_classes_24_final_withAug/checkpoint-4400",
+  "epoch": 4.0,
+  "eval_steps": 100,
+  "global_step": 4692,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0002,
+      "loss": 3.2461,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.0002,
+      "loss": 3.2031,
+      "step": 20
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 5.058732509613037,
+      "learning_rate": 0.0001999147485080989,
+      "loss": 3.2604,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.7387874126434326,
+      "learning_rate": 0.00019948849104859336,
+      "loss": 3.1794,
+      "step": 40
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.7913503646850586,
+      "learning_rate": 0.0001990622335890878,
+      "loss": 2.9532,
+      "step": 50
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 4.803112030029297,
+      "learning_rate": 0.00019867860187553282,
+      "loss": 2.692,
+      "step": 60
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 5.479024887084961,
+      "learning_rate": 0.0001982523444160273,
+      "loss": 2.4673,
+      "step": 70
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 5.984738826751709,
+      "learning_rate": 0.00019782608695652175,
+      "loss": 2.2248,
+      "step": 80
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 8.306194305419922,
+      "learning_rate": 0.0001973998294970162,
+      "loss": 1.9695,
+      "step": 90
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 6.3569512367248535,
+      "learning_rate": 0.00019697357203751065,
+      "loss": 1.7162,
+      "step": 100
+    },
+    {
+      "epoch": 0.09,
+      "eval_accuracy": 0.7078804347826086,
+      "eval_loss": 1.4225009679794312,
+      "eval_runtime": 26.8467,
+      "eval_samples_per_second": 27.415,
+      "eval_steps_per_second": 3.427,
+      "step": 100
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 6.7053914070129395,
+      "learning_rate": 0.00019654731457800512,
+      "loss": 1.5791,
+      "step": 110
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 6.75339412689209,
+      "learning_rate": 0.00019616368286445014,
+      "loss": 1.5743,
+      "step": 120
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 6.368953704833984,
+      "learning_rate": 0.0001957374254049446,
+      "loss": 1.4221,
+      "step": 130
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 7.507160663604736,
+      "learning_rate": 0.00019531116794543904,
+      "loss": 1.4591,
+      "step": 140
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 6.467655181884766,
+      "learning_rate": 0.00019488491048593351,
+      "loss": 1.5203,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 7.42750883102417,
+      "learning_rate": 0.00019445865302642796,
+      "loss": 1.3534,
+      "step": 160
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 8.539589881896973,
+      "learning_rate": 0.00019403239556692244,
+      "loss": 1.517,
+      "step": 170
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 8.488027572631836,
+      "learning_rate": 0.0001936061381074169,
+      "loss": 1.3041,
+      "step": 180
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 6.249123573303223,
+      "learning_rate": 0.00019317988064791134,
+      "loss": 1.178,
+      "step": 190
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 8.70413589477539,
+      "learning_rate": 0.0001927536231884058,
+      "loss": 1.2286,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "eval_accuracy": 0.7934782608695652,
+      "eval_loss": 0.946118175983429,
+      "eval_runtime": 22.2277,
+      "eval_samples_per_second": 33.112,
+      "eval_steps_per_second": 4.139,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 6.648161888122559,
+      "learning_rate": 0.00019232736572890027,
+      "loss": 1.1328,
+      "step": 210
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 6.3985419273376465,
+      "learning_rate": 0.00019190110826939472,
+      "loss": 1.3468,
+      "step": 220
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 6.855421543121338,
+      "learning_rate": 0.0001914748508098892,
+      "loss": 1.121,
+      "step": 230
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 7.411104202270508,
+      "learning_rate": 0.00019104859335038364,
+      "loss": 1.1668,
+      "step": 240
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 6.608767986297607,
+      "learning_rate": 0.0001906223358908781,
+      "loss": 1.1276,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 7.618008613586426,
+      "learning_rate": 0.00019019607843137254,
+      "loss": 1.0934,
+      "step": 260
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 8.76831340789795,
+      "learning_rate": 0.00018976982097186702,
+      "loss": 1.0502,
+      "step": 270
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 6.6039323806762695,
+      "learning_rate": 0.00018934356351236147,
+      "loss": 0.9731,
+      "step": 280
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 5.2799072265625,
+      "learning_rate": 0.00018891730605285594,
+      "loss": 0.9236,
+      "step": 290
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 8.616353988647461,
+      "learning_rate": 0.0001884910485933504,
+      "loss": 1.0323,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "eval_accuracy": 0.8355978260869565,
+      "eval_loss": 0.7366186380386353,
+      "eval_runtime": 22.3822,
+      "eval_samples_per_second": 32.883,
+      "eval_steps_per_second": 4.11,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 6.827239513397217,
+      "learning_rate": 0.00018806479113384484,
+      "loss": 1.0934,
+      "step": 310
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 7.527755260467529,
+      "learning_rate": 0.0001876385336743393,
+      "loss": 1.0528,
+      "step": 320
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 6.007165431976318,
+      "learning_rate": 0.00018721227621483377,
+      "loss": 0.8891,
+      "step": 330
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 8.186945915222168,
+      "learning_rate": 0.00018678601875532822,
+      "loss": 0.965,
+      "step": 340
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 6.866254806518555,
+      "learning_rate": 0.0001863597612958227,
+      "loss": 0.8948,
+      "step": 350
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 6.580564975738525,
+      "learning_rate": 0.00018593350383631715,
+      "loss": 0.9438,
+      "step": 360
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 5.725220680236816,
+      "learning_rate": 0.0001855072463768116,
+      "loss": 0.8196,
+      "step": 370
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 7.180054664611816,
+      "learning_rate": 0.00018508098891730605,
+      "loss": 0.8645,
+      "step": 380
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 5.196268081665039,
+      "learning_rate": 0.00018469735720375109,
+      "loss": 0.8335,
+      "step": 390
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 5.930848121643066,
+      "learning_rate": 0.00018427109974424554,
+      "loss": 0.8678,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "eval_accuracy": 0.8627717391304348,
+      "eval_loss": 0.6210773587226868,
+      "eval_runtime": 22.1531,
+      "eval_samples_per_second": 33.223,
+      "eval_steps_per_second": 4.153,
+      "step": 400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 7.423695087432861,
+      "learning_rate": 0.00018384484228473998,
+      "loss": 0.7715,
+      "step": 410
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 4.49029016494751,
+      "learning_rate": 0.00018341858482523443,
+      "loss": 0.6917,
+      "step": 420
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 4.553171157836914,
+      "learning_rate": 0.0001829923273657289,
+      "loss": 0.7546,
+      "step": 430
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 5.762471675872803,
+      "learning_rate": 0.00018256606990622336,
+      "loss": 0.876,
+      "step": 440
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 7.234831809997559,
+      "learning_rate": 0.00018213981244671784,
+      "loss": 0.819,
+      "step": 450
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 6.18890380859375,
+      "learning_rate": 0.0001817135549872123,
+      "loss": 0.8027,
+      "step": 460
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 5.732609272003174,
+      "learning_rate": 0.00018128729752770674,
+      "loss": 0.7294,
+      "step": 470
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 6.72756290435791,
+      "learning_rate": 0.0001808610400682012,
+      "loss": 0.8894,
+      "step": 480
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 4.72413444519043,
+      "learning_rate": 0.00018043478260869566,
+      "loss": 0.7052,
+      "step": 490
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 3.8523027896881104,
+      "learning_rate": 0.0001800085251491901,
+      "loss": 0.7849,
+      "step": 500
+    },
+    {
+      "epoch": 0.43,
+      "eval_accuracy": 0.8654891304347826,
+      "eval_loss": 0.5353636741638184,
+      "eval_runtime": 22.0834,
+      "eval_samples_per_second": 33.328,
+      "eval_steps_per_second": 4.166,
+      "step": 500
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 5.818722248077393,
+      "learning_rate": 0.0001795822676896846,
+      "loss": 0.6421,
+      "step": 510
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 7.893730640411377,
+      "learning_rate": 0.00017915601023017904,
+      "loss": 0.7131,
+      "step": 520
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 6.222229957580566,
+      "learning_rate": 0.0001787297527706735,
+      "loss": 0.642,
+      "step": 530
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 6.1911139488220215,
+      "learning_rate": 0.00017830349531116794,
+      "loss": 0.7096,
+      "step": 540
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 7.124819278717041,
+      "learning_rate": 0.00017787723785166242,
+      "loss": 0.6946,
+      "step": 550
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 4.969889163970947,
+      "learning_rate": 0.00017745098039215687,
+      "loss": 0.6995,
+      "step": 560
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 4.456979751586914,
+      "learning_rate": 0.00017702472293265134,
+      "loss": 0.6533,
+      "step": 570
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 7.239711284637451,
+      "learning_rate": 0.0001765984654731458,
+      "loss": 0.755,
+      "step": 580
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 4.915600776672363,
+      "learning_rate": 0.00017617220801364024,
+      "loss": 0.6239,
+      "step": 590
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 4.89735221862793,
+      "learning_rate": 0.0001757459505541347,
+      "loss": 0.7105,
+      "step": 600
+    },
+    {
+      "epoch": 0.51,
+      "eval_accuracy": 0.8899456521739131,
+      "eval_loss": 0.47926023602485657,
+      "eval_runtime": 21.8438,
+      "eval_samples_per_second": 33.694,
+      "eval_steps_per_second": 4.212,
+      "step": 600
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 4.159371376037598,
+      "learning_rate": 0.00017531969309462917,
+      "loss": 0.5948,
+      "step": 610
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 4.996119976043701,
+      "learning_rate": 0.00017489343563512362,
+      "loss": 0.6705,
+      "step": 620
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 5.600012302398682,
+      "learning_rate": 0.0001744671781756181,
+      "loss": 0.6742,
+      "step": 630
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 8.147639274597168,
+      "learning_rate": 0.00017404092071611254,
+      "loss": 0.7171,
+      "step": 640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 4.924251556396484,
+      "learning_rate": 0.000173614663256607,
+      "loss": 0.6655,
+      "step": 650
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 4.968270301818848,
+      "learning_rate": 0.00017318840579710144,
+      "loss": 0.641,
+      "step": 660
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 6.600828647613525,
+      "learning_rate": 0.00017276214833759592,
+      "loss": 0.6549,
+      "step": 670
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 5.408564567565918,
+      "learning_rate": 0.00017233589087809037,
+      "loss": 0.6431,
+      "step": 680
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 6.0379228591918945,
+      "learning_rate": 0.00017190963341858485,
+      "loss": 0.6226,
+      "step": 690
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 6.015142917633057,
+      "learning_rate": 0.0001714833759590793,
+      "loss": 0.6198,
+      "step": 700
+    },
+    {
+      "epoch": 0.6,
+      "eval_accuracy": 0.9089673913043478,
+      "eval_loss": 0.43188050389289856,
+      "eval_runtime": 21.8402,
+      "eval_samples_per_second": 33.699,
+      "eval_steps_per_second": 4.212,
+      "step": 700
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 5.850244045257568,
+      "learning_rate": 0.00017105711849957375,
+      "loss": 0.5817,
+      "step": 710
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 3.7645974159240723,
+      "learning_rate": 0.0001706308610400682,
+      "loss": 0.6146,
+      "step": 720
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 4.481291770935059,
+      "learning_rate": 0.00017020460358056267,
+      "loss": 0.6362,
+      "step": 730
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 3.906785488128662,
+      "learning_rate": 0.00016977834612105712,
+      "loss": 0.603,
+      "step": 740
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 6.198018550872803,
+      "learning_rate": 0.0001693520886615516,
+      "loss": 0.547,
+      "step": 750
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 3.281864881515503,
+      "learning_rate": 0.00016892583120204605,
+      "loss": 0.5979,
+      "step": 760
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 3.2829701900482178,
+      "learning_rate": 0.0001684995737425405,
+      "loss": 0.6047,
+      "step": 770
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 4.538403511047363,
+      "learning_rate": 0.00016807331628303495,
+      "loss": 0.6037,
+      "step": 780
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 3.8807320594787598,
+      "learning_rate": 0.00016764705882352942,
+      "loss": 0.4783,
+      "step": 790
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 5.805025100708008,
+      "learning_rate": 0.00016722080136402387,
+      "loss": 0.6276,
+      "step": 800
+    },
+    {
+      "epoch": 0.68,
+      "eval_accuracy": 0.8980978260869565,
+      "eval_loss": 0.4021691083908081,
+      "eval_runtime": 22.1281,
+      "eval_samples_per_second": 33.261,
+      "eval_steps_per_second": 4.158,
+      "step": 800
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 3.921274185180664,
+      "learning_rate": 0.00016679454390451835,
+      "loss": 0.6383,
+      "step": 810
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 8.53153133392334,
+      "learning_rate": 0.0001663682864450128,
+      "loss": 0.6362,
+      "step": 820
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 3.754373073577881,
+      "learning_rate": 0.00016594202898550725,
+      "loss": 0.502,
+      "step": 830
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 5.654985427856445,
+      "learning_rate": 0.0001655157715260017,
+      "loss": 0.6056,
+      "step": 840
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 4.316379070281982,
+      "learning_rate": 0.00016508951406649618,
+      "loss": 0.5892,
+      "step": 850
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 5.630128860473633,
+      "learning_rate": 0.00016466325660699063,
+      "loss": 0.6279,
+      "step": 860
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 5.5036797523498535,
+      "learning_rate": 0.0001642369991474851,
+      "loss": 0.5664,
+      "step": 870
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 3.7804858684539795,
+      "learning_rate": 0.00016381074168797955,
+      "loss": 0.6377,
+      "step": 880
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 3.757089138031006,
+      "learning_rate": 0.000163384484228474,
+      "loss": 0.6601,
+      "step": 890
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 5.010238170623779,
+      "learning_rate": 0.00016295822676896845,
+      "loss": 0.5411,
+      "step": 900
+    },
+    {
+      "epoch": 0.77,
+      "eval_accuracy": 0.9116847826086957,
+      "eval_loss": 0.38156262040138245,
+      "eval_runtime": 22.137,
+      "eval_samples_per_second": 33.248,
+      "eval_steps_per_second": 4.156,
+      "step": 900
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 4.0843987464904785,
+      "learning_rate": 0.00016253196930946293,
+      "loss": 0.5379,
+      "step": 910
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 2.8297243118286133,
+      "learning_rate": 0.00016210571184995738,
+      "loss": 0.4184,
+      "step": 920
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 5.259457111358643,
+      "learning_rate": 0.00016167945439045185,
+      "loss": 0.4892,
+      "step": 930
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 6.436057090759277,
+      "learning_rate": 0.0001612531969309463,
+      "loss": 0.5708,
+      "step": 940
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 5.9696736335754395,
+      "learning_rate": 0.00016082693947144075,
+      "loss": 0.5395,
+      "step": 950
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 5.614426136016846,
+      "learning_rate": 0.0001604006820119352,
+      "loss": 0.573,
+      "step": 960
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 3.9581282138824463,
+      "learning_rate": 0.00015997442455242968,
+      "loss": 0.5813,
+      "step": 970
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 4.684549808502197,
+      "learning_rate": 0.00015954816709292413,
+      "loss": 0.5533,
+      "step": 980
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 5.443897247314453,
+      "learning_rate": 0.0001591219096334186,
+      "loss": 0.5539,
+      "step": 990
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 4.034463882446289,
+      "learning_rate": 0.00015869565217391306,
+      "loss": 0.4984,
+      "step": 1000
+    },
+    {
+      "epoch": 0.85,
+      "eval_accuracy": 0.9021739130434783,
+      "eval_loss": 0.38239946961402893,
+      "eval_runtime": 21.9989,
+      "eval_samples_per_second": 33.456,
+      "eval_steps_per_second": 4.182,
+      "step": 1000
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 3.4797463417053223,
+      "learning_rate": 0.0001582693947144075,
+      "loss": 0.5063,
+      "step": 1010
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 4.582981586456299,
+      "learning_rate": 0.00015784313725490196,
+      "loss": 0.5875,
+      "step": 1020
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 4.804072856903076,
+      "learning_rate": 0.00015741687979539643,
+      "loss": 0.5107,
+      "step": 1030
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 4.848588943481445,
+      "learning_rate": 0.00015699062233589088,
+      "loss": 0.5255,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 3.0660464763641357,
+      "learning_rate": 0.00015656436487638536,
+      "loss": 0.4437,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 5.267394065856934,
+      "learning_rate": 0.0001561381074168798,
+      "loss": 0.5443,
+      "step": 1060
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 5.676567077636719,
+      "learning_rate": 0.00015571184995737426,
+      "loss": 0.5238,
+      "step": 1070
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 4.262234210968018,
+      "learning_rate": 0.0001552855924978687,
+      "loss": 0.5148,
+      "step": 1080
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 3.7277231216430664,
+      "learning_rate": 0.00015485933503836318,
+      "loss": 0.4952,
+      "step": 1090
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 4.57068395614624,
+      "learning_rate": 0.00015443307757885763,
+      "loss": 0.5665,
+      "step": 1100
+    },
+    {
+      "epoch": 0.94,
+      "eval_accuracy": 0.9211956521739131,
+      "eval_loss": 0.34602978825569153,
+      "eval_runtime": 22.2733,
+      "eval_samples_per_second": 33.044,
+      "eval_steps_per_second": 4.131,
+      "step": 1100
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 3.960221767425537,
+      "learning_rate": 0.0001540068201193521,
+      "loss": 0.4565,
+      "step": 1110
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 3.6928794384002686,
+      "learning_rate": 0.00015358056265984656,
+      "loss": 0.4239,
+      "step": 1120
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 2.560225009918213,
+      "learning_rate": 0.000153154305200341,
+      "loss": 0.5735,
+      "step": 1130
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 4.45659065246582,
+      "learning_rate": 0.00015272804774083546,
+      "loss": 0.4299,
+      "step": 1140
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 5.458113670349121,
+      "learning_rate": 0.00015230179028132994,
+      "loss": 0.4727,
+      "step": 1150
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 4.833206653594971,
+      "learning_rate": 0.0001518755328218244,
+      "loss": 0.4722,
+      "step": 1160
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.604361057281494,
+      "learning_rate": 0.00015144927536231886,
+      "loss": 0.5225,
+      "step": 1170
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.7065069675445557,
+      "learning_rate": 0.0001510230179028133,
+      "loss": 0.4548,
+      "step": 1180
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 4.440161228179932,
+      "learning_rate": 0.00015059676044330776,
+      "loss": 0.5144,
+      "step": 1190
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 4.055997371673584,
+      "learning_rate": 0.0001501705029838022,
+      "loss": 0.5741,
+      "step": 1200
+    },
+    {
+      "epoch": 1.02,
+      "eval_accuracy": 0.9157608695652174,
+      "eval_loss": 0.3336072862148285,
+      "eval_runtime": 21.7333,
+      "eval_samples_per_second": 33.865,
+      "eval_steps_per_second": 4.233,
+      "step": 1200
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 3.851508617401123,
+      "learning_rate": 0.0001497442455242967,
+      "loss": 0.5196,
+      "step": 1210
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 6.436770915985107,
+      "learning_rate": 0.00014931798806479114,
+      "loss": 0.5433,
+      "step": 1220
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 4.73512601852417,
+      "learning_rate": 0.00014889173060528562,
+      "loss": 0.466,
+      "step": 1230
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 6.008419036865234,
+      "learning_rate": 0.00014846547314578007,
+      "loss": 0.4656,
+      "step": 1240
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 4.851034164428711,
+      "learning_rate": 0.00014803921568627451,
+      "loss": 0.4271,
+      "step": 1250
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 4.398035526275635,
+      "learning_rate": 0.00014761295822676896,
+      "loss": 0.4296,
+      "step": 1260
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 4.0159454345703125,
+      "learning_rate": 0.00014718670076726344,
+      "loss": 0.4648,
+      "step": 1270
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 5.387024879455566,
+      "learning_rate": 0.0001467604433077579,
+      "loss": 0.4818,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 4.121237277984619,
+      "learning_rate": 0.00014633418584825237,
+      "loss": 0.4996,
+      "step": 1290
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 2.9251198768615723,
+      "learning_rate": 0.00014590792838874682,
+      "loss": 0.4039,
+      "step": 1300
+    },
+    {
+      "epoch": 1.11,
+      "eval_accuracy": 0.9130434782608695,
+      "eval_loss": 0.32037827372550964,
+      "eval_runtime": 22.2814,
+      "eval_samples_per_second": 33.032,
+      "eval_steps_per_second": 4.129,
+      "step": 1300
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 5.795650482177734,
+      "learning_rate": 0.00014548167092924127,
+      "loss": 0.3762,
+      "step": 1310
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 3.4435791969299316,
+      "learning_rate": 0.00014505541346973572,
+      "loss": 0.4195,
+      "step": 1320
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 5.9583563804626465,
+      "learning_rate": 0.0001446291560102302,
+      "loss": 0.4561,
+      "step": 1330
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 4.873754501342773,
+      "learning_rate": 0.00014420289855072464,
+      "loss": 0.4756,
+      "step": 1340
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 2.2037878036499023,
+      "learning_rate": 0.00014377664109121912,
+      "loss": 0.4494,
+      "step": 1350
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 3.80642032623291,
+      "learning_rate": 0.00014335038363171357,
+      "loss": 0.454,
+      "step": 1360
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 3.6881439685821533,
+      "learning_rate": 0.00014292412617220802,
+      "loss": 0.5275,
+      "step": 1370
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 3.6306328773498535,
+      "learning_rate": 0.00014249786871270247,
+      "loss": 0.3859,
+      "step": 1380
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.9194376468658447,
+      "learning_rate": 0.00014207161125319695,
+      "loss": 0.3732,
+      "step": 1390
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 6.103586673736572,
+      "learning_rate": 0.0001416453537936914,
+      "loss": 0.4347,
+      "step": 1400
+    },
+    {
+      "epoch": 1.19,
+      "eval_accuracy": 0.9307065217391305,
+      "eval_loss": 0.3037649989128113,
+      "eval_runtime": 21.9092,
+      "eval_samples_per_second": 33.593,
+      "eval_steps_per_second": 4.199,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 3.9999098777770996,
+      "learning_rate": 0.00014121909633418587,
+      "loss": 0.33,
+      "step": 1410
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 4.35922384262085,
+      "learning_rate": 0.00014079283887468032,
+      "loss": 0.4913,
+      "step": 1420
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 5.2613019943237305,
+      "learning_rate": 0.00014036658141517477,
+      "loss": 0.4595,
+      "step": 1430
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 4.525338172912598,
+      "learning_rate": 0.00013994032395566922,
+      "loss": 0.4338,
+      "step": 1440
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 3.5422630310058594,
+      "learning_rate": 0.0001395140664961637,
+      "loss": 0.3084,
+      "step": 1450
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 4.331848621368408,
+      "learning_rate": 0.00013908780903665815,
+      "loss": 0.4736,
+      "step": 1460
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 3.7015225887298584,
+      "learning_rate": 0.00013866155157715262,
+      "loss": 0.4691,
+      "step": 1470
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 3.8468117713928223,
+      "learning_rate": 0.00013823529411764707,
+      "loss": 0.3247,
+      "step": 1480
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 4.560232639312744,
+      "learning_rate": 0.00013780903665814152,
+      "loss": 0.4162,
+      "step": 1490
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 4.275567531585693,
+      "learning_rate": 0.00013738277919863597,
+      "loss": 0.3639,
+      "step": 1500
+    },
+    {
+      "epoch": 1.28,
+      "eval_accuracy": 0.9252717391304348,
+      "eval_loss": 0.2954687476158142,
+      "eval_runtime": 21.3813,
+      "eval_samples_per_second": 34.423,
+      "eval_steps_per_second": 4.303,
+      "step": 1500
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 4.1551513671875,
+      "learning_rate": 0.00013695652173913045,
+      "loss": 0.4241,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 3.172189712524414,
+      "learning_rate": 0.0001365302642796249,
+      "loss": 0.5576,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 4.926159858703613,
+      "learning_rate": 0.00013610400682011938,
+      "loss": 0.3687,
+      "step": 1530
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 5.438605785369873,
+      "learning_rate": 0.00013567774936061383,
+      "loss": 0.4823,
+      "step": 1540
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 4.588318347930908,
+      "learning_rate": 0.00013525149190110828,
+      "loss": 0.3816,
+      "step": 1550
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 3.6473512649536133,
+      "learning_rate": 0.00013482523444160273,
+      "loss": 0.4009,
+      "step": 1560
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.983258605003357,
+      "learning_rate": 0.0001343989769820972,
+      "loss": 0.412,
+      "step": 1570
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 4.585762023925781,
+      "learning_rate": 0.00013397271952259165,
+      "loss": 0.4427,
+      "step": 1580
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 3.362663984298706,
+      "learning_rate": 0.00013354646206308613,
+      "loss": 0.3946,
+      "step": 1590
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 2.671499729156494,
+      "learning_rate": 0.00013312020460358058,
+      "loss": 0.4282,
+      "step": 1600
+    },
+    {
+      "epoch": 1.36,
+      "eval_accuracy": 0.9293478260869565,
+      "eval_loss": 0.29482707381248474,
+      "eval_runtime": 22.1321,
+      "eval_samples_per_second": 33.255,
+      "eval_steps_per_second": 4.157,
+      "step": 1600
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 4.811617374420166,
+      "learning_rate": 0.00013269394714407503,
+      "loss": 0.4589,
+      "step": 1610
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 3.9043726921081543,
+      "learning_rate": 0.00013226768968456948,
+      "loss": 0.4281,
+      "step": 1620
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 3.5376808643341064,
+      "learning_rate": 0.00013184143222506395,
+      "loss": 0.3975,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 4.834260940551758,
+      "learning_rate": 0.0001314151747655584,
+      "loss": 0.4094,
+      "step": 1640
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 4.845081806182861,
+      "learning_rate": 0.00013098891730605288,
+      "loss": 0.3806,
+      "step": 1650
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 4.995219707489014,
+      "learning_rate": 0.00013056265984654733,
+      "loss": 0.4762,
+      "step": 1660
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 4.277749061584473,
+      "learning_rate": 0.00013013640238704178,
+      "loss": 0.4923,
+      "step": 1670
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 5.35748815536499,
+      "learning_rate": 0.00012971014492753623,
+      "loss": 0.3878,
+      "step": 1680
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 3.3792693614959717,
+      "learning_rate": 0.0001292838874680307,
+      "loss": 0.369,
+      "step": 1690
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 4.180275917053223,
+      "learning_rate": 0.00012885763000852516,
+      "loss": 0.4375,
+      "step": 1700
+    },
+    {
+      "epoch": 1.45,
+      "eval_accuracy": 0.9211956521739131,
+      "eval_loss": 0.2868107855319977,
+      "eval_runtime": 22.1903,
+      "eval_samples_per_second": 33.168,
+      "eval_steps_per_second": 4.146,
+      "step": 1700
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 2.387194871902466,
+      "learning_rate": 0.00012843137254901963,
+      "loss": 0.4346,
+      "step": 1710
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.87982177734375,
+      "learning_rate": 0.00012800511508951408,
+      "loss": 0.3212,
+      "step": 1720
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 2.5354747772216797,
+      "learning_rate": 0.00012757885763000853,
+      "loss": 0.3389,
+      "step": 1730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 3.975893259048462,
+      "learning_rate": 0.00012715260017050298,
+      "loss": 0.422,
+      "step": 1740
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 5.26716423034668,
+      "learning_rate": 0.00012672634271099746,
+      "loss": 0.356,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 3.5218489170074463,
+      "learning_rate": 0.0001263000852514919,
+      "loss": 0.5009,
+      "step": 1760
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 3.965714931488037,
+      "learning_rate": 0.00012587382779198638,
+      "loss": 0.4176,
+      "step": 1770
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 4.640725612640381,
+      "learning_rate": 0.00012544757033248083,
+      "loss": 0.4312,
+      "step": 1780
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 3.008267879486084,
+      "learning_rate": 0.00012502131287297528,
+      "loss": 0.4443,
+      "step": 1790
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 2.9221649169921875,
+      "learning_rate": 0.00012459505541346973,
+      "loss": 0.3063,
+      "step": 1800
+    },
+    {
+      "epoch": 1.53,
+      "eval_accuracy": 0.9334239130434783,
+      "eval_loss": 0.28607919812202454,
+      "eval_runtime": 21.4966,
+      "eval_samples_per_second": 34.238,
+      "eval_steps_per_second": 4.28,
+      "step": 1800
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 4.676169395446777,
+      "learning_rate": 0.0001241687979539642,
+      "loss": 0.4685,
+      "step": 1810
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 4.044462203979492,
+      "learning_rate": 0.00012374254049445866,
+      "loss": 0.4094,
+      "step": 1820
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 3.891371726989746,
+      "learning_rate": 0.00012331628303495314,
+      "loss": 0.4552,
+      "step": 1830
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 5.24341344833374,
+      "learning_rate": 0.0001228900255754476,
+      "loss": 0.2682,
+      "step": 1840
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 5.291731834411621,
+      "learning_rate": 0.00012246376811594204,
+      "loss": 0.4047,
+      "step": 1850
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 2.1534364223480225,
+      "learning_rate": 0.0001220375106564365,
+      "loss": 0.2985,
+      "step": 1860
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 4.051937580108643,
+      "learning_rate": 0.00012161125319693096,
+      "loss": 0.3872,
+      "step": 1870
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 4.822917461395264,
+      "learning_rate": 0.00012118499573742541,
+      "loss": 0.3532,
+      "step": 1880
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 2.520695209503174,
+      "learning_rate": 0.00012075873827791986,
+      "loss": 0.3878,
+      "step": 1890
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.7465733289718628,
+      "learning_rate": 0.00012033248081841433,
+      "loss": 0.3549,
+      "step": 1900
+    },
+    {
+      "epoch": 1.62,
+      "eval_accuracy": 0.9293478260869565,
+      "eval_loss": 0.28257426619529724,
+      "eval_runtime": 22.1114,
+      "eval_samples_per_second": 33.286,
+      "eval_steps_per_second": 4.161,
+      "step": 1900
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 4.117008686065674,
+      "learning_rate": 0.00011990622335890877,
+      "loss": 0.345,
+      "step": 1910
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 2.7594146728515625,
+      "learning_rate": 0.00011947996589940325,
+      "loss": 0.3515,
+      "step": 1920
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 3.63336181640625,
+      "learning_rate": 0.00011905370843989769,
+      "loss": 0.4509,
+      "step": 1930
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 2.090906858444214,
+      "learning_rate": 0.00011862745098039216,
+      "loss": 0.4249,
+      "step": 1940
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 7.205491542816162,
+      "learning_rate": 0.00011820119352088661,
+      "loss": 0.3791,
+      "step": 1950
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 4.450414657592773,
+      "learning_rate": 0.00011777493606138108,
+      "loss": 0.3818,
+      "step": 1960
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 3.7030208110809326,
+      "learning_rate": 0.00011734867860187553,
+      "loss": 0.3881,
+      "step": 1970
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 3.2162046432495117,
+      "learning_rate": 0.00011692242114237,
+      "loss": 0.4438,
+      "step": 1980
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 3.964151382446289,
+      "learning_rate": 0.00011649616368286444,
+      "loss": 0.3491,
+      "step": 1990
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 6.592412948608398,
+      "learning_rate": 0.00011606990622335892,
+      "loss": 0.4326,
+      "step": 2000
+    },
+    {
+      "epoch": 1.71,
+      "eval_accuracy": 0.9347826086956522,
+      "eval_loss": 0.26979920268058777,
+      "eval_runtime": 21.9026,
+      "eval_samples_per_second": 33.603,
+      "eval_steps_per_second": 4.2,
+      "step": 2000
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 3.8649954795837402,
+      "learning_rate": 0.00011564364876385337,
+      "loss": 0.4652,
+      "step": 2010
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 4.012170791625977,
+      "learning_rate": 0.00011521739130434783,
+      "loss": 0.4145,
+      "step": 2020
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 3.8874316215515137,
+      "learning_rate": 0.00011479113384484228,
+      "loss": 0.3241,
+      "step": 2030
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 3.7281954288482666,
+      "learning_rate": 0.00011436487638533676,
+      "loss": 0.3844,
+      "step": 2040
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 3.496342182159424,
+      "learning_rate": 0.00011393861892583119,
+      "loss": 0.4773,
+      "step": 2050
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 2.408202886581421,
+      "learning_rate": 0.00011351236146632567,
+      "loss": 0.44,
+      "step": 2060
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.211472749710083,
+      "learning_rate": 0.00011308610400682012,
+      "loss": 0.2852,
+      "step": 2070
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 3.826094627380371,
+      "learning_rate": 0.00011265984654731458,
+      "loss": 0.4446,
+      "step": 2080
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 3.4480748176574707,
+      "learning_rate": 0.00011223358908780903,
+      "loss": 0.4111,
+      "step": 2090
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 4.725152492523193,
+      "learning_rate": 0.00011180733162830351,
+      "loss": 0.3697,
+      "step": 2100
+    },
+    {
+      "epoch": 1.79,
+      "eval_accuracy": 0.9279891304347826,
+      "eval_loss": 0.2602107524871826,
+      "eval_runtime": 21.9521,
+      "eval_samples_per_second": 33.528,
+      "eval_steps_per_second": 4.191,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 3.1105542182922363,
+      "learning_rate": 0.00011138107416879794,
+      "loss": 0.3935,
+      "step": 2110
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 4.203391075134277,
+      "learning_rate": 0.00011095481670929242,
+      "loss": 0.4265,
+      "step": 2120
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 3.371384382247925,
+      "learning_rate": 0.00011052855924978687,
+      "loss": 0.3914,
+      "step": 2130
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 4.558941841125488,
+      "learning_rate": 0.00011010230179028133,
+      "loss": 0.4196,
+      "step": 2140
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 2.8182742595672607,
+      "learning_rate": 0.00010967604433077578,
+      "loss": 0.3749,
+      "step": 2150
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 5.837876319885254,
+      "learning_rate": 0.00010924978687127026,
+      "loss": 0.3435,
+      "step": 2160
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 4.261623859405518,
+      "learning_rate": 0.0001088235294117647,
+      "loss": 0.377,
+      "step": 2170
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 3.3105015754699707,
+      "learning_rate": 0.00010839727195225917,
+      "loss": 0.3968,
+      "step": 2180
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 4.884303092956543,
+      "learning_rate": 0.00010797101449275362,
+      "loss": 0.3476,
+      "step": 2190
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.950490951538086,
+      "learning_rate": 0.00010754475703324809,
+      "loss": 0.3155,
+      "step": 2200
+    },
+    {
+      "epoch": 1.88,
+      "eval_accuracy": 0.936141304347826,
+      "eval_loss": 0.2523466646671295,
+      "eval_runtime": 21.8751,
+      "eval_samples_per_second": 33.646,
+      "eval_steps_per_second": 4.206,
+      "step": 2200
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.758512258529663,
+      "learning_rate": 0.00010711849957374254,
+      "loss": 0.2808,
+      "step": 2210
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 3.4424259662628174,
+      "learning_rate": 0.00010669224211423701,
+      "loss": 0.3434,
+      "step": 2220
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 3.334716558456421,
+      "learning_rate": 0.00010626598465473145,
+      "loss": 0.4246,
+      "step": 2230
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 3.309183120727539,
+      "learning_rate": 0.00010583972719522593,
+      "loss": 0.3535,
+      "step": 2240
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 3.0825111865997314,
+      "learning_rate": 0.00010541346973572037,
+      "loss": 0.4027,
+      "step": 2250
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 4.07969856262207,
+      "learning_rate": 0.00010498721227621484,
+      "loss": 0.4758,
+      "step": 2260
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 3.2373547554016113,
+      "learning_rate": 0.00010456095481670929,
+      "loss": 0.4168,
+      "step": 2270
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 3.9943110942840576,
+      "learning_rate": 0.00010413469735720376,
+      "loss": 0.3319,
+      "step": 2280
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 4.852569103240967,
+      "learning_rate": 0.0001037084398976982,
+      "loss": 0.3544,
+      "step": 2290
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 2.613586902618408,
+      "learning_rate": 0.00010328218243819268,
+      "loss": 0.3348,
+      "step": 2300
+    },
+    {
+      "epoch": 1.96,
+      "eval_accuracy": 0.9470108695652174,
+      "eval_loss": 0.2506195604801178,
+      "eval_runtime": 22.0934,
+      "eval_samples_per_second": 33.313,
+      "eval_steps_per_second": 4.164,
+      "step": 2300
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 3.5085036754608154,
+      "learning_rate": 0.00010285592497868713,
+      "loss": 0.3908,
+      "step": 2310
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 2.5577197074890137,
+      "learning_rate": 0.00010242966751918159,
+      "loss": 0.3995,
+      "step": 2320
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 3.079270124435425,
+      "learning_rate": 0.00010200341005967604,
+      "loss": 0.2755,
+      "step": 2330
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 2.5068159103393555,
+      "learning_rate": 0.00010157715260017052,
+      "loss": 0.4502,
+      "step": 2340
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 4.730781555175781,
+      "learning_rate": 0.00010115089514066495,
+      "loss": 0.3456,
+      "step": 2350
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 3.066279888153076,
+      "learning_rate": 0.00010072463768115943,
+      "loss": 0.3106,
+      "step": 2360
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 5.494949817657471,
+      "learning_rate": 0.00010029838022165388,
+      "loss": 0.4091,
+      "step": 2370
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 4.616675853729248,
+      "learning_rate": 9.987212276214834e-05,
+      "loss": 0.349,
+      "step": 2380
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 3.0624680519104004,
+      "learning_rate": 9.94458653026428e-05,
+      "loss": 0.3441,
+      "step": 2390
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 3.0316741466522217,
+      "learning_rate": 9.901960784313727e-05,
+      "loss": 0.3854,
+      "step": 2400
+    },
+    {
+      "epoch": 2.05,
+      "eval_accuracy": 0.9320652173913043,
+      "eval_loss": 0.25646623969078064,
+      "eval_runtime": 21.873,
+      "eval_samples_per_second": 33.649,
+      "eval_steps_per_second": 4.206,
+      "step": 2400
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 3.854029893875122,
+      "learning_rate": 9.859335038363172e-05,
+      "loss": 0.3514,
+      "step": 2410
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 4.106563568115234,
+      "learning_rate": 9.816709292412618e-05,
+      "loss": 0.3358,
+      "step": 2420
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 3.2283833026885986,
+      "learning_rate": 9.774083546462064e-05,
+      "loss": 0.4073,
+      "step": 2430
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 3.7362446784973145,
+      "learning_rate": 9.73145780051151e-05,
+      "loss": 0.3858,
+      "step": 2440
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 2.262639284133911,
+      "learning_rate": 9.688832054560956e-05,
+      "loss": 0.3603,
+      "step": 2450
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 3.2846786975860596,
+      "learning_rate": 9.646206308610402e-05,
+      "loss": 0.3249,
+      "step": 2460
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 3.0911221504211426,
+      "learning_rate": 9.603580562659847e-05,
+      "loss": 0.3276,
+      "step": 2470
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 3.0129778385162354,
+      "learning_rate": 9.560954816709293e-05,
+      "loss": 0.3321,
+      "step": 2480
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.167391061782837,
+      "learning_rate": 9.51832907075874e-05,
+      "loss": 0.3583,
+      "step": 2490
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 3.6704165935516357,
+      "learning_rate": 9.475703324808185e-05,
+      "loss": 0.3951,
+      "step": 2500
+    },
+    {
+      "epoch": 2.13,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.24821878969669342,
+      "eval_runtime": 22.0497,
+      "eval_samples_per_second": 33.379,
+      "eval_steps_per_second": 4.172,
+      "step": 2500
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 4.554658889770508,
+      "learning_rate": 9.433077578857631e-05,
+      "loss": 0.4089,
+      "step": 2510
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.6168781518936157,
+      "learning_rate": 9.390451832907077e-05,
+      "loss": 0.381,
+      "step": 2520
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 3.4385123252868652,
+      "learning_rate": 9.347826086956522e-05,
+      "loss": 0.3826,
+      "step": 2530
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 2.3573708534240723,
+      "learning_rate": 9.305200341005969e-05,
+      "loss": 0.4197,
+      "step": 2540
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 3.4988811016082764,
+      "learning_rate": 9.262574595055415e-05,
+      "loss": 0.3862,
+      "step": 2550
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 2.5783936977386475,
+      "learning_rate": 9.21994884910486e-05,
+      "loss": 0.4243,
+      "step": 2560
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 4.15625,
+      "learning_rate": 9.177323103154306e-05,
+      "loss": 0.4088,
+      "step": 2570
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.7744431495666504,
+      "learning_rate": 9.134697357203753e-05,
+      "loss": 0.2933,
+      "step": 2580
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 3.8392601013183594,
+      "learning_rate": 9.092071611253197e-05,
+      "loss": 0.3433,
+      "step": 2590
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 1.8646149635314941,
+      "learning_rate": 9.049445865302644e-05,
+      "loss": 0.3531,
+      "step": 2600
+    },
+    {
+      "epoch": 2.22,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.24554158747196198,
+      "eval_runtime": 22.0249,
+      "eval_samples_per_second": 33.417,
+      "eval_steps_per_second": 4.177,
+      "step": 2600
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 2.4835286140441895,
+      "learning_rate": 9.00682011935209e-05,
+      "loss": 0.3631,
+      "step": 2610
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 4.589507102966309,
+      "learning_rate": 8.964194373401535e-05,
+      "loss": 0.2747,
+      "step": 2620
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 4.478066444396973,
+      "learning_rate": 8.921568627450981e-05,
+      "loss": 0.352,
+      "step": 2630
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 3.647719383239746,
+      "learning_rate": 8.878942881500428e-05,
+      "loss": 0.3804,
+      "step": 2640
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 3.516533613204956,
+      "learning_rate": 8.836317135549873e-05,
+      "loss": 0.3542,
+      "step": 2650
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 4.30831241607666,
+      "learning_rate": 8.793691389599319e-05,
+      "loss": 0.4834,
+      "step": 2660
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 3.8890135288238525,
+      "learning_rate": 8.751065643648765e-05,
+      "loss": 0.4297,
+      "step": 2670
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 2.649815559387207,
+      "learning_rate": 8.70843989769821e-05,
+      "loss": 0.4457,
+      "step": 2680
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 2.874537467956543,
+      "learning_rate": 8.665814151747657e-05,
+      "loss": 0.3606,
+      "step": 2690
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 3.6767563819885254,
+      "learning_rate": 8.623188405797103e-05,
+      "loss": 0.3643,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3,
+      "eval_accuracy": 0.9375,
+      "eval_loss": 0.25128769874572754,
+      "eval_runtime": 22.204,
+      "eval_samples_per_second": 33.147,
+      "eval_steps_per_second": 4.143,
+      "step": 2700
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 2.8548221588134766,
+      "learning_rate": 8.580562659846548e-05,
+      "loss": 0.4496,
+      "step": 2710
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 3.32646107673645,
+      "learning_rate": 8.537936913895993e-05,
+      "loss": 0.4037,
+      "step": 2720
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 4.917088508605957,
+      "learning_rate": 8.495311167945439e-05,
+      "loss": 0.3816,
+      "step": 2730
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 4.138692378997803,
+      "learning_rate": 8.452685421994884e-05,
+      "loss": 0.4166,
+      "step": 2740
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 2.5747594833374023,
+      "learning_rate": 8.41005967604433e-05,
+      "loss": 0.3947,
+      "step": 2750
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 3.0434372425079346,
+      "learning_rate": 8.367433930093777e-05,
+      "loss": 0.2987,
+      "step": 2760
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 4.922779083251953,
+      "learning_rate": 8.324808184143222e-05,
+      "loss": 0.3555,
+      "step": 2770
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 3.4687201976776123,
+      "learning_rate": 8.282182438192668e-05,
+      "loss": 0.3274,
+      "step": 2780
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 3.288496494293213,
+      "learning_rate": 8.239556692242114e-05,
+      "loss": 0.3321,
+      "step": 2790
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 3.5918378829956055,
+      "learning_rate": 8.19693094629156e-05,
+      "loss": 0.3393,
+      "step": 2800
+    },
+    {
+      "epoch": 2.39,
+      "eval_accuracy": 0.9429347826086957,
+      "eval_loss": 0.24919526278972626,
+      "eval_runtime": 21.7658,
+      "eval_samples_per_second": 33.814,
+      "eval_steps_per_second": 4.227,
+      "step": 2800
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 2.7565417289733887,
+      "learning_rate": 8.154305200341006e-05,
+      "loss": 0.3414,
+      "step": 2810
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 3.525897979736328,
+      "learning_rate": 8.111679454390452e-05,
+      "loss": 0.4407,
+      "step": 2820
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 2.774212121963501,
+      "learning_rate": 8.069053708439897e-05,
+      "loss": 0.3403,
+      "step": 2830
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 3.12599515914917,
+      "learning_rate": 8.026427962489343e-05,
+      "loss": 0.3312,
+      "step": 2840
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 5.231620788574219,
+      "learning_rate": 7.98380221653879e-05,
+      "loss": 0.3958,
+      "step": 2850
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 3.423818588256836,
+      "learning_rate": 7.941176470588235e-05,
+      "loss": 0.3513,
+      "step": 2860
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 2.218860387802124,
+      "learning_rate": 7.898550724637681e-05,
+      "loss": 0.3747,
+      "step": 2870
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 4.602892875671387,
+      "learning_rate": 7.855924978687127e-05,
+      "loss": 0.4701,
+      "step": 2880
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 3.3462207317352295,
+      "learning_rate": 7.813299232736572e-05,
+      "loss": 0.3967,
+      "step": 2890
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 2.690305233001709,
+      "learning_rate": 7.770673486786019e-05,
+      "loss": 0.3635,
+      "step": 2900
+    },
+    {
+      "epoch": 2.47,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.2394125610589981,
+      "eval_runtime": 21.6739,
+      "eval_samples_per_second": 33.958,
+      "eval_steps_per_second": 4.245,
+      "step": 2900
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 3.6983702182769775,
+      "learning_rate": 7.728047740835465e-05,
+      "loss": 0.3699,
+      "step": 2910
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 3.6911110877990723,
+      "learning_rate": 7.68542199488491e-05,
+      "loss": 0.4104,
+      "step": 2920
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 2.3493428230285645,
+      "learning_rate": 7.642796248934356e-05,
+      "loss": 0.323,
+      "step": 2930
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 3.3958208560943604,
+      "learning_rate": 7.600170502983802e-05,
+      "loss": 0.3458,
+      "step": 2940
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 3.3971476554870605,
+      "learning_rate": 7.557544757033247e-05,
+      "loss": 0.3147,
+      "step": 2950
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 3.6205222606658936,
+      "learning_rate": 7.514919011082694e-05,
+      "loss": 0.4776,
+      "step": 2960
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 2.697317123413086,
+      "learning_rate": 7.47229326513214e-05,
+      "loss": 0.3806,
+      "step": 2970
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 4.160821437835693,
+      "learning_rate": 7.429667519181585e-05,
+      "loss": 0.4149,
+      "step": 2980
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 4.857287406921387,
+      "learning_rate": 7.387041773231031e-05,
+      "loss": 0.2937,
+      "step": 2990
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 3.1837852001190186,
+      "learning_rate": 7.344416027280478e-05,
+      "loss": 0.3624,
+      "step": 3000
+    },
+    {
+      "epoch": 2.56,
+      "eval_accuracy": 0.938858695652174,
+      "eval_loss": 0.24252061545848846,
+      "eval_runtime": 22.0444,
+      "eval_samples_per_second": 33.387,
+      "eval_steps_per_second": 4.173,
+      "step": 3000
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 4.424566745758057,
+      "learning_rate": 7.301790281329923e-05,
+      "loss": 0.2947,
+      "step": 3010
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 4.352055549621582,
+      "learning_rate": 7.259164535379369e-05,
+      "loss": 0.408,
+      "step": 3020
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 4.074793338775635,
+      "learning_rate": 7.216538789428815e-05,
+      "loss": 0.3518,
+      "step": 3030
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 2.518249750137329,
+      "learning_rate": 7.17391304347826e-05,
+      "loss": 0.3168,
+      "step": 3040
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 3.670358180999756,
+      "learning_rate": 7.131287297527707e-05,
+      "loss": 0.3239,
+      "step": 3050
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 3.765688180923462,
+      "learning_rate": 7.088661551577153e-05,
+      "loss": 0.3946,
+      "step": 3060
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 3.1018218994140625,
+      "learning_rate": 7.046035805626598e-05,
+      "loss": 0.3853,
+      "step": 3070
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 1.9965764284133911,
+      "learning_rate": 7.003410059676044e-05,
+      "loss": 0.3506,
+      "step": 3080
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 4.730674743652344,
+      "learning_rate": 6.96078431372549e-05,
+      "loss": 0.3642,
+      "step": 3090
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 2.4963278770446777,
+      "learning_rate": 6.918158567774935e-05,
+      "loss": 0.3608,
+      "step": 3100
+    },
+    {
+      "epoch": 2.64,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.2389669418334961,
+      "eval_runtime": 22.1191,
+      "eval_samples_per_second": 33.274,
+      "eval_steps_per_second": 4.159,
+      "step": 3100
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 2.049652576446533,
+      "learning_rate": 6.875532821824382e-05,
+      "loss": 0.374,
+      "step": 3110
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 3.511080026626587,
+      "learning_rate": 6.832907075873828e-05,
+      "loss": 0.3842,
+      "step": 3120
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 4.446420192718506,
+      "learning_rate": 6.790281329923273e-05,
+      "loss": 0.288,
+      "step": 3130
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 3.2927162647247314,
+      "learning_rate": 6.74765558397272e-05,
+      "loss": 0.2999,
+      "step": 3140
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 3.2927627563476562,
+      "learning_rate": 6.705029838022166e-05,
+      "loss": 0.2792,
+      "step": 3150
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 4.26877498626709,
+      "learning_rate": 6.66240409207161e-05,
+      "loss": 0.3553,
+      "step": 3160
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 2.3921847343444824,
+      "learning_rate": 6.619778346121057e-05,
+      "loss": 0.3472,
+      "step": 3170
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 2.2627205848693848,
+      "learning_rate": 6.577152600170503e-05,
+      "loss": 0.3367,
+      "step": 3180
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 4.409063816070557,
+      "learning_rate": 6.534526854219948e-05,
+      "loss": 0.4007,
+      "step": 3190
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 2.80055570602417,
+      "learning_rate": 6.491901108269395e-05,
+      "loss": 0.3215,
+      "step": 3200
+    },
+    {
+      "epoch": 2.73,
+      "eval_accuracy": 0.9320652173913043,
+      "eval_loss": 0.2482815384864807,
+      "eval_runtime": 22.058,
+      "eval_samples_per_second": 33.367,
+      "eval_steps_per_second": 4.171,
+      "step": 3200
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 2.1410982608795166,
+      "learning_rate": 6.449275362318841e-05,
+      "loss": 0.3288,
+      "step": 3210
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.486505389213562,
+      "learning_rate": 6.406649616368286e-05,
+      "loss": 0.3833,
+      "step": 3220
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 2.9427695274353027,
+      "learning_rate": 6.364023870417732e-05,
+      "loss": 0.3629,
+      "step": 3230
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 6.134848594665527,
+      "learning_rate": 6.321398124467179e-05,
+      "loss": 0.3732,
+      "step": 3240
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 2.851616144180298,
+      "learning_rate": 6.278772378516623e-05,
+      "loss": 0.3866,
+      "step": 3250
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 4.025167465209961,
+      "learning_rate": 6.23614663256607e-05,
+      "loss": 0.3245,
+      "step": 3260
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 6.220050811767578,
+      "learning_rate": 6.193520886615516e-05,
+      "loss": 0.3233,
+      "step": 3270
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 2.960378408432007,
+      "learning_rate": 6.150895140664961e-05,
+      "loss": 0.3752,
+      "step": 3280
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 3.8177359104156494,
+      "learning_rate": 6.108269394714407e-05,
+      "loss": 0.3581,
+      "step": 3290
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 3.3736770153045654,
+      "learning_rate": 6.065643648763854e-05,
+      "loss": 0.2971,
+      "step": 3300
+    },
+    {
+      "epoch": 2.81,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.2455403059720993,
+      "eval_runtime": 22.1061,
+      "eval_samples_per_second": 33.294,
+      "eval_steps_per_second": 4.162,
+      "step": 3300
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 5.297087669372559,
+      "learning_rate": 6.0230179028132994e-05,
+      "loss": 0.3563,
+      "step": 3310
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 2.4093313217163086,
+      "learning_rate": 5.980392156862745e-05,
+      "loss": 0.2741,
+      "step": 3320
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 2.5030243396759033,
+      "learning_rate": 5.9377664109121913e-05,
+      "loss": 0.37,
+      "step": 3330
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 3.5400922298431396,
+      "learning_rate": 5.895140664961637e-05,
+      "loss": 0.3389,
+      "step": 3340
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 2.54128098487854,
+      "learning_rate": 5.8525149190110826e-05,
+      "loss": 0.3459,
+      "step": 3350
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 3.479625701904297,
+      "learning_rate": 5.809889173060529e-05,
+      "loss": 0.4011,
+      "step": 3360
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 2.196369171142578,
+      "learning_rate": 5.7672634271099746e-05,
+      "loss": 0.4212,
+      "step": 3370
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 4.405612468719482,
+      "learning_rate": 5.72463768115942e-05,
+      "loss": 0.4886,
+      "step": 3380
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 4.257590293884277,
+      "learning_rate": 5.6820119352088666e-05,
+      "loss": 0.3434,
+      "step": 3390
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 5.413020610809326,
+      "learning_rate": 5.639386189258312e-05,
+      "loss": 0.3838,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9,
+      "eval_accuracy": 0.9470108695652174,
+      "eval_loss": 0.23631170392036438,
+      "eval_runtime": 22.1897,
+      "eval_samples_per_second": 33.169,
+      "eval_steps_per_second": 4.146,
+      "step": 3400
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 2.7590572834014893,
+      "learning_rate": 5.596760443307758e-05,
+      "loss": 0.3481,
+      "step": 3410
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 3.4297289848327637,
+      "learning_rate": 5.554134697357204e-05,
+      "loss": 0.3478,
+      "step": 3420
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 3.9188895225524902,
+      "learning_rate": 5.51150895140665e-05,
+      "loss": 0.4562,
+      "step": 3430
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 2.648061513900757,
+      "learning_rate": 5.4688832054560955e-05,
+      "loss": 0.2639,
+      "step": 3440
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 3.416527509689331,
+      "learning_rate": 5.426257459505542e-05,
+      "loss": 0.3647,
+      "step": 3450
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 2.152862310409546,
+      "learning_rate": 5.3836317135549874e-05,
+      "loss": 0.3866,
+      "step": 3460
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 3.239790439605713,
+      "learning_rate": 5.341005967604433e-05,
+      "loss": 0.3115,
+      "step": 3470
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 4.865437030792236,
+      "learning_rate": 5.2983802216538794e-05,
+      "loss": 0.4108,
+      "step": 3480
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 2.4955708980560303,
+      "learning_rate": 5.255754475703325e-05,
+      "loss": 0.3349,
+      "step": 3490
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 3.560739755630493,
+      "learning_rate": 5.213128729752771e-05,
+      "loss": 0.3036,
+      "step": 3500
+    },
+    {
+      "epoch": 2.98,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.2422226220369339,
+      "eval_runtime": 22.1146,
+      "eval_samples_per_second": 33.281,
+      "eval_steps_per_second": 4.16,
+      "step": 3500
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 2.6449167728424072,
+      "learning_rate": 5.170502983802217e-05,
+      "loss": 0.3553,
+      "step": 3510
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 3.7996511459350586,
+      "learning_rate": 5.1278772378516626e-05,
+      "loss": 0.2752,
+      "step": 3520
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 2.710324764251709,
+      "learning_rate": 5.085251491901108e-05,
+      "loss": 0.3815,
+      "step": 3530
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 3.8157262802124023,
+      "learning_rate": 5.0426257459505546e-05,
+      "loss": 0.3643,
+      "step": 3540
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 3.8065927028656006,
+      "learning_rate": 5e-05,
+      "loss": 0.3013,
+      "step": 3550
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 3.4787912368774414,
+      "learning_rate": 4.957374254049446e-05,
+      "loss": 0.3826,
+      "step": 3560
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 3.8217809200286865,
+      "learning_rate": 4.914748508098892e-05,
+      "loss": 0.3892,
+      "step": 3570
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 3.754225015640259,
+      "learning_rate": 4.872122762148338e-05,
+      "loss": 0.258,
+      "step": 3580
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 3.32442045211792,
+      "learning_rate": 4.8294970161977835e-05,
+      "loss": 0.2541,
+      "step": 3590
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 3.7520785331726074,
+      "learning_rate": 4.78687127024723e-05,
+      "loss": 0.401,
+      "step": 3600
+    },
+    {
+      "epoch": 3.07,
+      "eval_accuracy": 0.9429347826086957,
+      "eval_loss": 0.23978637158870697,
+      "eval_runtime": 21.9597,
+      "eval_samples_per_second": 33.516,
+      "eval_steps_per_second": 4.189,
+      "step": 3600
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 2.6882004737854004,
+      "learning_rate": 4.7442455242966755e-05,
+      "loss": 0.4118,
+      "step": 3610
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 2.4365506172180176,
+      "learning_rate": 4.701619778346121e-05,
+      "loss": 0.3586,
+      "step": 3620
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 1.2782219648361206,
+      "learning_rate": 4.6589940323955674e-05,
+      "loss": 0.3192,
+      "step": 3630
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 4.13897180557251,
+      "learning_rate": 4.616368286445013e-05,
+      "loss": 0.3101,
+      "step": 3640
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 1.7676732540130615,
+      "learning_rate": 4.573742540494459e-05,
+      "loss": 0.3854,
+      "step": 3650
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 3.285656213760376,
+      "learning_rate": 4.531116794543905e-05,
+      "loss": 0.4299,
+      "step": 3660
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 2.670168876647949,
+      "learning_rate": 4.488491048593351e-05,
+      "loss": 0.2919,
+      "step": 3670
+    },
+    {
+      "epoch": 3.14,
+      "grad_norm": 3.4926791191101074,
+      "learning_rate": 4.445865302642796e-05,
+      "loss": 0.356,
+      "step": 3680
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.534806728363037,
+      "learning_rate": 4.4032395566922426e-05,
+      "loss": 0.3157,
+      "step": 3690
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 2.911130905151367,
+      "learning_rate": 4.360613810741688e-05,
+      "loss": 0.3458,
+      "step": 3700
+    },
+    {
+      "epoch": 3.15,
+      "eval_accuracy": 0.9429347826086957,
+      "eval_loss": 0.2517460286617279,
+      "eval_runtime": 22.0673,
+      "eval_samples_per_second": 33.353,
+      "eval_steps_per_second": 4.169,
+      "step": 3700
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 3.810058116912842,
+      "learning_rate": 4.317988064791134e-05,
+      "loss": 0.351,
+      "step": 3710
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 3.799814462661743,
+      "learning_rate": 4.27536231884058e-05,
+      "loss": 0.4715,
+      "step": 3720
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 2.787813663482666,
+      "learning_rate": 4.232736572890026e-05,
+      "loss": 0.3037,
+      "step": 3730
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 3.3836419582366943,
+      "learning_rate": 4.1901108269394715e-05,
+      "loss": 0.3279,
+      "step": 3740
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 3.981520652770996,
+      "learning_rate": 4.147485080988918e-05,
+      "loss": 0.4078,
+      "step": 3750
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 2.715733051300049,
+      "learning_rate": 4.1048593350383635e-05,
+      "loss": 0.3375,
+      "step": 3760
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 3.2656917572021484,
+      "learning_rate": 4.062233589087809e-05,
+      "loss": 0.3842,
+      "step": 3770
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 4.852041721343994,
+      "learning_rate": 4.0196078431372555e-05,
+      "loss": 0.3119,
+      "step": 3780
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 3.6040267944335938,
+      "learning_rate": 3.976982097186701e-05,
+      "loss": 0.3623,
+      "step": 3790
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 2.597151279449463,
+      "learning_rate": 3.934356351236147e-05,
+      "loss": 0.2908,
+      "step": 3800
+    },
+    {
+      "epoch": 3.24,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.24226675927639008,
+      "eval_runtime": 21.9179,
+      "eval_samples_per_second": 33.58,
+      "eval_steps_per_second": 4.197,
+      "step": 3800
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 4.236294269561768,
+      "learning_rate": 3.891730605285593e-05,
+      "loss": 0.3942,
+      "step": 3810
+    },
+    {
+      "epoch": 3.26,
+      "grad_norm": 1.8580868244171143,
+      "learning_rate": 3.849104859335039e-05,
+      "loss": 0.2885,
+      "step": 3820
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 2.5142822265625,
+      "learning_rate": 3.8064791133844843e-05,
+      "loss": 0.322,
+      "step": 3830
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 3.2371296882629395,
+      "learning_rate": 3.763853367433931e-05,
+      "loss": 0.2543,
+      "step": 3840
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 3.4256720542907715,
+      "learning_rate": 3.721227621483376e-05,
+      "loss": 0.3408,
+      "step": 3850
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 2.5072007179260254,
+      "learning_rate": 3.678601875532822e-05,
+      "loss": 0.3094,
+      "step": 3860
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 3.4915339946746826,
+      "learning_rate": 3.635976129582268e-05,
+      "loss": 0.4899,
+      "step": 3870
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 2.5262367725372314,
+      "learning_rate": 3.593350383631714e-05,
+      "loss": 0.377,
+      "step": 3880
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 2.8197975158691406,
+      "learning_rate": 3.5507246376811596e-05,
+      "loss": 0.4031,
+      "step": 3890
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 1.553293228149414,
+      "learning_rate": 3.508098891730606e-05,
+      "loss": 0.3016,
+      "step": 3900
+    },
+    {
+      "epoch": 3.32,
+      "eval_accuracy": 0.9442934782608695,
+      "eval_loss": 0.24018557369709015,
+      "eval_runtime": 21.8606,
+      "eval_samples_per_second": 33.668,
+      "eval_steps_per_second": 4.208,
+      "step": 3900
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 4.228199481964111,
+      "learning_rate": 3.4654731457800515e-05,
+      "loss": 0.3767,
+      "step": 3910
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 3.0265350341796875,
+      "learning_rate": 3.422847399829497e-05,
+      "loss": 0.3735,
+      "step": 3920
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 5.04302453994751,
+      "learning_rate": 3.3802216538789435e-05,
+      "loss": 0.3223,
+      "step": 3930
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 3.406611919403076,
+      "learning_rate": 3.337595907928389e-05,
+      "loss": 0.3978,
+      "step": 3940
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 3.3483784198760986,
+      "learning_rate": 3.294970161977835e-05,
+      "loss": 0.2903,
+      "step": 3950
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 4.2682905197143555,
+      "learning_rate": 3.252344416027281e-05,
+      "loss": 0.3356,
+      "step": 3960
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 1.5132865905761719,
+      "learning_rate": 3.209718670076726e-05,
+      "loss": 0.3522,
+      "step": 3970
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 2.6350772380828857,
+      "learning_rate": 3.1670929241261724e-05,
+      "loss": 0.3271,
+      "step": 3980
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 2.6944427490234375,
+      "learning_rate": 3.124467178175618e-05,
+      "loss": 0.2867,
+      "step": 3990
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 2.3761823177337646,
+      "learning_rate": 3.081841432225064e-05,
+      "loss": 0.2961,
+      "step": 4000
+    },
+    {
+      "epoch": 3.41,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.2413594275712967,
+      "eval_runtime": 22.1329,
+      "eval_samples_per_second": 33.254,
+      "eval_steps_per_second": 4.157,
+      "step": 4000
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 3.0776336193084717,
+      "learning_rate": 3.0392156862745097e-05,
+      "loss": 0.333,
+      "step": 4010
+    },
+    {
+      "epoch": 3.43,
+      "grad_norm": 2.033477783203125,
+      "learning_rate": 2.9965899403239556e-05,
+      "loss": 0.3263,
+      "step": 4020
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 4.718287467956543,
+      "learning_rate": 2.9539641943734013e-05,
+      "loss": 0.3373,
+      "step": 4030
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 5.81247615814209,
+      "learning_rate": 2.9113384484228473e-05,
+      "loss": 0.3212,
+      "step": 4040
+    },
+    {
+      "epoch": 3.45,
+      "grad_norm": 4.8096723556518555,
+      "learning_rate": 2.8687127024722932e-05,
+      "loss": 0.3458,
+      "step": 4050
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 3.833155632019043,
+      "learning_rate": 2.826086956521739e-05,
+      "loss": 0.3212,
+      "step": 4060
+    },
+    {
+      "epoch": 3.47,
+      "grad_norm": 2.942125082015991,
+      "learning_rate": 2.783461210571185e-05,
+      "loss": 0.2945,
+      "step": 4070
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 5.171932697296143,
+      "learning_rate": 2.740835464620631e-05,
+      "loss": 0.3338,
+      "step": 4080
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 4.480559825897217,
+      "learning_rate": 2.6982097186700765e-05,
+      "loss": 0.3755,
+      "step": 4090
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 3.524714708328247,
+      "learning_rate": 2.6555839727195225e-05,
+      "loss": 0.3822,
+      "step": 4100
+    },
+    {
+      "epoch": 3.5,
+      "eval_accuracy": 0.9415760869565217,
+      "eval_loss": 0.2412695288658142,
+      "eval_runtime": 22.1034,
+      "eval_samples_per_second": 33.298,
+      "eval_steps_per_second": 4.162,
+      "step": 4100
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 2.857635736465454,
+      "learning_rate": 2.6129582267689685e-05,
+      "loss": 0.3709,
+      "step": 4110
+    },
+    {
+      "epoch": 3.51,
+      "grad_norm": 3.15679931640625,
+      "learning_rate": 2.5703324808184144e-05,
+      "loss": 0.2529,
+      "step": 4120
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 2.1586387157440186,
+      "learning_rate": 2.52770673486786e-05,
+      "loss": 0.2946,
+      "step": 4130
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 3.978802442550659,
+      "learning_rate": 2.4850809889173064e-05,
+      "loss": 0.3867,
+      "step": 4140
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 4.345022201538086,
+      "learning_rate": 2.442455242966752e-05,
+      "loss": 0.3664,
+      "step": 4150
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 3.7032830715179443,
+      "learning_rate": 2.399829497016198e-05,
+      "loss": 0.3485,
+      "step": 4160
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 3.59366774559021,
+      "learning_rate": 2.357203751065644e-05,
+      "loss": 0.3255,
+      "step": 4170
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 5.068453311920166,
+      "learning_rate": 2.3145780051150897e-05,
+      "loss": 0.3121,
+      "step": 4180
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 5.033252239227295,
+      "learning_rate": 2.2719522591645353e-05,
+      "loss": 0.368,
+      "step": 4190
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 2.29129958152771,
+      "learning_rate": 2.2293265132139813e-05,
+      "loss": 0.2596,
+      "step": 4200
+    },
+    {
+      "epoch": 3.58,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.23559102416038513,
+      "eval_runtime": 22.0843,
+      "eval_samples_per_second": 33.327,
+      "eval_steps_per_second": 4.166,
+      "step": 4200
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 3.8676092624664307,
+      "learning_rate": 2.1867007672634273e-05,
+      "loss": 0.3853,
+      "step": 4210
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 2.257540702819824,
+      "learning_rate": 2.144075021312873e-05,
+      "loss": 0.4,
+      "step": 4220
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 4.077911853790283,
+      "learning_rate": 2.101449275362319e-05,
+      "loss": 0.3717,
+      "step": 4230
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 3.7997450828552246,
+      "learning_rate": 2.058823529411765e-05,
+      "loss": 0.3895,
+      "step": 4240
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 2.1893258094787598,
+      "learning_rate": 2.0161977834612105e-05,
+      "loss": 0.3243,
+      "step": 4250
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 2.298306941986084,
+      "learning_rate": 1.9735720375106565e-05,
+      "loss": 0.3166,
+      "step": 4260
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 2.121025562286377,
+      "learning_rate": 1.9309462915601025e-05,
+      "loss": 0.2771,
+      "step": 4270
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 1.8228780031204224,
+      "learning_rate": 1.888320545609548e-05,
+      "loss": 0.2858,
+      "step": 4280
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 4.4333977699279785,
+      "learning_rate": 1.845694799658994e-05,
+      "loss": 0.3386,
+      "step": 4290
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 3.167982578277588,
+      "learning_rate": 1.80306905370844e-05,
+      "loss": 0.3064,
+      "step": 4300
+    },
+    {
+      "epoch": 3.67,
+      "eval_accuracy": 0.9497282608695652,
+      "eval_loss": 0.23243670165538788,
+      "eval_runtime": 22.1685,
+      "eval_samples_per_second": 33.2,
+      "eval_steps_per_second": 4.15,
+      "step": 4300
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 3.177164316177368,
+      "learning_rate": 1.7604433077578857e-05,
+      "loss": 0.2947,
+      "step": 4310
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 4.344371318817139,
+      "learning_rate": 1.7178175618073317e-05,
+      "loss": 0.3845,
+      "step": 4320
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 2.364387035369873,
+      "learning_rate": 1.6751918158567777e-05,
+      "loss": 0.3276,
+      "step": 4330
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 4.086526870727539,
+      "learning_rate": 1.6325660699062233e-05,
+      "loss": 0.3708,
+      "step": 4340
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 4.65876579284668,
+      "learning_rate": 1.5899403239556693e-05,
+      "loss": 0.3258,
+      "step": 4350
+    },
+    {
+      "epoch": 3.72,
+      "grad_norm": 4.176472187042236,
+      "learning_rate": 1.5473145780051153e-05,
+      "loss": 0.2992,
+      "step": 4360
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 3.989961624145508,
+      "learning_rate": 1.504688832054561e-05,
+      "loss": 0.3359,
+      "step": 4370
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 2.337566614151001,
+      "learning_rate": 1.462063086104007e-05,
+      "loss": 0.3066,
+      "step": 4380
+    },
+    {
+      "epoch": 3.74,
+      "grad_norm": 2.9135842323303223,
+      "learning_rate": 1.4194373401534527e-05,
+      "loss": 0.2812,
+      "step": 4390
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 4.203680038452148,
+      "learning_rate": 1.3768115942028985e-05,
+      "loss": 0.3059,
+      "step": 4400
+    },
+    {
+      "epoch": 3.75,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.2321375608444214,
+      "eval_runtime": 21.9466,
+      "eval_samples_per_second": 33.536,
+      "eval_steps_per_second": 4.192,
+      "step": 4400
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 4.143373489379883,
+      "learning_rate": 1.3341858482523445e-05,
+      "loss": 0.354,
+      "step": 4410
+    },
+    {
+      "epoch": 3.77,
+      "grad_norm": 2.7329490184783936,
+      "learning_rate": 1.2915601023017903e-05,
+      "loss": 0.3319,
+      "step": 4420
+    },
+    {
+      "epoch": 3.78,
+      "grad_norm": 3.1333062648773193,
+      "learning_rate": 1.2489343563512362e-05,
+      "loss": 0.3421,
+      "step": 4430
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 3.976710319519043,
+      "learning_rate": 1.2063086104006821e-05,
+      "loss": 0.3263,
+      "step": 4440
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 1.6640021800994873,
+      "learning_rate": 1.163682864450128e-05,
+      "loss": 0.3224,
+      "step": 4450
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 2.7301018238067627,
+      "learning_rate": 1.121057118499574e-05,
+      "loss": 0.3694,
+      "step": 4460
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 2.9358885288238525,
+      "learning_rate": 1.0784313725490197e-05,
+      "loss": 0.3103,
+      "step": 4470
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 4.6855387687683105,
+      "learning_rate": 1.0400682011935209e-05,
+      "loss": 0.3927,
+      "step": 4480
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 2.2591495513916016,
+      "learning_rate": 9.974424552429668e-06,
+      "loss": 0.4017,
+      "step": 4490
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 2.375493049621582,
+      "learning_rate": 9.548167092924126e-06,
+      "loss": 0.42,
+      "step": 4500
+    },
+    {
+      "epoch": 3.84,
+      "eval_accuracy": 0.9402173913043478,
+      "eval_loss": 0.25556182861328125,
+      "eval_runtime": 22.225,
+      "eval_samples_per_second": 33.116,
+      "eval_steps_per_second": 4.139,
+      "step": 4500
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 2.5054142475128174,
+      "learning_rate": 9.121909633418585e-06,
+      "loss": 0.3716,
+      "step": 4510
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 3.321662425994873,
+      "learning_rate": 8.695652173913044e-06,
+      "loss": 0.3185,
+      "step": 4520
+    },
+    {
+      "epoch": 3.86,
+      "grad_norm": 2.8269574642181396,
+      "learning_rate": 8.269394714407503e-06,
+      "loss": 0.2985,
+      "step": 4530
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 3.426715612411499,
+      "learning_rate": 7.84313725490196e-06,
+      "loss": 0.3068,
+      "step": 4540
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 6.179994583129883,
+      "learning_rate": 7.41687979539642e-06,
+      "loss": 0.411,
+      "step": 4550
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 3.369870662689209,
+      "learning_rate": 6.990622335890879e-06,
+      "loss": 0.3128,
+      "step": 4560
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 4.271053791046143,
+      "learning_rate": 6.564364876385337e-06,
+      "loss": 0.3924,
+      "step": 4570
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 3.3079607486724854,
+      "learning_rate": 6.138107416879796e-06,
+      "loss": 0.34,
+      "step": 4580
+    },
+    {
+      "epoch": 3.91,
+      "grad_norm": 6.019475936889648,
+      "learning_rate": 5.711849957374255e-06,
+      "loss": 0.3955,
+      "step": 4590
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 3.133575916290283,
+      "learning_rate": 5.285592497868714e-06,
+      "loss": 0.2959,
+      "step": 4600
+    },
+    {
+      "epoch": 3.92,
+      "eval_accuracy": 0.9415760869565217,
+      "eval_loss": 0.24908488988876343,
+      "eval_runtime": 22.0463,
+      "eval_samples_per_second": 33.384,
+      "eval_steps_per_second": 4.173,
+      "step": 4600
+    },
+    {
+      "epoch": 3.93,
+      "grad_norm": 4.321446895599365,
+      "learning_rate": 4.859335038363171e-06,
+      "loss": 0.4081,
+      "step": 4610
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 4.398526191711426,
+      "learning_rate": 4.43307757885763e-06,
+      "loss": 0.289,
+      "step": 4620
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 3.48781681060791,
+      "learning_rate": 4.006820119352089e-06,
+      "loss": 0.311,
+      "step": 4630
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 3.4008705615997314,
+      "learning_rate": 3.5805626598465474e-06,
+      "loss": 0.3046,
+      "step": 4640
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 3.372878074645996,
+      "learning_rate": 3.154305200341006e-06,
+      "loss": 0.4128,
+      "step": 4650
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 4.755558967590332,
+      "learning_rate": 2.728047740835465e-06,
+      "loss": 0.3619,
+      "step": 4660
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 2.060793161392212,
+      "learning_rate": 2.3017902813299235e-06,
+      "loss": 0.2876,
+      "step": 4670
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 4.377244472503662,
+      "learning_rate": 1.875532821824382e-06,
+      "loss": 0.3278,
+      "step": 4680
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 3.747706651687622,
+      "learning_rate": 1.4492753623188406e-06,
+      "loss": 0.3186,
+      "step": 4690
+    },
+    {
+      "epoch": 4.0,
+      "step": 4692,
+      "total_flos": 1.870424802038661e+18,
+      "train_loss": 0.5094418521630693,
+      "train_runtime": 3760.9881,
+      "train_samples_per_second": 19.953,
+      "train_steps_per_second": 1.248
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4692,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 100,
+  "total_flos": 1.870424802038661e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}