Omriy123 committed
Commit 01c187c
1 Parent(s): 3c59e2a

🍻 cheers
README.md CHANGED
@@ -1,5 +1,8 @@
 ---
+license: apache-2.0
+base_model: google/vit-base-patch16-224-in21k
 tags:
+- image-classification
 - generated_from_trainer
 datasets:
 - imagefolder
@@ -12,7 +15,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: imagefolder
+      name: Dogs_vs_Cats
       type: imagefolder
       config: default
       split: train
@@ -28,7 +31,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # vit_epochs5_batch64_lr0.001_size224_tiles1_seed1_vit_old_transform_old_hp
 
-This model was trained from scratch on the imagefolder dataset.
+This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the Dogs_vs_Cats dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5220
 - Accuracy: 0.7539
all_results.json ADDED
@@ -0,0 +1,13 @@
+{
+    "epoch": 5.0,
+    "eval_accuracy": 0.7538666666666667,
+    "eval_loss": 0.5220404863357544,
+    "eval_runtime": 53.1687,
+    "eval_samples_per_second": 70.53,
+    "eval_steps_per_second": 1.11,
+    "total_flos": 5.8118992210944e+18,
+    "train_loss": 0.5645027552259729,
+    "train_runtime": 2886.1261,
+    "train_samples_per_second": 25.986,
+    "train_steps_per_second": 0.407
+}
eval_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 5.0,
+    "eval_accuracy": 0.7538666666666667,
+    "eval_loss": 0.5220404863357544,
+    "eval_runtime": 53.1687,
+    "eval_samples_per_second": 70.53,
+    "eval_steps_per_second": 1.11
+}
runs/May24_15-27-09_a15a230f540e/events.out.tfevents.1716567389.a15a230f540e.2637.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0874aec3c337a6286fc83cc31e8127d1f869e3eb633b8a42ddc571003217660
+size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 5.0,
+    "total_flos": 5.8118992210944e+18,
+    "train_loss": 0.5645027552259729,
+    "train_runtime": 2886.1261,
+    "train_samples_per_second": 25.986,
+    "train_steps_per_second": 0.407
+}
trainer_state.json ADDED
@@ -0,0 +1,1732 @@
+{
+  "best_metric": 0.5220404863357544,
+  "best_model_checkpoint": "vit_epochs5_batch64_lr0.001_size224_tiles1_seed1_vit_old_transform_old_hp/checkpoint-1175",
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 1175,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02127659574468085,
+      "grad_norm": 1.5236927270889282,
+      "learning_rate": 0.000995744680851064,
+      "loss": 0.8552,
+      "step": 5
+    },
+    {
+      "epoch": 0.0425531914893617,
+      "grad_norm": 0.1933981478214264,
+      "learning_rate": 0.0009914893617021276,
+      "loss": 0.6984,
+      "step": 10
+    },
+    {
+      "epoch": 0.06382978723404255,
+      "grad_norm": 0.2786834239959717,
+      "learning_rate": 0.0009872340425531915,
+      "loss": 0.684,
+      "step": 15
+    },
+    {
+      "epoch": 0.0851063829787234,
+      "grad_norm": 0.3437531888484955,
+      "learning_rate": 0.0009829787234042554,
+      "loss": 0.699,
+      "step": 20
+    },
+    {
+      "epoch": 0.10638297872340426,
+      "grad_norm": 0.17921054363250732,
+      "learning_rate": 0.0009787234042553192,
+      "loss": 0.6876,
+      "step": 25
+    },
+    {
+      "epoch": 0.1276595744680851,
+      "grad_norm": 0.2969794273376465,
+      "learning_rate": 0.0009744680851063829,
+      "loss": 0.7084,
+      "step": 30
+    },
+    {
+      "epoch": 0.14893617021276595,
+      "grad_norm": 0.2975955307483673,
+      "learning_rate": 0.0009702127659574468,
+      "loss": 0.6938,
+      "step": 35
+    },
+    {
+      "epoch": 0.1702127659574468,
+      "grad_norm": 0.049827929586172104,
+      "learning_rate": 0.0009659574468085106,
+      "loss": 0.6834,
+      "step": 40
+    },
+    {
+      "epoch": 0.19148936170212766,
+      "grad_norm": 0.6071491837501526,
+      "learning_rate": 0.0009617021276595745,
+      "loss": 0.6737,
+      "step": 45
+    },
+    {
+      "epoch": 0.2127659574468085,
+      "grad_norm": 0.1733636111021042,
+      "learning_rate": 0.0009574468085106384,
+      "loss": 0.6401,
+      "step": 50
+    },
+    {
+      "epoch": 0.23404255319148937,
+      "grad_norm": 0.6925361752510071,
+      "learning_rate": 0.0009531914893617022,
+      "loss": 0.6786,
+      "step": 55
+    },
+    {
+      "epoch": 0.2553191489361702,
+      "grad_norm": 1.0148730278015137,
+      "learning_rate": 0.000948936170212766,
+      "loss": 0.6925,
+      "step": 60
+    },
+    {
+      "epoch": 0.2765957446808511,
+      "grad_norm": 0.4391551911830902,
+      "learning_rate": 0.0009446808510638298,
+      "loss": 0.7001,
+      "step": 65
+    },
+    {
+      "epoch": 0.2978723404255319,
+      "grad_norm": 0.10365554690361023,
+      "learning_rate": 0.0009404255319148937,
+      "loss": 0.661,
+      "step": 70
+    },
+    {
+      "epoch": 0.3191489361702128,
+      "grad_norm": 0.5373475551605225,
+      "learning_rate": 0.0009361702127659575,
+      "loss": 0.6646,
+      "step": 75
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.26909396052360535,
+      "learning_rate": 0.0009319148936170214,
+      "loss": 0.6496,
+      "step": 80
+    },
+    {
+      "epoch": 0.3617021276595745,
+      "grad_norm": 0.7345396876335144,
+      "learning_rate": 0.0009276595744680851,
+      "loss": 0.6809,
+      "step": 85
+    },
+    {
+      "epoch": 0.3829787234042553,
+      "grad_norm": 0.17642471194267273,
+      "learning_rate": 0.0009234042553191489,
+      "loss": 0.6689,
+      "step": 90
+    },
+    {
+      "epoch": 0.40425531914893614,
+      "grad_norm": 0.24865615367889404,
+      "learning_rate": 0.0009191489361702128,
+      "loss": 0.6668,
+      "step": 95
+    },
+    {
+      "epoch": 0.425531914893617,
+      "grad_norm": 0.0725848600268364,
+      "learning_rate": 0.0009148936170212766,
+      "loss": 0.6955,
+      "step": 100
+    },
+    {
+      "epoch": 0.44680851063829785,
+      "grad_norm": 0.6779701113700867,
+      "learning_rate": 0.0009106382978723405,
+      "loss": 0.6643,
+      "step": 105
+    },
+    {
+      "epoch": 0.46808510638297873,
+      "grad_norm": 0.2594638466835022,
+      "learning_rate": 0.0009063829787234043,
+      "loss": 0.6774,
+      "step": 110
+    },
+    {
+      "epoch": 0.48936170212765956,
+      "grad_norm": 0.41974830627441406,
+      "learning_rate": 0.000902127659574468,
+      "loss": 0.6632,
+      "step": 115
+    },
+    {
+      "epoch": 0.5106382978723404,
+      "grad_norm": 0.2086678445339203,
+      "learning_rate": 0.0008978723404255319,
+      "loss": 0.6264,
+      "step": 120
+    },
+    {
+      "epoch": 0.5319148936170213,
+      "grad_norm": 0.45617616176605225,
+      "learning_rate": 0.0008936170212765957,
+      "loss": 0.6538,
+      "step": 125
+    },
+    {
+      "epoch": 0.5531914893617021,
+      "grad_norm": 0.32972219586372375,
+      "learning_rate": 0.0008893617021276596,
+      "loss": 0.6471,
+      "step": 130
+    },
+    {
+      "epoch": 0.574468085106383,
+      "grad_norm": 0.5587528347969055,
+      "learning_rate": 0.0008851063829787234,
+      "loss": 0.624,
+      "step": 135
+    },
+    {
+      "epoch": 0.5957446808510638,
+      "grad_norm": 0.5918276906013489,
+      "learning_rate": 0.0008808510638297873,
+      "loss": 0.6576,
+      "step": 140
+    },
+    {
+      "epoch": 0.6170212765957447,
+      "grad_norm": 0.35423263907432556,
+      "learning_rate": 0.0008765957446808511,
+      "loss": 0.6376,
+      "step": 145
+    },
+    {
+      "epoch": 0.6382978723404256,
+      "grad_norm": 0.49659672379493713,
+      "learning_rate": 0.0008723404255319149,
+      "loss": 0.6555,
+      "step": 150
+    },
+    {
+      "epoch": 0.6595744680851063,
+      "grad_norm": 0.26542067527770996,
+      "learning_rate": 0.0008680851063829788,
+      "loss": 0.6457,
+      "step": 155
+    },
+    {
+      "epoch": 0.6808510638297872,
+      "grad_norm": 0.5932815670967102,
+      "learning_rate": 0.0008638297872340426,
+      "loss": 0.6706,
+      "step": 160
+    },
+    {
+      "epoch": 0.7021276595744681,
+      "grad_norm": 0.18936298787593842,
+      "learning_rate": 0.0008595744680851064,
+      "loss": 0.6923,
+      "step": 165
+    },
+    {
+      "epoch": 0.723404255319149,
+      "grad_norm": 0.2216617614030838,
+      "learning_rate": 0.0008553191489361703,
+      "loss": 0.6805,
+      "step": 170
+    },
+    {
+      "epoch": 0.7446808510638298,
+      "grad_norm": 0.2572282552719116,
+      "learning_rate": 0.000851063829787234,
+      "loss": 0.6803,
+      "step": 175
+    },
+    {
+      "epoch": 0.7659574468085106,
+      "grad_norm": 0.2624934911727905,
+      "learning_rate": 0.0008468085106382979,
+      "loss": 0.6796,
+      "step": 180
+    },
+    {
+      "epoch": 0.7872340425531915,
+      "grad_norm": 0.3983383774757385,
+      "learning_rate": 0.0008425531914893617,
+      "loss": 0.652,
+      "step": 185
+    },
+    {
+      "epoch": 0.8085106382978723,
+      "grad_norm": 0.7851768136024475,
+      "learning_rate": 0.0008382978723404256,
+      "loss": 0.6972,
+      "step": 190
+    },
+    {
+      "epoch": 0.8297872340425532,
+      "grad_norm": 0.08407687395811081,
+      "learning_rate": 0.0008340425531914894,
+      "loss": 0.7127,
+      "step": 195
+    },
+    {
+      "epoch": 0.851063829787234,
+      "grad_norm": 0.2317022830247879,
+      "learning_rate": 0.0008297872340425531,
+      "loss": 0.6879,
+      "step": 200
+    },
+    {
+      "epoch": 0.8723404255319149,
+      "grad_norm": 0.10921870172023773,
+      "learning_rate": 0.000825531914893617,
+      "loss": 0.6909,
+      "step": 205
+    },
+    {
+      "epoch": 0.8936170212765957,
+      "grad_norm": 0.06697387248277664,
+      "learning_rate": 0.0008212765957446808,
+      "loss": 0.6858,
+      "step": 210
+    },
+    {
+      "epoch": 0.9148936170212766,
+      "grad_norm": 0.16396264731884003,
+      "learning_rate": 0.0008170212765957447,
+      "loss": 0.6836,
+      "step": 215
+    },
+    {
+      "epoch": 0.9361702127659575,
+      "grad_norm": 0.07334744930267334,
+      "learning_rate": 0.0008127659574468085,
+      "loss": 0.6835,
+      "step": 220
+    },
+    {
+      "epoch": 0.9574468085106383,
+      "grad_norm": 0.28075695037841797,
+      "learning_rate": 0.0008085106382978723,
+      "loss": 0.6616,
+      "step": 225
+    },
+    {
+      "epoch": 0.9787234042553191,
+      "grad_norm": 0.32385650277137756,
+      "learning_rate": 0.0008042553191489363,
+      "loss": 0.6763,
+      "step": 230
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6150110960006714,
+      "learning_rate": 0.0008,
+      "loss": 0.6668,
+      "step": 235
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.5725333333333333,
+      "eval_loss": 0.6652818918228149,
+      "eval_runtime": 52.8415,
+      "eval_samples_per_second": 70.967,
+      "eval_steps_per_second": 1.117,
+      "step": 235
+    },
+    {
+      "epoch": 1.0212765957446808,
+      "grad_norm": 0.36133354902267456,
+      "learning_rate": 0.0007957446808510639,
+      "loss": 0.6505,
+      "step": 240
+    },
+    {
+      "epoch": 1.0425531914893618,
+      "grad_norm": 0.2631653845310211,
+      "learning_rate": 0.0007914893617021277,
+      "loss": 0.6666,
+      "step": 245
+    },
+    {
+      "epoch": 1.0638297872340425,
+      "grad_norm": 0.40402382612228394,
+      "learning_rate": 0.0007872340425531915,
+      "loss": 0.6406,
+      "step": 250
+    },
+    {
+      "epoch": 1.0851063829787233,
+      "grad_norm": 0.22335675358772278,
+      "learning_rate": 0.0007829787234042554,
+      "loss": 0.6584,
+      "step": 255
+    },
+    {
+      "epoch": 1.1063829787234043,
+      "grad_norm": 0.38019102811813354,
+      "learning_rate": 0.0007787234042553192,
+      "loss": 0.6773,
+      "step": 260
+    },
+    {
+      "epoch": 1.127659574468085,
+      "grad_norm": 0.6945547461509705,
+      "learning_rate": 0.000774468085106383,
+      "loss": 0.66,
+      "step": 265
+    },
+    {
+      "epoch": 1.148936170212766,
+      "grad_norm": 0.2084246724843979,
+      "learning_rate": 0.0007702127659574468,
+      "loss": 0.6512,
+      "step": 270
+    },
+    {
+      "epoch": 1.1702127659574468,
+      "grad_norm": 0.1295584738254547,
+      "learning_rate": 0.0007659574468085106,
+      "loss": 0.6521,
+      "step": 275
+    },
+    {
+      "epoch": 1.1914893617021276,
+      "grad_norm": 0.12610581517219543,
+      "learning_rate": 0.0007617021276595745,
+      "loss": 0.6281,
+      "step": 280
+    },
+    {
+      "epoch": 1.2127659574468086,
+      "grad_norm": 0.5777516961097717,
+      "learning_rate": 0.0007574468085106383,
+      "loss": 0.6315,
+      "step": 285
+    },
+    {
+      "epoch": 1.2340425531914894,
+      "grad_norm": 0.4698016047477722,
+      "learning_rate": 0.0007531914893617022,
+      "loss": 0.6736,
+      "step": 290
+    },
+    {
+      "epoch": 1.2553191489361701,
+      "grad_norm": 0.306220680475235,
+      "learning_rate": 0.0007489361702127659,
+      "loss": 0.6616,
+      "step": 295
+    },
+    {
+      "epoch": 1.2765957446808511,
+      "grad_norm": 0.1651347577571869,
+      "learning_rate": 0.0007446808510638298,
+      "loss": 0.6624,
+      "step": 300
+    },
+    {
+      "epoch": 1.297872340425532,
+      "grad_norm": 0.1671248823404312,
+      "learning_rate": 0.0007404255319148936,
+      "loss": 0.6537,
+      "step": 305
+    },
+    {
+      "epoch": 1.3191489361702127,
+      "grad_norm": 0.5579215288162231,
+      "learning_rate": 0.0007361702127659574,
+      "loss": 0.6547,
+      "step": 310
+    },
+    {
+      "epoch": 1.3404255319148937,
+      "grad_norm": 0.20245681703090668,
+      "learning_rate": 0.0007319148936170213,
+      "loss": 0.6477,
+      "step": 315
+    },
+    {
+      "epoch": 1.3617021276595744,
+      "grad_norm": 0.1913478672504425,
+      "learning_rate": 0.0007276595744680852,
+      "loss": 0.6311,
+      "step": 320
+    },
+    {
+      "epoch": 1.3829787234042552,
+      "grad_norm": 0.4945693016052246,
+      "learning_rate": 0.000723404255319149,
+      "loss": 0.5979,
+      "step": 325
+    },
+    {
+      "epoch": 1.4042553191489362,
+      "grad_norm": 0.1921028196811676,
+      "learning_rate": 0.0007191489361702128,
+      "loss": 0.7027,
+      "step": 330
+    },
+    {
+      "epoch": 1.425531914893617,
+      "grad_norm": 0.26029083132743835,
+      "learning_rate": 0.0007148936170212766,
+      "loss": 0.6733,
+      "step": 335
+    },
+    {
+      "epoch": 1.4468085106382977,
+      "grad_norm": 0.3045407831668854,
+      "learning_rate": 0.0007106382978723405,
+      "loss": 0.6619,
+      "step": 340
+    },
+    {
+      "epoch": 1.4680851063829787,
+      "grad_norm": 0.12488707154989243,
+      "learning_rate": 0.0007063829787234043,
+      "loss": 0.666,
+      "step": 345
+    },
+    {
+      "epoch": 1.4893617021276595,
+      "grad_norm": 0.15467241406440735,
+      "learning_rate": 0.0007021276595744682,
+      "loss": 0.634,
+      "step": 350
+    },
+    {
+      "epoch": 1.5106382978723403,
+      "grad_norm": 0.23499886691570282,
+      "learning_rate": 0.0006978723404255319,
+      "loss": 0.6257,
+      "step": 355
+    },
+    {
+      "epoch": 1.5319148936170213,
+      "grad_norm": 0.48748576641082764,
+      "learning_rate": 0.0006936170212765957,
+      "loss": 0.6369,
+      "step": 360
+    },
+    {
+      "epoch": 1.5531914893617023,
+      "grad_norm": 0.3014831244945526,
+      "learning_rate": 0.0006893617021276596,
+      "loss": 0.6274,
+      "step": 365
+    },
+    {
+      "epoch": 1.574468085106383,
+      "grad_norm": 0.12689495086669922,
+      "learning_rate": 0.0006851063829787234,
+      "loss": 0.6427,
+      "step": 370
+    },
+    {
+      "epoch": 1.5957446808510638,
+      "grad_norm": 0.3490160405635834,
+      "learning_rate": 0.0006808510638297873,
+      "loss": 0.6885,
+      "step": 375
+    },
+    {
+      "epoch": 1.6170212765957448,
+      "grad_norm": 0.2676607370376587,
+      "learning_rate": 0.000676595744680851,
+      "loss": 0.6436,
+      "step": 380
+    },
+    {
+      "epoch": 1.6382978723404256,
+      "grad_norm": 0.26951488852500916,
+      "learning_rate": 0.0006723404255319148,
+      "loss": 0.6387,
+      "step": 385
+    },
+    {
+      "epoch": 1.6595744680851063,
+      "grad_norm": 0.3769073784351349,
+      "learning_rate": 0.0006680851063829787,
+      "loss": 0.6003,
+      "step": 390
+    },
+    {
+      "epoch": 1.6808510638297873,
+      "grad_norm": 0.43915122747421265,
+      "learning_rate": 0.0006638297872340425,
+      "loss": 0.6477,
+      "step": 395
+    },
+    {
+      "epoch": 1.702127659574468,
+      "grad_norm": 0.2419726401567459,
+      "learning_rate": 0.0006595744680851064,
+      "loss": 0.6174,
+      "step": 400
+    },
+    {
+      "epoch": 1.7234042553191489,
+      "grad_norm": 0.5210821628570557,
+      "learning_rate": 0.0006553191489361702,
+      "loss": 0.625,
+      "step": 405
+    },
+    {
+      "epoch": 1.7446808510638299,
+      "grad_norm": 0.5546556115150452,
+      "learning_rate": 0.0006510638297872342,
+      "loss": 0.604,
+      "step": 410
+    },
+    {
+      "epoch": 1.7659574468085106,
+      "grad_norm": 0.5459072589874268,
+      "learning_rate": 0.0006468085106382979,
+      "loss": 0.6322,
+      "step": 415
+    },
+    {
+      "epoch": 1.7872340425531914,
+      "grad_norm": 0.28615137934684753,
+      "learning_rate": 0.0006425531914893617,
+      "loss": 0.6288,
+      "step": 420
+    },
+    {
+      "epoch": 1.8085106382978724,
+      "grad_norm": 0.25826430320739746,
+      "learning_rate": 0.0006382978723404256,
+      "loss": 0.6377,
+      "step": 425
+    },
+    {
+      "epoch": 1.8297872340425532,
+      "grad_norm": 0.27113598585128784,
+      "learning_rate": 0.0006340425531914894,
+      "loss": 0.6155,
+      "step": 430
+    },
+    {
+      "epoch": 1.851063829787234,
+      "grad_norm": 0.3145448565483093,
+      "learning_rate": 0.0006297872340425533,
+      "loss": 0.6258,
+      "step": 435
+    },
+    {
+      "epoch": 1.872340425531915,
+      "grad_norm": 0.221902996301651,
+      "learning_rate": 0.000625531914893617,
+      "loss": 0.6133,
+      "step": 440
+    },
+    {
+      "epoch": 1.8936170212765957,
+      "grad_norm": 0.2308581918478012,
+      "learning_rate": 0.0006212765957446808,
+      "loss": 0.5883,
+      "step": 445
+    },
+    {
+      "epoch": 1.9148936170212765,
+      "grad_norm": 0.2169838696718216,
+      "learning_rate": 0.0006170212765957447,
+      "loss": 0.6219,
+      "step": 450
+    },
+    {
+      "epoch": 1.9361702127659575,
+      "grad_norm": 0.32386860251426697,
+      "learning_rate": 0.0006127659574468085,
+      "loss": 0.6102,
+      "step": 455
+    },
+    {
+      "epoch": 1.9574468085106385,
+      "grad_norm": 0.13700896501541138,
+      "learning_rate": 0.0006085106382978724,
+      "loss": 0.6436,
+      "step": 460
+    },
+    {
+      "epoch": 1.978723404255319,
+      "grad_norm": 0.18552586436271667,
+      "learning_rate": 0.0006042553191489362,
+      "loss": 0.6524,
+      "step": 465
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.5744425058364868,
+      "learning_rate": 0.0006,
+      "loss": 0.6527,
+      "step": 470
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.6528,
+      "eval_loss": 0.6233171224594116,
+      "eval_runtime": 52.0761,
+      "eval_samples_per_second": 72.01,
+      "eval_steps_per_second": 1.133,
+      "step": 470
+    },
+    {
+      "epoch": 2.021276595744681,
+      "grad_norm": 0.39053860306739807,
+      "learning_rate": 0.0005957446808510638,
+      "loss": 0.5832,
+      "step": 475
+    },
+    {
+      "epoch": 2.0425531914893615,
+      "grad_norm": 0.2939192056655884,
+      "learning_rate": 0.0005914893617021276,
+      "loss": 0.5808,
+      "step": 480
+    },
+    {
+      "epoch": 2.0638297872340425,
+      "grad_norm": 0.5998929142951965,
+      "learning_rate": 0.0005872340425531915,
+      "loss": 0.6119,
+      "step": 485
+    },
+    {
+      "epoch": 2.0851063829787235,
+      "grad_norm": 0.48165130615234375,
+      "learning_rate": 0.0005829787234042553,
+      "loss": 0.5868,
+      "step": 490
+    },
+    {
+      "epoch": 2.106382978723404,
+      "grad_norm": 0.2857578694820404,
+      "learning_rate": 0.0005787234042553191,
+      "loss": 0.5843,
+      "step": 495
+    },
+    {
+      "epoch": 2.127659574468085,
+      "grad_norm": 0.28461429476737976,
+      "learning_rate": 0.0005744680851063831,
+      "loss": 0.5843,
+      "step": 500
+    },
+    {
+      "epoch": 2.148936170212766,
+      "grad_norm": 0.30877211689949036,
+      "learning_rate": 0.0005702127659574468,
+      "loss": 0.5652,
+      "step": 505
+    },
+    {
+      "epoch": 2.1702127659574466,
+      "grad_norm": 0.7491441369056702,
+      "learning_rate": 0.0005659574468085107,
+      "loss": 0.5687,
+      "step": 510
+    },
+    {
+      "epoch": 2.1914893617021276,
+      "grad_norm": 0.29466772079467773,
+      "learning_rate": 0.0005617021276595745,
+      "loss": 0.6339,
+      "step": 515
+    },
+    {
+      "epoch": 2.2127659574468086,
+      "grad_norm": 0.44021138548851013,
+      "learning_rate": 0.0005574468085106383,
+      "loss": 0.5629,
+      "step": 520
+    },
+    {
+      "epoch": 2.2340425531914896,
+      "grad_norm": 0.19135086238384247,
+      "learning_rate": 0.0005531914893617022,
+      "loss": 0.6169,
+      "step": 525
+    },
+    {
+      "epoch": 2.25531914893617,
+      "grad_norm": 0.6730530858039856,
+      "learning_rate": 0.000548936170212766,
+      "loss": 0.6063,
+      "step": 530
+    },
+    {
+      "epoch": 2.276595744680851,
+      "grad_norm": 0.4451698362827301,
+      "learning_rate": 0.0005446808510638298,
+      "loss": 0.614,
+      "step": 535
+    },
+    {
+      "epoch": 2.297872340425532,
+      "grad_norm": 0.19956566393375397,
+      "learning_rate": 0.0005404255319148936,
+      "loss": 0.5848,
+      "step": 540
+    },
+    {
+      "epoch": 2.3191489361702127,
+      "grad_norm": 0.3573627471923828,
+      "learning_rate": 0.0005361702127659575,
+      "loss": 0.5963,
+      "step": 545
+    },
+    {
+      "epoch": 2.3404255319148937,
+      "grad_norm": 0.22617582976818085,
+      "learning_rate": 0.0005319148936170213,
+      "loss": 0.5512,
+      "step": 550
+    },
+    {
+      "epoch": 2.3617021276595747,
+      "grad_norm": 0.2276870310306549,
+      "learning_rate": 0.0005276595744680851,
+      "loss": 0.5801,
+      "step": 555
+    },
+    {
+      "epoch": 2.382978723404255,
+      "grad_norm": 0.3912278413772583,
+      "learning_rate": 0.000523404255319149,
+      "loss": 0.6101,
+      "step": 560
+    },
+    {
+      "epoch": 2.404255319148936,
+      "grad_norm": 0.20038598775863647,
+      "learning_rate": 0.0005191489361702127,
+      "loss": 0.5842,
+      "step": 565
+    },
+    {
+      "epoch": 2.425531914893617,
+      "grad_norm": 0.27847474813461304,
+      "learning_rate": 0.0005148936170212766,
+      "loss": 0.5597,
+      "step": 570
+    },
+    {
+      "epoch": 2.4468085106382977,
+      "grad_norm": 0.49357470870018005,
+      "learning_rate": 0.0005106382978723404,
+      "loss": 0.5374,
+      "step": 575
+    },
+    {
+      "epoch": 2.4680851063829787,
+      "grad_norm": 0.22584182024002075,
+      "learning_rate": 0.0005063829787234042,
+      "loss": 0.6416,
+      "step": 580
+    },
+    {
+      "epoch": 2.4893617021276597,
+      "grad_norm": 0.4970340430736542,
+      "learning_rate": 0.0005021276595744681,
+      "loss": 0.6101,
+      "step": 585
+    },
+    {
+      "epoch": 2.5106382978723403,
+      "grad_norm": 0.23562884330749512,
+      "learning_rate": 0.000497872340425532,
+      "loss": 0.5728,
+      "step": 590
+    },
+    {
+      "epoch": 2.5319148936170213,
+      "grad_norm": 0.2772935926914215,
+      "learning_rate": 0.0004936170212765957,
+      "loss": 0.5969,
+      "step": 595
+    },
+    {
+      "epoch": 2.5531914893617023,
+      "grad_norm": 0.466553658246994,
+      "learning_rate": 0.0004893617021276596,
+      "loss": 0.5722,
+      "step": 600
+    },
+    {
+      "epoch": 2.574468085106383,
+      "grad_norm": 0.1931866854429245,
+      "learning_rate": 0.0004851063829787234,
+      "loss": 0.5947,
+      "step": 605
+    },
+    {
+      "epoch": 2.595744680851064,
+      "grad_norm": 0.3345823884010315,
+      "learning_rate": 0.00048085106382978723,
+      "loss": 0.5464,
+      "step": 610
+    },
+    {
+      "epoch": 2.617021276595745,
+      "grad_norm": 0.8605038523674011,
+      "learning_rate": 0.0004765957446808511,
+      "loss": 0.616,
+      "step": 615
+    },
+    {
+      "epoch": 2.6382978723404253,
+      "grad_norm": 0.467629611492157,
+      "learning_rate": 0.0004723404255319149,
+      "loss": 0.5997,
+      "step": 620
+    },
+    {
+      "epoch": 2.6595744680851063,
+      "grad_norm": 0.30429497361183167,
+      "learning_rate": 0.00046808510638297874,
+      "loss": 0.5498,
+      "step": 625
+    },
+    {
+      "epoch": 2.6808510638297873,
+      "grad_norm": 0.2898688316345215,
+      "learning_rate": 0.00046382978723404257,
+      "loss": 0.5526,
+      "step": 630
+    },
+    {
+      "epoch": 2.702127659574468,
+      "grad_norm": 0.24966174364089966,
+      "learning_rate": 0.0004595744680851064,
+      "loss": 0.568,
+      "step": 635
+    },
+    {
+      "epoch": 2.723404255319149,
+      "grad_norm": 0.31960707902908325,
+      "learning_rate": 0.00045531914893617024,
+      "loss": 0.5573,
+      "step": 640
+    },
+    {
+      "epoch": 2.74468085106383,
+      "grad_norm": 0.17629045248031616,
+      "learning_rate": 0.000451063829787234,
+      "loss": 0.5793,
+      "step": 645
+    },
+    {
+      "epoch": 2.7659574468085104,
+      "grad_norm": 0.3344897925853729,
+      "learning_rate": 0.00044680851063829785,
+      "loss": 0.5782,
+      "step": 650
+    },
+    {
+      "epoch": 2.7872340425531914,
+      "grad_norm": 0.6426132917404175,
+      "learning_rate": 0.0004425531914893617,
+      "loss": 0.6065,
+      "step": 655
+    },
+    {
+      "epoch": 2.8085106382978724,
+      "grad_norm": 0.4149859547615051,
+      "learning_rate": 0.00043829787234042557,
+      "loss": 0.6095,
+      "step": 660
+    },
+    {
+      "epoch": 2.829787234042553,
+      "grad_norm": 0.2638397812843323,
+      "learning_rate": 0.0004340425531914894,
+      "loss": 0.5651,
+      "step": 665
+    },
+    {
+      "epoch": 2.851063829787234,
+      "grad_norm": 0.47826263308525085,
+      "learning_rate": 0.0004297872340425532,
+      "loss": 0.6366,
+      "step": 670
+    },
+    {
+      "epoch": 2.872340425531915,
+      "grad_norm": 0.47488388419151306,
+      "learning_rate": 0.000425531914893617,
+      "loss": 0.5498,
+      "step": 675
+    },
+    {
+      "epoch": 2.8936170212765955,
+      "grad_norm": 0.29856908321380615,
+      "learning_rate": 0.00042127659574468085,
+      "loss": 0.5576,
+      "step": 680
+    },
+    {
+      "epoch": 2.9148936170212765,
+      "grad_norm": 0.3228590488433838,
+      "learning_rate": 0.0004170212765957447,
+      "loss": 0.5527,
+      "step": 685
+    },
+    {
+      "epoch": 2.9361702127659575,
+      "grad_norm": 0.28109100461006165,
+      "learning_rate": 0.0004127659574468085,
+      "loss": 0.5421,
+      "step": 690
+    },
+    {
+      "epoch": 2.9574468085106385,
+      "grad_norm": 0.43624716997146606,
+      "learning_rate": 0.00040851063829787235,
+      "loss": 0.5419,
+      "step": 695
+    },
+    {
+      "epoch": 2.978723404255319,
+      "grad_norm": 0.33003830909729004,
+      "learning_rate": 0.00040425531914893613,
+      "loss": 0.5614,
+      "step": 700
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.8071190118789673,
+      "learning_rate": 0.0004,
+      "loss": 0.5628,
+      "step": 705
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.7048,
+      "eval_loss": 0.5658010244369507,
+      "eval_runtime": 52.264,
+      "eval_samples_per_second": 71.751,
+      "eval_steps_per_second": 1.129,
+      "step": 705
+    },
+    {
+      "epoch": 3.021276595744681,
+      "grad_norm": 0.34832167625427246,
+      "learning_rate": 0.00039574468085106385,
+      "loss": 0.5412,
+      "step": 710
+    },
+    {
+      "epoch": 3.0425531914893615,
+      "grad_norm": 0.3105883002281189,
+      "learning_rate": 0.0003914893617021277,
+      "loss": 0.5428,
+      "step": 715
+    },
+    {
+      "epoch": 3.0638297872340425,
+      "grad_norm": 0.48978525400161743,
+      "learning_rate": 0.0003872340425531915,
+      "loss": 0.5074,
+      "step": 720
+    },
+    {
+      "epoch": 3.0851063829787235,
+      "grad_norm": 0.3323807120323181,
+      "learning_rate": 0.0003829787234042553,
+      "loss": 0.5388,
+      "step": 725
+    },
+    {
+      "epoch": 3.106382978723404,
+      "grad_norm": 0.23931725323200226,
+      "learning_rate": 0.00037872340425531913,
+      "loss": 0.5329,
+      "step": 730
+    },
+    {
+      "epoch": 3.127659574468085,
+      "grad_norm": 0.4094422459602356,
+      "learning_rate": 0.00037446808510638297,
+      "loss": 0.5256,
+      "step": 735
+    },
+    {
+      "epoch": 3.148936170212766,
+      "grad_norm": 0.2427910566329956,
+      "learning_rate": 0.0003702127659574468,
+      "loss": 0.4994,
+      "step": 740
+    },
+    {
+      "epoch": 3.1702127659574466,
+      "grad_norm": 0.46753978729248047,
+      "learning_rate": 0.00036595744680851063,
+      "loss": 0.5841,
+      "step": 745
+    },
+    {
+      "epoch": 3.1914893617021276,
+      "grad_norm": 0.60309898853302,
+      "learning_rate": 0.0003617021276595745,
+      "loss": 0.5018,
+      "step": 750
+    },
+    {
+      "epoch": 3.2127659574468086,
+      "grad_norm": 0.32367798686027527,
+      "learning_rate": 0.0003574468085106383,
+      "loss": 0.5112,
+      "step": 755
+    },
+    {
+      "epoch": 3.2340425531914896,
+ "grad_norm": 0.31850096583366394,
1098
+ "learning_rate": 0.00035319148936170213,
1099
+ "loss": 0.5197,
1100
+ "step": 760
1101
+ },
1102
+ {
1103
+ "epoch": 3.25531914893617,
1104
+ "grad_norm": 0.40993842482566833,
1105
+ "learning_rate": 0.00034893617021276597,
1106
+ "loss": 0.491,
1107
+ "step": 765
1108
+ },
1109
+ {
1110
+ "epoch": 3.276595744680851,
1111
+ "grad_norm": 0.31502920389175415,
1112
+ "learning_rate": 0.0003446808510638298,
1113
+ "loss": 0.5134,
1114
+ "step": 770
1115
+ },
1116
+ {
1117
+ "epoch": 3.297872340425532,
1118
+ "grad_norm": 0.34986236691474915,
1119
+ "learning_rate": 0.00034042553191489364,
1120
+ "loss": 0.5093,
1121
+ "step": 775
1122
+ },
1123
+ {
1124
+ "epoch": 3.3191489361702127,
1125
+ "grad_norm": 0.30818113684654236,
1126
+ "learning_rate": 0.0003361702127659574,
1127
+ "loss": 0.4668,
1128
+ "step": 780
1129
+ },
1130
+ {
1131
+ "epoch": 3.3404255319148937,
1132
+ "grad_norm": 0.45690372586250305,
1133
+ "learning_rate": 0.00033191489361702125,
1134
+ "loss": 0.4793,
1135
+ "step": 785
1136
+ },
1137
+ {
1138
+ "epoch": 3.3617021276595747,
1139
+ "grad_norm": 0.431671142578125,
1140
+ "learning_rate": 0.0003276595744680851,
1141
+ "loss": 0.5449,
1142
+ "step": 790
1143
+ },
1144
+ {
1145
+ "epoch": 3.382978723404255,
1146
+ "grad_norm": 0.6079233288764954,
1147
+ "learning_rate": 0.00032340425531914897,
1148
+ "loss": 0.5055,
1149
+ "step": 795
1150
+ },
1151
+ {
+ "epoch": 3.404255319148936,
+ "grad_norm": 0.25394123792648315,
+ "learning_rate": 0.0003191489361702128,
+ "loss": 0.5137,
+ "step": 800
+ },
+ {
+ "epoch": 3.425531914893617,
+ "grad_norm": 0.2768719494342804,
+ "learning_rate": 0.00031489361702127664,
+ "loss": 0.5378,
+ "step": 805
+ },
+ {
+ "epoch": 3.4468085106382977,
+ "grad_norm": 0.33412039279937744,
+ "learning_rate": 0.0003106382978723404,
+ "loss": 0.5529,
+ "step": 810
+ },
+ {
+ "epoch": 3.4680851063829787,
+ "grad_norm": 0.45218709111213684,
+ "learning_rate": 0.00030638297872340425,
+ "loss": 0.514,
+ "step": 815
+ },
+ {
+ "epoch": 3.4893617021276597,
+ "grad_norm": 0.29416921734809875,
+ "learning_rate": 0.0003021276595744681,
+ "loss": 0.471,
+ "step": 820
+ },
+ {
+ "epoch": 3.5106382978723403,
+ "grad_norm": 0.4108869433403015,
+ "learning_rate": 0.0002978723404255319,
+ "loss": 0.5222,
+ "step": 825
+ },
+ {
+ "epoch": 3.5319148936170213,
+ "grad_norm": 0.5049691200256348,
+ "learning_rate": 0.00029361702127659575,
+ "loss": 0.5103,
+ "step": 830
+ },
+ {
+ "epoch": 3.5531914893617023,
+ "grad_norm": 0.37521079182624817,
+ "learning_rate": 0.00028936170212765953,
+ "loss": 0.5088,
+ "step": 835
+ },
+ {
+ "epoch": 3.574468085106383,
+ "grad_norm": 0.6042494177818298,
+ "learning_rate": 0.0002851063829787234,
+ "loss": 0.4886,
+ "step": 840
+ },
+ {
+ "epoch": 3.595744680851064,
+ "grad_norm": 0.3379281163215637,
+ "learning_rate": 0.00028085106382978725,
+ "loss": 0.4878,
+ "step": 845
+ },
+ {
+ "epoch": 3.617021276595745,
+ "grad_norm": 0.42538291215896606,
+ "learning_rate": 0.0002765957446808511,
+ "loss": 0.5241,
+ "step": 850
+ },
+ {
+ "epoch": 3.6382978723404253,
+ "grad_norm": 0.34973302483558655,
+ "learning_rate": 0.0002723404255319149,
+ "loss": 0.497,
+ "step": 855
+ },
+ {
+ "epoch": 3.6595744680851063,
+ "grad_norm": 0.5937588214874268,
+ "learning_rate": 0.00026808510638297875,
+ "loss": 0.5004,
+ "step": 860
+ },
+ {
+ "epoch": 3.6808510638297873,
+ "grad_norm": 0.3566235601902008,
+ "learning_rate": 0.00026382978723404253,
+ "loss": 0.5192,
+ "step": 865
+ },
+ {
+ "epoch": 3.702127659574468,
+ "grad_norm": 0.7297813296318054,
+ "learning_rate": 0.00025957446808510637,
+ "loss": 0.5313,
+ "step": 870
+ },
+ {
+ "epoch": 3.723404255319149,
+ "grad_norm": 0.3060586452484131,
+ "learning_rate": 0.0002553191489361702,
+ "loss": 0.5057,
+ "step": 875
+ },
+ {
+ "epoch": 3.74468085106383,
+ "grad_norm": 0.3572905361652374,
+ "learning_rate": 0.00025106382978723403,
+ "loss": 0.5078,
+ "step": 880
+ },
+ {
+ "epoch": 3.7659574468085104,
+ "grad_norm": 0.5359181761741638,
+ "learning_rate": 0.00024680851063829787,
+ "loss": 0.4953,
+ "step": 885
+ },
+ {
+ "epoch": 3.7872340425531914,
+ "grad_norm": 0.676404595375061,
+ "learning_rate": 0.0002425531914893617,
+ "loss": 0.4878,
+ "step": 890
+ },
+ {
+ "epoch": 3.8085106382978724,
+ "grad_norm": 0.7736416459083557,
+ "learning_rate": 0.00023829787234042556,
+ "loss": 0.4897,
+ "step": 895
+ },
+ {
+ "epoch": 3.829787234042553,
+ "grad_norm": 0.6416388154029846,
+ "learning_rate": 0.00023404255319148937,
+ "loss": 0.5031,
+ "step": 900
+ },
+ {
+ "epoch": 3.851063829787234,
+ "grad_norm": 1.1011937856674194,
+ "learning_rate": 0.0002297872340425532,
+ "loss": 0.4563,
+ "step": 905
+ },
+ {
+ "epoch": 3.872340425531915,
+ "grad_norm": 0.4412100613117218,
+ "learning_rate": 0.000225531914893617,
+ "loss": 0.525,
+ "step": 910
+ },
+ {
+ "epoch": 3.8936170212765955,
+ "grad_norm": 0.6614885926246643,
+ "learning_rate": 0.00022127659574468084,
+ "loss": 0.5163,
+ "step": 915
+ },
+ {
+ "epoch": 3.9148936170212765,
+ "grad_norm": 0.38106369972229004,
+ "learning_rate": 0.0002170212765957447,
+ "loss": 0.5182,
+ "step": 920
+ },
+ {
+ "epoch": 3.9361702127659575,
+ "grad_norm": 0.44818058609962463,
+ "learning_rate": 0.0002127659574468085,
+ "loss": 0.4875,
+ "step": 925
+ },
+ {
+ "epoch": 3.9574468085106385,
+ "grad_norm": 0.3101024925708771,
+ "learning_rate": 0.00020851063829787234,
+ "loss": 0.5498,
+ "step": 930
+ },
+ {
+ "epoch": 3.978723404255319,
+ "grad_norm": 0.4035079777240753,
+ "learning_rate": 0.00020425531914893618,
+ "loss": 0.5139,
+ "step": 935
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.4626338481903076,
+ "learning_rate": 0.0002,
+ "loss": 0.4683,
+ "step": 940
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.7290666666666666,
+ "eval_loss": 0.5313977003097534,
+ "eval_runtime": 52.259,
+ "eval_samples_per_second": 71.758,
+ "eval_steps_per_second": 1.129,
+ "step": 940
+ },
+ {
+ "epoch": 4.0212765957446805,
+ "grad_norm": 0.3656058609485626,
+ "learning_rate": 0.00019574468085106384,
+ "loss": 0.4576,
+ "step": 945
+ },
+ {
+ "epoch": 4.042553191489362,
+ "grad_norm": 0.6442248225212097,
+ "learning_rate": 0.00019148936170212765,
+ "loss": 0.4869,
+ "step": 950
+ },
+ {
+ "epoch": 4.0638297872340425,
+ "grad_norm": 0.8725343942642212,
+ "learning_rate": 0.00018723404255319148,
+ "loss": 0.4081,
+ "step": 955
+ },
+ {
+ "epoch": 4.085106382978723,
+ "grad_norm": 0.5488789677619934,
+ "learning_rate": 0.00018297872340425532,
+ "loss": 0.3774,
+ "step": 960
+ },
+ {
+ "epoch": 4.1063829787234045,
+ "grad_norm": 0.45871075987815857,
+ "learning_rate": 0.00017872340425531915,
+ "loss": 0.3895,
+ "step": 965
+ },
+ {
+ "epoch": 4.127659574468085,
+ "grad_norm": 0.7183250784873962,
+ "learning_rate": 0.00017446808510638298,
+ "loss": 0.4216,
+ "step": 970
+ },
+ {
+ "epoch": 4.148936170212766,
+ "grad_norm": 0.43252503871917725,
+ "learning_rate": 0.00017021276595744682,
+ "loss": 0.4306,
+ "step": 975
+ },
+ {
+ "epoch": 4.170212765957447,
+ "grad_norm": 0.5714681148529053,
+ "learning_rate": 0.00016595744680851062,
+ "loss": 0.4607,
+ "step": 980
+ },
+ {
+ "epoch": 4.191489361702128,
+ "grad_norm": 0.5099291801452637,
+ "learning_rate": 0.00016170212765957449,
+ "loss": 0.372,
+ "step": 985
+ },
+ {
+ "epoch": 4.212765957446808,
+ "grad_norm": 0.5010551810264587,
+ "learning_rate": 0.00015744680851063832,
+ "loss": 0.4414,
+ "step": 990
+ },
+ {
+ "epoch": 4.23404255319149,
+ "grad_norm": 0.6585486531257629,
+ "learning_rate": 0.00015319148936170213,
+ "loss": 0.4191,
+ "step": 995
+ },
+ {
+ "epoch": 4.25531914893617,
+ "grad_norm": 0.5043871402740479,
+ "learning_rate": 0.00014893617021276596,
+ "loss": 0.4273,
+ "step": 1000
+ },
+ {
+ "epoch": 4.276595744680851,
+ "grad_norm": 0.4368508756160736,
+ "learning_rate": 0.00014468085106382977,
+ "loss": 0.4329,
+ "step": 1005
+ },
+ {
+ "epoch": 4.297872340425532,
+ "grad_norm": 0.5174155235290527,
+ "learning_rate": 0.00014042553191489363,
+ "loss": 0.4256,
+ "step": 1010
+ },
+ {
+ "epoch": 4.319148936170213,
+ "grad_norm": 0.7088821530342102,
+ "learning_rate": 0.00013617021276595746,
+ "loss": 0.4025,
+ "step": 1015
+ },
+ {
+ "epoch": 4.340425531914893,
+ "grad_norm": 0.41731932759284973,
+ "learning_rate": 0.00013191489361702127,
+ "loss": 0.4018,
+ "step": 1020
+ },
+ {
+ "epoch": 4.361702127659575,
+ "grad_norm": 0.47780218720436096,
+ "learning_rate": 0.0001276595744680851,
+ "loss": 0.4683,
+ "step": 1025
+ },
+ {
+ "epoch": 4.382978723404255,
+ "grad_norm": 0.49915027618408203,
+ "learning_rate": 0.00012340425531914893,
+ "loss": 0.4643,
+ "step": 1030
+ },
+ {
+ "epoch": 4.404255319148936,
+ "grad_norm": 0.5682059526443481,
+ "learning_rate": 0.00011914893617021278,
+ "loss": 0.4259,
+ "step": 1035
+ },
+ {
+ "epoch": 4.425531914893617,
+ "grad_norm": 0.36220914125442505,
+ "learning_rate": 0.0001148936170212766,
+ "loss": 0.4116,
+ "step": 1040
+ },
+ {
+ "epoch": 4.446808510638298,
+ "grad_norm": 0.5478158593177795,
+ "learning_rate": 0.00011063829787234042,
+ "loss": 0.4299,
+ "step": 1045
+ },
+ {
+ "epoch": 4.468085106382979,
+ "grad_norm": 0.5897641181945801,
+ "learning_rate": 0.00010638297872340425,
+ "loss": 0.3619,
+ "step": 1050
+ },
+ {
+ "epoch": 4.48936170212766,
+ "grad_norm": 1.084243893623352,
+ "learning_rate": 0.00010212765957446809,
+ "loss": 0.4211,
+ "step": 1055
+ },
+ {
+ "epoch": 4.51063829787234,
+ "grad_norm": 0.7980880737304688,
+ "learning_rate": 9.787234042553192e-05,
+ "loss": 0.4053,
+ "step": 1060
+ },
+ {
+ "epoch": 4.531914893617021,
+ "grad_norm": 0.9330500364303589,
+ "learning_rate": 9.361702127659574e-05,
+ "loss": 0.4183,
+ "step": 1065
+ },
+ {
+ "epoch": 4.553191489361702,
+ "grad_norm": 0.40023094415664673,
+ "learning_rate": 8.936170212765958e-05,
+ "loss": 0.4343,
+ "step": 1070
+ },
+ {
+ "epoch": 4.574468085106383,
+ "grad_norm": 0.6411470770835876,
+ "learning_rate": 8.510638297872341e-05,
+ "loss": 0.4096,
+ "step": 1075
+ },
+ {
+ "epoch": 4.595744680851064,
+ "grad_norm": 0.4613640308380127,
+ "learning_rate": 8.085106382978724e-05,
+ "loss": 0.4089,
+ "step": 1080
+ },
+ {
+ "epoch": 4.617021276595745,
+ "grad_norm": 0.5364215970039368,
+ "learning_rate": 7.659574468085106e-05,
+ "loss": 0.406,
+ "step": 1085
+ },
+ {
+ "epoch": 4.638297872340425,
+ "grad_norm": 0.7170926928520203,
+ "learning_rate": 7.234042553191488e-05,
+ "loss": 0.3827,
+ "step": 1090
+ },
+ {
+ "epoch": 4.659574468085106,
+ "grad_norm": 0.5427092909812927,
+ "learning_rate": 6.808510638297873e-05,
+ "loss": 0.4274,
+ "step": 1095
+ },
+ {
+ "epoch": 4.680851063829787,
+ "grad_norm": 0.44160687923431396,
+ "learning_rate": 6.382978723404255e-05,
+ "loss": 0.4182,
+ "step": 1100
+ },
+ {
+ "epoch": 4.702127659574468,
+ "grad_norm": 0.5841237902641296,
+ "learning_rate": 5.957446808510639e-05,
+ "loss": 0.4459,
+ "step": 1105
+ },
+ {
+ "epoch": 4.723404255319149,
+ "grad_norm": 0.6145776510238647,
+ "learning_rate": 5.531914893617021e-05,
+ "loss": 0.4415,
+ "step": 1110
+ },
+ {
+ "epoch": 4.74468085106383,
+ "grad_norm": 0.44807735085487366,
+ "learning_rate": 5.1063829787234044e-05,
+ "loss": 0.4248,
+ "step": 1115
+ },
+ {
+ "epoch": 4.76595744680851,
+ "grad_norm": 0.7016127109527588,
+ "learning_rate": 4.680851063829787e-05,
+ "loss": 0.3613,
+ "step": 1120
+ },
+ {
+ "epoch": 4.787234042553192,
+ "grad_norm": 0.5572742819786072,
+ "learning_rate": 4.2553191489361704e-05,
+ "loss": 0.4027,
+ "step": 1125
+ },
+ {
+ "epoch": 4.808510638297872,
+ "grad_norm": 0.5368435978889465,
+ "learning_rate": 3.829787234042553e-05,
+ "loss": 0.4111,
+ "step": 1130
+ },
+ {
+ "epoch": 4.829787234042553,
+ "grad_norm": 0.4862489700317383,
+ "learning_rate": 3.4042553191489365e-05,
+ "loss": 0.3663,
+ "step": 1135
+ },
+ {
+ "epoch": 4.851063829787234,
+ "grad_norm": 0.6198825240135193,
+ "learning_rate": 2.9787234042553195e-05,
+ "loss": 0.3791,
+ "step": 1140
+ },
+ {
+ "epoch": 4.872340425531915,
+ "grad_norm": 0.5688868165016174,
+ "learning_rate": 2.5531914893617022e-05,
+ "loss": 0.393,
+ "step": 1145
+ },
+ {
+ "epoch": 4.8936170212765955,
+ "grad_norm": 0.44319066405296326,
+ "learning_rate": 2.1276595744680852e-05,
+ "loss": 0.3728,
+ "step": 1150
+ },
+ {
+ "epoch": 4.914893617021277,
+ "grad_norm": 0.6268962621688843,
+ "learning_rate": 1.7021276595744682e-05,
+ "loss": 0.3832,
+ "step": 1155
+ },
+ {
+ "epoch": 4.9361702127659575,
+ "grad_norm": 0.41478314995765686,
+ "learning_rate": 1.2765957446808511e-05,
+ "loss": 0.3647,
+ "step": 1160
+ },
+ {
+ "epoch": 4.957446808510638,
+ "grad_norm": 0.8043156266212463,
+ "learning_rate": 8.510638297872341e-06,
+ "loss": 0.3705,
+ "step": 1165
+ },
+ {
+ "epoch": 4.9787234042553195,
+ "grad_norm": 0.5503541827201843,
+ "learning_rate": 4.255319148936171e-06,
+ "loss": 0.3862,
+ "step": 1170
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.6788877844810486,
+ "learning_rate": 0.0,
+ "loss": 0.3694,
+ "step": 1175
+ },
+ {
+ "epoch": 5.0,
+ "eval_accuracy": 0.7538666666666667,
+ "eval_loss": 0.5220404863357544,
+ "eval_runtime": 52.803,
+ "eval_samples_per_second": 71.019,
+ "eval_steps_per_second": 1.117,
+ "step": 1175
+ },
+ {
+ "epoch": 5.0,
+ "step": 1175,
+ "total_flos": 5.8118992210944e+18,
+ "train_loss": 0.5645027552259729,
+ "train_runtime": 2886.1261,
+ "train_samples_per_second": 25.986,
+ "train_steps_per_second": 0.407
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 1175,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.8118992210944e+18,
+ "train_batch_size": 64,
+ "trial_name": null,
+ "trial_params": null
+ }