🍻 cheers

Browse files

Files changed (6) hide show

README.md +5 -2
all_results.json +9 -9
eval_results.json +5 -5
runs/Apr03_18-00-45_X5C922065N/events.out.tfevents.1712217271.X5C922065N.13113.2 +3 -0
train_results.json +4 -4
trainer_state.json +703 -314

README.md CHANGED Viewed

@@ -1,6 +1,9 @@
 ---
 base_model: d071696/vit-finetune-scrap
 tags:
 - generated_from_trainer
 datasets:
 - arrow
@@ -13,7 +16,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: arrow
       type: arrow
       config: default
       split: train
@@ -29,7 +32,7 @@ should probably proofread and complete it, then remove this comment. -->
 # vit-finetune-scrap
-This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the arrow dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.3599
 - Accuracy: 0.9260

 ---
 base_model: d071696/vit-finetune-scrap
 tags:
+- image-classification
+- image-feature-extraction
+- image-to-text
 - generated_from_trainer
 datasets:
 - arrow
       name: Image Classification
       type: image-classification
     dataset:
+      name: d071696/scraps1
       type: arrow
       config: default
       split: train
 # vit-finetune-scrap
+This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the d071696/scraps1 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.3599
 - Accuracy: 0.9260

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 0.954983922829582,
-    "eval_loss": 0.15877817571163177,
-    "eval_runtime": 9.6813,
-    "eval_samples_per_second": 64.248,
-    "eval_steps_per_second": 8.057,
     "total_flos": 7.703325099767808e+17,
-    "train_loss": 0.15572628828410345,
-    "train_runtime": 552.98,
-    "train_samples_per_second": 17.975,
-    "train_steps_per_second": 1.128
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.9260450160771704,
+    "eval_loss": 0.3599020838737488,
+    "eval_runtime": 9.9383,
+    "eval_samples_per_second": 62.586,
+    "eval_steps_per_second": 7.848,
     "total_flos": 7.703325099767808e+17,
+    "train_loss": 0.11289434264701495,
+    "train_runtime": 3405.5271,
+    "train_samples_per_second": 2.919,
+    "train_steps_per_second": 0.365
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 0.954983922829582,
-    "eval_loss": 0.15877817571163177,
-    "eval_runtime": 9.6813,
-    "eval_samples_per_second": 64.248,
-    "eval_steps_per_second": 8.057
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.9260450160771704,
+    "eval_loss": 0.3599020838737488,
+    "eval_runtime": 9.9383,
+    "eval_samples_per_second": 62.586,
+    "eval_steps_per_second": 7.848
 }

runs/Apr03_18-00-45_X5C922065N/events.out.tfevents.1712217271.X5C922065N.13113.2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76e34509a2c262075a85ede1cd4f01009894a490b5cef5fecbb8a7a82387e238
+size 734

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
     "total_flos": 7.703325099767808e+17,
-    "train_loss": 0.15572628828410345,
-    "train_runtime": 552.98,
-    "train_samples_per_second": 17.975,
-    "train_steps_per_second": 1.128
 }

 {
     "epoch": 4.0,
     "total_flos": 7.703325099767808e+17,
+    "train_loss": 0.11289434264701495,
+    "train_runtime": 3405.5271,
+    "train_samples_per_second": 2.919,
+    "train_steps_per_second": 0.365
 }

trainer_state.json CHANGED Viewed

@@ -1,518 +1,907 @@
 {
-  "best_metric": 0.15877817571163177,
-  "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-300",
   "epoch": 4.0,
-  "eval_steps": 100,
-  "global_step": 624,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.06,
-      "grad_norm": 2.869983673095703,
-      "learning_rate": 0.00019679487179487178,
-      "loss": 1.9747,
       "step": 10
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 2.9758315086364746,
-      "learning_rate": 0.0001935897435897436,
-      "loss": 1.209,
       "step": 20
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 3.3387157917022705,
-      "learning_rate": 0.00019038461538461538,
-      "loss": 0.7205,
       "step": 30
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 2.921093463897705,
-      "learning_rate": 0.0001871794871794872,
-      "loss": 0.4159,
       "step": 40
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 2.4197134971618652,
-      "learning_rate": 0.00018397435897435897,
-      "loss": 0.3879,
       "step": 50
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 2.498006582260132,
-      "learning_rate": 0.00018076923076923077,
-      "loss": 0.2695,
       "step": 60
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 0.9913655519485474,
-      "learning_rate": 0.00017756410256410257,
-      "loss": 0.33,
       "step": 70
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 5.167428493499756,
-      "learning_rate": 0.00017435897435897436,
-      "loss": 0.2374,
       "step": 80
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 5.158258438110352,
-      "learning_rate": 0.00017115384615384616,
-      "loss": 0.2531,
       "step": 90
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 2.1700403690338135,
-      "learning_rate": 0.00016794871794871796,
-      "loss": 0.1672,
-      "step": 100
-    },
-    {
-      "epoch": 0.64,
-      "eval_accuracy": 0.9485530546623794,
-      "eval_loss": 0.22496841847896576,
-      "eval_runtime": 11.2945,
-      "eval_samples_per_second": 55.071,
-      "eval_steps_per_second": 6.906,
       "step": 100
     },
     {
-      "epoch": 0.71,
-      "grad_norm": 4.994268894195557,
-      "learning_rate": 0.00016474358974358976,
-      "loss": 0.1319,
       "step": 110
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 2.2666022777557373,
-      "learning_rate": 0.00016153846153846155,
-      "loss": 0.283,
       "step": 120
     },
     {
-      "epoch": 0.83,
-      "grad_norm": 3.1912319660186768,
-      "learning_rate": 0.00015833333333333332,
-      "loss": 0.1666,
       "step": 130
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 3.1358578205108643,
-      "learning_rate": 0.00015512820512820515,
-      "loss": 0.2819,
       "step": 140
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 2.050241470336914,
-      "learning_rate": 0.00015192307692307692,
-      "loss": 0.2874,
       "step": 150
     },
     {
-      "epoch": 1.03,
-      "grad_norm": 5.193918704986572,
-      "learning_rate": 0.00014871794871794872,
-      "loss": 0.2103,
       "step": 160
     },
     {
-      "epoch": 1.09,
-      "grad_norm": 0.405056893825531,
-      "learning_rate": 0.00014551282051282051,
-      "loss": 0.1818,
       "step": 170
     },
     {
-      "epoch": 1.15,
-      "grad_norm": 11.979884147644043,
-      "learning_rate": 0.0001423076923076923,
-      "loss": 0.1626,
       "step": 180
     },
     {
-      "epoch": 1.22,
-      "grad_norm": 0.3927968442440033,
-      "learning_rate": 0.0001391025641025641,
-      "loss": 0.0824,
       "step": 190
     },
     {
-      "epoch": 1.28,
-      "grad_norm": 0.11850783228874207,
-      "learning_rate": 0.0001358974358974359,
-      "loss": 0.1277,
-      "step": 200
-    },
-    {
-      "epoch": 1.28,
-      "eval_accuracy": 0.9372990353697749,
-      "eval_loss": 0.24667073786258698,
-      "eval_runtime": 76.2101,
-      "eval_samples_per_second": 8.162,
-      "eval_steps_per_second": 1.023,
       "step": 200
     },
     {
-      "epoch": 1.35,
-      "grad_norm": 0.8870755434036255,
-      "learning_rate": 0.0001326923076923077,
-      "loss": 0.1308,
       "step": 210
     },
     {
-      "epoch": 1.41,
-      "grad_norm": 0.12857934832572937,
-      "learning_rate": 0.0001294871794871795,
-      "loss": 0.1047,
       "step": 220
     },
     {
-      "epoch": 1.47,
-      "grad_norm": 0.11967829614877701,
-      "learning_rate": 0.00012628205128205127,
-      "loss": 0.0523,
       "step": 230
     },
     {
-      "epoch": 1.54,
-      "grad_norm": 1.8435248136520386,
-      "learning_rate": 0.0001230769230769231,
-      "loss": 0.089,
       "step": 240
     },
     {
-      "epoch": 1.6,
-      "grad_norm": 0.07049544900655746,
-      "learning_rate": 0.00011987179487179487,
-      "loss": 0.0651,
       "step": 250
     },
     {
-      "epoch": 1.67,
-      "grad_norm": 7.795147895812988,
-      "learning_rate": 0.00011666666666666668,
-      "loss": 0.0827,
       "step": 260
     },
     {
-      "epoch": 1.73,
-      "grad_norm": 0.06726662069559097,
-      "learning_rate": 0.00011346153846153846,
-      "loss": 0.1727,
       "step": 270
     },
     {
-      "epoch": 1.79,
-      "grad_norm": 4.7732672691345215,
-      "learning_rate": 0.00011025641025641027,
-      "loss": 0.0867,
       "step": 280
     },
     {
-      "epoch": 1.86,
-      "grad_norm": 0.08257576823234558,
-      "learning_rate": 0.00010705128205128206,
-      "loss": 0.0349,
       "step": 290
     },
     {
-      "epoch": 1.92,
-      "grad_norm": 0.15855157375335693,
-      "learning_rate": 0.00010384615384615386,
-      "loss": 0.0253,
-      "step": 300
-    },
-    {
-      "epoch": 1.92,
-      "eval_accuracy": 0.954983922829582,
-      "eval_loss": 0.15877817571163177,
-      "eval_runtime": 9.849,
-      "eval_samples_per_second": 63.153,
-      "eval_steps_per_second": 7.92,
       "step": 300
     },
     {
-      "epoch": 1.99,
-      "grad_norm": 11.21109676361084,
-      "learning_rate": 0.00010064102564102564,
-      "loss": 0.0988,
       "step": 310
     },
     {
-      "epoch": 2.05,
-      "grad_norm": 0.06449388712644577,
-      "learning_rate": 9.743589743589744e-05,
-      "loss": 0.0666,
       "step": 320
     },
     {
-      "epoch": 2.12,
-      "grad_norm": 0.988405168056488,
-      "learning_rate": 9.423076923076924e-05,
-      "loss": 0.0295,
       "step": 330
     },
     {
-      "epoch": 2.18,
-      "grad_norm": 0.06675518304109573,
-      "learning_rate": 9.102564102564103e-05,
-      "loss": 0.018,
       "step": 340
     },
     {
-      "epoch": 2.24,
-      "grad_norm": 0.08486536890268326,
-      "learning_rate": 8.782051282051283e-05,
-      "loss": 0.0714,
       "step": 350
     },
     {
-      "epoch": 2.31,
-      "grad_norm": 0.05260853096842766,
-      "learning_rate": 8.461538461538461e-05,
-      "loss": 0.0354,
       "step": 360
     },
     {
-      "epoch": 2.37,
-      "grad_norm": 0.053938668221235275,
-      "learning_rate": 8.141025641025641e-05,
-      "loss": 0.0548,
       "step": 370
     },
     {
-      "epoch": 2.44,
-      "grad_norm": 0.06470278650522232,
-      "learning_rate": 7.820512820512821e-05,
-      "loss": 0.0162,
       "step": 380
     },
     {
-      "epoch": 2.5,
-      "grad_norm": 0.0850602388381958,
-      "learning_rate": 7.500000000000001e-05,
-      "loss": 0.033,
       "step": 390
     },
     {
-      "epoch": 2.56,
-      "grad_norm": 0.04342366382479668,
-      "learning_rate": 7.17948717948718e-05,
-      "loss": 0.0224,
-      "step": 400
-    },
-    {
-      "epoch": 2.56,
-      "eval_accuracy": 0.9533762057877814,
-      "eval_loss": 0.16905710101127625,
-      "eval_runtime": 9.8491,
-      "eval_samples_per_second": 63.153,
-      "eval_steps_per_second": 7.92,
       "step": 400
     },
     {
-      "epoch": 2.63,
-      "grad_norm": 0.05912560597062111,
-      "learning_rate": 6.858974358974359e-05,
-      "loss": 0.0503,
       "step": 410
     },
     {
-      "epoch": 2.69,
-      "grad_norm": 0.04359501227736473,
-      "learning_rate": 6.538461538461539e-05,
-      "loss": 0.0537,
       "step": 420
     },
     {
-      "epoch": 2.76,
-      "grad_norm": 0.0935799852013588,
-      "learning_rate": 6.217948717948718e-05,
-      "loss": 0.0145,
       "step": 430
     },
     {
-      "epoch": 2.82,
-      "grad_norm": 0.05057013779878616,
-      "learning_rate": 5.897435897435898e-05,
-      "loss": 0.0132,
       "step": 440
     },
     {
-      "epoch": 2.88,
-      "grad_norm": 0.4015754461288452,
-      "learning_rate": 5.576923076923077e-05,
-      "loss": 0.0382,
       "step": 450
     },
     {
-      "epoch": 2.95,
-      "grad_norm": 0.03421848267316818,
-      "learning_rate": 5.256410256410257e-05,
-      "loss": 0.0123,
       "step": 460
     },
     {
-      "epoch": 3.01,
-      "grad_norm": 0.043205052614212036,
-      "learning_rate": 4.935897435897436e-05,
-      "loss": 0.0417,
       "step": 470
     },
     {
-      "epoch": 3.08,
-      "grad_norm": 0.03611929342150688,
-      "learning_rate": 4.615384615384616e-05,
-      "loss": 0.0125,
       "step": 480
     },
     {
-      "epoch": 3.14,
-      "grad_norm": 0.04078483581542969,
-      "learning_rate": 4.294871794871795e-05,
-      "loss": 0.0193,
       "step": 490
     },
     {
-      "epoch": 3.21,
-      "grad_norm": 0.043482307344675064,
-      "learning_rate": 3.974358974358974e-05,
-      "loss": 0.0321,
       "step": 500
     },
     {
-      "epoch": 3.21,
-      "eval_accuracy": 0.9565916398713826,
-      "eval_loss": 0.17511475086212158,
-      "eval_runtime": 9.8922,
-      "eval_samples_per_second": 62.878,
-      "eval_steps_per_second": 7.885,
-      "step": 500
-    },
-    {
-      "epoch": 3.27,
-      "grad_norm": 0.03763527050614357,
-      "learning_rate": 3.653846153846154e-05,
-      "loss": 0.0118,
       "step": 510
     },
     {
-      "epoch": 3.33,
-      "grad_norm": 0.05929262936115265,
-      "learning_rate": 3.3333333333333335e-05,
-      "loss": 0.0636,
       "step": 520
     },
     {
-      "epoch": 3.4,
-      "grad_norm": 0.039751436561346054,
-      "learning_rate": 3.012820512820513e-05,
-      "loss": 0.0105,
       "step": 530
     },
     {
-      "epoch": 3.46,
-      "grad_norm": 0.03735564276576042,
-      "learning_rate": 2.6923076923076923e-05,
-      "loss": 0.0495,
       "step": 540
     },
     {
-      "epoch": 3.53,
-      "grad_norm": 0.051983293145895004,
-      "learning_rate": 2.3717948717948718e-05,
-      "loss": 0.011,
       "step": 550
     },
     {
-      "epoch": 3.59,
-      "grad_norm": 0.034572675824165344,
-      "learning_rate": 2.0512820512820512e-05,
-      "loss": 0.0109,
       "step": 560
     },
     {
-      "epoch": 3.65,
-      "grad_norm": 0.04169879108667374,
-      "learning_rate": 1.730769230769231e-05,
-      "loss": 0.0108,
       "step": 570
     },
     {
-      "epoch": 3.72,
-      "grad_norm": 0.032876156270504,
-      "learning_rate": 1.4102564102564104e-05,
-      "loss": 0.0104,
       "step": 580
     },
     {
-      "epoch": 3.78,
-      "grad_norm": 0.03522384166717529,
-      "learning_rate": 1.0897435897435898e-05,
-      "loss": 0.0109,
       "step": 590
     },
     {
-      "epoch": 3.85,
-      "grad_norm": 0.11409874260425568,
-      "learning_rate": 7.692307692307694e-06,
-      "loss": 0.0112,
       "step": 600
     },
     {
-      "epoch": 3.85,
-      "eval_accuracy": 0.954983922829582,
-      "eval_loss": 0.18050101399421692,
-      "eval_runtime": 9.8888,
-      "eval_samples_per_second": 62.899,
-      "eval_steps_per_second": 7.888,
-      "step": 600
-    },
-    {
-      "epoch": 3.91,
-      "grad_norm": 0.04359051212668419,
-      "learning_rate": 4.487179487179488e-06,
-      "loss": 0.0109,
       "step": 610
     },
     {
-      "epoch": 3.97,
-      "grad_norm": 0.03071708232164383,
-      "learning_rate": 1.282051282051282e-06,
-      "loss": 0.0429,
       "step": 620
     },
     {
       "epoch": 4.0,
-      "step": 624,
       "total_flos": 7.703325099767808e+17,
-      "train_loss": 0.15572628828410345,
-      "train_runtime": 552.98,
-      "train_samples_per_second": 17.975,
-      "train_steps_per_second": 1.128
     }
   ],
   "logging_steps": 10,
-  "max_steps": 624,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
-  "save_steps": 100,
   "total_flos": 7.703325099767808e+17,
-  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null
 }

 {
+  "best_metric": 0.3599020838737488,
+  "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
   "epoch": 4.0,
+  "eval_steps": 1000,
+  "global_step": 1244,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.03,
+      "grad_norm": 0.7885215282440186,
+      "learning_rate": 0.00019839228295819936,
+      "loss": 0.3445,
       "step": 10
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 0.6006595492362976,
+      "learning_rate": 0.00019678456591639874,
+      "loss": 0.0458,
       "step": 20
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 0.0847816988825798,
+      "learning_rate": 0.00019517684887459809,
+      "loss": 0.1535,
       "step": 30
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 10.684067726135254,
+      "learning_rate": 0.00019356913183279743,
+      "loss": 0.1208,
       "step": 40
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.08578234165906906,
+      "learning_rate": 0.00019196141479099678,
+      "loss": 0.0208,
       "step": 50
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.06457830220460892,
+      "learning_rate": 0.00019035369774919616,
+      "loss": 0.0838,
       "step": 60
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 12.919880867004395,
+      "learning_rate": 0.0001887459807073955,
+      "loss": 0.0491,
       "step": 70
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 10.229437828063965,
+      "learning_rate": 0.00018713826366559486,
+      "loss": 0.1614,
       "step": 80
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.03680131584405899,
+      "learning_rate": 0.0001855305466237942,
+      "loss": 0.2226,
       "step": 90
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 1.575692892074585,
+      "learning_rate": 0.0001839228295819936,
+      "loss": 0.3464,
       "step": 100
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 11.554421424865723,
+      "learning_rate": 0.00018231511254019294,
+      "loss": 0.267,
       "step": 110
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 11.113668441772461,
+      "learning_rate": 0.00018070739549839229,
+      "loss": 0.3107,
       "step": 120
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 5.346070766448975,
+      "learning_rate": 0.00017909967845659166,
+      "loss": 0.6457,
       "step": 130
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 31.744802474975586,
+      "learning_rate": 0.000177491961414791,
+      "loss": 0.4913,
       "step": 140
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 6.202216148376465,
+      "learning_rate": 0.00017588424437299036,
+      "loss": 0.6627,
       "step": 150
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 12.808435440063477,
+      "learning_rate": 0.0001742765273311897,
+      "loss": 0.4071,
       "step": 160
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 0.17637468874454498,
+      "learning_rate": 0.0001726688102893891,
+      "loss": 0.2312,
       "step": 170
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 9.431242942810059,
+      "learning_rate": 0.00017106109324758844,
+      "loss": 0.1206,
       "step": 180
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.0686856061220169,
+      "learning_rate": 0.0001694533762057878,
+      "loss": 0.2657,
       "step": 190
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.035174936056137085,
+      "learning_rate": 0.00016784565916398716,
+      "loss": 0.2447,
       "step": 200
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.20221814513206482,
+      "learning_rate": 0.0001662379421221865,
+      "loss": 0.0932,
       "step": 210
     },
     {
+      "epoch": 0.71,
+      "grad_norm": 22.348093032836914,
+      "learning_rate": 0.00016463022508038586,
+      "loss": 0.5622,
       "step": 220
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 0.09363432973623276,
+      "learning_rate": 0.0001630225080385852,
+      "loss": 0.1989,
       "step": 230
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 0.10725115239620209,
+      "learning_rate": 0.0001614147909967846,
+      "loss": 0.1174,
       "step": 240
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 2.2332615852355957,
+      "learning_rate": 0.00015980707395498394,
+      "loss": 0.3852,
       "step": 250
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.08053239434957504,
+      "learning_rate": 0.0001581993569131833,
+      "loss": 0.2247,
       "step": 260
     },
     {
+      "epoch": 0.87,
+      "grad_norm": 0.21441006660461426,
+      "learning_rate": 0.00015659163987138264,
+      "loss": 0.2879,
       "step": 270
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 0.0631105974316597,
+      "learning_rate": 0.00015498392282958201,
+      "loss": 0.2922,
       "step": 280
     },
     {
+      "epoch": 0.93,
+      "grad_norm": 20.96392822265625,
+      "learning_rate": 0.00015337620578778136,
+      "loss": 0.0413,
       "step": 290
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 0.4054552912712097,
+      "learning_rate": 0.0001517684887459807,
+      "loss": 0.1037,
       "step": 300
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 1.8209251165390015,
+      "learning_rate": 0.0001501607717041801,
+      "loss": 0.1802,
       "step": 310
     },
     {
+      "epoch": 1.03,
+      "grad_norm": 11.979750633239746,
+      "learning_rate": 0.00014855305466237944,
+      "loss": 0.1788,
       "step": 320
     },
     {
+      "epoch": 1.06,
+      "grad_norm": 0.10045678168535233,
+      "learning_rate": 0.0001469453376205788,
+      "loss": 0.4287,
       "step": 330
     },
     {
+      "epoch": 1.09,
+      "grad_norm": 0.051606420427560806,
+      "learning_rate": 0.00014533762057877814,
+      "loss": 0.1055,
       "step": 340
     },
     {
+      "epoch": 1.13,
+      "grad_norm": 0.20983509719371796,
+      "learning_rate": 0.00014372990353697752,
+      "loss": 0.1667,
       "step": 350
     },
     {
+      "epoch": 1.16,
+      "grad_norm": 0.15858705341815948,
+      "learning_rate": 0.00014212218649517686,
+      "loss": 0.1135,
       "step": 360
     },
     {
+      "epoch": 1.19,
+      "grad_norm": 1.420276165008545,
+      "learning_rate": 0.00014051446945337621,
+      "loss": 0.2112,
       "step": 370
     },
     {
+      "epoch": 1.22,
+      "grad_norm": 0.02672198973596096,
+      "learning_rate": 0.0001389067524115756,
+      "loss": 0.0313,
       "step": 380
     },
     {
+      "epoch": 1.25,
+      "grad_norm": 0.02562582679092884,
+      "learning_rate": 0.00013729903536977494,
+      "loss": 0.1069,
       "step": 390
     },
     {
+      "epoch": 1.29,
+      "grad_norm": 4.197677135467529,
+      "learning_rate": 0.0001356913183279743,
+      "loss": 0.2342,
       "step": 400
     },
     {
+      "epoch": 1.32,
+      "grad_norm": 0.04306723549962044,
+      "learning_rate": 0.00013408360128617364,
+      "loss": 0.1313,
       "step": 410
     },
     {
+      "epoch": 1.35,
+      "grad_norm": 4.383702278137207,
+      "learning_rate": 0.00013247588424437302,
+      "loss": 0.2145,
       "step": 420
     },
     {
+      "epoch": 1.38,
+      "grad_norm": 0.20137301087379456,
+      "learning_rate": 0.00013086816720257237,
+      "loss": 0.1514,
       "step": 430
     },
     {
+      "epoch": 1.41,
+      "grad_norm": 0.020689483731985092,
+      "learning_rate": 0.00012926045016077172,
+      "loss": 0.0712,
       "step": 440
     },
     {
+      "epoch": 1.45,
+      "grad_norm": 0.018355347216129303,
+      "learning_rate": 0.00012765273311897106,
+      "loss": 0.0923,
       "step": 450
     },
     {
+      "epoch": 1.48,
+      "grad_norm": 17.85742950439453,
+      "learning_rate": 0.00012604501607717044,
+      "loss": 0.0779,
       "step": 460
     },
     {
+      "epoch": 1.51,
+      "grad_norm": 0.05268234387040138,
+      "learning_rate": 0.0001244372990353698,
+      "loss": 0.0096,
       "step": 470
     },
     {
+      "epoch": 1.54,
+      "grad_norm": 0.09329644590616226,
+      "learning_rate": 0.00012282958199356914,
+      "loss": 0.0392,
       "step": 480
     },
     {
+      "epoch": 1.58,
+      "grad_norm": 8.043782234191895,
+      "learning_rate": 0.0001212218649517685,
+      "loss": 0.1322,
       "step": 490
     },
     {
+      "epoch": 1.61,
+      "grad_norm": 0.016368461772799492,
+      "learning_rate": 0.00011961414790996785,
+      "loss": 0.0843,
       "step": 500
     },
     {
+      "epoch": 1.64,
+      "grad_norm": 13.989496231079102,
+      "learning_rate": 0.0001180064308681672,
+      "loss": 0.2665,
       "step": 510
     },
     {
+      "epoch": 1.67,
+      "grad_norm": 14.703727722167969,
+      "learning_rate": 0.00011639871382636655,
+      "loss": 0.1551,
       "step": 520
     },
     {
+      "epoch": 1.7,
+      "grad_norm": 0.13277527689933777,
+      "learning_rate": 0.00011479099678456593,
+      "loss": 0.1346,
       "step": 530
     },
     {
+      "epoch": 1.74,
+      "grad_norm": 0.04265744984149933,
+      "learning_rate": 0.00011318327974276528,
+      "loss": 0.1725,
       "step": 540
     },
     {
+      "epoch": 1.77,
+      "grad_norm": 0.04861054942011833,
+      "learning_rate": 0.00011157556270096463,
+      "loss": 0.0696,
       "step": 550
     },
     {
+      "epoch": 1.8,
+      "grad_norm": 0.015978263691067696,
+      "learning_rate": 0.00010996784565916398,
+      "loss": 0.0583,
       "step": 560
     },
     {
+      "epoch": 1.83,
+      "grad_norm": 0.21797218918800354,
+      "learning_rate": 0.00010836012861736335,
+      "loss": 0.1746,
       "step": 570
     },
     {
+      "epoch": 1.86,
+      "grad_norm": 16.11418342590332,
+      "learning_rate": 0.0001067524115755627,
+      "loss": 0.2571,
       "step": 580
     },
     {
+      "epoch": 1.9,
+      "grad_norm": 0.025191914290189743,
+      "learning_rate": 0.00010514469453376205,
+      "loss": 0.1326,
       "step": 590
     },
     {
+      "epoch": 1.93,
+      "grad_norm": 0.03328488767147064,
+      "learning_rate": 0.00010353697749196143,
+      "loss": 0.1601,
       "step": 600
     },
     {
+      "epoch": 1.96,
+      "grad_norm": 0.017355144023895264,
+      "learning_rate": 0.00010192926045016078,
+      "loss": 0.1607,
       "step": 610
     },
     {
+      "epoch": 1.99,
+      "grad_norm": 0.15774357318878174,
+      "learning_rate": 0.00010032154340836013,
+      "loss": 0.0182,
       "step": 620
     },
+    {
+      "epoch": 2.03,
+      "grad_norm": 0.039228443056344986,
+      "learning_rate": 9.871382636655949e-05,
+      "loss": 0.0898,
+      "step": 630
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 27.718074798583984,
+      "learning_rate": 9.710610932475884e-05,
+      "loss": 0.1895,
+      "step": 640
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 0.10635074228048325,
+      "learning_rate": 9.54983922829582e-05,
+      "loss": 0.0805,
+      "step": 650
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.01466443482786417,
+      "learning_rate": 9.389067524115757e-05,
+      "loss": 0.0697,
+      "step": 660
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.037685755640268326,
+      "learning_rate": 9.228295819935692e-05,
+      "loss": 0.0031,
+      "step": 670
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.009445942007005215,
+      "learning_rate": 9.067524115755628e-05,
+      "loss": 0.0636,
+      "step": 680
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.011714440770447254,
+      "learning_rate": 8.906752411575563e-05,
+      "loss": 0.0554,
+      "step": 690
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.018049364909529686,
+      "learning_rate": 8.7459807073955e-05,
+      "loss": 0.0073,
+      "step": 700
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 0.013684015721082687,
+      "learning_rate": 8.585209003215434e-05,
+      "loss": 0.004,
+      "step": 710
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.0740511417388916,
+      "learning_rate": 8.42443729903537e-05,
+      "loss": 0.0724,
+      "step": 720
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.0066555822268128395,
+      "learning_rate": 8.263665594855306e-05,
+      "loss": 0.0029,
+      "step": 730
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.00826901663094759,
+      "learning_rate": 8.102893890675242e-05,
+      "loss": 0.0745,
+      "step": 740
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 0.009980367496609688,
+      "learning_rate": 7.942122186495177e-05,
+      "loss": 0.0026,
+      "step": 750
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 7.373602390289307,
+      "learning_rate": 7.781350482315113e-05,
+      "loss": 0.1221,
+      "step": 760
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.011884603649377823,
+      "learning_rate": 7.62057877813505e-05,
+      "loss": 0.0404,
+      "step": 770
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 28.61383628845215,
+      "learning_rate": 7.459807073954984e-05,
+      "loss": 0.0483,
+      "step": 780
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.006681604776531458,
+      "learning_rate": 7.299035369774921e-05,
+      "loss": 0.0729,
+      "step": 790
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 0.02536676451563835,
+      "learning_rate": 7.138263665594856e-05,
+      "loss": 0.0023,
+      "step": 800
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.016948092728853226,
+      "learning_rate": 6.977491961414792e-05,
+      "loss": 0.0426,
+      "step": 810
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.021912137046456337,
+      "learning_rate": 6.816720257234727e-05,
+      "loss": 0.064,
+      "step": 820
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.014551441185176373,
+      "learning_rate": 6.655948553054663e-05,
+      "loss": 0.065,
+      "step": 830
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.3249826431274414,
+      "learning_rate": 6.495176848874598e-05,
+      "loss": 0.1171,
+      "step": 840
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 17.302183151245117,
+      "learning_rate": 6.334405144694535e-05,
+      "loss": 0.066,
+      "step": 850
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 0.03599601984024048,
+      "learning_rate": 6.173633440514471e-05,
+      "loss": 0.0203,
+      "step": 860
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.011334868147969246,
+      "learning_rate": 6.012861736334405e-05,
+      "loss": 0.0188,
+      "step": 870
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.012830100953578949,
+      "learning_rate": 5.8520900321543414e-05,
+      "loss": 0.0028,
+      "step": 880
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.09628697484731674,
+      "learning_rate": 5.6913183279742764e-05,
+      "loss": 0.0857,
+      "step": 890
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 0.008330035023391247,
+      "learning_rate": 5.530546623794213e-05,
+      "loss": 0.0018,
+      "step": 900
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.007389857899397612,
+      "learning_rate": 5.369774919614148e-05,
+      "loss": 0.0019,
+      "step": 910
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.011146631091833115,
+      "learning_rate": 5.209003215434084e-05,
+      "loss": 0.104,
+      "step": 920
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 0.021591678261756897,
+      "learning_rate": 5.048231511254019e-05,
+      "loss": 0.0016,
+      "step": 930
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 0.09255488216876984,
+      "learning_rate": 4.887459807073955e-05,
+      "loss": 0.1024,
+      "step": 940
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 0.01300421915948391,
+      "learning_rate": 4.726688102893891e-05,
+      "loss": 0.0019,
+      "step": 950
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 0.009576304815709591,
+      "learning_rate": 4.5659163987138265e-05,
+      "loss": 0.0036,
+      "step": 960
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.006058481056243181,
+      "learning_rate": 4.405144694533762e-05,
+      "loss": 0.0333,
+      "step": 970
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.00888640247285366,
+      "learning_rate": 4.244372990353698e-05,
+      "loss": 0.0026,
+      "step": 980
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 0.008608637377619743,
+      "learning_rate": 4.083601286173634e-05,
+      "loss": 0.002,
+      "step": 990
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 0.017658203840255737,
+      "learning_rate": 3.92282958199357e-05,
+      "loss": 0.0021,
+      "step": 1000
+    },
+    {
+      "epoch": 3.22,
+      "eval_accuracy": 0.9260450160771704,
+      "eval_loss": 0.3599020838737488,
+      "eval_runtime": 19.6251,
+      "eval_samples_per_second": 31.694,
+      "eval_steps_per_second": 3.975,
+      "step": 1000
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.015870286151766777,
+      "learning_rate": 3.7620578778135054e-05,
+      "loss": 0.002,
+      "step": 1010
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 0.006019009742885828,
+      "learning_rate": 3.601286173633441e-05,
+      "loss": 0.0021,
+      "step": 1020
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 0.02200642041862011,
+      "learning_rate": 3.4405144694533766e-05,
+      "loss": 0.0484,
+      "step": 1030
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 0.00513369170948863,
+      "learning_rate": 3.279742765273312e-05,
+      "loss": 0.0021,
+      "step": 1040
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 0.0066593969240784645,
+      "learning_rate": 3.118971061093248e-05,
+      "loss": 0.0017,
+      "step": 1050
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 0.010374795645475388,
+      "learning_rate": 2.9581993569131832e-05,
+      "loss": 0.0015,
+      "step": 1060
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.006329710595309734,
+      "learning_rate": 2.7974276527331188e-05,
+      "loss": 0.0727,
+      "step": 1070
+    },
+    {
+      "epoch": 3.47,
+      "grad_norm": 0.012291769497096539,
+      "learning_rate": 2.6366559485530545e-05,
+      "loss": 0.0022,
+      "step": 1080
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.009208714589476585,
+      "learning_rate": 2.4758842443729904e-05,
+      "loss": 0.0017,
+      "step": 1090
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 0.008606062270700932,
+      "learning_rate": 2.315112540192926e-05,
+      "loss": 0.0355,
+      "step": 1100
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 0.005816610064357519,
+      "learning_rate": 2.154340836012862e-05,
+      "loss": 0.0024,
+      "step": 1110
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.016983453184366226,
+      "learning_rate": 1.9935691318327977e-05,
+      "loss": 0.0022,
+      "step": 1120
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 0.013002891093492508,
+      "learning_rate": 1.8327974276527333e-05,
+      "loss": 0.0027,
+      "step": 1130
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 0.004919551312923431,
+      "learning_rate": 1.672025723472669e-05,
+      "loss": 0.0038,
+      "step": 1140
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 0.01140748430043459,
+      "learning_rate": 1.5112540192926044e-05,
+      "loss": 0.0018,
+      "step": 1150
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 0.0061045498587191105,
+      "learning_rate": 1.3504823151125404e-05,
+      "loss": 0.0015,
+      "step": 1160
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.01204584538936615,
+      "learning_rate": 1.189710610932476e-05,
+      "loss": 0.0019,
+      "step": 1170
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 0.005698794033378363,
+      "learning_rate": 1.0289389067524116e-05,
+      "loss": 0.0015,
+      "step": 1180
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 0.008717156946659088,
+      "learning_rate": 8.681672025723474e-06,
+      "loss": 0.0018,
+      "step": 1190
+    },
+    {
+      "epoch": 3.86,
+      "grad_norm": 0.03146166726946831,
+      "learning_rate": 7.07395498392283e-06,
+      "loss": 0.0305,
+      "step": 1200
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 0.0084818284958601,
+      "learning_rate": 5.466237942122187e-06,
+      "loss": 0.0046,
+      "step": 1210
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.026195811107754707,
+      "learning_rate": 3.858520900321544e-06,
+      "loss": 0.0018,
+      "step": 1220
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 0.01475218404084444,
+      "learning_rate": 2.2508038585209006e-06,
+      "loss": 0.0019,
+      "step": 1230
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 0.006672242656350136,
+      "learning_rate": 6.430868167202573e-07,
+      "loss": 0.0516,
+      "step": 1240
+    },
     {
       "epoch": 4.0,
+      "step": 1244,
       "total_flos": 7.703325099767808e+17,
+      "train_loss": 0.11289434264701495,
+      "train_runtime": 3405.5271,
+      "train_samples_per_second": 2.919,
+      "train_steps_per_second": 0.365
     }
   ],
   "logging_steps": 10,
+  "max_steps": 1244,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
+  "save_steps": 1000,
   "total_flos": 7.703325099767808e+17,
+  "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
 }