End of training

Browse files

Files changed (5) hide show

all_results.json +10 -10
eval_results.json +5 -5
runs/Apr13_14-54-16_67071951de9d/events.out.tfevents.1713021034.67071951de9d.6427.1 +3 -0
train_results.json +5 -5
trainer_state.json +612 -360

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 2.99,
-    "eval_accuracy": 0.9841333333333333,
-    "eval_loss": 0.0466647744178772,
-    "eval_runtime": 44.3114,
-    "eval_samples_per_second": 338.513,
-    "eval_steps_per_second": 10.584,
-    "total_flos": 2.0513761171988152e+18,
-    "train_loss": 0.37578185836037437,
-    "train_runtime": 710.1831,
-    "train_samples_per_second": 147.849,
-    "train_steps_per_second": 1.153
 }

 {
     "epoch": 2.99,
+    "eval_accuracy": 0.9844,
+    "eval_loss": 0.04590694606304169,
+    "eval_runtime": 32.7189,
+    "eval_samples_per_second": 305.634,
+    "eval_steps_per_second": 9.566,
+    "total_flos": 2.930358373492064e+18,
+    "train_loss": 0.33610059461023056,
+    "train_runtime": 935.657,
+    "train_samples_per_second": 160.315,
+    "train_steps_per_second": 1.25
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 2.99,
-    "eval_accuracy": 0.9841333333333333,
-    "eval_loss": 0.0466647744178772,
-    "eval_runtime": 44.3114,
-    "eval_samples_per_second": 338.513,
-    "eval_steps_per_second": 10.584
 }

 {
     "epoch": 2.99,
+    "eval_accuracy": 0.9844,
+    "eval_loss": 0.04590694606304169,
+    "eval_runtime": 32.7189,
+    "eval_samples_per_second": 305.634,
+    "eval_steps_per_second": 9.566
 }

runs/Apr13_14-54-16_67071951de9d/events.out.tfevents.1713021034.67071951de9d.6427.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b142f82300c3b85bcbfdc744203b64d047b3b7d888be2ef12e6a2ec6e995de5
+size 411

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 2.99,
-    "total_flos": 2.0513761171988152e+18,
-    "train_loss": 0.37578185836037437,
-    "train_runtime": 710.1831,
-    "train_samples_per_second": 147.849,
-    "train_steps_per_second": 1.153
 }

 {
     "epoch": 2.99,
+    "total_flos": 2.930358373492064e+18,
+    "train_loss": 0.33610059461023056,
+    "train_runtime": 935.657,
+    "train_samples_per_second": 160.315,
+    "train_steps_per_second": 1.25
 }

trainer_state.json CHANGED Viewed

@@ -1,623 +1,875 @@
 {
-  "best_metric": 0.9841333333333333,
-  "best_model_checkpoint": "vit-small-patch16-224-finetuned-cifar10/checkpoint-819",
-  "epoch": 2.9945155393053016,
   "eval_steps": 500,
-  "global_step": 819,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.04,
-      "grad_norm": 10.646742820739746,
-      "learning_rate": 6.0975609756097564e-06,
-      "loss": 2.6319,
       "step": 10
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 7.98280668258667,
-      "learning_rate": 1.2195121951219513e-05,
-      "loss": 2.3516,
       "step": 20
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 7.600331783294678,
-      "learning_rate": 1.8292682926829268e-05,
-      "loss": 1.8968,
       "step": 30
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 6.469565391540527,
-      "learning_rate": 2.4390243902439026e-05,
-      "loss": 1.3456,
       "step": 40
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 6.272436618804932,
-      "learning_rate": 3.048780487804878e-05,
-      "loss": 0.9244,
       "step": 50
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 5.537989616394043,
-      "learning_rate": 3.6585365853658535e-05,
-      "loss": 0.5842,
       "step": 60
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 5.8809590339660645,
-      "learning_rate": 4.26829268292683e-05,
-      "loss": 0.4952,
       "step": 70
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 8.118090629577637,
-      "learning_rate": 4.878048780487805e-05,
-      "loss": 0.498,
       "step": 80
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 5.70691442489624,
-      "learning_rate": 4.94572591587517e-05,
-      "loss": 0.4449,
       "step": 90
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 6.3202314376831055,
-      "learning_rate": 4.877883310719132e-05,
-      "loss": 0.4748,
       "step": 100
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 5.919207572937012,
-      "learning_rate": 4.810040705563094e-05,
-      "loss": 0.411,
       "step": 110
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 5.670413494110107,
-      "learning_rate": 4.742198100407056e-05,
-      "loss": 0.355,
       "step": 120
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 5.455657958984375,
-      "learning_rate": 4.674355495251018e-05,
-      "loss": 0.3883,
       "step": 130
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 5.554341793060303,
-      "learning_rate": 4.60651289009498e-05,
-      "loss": 0.4351,
       "step": 140
     },
     {
-      "epoch": 0.55,
-      "grad_norm": 5.045084476470947,
-      "learning_rate": 4.5386702849389416e-05,
-      "loss": 0.3472,
       "step": 150
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 4.392239093780518,
-      "learning_rate": 4.470827679782904e-05,
-      "loss": 0.3645,
       "step": 160
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 5.654812335968018,
-      "learning_rate": 4.402985074626866e-05,
-      "loss": 0.3316,
       "step": 170
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 5.683121204376221,
-      "learning_rate": 4.335142469470828e-05,
-      "loss": 0.3252,
       "step": 180
     },
     {
-      "epoch": 0.69,
-      "grad_norm": 7.785736083984375,
-      "learning_rate": 4.26729986431479e-05,
-      "loss": 0.3363,
       "step": 190
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 5.187464714050293,
-      "learning_rate": 4.199457259158752e-05,
-      "loss": 0.3216,
       "step": 200
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 4.926880836486816,
-      "learning_rate": 4.131614654002714e-05,
-      "loss": 0.2756,
       "step": 210
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 4.24468994140625,
-      "learning_rate": 4.063772048846676e-05,
-      "loss": 0.2944,
       "step": 220
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 6.090999126434326,
-      "learning_rate": 3.995929443690638e-05,
-      "loss": 0.3404,
       "step": 230
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 4.666919708251953,
-      "learning_rate": 3.9280868385345995e-05,
-      "loss": 0.3581,
       "step": 240
     },
     {
-      "epoch": 0.91,
-      "grad_norm": 5.284679412841797,
-      "learning_rate": 3.860244233378562e-05,
-      "loss": 0.3318,
       "step": 250
     },
     {
-      "epoch": 0.95,
-      "grad_norm": 5.953047275543213,
-      "learning_rate": 3.792401628222524e-05,
-      "loss": 0.3297,
       "step": 260
     },
     {
-      "epoch": 0.99,
-      "grad_norm": 6.444422245025635,
-      "learning_rate": 3.724559023066486e-05,
-      "loss": 0.2894,
       "step": 270
     },
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.9740666666666666,
-      "eval_loss": 0.08225423842668533,
-      "eval_runtime": 44.5616,
-      "eval_samples_per_second": 336.613,
-      "eval_steps_per_second": 10.525,
-      "step": 273
-    },
-    {
-      "epoch": 1.02,
-      "grad_norm": 5.443094253540039,
-      "learning_rate": 3.656716417910448e-05,
-      "loss": 0.292,
       "step": 280
     },
     {
-      "epoch": 1.06,
-      "grad_norm": 6.843343734741211,
-      "learning_rate": 3.58887381275441e-05,
-      "loss": 0.2819,
       "step": 290
     },
     {
-      "epoch": 1.1,
-      "grad_norm": 5.8328142166137695,
-      "learning_rate": 3.521031207598372e-05,
-      "loss": 0.2971,
       "step": 300
     },
     {
-      "epoch": 1.13,
-      "grad_norm": 4.482433319091797,
-      "learning_rate": 3.453188602442334e-05,
-      "loss": 0.26,
       "step": 310
     },
     {
-      "epoch": 1.17,
-      "grad_norm": 5.714442729949951,
-      "learning_rate": 3.385345997286296e-05,
-      "loss": 0.2873,
       "step": 320
     },
     {
-      "epoch": 1.21,
-      "grad_norm": 5.785560607910156,
-      "learning_rate": 3.3175033921302575e-05,
-      "loss": 0.2367,
       "step": 330
     },
     {
-      "epoch": 1.24,
-      "grad_norm": 4.186683177947998,
-      "learning_rate": 3.24966078697422e-05,
-      "loss": 0.2415,
       "step": 340
     },
     {
-      "epoch": 1.28,
-      "grad_norm": 6.123615741729736,
-      "learning_rate": 3.181818181818182e-05,
-      "loss": 0.3272,
       "step": 350
     },
     {
-      "epoch": 1.32,
-      "grad_norm": 4.757399082183838,
-      "learning_rate": 3.113975576662144e-05,
-      "loss": 0.2609,
       "step": 360
     },
     {
-      "epoch": 1.35,
-      "grad_norm": 5.43366003036499,
-      "learning_rate": 3.046132971506106e-05,
-      "loss": 0.2718,
       "step": 370
     },
     {
-      "epoch": 1.39,
-      "grad_norm": 4.398933410644531,
-      "learning_rate": 2.9782903663500678e-05,
-      "loss": 0.2644,
       "step": 380
     },
     {
-      "epoch": 1.43,
-      "grad_norm": 5.111433506011963,
-      "learning_rate": 2.91044776119403e-05,
-      "loss": 0.2689,
       "step": 390
     },
     {
-      "epoch": 1.46,
-      "grad_norm": 5.859113693237305,
-      "learning_rate": 2.842605156037992e-05,
-      "loss": 0.2379,
       "step": 400
     },
     {
-      "epoch": 1.5,
-      "grad_norm": 3.6556389331817627,
-      "learning_rate": 2.7747625508819542e-05,
-      "loss": 0.2641,
       "step": 410
     },
     {
-      "epoch": 1.54,
-      "grad_norm": 6.068279266357422,
-      "learning_rate": 2.7069199457259158e-05,
-      "loss": 0.2393,
       "step": 420
     },
     {
-      "epoch": 1.57,
-      "grad_norm": 4.939550876617432,
-      "learning_rate": 2.639077340569878e-05,
-      "loss": 0.2518,
       "step": 430
     },
     {
-      "epoch": 1.61,
-      "grad_norm": 5.30012321472168,
-      "learning_rate": 2.57123473541384e-05,
-      "loss": 0.2538,
       "step": 440
     },
     {
-      "epoch": 1.65,
-      "grad_norm": 5.058879852294922,
-      "learning_rate": 2.5033921302578023e-05,
-      "loss": 0.2497,
       "step": 450
     },
     {
-      "epoch": 1.68,
-      "grad_norm": 3.878206729888916,
-      "learning_rate": 2.4355495251017642e-05,
-      "loss": 0.2441,
       "step": 460
     },
     {
-      "epoch": 1.72,
-      "grad_norm": 5.299980163574219,
-      "learning_rate": 2.367706919945726e-05,
-      "loss": 0.2498,
       "step": 470
     },
     {
-      "epoch": 1.76,
-      "grad_norm": 6.087621688842773,
-      "learning_rate": 2.299864314789688e-05,
-      "loss": 0.2806,
       "step": 480
     },
     {
-      "epoch": 1.79,
-      "grad_norm": 4.022277355194092,
-      "learning_rate": 2.2320217096336503e-05,
-      "loss": 0.2765,
       "step": 490
     },
     {
-      "epoch": 1.83,
-      "grad_norm": 4.6718220710754395,
-      "learning_rate": 2.164179104477612e-05,
-      "loss": 0.239,
       "step": 500
     },
     {
-      "epoch": 1.86,
-      "grad_norm": 4.384699821472168,
-      "learning_rate": 2.0963364993215738e-05,
-      "loss": 0.2757,
       "step": 510
     },
     {
-      "epoch": 1.9,
-      "grad_norm": 4.581112861633301,
-      "learning_rate": 2.028493894165536e-05,
-      "loss": 0.2678,
       "step": 520
     },
     {
-      "epoch": 1.94,
-      "grad_norm": 3.6300454139709473,
-      "learning_rate": 1.960651289009498e-05,
-      "loss": 0.2458,
       "step": 530
     },
     {
-      "epoch": 1.97,
-      "grad_norm": 5.09318733215332,
-      "learning_rate": 1.89280868385346e-05,
-      "loss": 0.2451,
       "step": 540
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.9793333333333333,
-      "eval_loss": 0.061371468007564545,
-      "eval_runtime": 44.9848,
-      "eval_samples_per_second": 333.446,
-      "eval_steps_per_second": 10.426,
-      "step": 547
-    },
-    {
-      "epoch": 2.01,
-      "grad_norm": 4.121983528137207,
-      "learning_rate": 1.824966078697422e-05,
-      "loss": 0.2469,
       "step": 550
     },
     {
-      "epoch": 2.05,
-      "grad_norm": 3.6604321002960205,
-      "learning_rate": 1.757123473541384e-05,
-      "loss": 0.24,
       "step": 560
     },
     {
-      "epoch": 2.08,
-      "grad_norm": 5.3272385597229,
-      "learning_rate": 1.689280868385346e-05,
-      "loss": 0.2585,
       "step": 570
     },
     {
-      "epoch": 2.12,
-      "grad_norm": 3.9364449977874756,
-      "learning_rate": 1.6214382632293083e-05,
-      "loss": 0.2234,
       "step": 580
     },
     {
-      "epoch": 2.16,
-      "grad_norm": 4.854574203491211,
-      "learning_rate": 1.55359565807327e-05,
-      "loss": 0.232,
       "step": 590
     },
     {
-      "epoch": 2.19,
-      "grad_norm": 3.7035410404205322,
-      "learning_rate": 1.485753052917232e-05,
-      "loss": 0.2095,
       "step": 600
     },
     {
-      "epoch": 2.23,
-      "grad_norm": 4.301865577697754,
-      "learning_rate": 1.417910447761194e-05,
-      "loss": 0.1954,
       "step": 610
     },
     {
-      "epoch": 2.27,
-      "grad_norm": 4.957614421844482,
-      "learning_rate": 1.3500678426051561e-05,
-      "loss": 0.2287,
       "step": 620
     },
     {
-      "epoch": 2.3,
-      "grad_norm": 4.7505645751953125,
-      "learning_rate": 1.282225237449118e-05,
-      "loss": 0.2181,
       "step": 630
     },
     {
-      "epoch": 2.34,
-      "grad_norm": 5.2432050704956055,
-      "learning_rate": 1.2143826322930801e-05,
-      "loss": 0.2306,
       "step": 640
     },
     {
-      "epoch": 2.38,
-      "grad_norm": 3.746467113494873,
-      "learning_rate": 1.1465400271370422e-05,
-      "loss": 0.2406,
       "step": 650
     },
     {
-      "epoch": 2.41,
-      "grad_norm": 6.041552543640137,
-      "learning_rate": 1.0786974219810041e-05,
-      "loss": 0.2681,
       "step": 660
     },
     {
-      "epoch": 2.45,
-      "grad_norm": 6.18747091293335,
-      "learning_rate": 1.010854816824966e-05,
-      "loss": 0.1891,
       "step": 670
     },
     {
-      "epoch": 2.49,
-      "grad_norm": 4.8129472732543945,
-      "learning_rate": 9.430122116689281e-06,
-      "loss": 0.2047,
       "step": 680
     },
     {
-      "epoch": 2.52,
-      "grad_norm": 3.5734217166900635,
-      "learning_rate": 8.751696065128902e-06,
-      "loss": 0.2172,
       "step": 690
     },
     {
-      "epoch": 2.56,
-      "grad_norm": 3.77048659324646,
-      "learning_rate": 8.073270013568522e-06,
-      "loss": 0.2085,
       "step": 700
     },
     {
-      "epoch": 2.6,
-      "grad_norm": 3.4081170558929443,
-      "learning_rate": 7.394843962008141e-06,
-      "loss": 0.2176,
       "step": 710
     },
     {
-      "epoch": 2.63,
-      "grad_norm": 4.99780797958374,
-      "learning_rate": 6.716417910447762e-06,
-      "loss": 0.1924,
       "step": 720
     },
     {
-      "epoch": 2.67,
-      "grad_norm": 4.378239631652832,
-      "learning_rate": 6.037991858887382e-06,
-      "loss": 0.2172,
       "step": 730
     },
     {
-      "epoch": 2.71,
-      "grad_norm": 4.286013126373291,
-      "learning_rate": 5.359565807327002e-06,
-      "loss": 0.2281,
       "step": 740
     },
     {
-      "epoch": 2.74,
-      "grad_norm": 4.865882396697998,
-      "learning_rate": 4.681139755766622e-06,
-      "loss": 0.2241,
       "step": 750
     },
     {
-      "epoch": 2.78,
-      "grad_norm": 3.976633071899414,
-      "learning_rate": 4.002713704206242e-06,
-      "loss": 0.2064,
       "step": 760
     },
     {
-      "epoch": 2.82,
-      "grad_norm": 4.488178253173828,
-      "learning_rate": 3.324287652645862e-06,
-      "loss": 0.1979,
       "step": 770
     },
     {
-      "epoch": 2.85,
-      "grad_norm": 3.1040143966674805,
-      "learning_rate": 2.645861601085482e-06,
-      "loss": 0.2191,
       "step": 780
     },
     {
-      "epoch": 2.89,
-      "grad_norm": 4.020033836364746,
-      "learning_rate": 1.967435549525102e-06,
-      "loss": 0.2144,
       "step": 790
     },
     {
-      "epoch": 2.93,
-      "grad_norm": 4.7591142654418945,
-      "learning_rate": 1.289009497964722e-06,
-      "loss": 0.2052,
       "step": 800
     },
     {
-      "epoch": 2.96,
-      "grad_norm": 4.364315986633301,
-      "learning_rate": 6.10583446404342e-07,
-      "loss": 0.2428,
       "step": 810
     },
     {
       "epoch": 2.99,
-      "eval_accuracy": 0.9841333333333333,
-      "eval_loss": 0.0466647744178772,
-      "eval_runtime": 44.9184,
-      "eval_samples_per_second": 333.939,
-      "eval_steps_per_second": 10.441,
-      "step": 819
     },
     {
       "epoch": 2.99,
-      "step": 819,
-      "total_flos": 2.0513761171988152e+18,
-      "train_loss": 0.37578185836037437,
-      "train_runtime": 710.1831,
-      "train_samples_per_second": 147.849,
-      "train_steps_per_second": 1.153
     }
   ],
   "logging_steps": 10,
-  "max_steps": 819,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
-  "total_flos": 2.0513761171988152e+18,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.9844,
+  "best_model_checkpoint": "vit-small-patch16-224-finetuned-cifar10/checkpoint-1170",
+  "epoch": 2.9942418426103647,
   "eval_steps": 500,
+  "global_step": 1170,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.03,
+      "grad_norm": 8.970051765441895,
+      "learning_rate": 4.273504273504274e-06,
+      "loss": 2.432,
       "step": 10
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 7.947530746459961,
+      "learning_rate": 8.547008547008548e-06,
+      "loss": 2.2376,
       "step": 20
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 7.60875129699707,
+      "learning_rate": 1.282051282051282e-05,
+      "loss": 1.9698,
       "step": 30
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 6.434678554534912,
+      "learning_rate": 1.7094017094017095e-05,
+      "loss": 1.5489,
       "step": 40
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 6.238937854766846,
+      "learning_rate": 2.1367521367521368e-05,
+      "loss": 1.1375,
       "step": 50
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 5.423764228820801,
+      "learning_rate": 2.564102564102564e-05,
+      "loss": 0.7332,
       "step": 60
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 7.145791530609131,
+      "learning_rate": 2.9914529914529915e-05,
+      "loss": 0.5841,
       "step": 70
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 6.430414199829102,
+      "learning_rate": 3.418803418803419e-05,
+      "loss": 0.4957,
       "step": 80
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 6.975243091583252,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.4523,
       "step": 90
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 6.09722900390625,
+      "learning_rate": 4.2735042735042735e-05,
+      "loss": 0.4462,
       "step": 100
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 5.052177429199219,
+      "learning_rate": 4.700854700854701e-05,
+      "loss": 0.3624,
       "step": 110
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 5.480886459350586,
+      "learning_rate": 4.985754985754986e-05,
+      "loss": 0.3895,
       "step": 120
     },
     {
+      "epoch": 0.33,
+      "grad_norm": 4.691812992095947,
+      "learning_rate": 4.938271604938271e-05,
+      "loss": 0.3446,
       "step": 130
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 6.404294013977051,
+      "learning_rate": 4.890788224121557e-05,
+      "loss": 0.4006,
       "step": 140
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 5.33477258682251,
+      "learning_rate": 4.8433048433048433e-05,
+      "loss": 0.3532,
       "step": 150
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 5.476822853088379,
+      "learning_rate": 4.7958214624881294e-05,
+      "loss": 0.3421,
       "step": 160
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 6.005227565765381,
+      "learning_rate": 4.7483380816714154e-05,
+      "loss": 0.3692,
       "step": 170
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 4.84128999710083,
+      "learning_rate": 4.700854700854701e-05,
+      "loss": 0.3457,
       "step": 180
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 4.650127410888672,
+      "learning_rate": 4.653371320037987e-05,
+      "loss": 0.3685,
       "step": 190
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 5.779717445373535,
+      "learning_rate": 4.605887939221273e-05,
+      "loss": 0.3451,
       "step": 200
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 6.38178825378418,
+      "learning_rate": 4.558404558404559e-05,
+      "loss": 0.3394,
       "step": 210
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 5.148958206176758,
+      "learning_rate": 4.510921177587845e-05,
+      "loss": 0.3641,
       "step": 220
     },
     {
+      "epoch": 0.59,
+      "grad_norm": 3.9889109134674072,
+      "learning_rate": 4.463437796771131e-05,
+      "loss": 0.3108,
       "step": 230
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 6.51384162902832,
+      "learning_rate": 4.415954415954416e-05,
+      "loss": 0.2949,
       "step": 240
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 4.950154781341553,
+      "learning_rate": 4.368471035137702e-05,
+      "loss": 0.3352,
       "step": 250
     },
     {
+      "epoch": 0.67,
+      "grad_norm": 6.234986782073975,
+      "learning_rate": 4.3209876543209875e-05,
+      "loss": 0.309,
       "step": 260
     },
     {
+      "epoch": 0.69,
+      "grad_norm": 5.57274055480957,
+      "learning_rate": 4.2735042735042735e-05,
+      "loss": 0.3161,
       "step": 270
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 5.511316776275635,
+      "learning_rate": 4.2260208926875595e-05,
+      "loss": 0.3018,
       "step": 280
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 3.934835910797119,
+      "learning_rate": 4.1785375118708455e-05,
+      "loss": 0.3116,
       "step": 290
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 5.323722839355469,
+      "learning_rate": 4.131054131054131e-05,
+      "loss": 0.3105,
       "step": 300
     },
     {
+      "epoch": 0.79,
+      "grad_norm": 4.1425909996032715,
+      "learning_rate": 4.083570750237417e-05,
+      "loss": 0.296,
       "step": 310
     },
     {
+      "epoch": 0.82,
+      "grad_norm": 5.026642799377441,
+      "learning_rate": 4.036087369420703e-05,
+      "loss": 0.3594,
       "step": 320
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 3.949415922164917,
+      "learning_rate": 3.988603988603989e-05,
+      "loss": 0.2849,
       "step": 330
     },
     {
+      "epoch": 0.87,
+      "grad_norm": 4.380434989929199,
+      "learning_rate": 3.941120607787275e-05,
+      "loss": 0.2935,
       "step": 340
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 4.884699821472168,
+      "learning_rate": 3.893637226970561e-05,
+      "loss": 0.3128,
       "step": 350
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 6.950473308563232,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.3073,
       "step": 360
     },
     {
+      "epoch": 0.95,
+      "grad_norm": 5.410139560699463,
+      "learning_rate": 3.798670465337132e-05,
+      "loss": 0.2998,
       "step": 370
     },
     {
+      "epoch": 0.97,
+      "grad_norm": 4.638174057006836,
+      "learning_rate": 3.7511870845204176e-05,
+      "loss": 0.3124,
       "step": 380
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 4.711712837219238,
+      "learning_rate": 3.7037037037037037e-05,
+      "loss": 0.2682,
       "step": 390
     },
     {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9713,
+      "eval_loss": 0.0821285992860794,
+      "eval_runtime": 32.3489,
+      "eval_samples_per_second": 309.13,
+      "eval_steps_per_second": 9.676,
+      "step": 390
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 4.979706764221191,
+      "learning_rate": 3.65622032288699e-05,
+      "loss": 0.2836,
       "step": 400
     },
     {
+      "epoch": 1.05,
+      "grad_norm": 3.807408332824707,
+      "learning_rate": 3.608736942070276e-05,
+      "loss": 0.2521,
       "step": 410
     },
     {
+      "epoch": 1.07,
+      "grad_norm": 4.402036190032959,
+      "learning_rate": 3.561253561253561e-05,
+      "loss": 0.2469,
       "step": 420
     },
     {
+      "epoch": 1.1,
+      "grad_norm": 3.0667805671691895,
+      "learning_rate": 3.513770180436847e-05,
+      "loss": 0.2383,
       "step": 430
     },
     {
+      "epoch": 1.13,
+      "grad_norm": 6.403406620025635,
+      "learning_rate": 3.466286799620133e-05,
+      "loss": 0.277,
       "step": 440
     },
     {
+      "epoch": 1.15,
+      "grad_norm": 5.481479167938232,
+      "learning_rate": 3.418803418803419e-05,
+      "loss": 0.2533,
       "step": 450
     },
     {
+      "epoch": 1.18,
+      "grad_norm": 5.324038028717041,
+      "learning_rate": 3.371320037986705e-05,
+      "loss": 0.2813,
       "step": 460
     },
     {
+      "epoch": 1.2,
+      "grad_norm": 2.4851675033569336,
+      "learning_rate": 3.323836657169991e-05,
+      "loss": 0.3055,
       "step": 470
     },
     {
+      "epoch": 1.23,
+      "grad_norm": 4.365708827972412,
+      "learning_rate": 3.2763532763532764e-05,
+      "loss": 0.2797,
       "step": 480
     },
     {
+      "epoch": 1.25,
+      "grad_norm": 3.6004509925842285,
+      "learning_rate": 3.2288698955365625e-05,
+      "loss": 0.2776,
       "step": 490
     },
     {
+      "epoch": 1.28,
+      "grad_norm": 3.7064690589904785,
+      "learning_rate": 3.181386514719848e-05,
+      "loss": 0.2605,
       "step": 500
     },
     {
+      "epoch": 1.31,
+      "grad_norm": 4.883594036102295,
+      "learning_rate": 3.133903133903134e-05,
+      "loss": 0.2403,
       "step": 510
     },
     {
+      "epoch": 1.33,
+      "grad_norm": 5.014550685882568,
+      "learning_rate": 3.08641975308642e-05,
+      "loss": 0.2436,
       "step": 520
     },
     {
+      "epoch": 1.36,
+      "grad_norm": 4.304734230041504,
+      "learning_rate": 3.0389363722697055e-05,
+      "loss": 0.2639,
       "step": 530
     },
     {
+      "epoch": 1.38,
+      "grad_norm": 3.2759273052215576,
+      "learning_rate": 2.9914529914529915e-05,
+      "loss": 0.2651,
       "step": 540
     },
     {
+      "epoch": 1.41,
+      "grad_norm": 3.556528091430664,
+      "learning_rate": 2.9439696106362775e-05,
+      "loss": 0.2359,
       "step": 550
     },
     {
+      "epoch": 1.43,
+      "grad_norm": 4.672804355621338,
+      "learning_rate": 2.8964862298195632e-05,
+      "loss": 0.2376,
       "step": 560
     },
     {
+      "epoch": 1.46,
+      "grad_norm": 4.527768611907959,
+      "learning_rate": 2.8490028490028492e-05,
+      "loss": 0.2504,
       "step": 570
     },
     {
+      "epoch": 1.48,
+      "grad_norm": 5.102674961090088,
+      "learning_rate": 2.8015194681861352e-05,
+      "loss": 0.2604,
       "step": 580
     },
     {
+      "epoch": 1.51,
+      "grad_norm": 4.18637228012085,
+      "learning_rate": 2.754036087369421e-05,
+      "loss": 0.2746,
       "step": 590
     },
     {
+      "epoch": 1.54,
+      "grad_norm": 3.3811545372009277,
+      "learning_rate": 2.706552706552707e-05,
+      "loss": 0.2548,
       "step": 600
     },
     {
+      "epoch": 1.56,
+      "grad_norm": 3.1672608852386475,
+      "learning_rate": 2.6590693257359926e-05,
+      "loss": 0.2423,
       "step": 610
     },
     {
+      "epoch": 1.59,
+      "grad_norm": 3.609534740447998,
+      "learning_rate": 2.611585944919278e-05,
+      "loss": 0.1994,
       "step": 620
     },
     {
+      "epoch": 1.61,
+      "grad_norm": 5.083642482757568,
+      "learning_rate": 2.564102564102564e-05,
+      "loss": 0.292,
       "step": 630
     },
     {
+      "epoch": 1.64,
+      "grad_norm": 4.716630935668945,
+      "learning_rate": 2.51661918328585e-05,
+      "loss": 0.2333,
       "step": 640
     },
     {
+      "epoch": 1.66,
+      "grad_norm": 4.915971755981445,
+      "learning_rate": 2.4691358024691357e-05,
+      "loss": 0.2364,
       "step": 650
     },
     {
+      "epoch": 1.69,
+      "grad_norm": 4.216696739196777,
+      "learning_rate": 2.4216524216524217e-05,
+      "loss": 0.2503,
       "step": 660
     },
     {
+      "epoch": 1.71,
+      "grad_norm": 4.966453552246094,
+      "learning_rate": 2.3741690408357077e-05,
+      "loss": 0.224,
       "step": 670
     },
     {
+      "epoch": 1.74,
+      "grad_norm": 4.153652191162109,
+      "learning_rate": 2.3266856600189934e-05,
+      "loss": 0.2109,
       "step": 680
     },
     {
+      "epoch": 1.77,
+      "grad_norm": 2.9919214248657227,
+      "learning_rate": 2.2792022792022794e-05,
+      "loss": 0.2371,
       "step": 690
     },
     {
+      "epoch": 1.79,
+      "grad_norm": 5.105522155761719,
+      "learning_rate": 2.2317188983855654e-05,
+      "loss": 0.2187,
       "step": 700
     },
     {
+      "epoch": 1.82,
+      "grad_norm": 3.9936702251434326,
+      "learning_rate": 2.184235517568851e-05,
+      "loss": 0.2403,
       "step": 710
     },
     {
+      "epoch": 1.84,
+      "grad_norm": 5.220417022705078,
+      "learning_rate": 2.1367521367521368e-05,
+      "loss": 0.2383,
       "step": 720
     },
     {
+      "epoch": 1.87,
+      "grad_norm": 3.134110450744629,
+      "learning_rate": 2.0892687559354228e-05,
+      "loss": 0.2197,
       "step": 730
     },
     {
+      "epoch": 1.89,
+      "grad_norm": 5.406284809112549,
+      "learning_rate": 2.0417853751187084e-05,
+      "loss": 0.224,
       "step": 740
     },
     {
+      "epoch": 1.92,
+      "grad_norm": 5.273613452911377,
+      "learning_rate": 1.9943019943019945e-05,
+      "loss": 0.1951,
       "step": 750
     },
     {
+      "epoch": 1.94,
+      "grad_norm": 4.695704936981201,
+      "learning_rate": 1.9468186134852805e-05,
+      "loss": 0.2324,
       "step": 760
     },
     {
+      "epoch": 1.97,
+      "grad_norm": 5.734136581420898,
+      "learning_rate": 1.899335232668566e-05,
+      "loss": 0.248,
       "step": 770
     },
     {
+      "epoch": 2.0,
+      "grad_norm": 4.757730960845947,
+      "learning_rate": 1.8518518518518518e-05,
+      "loss": 0.252,
       "step": 780
     },
     {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9842,
+      "eval_loss": 0.050203289836645126,
+      "eval_runtime": 31.7681,
+      "eval_samples_per_second": 314.781,
+      "eval_steps_per_second": 9.853,
+      "step": 781
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 4.400415897369385,
+      "learning_rate": 1.804368471035138e-05,
+      "loss": 0.2364,
       "step": 790
     },
     {
+      "epoch": 2.05,
+      "grad_norm": 4.02318000793457,
+      "learning_rate": 1.7568850902184235e-05,
+      "loss": 0.2164,
       "step": 800
     },
     {
+      "epoch": 2.07,
+      "grad_norm": 5.289691925048828,
+      "learning_rate": 1.7094017094017095e-05,
+      "loss": 0.2472,
       "step": 810
     },
+    {
+      "epoch": 2.1,
+      "grad_norm": 3.842559337615967,
+      "learning_rate": 1.6619183285849956e-05,
+      "loss": 0.2209,
+      "step": 820
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 5.64124870300293,
+      "learning_rate": 1.6144349477682812e-05,
+      "loss": 0.2473,
+      "step": 830
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 4.473005771636963,
+      "learning_rate": 1.566951566951567e-05,
+      "loss": 0.2287,
+      "step": 840
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 4.476832389831543,
+      "learning_rate": 1.5194681861348528e-05,
+      "loss": 0.2208,
+      "step": 850
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 4.89029598236084,
+      "learning_rate": 1.4719848053181388e-05,
+      "loss": 0.2332,
+      "step": 860
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 5.0262627601623535,
+      "learning_rate": 1.4245014245014246e-05,
+      "loss": 0.2001,
+      "step": 870
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 4.80778169631958,
+      "learning_rate": 1.3770180436847105e-05,
+      "loss": 0.2018,
+      "step": 880
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 3.203700304031372,
+      "learning_rate": 1.3295346628679963e-05,
+      "loss": 0.2018,
+      "step": 890
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 4.840007305145264,
+      "learning_rate": 1.282051282051282e-05,
+      "loss": 0.2754,
+      "step": 900
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 4.501500129699707,
+      "learning_rate": 1.2345679012345678e-05,
+      "loss": 0.1986,
+      "step": 910
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 3.4367220401763916,
+      "learning_rate": 1.1870845204178538e-05,
+      "loss": 0.2145,
+      "step": 920
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 5.076148509979248,
+      "learning_rate": 1.1396011396011397e-05,
+      "loss": 0.2001,
+      "step": 930
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 2.299694299697876,
+      "learning_rate": 1.0921177587844255e-05,
+      "loss": 0.1935,
+      "step": 940
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 3.9149887561798096,
+      "learning_rate": 1.0446343779677114e-05,
+      "loss": 0.2158,
+      "step": 950
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 5.087822437286377,
+      "learning_rate": 9.971509971509972e-06,
+      "loss": 0.2052,
+      "step": 960
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 3.8664252758026123,
+      "learning_rate": 9.49667616334283e-06,
+      "loss": 0.2056,
+      "step": 970
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 4.961308479309082,
+      "learning_rate": 9.02184235517569e-06,
+      "loss": 0.2157,
+      "step": 980
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 4.553085803985596,
+      "learning_rate": 8.547008547008548e-06,
+      "loss": 0.2079,
+      "step": 990
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 3.396073818206787,
+      "learning_rate": 8.072174738841406e-06,
+      "loss": 0.1919,
+      "step": 1000
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 5.101813316345215,
+      "learning_rate": 7.597340930674264e-06,
+      "loss": 0.1812,
+      "step": 1010
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 5.189543724060059,
+      "learning_rate": 7.122507122507123e-06,
+      "loss": 0.2123,
+      "step": 1020
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 4.240951061248779,
+      "learning_rate": 6.6476733143399815e-06,
+      "loss": 0.218,
+      "step": 1030
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 3.139678716659546,
+      "learning_rate": 6.172839506172839e-06,
+      "loss": 0.1824,
+      "step": 1040
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 2.84495210647583,
+      "learning_rate": 5.6980056980056985e-06,
+      "loss": 0.1916,
+      "step": 1050
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 3.4231679439544678,
+      "learning_rate": 5.223171889838557e-06,
+      "loss": 0.1953,
+      "step": 1060
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 4.120250225067139,
+      "learning_rate": 4.748338081671415e-06,
+      "loss": 0.2046,
+      "step": 1070
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 5.0515055656433105,
+      "learning_rate": 4.273504273504274e-06,
+      "loss": 0.1941,
+      "step": 1080
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 4.197494029998779,
+      "learning_rate": 3.798670465337132e-06,
+      "loss": 0.1908,
+      "step": 1090
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 5.316411972045898,
+      "learning_rate": 3.3238366571699908e-06,
+      "loss": 0.1954,
+      "step": 1100
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 2.8527472019195557,
+      "learning_rate": 2.8490028490028492e-06,
+      "loss": 0.1466,
+      "step": 1110
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 4.074756622314453,
+      "learning_rate": 2.3741690408357077e-06,
+      "loss": 0.2092,
+      "step": 1120
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 4.336794853210449,
+      "learning_rate": 1.899335232668566e-06,
+      "loss": 0.1742,
+      "step": 1130
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 3.8528594970703125,
+      "learning_rate": 1.4245014245014246e-06,
+      "loss": 0.2002,
+      "step": 1140
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 4.159782409667969,
+      "learning_rate": 9.49667616334283e-07,
+      "loss": 0.1847,
+      "step": 1150
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 5.317773342132568,
+      "learning_rate": 4.748338081671415e-07,
+      "loss": 0.2293,
+      "step": 1160
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 4.241806983947754,
+      "learning_rate": 0.0,
+      "loss": 0.173,
+      "step": 1170
+    },
     {
       "epoch": 2.99,
+      "eval_accuracy": 0.9844,
+      "eval_loss": 0.04590694606304169,
+      "eval_runtime": 32.5039,
+      "eval_samples_per_second": 307.655,
+      "eval_steps_per_second": 9.63,
+      "step": 1170
     },
     {
       "epoch": 2.99,
+      "step": 1170,
+      "total_flos": 2.930358373492064e+18,
+      "train_loss": 0.33610059461023056,
+      "train_runtime": 935.657,
+      "train_samples_per_second": 160.315,
+      "train_steps_per_second": 1.25
     }
   ],
   "logging_steps": 10,
+  "max_steps": 1170,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
+  "total_flos": 2.930358373492064e+18,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null