End of training

Browse files

Files changed (6) hide show

README.md +14 -2
all_results.json +16 -0
eval_results.json +10 -0
runs/Jun05_07-51-28_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1717673474.phyl-ling-p01.la.utexas.edu.175344.1 +3 -0
train_results.json +9 -0
trainer_state.json +2807 -0

README.md CHANGED Viewed

@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_anans_new-3e-4
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 # smolm-autoreg-bpe-counterfactual_babylm_anans_new-3e-4
-This model was trained from scratch on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4257
 - Accuracy: 0.4096

 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/counterfactual_babylm_anans_new
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_anans_new-3e-4
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/counterfactual_babylm_anans_new
+      type: kanishka/counterfactual_babylm_anans_new
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.4096272415057219
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # smolm-autoreg-bpe-counterfactual_babylm_anans_new-3e-4
+This model was trained from scratch on the kanishka/counterfactual_babylm_anans_new dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4257
 - Accuracy: 0.4096

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4096272415057219,
+    "eval_loss": 3.4256882667541504,
+    "eval_runtime": 154.1089,
+    "eval_samples": 57921,
+    "eval_samples_per_second": 375.845,
+    "eval_steps_per_second": 5.879,
+    "perplexity": 30.74379750505285,
+    "total_flos": 1.5669257538816e+18,
+    "train_loss": 3.0592986363049124,
+    "train_runtime": 81068.1718,
+    "train_samples": 595035,
+    "train_samples_per_second": 146.799,
+    "train_steps_per_second": 4.587
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4096272415057219,
+    "eval_loss": 3.4256882667541504,
+    "eval_runtime": 154.1089,
+    "eval_samples": 57921,
+    "eval_samples_per_second": 375.845,
+    "eval_steps_per_second": 5.879,
+    "perplexity": 30.74379750505285
+}

runs/Jun05_07-51-28_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1717673474.phyl-ling-p01.la.utexas.edu.175344.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6e284846fb41be8706eb3897d2cb1f75fe68484b2e10c6bd43cc920b0a8b1ed
+size 417

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 20.0,
+    "total_flos": 1.5669257538816e+18,
+    "train_loss": 3.0592986363049124,
+    "train_runtime": 81068.1718,
+    "train_samples": 595035,
+    "train_samples_per_second": 146.799,
+    "train_steps_per_second": 4.587
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2807 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 20.0,
+  "eval_steps": 500,
+  "global_step": 371900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05377789728421619,
+      "grad_norm": 0.7097817659378052,
+      "learning_rate": 9.375e-06,
+      "loss": 6.8536,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10755579456843238,
+      "grad_norm": 0.8630837798118591,
+      "learning_rate": 1.875e-05,
+      "loss": 5.3656,
+      "step": 2000
+    },
+    {
+      "epoch": 0.16133369185264856,
+      "grad_norm": 0.9362223148345947,
+      "learning_rate": 2.8125e-05,
+      "loss": 5.0405,
+      "step": 3000
+    },
+    {
+      "epoch": 0.21511158913686476,
+      "grad_norm": 0.9098538756370544,
+      "learning_rate": 3.75e-05,
+      "loss": 4.8207,
+      "step": 4000
+    },
+    {
+      "epoch": 0.26888948642108096,
+      "grad_norm": 1.0826491117477417,
+      "learning_rate": 4.6874999999999994e-05,
+      "loss": 4.6468,
+      "step": 5000
+    },
+    {
+      "epoch": 0.32266738370529713,
+      "grad_norm": 0.9499918222427368,
+      "learning_rate": 5.625e-05,
+      "loss": 4.4983,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3764452809895133,
+      "grad_norm": 0.9495302438735962,
+      "learning_rate": 6.5625e-05,
+      "loss": 4.39,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4302231782737295,
+      "grad_norm": 0.8918095231056213,
+      "learning_rate": 7.5e-05,
+      "loss": 4.2828,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4840010755579457,
+      "grad_norm": 0.8999391794204712,
+      "learning_rate": 8.437499999999999e-05,
+      "loss": 4.2052,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5377789728421619,
+      "grad_norm": 0.872922956943512,
+      "learning_rate": 9.374999999999999e-05,
+      "loss": 4.1356,
+      "step": 10000
+    },
+    {
+      "epoch": 0.591556870126378,
+      "grad_norm": 0.8972263932228088,
+      "learning_rate": 0.00010312499999999999,
+      "loss": 4.0696,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6453347674105943,
+      "grad_norm": 0.853663980960846,
+      "learning_rate": 0.000112490625,
+      "loss": 4.008,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6991126646948105,
+      "grad_norm": 0.8272613286972046,
+      "learning_rate": 0.000121865625,
+      "loss": 3.9457,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7528905619790266,
+      "grad_norm": 0.8427807092666626,
+      "learning_rate": 0.00013123125,
+      "loss": 3.8928,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8066684592632428,
+      "grad_norm": 0.7996278405189514,
+      "learning_rate": 0.00014060625,
+      "loss": 3.8373,
+      "step": 15000
+    },
+    {
+      "epoch": 0.860446356547459,
+      "grad_norm": 0.7708893418312073,
+      "learning_rate": 0.000149971875,
+      "loss": 3.8015,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9142242538316752,
+      "grad_norm": 0.750762403011322,
+      "learning_rate": 0.000159346875,
+      "loss": 3.7619,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9680021511158914,
+      "grad_norm": 0.8043661117553711,
+      "learning_rate": 0.000168703125,
+      "loss": 3.7355,
+      "step": 18000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.3460631369831053,
+      "eval_loss": 3.899465799331665,
+      "eval_runtime": 151.8767,
+      "eval_samples_per_second": 381.369,
+      "eval_steps_per_second": 5.965,
+      "step": 18595
+    },
+    {
+      "epoch": 1.0217800484001076,
+      "grad_norm": 0.7180488109588623,
+      "learning_rate": 0.000178078125,
+      "loss": 3.6982,
+      "step": 19000
+    },
+    {
+      "epoch": 1.0755579456843238,
+      "grad_norm": 0.6873713135719299,
+      "learning_rate": 0.00018745312499999998,
+      "loss": 3.6651,
+      "step": 20000
+    },
+    {
+      "epoch": 1.1293358429685398,
+      "grad_norm": 0.6932269334793091,
+      "learning_rate": 0.00019680937499999996,
+      "loss": 3.6514,
+      "step": 21000
+    },
+    {
+      "epoch": 1.183113740252756,
+      "grad_norm": 0.668073296546936,
+      "learning_rate": 0.00020618437499999995,
+      "loss": 3.6328,
+      "step": 22000
+    },
+    {
+      "epoch": 1.2368916375369723,
+      "grad_norm": 0.6708812713623047,
+      "learning_rate": 0.00021555937499999998,
+      "loss": 3.6162,
+      "step": 23000
+    },
+    {
+      "epoch": 1.2906695348211885,
+      "grad_norm": 0.6274955868721008,
+      "learning_rate": 0.00022493437499999998,
+      "loss": 3.592,
+      "step": 24000
+    },
+    {
+      "epoch": 1.3444474321054047,
+      "grad_norm": 0.6207209229469299,
+      "learning_rate": 0.00023430937499999997,
+      "loss": 3.5712,
+      "step": 25000
+    },
+    {
+      "epoch": 1.398225329389621,
+      "grad_norm": 0.5920027494430542,
+      "learning_rate": 0.00024367499999999997,
+      "loss": 3.5621,
+      "step": 26000
+    },
+    {
+      "epoch": 1.452003226673837,
+      "grad_norm": 0.5753495097160339,
+      "learning_rate": 0.00025305,
+      "loss": 3.5494,
+      "step": 27000
+    },
+    {
+      "epoch": 1.5057811239580532,
+      "grad_norm": 0.5462520718574524,
+      "learning_rate": 0.000262425,
+      "loss": 3.5336,
+      "step": 28000
+    },
+    {
+      "epoch": 1.5595590212422694,
+      "grad_norm": 0.5698980689048767,
+      "learning_rate": 0.000271790625,
+      "loss": 3.5201,
+      "step": 29000
+    },
+    {
+      "epoch": 1.6133369185264856,
+      "grad_norm": 0.5474157333374023,
+      "learning_rate": 0.00028115624999999994,
+      "loss": 3.5162,
+      "step": 30000
+    },
+    {
+      "epoch": 1.6671148158107019,
+      "grad_norm": 0.5613803267478943,
+      "learning_rate": 0.00029053124999999994,
+      "loss": 3.5054,
+      "step": 31000
+    },
+    {
+      "epoch": 1.7208927130949179,
+      "grad_norm": 0.5235997438430786,
+      "learning_rate": 0.00029990624999999993,
+      "loss": 3.4928,
+      "step": 32000
+    },
+    {
+      "epoch": 1.7746706103791343,
+      "grad_norm": 0.5065760016441345,
+      "learning_rate": 0.00029912621359223297,
+      "loss": 3.4811,
+      "step": 33000
+    },
+    {
+      "epoch": 1.8284485076633503,
+      "grad_norm": 0.5031436085700989,
+      "learning_rate": 0.0002982444836716681,
+      "loss": 3.4693,
+      "step": 34000
+    },
+    {
+      "epoch": 1.8822264049475665,
+      "grad_norm": 0.47748515009880066,
+      "learning_rate": 0.00029736187113857014,
+      "loss": 3.4572,
+      "step": 35000
+    },
+    {
+      "epoch": 1.9360043022317828,
+      "grad_norm": 0.47696492075920105,
+      "learning_rate": 0.0002964801412180053,
+      "loss": 3.4431,
+      "step": 36000
+    },
+    {
+      "epoch": 1.9897821995159988,
+      "grad_norm": 0.43063884973526,
+      "learning_rate": 0.0002955975286849073,
+      "loss": 3.4292,
+      "step": 37000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.37681863334985255,
+      "eval_loss": 3.60871958732605,
+      "eval_runtime": 153.6559,
+      "eval_samples_per_second": 376.953,
+      "eval_steps_per_second": 5.896,
+      "step": 37190
+    },
+    {
+      "epoch": 2.043560096800215,
+      "grad_norm": 0.4518415331840515,
+      "learning_rate": 0.0002947149161518093,
+      "loss": 3.385,
+      "step": 38000
+    },
+    {
+      "epoch": 2.0973379940844312,
+      "grad_norm": 0.45232945680618286,
+      "learning_rate": 0.0002938331862312445,
+      "loss": 3.3727,
+      "step": 39000
+    },
+    {
+      "epoch": 2.1511158913686477,
+      "grad_norm": 0.4550546407699585,
+      "learning_rate": 0.0002929505736981465,
+      "loss": 3.366,
+      "step": 40000
+    },
+    {
+      "epoch": 2.2048937886528637,
+      "grad_norm": 0.43207359313964844,
+      "learning_rate": 0.0002920679611650485,
+      "loss": 3.3641,
+      "step": 41000
+    },
+    {
+      "epoch": 2.2586716859370797,
+      "grad_norm": 0.43842893838882446,
+      "learning_rate": 0.0002911853486319506,
+      "loss": 3.3596,
+      "step": 42000
+    },
+    {
+      "epoch": 2.312449583221296,
+      "grad_norm": 0.420186311006546,
+      "learning_rate": 0.0002903036187113857,
+      "loss": 3.3504,
+      "step": 43000
+    },
+    {
+      "epoch": 2.366227480505512,
+      "grad_norm": 0.4150792062282562,
+      "learning_rate": 0.0002894210061782877,
+      "loss": 3.3515,
+      "step": 44000
+    },
+    {
+      "epoch": 2.4200053777897286,
+      "grad_norm": 0.4212990701198578,
+      "learning_rate": 0.0002885401588702559,
+      "loss": 3.3412,
+      "step": 45000
+    },
+    {
+      "epoch": 2.4737832750739446,
+      "grad_norm": 0.4120098054409027,
+      "learning_rate": 0.00028765754633715794,
+      "loss": 3.3399,
+      "step": 46000
+    },
+    {
+      "epoch": 2.527561172358161,
+      "grad_norm": 0.38389134407043457,
+      "learning_rate": 0.00028677493380406,
+      "loss": 3.3321,
+      "step": 47000
+    },
+    {
+      "epoch": 2.581339069642377,
+      "grad_norm": 0.38017722964286804,
+      "learning_rate": 0.00028589232127096203,
+      "loss": 3.3214,
+      "step": 48000
+    },
+    {
+      "epoch": 2.635116966926593,
+      "grad_norm": 0.3919270634651184,
+      "learning_rate": 0.00028500970873786405,
+      "loss": 3.3228,
+      "step": 49000
+    },
+    {
+      "epoch": 2.6888948642108095,
+      "grad_norm": 0.4108082354068756,
+      "learning_rate": 0.0002841279788172992,
+      "loss": 3.3138,
+      "step": 50000
+    },
+    {
+      "epoch": 2.7426727614950255,
+      "grad_norm": 0.3886261582374573,
+      "learning_rate": 0.0002832453662842012,
+      "loss": 3.3056,
+      "step": 51000
+    },
+    {
+      "epoch": 2.796450658779242,
+      "grad_norm": 0.3850669264793396,
+      "learning_rate": 0.00028236275375110323,
+      "loss": 3.3032,
+      "step": 52000
+    },
+    {
+      "epoch": 2.850228556063458,
+      "grad_norm": 0.3835964798927307,
+      "learning_rate": 0.0002814810238305384,
+      "loss": 3.2969,
+      "step": 53000
+    },
+    {
+      "epoch": 2.904006453347674,
+      "grad_norm": 0.38047948479652405,
+      "learning_rate": 0.0002805992939099735,
+      "loss": 3.2936,
+      "step": 54000
+    },
+    {
+      "epoch": 2.9577843506318904,
+      "grad_norm": 0.3762407600879669,
+      "learning_rate": 0.00027971668137687555,
+      "loss": 3.2927,
+      "step": 55000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.3895145789948566,
+      "eval_loss": 3.5000998973846436,
+      "eval_runtime": 153.5704,
+      "eval_samples_per_second": 377.162,
+      "eval_steps_per_second": 5.9,
+      "step": 55785
+    },
+    {
+      "epoch": 3.0115622479161064,
+      "grad_norm": 0.4009055495262146,
+      "learning_rate": 0.00027883406884377757,
+      "loss": 3.2765,
+      "step": 56000
+    },
+    {
+      "epoch": 3.065340145200323,
+      "grad_norm": 0.37349995970726013,
+      "learning_rate": 0.0002779514563106796,
+      "loss": 3.2286,
+      "step": 57000
+    },
+    {
+      "epoch": 3.119118042484539,
+      "grad_norm": 0.3908737599849701,
+      "learning_rate": 0.00027706972639011474,
+      "loss": 3.2289,
+      "step": 58000
+    },
+    {
+      "epoch": 3.172895939768755,
+      "grad_norm": 0.407253235578537,
+      "learning_rate": 0.00027618711385701675,
+      "loss": 3.2265,
+      "step": 59000
+    },
+    {
+      "epoch": 3.2266738370529713,
+      "grad_norm": 0.381302148103714,
+      "learning_rate": 0.00027530538393645185,
+      "loss": 3.231,
+      "step": 60000
+    },
+    {
+      "epoch": 3.2804517343371873,
+      "grad_norm": 0.3845687508583069,
+      "learning_rate": 0.0002744227714033539,
+      "loss": 3.228,
+      "step": 61000
+    },
+    {
+      "epoch": 3.3342296316214037,
+      "grad_norm": 0.37134280800819397,
+      "learning_rate": 0.00027354015887025594,
+      "loss": 3.2294,
+      "step": 62000
+    },
+    {
+      "epoch": 3.3880075289056197,
+      "grad_norm": 0.3631034195423126,
+      "learning_rate": 0.0002726584289496911,
+      "loss": 3.2269,
+      "step": 63000
+    },
+    {
+      "epoch": 3.4417854261898357,
+      "grad_norm": 0.39031264185905457,
+      "learning_rate": 0.0002717758164165931,
+      "loss": 3.2245,
+      "step": 64000
+    },
+    {
+      "epoch": 3.495563323474052,
+      "grad_norm": 0.3664679527282715,
+      "learning_rate": 0.0002708932038834951,
+      "loss": 3.2219,
+      "step": 65000
+    },
+    {
+      "epoch": 3.549341220758268,
+      "grad_norm": 0.37691643834114075,
+      "learning_rate": 0.00027001059135039714,
+      "loss": 3.2152,
+      "step": 66000
+    },
+    {
+      "epoch": 3.6031191180424846,
+      "grad_norm": 0.3464328944683075,
+      "learning_rate": 0.00026912886142983224,
+      "loss": 3.2158,
+      "step": 67000
+    },
+    {
+      "epoch": 3.6568970153267006,
+      "grad_norm": 0.360546350479126,
+      "learning_rate": 0.0002682462488967343,
+      "loss": 3.2173,
+      "step": 68000
+    },
+    {
+      "epoch": 3.7106749126109166,
+      "grad_norm": 0.39308884739875793,
+      "learning_rate": 0.00026736451897616946,
+      "loss": 3.2144,
+      "step": 69000
+    },
+    {
+      "epoch": 3.764452809895133,
+      "grad_norm": 0.35938188433647156,
+      "learning_rate": 0.0002664819064430715,
+      "loss": 3.2169,
+      "step": 70000
+    },
+    {
+      "epoch": 3.8182307071793495,
+      "grad_norm": 0.3498573899269104,
+      "learning_rate": 0.0002656001765225066,
+      "loss": 3.2095,
+      "step": 71000
+    },
+    {
+      "epoch": 3.8720086044635655,
+      "grad_norm": 0.3463761806488037,
+      "learning_rate": 0.0002647175639894086,
+      "loss": 3.2078,
+      "step": 72000
+    },
+    {
+      "epoch": 3.9257865017477815,
+      "grad_norm": 0.3538376986980438,
+      "learning_rate": 0.00026383583406884375,
+      "loss": 3.2037,
+      "step": 73000
+    },
+    {
+      "epoch": 3.979564399031998,
+      "grad_norm": 0.350891649723053,
+      "learning_rate": 0.0002629541041482789,
+      "loss": 3.1999,
+      "step": 74000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.39521783897656926,
+      "eval_loss": 3.474266767501831,
+      "eval_runtime": 154.7996,
+      "eval_samples_per_second": 374.168,
+      "eval_steps_per_second": 5.853,
+      "step": 74380
+    },
+    {
+      "epoch": 4.033342296316214,
+      "grad_norm": 0.3689132034778595,
+      "learning_rate": 0.0002620714916151809,
+      "loss": 3.1644,
+      "step": 75000
+    },
+    {
+      "epoch": 4.08712019360043,
+      "grad_norm": 0.3693839907646179,
+      "learning_rate": 0.00026118887908208293,
+      "loss": 3.1393,
+      "step": 76000
+    },
+    {
+      "epoch": 4.140898090884646,
+      "grad_norm": 0.3670288622379303,
+      "learning_rate": 0.000260306266548985,
+      "loss": 3.1455,
+      "step": 77000
+    },
+    {
+      "epoch": 4.1946759881688624,
+      "grad_norm": 0.3569948673248291,
+      "learning_rate": 0.0002594245366284201,
+      "loss": 3.1473,
+      "step": 78000
+    },
+    {
+      "epoch": 4.2484538854530784,
+      "grad_norm": 0.37387335300445557,
+      "learning_rate": 0.0002585419240953221,
+      "loss": 3.144,
+      "step": 79000
+    },
+    {
+      "epoch": 4.302231782737295,
+      "grad_norm": 0.35249531269073486,
+      "learning_rate": 0.00025765931156222413,
+      "loss": 3.1482,
+      "step": 80000
+    },
+    {
+      "epoch": 4.356009680021511,
+      "grad_norm": 0.3619791269302368,
+      "learning_rate": 0.0002567775816416593,
+      "loss": 3.1468,
+      "step": 81000
+    },
+    {
+      "epoch": 4.409787577305727,
+      "grad_norm": 0.3486943542957306,
+      "learning_rate": 0.00025589585172109444,
+      "loss": 3.1543,
+      "step": 82000
+    },
+    {
+      "epoch": 4.463565474589943,
+      "grad_norm": 0.3588618040084839,
+      "learning_rate": 0.00025501323918799646,
+      "loss": 3.1509,
+      "step": 83000
+    },
+    {
+      "epoch": 4.517343371874159,
+      "grad_norm": 0.36453118920326233,
+      "learning_rate": 0.00025413062665489847,
+      "loss": 3.1487,
+      "step": 84000
+    },
+    {
+      "epoch": 4.571121269158376,
+      "grad_norm": 0.3435422480106354,
+      "learning_rate": 0.0002532480141218005,
+      "loss": 3.1485,
+      "step": 85000
+    },
+    {
+      "epoch": 4.624899166442592,
+      "grad_norm": 0.3634852468967438,
+      "learning_rate": 0.00025236540158870256,
+      "loss": 3.1543,
+      "step": 86000
+    },
+    {
+      "epoch": 4.678677063726808,
+      "grad_norm": 0.3428556025028229,
+      "learning_rate": 0.00025148367166813766,
+      "loss": 3.1482,
+      "step": 87000
+    },
+    {
+      "epoch": 4.732454961011024,
+      "grad_norm": 0.36531776189804077,
+      "learning_rate": 0.0002506010591350397,
+      "loss": 3.1467,
+      "step": 88000
+    },
+    {
+      "epoch": 4.786232858295241,
+      "grad_norm": 0.34510383009910583,
+      "learning_rate": 0.00024971844660194174,
+      "loss": 3.1497,
+      "step": 89000
+    },
+    {
+      "epoch": 4.840010755579457,
+      "grad_norm": 0.33938872814178467,
+      "learning_rate": 0.00024883583406884376,
+      "loss": 3.1455,
+      "step": 90000
+    },
+    {
+      "epoch": 4.893788652863673,
+      "grad_norm": 0.373588502407074,
+      "learning_rate": 0.000247954986760812,
+      "loss": 3.1474,
+      "step": 91000
+    },
+    {
+      "epoch": 4.947566550147889,
+      "grad_norm": 0.34171566367149353,
+      "learning_rate": 0.000247072374227714,
+      "loss": 3.1448,
+      "step": 92000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.39957591505456547,
+      "eval_loss": 3.4088761806488037,
+      "eval_runtime": 155.1687,
+      "eval_samples_per_second": 373.278,
+      "eval_steps_per_second": 5.839,
+      "step": 92975
+    },
+    {
+      "epoch": 5.001344447432105,
+      "grad_norm": 0.3516329526901245,
+      "learning_rate": 0.00024619064430714916,
+      "loss": 3.1468,
+      "step": 93000
+    },
+    {
+      "epoch": 5.055122344716321,
+      "grad_norm": 0.3526703715324402,
+      "learning_rate": 0.0002453080317740512,
+      "loss": 3.0784,
+      "step": 94000
+    },
+    {
+      "epoch": 5.108900242000538,
+      "grad_norm": 0.3785659670829773,
+      "learning_rate": 0.0002444254192409532,
+      "loss": 3.0804,
+      "step": 95000
+    },
+    {
+      "epoch": 5.162678139284754,
+      "grad_norm": 0.3553713262081146,
+      "learning_rate": 0.00024354280670785524,
+      "loss": 3.0885,
+      "step": 96000
+    },
+    {
+      "epoch": 5.21645603656897,
+      "grad_norm": 0.3591731786727905,
+      "learning_rate": 0.00024266107678729037,
+      "loss": 3.0916,
+      "step": 97000
+    },
+    {
+      "epoch": 5.270233933853186,
+      "grad_norm": 0.3515339195728302,
+      "learning_rate": 0.00024177846425419238,
+      "loss": 3.0929,
+      "step": 98000
+    },
+    {
+      "epoch": 5.324011831137403,
+      "grad_norm": 0.3388463854789734,
+      "learning_rate": 0.0002408958517210944,
+      "loss": 3.0946,
+      "step": 99000
+    },
+    {
+      "epoch": 5.377789728421619,
+      "grad_norm": 0.35906437039375305,
+      "learning_rate": 0.00024001500441306263,
+      "loss": 3.0936,
+      "step": 100000
+    },
+    {
+      "epoch": 5.431567625705835,
+      "grad_norm": 0.35187384486198425,
+      "learning_rate": 0.00023913239187996468,
+      "loss": 3.0965,
+      "step": 101000
+    },
+    {
+      "epoch": 5.485345522990051,
+      "grad_norm": 0.3588041067123413,
+      "learning_rate": 0.00023824977934686672,
+      "loss": 3.0954,
+      "step": 102000
+    },
+    {
+      "epoch": 5.539123420274267,
+      "grad_norm": 0.37199875712394714,
+      "learning_rate": 0.00023736716681376874,
+      "loss": 3.0958,
+      "step": 103000
+    },
+    {
+      "epoch": 5.592901317558484,
+      "grad_norm": 0.36916086077690125,
+      "learning_rate": 0.00023648543689320386,
+      "loss": 3.0974,
+      "step": 104000
+    },
+    {
+      "epoch": 5.6466792148427,
+      "grad_norm": 0.3353819251060486,
+      "learning_rate": 0.0002356028243601059,
+      "loss": 3.1009,
+      "step": 105000
+    },
+    {
+      "epoch": 5.700457112126916,
+      "grad_norm": 0.35051777958869934,
+      "learning_rate": 0.00023472021182700792,
+      "loss": 3.1009,
+      "step": 106000
+    },
+    {
+      "epoch": 5.754235009411132,
+      "grad_norm": 0.36077558994293213,
+      "learning_rate": 0.00023383848190644305,
+      "loss": 3.0988,
+      "step": 107000
+    },
+    {
+      "epoch": 5.808012906695348,
+      "grad_norm": 0.3492220640182495,
+      "learning_rate": 0.0002329558693733451,
+      "loss": 3.1008,
+      "step": 108000
+    },
+    {
+      "epoch": 5.861790803979565,
+      "grad_norm": 0.3349301517009735,
+      "learning_rate": 0.00023207325684024714,
+      "loss": 3.1004,
+      "step": 109000
+    },
+    {
+      "epoch": 5.915568701263781,
+      "grad_norm": 0.3572138249874115,
+      "learning_rate": 0.00023119152691968223,
+      "loss": 3.0991,
+      "step": 110000
+    },
+    {
+      "epoch": 5.969346598547997,
+      "grad_norm": 0.3292669653892517,
+      "learning_rate": 0.00023030891438658428,
+      "loss": 3.0986,
+      "step": 111000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.4021510044171932,
+      "eval_loss": 3.4165823459625244,
+      "eval_runtime": 153.9568,
+      "eval_samples_per_second": 376.216,
+      "eval_steps_per_second": 5.885,
+      "step": 111570
+    },
+    {
+      "epoch": 6.023124495832213,
+      "grad_norm": 0.3583333492279053,
+      "learning_rate": 0.0002294263018534863,
+      "loss": 3.0671,
+      "step": 112000
+    },
+    {
+      "epoch": 6.076902393116429,
+      "grad_norm": 0.3764239251613617,
+      "learning_rate": 0.00022854368932038834,
+      "loss": 3.035,
+      "step": 113000
+    },
+    {
+      "epoch": 6.130680290400646,
+      "grad_norm": 0.3759283721446991,
+      "learning_rate": 0.00022766195939982344,
+      "loss": 3.034,
+      "step": 114000
+    },
+    {
+      "epoch": 6.184458187684862,
+      "grad_norm": 0.3840562701225281,
+      "learning_rate": 0.0002267802294792586,
+      "loss": 3.0451,
+      "step": 115000
+    },
+    {
+      "epoch": 6.238236084969078,
+      "grad_norm": 0.35660070180892944,
+      "learning_rate": 0.00022589761694616063,
+      "loss": 3.0489,
+      "step": 116000
+    },
+    {
+      "epoch": 6.292013982253294,
+      "grad_norm": 0.3423716723918915,
+      "learning_rate": 0.00022501500441306265,
+      "loss": 3.0487,
+      "step": 117000
+    },
+    {
+      "epoch": 6.34579187953751,
+      "grad_norm": 0.37959685921669006,
+      "learning_rate": 0.00022413327449249777,
+      "loss": 3.0516,
+      "step": 118000
+    },
+    {
+      "epoch": 6.399569776821727,
+      "grad_norm": 0.3404131531715393,
+      "learning_rate": 0.0002232506619593998,
+      "loss": 3.0495,
+      "step": 119000
+    },
+    {
+      "epoch": 6.453347674105943,
+      "grad_norm": 0.34124448895454407,
+      "learning_rate": 0.00022236804942630183,
+      "loss": 3.0556,
+      "step": 120000
+    },
+    {
+      "epoch": 6.507125571390159,
+      "grad_norm": 0.366603821516037,
+      "learning_rate": 0.00022148543689320385,
+      "loss": 3.0585,
+      "step": 121000
+    },
+    {
+      "epoch": 6.560903468674375,
+      "grad_norm": 0.3521302342414856,
+      "learning_rate": 0.00022060370697263897,
+      "loss": 3.0594,
+      "step": 122000
+    },
+    {
+      "epoch": 6.614681365958591,
+      "grad_norm": 0.3434734642505646,
+      "learning_rate": 0.00021972109443954102,
+      "loss": 3.0561,
+      "step": 123000
+    },
+    {
+      "epoch": 6.6684592632428075,
+      "grad_norm": 0.3365206718444824,
+      "learning_rate": 0.00021883936451897614,
+      "loss": 3.0577,
+      "step": 124000
+    },
+    {
+      "epoch": 6.7222371605270235,
+      "grad_norm": 0.34190618991851807,
+      "learning_rate": 0.0002179576345984113,
+      "loss": 3.0585,
+      "step": 125000
+    },
+    {
+      "epoch": 6.7760150578112395,
+      "grad_norm": 0.3417965769767761,
+      "learning_rate": 0.0002170759046778464,
+      "loss": 3.0599,
+      "step": 126000
+    },
+    {
+      "epoch": 6.8297929550954555,
+      "grad_norm": 0.3413834571838379,
+      "learning_rate": 0.00021619329214474844,
+      "loss": 3.0575,
+      "step": 127000
+    },
+    {
+      "epoch": 6.8835708523796715,
+      "grad_norm": 0.3608095347881317,
+      "learning_rate": 0.00021531067961165048,
+      "loss": 3.0588,
+      "step": 128000
+    },
+    {
+      "epoch": 6.937348749663888,
+      "grad_norm": 0.360249400138855,
+      "learning_rate": 0.00021442806707855253,
+      "loss": 3.0598,
+      "step": 129000
+    },
+    {
+      "epoch": 6.991126646948104,
+      "grad_norm": 0.3597511351108551,
+      "learning_rate": 0.00021354633715798762,
+      "loss": 3.0592,
+      "step": 130000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.40360871244389834,
+      "eval_loss": 3.403170108795166,
+      "eval_runtime": 153.6701,
+      "eval_samples_per_second": 376.918,
+      "eval_steps_per_second": 5.896,
+      "step": 130165
+    },
+    {
+      "epoch": 7.04490454423232,
+      "grad_norm": 0.364961177110672,
+      "learning_rate": 0.00021266372462488964,
+      "loss": 3.0023,
+      "step": 131000
+    },
+    {
+      "epoch": 7.098682441516536,
+      "grad_norm": 0.3606811463832855,
+      "learning_rate": 0.00021178111209179168,
+      "loss": 2.9945,
+      "step": 132000
+    },
+    {
+      "epoch": 7.152460338800753,
+      "grad_norm": 0.3595397472381592,
+      "learning_rate": 0.0002108984995586937,
+      "loss": 3.0056,
+      "step": 133000
+    },
+    {
+      "epoch": 7.206238236084969,
+      "grad_norm": 0.35375916957855225,
+      "learning_rate": 0.00021001676963812883,
+      "loss": 3.0093,
+      "step": 134000
+    },
+    {
+      "epoch": 7.260016133369185,
+      "grad_norm": 0.3839136064052582,
+      "learning_rate": 0.00020913415710503087,
+      "loss": 3.0092,
+      "step": 135000
+    },
+    {
+      "epoch": 7.313794030653401,
+      "grad_norm": 0.36542221903800964,
+      "learning_rate": 0.0002082515445719329,
+      "loss": 3.0137,
+      "step": 136000
+    },
+    {
+      "epoch": 7.367571927937617,
+      "grad_norm": 0.3783646523952484,
+      "learning_rate": 0.00020736893203883493,
+      "loss": 3.0178,
+      "step": 137000
+    },
+    {
+      "epoch": 7.421349825221834,
+      "grad_norm": 0.3594056963920593,
+      "learning_rate": 0.00020648720211827005,
+      "loss": 3.0204,
+      "step": 138000
+    },
+    {
+      "epoch": 7.47512772250605,
+      "grad_norm": 0.3637217581272125,
+      "learning_rate": 0.0002056045895851721,
+      "loss": 3.0189,
+      "step": 139000
+    },
+    {
+      "epoch": 7.528905619790266,
+      "grad_norm": 0.36873236298561096,
+      "learning_rate": 0.00020472197705207414,
+      "loss": 3.0193,
+      "step": 140000
+    },
+    {
+      "epoch": 7.582683517074482,
+      "grad_norm": 0.3408534824848175,
+      "learning_rate": 0.00020384024713150924,
+      "loss": 3.0235,
+      "step": 141000
+    },
+    {
+      "epoch": 7.636461414358699,
+      "grad_norm": 0.34803324937820435,
+      "learning_rate": 0.00020295763459841128,
+      "loss": 3.0237,
+      "step": 142000
+    },
+    {
+      "epoch": 7.690239311642915,
+      "grad_norm": 0.3650054931640625,
+      "learning_rate": 0.0002020759046778464,
+      "loss": 3.0214,
+      "step": 143000
+    },
+    {
+      "epoch": 7.744017208927131,
+      "grad_norm": 0.3592528998851776,
+      "learning_rate": 0.00020119329214474845,
+      "loss": 3.0261,
+      "step": 144000
+    },
+    {
+      "epoch": 7.797795106211347,
+      "grad_norm": 0.37202954292297363,
+      "learning_rate": 0.00020031067961165047,
+      "loss": 3.0253,
+      "step": 145000
+    },
+    {
+      "epoch": 7.851573003495563,
+      "grad_norm": 0.35640275478363037,
+      "learning_rate": 0.0001994280670785525,
+      "loss": 3.027,
+      "step": 146000
+    },
+    {
+      "epoch": 7.90535090077978,
+      "grad_norm": 0.34525027871131897,
+      "learning_rate": 0.00019854633715798764,
+      "loss": 3.0276,
+      "step": 147000
+    },
+    {
+      "epoch": 7.959128798063996,
+      "grad_norm": 0.33090052008628845,
+      "learning_rate": 0.00019766372462488965,
+      "loss": 3.0279,
+      "step": 148000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.4061836674493643,
+      "eval_loss": 3.3745980262756348,
+      "eval_runtime": 155.0808,
+      "eval_samples_per_second": 373.489,
+      "eval_steps_per_second": 5.842,
+      "step": 148760
+    },
+    {
+      "epoch": 8.012906695348212,
+      "grad_norm": 0.3575202524662018,
+      "learning_rate": 0.00019678199470432478,
+      "loss": 3.0055,
+      "step": 149000
+    },
+    {
+      "epoch": 8.066684592632429,
+      "grad_norm": 0.3376094400882721,
+      "learning_rate": 0.00019589938217122682,
+      "loss": 2.9616,
+      "step": 150000
+    },
+    {
+      "epoch": 8.120462489916644,
+      "grad_norm": 0.36086979508399963,
+      "learning_rate": 0.00019501676963812887,
+      "loss": 2.9642,
+      "step": 151000
+    },
+    {
+      "epoch": 8.17424038720086,
+      "grad_norm": 0.36575761437416077,
+      "learning_rate": 0.00019413503971756397,
+      "loss": 2.9755,
+      "step": 152000
+    },
+    {
+      "epoch": 8.228018284485076,
+      "grad_norm": 0.35903316736221313,
+      "learning_rate": 0.000193252427184466,
+      "loss": 2.9758,
+      "step": 153000
+    },
+    {
+      "epoch": 8.281796181769293,
+      "grad_norm": 0.3707205653190613,
+      "learning_rate": 0.00019237157987643422,
+      "loss": 2.9812,
+      "step": 154000
+    },
+    {
+      "epoch": 8.33557407905351,
+      "grad_norm": 0.3687371611595154,
+      "learning_rate": 0.00019148896734333626,
+      "loss": 2.9833,
+      "step": 155000
+    },
+    {
+      "epoch": 8.389351976337725,
+      "grad_norm": 0.3716333508491516,
+      "learning_rate": 0.0001906063548102383,
+      "loss": 2.988,
+      "step": 156000
+    },
+    {
+      "epoch": 8.443129873621942,
+      "grad_norm": 0.3598574101924896,
+      "learning_rate": 0.00018972374227714032,
+      "loss": 2.9878,
+      "step": 157000
+    },
+    {
+      "epoch": 8.496907770906157,
+      "grad_norm": 0.34485259652137756,
+      "learning_rate": 0.00018884112974404236,
+      "loss": 2.9891,
+      "step": 158000
+    },
+    {
+      "epoch": 8.550685668190374,
+      "grad_norm": 0.3574945628643036,
+      "learning_rate": 0.00018795851721094438,
+      "loss": 2.9923,
+      "step": 159000
+    },
+    {
+      "epoch": 8.60446356547459,
+      "grad_norm": 0.3579365611076355,
+      "learning_rate": 0.0001870767872903795,
+      "loss": 2.9908,
+      "step": 160000
+    },
+    {
+      "epoch": 8.658241462758806,
+      "grad_norm": 0.3805078864097595,
+      "learning_rate": 0.00018619505736981463,
+      "loss": 2.9896,
+      "step": 161000
+    },
+    {
+      "epoch": 8.712019360043023,
+      "grad_norm": 0.3955886960029602,
+      "learning_rate": 0.00018531244483671667,
+      "loss": 2.9956,
+      "step": 162000
+    },
+    {
+      "epoch": 8.765797257327238,
+      "grad_norm": 0.35813698172569275,
+      "learning_rate": 0.00018442983230361872,
+      "loss": 2.9987,
+      "step": 163000
+    },
+    {
+      "epoch": 8.819575154611455,
+      "grad_norm": 0.3720499277114868,
+      "learning_rate": 0.0001835472197705207,
+      "loss": 2.9956,
+      "step": 164000
+    },
+    {
+      "epoch": 8.873353051895672,
+      "grad_norm": 0.35912513732910156,
+      "learning_rate": 0.00018266548984995586,
+      "loss": 2.9985,
+      "step": 165000
+    },
+    {
+      "epoch": 8.927130949179887,
+      "grad_norm": 0.36929354071617126,
+      "learning_rate": 0.00018178287731685788,
+      "loss": 2.9972,
+      "step": 166000
+    },
+    {
+      "epoch": 8.980908846464104,
+      "grad_norm": 0.34516987204551697,
+      "learning_rate": 0.0001809020300088261,
+      "loss": 2.9977,
+      "step": 167000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.4067501172434183,
+      "eval_loss": 3.3709380626678467,
+      "eval_runtime": 154.3682,
+      "eval_samples_per_second": 375.213,
+      "eval_steps_per_second": 5.869,
+      "step": 167355
+    },
+    {
+      "epoch": 9.034686743748319,
+      "grad_norm": 0.38715243339538574,
+      "learning_rate": 0.00018001941747572815,
+      "loss": 2.9529,
+      "step": 168000
+    },
+    {
+      "epoch": 9.088464641032536,
+      "grad_norm": 0.39728203415870667,
+      "learning_rate": 0.00017913680494263017,
+      "loss": 2.9363,
+      "step": 169000
+    },
+    {
+      "epoch": 9.142242538316752,
+      "grad_norm": 0.3717023730278015,
+      "learning_rate": 0.00017825419240953221,
+      "loss": 2.9421,
+      "step": 170000
+    },
+    {
+      "epoch": 9.196020435600968,
+      "grad_norm": 0.3885519504547119,
+      "learning_rate": 0.00017737157987643426,
+      "loss": 2.9453,
+      "step": 171000
+    },
+    {
+      "epoch": 9.249798332885184,
+      "grad_norm": 0.3886348307132721,
+      "learning_rate": 0.00017648896734333625,
+      "loss": 2.9489,
+      "step": 172000
+    },
+    {
+      "epoch": 9.3035762301694,
+      "grad_norm": 0.3586263656616211,
+      "learning_rate": 0.00017560723742277137,
+      "loss": 2.9544,
+      "step": 173000
+    },
+    {
+      "epoch": 9.357354127453616,
+      "grad_norm": 0.37699151039123535,
+      "learning_rate": 0.00017472462488967342,
+      "loss": 2.9584,
+      "step": 174000
+    },
+    {
+      "epoch": 9.411132024737833,
+      "grad_norm": 0.36616960167884827,
+      "learning_rate": 0.00017384289496910857,
+      "loss": 2.9624,
+      "step": 175000
+    },
+    {
+      "epoch": 9.464909922022049,
+      "grad_norm": 0.36951467394828796,
+      "learning_rate": 0.00017296028243601056,
+      "loss": 2.9585,
+      "step": 176000
+    },
+    {
+      "epoch": 9.518687819306265,
+      "grad_norm": 0.36193570494651794,
+      "learning_rate": 0.0001720776699029126,
+      "loss": 2.9649,
+      "step": 177000
+    },
+    {
+      "epoch": 9.57246571659048,
+      "grad_norm": 0.3632647395133972,
+      "learning_rate": 0.00017119593998234775,
+      "loss": 2.9573,
+      "step": 178000
+    },
+    {
+      "epoch": 9.626243613874697,
+      "grad_norm": 0.3749377727508545,
+      "learning_rate": 0.00017031332744924974,
+      "loss": 2.9642,
+      "step": 179000
+    },
+    {
+      "epoch": 9.680021511158914,
+      "grad_norm": 0.3834667205810547,
+      "learning_rate": 0.00016943159752868487,
+      "loss": 2.9663,
+      "step": 180000
+    },
+    {
+      "epoch": 9.73379940844313,
+      "grad_norm": 0.3825732171535492,
+      "learning_rate": 0.0001685489849955869,
+      "loss": 2.9665,
+      "step": 181000
+    },
+    {
+      "epoch": 9.787577305727346,
+      "grad_norm": 0.3804017901420593,
+      "learning_rate": 0.00016766725507502206,
+      "loss": 2.9656,
+      "step": 182000
+    },
+    {
+      "epoch": 9.841355203011563,
+      "grad_norm": 0.34586480259895325,
+      "learning_rate": 0.0001667846425419241,
+      "loss": 2.9733,
+      "step": 183000
+    },
+    {
+      "epoch": 9.895133100295778,
+      "grad_norm": 0.3566882312297821,
+      "learning_rate": 0.0001659020300088261,
+      "loss": 2.9731,
+      "step": 184000
+    },
+    {
+      "epoch": 9.948910997579995,
+      "grad_norm": 0.36795541644096375,
+      "learning_rate": 0.00016502030008826125,
+      "loss": 2.9761,
+      "step": 185000
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.4070907126485243,
+      "eval_loss": 3.379540205001831,
+      "eval_runtime": 153.7409,
+      "eval_samples_per_second": 376.744,
+      "eval_steps_per_second": 5.893,
+      "step": 185950
+    },
+    {
+      "epoch": 10.00268889486421,
+      "grad_norm": 0.37020811438560486,
+      "learning_rate": 0.00016413768755516327,
+      "loss": 2.9715,
+      "step": 186000
+    },
+    {
+      "epoch": 10.056466792148427,
+      "grad_norm": 0.36936092376708984,
+      "learning_rate": 0.00016325507502206528,
+      "loss": 2.905,
+      "step": 187000
+    },
+    {
+      "epoch": 10.110244689432642,
+      "grad_norm": 0.3851975202560425,
+      "learning_rate": 0.0001623733451015004,
+      "loss": 2.9174,
+      "step": 188000
+    },
+    {
+      "epoch": 10.16402258671686,
+      "grad_norm": 0.37237951159477234,
+      "learning_rate": 0.00016149073256840245,
+      "loss": 2.9218,
+      "step": 189000
+    },
+    {
+      "epoch": 10.217800484001076,
+      "grad_norm": 0.39296218752861023,
+      "learning_rate": 0.0001606081200353045,
+      "loss": 2.9218,
+      "step": 190000
+    },
+    {
+      "epoch": 10.271578381285291,
+      "grad_norm": 0.427022248506546,
+      "learning_rate": 0.0001597255075022065,
+      "loss": 2.9235,
+      "step": 191000
+    },
+    {
+      "epoch": 10.325356278569508,
+      "grad_norm": 0.40037524700164795,
+      "learning_rate": 0.00015884377758164164,
+      "loss": 2.9287,
+      "step": 192000
+    },
+    {
+      "epoch": 10.379134175853725,
+      "grad_norm": 0.37520840764045715,
+      "learning_rate": 0.00015796116504854368,
+      "loss": 2.9303,
+      "step": 193000
+    },
+    {
+      "epoch": 10.43291207313794,
+      "grad_norm": 0.38916105031967163,
+      "learning_rate": 0.00015707855251544572,
+      "loss": 2.9352,
+      "step": 194000
+    },
+    {
+      "epoch": 10.486689970422157,
+      "grad_norm": 0.36788520216941833,
+      "learning_rate": 0.00015619682259488082,
+      "loss": 2.9363,
+      "step": 195000
+    },
+    {
+      "epoch": 10.540467867706372,
+      "grad_norm": 0.3871680796146393,
+      "learning_rate": 0.00015531421006178287,
+      "loss": 2.9393,
+      "step": 196000
+    },
+    {
+      "epoch": 10.594245764990589,
+      "grad_norm": 0.3886531889438629,
+      "learning_rate": 0.0001544333627537511,
+      "loss": 2.938,
+      "step": 197000
+    },
+    {
+      "epoch": 10.648023662274806,
+      "grad_norm": 0.3601588010787964,
+      "learning_rate": 0.00015355075022065312,
+      "loss": 2.9447,
+      "step": 198000
+    },
+    {
+      "epoch": 10.701801559559021,
+      "grad_norm": 0.36893409490585327,
+      "learning_rate": 0.00015266813768755513,
+      "loss": 2.9398,
+      "step": 199000
+    },
+    {
+      "epoch": 10.755579456843238,
+      "grad_norm": 0.3649713397026062,
+      "learning_rate": 0.00015178552515445718,
+      "loss": 2.9445,
+      "step": 200000
+    },
+    {
+      "epoch": 10.809357354127453,
+      "grad_norm": 0.3681996762752533,
+      "learning_rate": 0.0001509037952338923,
+      "loss": 2.9483,
+      "step": 201000
+    },
+    {
+      "epoch": 10.86313525141167,
+      "grad_norm": 0.38864666223526,
+      "learning_rate": 0.00015002206531332745,
+      "loss": 2.9449,
+      "step": 202000
+    },
+    {
+      "epoch": 10.916913148695887,
+      "grad_norm": 0.3854575753211975,
+      "learning_rate": 0.00014913945278022947,
+      "loss": 2.95,
+      "step": 203000
+    },
+    {
+      "epoch": 10.970691045980102,
+      "grad_norm": 0.3969317376613617,
+      "learning_rate": 0.0001482568402471315,
+      "loss": 2.9464,
+      "step": 204000
+    },
+    {
+      "epoch": 11.0,
+      "eval_accuracy": 0.4079866062032567,
+      "eval_loss": 3.378293991088867,
+      "eval_runtime": 154.03,
+      "eval_samples_per_second": 376.037,
+      "eval_steps_per_second": 5.882,
+      "step": 204545
+    },
+    {
+      "epoch": 11.024468943264319,
+      "grad_norm": 0.4027111232280731,
+      "learning_rate": 0.0001473751103265666,
+      "loss": 2.9169,
+      "step": 205000
+    },
+    {
+      "epoch": 11.078246840548534,
+      "grad_norm": 0.38684701919555664,
+      "learning_rate": 0.00014649249779346866,
+      "loss": 2.8892,
+      "step": 206000
+    },
+    {
+      "epoch": 11.13202473783275,
+      "grad_norm": 0.4004645049571991,
+      "learning_rate": 0.00014561076787290378,
+      "loss": 2.8914,
+      "step": 207000
+    },
+    {
+      "epoch": 11.185802635116968,
+      "grad_norm": 0.39821696281433105,
+      "learning_rate": 0.0001447281553398058,
+      "loss": 2.8959,
+      "step": 208000
+    },
+    {
+      "epoch": 11.239580532401183,
+      "grad_norm": 0.37264591455459595,
+      "learning_rate": 0.00014384554280670784,
+      "loss": 2.9001,
+      "step": 209000
+    },
+    {
+      "epoch": 11.2933584296854,
+      "grad_norm": 0.40092912316322327,
+      "learning_rate": 0.00014296293027360986,
+      "loss": 2.903,
+      "step": 210000
+    },
+    {
+      "epoch": 11.347136326969615,
+      "grad_norm": 0.4014595150947571,
+      "learning_rate": 0.000142081200353045,
+      "loss": 2.9087,
+      "step": 211000
+    },
+    {
+      "epoch": 11.400914224253832,
+      "grad_norm": 0.4027450978755951,
+      "learning_rate": 0.00014119858781994703,
+      "loss": 2.9107,
+      "step": 212000
+    },
+    {
+      "epoch": 11.454692121538049,
+      "grad_norm": 0.3879305124282837,
+      "learning_rate": 0.00014031597528684904,
+      "loss": 2.9139,
+      "step": 213000
+    },
+    {
+      "epoch": 11.508470018822264,
+      "grad_norm": 0.42585039138793945,
+      "learning_rate": 0.0001394333627537511,
+      "loss": 2.9174,
+      "step": 214000
+    },
+    {
+      "epoch": 11.56224791610648,
+      "grad_norm": 0.41681861877441406,
+      "learning_rate": 0.00013855163283318624,
+      "loss": 2.918,
+      "step": 215000
+    },
+    {
+      "epoch": 11.616025813390696,
+      "grad_norm": 0.3900226950645447,
+      "learning_rate": 0.00013766990291262134,
+      "loss": 2.919,
+      "step": 216000
+    },
+    {
+      "epoch": 11.669803710674913,
+      "grad_norm": 0.3816623091697693,
+      "learning_rate": 0.00013678729037952338,
+      "loss": 2.9215,
+      "step": 217000
+    },
+    {
+      "epoch": 11.72358160795913,
+      "grad_norm": 0.37244102358818054,
+      "learning_rate": 0.0001359046778464254,
+      "loss": 2.9189,
+      "step": 218000
+    },
+    {
+      "epoch": 11.777359505243345,
+      "grad_norm": 0.39248353242874146,
+      "learning_rate": 0.00013502206531332744,
+      "loss": 2.9165,
+      "step": 219000
+    },
+    {
+      "epoch": 11.831137402527562,
+      "grad_norm": 0.3882627785205841,
+      "learning_rate": 0.00013414033539276257,
+      "loss": 2.9226,
+      "step": 220000
+    },
+    {
+      "epoch": 11.884915299811777,
+      "grad_norm": 0.39201658964157104,
+      "learning_rate": 0.00013325772285966458,
+      "loss": 2.9234,
+      "step": 221000
+    },
+    {
+      "epoch": 11.938693197095994,
+      "grad_norm": 0.39154133200645447,
+      "learning_rate": 0.00013237599293909974,
+      "loss": 2.9267,
+      "step": 222000
+    },
+    {
+      "epoch": 11.99247109438021,
+      "grad_norm": 0.39426833391189575,
+      "learning_rate": 0.00013149338040600175,
+      "loss": 2.9234,
+      "step": 223000
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.40836213447042485,
+      "eval_loss": 3.383244037628174,
+      "eval_runtime": 154.3587,
+      "eval_samples_per_second": 375.236,
+      "eval_steps_per_second": 5.869,
+      "step": 223140
+    },
+    {
+      "epoch": 12.046248991664426,
+      "grad_norm": 0.3944448232650757,
+      "learning_rate": 0.00013061165048543688,
+      "loss": 2.8663,
+      "step": 224000
+    },
+    {
+      "epoch": 12.100026888948642,
+      "grad_norm": 0.409795880317688,
+      "learning_rate": 0.0001297290379523389,
+      "loss": 2.867,
+      "step": 225000
+    },
+    {
+      "epoch": 12.153804786232858,
+      "grad_norm": 0.42699024081230164,
+      "learning_rate": 0.00012884642541924094,
+      "loss": 2.8737,
+      "step": 226000
+    },
+    {
+      "epoch": 12.207582683517074,
+      "grad_norm": 0.40281254053115845,
+      "learning_rate": 0.00012796557811120917,
+      "loss": 2.8802,
+      "step": 227000
+    },
+    {
+      "epoch": 12.261360580801291,
+      "grad_norm": 0.41500693559646606,
+      "learning_rate": 0.0001270829655781112,
+      "loss": 2.8801,
+      "step": 228000
+    },
+    {
+      "epoch": 12.315138478085506,
+      "grad_norm": 0.3833758533000946,
+      "learning_rate": 0.00012620035304501323,
+      "loss": 2.8872,
+      "step": 229000
+    },
+    {
+      "epoch": 12.368916375369723,
+      "grad_norm": 0.40702077746391296,
+      "learning_rate": 0.00012531774051191525,
+      "loss": 2.8852,
+      "step": 230000
+    },
+    {
+      "epoch": 12.422694272653938,
+      "grad_norm": 0.38716500997543335,
+      "learning_rate": 0.0001244351279788173,
+      "loss": 2.8888,
+      "step": 231000
+    },
+    {
+      "epoch": 12.476472169938155,
+      "grad_norm": 0.36926084756851196,
+      "learning_rate": 0.00012355251544571934,
+      "loss": 2.8933,
+      "step": 232000
+    },
+    {
+      "epoch": 12.530250067222372,
+      "grad_norm": 0.40624773502349854,
+      "learning_rate": 0.00012267078552515443,
+      "loss": 2.8932,
+      "step": 233000
+    },
+    {
+      "epoch": 12.584027964506587,
+      "grad_norm": 0.3890169858932495,
+      "learning_rate": 0.00012178817299205648,
+      "loss": 2.8969,
+      "step": 234000
+    },
+    {
+      "epoch": 12.637805861790804,
+      "grad_norm": 0.403980553150177,
+      "learning_rate": 0.0001209064430714916,
+      "loss": 2.895,
+      "step": 235000
+    },
+    {
+      "epoch": 12.69158375907502,
+      "grad_norm": 0.3942379057407379,
+      "learning_rate": 0.00012002383053839363,
+      "loss": 2.8956,
+      "step": 236000
+    },
+    {
+      "epoch": 12.745361656359236,
+      "grad_norm": 0.3773936629295349,
+      "learning_rate": 0.00011914121800529566,
+      "loss": 2.8996,
+      "step": 237000
+    },
+    {
+      "epoch": 12.799139553643453,
+      "grad_norm": 0.40187960863113403,
+      "learning_rate": 0.0001182594880847308,
+      "loss": 2.9019,
+      "step": 238000
+    },
+    {
+      "epoch": 12.852917450927668,
+      "grad_norm": 0.3979538381099701,
+      "learning_rate": 0.00011737687555163282,
+      "loss": 2.9017,
+      "step": 239000
+    },
+    {
+      "epoch": 12.906695348211885,
+      "grad_norm": 0.37608620524406433,
+      "learning_rate": 0.00011649426301853485,
+      "loss": 2.9048,
+      "step": 240000
+    },
+    {
+      "epoch": 12.9604732454961,
+      "grad_norm": 0.39789021015167236,
+      "learning_rate": 0.00011561253309796997,
+      "loss": 2.9068,
+      "step": 241000
+    },
+    {
+      "epoch": 13.0,
+      "eval_accuracy": 0.4087216070567606,
+      "eval_loss": 3.383761167526245,
+      "eval_runtime": 154.4144,
+      "eval_samples_per_second": 375.101,
+      "eval_steps_per_second": 5.867,
+      "step": 241735
+    },
+    {
+      "epoch": 13.014251142780317,
+      "grad_norm": 0.39842721819877625,
+      "learning_rate": 0.00011472992056487202,
+      "loss": 2.889,
+      "step": 242000
+    },
+    {
+      "epoch": 13.068029040064534,
+      "grad_norm": 0.40080752968788147,
+      "learning_rate": 0.00011384730803177405,
+      "loss": 2.851,
+      "step": 243000
+    },
+    {
+      "epoch": 13.12180693734875,
+      "grad_norm": 0.40195319056510925,
+      "learning_rate": 0.00011296557811120917,
+      "loss": 2.8539,
+      "step": 244000
+    },
+    {
+      "epoch": 13.175584834632966,
+      "grad_norm": 0.385681688785553,
+      "learning_rate": 0.0001120829655781112,
+      "loss": 2.8565,
+      "step": 245000
+    },
+    {
+      "epoch": 13.229362731917181,
+      "grad_norm": 0.383973091840744,
+      "learning_rate": 0.00011120035304501322,
+      "loss": 2.8581,
+      "step": 246000
+    },
+    {
+      "epoch": 13.283140629201398,
+      "grad_norm": 0.3838016092777252,
+      "learning_rate": 0.00011031774051191526,
+      "loss": 2.8625,
+      "step": 247000
+    },
+    {
+      "epoch": 13.336918526485615,
+      "grad_norm": 0.38851508498191833,
+      "learning_rate": 0.00010943601059135037,
+      "loss": 2.8618,
+      "step": 248000
+    },
+    {
+      "epoch": 13.39069642376983,
+      "grad_norm": 0.40345388650894165,
+      "learning_rate": 0.00010855428067078551,
+      "loss": 2.8634,
+      "step": 249000
+    },
+    {
+      "epoch": 13.444474321054047,
+      "grad_norm": 0.41579893231391907,
+      "learning_rate": 0.00010767166813768754,
+      "loss": 2.8678,
+      "step": 250000
+    },
+    {
+      "epoch": 13.498252218338264,
+      "grad_norm": 0.4044778347015381,
+      "learning_rate": 0.00010678905560458957,
+      "loss": 2.87,
+      "step": 251000
+    },
+    {
+      "epoch": 13.552030115622479,
+      "grad_norm": 0.39501869678497314,
+      "learning_rate": 0.0001059064430714916,
+      "loss": 2.8777,
+      "step": 252000
+    },
+    {
+      "epoch": 13.605808012906696,
+      "grad_norm": 0.406240850687027,
+      "learning_rate": 0.00010502471315092674,
+      "loss": 2.8775,
+      "step": 253000
+    },
+    {
+      "epoch": 13.659585910190911,
+      "grad_norm": 0.39923685789108276,
+      "learning_rate": 0.00010414210061782876,
+      "loss": 2.8752,
+      "step": 254000
+    },
+    {
+      "epoch": 13.713363807475128,
+      "grad_norm": 0.4037391245365143,
+      "learning_rate": 0.0001032603706972639,
+      "loss": 2.8808,
+      "step": 255000
+    },
+    {
+      "epoch": 13.767141704759345,
+      "grad_norm": 0.3982155919075012,
+      "learning_rate": 0.00010237775816416591,
+      "loss": 2.8814,
+      "step": 256000
+    },
+    {
+      "epoch": 13.82091960204356,
+      "grad_norm": 0.40128931403160095,
+      "learning_rate": 0.00010149691085613415,
+      "loss": 2.8843,
+      "step": 257000
+    },
+    {
+      "epoch": 13.874697499327777,
+      "grad_norm": 0.4084520936012268,
+      "learning_rate": 0.00010061429832303619,
+      "loss": 2.8802,
+      "step": 258000
+    },
+    {
+      "epoch": 13.928475396611992,
+      "grad_norm": 0.38339564204216003,
+      "learning_rate": 9.973168578993821e-05,
+      "loss": 2.8805,
+      "step": 259000
+    },
+    {
+      "epoch": 13.982253293896209,
+      "grad_norm": 0.39040228724479675,
+      "learning_rate": 9.884907325684024e-05,
+      "loss": 2.88,
+      "step": 260000
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.4090670393196906,
+      "eval_loss": 3.3881213665008545,
+      "eval_runtime": 153.9195,
+      "eval_samples_per_second": 376.307,
+      "eval_steps_per_second": 5.886,
+      "step": 260330
+    },
+    {
+      "epoch": 14.036031191180426,
+      "grad_norm": 0.4209740459918976,
+      "learning_rate": 9.796734333627536e-05,
+      "loss": 2.8478,
+      "step": 261000
+    },
+    {
+      "epoch": 14.08980908846464,
+      "grad_norm": 0.40589722990989685,
+      "learning_rate": 9.70847308031774e-05,
+      "loss": 2.8301,
+      "step": 262000
+    },
+    {
+      "epoch": 14.143586985748858,
+      "grad_norm": 0.4551624357700348,
+      "learning_rate": 9.620300088261252e-05,
+      "loss": 2.8316,
+      "step": 263000
+    },
+    {
+      "epoch": 14.197364883033073,
+      "grad_norm": 0.4008908271789551,
+      "learning_rate": 9.532038834951455e-05,
+      "loss": 2.8363,
+      "step": 264000
+    },
+    {
+      "epoch": 14.25114278031729,
+      "grad_norm": 0.4118269681930542,
+      "learning_rate": 9.443865842894969e-05,
+      "loss": 2.8439,
+      "step": 265000
+    },
+    {
+      "epoch": 14.304920677601507,
+      "grad_norm": 0.4041500985622406,
+      "learning_rate": 9.35560458958517e-05,
+      "loss": 2.8449,
+      "step": 266000
+    },
+    {
+      "epoch": 14.358698574885722,
+      "grad_norm": 0.41177451610565186,
+      "learning_rate": 9.267431597528684e-05,
+      "loss": 2.8455,
+      "step": 267000
+    },
+    {
+      "epoch": 14.412476472169939,
+      "grad_norm": 0.399616539478302,
+      "learning_rate": 9.179170344218886e-05,
+      "loss": 2.8519,
+      "step": 268000
+    },
+    {
+      "epoch": 14.466254369454154,
+      "grad_norm": 0.41544005274772644,
+      "learning_rate": 9.09090909090909e-05,
+      "loss": 2.8493,
+      "step": 269000
+    },
+    {
+      "epoch": 14.52003226673837,
+      "grad_norm": 0.41849789023399353,
+      "learning_rate": 9.002647837599293e-05,
+      "loss": 2.8533,
+      "step": 270000
+    },
+    {
+      "epoch": 14.573810164022587,
+      "grad_norm": 0.43560275435447693,
+      "learning_rate": 8.914474845542806e-05,
+      "loss": 2.8566,
+      "step": 271000
+    },
+    {
+      "epoch": 14.627588061306803,
+      "grad_norm": 0.40364864468574524,
+      "learning_rate": 8.82630185348632e-05,
+      "loss": 2.8566,
+      "step": 272000
+    },
+    {
+      "epoch": 14.68136595859102,
+      "grad_norm": 0.4094676375389099,
+      "learning_rate": 8.738040600176521e-05,
+      "loss": 2.8584,
+      "step": 273000
+    },
+    {
+      "epoch": 14.735143855875235,
+      "grad_norm": 0.40439966320991516,
+      "learning_rate": 8.649867608120035e-05,
+      "loss": 2.8592,
+      "step": 274000
+    },
+    {
+      "epoch": 14.788921753159451,
+      "grad_norm": 0.4156148433685303,
+      "learning_rate": 8.561606354810237e-05,
+      "loss": 2.861,
+      "step": 275000
+    },
+    {
+      "epoch": 14.842699650443668,
+      "grad_norm": 0.4287779927253723,
+      "learning_rate": 8.47334510150044e-05,
+      "loss": 2.8616,
+      "step": 276000
+    },
+    {
+      "epoch": 14.896477547727883,
+      "grad_norm": 0.4273608922958374,
+      "learning_rate": 8.385083848190644e-05,
+      "loss": 2.865,
+      "step": 277000
+    },
+    {
+      "epoch": 14.9502554450121,
+      "grad_norm": 0.4030059278011322,
+      "learning_rate": 8.296910856134155e-05,
+      "loss": 2.8614,
+      "step": 278000
+    },
+    {
+      "epoch": 15.0,
+      "eval_accuracy": 0.40971853719714973,
+      "eval_loss": 3.3862648010253906,
+      "eval_runtime": 154.422,
+      "eval_samples_per_second": 375.083,
+      "eval_steps_per_second": 5.867,
+      "step": 278925
+    },
+    {
+      "epoch": 15.004033342296315,
+      "grad_norm": 0.42723333835601807,
+      "learning_rate": 8.20864960282436e-05,
+      "loss": 2.8594,
+      "step": 279000
+    },
+    {
+      "epoch": 15.057811239580532,
+      "grad_norm": 0.420899361371994,
+      "learning_rate": 8.120653133274491e-05,
+      "loss": 2.8111,
+      "step": 280000
+    },
+    {
+      "epoch": 15.11158913686475,
+      "grad_norm": 0.4493071734905243,
+      "learning_rate": 8.032391879964694e-05,
+      "loss": 2.8189,
+      "step": 281000
+    },
+    {
+      "epoch": 15.165367034148964,
+      "grad_norm": 0.4198229908943176,
+      "learning_rate": 7.944130626654899e-05,
+      "loss": 2.8199,
+      "step": 282000
+    },
+    {
+      "epoch": 15.219144931433181,
+      "grad_norm": 0.44265827536582947,
+      "learning_rate": 7.8558693733451e-05,
+      "loss": 2.8237,
+      "step": 283000
+    },
+    {
+      "epoch": 15.272922828717396,
+      "grad_norm": 0.44851958751678467,
+      "learning_rate": 7.767608120035305e-05,
+      "loss": 2.8257,
+      "step": 284000
+    },
+    {
+      "epoch": 15.326700726001613,
+      "grad_norm": 0.4241320788860321,
+      "learning_rate": 7.679435127978816e-05,
+      "loss": 2.8296,
+      "step": 285000
+    },
+    {
+      "epoch": 15.38047862328583,
+      "grad_norm": 0.4106122851371765,
+      "learning_rate": 7.59117387466902e-05,
+      "loss": 2.8334,
+      "step": 286000
+    },
+    {
+      "epoch": 15.434256520570045,
+      "grad_norm": 0.4082964360713959,
+      "learning_rate": 7.502912621359223e-05,
+      "loss": 2.8322,
+      "step": 287000
+    },
+    {
+      "epoch": 15.488034417854262,
+      "grad_norm": 0.4180322587490082,
+      "learning_rate": 7.414739629302736e-05,
+      "loss": 2.8338,
+      "step": 288000
+    },
+    {
+      "epoch": 15.541812315138477,
+      "grad_norm": 0.424429327249527,
+      "learning_rate": 7.326478375992939e-05,
+      "loss": 2.8344,
+      "step": 289000
+    },
+    {
+      "epoch": 15.595590212422694,
+      "grad_norm": 0.4517696797847748,
+      "learning_rate": 7.238305383936451e-05,
+      "loss": 2.8371,
+      "step": 290000
+    },
+    {
+      "epoch": 15.649368109706911,
+      "grad_norm": 0.433946430683136,
+      "learning_rate": 7.150044130626654e-05,
+      "loss": 2.8344,
+      "step": 291000
+    },
+    {
+      "epoch": 15.703146006991126,
+      "grad_norm": 0.43706047534942627,
+      "learning_rate": 7.061782877316857e-05,
+      "loss": 2.8371,
+      "step": 292000
+    },
+    {
+      "epoch": 15.756923904275343,
+      "grad_norm": 0.4216670095920563,
+      "learning_rate": 6.97352162400706e-05,
+      "loss": 2.8416,
+      "step": 293000
+    },
+    {
+      "epoch": 15.81070180155956,
+      "grad_norm": 0.42567121982574463,
+      "learning_rate": 6.885348631950573e-05,
+      "loss": 2.8412,
+      "step": 294000
+    },
+    {
+      "epoch": 15.864479698843775,
+      "grad_norm": 0.45544958114624023,
+      "learning_rate": 6.797087378640776e-05,
+      "loss": 2.8425,
+      "step": 295000
+    },
+    {
+      "epoch": 15.918257596127992,
+      "grad_norm": 0.42355233430862427,
+      "learning_rate": 6.708914386584288e-05,
+      "loss": 2.8427,
+      "step": 296000
+    },
+    {
+      "epoch": 15.972035493412207,
+      "grad_norm": 0.40585145354270935,
+      "learning_rate": 6.620741394527801e-05,
+      "loss": 2.841,
+      "step": 297000
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.40937653104184507,
+      "eval_loss": 3.4092414379119873,
+      "eval_runtime": 154.0074,
+      "eval_samples_per_second": 376.092,
+      "eval_steps_per_second": 5.883,
+      "step": 297520
+    },
+    {
+      "epoch": 16.025813390696424,
+      "grad_norm": 0.43941229581832886,
+      "learning_rate": 6.532480141218005e-05,
+      "loss": 2.8215,
+      "step": 298000
+    },
+    {
+      "epoch": 16.07959128798064,
+      "grad_norm": 0.4342760145664215,
+      "learning_rate": 6.444218887908208e-05,
+      "loss": 2.8046,
+      "step": 299000
+    },
+    {
+      "epoch": 16.133369185264858,
+      "grad_norm": 0.4505048394203186,
+      "learning_rate": 6.356045895851721e-05,
+      "loss": 2.8076,
+      "step": 300000
+    },
+    {
+      "epoch": 16.18714708254907,
+      "grad_norm": 0.45032915472984314,
+      "learning_rate": 6.267784642541924e-05,
+      "loss": 2.8041,
+      "step": 301000
+    },
+    {
+      "epoch": 16.240924979833288,
+      "grad_norm": 0.44145554304122925,
+      "learning_rate": 6.179523389232127e-05,
+      "loss": 2.8097,
+      "step": 302000
+    },
+    {
+      "epoch": 16.294702877117505,
+      "grad_norm": 0.4323660433292389,
+      "learning_rate": 6.091262135922329e-05,
+      "loss": 2.8111,
+      "step": 303000
+    },
+    {
+      "epoch": 16.34848077440172,
+      "grad_norm": 0.43976891040802,
+      "learning_rate": 6.0030891438658424e-05,
+      "loss": 2.8105,
+      "step": 304000
+    },
+    {
+      "epoch": 16.40225867168594,
+      "grad_norm": 0.4359465539455414,
+      "learning_rate": 5.914827890556045e-05,
+      "loss": 2.8109,
+      "step": 305000
+    },
+    {
+      "epoch": 16.456036568970152,
+      "grad_norm": 0.43427878618240356,
+      "learning_rate": 5.8265666372462485e-05,
+      "loss": 2.8154,
+      "step": 306000
+    },
+    {
+      "epoch": 16.50981446625437,
+      "grad_norm": 0.45613136887550354,
+      "learning_rate": 5.7383053839364515e-05,
+      "loss": 2.8171,
+      "step": 307000
+    },
+    {
+      "epoch": 16.563592363538586,
+      "grad_norm": 0.42773687839508057,
+      "learning_rate": 5.650220653133274e-05,
+      "loss": 2.8169,
+      "step": 308000
+    },
+    {
+      "epoch": 16.617370260822803,
+      "grad_norm": 0.4209708869457245,
+      "learning_rate": 5.561959399823477e-05,
+      "loss": 2.8147,
+      "step": 309000
+    },
+    {
+      "epoch": 16.67114815810702,
+      "grad_norm": 0.4385839104652405,
+      "learning_rate": 5.4736981465136795e-05,
+      "loss": 2.822,
+      "step": 310000
+    },
+    {
+      "epoch": 16.724926055391233,
+      "grad_norm": 0.459045946598053,
+      "learning_rate": 5.3855251544571934e-05,
+      "loss": 2.8214,
+      "step": 311000
+    },
+    {
+      "epoch": 16.77870395267545,
+      "grad_norm": 0.43635278940200806,
+      "learning_rate": 5.297263901147396e-05,
+      "loss": 2.8202,
+      "step": 312000
+    },
+    {
+      "epoch": 16.832481849959667,
+      "grad_norm": 0.43077248334884644,
+      "learning_rate": 5.209002647837599e-05,
+      "loss": 2.823,
+      "step": 313000
+    },
+    {
+      "epoch": 16.886259747243884,
+      "grad_norm": 0.43129387497901917,
+      "learning_rate": 5.120741394527802e-05,
+      "loss": 2.8237,
+      "step": 314000
+    },
+    {
+      "epoch": 16.9400376445281,
+      "grad_norm": 0.42089757323265076,
+      "learning_rate": 5.032568402471314e-05,
+      "loss": 2.8228,
+      "step": 315000
+    },
+    {
+      "epoch": 16.993815541812314,
+      "grad_norm": 0.44255971908569336,
+      "learning_rate": 4.944395410414828e-05,
+      "loss": 2.8225,
+      "step": 316000
+    },
+    {
+      "epoch": 17.0,
+      "eval_accuracy": 0.4098076831739891,
+      "eval_loss": 3.3965845108032227,
+      "eval_runtime": 154.459,
+      "eval_samples_per_second": 374.993,
+      "eval_steps_per_second": 5.866,
+      "step": 316115
+    },
+    {
+      "epoch": 17.04759343909653,
+      "grad_norm": 0.4566737115383148,
+      "learning_rate": 4.8561341571050305e-05,
+      "loss": 2.7889,
+      "step": 317000
+    },
+    {
+      "epoch": 17.101371336380748,
+      "grad_norm": 0.4266511797904968,
+      "learning_rate": 4.767961165048544e-05,
+      "loss": 2.7872,
+      "step": 318000
+    },
+    {
+      "epoch": 17.155149233664964,
+      "grad_norm": 0.43798133730888367,
+      "learning_rate": 4.679699911738746e-05,
+      "loss": 2.7905,
+      "step": 319000
+    },
+    {
+      "epoch": 17.20892713094918,
+      "grad_norm": 0.4472503662109375,
+      "learning_rate": 4.591438658428949e-05,
+      "loss": 2.7931,
+      "step": 320000
+    },
+    {
+      "epoch": 17.262705028233395,
+      "grad_norm": 0.45179733633995056,
+      "learning_rate": 4.503177405119153e-05,
+      "loss": 2.7924,
+      "step": 321000
+    },
+    {
+      "epoch": 17.31648292551761,
+      "grad_norm": 0.43849724531173706,
+      "learning_rate": 4.414916151809356e-05,
+      "loss": 2.7959,
+      "step": 322000
+    },
+    {
+      "epoch": 17.37026082280183,
+      "grad_norm": 0.444363534450531,
+      "learning_rate": 4.326743159752868e-05,
+      "loss": 2.7984,
+      "step": 323000
+    },
+    {
+      "epoch": 17.424038720086045,
+      "grad_norm": 0.4618755280971527,
+      "learning_rate": 4.238481906443071e-05,
+      "loss": 2.7971,
+      "step": 324000
+    },
+    {
+      "epoch": 17.477816617370262,
+      "grad_norm": 0.44242867827415466,
+      "learning_rate": 4.150308914386584e-05,
+      "loss": 2.7994,
+      "step": 325000
+    },
+    {
+      "epoch": 17.531594514654476,
+      "grad_norm": 0.4494096636772156,
+      "learning_rate": 4.062047661076787e-05,
+      "loss": 2.7972,
+      "step": 326000
+    },
+    {
+      "epoch": 17.585372411938692,
+      "grad_norm": 0.4398290514945984,
+      "learning_rate": 3.97378640776699e-05,
+      "loss": 2.8002,
+      "step": 327000
+    },
+    {
+      "epoch": 17.63915030922291,
+      "grad_norm": 0.43445536494255066,
+      "learning_rate": 3.885525154457193e-05,
+      "loss": 2.8015,
+      "step": 328000
+    },
+    {
+      "epoch": 17.692928206507126,
+      "grad_norm": 0.43242964148521423,
+      "learning_rate": 3.797352162400706e-05,
+      "loss": 2.8037,
+      "step": 329000
+    },
+    {
+      "epoch": 17.746706103791343,
+      "grad_norm": 0.43721792101860046,
+      "learning_rate": 3.7091791703442186e-05,
+      "loss": 2.8024,
+      "step": 330000
+    },
+    {
+      "epoch": 17.800484001075557,
+      "grad_norm": 0.46508708596229553,
+      "learning_rate": 3.6209179170344216e-05,
+      "loss": 2.8028,
+      "step": 331000
+    },
+    {
+      "epoch": 17.854261898359773,
+      "grad_norm": 0.4641873836517334,
+      "learning_rate": 3.5326566637246246e-05,
+      "loss": 2.8042,
+      "step": 332000
+    },
+    {
+      "epoch": 17.90803979564399,
+      "grad_norm": 0.4373840391635895,
+      "learning_rate": 3.4443954104148276e-05,
+      "loss": 2.8067,
+      "step": 333000
+    },
+    {
+      "epoch": 17.961817692928207,
+      "grad_norm": 0.4608209729194641,
+      "learning_rate": 3.35622241835834e-05,
+      "loss": 2.8062,
+      "step": 334000
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.4095544199240385,
+      "eval_loss": 3.4095396995544434,
+      "eval_runtime": 154.3283,
+      "eval_samples_per_second": 375.31,
+      "eval_steps_per_second": 5.871,
+      "step": 334710
+    },
+    {
+      "epoch": 18.015595590212424,
+      "grad_norm": 0.4512391686439514,
+      "learning_rate": 3.268049426301853e-05,
+      "loss": 2.7991,
+      "step": 335000
+    },
+    {
+      "epoch": 18.069373487496637,
+      "grad_norm": 0.4583674967288971,
+      "learning_rate": 3.179788172992056e-05,
+      "loss": 2.7775,
+      "step": 336000
+    },
+    {
+      "epoch": 18.123151384780854,
+      "grad_norm": 0.43765154480934143,
+      "learning_rate": 3.0915269196822593e-05,
+      "loss": 2.7777,
+      "step": 337000
+    },
+    {
+      "epoch": 18.17692928206507,
+      "grad_norm": 0.4475190341472626,
+      "learning_rate": 3.003265666372462e-05,
+      "loss": 2.7755,
+      "step": 338000
+    },
+    {
+      "epoch": 18.230707179349288,
+      "grad_norm": 0.47484225034713745,
+      "learning_rate": 2.9150926743159752e-05,
+      "loss": 2.7748,
+      "step": 339000
+    },
+    {
+      "epoch": 18.284485076633505,
+      "grad_norm": 0.44132038950920105,
+      "learning_rate": 2.8269196822594877e-05,
+      "loss": 2.7786,
+      "step": 340000
+    },
+    {
+      "epoch": 18.33826297391772,
+      "grad_norm": 0.44409921765327454,
+      "learning_rate": 2.7386584289496907e-05,
+      "loss": 2.7843,
+      "step": 341000
+    },
+    {
+      "epoch": 18.392040871201935,
+      "grad_norm": 0.4480835795402527,
+      "learning_rate": 2.6503971756398938e-05,
+      "loss": 2.7806,
+      "step": 342000
+    },
+    {
+      "epoch": 18.445818768486152,
+      "grad_norm": 0.46462225914001465,
+      "learning_rate": 2.5621359223300968e-05,
+      "loss": 2.7848,
+      "step": 343000
+    },
+    {
+      "epoch": 18.49959666577037,
+      "grad_norm": 0.4442801773548126,
+      "learning_rate": 2.4740511915269195e-05,
+      "loss": 2.7872,
+      "step": 344000
+    },
+    {
+      "epoch": 18.553374563054586,
+      "grad_norm": 0.45635318756103516,
+      "learning_rate": 2.3857899382171225e-05,
+      "loss": 2.7855,
+      "step": 345000
+    },
+    {
+      "epoch": 18.6071524603388,
+      "grad_norm": 0.4429308772087097,
+      "learning_rate": 2.2975286849073255e-05,
+      "loss": 2.7819,
+      "step": 346000
+    },
+    {
+      "epoch": 18.660930357623016,
+      "grad_norm": 0.4405366778373718,
+      "learning_rate": 2.209355692850838e-05,
+      "loss": 2.7859,
+      "step": 347000
+    },
+    {
+      "epoch": 18.714708254907233,
+      "grad_norm": 0.4760023355484009,
+      "learning_rate": 2.1210944395410414e-05,
+      "loss": 2.7845,
+      "step": 348000
+    },
+    {
+      "epoch": 18.76848615219145,
+      "grad_norm": 0.43945708870887756,
+      "learning_rate": 2.032833186231244e-05,
+      "loss": 2.7851,
+      "step": 349000
+    },
+    {
+      "epoch": 18.822264049475667,
+      "grad_norm": 0.4329651892185211,
+      "learning_rate": 1.9446601941747572e-05,
+      "loss": 2.7858,
+      "step": 350000
+    },
+    {
+      "epoch": 18.87604194675988,
+      "grad_norm": 0.4544674754142761,
+      "learning_rate": 1.8563989408649602e-05,
+      "loss": 2.7854,
+      "step": 351000
+    },
+    {
+      "epoch": 18.929819844044097,
+      "grad_norm": 0.45160219073295593,
+      "learning_rate": 1.7681376875551633e-05,
+      "loss": 2.7868,
+      "step": 352000
+    },
+    {
+      "epoch": 18.983597741328314,
+      "grad_norm": 0.4521346092224121,
+      "learning_rate": 1.6798764342453663e-05,
+      "loss": 2.7904,
+      "step": 353000
+    },
+    {
+      "epoch": 19.0,
+      "eval_accuracy": 0.4098326736060797,
+      "eval_loss": 3.416917562484741,
+      "eval_runtime": 154.3288,
+      "eval_samples_per_second": 375.309,
+      "eval_steps_per_second": 5.871,
+      "step": 353305
+    },
+    {
+      "epoch": 19.03737563861253,
+      "grad_norm": 0.4459937810897827,
+      "learning_rate": 1.5916151809355693e-05,
+      "loss": 2.7735,
+      "step": 354000
+    },
+    {
+      "epoch": 19.091153535896748,
+      "grad_norm": 0.4429547190666199,
+      "learning_rate": 1.5035304501323918e-05,
+      "loss": 2.7666,
+      "step": 355000
+    },
+    {
+      "epoch": 19.14493143318096,
+      "grad_norm": 0.45378807187080383,
+      "learning_rate": 1.4152691968225948e-05,
+      "loss": 2.7675,
+      "step": 356000
+    },
+    {
+      "epoch": 19.198709330465178,
+      "grad_norm": 0.4565734267234802,
+      "learning_rate": 1.3270079435127978e-05,
+      "loss": 2.7678,
+      "step": 357000
+    },
+    {
+      "epoch": 19.252487227749395,
+      "grad_norm": 0.46085768938064575,
+      "learning_rate": 1.2387466902030009e-05,
+      "loss": 2.7671,
+      "step": 358000
+    },
+    {
+      "epoch": 19.30626512503361,
+      "grad_norm": 0.45043742656707764,
+      "learning_rate": 1.1505736981465135e-05,
+      "loss": 2.7697,
+      "step": 359000
+    },
+    {
+      "epoch": 19.36004302231783,
+      "grad_norm": 0.44789180159568787,
+      "learning_rate": 1.0623124448367166e-05,
+      "loss": 2.7678,
+      "step": 360000
+    },
+    {
+      "epoch": 19.413820919602042,
+      "grad_norm": 0.46358925104141235,
+      "learning_rate": 9.741394527802292e-06,
+      "loss": 2.7672,
+      "step": 361000
+    },
+    {
+      "epoch": 19.46759881688626,
+      "grad_norm": 0.4664359390735626,
+      "learning_rate": 8.858781994704324e-06,
+      "loss": 2.7646,
+      "step": 362000
+    },
+    {
+      "epoch": 19.521376714170476,
+      "grad_norm": 0.46864446997642517,
+      "learning_rate": 7.976169461606355e-06,
+      "loss": 2.7694,
+      "step": 363000
+    },
+    {
+      "epoch": 19.575154611454693,
+      "grad_norm": 0.4606035351753235,
+      "learning_rate": 7.093556928508385e-06,
+      "loss": 2.7702,
+      "step": 364000
+    },
+    {
+      "epoch": 19.62893250873891,
+      "grad_norm": 0.4529917538166046,
+      "learning_rate": 6.21270962047661e-06,
+      "loss": 2.7703,
+      "step": 365000
+    },
+    {
+      "epoch": 19.682710406023123,
+      "grad_norm": 0.44931086897850037,
+      "learning_rate": 5.33009708737864e-06,
+      "loss": 2.7721,
+      "step": 366000
+    },
+    {
+      "epoch": 19.73648830330734,
+      "grad_norm": 0.4593379497528076,
+      "learning_rate": 4.44748455428067e-06,
+      "loss": 2.766,
+      "step": 367000
+    },
+    {
+      "epoch": 19.790266200591557,
+      "grad_norm": 0.4625178277492523,
+      "learning_rate": 3.565754633715798e-06,
+      "loss": 2.7716,
+      "step": 368000
+    },
+    {
+      "epoch": 19.844044097875774,
+      "grad_norm": 0.441020667552948,
+      "learning_rate": 2.6831421006178284e-06,
+      "loss": 2.771,
+      "step": 369000
+    },
+    {
+      "epoch": 19.89782199515999,
+      "grad_norm": 0.48773327469825745,
+      "learning_rate": 1.8005295675198586e-06,
+      "loss": 2.7657,
+      "step": 370000
+    },
+    {
+      "epoch": 19.951599892444204,
+      "grad_norm": 0.45526123046875,
+      "learning_rate": 9.179170344218888e-07,
+      "loss": 2.775,
+      "step": 371000
+    },
+    {
+      "epoch": 20.0,
+      "eval_accuracy": 0.4096272415057219,
+      "eval_loss": 3.4256882667541504,
+      "eval_runtime": 154.1682,
+      "eval_samples_per_second": 375.7,
+      "eval_steps_per_second": 5.877,
+      "step": 371900
+    },
+    {
+      "epoch": 20.0,
+      "step": 371900,
+      "total_flos": 1.5669257538816e+18,
+      "train_loss": 3.0592986363049124,
+      "train_runtime": 81068.1718,
+      "train_samples_per_second": 146.799,
+      "train_steps_per_second": 4.587
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 371900,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 5000,
+  "total_flos": 1.5669257538816e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}