Model save

Browse files

Files changed (5) hide show

README.md +17 -13
all_results.json +7 -12
runs/May21_04-03-21_deep-diver-main-fluffy-octopus-1-0-0/events.out.tfevents.1716278746.deep-diver-main-fluffy-octopus-1-0-0.384.0 +2 -2
train_results.json +7 -7
trainer_state.json +1290 -508

README.md CHANGED Viewed

@@ -2,13 +2,12 @@
 license: gemma
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
-- llama-duo/synth_summarize_dataset
 model-index:
 - name: gemma7b-summarize-gemini1.5flash-30k
   results: []
@@ -17,12 +16,12 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/go5tmp0o)
 # gemma7b-summarize-gemini1.5flash-30k
-This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.2363
 ## Model description
@@ -53,17 +52,22 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 5
 ### Training results
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 0.8665        | 0.9956 | 114  | 2.2684          |
-| 0.7577        | 2.0    | 229  | 2.1722          |
-| 0.6255        | 2.9956 | 343  | 2.1741          |
-| 0.5966        | 4.0    | 458  | 2.2142          |
-| 0.5399        | 4.9782 | 570  | 2.2363          |
 ### Framework versions

 license: gemma
 library_name: peft
 tags:
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
+- generator
 model-index:
 - name: gemma7b-summarize-gemini1.5flash-30k
   results: []
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/gx8z9rab)
 # gemma7b-summarize-gemini1.5flash-30k
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 3.3804
 ## Model description
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 10
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.0256        | 1.0   | 110  | 2.3887          |
+| 0.8325        | 2.0   | 220  | 2.2270          |
+| 0.749         | 3.0   | 330  | 2.2333          |
+| 0.6755        | 4.0   | 440  | 2.2993          |
+| 0.6197        | 5.0   | 550  | 2.3820          |
+| 0.5208        | 6.0   | 660  | 2.5869          |
+| 0.4474        | 7.0   | 770  | 2.8389          |
+| 0.4044        | 8.0   | 880  | 3.1029          |
+| 0.3573        | 9.0   | 990  | 3.3573          |
+| 0.354         | 10.0  | 1100 | 3.3804          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
-    "epoch": 4.978165938864628,
-    "eval_loss": 2.236276865005493,
-    "eval_runtime": 1.0247,
-    "eval_samples": 25,
-    "eval_samples_per_second": 4.88,
-    "eval_steps_per_second": 1.952,
-    "total_flos": 8.714577082329334e+17,
-    "train_loss": 2.449154797771521,
-    "train_runtime": 4532.0541,
-    "train_samples": 30386,
-    "train_samples_per_second": 2.02,
-    "train_steps_per_second": 0.126
 }

 {
+    "epoch": 10.0,
+    "total_flos": 1.6817604900715233e+18,
+    "train_loss": 1.8264882094209844,
+    "train_runtime": 8654.8771,
+    "train_samples": 32070,
+    "train_samples_per_second": 2.032,
+    "train_steps_per_second": 0.127
 }

runs/May21_04-03-21_deep-diver-main-fluffy-octopus-1-0-0/events.out.tfevents.1716278746.deep-diver-main-fluffy-octopus-1-0-0.384.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:305ee7e2fd24953e0fd15b2883138700b6911ae9e5412932516e52792cf096f6
-size 54591

 version https://git-lfs.github.com/spec/v1
+oid sha256:d399ed0c7b92a82aef9ca04d710770e51d053c79de1545fbab9634dff190f34b
+size 55216

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 4.978165938864628,
-    "total_flos": 8.714577082329334e+17,
-    "train_loss": 2.449154797771521,
-    "train_runtime": 4532.0541,
-    "train_samples": 30386,
-    "train_samples_per_second": 2.02,
-    "train_steps_per_second": 0.126
 }

 {
+    "epoch": 10.0,
+    "total_flos": 1.6817604900715233e+18,
+    "train_loss": 1.8264882094209844,
+    "train_runtime": 8654.8771,
+    "train_samples": 32070,
+    "train_samples_per_second": 2.032,
+    "train_steps_per_second": 0.127
 }

trainer_state.json CHANGED Viewed

@@ -1,872 +1,1654 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.978165938864628,
   "eval_steps": 500,
-  "global_step": 570,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.008733624454148471,
-      "grad_norm": 648.0,
-      "learning_rate": 3.5087719298245615e-06,
-      "loss": 56.9969,
       "step": 1
     },
     {
-      "epoch": 0.043668122270742356,
-      "grad_norm": 584.0,
-      "learning_rate": 1.7543859649122806e-05,
-      "loss": 59.6974,
       "step": 5
     },
     {
-      "epoch": 0.08733624454148471,
-      "grad_norm": 356.0,
-      "learning_rate": 3.508771929824561e-05,
-      "loss": 42.0524,
       "step": 10
     },
     {
-      "epoch": 0.13100436681222707,
-      "grad_norm": 39.75,
-      "learning_rate": 5.2631578947368424e-05,
-      "loss": 23.223,
       "step": 15
     },
     {
-      "epoch": 0.17467248908296942,
-      "grad_norm": 26.625,
-      "learning_rate": 7.017543859649122e-05,
-      "loss": 20.5089,
       "step": 20
     },
     {
-      "epoch": 0.2183406113537118,
-      "grad_norm": 7.15625,
-      "learning_rate": 8.771929824561403e-05,
-      "loss": 17.4609,
       "step": 25
     },
     {
-      "epoch": 0.26200873362445415,
-      "grad_norm": 10.25,
-      "learning_rate": 0.00010526315789473685,
-      "loss": 16.9699,
       "step": 30
     },
     {
-      "epoch": 0.3056768558951965,
-      "grad_norm": 36.0,
-      "learning_rate": 0.00012280701754385965,
-      "loss": 14.7118,
       "step": 35
     },
     {
-      "epoch": 0.34934497816593885,
-      "grad_norm": 54.75,
-      "learning_rate": 0.00014035087719298245,
-      "loss": 9.0151,
       "step": 40
     },
     {
-      "epoch": 0.3930131004366812,
-      "grad_norm": 4.09375,
-      "learning_rate": 0.00015789473684210527,
-      "loss": 2.2763,
       "step": 45
     },
     {
-      "epoch": 0.4366812227074236,
-      "grad_norm": 2.5,
-      "learning_rate": 0.00017543859649122806,
-      "loss": 1.6984,
       "step": 50
     },
     {
-      "epoch": 0.48034934497816595,
-      "grad_norm": 2.5,
-      "learning_rate": 0.00019298245614035088,
-      "loss": 1.4194,
       "step": 55
     },
     {
-      "epoch": 0.5240174672489083,
-      "grad_norm": 1.9765625,
-      "learning_rate": 0.00019998312416333227,
-      "loss": 1.2842,
       "step": 60
     },
     {
-      "epoch": 0.5676855895196506,
-      "grad_norm": 13.0625,
-      "learning_rate": 0.0001998800146766861,
-      "loss": 1.2936,
       "step": 65
     },
     {
-      "epoch": 0.611353711790393,
-      "grad_norm": 3.15625,
-      "learning_rate": 0.00019968326771610797,
-      "loss": 1.1866,
       "step": 70
     },
     {
-      "epoch": 0.6550218340611353,
-      "grad_norm": 3.765625,
-      "learning_rate": 0.00019939306773179497,
-      "loss": 1.0911,
       "step": 75
     },
     {
-      "epoch": 0.6986899563318777,
-      "grad_norm": 5.34375,
-      "learning_rate": 0.00019900968678611666,
-      "loss": 1.0412,
       "step": 80
     },
     {
-      "epoch": 0.74235807860262,
-      "grad_norm": 2.625,
-      "learning_rate": 0.00019853348429855672,
-      "loss": 0.9954,
       "step": 85
     },
     {
-      "epoch": 0.7860262008733624,
-      "grad_norm": 2.6875,
-      "learning_rate": 0.0001979649067087574,
-      "loss": 0.9395,
       "step": 90
     },
     {
-      "epoch": 0.8296943231441049,
-      "grad_norm": 4.375,
-      "learning_rate": 0.00019730448705798239,
-      "loss": 0.9498,
       "step": 95
     },
     {
-      "epoch": 0.8733624454148472,
-      "grad_norm": 4.875,
-      "learning_rate": 0.00019655284448939094,
-      "loss": 0.8991,
       "step": 100
     },
     {
-      "epoch": 0.9170305676855895,
-      "grad_norm": 1.75,
-      "learning_rate": 0.00019571068366759143,
-      "loss": 0.8827,
       "step": 105
     },
     {
-      "epoch": 0.9606986899563319,
-      "grad_norm": 3.1875,
-      "learning_rate": 0.00019477879411801844,
-      "loss": 0.8665,
       "step": 110
     },
     {
-      "epoch": 0.9956331877729258,
-      "eval_loss": 2.268367052078247,
-      "eval_runtime": 1.0131,
-      "eval_samples_per_second": 4.935,
-      "eval_steps_per_second": 1.974,
-      "step": 114
     },
     {
-      "epoch": 1.0043668122270741,
-      "grad_norm": 1.6015625,
-      "learning_rate": 0.00019375804948675306,
-      "loss": 0.8773,
       "step": 115
     },
     {
-      "epoch": 1.0480349344978166,
-      "grad_norm": 9.1875,
-      "learning_rate": 0.00019264940672148018,
-      "loss": 0.8211,
       "step": 120
     },
     {
-      "epoch": 1.091703056768559,
-      "grad_norm": 1.25,
-      "learning_rate": 0.00019145390517435012,
-      "loss": 0.8465,
       "step": 125
     },
     {
-      "epoch": 1.1353711790393013,
-      "grad_norm": 3.09375,
-      "learning_rate": 0.00019017266562758659,
-      "loss": 0.8369,
       "step": 130
     },
     {
-      "epoch": 1.1790393013100438,
-      "grad_norm": 1.015625,
-      "learning_rate": 0.00018880688924275378,
-      "loss": 0.7924,
       "step": 135
     },
     {
-      "epoch": 1.222707423580786,
-      "grad_norm": 1.9140625,
-      "learning_rate": 0.00018735785643466784,
-      "loss": 0.778,
       "step": 140
     },
     {
-      "epoch": 1.2663755458515285,
-      "grad_norm": 0.98828125,
-      "learning_rate": 0.00018582692567100867,
-      "loss": 0.7681,
       "step": 145
     },
     {
-      "epoch": 1.3100436681222707,
-      "grad_norm": 1.3984375,
-      "learning_rate": 0.00018421553219875658,
-      "loss": 0.8007,
       "step": 150
     },
     {
-      "epoch": 1.3537117903930131,
-      "grad_norm": 80.5,
-      "learning_rate": 0.00018252518669864936,
-      "loss": 0.7683,
       "step": 155
     },
     {
-      "epoch": 1.3973799126637554,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0001807574738689193,
-      "loss": 0.7713,
       "step": 160
     },
     {
-      "epoch": 1.4410480349344978,
-      "grad_norm": 2.875,
-      "learning_rate": 0.00017891405093963938,
-      "loss": 0.7784,
       "step": 165
     },
     {
-      "epoch": 1.48471615720524,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.00017699664611907072,
-      "loss": 0.7742,
       "step": 170
     },
     {
-      "epoch": 1.5283842794759825,
-      "grad_norm": 2.046875,
-      "learning_rate": 0.0001750070569734681,
-      "loss": 0.7356,
       "step": 175
     },
     {
-      "epoch": 1.572052401746725,
-      "grad_norm": 1.375,
-      "learning_rate": 0.0001729471487418621,
-      "loss": 0.7613,
       "step": 180
     },
     {
-      "epoch": 1.6157205240174672,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00017081885258739846,
-      "loss": 0.7373,
       "step": 185
     },
     {
-      "epoch": 1.6593886462882095,
-      "grad_norm": 1.109375,
-      "learning_rate": 0.0001686241637868734,
-      "loss": 0.7395,
       "step": 190
     },
     {
-      "epoch": 1.703056768558952,
-      "grad_norm": 0.77734375,
-      "learning_rate": 0.00016636513986016213,
-      "loss": 0.7456,
       "step": 195
     },
     {
-      "epoch": 1.7467248908296944,
-      "grad_norm": 1.5,
-      "learning_rate": 0.00016404389864129533,
-      "loss": 0.7535,
       "step": 200
     },
     {
-      "epoch": 1.7903930131004366,
-      "grad_norm": 1.0703125,
-      "learning_rate": 0.00016166261629298995,
-      "loss": 0.7334,
       "step": 205
     },
     {
-      "epoch": 1.8340611353711789,
-      "grad_norm": 0.99609375,
-      "learning_rate": 0.00015922352526649803,
-      "loss": 0.7087,
       "step": 210
     },
     {
-      "epoch": 1.8777292576419216,
-      "grad_norm": 5.03125,
-      "learning_rate": 0.00015672891220868432,
-      "loss": 0.7264,
       "step": 215
     },
     {
-      "epoch": 1.9213973799126638,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.00015418111581829574,
-      "loss": 0.7336,
       "step": 220
     },
     {
-      "epoch": 1.965065502183406,
-      "grad_norm": 0.79296875,
-      "learning_rate": 0.00015158252465343242,
-      "loss": 0.7577,
-      "step": 225
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 2.1721692085266113,
-      "eval_runtime": 1.0003,
-      "eval_samples_per_second": 4.999,
-      "eval_steps_per_second": 1.999,
-      "step": 229
     },
     {
-      "epoch": 2.0087336244541483,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.00014893557489227517,
-      "loss": 0.7045,
       "step": 230
     },
     {
-      "epoch": 2.052401746724891,
-      "grad_norm": 0.80859375,
-      "learning_rate": 0.00014624274804916958,
-      "loss": 0.6618,
       "step": 235
     },
     {
-      "epoch": 2.096069868995633,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.00014350656864820733,
-      "loss": 0.6572,
       "step": 240
     },
     {
-      "epoch": 2.1397379912663754,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.00014072960185648577,
-      "loss": 0.6606,
       "step": 245
     },
     {
-      "epoch": 2.183406113537118,
-      "grad_norm": 0.75,
-      "learning_rate": 0.00013791445107926478,
-      "loss": 0.6832,
       "step": 250
     },
     {
-      "epoch": 2.2270742358078603,
-      "grad_norm": 0.86328125,
-      "learning_rate": 0.00013506375551927547,
-      "loss": 0.6344,
       "step": 255
     },
     {
-      "epoch": 2.2707423580786026,
-      "grad_norm": 1.0859375,
-      "learning_rate": 0.00013218018770246858,
-      "loss": 0.6513,
       "step": 260
     },
     {
-      "epoch": 2.314410480349345,
-      "grad_norm": 0.83203125,
-      "learning_rate": 0.0001292664509725226,
-      "loss": 0.6569,
       "step": 265
     },
     {
-      "epoch": 2.3580786026200875,
-      "grad_norm": 1.1484375,
-      "learning_rate": 0.00012632527695645993,
-      "loss": 0.6706,
       "step": 270
     },
     {
-      "epoch": 2.4017467248908297,
-      "grad_norm": 1.265625,
-      "learning_rate": 0.00012335942300374788,
-      "loss": 0.6641,
       "step": 275
     },
     {
-      "epoch": 2.445414847161572,
-      "grad_norm": 1.59375,
-      "learning_rate": 0.00012037166960128443,
-      "loss": 0.6636,
       "step": 280
     },
     {
-      "epoch": 2.489082969432314,
-      "grad_norm": 0.765625,
-      "learning_rate": 0.00011736481776669306,
-      "loss": 0.6687,
       "step": 285
     },
     {
-      "epoch": 2.532751091703057,
-      "grad_norm": 0.73046875,
-      "learning_rate": 0.00011434168642236964,
-      "loss": 0.6601,
       "step": 290
     },
     {
-      "epoch": 2.576419213973799,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.00011130510975274409,
-      "loss": 0.6467,
       "step": 295
     },
     {
-      "epoch": 2.6200873362445414,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.00010825793454723325,
-      "loss": 0.6567,
       "step": 300
     },
     {
-      "epoch": 2.6637554585152836,
-      "grad_norm": 0.875,
-      "learning_rate": 0.00010520301753137724,
-      "loss": 0.6625,
       "step": 305
     },
     {
-      "epoch": 2.7074235807860263,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.00010214322268866032,
-      "loss": 0.6614,
       "step": 310
     },
     {
-      "epoch": 2.7510917030567685,
-      "grad_norm": 0.5703125,
-      "learning_rate": 9.908141857552737e-05,
-      "loss": 0.6382,
       "step": 315
     },
     {
-      "epoch": 2.7947598253275108,
-      "grad_norm": 0.470703125,
-      "learning_rate": 9.602047563211359e-05,
-      "loss": 0.6527,
       "step": 320
     },
     {
-      "epoch": 2.8384279475982535,
-      "grad_norm": 0.89453125,
-      "learning_rate": 9.296326349120785e-05,
-      "loss": 0.682,
       "step": 325
     },
     {
-      "epoch": 2.8820960698689957,
-      "grad_norm": 0.49609375,
-      "learning_rate": 8.991264828797319e-05,
-      "loss": 0.651,
       "step": 330
     },
     {
-      "epoch": 2.925764192139738,
-      "grad_norm": 0.51953125,
-      "learning_rate": 8.687148997294621e-05,
-      "loss": 0.667,
-      "step": 335
     },
     {
-      "epoch": 2.96943231441048,
-      "grad_norm": 0.427734375,
-      "learning_rate": 8.384263963083453e-05,
-      "loss": 0.6255,
-      "step": 340
     },
     {
-      "epoch": 2.995633187772926,
-      "eval_loss": 2.174067974090576,
-      "eval_runtime": 1.051,
-      "eval_samples_per_second": 4.757,
-      "eval_steps_per_second": 1.903,
-      "step": 343
     },
     {
-      "epoch": 3.013100436681223,
-      "grad_norm": 0.62890625,
-      "learning_rate": 8.082893680762619e-05,
-      "loss": 0.6202,
       "step": 345
     },
     {
-      "epoch": 3.056768558951965,
-      "grad_norm": 0.53125,
-      "learning_rate": 7.783320684851614e-05,
-      "loss": 0.6176,
       "step": 350
     },
     {
-      "epoch": 3.1004366812227073,
-      "grad_norm": 0.51171875,
-      "learning_rate": 7.485825824914659e-05,
-      "loss": 0.5783,
       "step": 355
     },
     {
-      "epoch": 3.14410480349345,
-      "grad_norm": 0.50390625,
-      "learning_rate": 7.190688002264308e-05,
-      "loss": 0.5812,
       "step": 360
     },
     {
-      "epoch": 3.1877729257641922,
-      "grad_norm": 0.5546875,
-      "learning_rate": 6.898183908491617e-05,
-      "loss": 0.5981,
       "step": 365
     },
     {
-      "epoch": 3.2314410480349345,
-      "grad_norm": 0.53125,
-      "learning_rate": 6.608587766067852e-05,
-      "loss": 0.5853,
       "step": 370
     },
     {
-      "epoch": 3.2751091703056767,
-      "grad_norm": 0.609375,
-      "learning_rate": 6.322171071261071e-05,
-      "loss": 0.5908,
       "step": 375
     },
     {
-      "epoch": 3.3187772925764194,
-      "grad_norm": 1.125,
-      "learning_rate": 6.039202339608432e-05,
-      "loss": 0.5644,
       "step": 380
     },
     {
-      "epoch": 3.3624454148471616,
-      "grad_norm": 0.63671875,
-      "learning_rate": 5.7599468541830356e-05,
-      "loss": 0.5746,
       "step": 385
     },
     {
-      "epoch": 3.406113537117904,
-      "grad_norm": 0.8828125,
-      "learning_rate": 5.484666416891109e-05,
-      "loss": 0.5929,
       "step": 390
     },
     {
-      "epoch": 3.449781659388646,
-      "grad_norm": 0.5859375,
-      "learning_rate": 5.2136191030328455e-05,
-      "loss": 0.5704,
       "step": 395
     },
     {
-      "epoch": 3.493449781659389,
-      "grad_norm": 0.671875,
-      "learning_rate": 4.9470590193569044e-05,
-      "loss": 0.5971,
       "step": 400
     },
     {
-      "epoch": 3.537117903930131,
-      "grad_norm": 0.51953125,
-      "learning_rate": 4.685236065835443e-05,
-      "loss": 0.6042,
       "step": 405
     },
     {
-      "epoch": 3.5807860262008733,
-      "grad_norm": 0.61328125,
-      "learning_rate": 4.4283957013829846e-05,
-      "loss": 0.5822,
       "step": 410
     },
     {
-      "epoch": 3.6244541484716155,
-      "grad_norm": 0.64453125,
-      "learning_rate": 4.176778713738787e-05,
-      "loss": 0.5709,
       "step": 415
     },
     {
-      "epoch": 3.668122270742358,
-      "grad_norm": 0.49609375,
-      "learning_rate": 3.9306209937284346e-05,
-      "loss": 0.5907,
       "step": 420
     },
     {
-      "epoch": 3.7117903930131004,
-      "grad_norm": 0.5625,
-      "learning_rate": 3.69015331411628e-05,
-      "loss": 0.589,
       "step": 425
     },
     {
-      "epoch": 3.7554585152838427,
-      "grad_norm": 0.65234375,
-      "learning_rate": 3.455601113256073e-05,
-      "loss": 0.6115,
       "step": 430
     },
     {
-      "epoch": 3.7991266375545854,
-      "grad_norm": 0.50390625,
-      "learning_rate": 3.227184283742591e-05,
-      "loss": 0.5629,
       "step": 435
     },
     {
-      "epoch": 3.8427947598253276,
-      "grad_norm": 0.458984375,
-      "learning_rate": 3.0051169662624225e-05,
-      "loss": 0.5943,
       "step": 440
     },
     {
-      "epoch": 3.88646288209607,
-      "grad_norm": 0.4765625,
-      "learning_rate": 2.789607348837153e-05,
-      "loss": 0.5944,
       "step": 445
     },
     {
-      "epoch": 3.930131004366812,
-      "grad_norm": 0.4765625,
-      "learning_rate": 2.5808574716471856e-05,
-      "loss": 0.6155,
       "step": 450
     },
     {
-      "epoch": 3.9737991266375547,
-      "grad_norm": 0.48828125,
-      "learning_rate": 2.379063037619146e-05,
-      "loss": 0.5966,
       "step": 455
     },
     {
-      "epoch": 4.0,
-      "eval_loss": 2.214209794998169,
-      "eval_runtime": 1.0004,
-      "eval_samples_per_second": 4.998,
-      "eval_steps_per_second": 1.999,
-      "step": 458
-    },
-    {
-      "epoch": 4.0174672489082965,
-      "grad_norm": 0.458984375,
-      "learning_rate": 2.184413228954468e-05,
-      "loss": 0.5676,
       "step": 460
     },
     {
-      "epoch": 4.06113537117904,
-      "grad_norm": 0.48828125,
-      "learning_rate": 1.9970905297711606e-05,
-      "loss": 0.5632,
       "step": 465
     },
     {
-      "epoch": 4.104803493449782,
-      "grad_norm": 0.5078125,
-      "learning_rate": 1.8172705550250092e-05,
-      "loss": 0.5515,
       "step": 470
     },
     {
-      "epoch": 4.148471615720524,
-      "grad_norm": 0.47265625,
-      "learning_rate": 1.6451218858706374e-05,
-      "loss": 0.5725,
       "step": 475
     },
     {
-      "epoch": 4.192139737991266,
-      "grad_norm": 0.50390625,
-      "learning_rate": 1.4808059116167305e-05,
-      "loss": 0.5362,
       "step": 480
     },
     {
-      "epoch": 4.235807860262009,
-      "grad_norm": 0.478515625,
-      "learning_rate": 1.3244766784236307e-05,
-      "loss": 0.5569,
       "step": 485
     },
     {
-      "epoch": 4.279475982532751,
-      "grad_norm": 0.46484375,
-      "learning_rate": 1.176280744885121e-05,
-      "loss": 0.573,
       "step": 490
     },
     {
-      "epoch": 4.323144104803493,
-      "grad_norm": 0.478515625,
-      "learning_rate": 1.0363570446297999e-05,
-      "loss": 0.5559,
       "step": 495
     },
     {
-      "epoch": 4.366812227074236,
-      "grad_norm": 0.53125,
-      "learning_rate": 9.048367560708604e-06,
-      "loss": 0.548,
       "step": 500
     },
     {
-      "epoch": 4.4104803493449785,
-      "grad_norm": 0.515625,
-      "learning_rate": 7.818431794263836e-06,
-      "loss": 0.5755,
       "step": 505
     },
     {
-      "epoch": 4.454148471615721,
-      "grad_norm": 0.470703125,
-      "learning_rate": 6.674916211254289e-06,
-      "loss": 0.5559,
       "step": 510
     },
     {
-      "epoch": 4.497816593886463,
-      "grad_norm": 0.50390625,
-      "learning_rate": 5.618892857083069e-06,
-      "loss": 0.5491,
       "step": 515
     },
     {
-      "epoch": 4.541484716157205,
-      "grad_norm": 0.4765625,
-      "learning_rate": 4.65135175322361e-06,
-      "loss": 0.5586,
       "step": 520
     },
     {
-      "epoch": 4.585152838427947,
-      "grad_norm": 0.458984375,
-      "learning_rate": 3.7731999690749585e-06,
-      "loss": 0.5569,
       "step": 525
     },
     {
-      "epoch": 4.62882096069869,
-      "grad_norm": 0.47265625,
-      "learning_rate": 2.9852607715846193e-06,
-      "loss": 0.5458,
       "step": 530
     },
     {
-      "epoch": 4.672489082969433,
-      "grad_norm": 0.470703125,
-      "learning_rate": 2.288272853436013e-06,
-      "loss": 0.5253,
       "step": 535
     },
     {
-      "epoch": 4.716157205240175,
-      "grad_norm": 0.47265625,
-      "learning_rate": 1.6828896405244988e-06,
-      "loss": 0.5595,
       "step": 540
     },
     {
-      "epoch": 4.759825327510917,
-      "grad_norm": 0.486328125,
-      "learning_rate": 1.1696786793707781e-06,
-      "loss": 0.5626,
       "step": 545
     },
     {
-      "epoch": 4.8034934497816595,
-      "grad_norm": 0.5,
-      "learning_rate": 7.491211050462798e-07,
-      "loss": 0.5407,
       "step": 550
     },
     {
-      "epoch": 4.847161572052402,
-      "grad_norm": 0.47265625,
-      "learning_rate": 4.216111901092501e-07,
-      "loss": 0.5542,
       "step": 555
     },
     {
-      "epoch": 4.890829694323144,
-      "grad_norm": 0.5234375,
-      "learning_rate": 1.8745597497433765e-07,
-      "loss": 0.5542,
       "step": 560
     },
     {
-      "epoch": 4.934497816593886,
-      "grad_norm": 0.458984375,
-      "learning_rate": 4.687498006236135e-08,
-      "loss": 0.5682,
       "step": 565
     },
     {
-      "epoch": 4.978165938864628,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0,
-      "loss": 0.5399,
       "step": 570
     },
     {
-      "epoch": 4.978165938864628,
-      "eval_loss": 2.236276865005493,
-      "eval_runtime": 0.9998,
-      "eval_samples_per_second": 5.001,
-      "eval_steps_per_second": 2.0,
-      "step": 570
     },
     {
-      "epoch": 4.978165938864628,
-      "step": 570,
-      "total_flos": 8.714577082329334e+17,
-      "train_loss": 2.449154797771521,
-      "train_runtime": 4532.0541,
-      "train_samples_per_second": 2.02,
-      "train_steps_per_second": 0.126
     }
   ],
   "logging_steps": 5,
-  "max_steps": 570,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -875,12 +1657,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 8.714577082329334e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 1100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.00909090909090909,
+      "grad_norm": 548.0,
+      "learning_rate": 1.818181818181818e-06,
+      "loss": 53.7965,
       "step": 1
     },
     {
+      "epoch": 0.045454545454545456,
+      "grad_norm": 668.0,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 55.3182,
       "step": 5
     },
     {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 486.0,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 51.173,
       "step": 10
     },
     {
+      "epoch": 0.13636363636363635,
+      "grad_norm": 159.0,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 33.7467,
       "step": 15
     },
     {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 37.5,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 24.0423,
       "step": 20
     },
     {
+      "epoch": 0.22727272727272727,
+      "grad_norm": 32.25,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 22.1424,
       "step": 25
     },
     {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 21.25,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 20.3818,
       "step": 30
     },
     {
+      "epoch": 0.3181818181818182,
+      "grad_norm": 7.90625,
+      "learning_rate": 6.363636363636364e-05,
+      "loss": 18.8064,
       "step": 35
     },
     {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 11.625,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 17.8515,
       "step": 40
     },
     {
+      "epoch": 0.4090909090909091,
+      "grad_norm": 27.125,
+      "learning_rate": 8.181818181818183e-05,
+      "loss": 16.8604,
       "step": 45
     },
     {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 56.25,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 12.4796,
       "step": 50
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 11.4375,
+      "learning_rate": 0.0001,
+      "loss": 4.4979,
       "step": 55
     },
     {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 3.25,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.9919,
       "step": 60
     },
     {
+      "epoch": 0.5909090909090909,
+      "grad_norm": 25.375,
+      "learning_rate": 0.0001181818181818182,
+      "loss": 1.6619,
       "step": 65
     },
     {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 3.125,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 1.527,
       "step": 70
     },
     {
+      "epoch": 0.6818181818181818,
+      "grad_norm": 9.6875,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.3688,
       "step": 75
     },
     {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 1.2618,
       "step": 80
     },
     {
+      "epoch": 0.7727272727272727,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.238,
       "step": 85
     },
     {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 3.15625,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 1.1871,
       "step": 90
     },
     {
+      "epoch": 0.8636363636363636,
+      "grad_norm": 4.34375,
+      "learning_rate": 0.00017272727272727275,
+      "loss": 1.154,
       "step": 95
     },
     {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.1375,
       "step": 100
     },
     {
+      "epoch": 0.9545454545454546,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.0352,
       "step": 105
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 9.25,
+      "learning_rate": 0.0002,
+      "loss": 1.0256,
       "step": 110
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 2.388709783554077,
+      "eval_runtime": 1.0069,
+      "eval_samples_per_second": 4.966,
+      "eval_steps_per_second": 1.986,
+      "step": 110
     },
     {
+      "epoch": 1.0454545454545454,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00019998741276738754,
+      "loss": 1.0186,
       "step": 115
     },
     {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 2.5625,
+      "learning_rate": 0.00019994965423831854,
+      "loss": 0.9466,
       "step": 120
     },
     {
+      "epoch": 1.1363636363636362,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.0001998867339183008,
+      "loss": 0.9645,
       "step": 125
     },
     {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 86.5,
+      "learning_rate": 0.00019979866764718843,
+      "loss": 0.9547,
       "step": 130
     },
     {
+      "epoch": 1.2272727272727273,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00019968547759519425,
+      "loss": 0.9527,
       "step": 135
     },
     {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00019954719225730847,
+      "loss": 0.9091,
       "step": 140
     },
     {
+      "epoch": 1.3181818181818181,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00019938384644612543,
+      "loss": 0.8789,
       "step": 145
     },
     {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.00019919548128307954,
+      "loss": 0.9036,
       "step": 150
     },
     {
+      "epoch": 1.4090909090909092,
+      "grad_norm": 5.09375,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 0.8787,
       "step": 155
     },
     {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00019874388886763944,
+      "loss": 0.8671,
       "step": 160
     },
     {
+      "epoch": 1.5,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.8872,
       "step": 165
     },
     {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 4.0625,
+      "learning_rate": 0.00019819286972627066,
+      "loss": 0.9179,
       "step": 170
     },
     {
+      "epoch": 1.5909090909090908,
+      "grad_norm": 3.25,
+      "learning_rate": 0.00019788024462147788,
+      "loss": 0.8857,
       "step": 175
     },
     {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00019754297868854073,
+      "loss": 0.8474,
       "step": 180
     },
     {
+      "epoch": 1.6818181818181817,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.00019718115683235417,
+      "loss": 0.861,
       "step": 185
     },
     {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 3.515625,
+      "learning_rate": 0.00019679487013963564,
+      "loss": 0.8266,
       "step": 190
     },
     {
+      "epoch": 1.7727272727272727,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00019638421585599423,
+      "loss": 0.8515,
       "step": 195
     },
     {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8328,
       "step": 200
     },
     {
+      "epoch": 1.8636363636363638,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0001954902241444074,
+      "loss": 0.8601,
       "step": 205
     },
     {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00019500711177409454,
+      "loss": 0.8435,
       "step": 210
     },
     {
+      "epoch": 1.9545454545454546,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.00019450008187146684,
+      "loss": 0.8082,
       "step": 215
     },
     {
+      "epoch": 2.0,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.8325,
       "step": 220
     },
     {
+      "epoch": 2.0,
+      "eval_loss": 2.227036952972412,
+      "eval_runtime": 1.0053,
+      "eval_samples_per_second": 4.974,
+      "eval_steps_per_second": 1.989,
+      "step": 220
     },
     {
+      "epoch": 2.0454545454545454,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.00019341478602651069,
+      "loss": 0.7413,
+      "step": 225
     },
     {
+      "epoch": 2.090909090909091,
+      "grad_norm": 3.796875,
+      "learning_rate": 0.00019283679330160726,
+      "loss": 0.7526,
       "step": 230
     },
     {
+      "epoch": 2.1363636363636362,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.00019223542941045817,
+      "loss": 0.739,
       "step": 235
     },
     {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00019161084574320696,
+      "loss": 0.7665,
       "step": 240
     },
     {
+      "epoch": 2.227272727272727,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 0.7655,
       "step": 245
     },
     {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.00019029265382866214,
+      "loss": 0.753,
       "step": 250
     },
     {
+      "epoch": 2.3181818181818183,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00018959937742913359,
+      "loss": 0.7557,
       "step": 255
     },
     {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00018888354486549237,
+      "loss": 0.7805,
       "step": 260
     },
     {
+      "epoch": 2.409090909090909,
+      "grad_norm": 4.75,
+      "learning_rate": 0.00018814533634475822,
+      "loss": 0.7451,
       "step": 265
     },
     {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018738493770697852,
+      "loss": 0.7308,
       "step": 270
     },
     {
+      "epoch": 2.5,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8117,
       "step": 275
     },
     {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00018579834132349772,
+      "loss": 0.7756,
       "step": 280
     },
     {
+      "epoch": 2.590909090909091,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.00018497254299495146,
+      "loss": 0.7503,
       "step": 285
     },
     {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.7886,
       "step": 290
     },
     {
+      "epoch": 2.6818181818181817,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00018325698546347715,
+      "loss": 0.7452,
       "step": 295
     },
     {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 4.0,
+      "learning_rate": 0.0001823676581429833,
+      "loss": 0.7579,
       "step": 300
     },
     {
+      "epoch": 2.7727272727272725,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00018145759520503358,
+      "loss": 0.7494,
       "step": 305
     },
     {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.00018052702575310588,
+      "loss": 0.7343,
       "step": 310
     },
     {
+      "epoch": 2.8636363636363638,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.00017957618405308324,
+      "loss": 0.7402,
       "step": 315
     },
     {
+      "epoch": 2.909090909090909,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00017860530947427875,
+      "loss": 0.7428,
       "step": 320
     },
     {
+      "epoch": 2.9545454545454546,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0001776146464291757,
+      "loss": 0.7699,
       "step": 325
     },
     {
+      "epoch": 3.0,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.749,
       "step": 330
     },
     {
+      "epoch": 3.0,
+      "eval_loss": 2.2333221435546875,
+      "eval_runtime": 1.0046,
+      "eval_samples_per_second": 4.977,
+      "eval_steps_per_second": 1.991,
+      "step": 330
     },
     {
+      "epoch": 3.0454545454545454,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 0.6475,
+      "step": 335
     },
     {
+      "epoch": 3.090909090909091,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0001745264449675755,
+      "loss": 0.6833,
+      "step": 340
     },
     {
+      "epoch": 3.1363636363636362,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.00017345917086575332,
+      "loss": 0.6776,
       "step": 345
     },
     {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00017237340381050703,
+      "loss": 0.6798,
       "step": 350
     },
     {
+      "epoch": 3.227272727272727,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00017126941713788632,
+      "loss": 0.6606,
       "step": 355
     },
     {
+      "epoch": 3.2727272727272725,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00017014748877063214,
+      "loss": 0.6629,
       "step": 360
     },
     {
+      "epoch": 3.3181818181818183,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00016900790114821122,
+      "loss": 0.6871,
       "step": 365
     },
     {
+      "epoch": 3.3636363636363638,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.00016785094115571322,
+      "loss": 0.6575,
       "step": 370
     },
     {
+      "epoch": 3.409090909090909,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00016667690005162916,
+      "loss": 0.6674,
       "step": 375
     },
     {
+      "epoch": 3.4545454545454546,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7054,
       "step": 380
     },
     {
+      "epoch": 3.5,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.6646,
       "step": 385
     },
     {
+      "epoch": 3.5454545454545454,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00016305526670845226,
+      "loss": 0.6766,
       "step": 390
     },
     {
+      "epoch": 3.590909090909091,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00016181589862206052,
+      "loss": 0.6716,
       "step": 395
     },
     {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00016056096871376667,
+      "loss": 0.6976,
       "step": 400
     },
     {
+      "epoch": 3.6818181818181817,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00015929079290546408,
+      "loss": 0.6624,
       "step": 405
     },
     {
+      "epoch": 3.7272727272727275,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00015800569095711982,
+      "loss": 0.6907,
       "step": 410
     },
     {
+      "epoch": 3.7727272727272725,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00015670598638627706,
+      "loss": 0.6664,
       "step": 415
     },
     {
+      "epoch": 3.8181818181818183,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00015539200638661104,
+      "loss": 0.6642,
       "step": 420
     },
     {
+      "epoch": 3.8636363636363638,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 0.7091,
       "step": 425
     },
     {
+      "epoch": 3.909090909090909,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00015272254676105025,
+      "loss": 0.7034,
       "step": 430
     },
     {
+      "epoch": 3.9545454545454546,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00015136773915734066,
+      "loss": 0.6679,
       "step": 435
     },
     {
+      "epoch": 4.0,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6755,
+      "step": 440
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.29927659034729,
+      "eval_runtime": 1.0067,
+      "eval_samples_per_second": 4.967,
+      "eval_steps_per_second": 1.987,
       "step": 440
     },
     {
+      "epoch": 4.045454545454546,
+      "grad_norm": 2.75,
+      "learning_rate": 0.00014861967361004687,
+      "loss": 0.6121,
       "step": 445
     },
     {
+      "epoch": 4.090909090909091,
+      "grad_norm": 0.625,
+      "learning_rate": 0.0001472271074772683,
+      "loss": 0.598,
       "step": 450
     },
     {
+      "epoch": 4.136363636363637,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00014582265217274104,
+      "loss": 0.5832,
       "step": 455
     },
     {
+      "epoch": 4.181818181818182,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00014440666126057744,
+      "loss": 0.591,
       "step": 460
     },
     {
+      "epoch": 4.2272727272727275,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.00014297949120891718,
+      "loss": 0.5767,
       "step": 465
     },
     {
+      "epoch": 4.2727272727272725,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.5742,
       "step": 470
     },
     {
+      "epoch": 4.318181818181818,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.00014009305354066137,
+      "loss": 0.5806,
       "step": 475
     },
     {
+      "epoch": 4.363636363636363,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.00013863451256931287,
+      "loss": 0.605,
       "step": 480
     },
     {
+      "epoch": 4.409090909090909,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.00013716624556603274,
+      "loss": 0.6142,
       "step": 485
     },
     {
+      "epoch": 4.454545454545454,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00013568862215918717,
+      "loss": 0.586,
       "step": 490
     },
     {
+      "epoch": 4.5,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.6088,
       "step": 495
     },
     {
+      "epoch": 4.545454545454545,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00013270679633174218,
+      "loss": 0.5974,
       "step": 500
     },
     {
+      "epoch": 4.590909090909091,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001312033445698487,
+      "loss": 0.5808,
       "step": 505
     },
     {
+      "epoch": 4.636363636363637,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0001296920375328275,
+      "loss": 0.6008,
       "step": 510
     },
     {
+      "epoch": 4.681818181818182,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 0.6069,
       "step": 515
     },
     {
+      "epoch": 4.7272727272727275,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.00012664738136900348,
+      "loss": 0.6111,
       "step": 520
     },
     {
+      "epoch": 4.7727272727272725,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0001251147987181079,
+      "loss": 0.6114,
       "step": 525
     },
     {
+      "epoch": 4.818181818181818,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.00012357589355094275,
+      "loss": 0.5905,
       "step": 530
     },
     {
+      "epoch": 4.863636363636363,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.00012203105327865407,
+      "loss": 0.6153,
       "step": 535
     },
     {
+      "epoch": 4.909090909090909,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.00012048066680651908,
+      "loss": 0.6179,
       "step": 540
     },
     {
+      "epoch": 4.954545454545455,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.00011892512443604102,
+      "loss": 0.5833,
       "step": 545
     },
     {
+      "epoch": 5.0,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6197,
+      "step": 550
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.3820290565490723,
+      "eval_runtime": 1.0056,
+      "eval_samples_per_second": 4.972,
+      "eval_steps_per_second": 1.989,
       "step": 550
     },
     {
+      "epoch": 5.045454545454546,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.000115800139597335,
+      "loss": 0.5034,
       "step": 555
     },
     {
+      "epoch": 5.090909090909091,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.5171,
       "step": 560
     },
     {
+      "epoch": 5.136363636363637,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.00011265924535737493,
+      "loss": 0.5233,
       "step": 565
     },
     {
+      "epoch": 5.181818181818182,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.00011108381999010111,
+      "loss": 0.5103,
       "step": 570
     },
     {
+      "epoch": 5.2272727272727275,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.00010950560433041826,
+      "loss": 0.5371,
+      "step": 575
+    },
+    {
+      "epoch": 5.2727272727272725,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.00010792499568567884,
+      "loss": 0.5228,
+      "step": 580
+    },
+    {
+      "epoch": 5.318181818181818,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.00010634239196565646,
+      "loss": 0.5308,
+      "step": 585
+    },
+    {
+      "epoch": 5.363636363636363,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.00010475819158237425,
+      "loss": 0.5349,
+      "step": 590
+    },
+    {
+      "epoch": 5.409090909090909,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.00010317279334980678,
+      "loss": 0.5204,
+      "step": 595
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.00010158659638348081,
+      "loss": 0.5182,
+      "step": 600
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5215,
+      "step": 605
+    },
+    {
+      "epoch": 5.545454545454545,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.84134036165192e-05,
+      "loss": 0.5282,
+      "step": 610
+    },
+    {
+      "epoch": 5.590909090909091,
+      "grad_norm": 0.6015625,
+      "learning_rate": 9.682720665019325e-05,
+      "loss": 0.5103,
+      "step": 615
+    },
+    {
+      "epoch": 5.636363636363637,
+      "grad_norm": 0.63671875,
+      "learning_rate": 9.524180841762577e-05,
+      "loss": 0.5257,
+      "step": 620
+    },
+    {
+      "epoch": 5.681818181818182,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.365760803434355e-05,
+      "loss": 0.5228,
+      "step": 625
+    },
+    {
+      "epoch": 5.7272727272727275,
+      "grad_norm": 0.60546875,
+      "learning_rate": 9.207500431432115e-05,
+      "loss": 0.5487,
+      "step": 630
+    },
+    {
+      "epoch": 5.7727272727272725,
+      "grad_norm": 0.7734375,
+      "learning_rate": 9.049439566958175e-05,
+      "loss": 0.528,
+      "step": 635
+    },
+    {
+      "epoch": 5.818181818181818,
+      "grad_norm": 0.79296875,
+      "learning_rate": 8.891618000989891e-05,
+      "loss": 0.5213,
+      "step": 640
+    },
+    {
+      "epoch": 5.863636363636363,
+      "grad_norm": 0.640625,
+      "learning_rate": 8.734075464262507e-05,
+      "loss": 0.5312,
+      "step": 645
+    },
+    {
+      "epoch": 5.909090909090909,
+      "grad_norm": 0.62890625,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.5208,
+      "step": 650
+    },
+    {
+      "epoch": 5.954545454545455,
+      "grad_norm": 0.6953125,
+      "learning_rate": 8.4199860402665e-05,
+      "loss": 0.5372,
+      "step": 655
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.640625,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.5208,
+      "step": 660
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.586942672729492,
+      "eval_runtime": 1.0065,
+      "eval_samples_per_second": 4.968,
+      "eval_steps_per_second": 1.987,
+      "step": 660
+    },
+    {
+      "epoch": 6.045454545454546,
+      "grad_norm": 0.7734375,
+      "learning_rate": 8.107487556395901e-05,
+      "loss": 0.46,
+      "step": 665
+    },
+    {
+      "epoch": 6.090909090909091,
+      "grad_norm": 0.703125,
+      "learning_rate": 7.951933319348095e-05,
+      "loss": 0.4614,
+      "step": 670
+    },
+    {
+      "epoch": 6.136363636363637,
+      "grad_norm": 0.6640625,
+      "learning_rate": 7.796894672134594e-05,
+      "loss": 0.4648,
+      "step": 675
+    },
+    {
+      "epoch": 6.181818181818182,
+      "grad_norm": 0.76171875,
+      "learning_rate": 7.642410644905726e-05,
+      "loss": 0.4445,
+      "step": 680
+    },
+    {
+      "epoch": 6.2272727272727275,
+      "grad_norm": 0.67578125,
+      "learning_rate": 7.488520128189209e-05,
+      "loss": 0.4483,
+      "step": 685
+    },
+    {
+      "epoch": 6.2727272727272725,
+      "grad_norm": 0.6875,
+      "learning_rate": 7.335261863099651e-05,
+      "loss": 0.4484,
+      "step": 690
+    },
+    {
+      "epoch": 6.318181818181818,
+      "grad_norm": 0.65234375,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.4452,
+      "step": 695
+    },
+    {
+      "epoch": 6.363636363636363,
+      "grad_norm": 0.69140625,
+      "learning_rate": 7.030796246717255e-05,
+      "loss": 0.4615,
+      "step": 700
+    },
+    {
+      "epoch": 6.409090909090909,
+      "grad_norm": 0.66796875,
+      "learning_rate": 6.87966554301513e-05,
+      "loss": 0.4443,
+      "step": 705
+    },
+    {
+      "epoch": 6.454545454545454,
+      "grad_norm": 0.65625,
+      "learning_rate": 6.729320366825784e-05,
+      "loss": 0.4509,
+      "step": 710
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.734375,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.461,
+      "step": 715
+    },
+    {
+      "epoch": 6.545454545454545,
+      "grad_norm": 0.66015625,
+      "learning_rate": 6.431137784081282e-05,
+      "loss": 0.4571,
+      "step": 720
+    },
+    {
+      "epoch": 6.590909090909091,
+      "grad_norm": 0.63671875,
+      "learning_rate": 6.283375443396726e-05,
+      "loss": 0.4698,
+      "step": 725
+    },
+    {
+      "epoch": 6.636363636363637,
+      "grad_norm": 0.62890625,
+      "learning_rate": 6.136548743068713e-05,
+      "loss": 0.441,
+      "step": 730
+    },
+    {
+      "epoch": 6.681818181818182,
+      "grad_norm": 0.65625,
+      "learning_rate": 5.9906946459338656e-05,
+      "loss": 0.4464,
+      "step": 735
+    },
+    {
+      "epoch": 6.7272727272727275,
+      "grad_norm": 0.71875,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.4455,
+      "step": 740
+    },
+    {
+      "epoch": 6.7727272727272725,
+      "grad_norm": 0.8359375,
+      "learning_rate": 5.702050879108284e-05,
+      "loss": 0.4541,
+      "step": 745
+    },
+    {
+      "epoch": 6.818181818181818,
+      "grad_norm": 0.69921875,
+      "learning_rate": 5.559333873942259e-05,
+      "loss": 0.4546,
+      "step": 750
+    },
+    {
+      "epoch": 6.863636363636363,
+      "grad_norm": 0.66015625,
+      "learning_rate": 5.417734782725896e-05,
+      "loss": 0.4427,
+      "step": 755
+    },
+    {
+      "epoch": 6.909090909090909,
+      "grad_norm": 0.703125,
+      "learning_rate": 5.277289252273174e-05,
+      "loss": 0.4497,
+      "step": 760
+    },
+    {
+      "epoch": 6.954545454545455,
+      "grad_norm": 0.66796875,
+      "learning_rate": 5.138032638995315e-05,
+      "loss": 0.4519,
+      "step": 765
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.6953125,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.4474,
+      "step": 770
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.838920831680298,
+      "eval_runtime": 1.0059,
+      "eval_samples_per_second": 4.971,
+      "eval_steps_per_second": 1.988,
+      "step": 770
+    },
+    {
+      "epoch": 7.045454545454546,
+      "grad_norm": 0.66015625,
+      "learning_rate": 4.8632260842659393e-05,
+      "loss": 0.3915,
+      "step": 775
+    },
+    {
+      "epoch": 7.090909090909091,
+      "grad_norm": 0.69140625,
+      "learning_rate": 4.727745323894976e-05,
+      "loss": 0.3938,
+      "step": 780
+    },
+    {
+      "epoch": 7.136363636363637,
+      "grad_norm": 0.81640625,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.3943,
+      "step": 785
+    },
+    {
+      "epoch": 7.181818181818182,
+      "grad_norm": 0.8515625,
+      "learning_rate": 4.4607993613388976e-05,
+      "loss": 0.3993,
+      "step": 790
+    },
+    {
+      "epoch": 7.2272727272727275,
+      "grad_norm": 0.7109375,
+      "learning_rate": 4.329401361372294e-05,
+      "loss": 0.397,
+      "step": 795
+    },
+    {
+      "epoch": 7.2727272727272725,
+      "grad_norm": 0.609375,
+      "learning_rate": 4.19943090428802e-05,
+      "loss": 0.3952,
+      "step": 800
+    },
+    {
+      "epoch": 7.318181818181818,
+      "grad_norm": 0.703125,
+      "learning_rate": 4.070920709453597e-05,
+      "loss": 0.3992,
+      "step": 805
+    },
+    {
+      "epoch": 7.363636363636363,
+      "grad_norm": 0.6953125,
+      "learning_rate": 3.943903128623335e-05,
+      "loss": 0.3869,
+      "step": 810
+    },
+    {
+      "epoch": 7.409090909090909,
+      "grad_norm": 0.65234375,
+      "learning_rate": 3.8184101377939476e-05,
+      "loss": 0.3931,
+      "step": 815
+    },
+    {
+      "epoch": 7.454545454545454,
+      "grad_norm": 0.65625,
+      "learning_rate": 3.694473329154778e-05,
+      "loss": 0.398,
+      "step": 820
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.66015625,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.4006,
+      "step": 825
+    },
+    {
+      "epoch": 7.545454545454545,
+      "grad_norm": 0.6640625,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.3935,
+      "step": 830
+    },
+    {
+      "epoch": 7.590909090909091,
+      "grad_norm": 0.74609375,
+      "learning_rate": 3.332309994837085e-05,
+      "loss": 0.3984,
+      "step": 835
+    },
+    {
+      "epoch": 7.636363636363637,
+      "grad_norm": 0.6953125,
+      "learning_rate": 3.21490588442868e-05,
+      "loss": 0.3981,
+      "step": 840
+    },
+    {
+      "epoch": 7.681818181818182,
+      "grad_norm": 0.640625,
+      "learning_rate": 3.099209885178882e-05,
+      "loss": 0.3981,
+      "step": 845
+    },
+    {
+      "epoch": 7.7272727272727275,
+      "grad_norm": 0.65234375,
+      "learning_rate": 2.9852511229367865e-05,
+      "loss": 0.3978,
+      "step": 850
+    },
+    {
+      "epoch": 7.7727272727272725,
+      "grad_norm": 0.6796875,
+      "learning_rate": 2.8730582862113742e-05,
+      "loss": 0.3943,
+      "step": 855
+    },
+    {
+      "epoch": 7.818181818181818,
+      "grad_norm": 0.6484375,
+      "learning_rate": 2.7626596189492983e-05,
+      "loss": 0.3913,
+      "step": 860
+    },
+    {
+      "epoch": 7.863636363636363,
+      "grad_norm": 0.73046875,
+      "learning_rate": 2.654082913424668e-05,
+      "loss": 0.4045,
+      "step": 865
+    },
+    {
+      "epoch": 7.909090909090909,
+      "grad_norm": 0.6796875,
+      "learning_rate": 2.5473555032424533e-05,
+      "loss": 0.3828,
+      "step": 870
+    },
+    {
+      "epoch": 7.954545454545455,
+      "grad_norm": 0.6953125,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.388,
+      "step": 875
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.7734375,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.4044,
+      "step": 880
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 3.102931499481201,
+      "eval_runtime": 1.0057,
+      "eval_samples_per_second": 4.972,
+      "eval_steps_per_second": 1.989,
+      "step": 880
+    },
+    {
+      "epoch": 8.045454545454545,
+      "grad_norm": 0.59375,
+      "learning_rate": 2.2385353570824308e-05,
+      "loss": 0.3693,
+      "step": 885
+    },
+    {
+      "epoch": 8.090909090909092,
+      "grad_norm": 0.65625,
+      "learning_rate": 2.139469052572127e-05,
+      "loss": 0.3641,
+      "step": 890
+    },
+    {
+      "epoch": 8.136363636363637,
+      "grad_norm": 0.65234375,
+      "learning_rate": 2.042381594691678e-05,
+      "loss": 0.3647,
+      "step": 895
+    },
+    {
+      "epoch": 8.181818181818182,
+      "grad_norm": 0.671875,
+      "learning_rate": 1.947297424689414e-05,
+      "loss": 0.3619,
+      "step": 900
+    },
+    {
+      "epoch": 8.227272727272727,
+      "grad_norm": 0.64453125,
+      "learning_rate": 1.854240479496643e-05,
+      "loss": 0.3566,
+      "step": 905
+    },
+    {
+      "epoch": 8.272727272727273,
+      "grad_norm": 0.6640625,
+      "learning_rate": 1.763234185701673e-05,
+      "loss": 0.354,
+      "step": 910
+    },
+    {
+      "epoch": 8.318181818181818,
+      "grad_norm": 0.6953125,
+      "learning_rate": 1.6743014536522873e-05,
+      "loss": 0.3703,
+      "step": 915
+    },
+    {
+      "epoch": 8.363636363636363,
+      "grad_norm": 0.75,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.3597,
+      "step": 920
+    },
+    {
+      "epoch": 8.409090909090908,
+      "grad_norm": 0.6875,
+      "learning_rate": 1.5027457005048573e-05,
+      "loss": 0.3705,
+      "step": 925
+    },
+    {
+      "epoch": 8.454545454545455,
+      "grad_norm": 0.70703125,
+      "learning_rate": 1.4201658676502294e-05,
+      "loss": 0.367,
+      "step": 930
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 0.640625,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.3607,
+      "step": 935
+    },
+    {
+      "epoch": 8.545454545454545,
+      "grad_norm": 0.6484375,
+      "learning_rate": 1.2615062293021507e-05,
+      "loss": 0.3633,
+      "step": 940
+    },
+    {
+      "epoch": 8.590909090909092,
+      "grad_norm": 0.66015625,
+      "learning_rate": 1.1854663655241805e-05,
+      "loss": 0.3622,
+      "step": 945
+    },
+    {
+      "epoch": 8.636363636363637,
+      "grad_norm": 0.65625,
+      "learning_rate": 1.1116455134507664e-05,
+      "loss": 0.3734,
+      "step": 950
+    },
+    {
+      "epoch": 8.681818181818182,
+      "grad_norm": 0.66015625,
+      "learning_rate": 1.0400622570866425e-05,
+      "loss": 0.367,
+      "step": 955
+    },
+    {
+      "epoch": 8.727272727272727,
+      "grad_norm": 0.63671875,
+      "learning_rate": 9.707346171337894e-06,
+      "loss": 0.3578,
+      "step": 960
+    },
+    {
+      "epoch": 8.772727272727273,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.364,
+      "step": 965
+    },
+    {
+      "epoch": 8.818181818181818,
+      "grad_norm": 0.61328125,
+      "learning_rate": 8.38915425679304e-06,
+      "loss": 0.3558,
+      "step": 970
+    },
+    {
+      "epoch": 8.863636363636363,
+      "grad_norm": 0.65625,
+      "learning_rate": 7.764570589541875e-06,
+      "loss": 0.3664,
+      "step": 975
+    },
+    {
+      "epoch": 8.909090909090908,
+      "grad_norm": 0.61328125,
+      "learning_rate": 7.163206698392744e-06,
+      "loss": 0.3573,
+      "step": 980
+    },
+    {
+      "epoch": 8.954545454545455,
+      "grad_norm": 0.6484375,
+      "learning_rate": 6.585213973489335e-06,
+      "loss": 0.3717,
+      "step": 985
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.640625,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.3573,
+      "step": 990
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 3.357294797897339,
+      "eval_runtime": 1.005,
+      "eval_samples_per_second": 4.975,
+      "eval_steps_per_second": 1.99,
+      "step": 990
+    },
+    {
+      "epoch": 9.045454545454545,
+      "grad_norm": 0.58984375,
+      "learning_rate": 5.499918128533155e-06,
+      "loss": 0.3595,
+      "step": 995
+    },
+    {
+      "epoch": 9.090909090909092,
+      "grad_norm": 0.5859375,
+      "learning_rate": 4.992888225905468e-06,
+      "loss": 0.359,
+      "step": 1000
+    },
+    {
+      "epoch": 9.136363636363637,
+      "grad_norm": 0.6328125,
+      "learning_rate": 4.509775855592613e-06,
+      "loss": 0.3584,
+      "step": 1005
+    },
+    {
+      "epoch": 9.181818181818182,
+      "grad_norm": 0.6328125,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.3575,
+      "step": 1010
+    },
+    {
+      "epoch": 9.227272727272727,
+      "grad_norm": 0.640625,
+      "learning_rate": 3.615784144005796e-06,
+      "loss": 0.3449,
+      "step": 1015
+    },
+    {
+      "epoch": 9.272727272727273,
+      "grad_norm": 0.61328125,
+      "learning_rate": 3.2051298603643753e-06,
+      "loss": 0.354,
+      "step": 1020
+    },
+    {
+      "epoch": 9.318181818181818,
+      "grad_norm": 0.64453125,
+      "learning_rate": 2.818843167645835e-06,
+      "loss": 0.3463,
+      "step": 1025
+    },
+    {
+      "epoch": 9.363636363636363,
+      "grad_norm": 0.65625,
+      "learning_rate": 2.4570213114592954e-06,
+      "loss": 0.351,
+      "step": 1030
+    },
+    {
+      "epoch": 9.409090909090908,
+      "grad_norm": 0.6640625,
+      "learning_rate": 2.119755378522137e-06,
+      "loss": 0.3594,
+      "step": 1035
+    },
+    {
+      "epoch": 9.454545454545455,
+      "grad_norm": 0.60546875,
+      "learning_rate": 1.8071302737293295e-06,
+      "loss": 0.3541,
+      "step": 1040
+    },
+    {
+      "epoch": 9.5,
+      "grad_norm": 0.66796875,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.36,
+      "step": 1045
+    },
+    {
+      "epoch": 9.545454545454545,
+      "grad_norm": 0.609375,
+      "learning_rate": 1.2561111323605712e-06,
+      "loss": 0.3573,
+      "step": 1050
+    },
+    {
+      "epoch": 9.590909090909092,
+      "grad_norm": 0.62109375,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.3601,
+      "step": 1055
+    },
+    {
+      "epoch": 9.636363636363637,
+      "grad_norm": 0.6171875,
+      "learning_rate": 8.04518716920466e-07,
+      "loss": 0.3574,
+      "step": 1060
+    },
+    {
+      "epoch": 9.681818181818182,
+      "grad_norm": 0.64453125,
+      "learning_rate": 6.161535538745878e-07,
+      "loss": 0.3664,
+      "step": 1065
+    },
+    {
+      "epoch": 9.727272727272727,
+      "grad_norm": 0.578125,
+      "learning_rate": 4.5280774269154115e-07,
+      "loss": 0.3542,
+      "step": 1070
+    },
+    {
+      "epoch": 9.772727272727273,
+      "grad_norm": 0.62109375,
+      "learning_rate": 3.145224048057727e-07,
+      "loss": 0.36,
+      "step": 1075
+    },
+    {
+      "epoch": 9.818181818181818,
+      "grad_norm": 0.63671875,
+      "learning_rate": 2.0133235281156736e-07,
+      "loss": 0.3565,
+      "step": 1080
+    },
+    {
+      "epoch": 9.863636363636363,
+      "grad_norm": 0.62109375,
+      "learning_rate": 1.1326608169920372e-07,
+      "loss": 0.3694,
+      "step": 1085
+    },
+    {
+      "epoch": 9.909090909090908,
+      "grad_norm": 0.62109375,
+      "learning_rate": 5.0345761681491746e-08,
+      "loss": 0.354,
+      "step": 1090
+    },
+    {
+      "epoch": 9.954545454545455,
+      "grad_norm": 0.59765625,
+      "learning_rate": 1.2587232612493172e-08,
+      "loss": 0.3516,
+      "step": 1095
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0,
+      "loss": 0.354,
+      "step": 1100
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 3.380370616912842,
+      "eval_runtime": 1.0228,
+      "eval_samples_per_second": 4.888,
+      "eval_steps_per_second": 1.955,
+      "step": 1100
     },
     {
+      "epoch": 10.0,
+      "step": 1100,
+      "total_flos": 1.6817604900715233e+18,
+      "train_loss": 1.8264882094209844,
+      "train_runtime": 8654.8771,
+      "train_samples_per_second": 2.032,
+      "train_steps_per_second": 0.127
     }
   ],
   "logging_steps": 5,
+  "max_steps": 1100,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 1.6817604900715233e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null