Model save

Browse files

Files changed (6) hide show

README.md +10 -7
adapter_model.safetensors +1 -1
all_results.json +6 -11
runs/May19_11-54-28_deep-diver-main-lucky-mouse-1-0-0/events.out.tfevents.1716134219.deep-diver-main-lucky-mouse-1-0-0.385.0 +2 -2
train_results.json +6 -6
trainer_state.json +900 -98

README.md CHANGED Viewed

@@ -2,13 +2,12 @@
 license: gemma
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
-- llama-duo/synth_summarize_dataset
 model-index:
 - name: gemma7b-summarize-gpt4o-30k
   results: []
@@ -17,12 +16,12 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/gg6giqaz)
 # gemma7b-summarize-gpt4o-30k
-This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.3412
 ## Model description
@@ -53,13 +52,17 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 1
 ### Training results
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.961         | 1.0   | 137  | 2.3412          |
 ### Framework versions

 license: gemma
 library_name: peft
 tags:
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
+- generator
 model-index:
 - name: gemma7b-summarize-gpt4o-30k
   results: []
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/ddvw2m8z)
 # gemma7b-summarize-gpt4o-30k
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 2.3811
 ## Model description
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 5
 ### Training results
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 0.9712        | 1.0   | 137  | 2.3077          |
+| 0.8675        | 2.0   | 274  | 2.2479          |
+| 0.7623        | 3.0   | 411  | 2.2756          |
+| 0.709         | 4.0   | 548  | 2.3417          |
+| 0.6601        | 5.0   | 685  | 2.3811          |
 ### Framework versions

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f5d08bade1b09675cae192cd2ae9dfc9b1209c9ac40ebb4ccf06ae69c64b21ea
 size 50056096

 version https://git-lfs.github.com/spec/v1
+oid sha256:9dead167941940b2975197a1f402c334b3606ea3d8e60a06c79db262f8105d00
 size 50056096

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
-    "epoch": 1.0,
-    "eval_loss": 2.341205358505249,
-    "eval_runtime": 1.0267,
-    "eval_samples": 25,
-    "eval_samples_per_second": 4.87,
-    "eval_steps_per_second": 1.948,
-    "total_flos": 2.0945562398778982e+17,
-    "train_loss": 4.772963228887015,
-    "train_runtime": 1080.9502,
     "train_samples": 29787,
-    "train_samples_per_second": 2.028,
-    "train_steps_per_second": 0.127
 }

 {
+    "epoch": 5.0,
+    "total_flos": 1.0472781231601746e+18,
+    "train_loss": 2.151051264783762,
+    "train_runtime": 5341.9856,
     "train_samples": 29787,
+    "train_samples_per_second": 2.052,
+    "train_steps_per_second": 0.128
 }

runs/May19_11-54-28_deep-diver-main-lucky-mouse-1-0-0/events.out.tfevents.1716134219.deep-diver-main-lucky-mouse-1-0-0.385.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b758f0942c1305f23836da55e9b5dd0158c5186be84c9830fcea14dacd4bc0c
-size 32101

 version https://git-lfs.github.com/spec/v1
+oid sha256:2b9bd7ceaaf98ed46022543b9806d0f0f89b6e1bbda57604b1783852001cedbc
+size 36313

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 1.0,
-    "total_flos": 2.0945562398778982e+17,
-    "train_loss": 4.772963228887015,
-    "train_runtime": 1080.9502,
     "train_samples": 29787,
-    "train_samples_per_second": 2.028,
-    "train_steps_per_second": 0.127
 }

 {
+    "epoch": 5.0,
+    "total_flos": 1.0472781231601746e+18,
+    "train_loss": 2.151051264783762,
+    "train_runtime": 5341.9856,
     "train_samples": 29787,
+    "train_samples_per_second": 2.052,
+    "train_steps_per_second": 0.128
 }

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 137,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -11,221 +11,1023 @@
     {
       "epoch": 0.0072992700729927005,
       "grad_norm": 708.0,
-      "learning_rate": 1.4285714285714285e-05,
       "loss": 56.8346,
       "step": 1
     },
     {
       "epoch": 0.0364963503649635,
-      "grad_norm": 368.0,
-      "learning_rate": 7.142857142857143e-05,
-      "loss": 45.8547,
       "step": 5
     },
     {
       "epoch": 0.072992700729927,
-      "grad_norm": 30.875,
-      "learning_rate": 0.00014285714285714287,
-      "loss": 19.792,
       "step": 10
     },
     {
       "epoch": 0.10948905109489052,
-      "grad_norm": 6.875,
-      "learning_rate": 0.00019996738360808565,
-      "loss": 15.0745,
       "step": 15
     },
     {
       "epoch": 0.145985401459854,
-      "grad_norm": 17.75,
-      "learning_rate": 0.00019882804237803488,
-      "loss": 13.3169,
       "step": 20
     },
     {
       "epoch": 0.18248175182481752,
-      "grad_norm": 117.0,
-      "learning_rate": 0.00019607909582962477,
-      "loss": 8.4463,
       "step": 25
     },
     {
       "epoch": 0.21897810218978103,
-      "grad_norm": 5.09375,
-      "learning_rate": 0.0001917653158603628,
-      "loss": 2.1599,
       "step": 30
     },
     {
       "epoch": 0.25547445255474455,
-      "grad_norm": 3.921875,
-      "learning_rate": 0.00018595696069872013,
-      "loss": 1.6643,
       "step": 35
     },
     {
       "epoch": 0.291970802919708,
-      "grad_norm": 2.21875,
-      "learning_rate": 0.00017874863061334657,
-      "loss": 1.4899,
       "step": 40
     },
     {
       "epoch": 0.3284671532846715,
-      "grad_norm": 2.1875,
-      "learning_rate": 0.00017025772716520323,
-      "loss": 1.3695,
       "step": 45
     },
     {
       "epoch": 0.36496350364963503,
-      "grad_norm": 2.0,
-      "learning_rate": 0.0001606225410966638,
-      "loss": 1.2794,
       "step": 50
     },
     {
       "epoch": 0.40145985401459855,
-      "grad_norm": 0.953125,
-      "learning_rate": 0.00015000000000000001,
-      "loss": 1.2388,
       "step": 55
     },
     {
       "epoch": 0.43795620437956206,
-      "grad_norm": 3.53125,
-      "learning_rate": 0.0001385631124488136,
-      "loss": 1.2269,
       "step": 60
     },
     {
       "epoch": 0.4744525547445255,
-      "grad_norm": 1.453125,
-      "learning_rate": 0.0001264981502196662,
-      "loss": 1.1434,
       "step": 65
     },
     {
       "epoch": 0.5109489051094891,
-      "grad_norm": 0.921875,
-      "learning_rate": 0.00011400161449686293,
-      "loss": 1.1033,
       "step": 70
     },
     {
       "epoch": 0.5474452554744526,
-      "grad_norm": 1.5703125,
-      "learning_rate": 0.00010127703547159739,
-      "loss": 1.0812,
       "step": 75
     },
     {
       "epoch": 0.583941605839416,
-      "grad_norm": 2.828125,
-      "learning_rate": 8.853165746015997e-05,
-      "loss": 1.061,
       "step": 80
     },
     {
       "epoch": 0.6204379562043796,
-      "grad_norm": 7.4375,
-      "learning_rate": 7.597306353045393e-05,
-      "loss": 1.0362,
       "step": 85
     },
     {
       "epoch": 0.656934306569343,
-      "grad_norm": 0.734375,
-      "learning_rate": 6.380579461128819e-05,
-      "loss": 1.0233,
       "step": 90
     },
     {
       "epoch": 0.6934306569343066,
-      "grad_norm": 1.296875,
-      "learning_rate": 5.222801814877369e-05,
-      "loss": 1.0499,
       "step": 95
     },
     {
       "epoch": 0.7299270072992701,
-      "grad_norm": 1.3125,
-      "learning_rate": 4.142830056718052e-05,
-      "loss": 0.995,
       "step": 100
     },
     {
       "epoch": 0.7664233576642335,
-      "grad_norm": 1.015625,
-      "learning_rate": 3.158253610095697e-05,
-      "loss": 0.9839,
       "step": 105
     },
     {
       "epoch": 0.8029197080291971,
-      "grad_norm": 0.765625,
-      "learning_rate": 2.2851082017805703e-05,
-      "loss": 0.9762,
       "step": 110
     },
     {
       "epoch": 0.8394160583941606,
-      "grad_norm": 0.63671875,
-      "learning_rate": 1.5376146891235598e-05,
-      "loss": 0.9773,
       "step": 115
     },
     {
       "epoch": 0.8759124087591241,
-      "grad_norm": 0.62890625,
-      "learning_rate": 9.279474459608805e-06,
-      "loss": 0.9756,
       "step": 120
     },
     {
       "epoch": 0.9124087591240876,
-      "grad_norm": 0.80859375,
-      "learning_rate": 4.660360794506946e-06,
-      "loss": 0.9625,
       "step": 125
     },
     {
       "epoch": 0.948905109489051,
-      "grad_norm": 0.57421875,
-      "learning_rate": 1.5940370726542863e-06,
-      "loss": 0.9625,
       "step": 130
     },
     {
       "epoch": 0.9854014598540146,
-      "grad_norm": 0.5625,
-      "learning_rate": 1.3044429107700318e-07,
-      "loss": 0.961,
       "step": 135
     },
     {
       "epoch": 1.0,
-      "eval_loss": 2.341205358505249,
-      "eval_runtime": 1.0018,
-      "eval_samples_per_second": 4.991,
-      "eval_steps_per_second": 1.996,
       "step": 137
     },
     {
-      "epoch": 1.0,
-      "step": 137,
-      "total_flos": 2.0945562398778982e+17,
-      "train_loss": 4.772963228887015,
-      "train_runtime": 1080.9502,
-      "train_samples_per_second": 2.028,
-      "train_steps_per_second": 0.127
     }
   ],
   "logging_steps": 5,
-  "max_steps": 137,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -239,7 +1041,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.0945562398778982e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 5.0,
   "eval_steps": 500,
+  "global_step": 685,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
     {
       "epoch": 0.0072992700729927005,
       "grad_norm": 708.0,
+      "learning_rate": 2.898550724637681e-06,
       "loss": 56.8346,
       "step": 1
     },
     {
       "epoch": 0.0364963503649635,
+      "grad_norm": 604.0,
+      "learning_rate": 1.4492753623188407e-05,
+      "loss": 52.9742,
       "step": 5
     },
     {
       "epoch": 0.072992700729927,
+      "grad_norm": 340.0,
+      "learning_rate": 2.8985507246376814e-05,
+      "loss": 39.0746,
       "step": 10
     },
     {
       "epoch": 0.10948905109489052,
+      "grad_norm": 40.25,
+      "learning_rate": 4.347826086956522e-05,
+      "loss": 20.8099,
       "step": 15
     },
     {
       "epoch": 0.145985401459854,
+      "grad_norm": 25.5,
+      "learning_rate": 5.797101449275363e-05,
+      "loss": 17.6144,
       "step": 20
     },
     {
       "epoch": 0.18248175182481752,
+      "grad_norm": 7.78125,
+      "learning_rate": 7.246376811594203e-05,
+      "loss": 15.3803,
       "step": 25
     },
     {
       "epoch": 0.21897810218978103,
+      "grad_norm": 6.40625,
+      "learning_rate": 8.695652173913044e-05,
+      "loss": 14.0798,
       "step": 30
     },
     {
       "epoch": 0.25547445255474455,
+      "grad_norm": 13.4375,
+      "learning_rate": 0.00010144927536231885,
+      "loss": 13.4032,
       "step": 35
     },
     {
       "epoch": 0.291970802919708,
+      "grad_norm": 41.0,
+      "learning_rate": 0.00011594202898550725,
+      "loss": 10.8827,
       "step": 40
     },
     {
       "epoch": 0.3284671532846715,
+      "grad_norm": 13.1875,
+      "learning_rate": 0.00013043478260869567,
+      "loss": 4.5915,
       "step": 45
     },
     {
       "epoch": 0.36496350364963503,
+      "grad_norm": 4.09375,
+      "learning_rate": 0.00014492753623188405,
+      "loss": 1.9,
       "step": 50
     },
     {
       "epoch": 0.40145985401459855,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.00015942028985507247,
+      "loss": 1.6474,
       "step": 55
     },
     {
       "epoch": 0.43795620437956206,
+      "grad_norm": 3.5,
+      "learning_rate": 0.00017391304347826088,
+      "loss": 1.477,
       "step": 60
     },
     {
       "epoch": 0.4744525547445255,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.00018840579710144927,
+      "loss": 1.3309,
       "step": 65
     },
     {
       "epoch": 0.5109489051094891,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00019999869950890106,
+      "loss": 1.2538,
       "step": 70
     },
     {
       "epoch": 0.5474452554744526,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.0001999531858720213,
+      "loss": 1.224,
       "step": 75
     },
     {
       "epoch": 0.583941605839416,
+      "grad_norm": 2.25,
+      "learning_rate": 0.00019984268150178167,
+      "loss": 1.1823,
       "step": 80
     },
     {
       "epoch": 0.6204379562043796,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00019966725824941932,
+      "loss": 1.1279,
       "step": 85
     },
     {
       "epoch": 0.656934306569343,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.00019942703017718975,
+      "loss": 1.127,
       "step": 90
     },
     {
       "epoch": 0.6934306569343066,
+      "grad_norm": 1.75,
+      "learning_rate": 0.000199122153484202,
+      "loss": 1.1284,
       "step": 95
     },
     {
       "epoch": 0.7299270072992701,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00019875282640485645,
+      "loss": 1.0566,
       "step": 100
     },
     {
       "epoch": 0.7664233576642335,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.0001983192890799503,
+      "loss": 1.0361,
       "step": 105
     },
     {
       "epoch": 0.8029197080291971,
+      "grad_norm": 2.5,
+      "learning_rate": 0.0001978218234005352,
+      "loss": 1.0371,
       "step": 110
     },
     {
       "epoch": 0.8394160583941606,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.00019726075282462845,
+      "loss": 1.0235,
       "step": 115
     },
     {
       "epoch": 0.8759124087591241,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00019663644216689683,
+      "loss": 0.996,
       "step": 120
     },
     {
       "epoch": 0.9124087591240876,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9734,
       "step": 125
     },
     {
       "epoch": 0.948905109489051,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00019519976519789616,
+      "loss": 0.978,
       "step": 130
     },
     {
       "epoch": 0.9854014598540146,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00019438833303083678,
+      "loss": 0.9712,
       "step": 135
     },
     {
       "epoch": 1.0,
+      "eval_loss": 2.307734489440918,
+      "eval_runtime": 0.9962,
+      "eval_samples_per_second": 5.019,
+      "eval_steps_per_second": 2.008,
       "step": 137
     },
     {
+      "epoch": 1.0218978102189782,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00019351552846298025,
+      "loss": 0.9374,
+      "step": 140
+    },
+    {
+      "epoch": 1.0583941605839415,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0001925819190020898,
+      "loss": 0.9173,
+      "step": 145
+    },
+    {
+      "epoch": 1.094890510948905,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00019158811169198313,
+      "loss": 0.8916,
+      "step": 150
+    },
+    {
+      "epoch": 1.1313868613138687,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001905347527178252,
+      "loss": 0.9418,
+      "step": 155
+    },
+    {
+      "epoch": 1.167883211678832,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00018942252698597113,
+      "loss": 0.9054,
+      "step": 160
+    },
+    {
+      "epoch": 1.2043795620437956,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00018825215767863214,
+      "loss": 0.9039,
+      "step": 165
+    },
+    {
+      "epoch": 1.2408759124087592,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00018702440578365387,
+      "loss": 0.9146,
+      "step": 170
+    },
+    {
+      "epoch": 1.2773722627737225,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00018574006959971333,
+      "loss": 0.8896,
+      "step": 175
+    },
+    {
+      "epoch": 1.313868613138686,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00018439998421725554,
+      "loss": 0.8947,
+      "step": 180
+    },
+    {
+      "epoch": 1.3503649635036497,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00018300502097550806,
+      "loss": 0.881,
+      "step": 185
+    },
+    {
+      "epoch": 1.3868613138686132,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00018155608689592604,
+      "loss": 0.8906,
+      "step": 190
+    },
+    {
+      "epoch": 1.4233576642335766,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00018005412409243606,
+      "loss": 0.8939,
+      "step": 195
+    },
+    {
+      "epoch": 1.4598540145985401,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001785001091588628,
+      "loss": 0.9016,
+      "step": 200
+    },
+    {
+      "epoch": 1.4963503649635037,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0001768950525339362,
+      "loss": 0.8943,
+      "step": 205
+    },
+    {
+      "epoch": 1.5328467153284673,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00017523999784429238,
+      "loss": 0.8614,
+      "step": 210
+    },
+    {
+      "epoch": 1.5693430656934306,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.00017353602122589527,
+      "loss": 0.8788,
+      "step": 215
+    },
+    {
+      "epoch": 1.6058394160583942,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0001717842306243205,
+      "loss": 0.8833,
+      "step": 220
+    },
+    {
+      "epoch": 1.6423357664233578,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00016998576507435618,
+      "loss": 0.8713,
+      "step": 225
+    },
+    {
+      "epoch": 1.6788321167883211,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00016814179395938913,
+      "loss": 0.8661,
+      "step": 230
+    },
+    {
+      "epoch": 1.7153284671532847,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00016625351625105796,
+      "loss": 0.8413,
+      "step": 235
+    },
+    {
+      "epoch": 1.7518248175182483,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0001643221597296679,
+      "loss": 0.8741,
+      "step": 240
+    },
+    {
+      "epoch": 1.7883211678832116,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 0.8744,
+      "step": 245
+    },
+    {
+      "epoch": 1.8248175182481752,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.00016033526060414842,
+      "loss": 0.8517,
+      "step": 250
+    },
+    {
+      "epoch": 1.8613138686131387,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00015828231032857503,
+      "loss": 0.8899,
+      "step": 255
+    },
+    {
+      "epoch": 1.897810218978102,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00015619146421149232,
+      "loss": 0.8537,
+      "step": 260
+    },
+    {
+      "epoch": 1.9343065693430657,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 0.8329,
+      "step": 265
+    },
+    {
+      "epoch": 1.9708029197080292,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.00015190154617979938,
+      "loss": 0.8675,
+      "step": 270
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.247941017150879,
+      "eval_runtime": 0.9979,
+      "eval_samples_per_second": 5.01,
+      "eval_steps_per_second": 2.004,
+      "step": 274
+    },
+    {
+      "epoch": 2.0072992700729926,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00014970526362019079,
+      "loss": 0.8435,
+      "step": 275
+    },
+    {
+      "epoch": 2.0437956204379564,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00014747666211540459,
+      "loss": 0.7774,
+      "step": 280
+    },
+    {
+      "epoch": 2.0802919708029197,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00014521719072826858,
+      "loss": 0.79,
+      "step": 285
+    },
+    {
+      "epoch": 2.116788321167883,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.00014292831859356997,
+      "loss": 0.7929,
+      "step": 290
+    },
+    {
+      "epoch": 2.153284671532847,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00014061153396280674,
+      "loss": 0.8032,
+      "step": 295
+    },
+    {
+      "epoch": 2.18978102189781,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.000138268343236509,
+      "loss": 0.7932,
+      "step": 300
+    },
+    {
+      "epoch": 2.2262773722627736,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.00013590026998475986,
+      "loss": 0.7657,
+      "step": 305
+    },
+    {
+      "epoch": 2.2627737226277373,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0001335088539565523,
+      "loss": 0.783,
+      "step": 310
+    },
+    {
+      "epoch": 2.2992700729927007,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.00013109565007862596,
+      "loss": 0.7755,
+      "step": 315
+    },
+    {
+      "epoch": 2.335766423357664,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0001286622274444361,
+      "loss": 0.7723,
+      "step": 320
+    },
+    {
+      "epoch": 2.372262773722628,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00012621016829391022,
+      "loss": 0.7739,
+      "step": 325
+    },
+    {
+      "epoch": 2.408759124087591,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00012374106698465732,
+      "loss": 0.7821,
+      "step": 330
+    },
+    {
+      "epoch": 2.445255474452555,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00012125652895529766,
+      "loss": 0.7852,
+      "step": 335
+    },
+    {
+      "epoch": 2.4817518248175183,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.00011875816968158815,
+      "loss": 0.7792,
+      "step": 340
+    },
+    {
+      "epoch": 2.5182481751824817,
+      "grad_norm": 0.625,
+      "learning_rate": 0.00011624761362602061,
+      "loss": 0.7799,
+      "step": 345
+    },
+    {
+      "epoch": 2.554744525547445,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00011372649318157749,
+      "loss": 0.7914,
+      "step": 350
+    },
+    {
+      "epoch": 2.591240875912409,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00011119644761033078,
+      "loss": 0.7847,
+      "step": 355
+    },
+    {
+      "epoch": 2.627737226277372,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0001086591219775746,
+      "loss": 0.8049,
+      "step": 360
+    },
+    {
+      "epoch": 2.664233576642336,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00010611616608218429,
+      "loss": 0.7865,
+      "step": 365
+    },
+    {
+      "epoch": 2.7007299270072993,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00010356923338389806,
+      "loss": 0.7908,
+      "step": 370
+    },
+    {
+      "epoch": 2.7372262773722627,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00010101997992821797,
+      "loss": 0.7925,
+      "step": 375
+    },
+    {
+      "epoch": 2.7737226277372264,
+      "grad_norm": 0.49609375,
+      "learning_rate": 9.847006326962974e-05,
+      "loss": 0.799,
+      "step": 380
+    },
+    {
+      "epoch": 2.81021897810219,
+      "grad_norm": 0.51171875,
+      "learning_rate": 9.592114139384145e-05,
+      "loss": 0.7832,
+      "step": 385
+    },
+    {
+      "epoch": 2.846715328467153,
+      "grad_norm": 0.7109375,
+      "learning_rate": 9.337487163974164e-05,
+      "loss": 0.7796,
+      "step": 390
+    },
+    {
+      "epoch": 2.883211678832117,
+      "grad_norm": 0.6328125,
+      "learning_rate": 9.083290962177828e-05,
+      "loss": 0.7839,
+      "step": 395
+    },
+    {
+      "epoch": 2.9197080291970803,
+      "grad_norm": 0.59765625,
+      "learning_rate": 8.829690815345886e-05,
+      "loss": 0.7781,
+      "step": 400
+    },
+    {
+      "epoch": 2.9562043795620436,
+      "grad_norm": 0.58203125,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7457,
+      "step": 405
+    },
+    {
+      "epoch": 2.9927007299270074,
+      "grad_norm": 0.6171875,
+      "learning_rate": 8.324937766952638e-05,
+      "loss": 0.7623,
+      "step": 410
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.275648355484009,
+      "eval_runtime": 0.9945,
+      "eval_samples_per_second": 5.028,
+      "eval_steps_per_second": 2.011,
+      "step": 411
+    },
+    {
+      "epoch": 3.0291970802919708,
+      "grad_norm": 0.8359375,
+      "learning_rate": 8.074113061741397e-05,
+      "loss": 0.7329,
+      "step": 415
+    },
+    {
+      "epoch": 3.065693430656934,
+      "grad_norm": 0.50390625,
+      "learning_rate": 7.824540590797568e-05,
+      "loss": 0.7052,
+      "step": 420
+    },
+    {
+      "epoch": 3.102189781021898,
+      "grad_norm": 0.5703125,
+      "learning_rate": 7.576382629067877e-05,
+      "loss": 0.7015,
+      "step": 425
+    },
+    {
+      "epoch": 3.1386861313868613,
+      "grad_norm": 0.6015625,
+      "learning_rate": 7.329800531768584e-05,
+      "loss": 0.696,
+      "step": 430
+    },
+    {
+      "epoch": 3.1751824817518246,
+      "grad_norm": 0.55078125,
+      "learning_rate": 7.084954629470417e-05,
+      "loss": 0.7154,
+      "step": 435
+    },
+    {
+      "epoch": 3.2116788321167884,
+      "grad_norm": 0.59765625,
+      "learning_rate": 6.842004123849752e-05,
+      "loss": 0.7113,
+      "step": 440
+    },
+    {
+      "epoch": 3.2481751824817517,
+      "grad_norm": 0.5625,
+      "learning_rate": 6.601106984173835e-05,
+      "loss": 0.7139,
+      "step": 445
+    },
+    {
+      "epoch": 3.2846715328467155,
+      "grad_norm": 0.59765625,
+      "learning_rate": 6.362419844587287e-05,
+      "loss": 0.6967,
+      "step": 450
+    },
+    {
+      "epoch": 3.321167883211679,
+      "grad_norm": 0.52734375,
+      "learning_rate": 6.126097902266772e-05,
+      "loss": 0.7073,
+      "step": 455
+    },
+    {
+      "epoch": 3.3576642335766422,
+      "grad_norm": 0.5625,
+      "learning_rate": 5.8922948165099524e-05,
+      "loss": 0.6857,
+      "step": 460
+    },
+    {
+      "epoch": 3.394160583941606,
+      "grad_norm": 0.55859375,
+      "learning_rate": 5.6611626088244194e-05,
+      "loss": 0.7199,
+      "step": 465
+    },
+    {
+      "epoch": 3.4306569343065694,
+      "grad_norm": 0.58203125,
+      "learning_rate": 5.432851564081534e-05,
+      "loss": 0.7075,
+      "step": 470
+    },
+    {
+      "epoch": 3.4671532846715327,
+      "grad_norm": 0.52734375,
+      "learning_rate": 5.207510132799436e-05,
+      "loss": 0.7006,
+      "step": 475
+    },
+    {
+      "epoch": 3.5036496350364965,
+      "grad_norm": 0.53515625,
+      "learning_rate": 4.9852848346187566e-05,
+      "loss": 0.7151,
+      "step": 480
+    },
+    {
+      "epoch": 3.54014598540146,
+      "grad_norm": 0.546875,
+      "learning_rate": 4.7663201630338816e-05,
+      "loss": 0.7129,
+      "step": 485
+    },
+    {
+      "epoch": 3.576642335766423,
+      "grad_norm": 0.5859375,
+      "learning_rate": 4.550758491441526e-05,
+      "loss": 0.7139,
+      "step": 490
+    },
+    {
+      "epoch": 3.613138686131387,
+      "grad_norm": 0.51953125,
+      "learning_rate": 4.3387399805679255e-05,
+      "loss": 0.7162,
+      "step": 495
+    },
+    {
+      "epoch": 3.6496350364963503,
+      "grad_norm": 0.55859375,
+      "learning_rate": 4.1304024873346705e-05,
+      "loss": 0.7132,
+      "step": 500
+    },
+    {
+      "epoch": 3.686131386861314,
+      "grad_norm": 0.57421875,
+      "learning_rate": 3.9258814752225284e-05,
+      "loss": 0.7007,
+      "step": 505
+    },
+    {
+      "epoch": 3.7226277372262775,
+      "grad_norm": 0.546875,
+      "learning_rate": 3.725309926191479e-05,
+      "loss": 0.7037,
+      "step": 510
+    },
+    {
+      "epoch": 3.759124087591241,
+      "grad_norm": 0.73828125,
+      "learning_rate": 3.528818254214329e-05,
+      "loss": 0.7255,
+      "step": 515
+    },
+    {
+      "epoch": 3.795620437956204,
+      "grad_norm": 0.52734375,
+      "learning_rate": 3.336534220479961e-05,
+      "loss": 0.6966,
+      "step": 520
+    },
+    {
+      "epoch": 3.832116788321168,
+      "grad_norm": 0.5078125,
+      "learning_rate": 3.1485828503215585e-05,
+      "loss": 0.7143,
+      "step": 525
+    },
+    {
+      "epoch": 3.8686131386861313,
+      "grad_norm": 0.6328125,
+      "learning_rate": 2.9650863519236418e-05,
+      "loss": 0.7005,
+      "step": 530
+    },
+    {
+      "epoch": 3.905109489051095,
+      "grad_norm": 0.5703125,
+      "learning_rate": 2.7861640368608844e-05,
+      "loss": 0.7005,
+      "step": 535
+    },
+    {
+      "epoch": 3.9416058394160585,
+      "grad_norm": 0.53125,
+      "learning_rate": 2.6119322425203197e-05,
+      "loss": 0.7139,
+      "step": 540
+    },
+    {
+      "epoch": 3.978102189781022,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.709,
+      "step": 545
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.341665267944336,
+      "eval_runtime": 0.9977,
+      "eval_samples_per_second": 5.012,
+      "eval_steps_per_second": 2.005,
+      "step": 548
+    },
+    {
+      "epoch": 4.014598540145985,
+      "grad_norm": 0.53515625,
+      "learning_rate": 2.277990242735185e-05,
+      "loss": 0.6801,
+      "step": 550
+    },
+    {
+      "epoch": 4.0510948905109485,
+      "grad_norm": 0.52734375,
+      "learning_rate": 2.118497170294195e-05,
+      "loss": 0.6495,
+      "step": 555
+    },
+    {
+      "epoch": 4.087591240875913,
+      "grad_norm": 0.5625,
+      "learning_rate": 1.9641287434001355e-05,
+      "loss": 0.672,
+      "step": 560
+    },
+    {
+      "epoch": 4.124087591240876,
+      "grad_norm": 0.55078125,
+      "learning_rate": 1.8149853342140645e-05,
+      "loss": 0.6611,
+      "step": 565
+    },
+    {
+      "epoch": 4.160583941605839,
+      "grad_norm": 0.59375,
+      "learning_rate": 1.671163917529285e-05,
+      "loss": 0.662,
+      "step": 570
+    },
+    {
+      "epoch": 4.197080291970803,
+      "grad_norm": 0.51171875,
+      "learning_rate": 1.5327580077171587e-05,
+      "loss": 0.6635,
+      "step": 575
+    },
+    {
+      "epoch": 4.233576642335766,
+      "grad_norm": 0.54296875,
+      "learning_rate": 1.3998575979229944e-05,
+      "loss": 0.6624,
+      "step": 580
+    },
+    {
+      "epoch": 4.2700729927007295,
+      "grad_norm": 0.50390625,
+      "learning_rate": 1.272549101551438e-05,
+      "loss": 0.6523,
+      "step": 585
+    },
+    {
+      "epoch": 4.306569343065694,
+      "grad_norm": 0.51171875,
+      "learning_rate": 1.1509152960794666e-05,
+      "loss": 0.6607,
+      "step": 590
+    },
+    {
+      "epoch": 4.343065693430657,
+      "grad_norm": 0.546875,
+      "learning_rate": 1.035035269233493e-05,
+      "loss": 0.6626,
+      "step": 595
+    },
+    {
+      "epoch": 4.37956204379562,
+      "grad_norm": 0.54296875,
+      "learning_rate": 9.249843675656212e-06,
+      "loss": 0.678,
+      "step": 600
+    },
+    {
+      "epoch": 4.416058394160584,
+      "grad_norm": 0.5234375,
+      "learning_rate": 8.208341474624071e-06,
+      "loss": 0.6783,
+      "step": 605
+    },
+    {
+      "epoch": 4.452554744525547,
+      "grad_norm": 0.53515625,
+      "learning_rate": 7.226523286180776e-06,
+      "loss": 0.6699,
+      "step": 610
+    },
+    {
+      "epoch": 4.489051094890511,
+      "grad_norm": 0.5703125,
+      "learning_rate": 6.3050275000238414e-06,
+      "loss": 0.6607,
+      "step": 615
+    },
+    {
+      "epoch": 4.525547445255475,
+      "grad_norm": 0.5234375,
+      "learning_rate": 5.4444532835175144e-06,
+      "loss": 0.6702,
+      "step": 620
+    },
+    {
+      "epoch": 4.562043795620438,
+      "grad_norm": 0.5234375,
+      "learning_rate": 4.6453601921072395e-06,
+      "loss": 0.6793,
+      "step": 625
+    },
+    {
+      "epoch": 4.598540145985401,
+      "grad_norm": 0.5234375,
+      "learning_rate": 3.908267805490051e-06,
+      "loss": 0.6622,
+      "step": 630
+    },
+    {
+      "epoch": 4.635036496350365,
+      "grad_norm": 0.54296875,
+      "learning_rate": 3.233655389777801e-06,
+      "loss": 0.677,
+      "step": 635
+    },
+    {
+      "epoch": 4.671532846715328,
+      "grad_norm": 0.5234375,
+      "learning_rate": 2.62196158587269e-06,
+      "loss": 0.6588,
+      "step": 640
+    },
+    {
+      "epoch": 4.708029197080292,
+      "grad_norm": 0.5234375,
+      "learning_rate": 2.073584124257899e-06,
+      "loss": 0.6621,
+      "step": 645
+    },
+    {
+      "epoch": 4.744525547445256,
+      "grad_norm": 0.53515625,
+      "learning_rate": 1.5888795663883904e-06,
+      "loss": 0.6655,
+      "step": 650
+    },
+    {
+      "epoch": 4.781021897810219,
+      "grad_norm": 0.515625,
+      "learning_rate": 1.1681630728506699e-06,
+      "loss": 0.6653,
+      "step": 655
+    },
+    {
+      "epoch": 4.817518248175182,
+      "grad_norm": 0.52734375,
+      "learning_rate": 8.117081984415298e-07,
+      "loss": 0.6734,
+      "step": 660
+    },
+    {
+      "epoch": 4.854014598540146,
+      "grad_norm": 0.5390625,
+      "learning_rate": 5.19746714299596e-07,
+      "loss": 0.6541,
+      "step": 665
+    },
+    {
+      "epoch": 4.89051094890511,
+      "grad_norm": 0.5390625,
+      "learning_rate": 2.9246845720496407e-07,
+      "loss": 0.6722,
+      "step": 670
+    },
+    {
+      "epoch": 4.927007299270073,
+      "grad_norm": 0.55859375,
+      "learning_rate": 1.300212061451367e-07,
+      "loss": 0.6472,
+      "step": 675
+    },
+    {
+      "epoch": 4.963503649635037,
+      "grad_norm": 0.51953125,
+      "learning_rate": 3.251058622737446e-08,
+      "loss": 0.667,
+      "step": 680
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0,
+      "loss": 0.6601,
+      "step": 685
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.3811252117156982,
+      "eval_runtime": 0.9953,
+      "eval_samples_per_second": 5.024,
+      "eval_steps_per_second": 2.01,
+      "step": 685
+    },
+    {
+      "epoch": 5.0,
+      "step": 685,
+      "total_flos": 1.0472781231601746e+18,
+      "train_loss": 2.151051264783762,
+      "train_runtime": 5341.9856,
+      "train_samples_per_second": 2.052,
+      "train_steps_per_second": 0.128
     }
   ],
   "logging_steps": 5,
+  "max_steps": 685,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 1.0472781231601746e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null