Model save

Browse files

Files changed (12) hide show

.gitattributes +1 -0
README.md +69 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
all_results.json +8 -0
runs/Apr26_10-12-14_COE-CS-sv003/events.out.tfevents.1714126416.COE-CS-sv003.480309.0 +3 -0
special_tokens_map.json +28 -0
tokenizer.json +3 -0
tokenizer_config.json +70 -0
train_results.json +8 -0
trainer_state.json +2791 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: gemma
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: google/gemma-7b
+model-index:
+- name: zephyr-7b-gemma-sft-5p-2048
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# zephyr-7b-gemma-sft-5p-2048
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on an unspecified dataset (no dataset name was recorded by the Trainer).
+It achieves the following results on the evaluation set:
+- Loss: 1.1822 (final epoch; the lowest validation loss in the table below, 1.1708, was reached at epoch 2)
+## Model description
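+A LoRA adapter (PEFT; rank 6, alpha 8, dropout 0.05, applied to the `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj` and `down_proj` modules) for [google/gemma-7b](https://huggingface.co/google/gemma-7b), trained with TRL supervised fine-tuning. The tokenizer uses a ChatML-style chat template with `<|im_start|>` / `<|im_end|>` markers and a maximum length of 2048 tokens. More information needed.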
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 3
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.9062        | 1.0   | 651  | 1.2442          |
+| 0.907         | 2.0   | 1303 | 1.1708          |
+| 0.8209        | 3.0   | 1953 | 1.1822          |
+### Framework versions
+- PEFT 0.7.1
+- Transformers 4.39.0.dev0
+- Pytorch 2.1.2
+- Datasets 2.14.6
+- Tokenizers 0.15.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 6,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2b089dac39d843b1ff4c1788a5cf2494adbc0a1c5f554b886906cd9d48aa612
+size 37555048

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "train_loss": 1.1336081770219621,
+    "train_runtime": 21652.8936,
+    "train_samples": 20842,
+    "train_samples_per_second": 2.888,
+    "train_steps_per_second": 0.09
+}

runs/Apr26_10-12-14_COE-CS-sv003/events.out.tfevents.1714126416.COE-CS-sv003.480309.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39c4df3285df824e39695aeb563c785b14a88c9a4f0eaad634c78e2a75accd96
+size 88370

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:299f8e59a5c4a3b3941dbe1159a7079d69c7a8d5ca34322ace1be9140ae76cc0
+size 17477572

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": null,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "train_loss": 1.1336081770219621,
+    "train_runtime": 21652.8936,
+    "train_samples": 20842,
+    "train_samples_per_second": 2.888,
+    "train_steps_per_second": 0.09
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2791 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.997697620874904,
+  "eval_steps": 500,
+  "global_step": 1953,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 72.47212998792584,
+      "learning_rate": 1.020408163265306e-06,
+      "loss": 17.0139,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 72.19709441394318,
+      "learning_rate": 5.102040816326531e-06,
+      "loss": 16.8099,
+      "step": 5
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 74.11072818499085,
+      "learning_rate": 1.0204081632653061e-05,
+      "loss": 16.7768,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 67.88747859998395,
+      "learning_rate": 1.5306122448979594e-05,
+      "loss": 15.8745,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 57.15585576506464,
+      "learning_rate": 2.0408163265306123e-05,
+      "loss": 13.4197,
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 52.16705016348157,
+      "learning_rate": 2.5510204081632654e-05,
+      "loss": 10.8782,
+      "step": 25
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 34.872891212782,
+      "learning_rate": 3.061224489795919e-05,
+      "loss": 8.0236,
+      "step": 30
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 20.96608544121856,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 5.7602,
+      "step": 35
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 14.39165011334111,
+      "learning_rate": 4.0816326530612245e-05,
+      "loss": 4.4129,
+      "step": 40
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 9.537119159153542,
+      "learning_rate": 4.591836734693878e-05,
+      "loss": 3.4241,
+      "step": 45
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 10.051565364855378,
+      "learning_rate": 5.102040816326531e-05,
+      "loss": 2.7711,
+      "step": 50
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 5.902644991920862,
+      "learning_rate": 5.6122448979591836e-05,
+      "loss": 2.1132,
+      "step": 55
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 5.282533867200368,
+      "learning_rate": 6.122448979591838e-05,
+      "loss": 1.7932,
+      "step": 60
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.849440118853736,
+      "learning_rate": 6.63265306122449e-05,
+      "loss": 1.5357,
+      "step": 65
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 6.395981446361856,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 1.4488,
+      "step": 70
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9290320550828686,
+      "learning_rate": 7.653061224489796e-05,
+      "loss": 1.3654,
+      "step": 75
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9509560155111505,
+      "learning_rate": 8.163265306122449e-05,
+      "loss": 1.2287,
+      "step": 80
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.9800497930938712,
+      "learning_rate": 8.673469387755102e-05,
+      "loss": 1.2863,
+      "step": 85
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.7120992753385207,
+      "learning_rate": 9.183673469387756e-05,
+      "loss": 1.1712,
+      "step": 90
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.2653861656665453,
+      "learning_rate": 9.693877551020408e-05,
+      "loss": 1.2174,
+      "step": 95
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 3.453167617907916,
+      "learning_rate": 0.00010204081632653062,
+      "loss": 1.1714,
+      "step": 100
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.547620666252296,
+      "learning_rate": 0.00010714285714285715,
+      "loss": 1.1168,
+      "step": 105
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.6713272237258214,
+      "learning_rate": 0.00011224489795918367,
+      "loss": 1.0868,
+      "step": 110
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.5402309108002492,
+      "learning_rate": 0.00011734693877551022,
+      "loss": 1.1351,
+      "step": 115
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.5970274628335959,
+      "learning_rate": 0.00012244897959183676,
+      "loss": 1.0946,
+      "step": 120
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.6342350847388514,
+      "learning_rate": 0.00012755102040816328,
+      "loss": 1.1211,
+      "step": 125
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.6632080501970268,
+      "learning_rate": 0.0001326530612244898,
+      "loss": 1.0501,
+      "step": 130
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.4060120288881937,
+      "learning_rate": 0.00013775510204081635,
+      "loss": 1.0956,
+      "step": 135
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.801531788644497,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.072,
+      "step": 140
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.520734386032582,
+      "learning_rate": 0.0001479591836734694,
+      "loss": 1.1119,
+      "step": 145
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.840843283748014,
+      "learning_rate": 0.0001530612244897959,
+      "loss": 0.9969,
+      "step": 150
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.07792070441396,
+      "learning_rate": 0.00015816326530612246,
+      "loss": 1.0208,
+      "step": 155
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9723613912374163,
+      "learning_rate": 0.00016326530612244898,
+      "loss": 0.9975,
+      "step": 160
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.8108507027898844,
+      "learning_rate": 0.00016836734693877553,
+      "loss": 1.0388,
+      "step": 165
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.049204774598262,
+      "learning_rate": 0.00017346938775510205,
+      "loss": 0.9837,
+      "step": 170
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.013771271987093,
+      "learning_rate": 0.0001785714285714286,
+      "loss": 0.956,
+      "step": 175
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.6734633410801107,
+      "learning_rate": 0.00018367346938775512,
+      "loss": 1.0388,
+      "step": 180
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.3629530195645043,
+      "learning_rate": 0.00018877551020408164,
+      "loss": 1.0407,
+      "step": 185
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0061060540694267,
+      "learning_rate": 0.00019387755102040816,
+      "loss": 1.0528,
+      "step": 190
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8957278214013087,
+      "learning_rate": 0.0001989795918367347,
+      "loss": 1.0637,
+      "step": 195
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.659431161796715,
+      "learning_rate": 0.00019999744233089168,
+      "loss": 1.0197,
+      "step": 200
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9623902009306882,
+      "learning_rate": 0.00019998705202436978,
+      "loss": 1.0171,
+      "step": 205
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.666768664331948,
+      "learning_rate": 0.0001999686700559419,
+      "loss": 0.9411,
+      "step": 210
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.6263956387135465,
+      "learning_rate": 0.00019994229789482308,
+      "loss": 1.0452,
+      "step": 215
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.104534739444239,
+      "learning_rate": 0.00019990793764886012,
+      "loss": 1.0318,
+      "step": 220
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.5223676518998526,
+      "learning_rate": 0.0001998655920643634,
+      "loss": 0.9393,
+      "step": 225
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.399019073099987,
+      "learning_rate": 0.000199815264525887,
+      "loss": 0.97,
+      "step": 230
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8651896389488705,
+      "learning_rate": 0.00019975695905595855,
+      "loss": 1.0187,
+      "step": 235
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.536205961219932,
+      "learning_rate": 0.00019969068031475744,
+      "loss": 0.9716,
+      "step": 240
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.7078218228495357,
+      "learning_rate": 0.0001996164335997425,
+      "loss": 0.9959,
+      "step": 245
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.3467510953394168,
+      "learning_rate": 0.0001995342248452285,
+      "loss": 0.9602,
+      "step": 250
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9084544882836239,
+      "learning_rate": 0.00019944406062191204,
+      "loss": 0.9775,
+      "step": 255
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.5574610066347279,
+      "learning_rate": 0.000199345948136346,
+      "loss": 1.0177,
+      "step": 260
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.109450536237562,
+      "learning_rate": 0.00019923989523036394,
+      "loss": 0.9819,
+      "step": 265
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.7828828000504573,
+      "learning_rate": 0.00019912591038045307,
+      "loss": 0.9707,
+      "step": 270
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.3138848135299093,
+      "learning_rate": 0.0001990040026970768,
+      "loss": 0.9909,
+      "step": 275
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.5708872027230352,
+      "learning_rate": 0.0001988741819239467,
+      "loss": 0.9744,
+      "step": 280
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.7915877912980807,
+      "learning_rate": 0.0001987364584372435,
+      "loss": 0.9629,
+      "step": 285
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.7688040681479107,
+      "learning_rate": 0.00019859084324478791,
+      "loss": 1.0066,
+      "step": 290
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.2087916028360217,
+      "learning_rate": 0.00019843734798516077,
+      "loss": 0.9273,
+      "step": 295
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.7217374396550518,
+      "learning_rate": 0.00019827598492677283,
+      "loss": 0.9599,
+      "step": 300
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.1827063296813014,
+      "learning_rate": 0.000198106766966884,
+      "loss": 0.9526,
+      "step": 305
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.275288932081838,
+      "learning_rate": 0.0001979297076305728,
+      "loss": 0.9351,
+      "step": 310
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.4572732094413907,
+      "learning_rate": 0.00019774482106965513,
+      "loss": 0.9916,
+      "step": 315
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.3548000574556105,
+      "learning_rate": 0.00019755212206155318,
+      "loss": 0.9894,
+      "step": 320
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.4221295720770253,
+      "learning_rate": 0.00019735162600811447,
+      "loss": 0.9147,
+      "step": 325
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.1147927135270754,
+      "learning_rate": 0.00019714334893438062,
+      "loss": 0.9173,
+      "step": 330
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.5740508551311372,
+      "learning_rate": 0.00019692730748730662,
+      "loss": 1.0049,
+      "step": 335
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.4361894716708776,
+      "learning_rate": 0.0001967035189344303,
+      "loss": 0.9772,
+      "step": 340
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.5979099481664976,
+      "learning_rate": 0.00019647200116249214,
+      "loss": 0.9734,
+      "step": 345
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.1416579629890438,
+      "learning_rate": 0.00019623277267600574,
+      "loss": 0.9695,
+      "step": 350
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.2859502643647958,
+      "learning_rate": 0.0001959858525957786,
+      "loss": 0.9726,
+      "step": 355
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.654528585364595,
+      "learning_rate": 0.00019573126065738415,
+      "loss": 0.9099,
+      "step": 360
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.6432235938460993,
+      "learning_rate": 0.00019546901720958405,
+      "loss": 0.993,
+      "step": 365
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.184408195362666,
+      "learning_rate": 0.00019519914321270196,
+      "loss": 0.983,
+      "step": 370
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.364894780646226,
+      "learning_rate": 0.00019492166023694823,
+      "loss": 0.9385,
+      "step": 375
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.2140596195120288,
+      "learning_rate": 0.0001946365904606957,
+      "loss": 0.928,
+      "step": 380
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.2023189029574775,
+      "learning_rate": 0.00019434395666870734,
+      "loss": 0.9497,
+      "step": 385
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.5293781545066814,
+      "learning_rate": 0.00019404378225031482,
+      "loss": 0.9845,
+      "step": 390
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.2719601216605032,
+      "learning_rate": 0.00019373609119754926,
+      "loss": 0.9535,
+      "step": 395
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.1767287730617413,
+      "learning_rate": 0.00019342090810322361,
+      "loss": 0.9669,
+      "step": 400
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.5092663681250142,
+      "learning_rate": 0.00019309825815896697,
+      "loss": 0.9097,
+      "step": 405
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.065331569108817,
+      "learning_rate": 0.00019276816715321107,
+      "loss": 0.9257,
+      "step": 410
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.0551712975999572,
+      "learning_rate": 0.00019243066146912914,
+      "loss": 0.9179,
+      "step": 415
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.280851911643696,
+      "learning_rate": 0.00019208576808252726,
+      "loss": 0.9322,
+      "step": 420
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.1957795064970969,
+      "learning_rate": 0.00019173351455968805,
+      "loss": 0.9472,
+      "step": 425
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.6691262118179164,
+      "learning_rate": 0.00019137392905516757,
+      "loss": 0.9833,
+      "step": 430
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.2087456453830114,
+      "learning_rate": 0.0001910070403095449,
+      "loss": 0.9554,
+      "step": 435
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.2175939573659607,
+      "learning_rate": 0.00019063287764712513,
+      "loss": 0.9844,
+      "step": 440
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.4498433195544413,
+      "learning_rate": 0.00019025147097359528,
+      "loss": 0.9467,
+      "step": 445
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.429864892305334,
+      "learning_rate": 0.00018986285077363446,
+      "loss": 0.9309,
+      "step": 450
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.170651014792718,
+      "learning_rate": 0.00018946704810847689,
+      "loss": 0.9234,
+      "step": 455
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.3282215747364223,
+      "learning_rate": 0.00018906409461342952,
+      "loss": 0.9536,
+      "step": 460
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.1845632567557303,
+      "learning_rate": 0.00018865402249534347,
+      "loss": 0.9772,
+      "step": 465
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.2127271540342577,
+      "learning_rate": 0.00018823686453003973,
+      "loss": 0.9523,
+      "step": 470
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.2360135960506637,
+      "learning_rate": 0.00018781265405968972,
+      "loss": 0.9135,
+      "step": 475
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.5041667778892749,
+      "learning_rate": 0.0001873814249901501,
+      "loss": 0.9625,
+      "step": 480
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.1504429942546481,
+      "learning_rate": 0.00018694321178825286,
+      "loss": 0.9363,
+      "step": 485
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.0319491329942774,
+      "learning_rate": 0.00018649804947905055,
+      "loss": 0.9054,
+      "step": 490
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.2204274089754297,
+      "learning_rate": 0.0001860459736430169,
+      "loss": 0.9635,
+      "step": 495
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.0611368485238843,
+      "learning_rate": 0.00018558702041320273,
+      "loss": 0.9445,
+      "step": 500
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.1018739389426906,
+      "learning_rate": 0.00018512122647234812,
+      "loss": 0.9289,
+      "step": 505
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.9882389573927103,
+      "learning_rate": 0.0001846486290499505,
+      "loss": 0.9911,
+      "step": 510
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.2994396811323112,
+      "learning_rate": 0.0001841692659192889,
+      "loss": 0.9264,
+      "step": 515
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.5236972647383147,
+      "learning_rate": 0.00018368317539440492,
+      "loss": 0.9563,
+      "step": 520
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.1383444262695794,
+      "learning_rate": 0.0001831903963270404,
+      "loss": 0.977,
+      "step": 525
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.110789605109835,
+      "learning_rate": 0.00018269096810353205,
+      "loss": 0.9388,
+      "step": 530
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6690032269651018,
+      "learning_rate": 0.00018218493064166353,
+      "loss": 0.923,
+      "step": 535
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.2741603156202803,
+      "learning_rate": 0.00018167232438747485,
+      "loss": 0.959,
+      "step": 540
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.1359315426783947,
+      "learning_rate": 0.00018115319031202965,
+      "loss": 0.958,
+      "step": 545
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.1462965424102658,
+      "learning_rate": 0.00018062756990814058,
+      "loss": 0.9206,
+      "step": 550
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.9795818536509971,
+      "learning_rate": 0.00018009550518705285,
+      "loss": 0.9027,
+      "step": 555
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.9867688236460216,
+      "learning_rate": 0.00017955703867508633,
+      "loss": 0.9283,
+      "step": 560
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.0077012911846719,
+      "learning_rate": 0.00017901221341023673,
+      "loss": 0.9516,
+      "step": 565
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.3421028406940887,
+      "learning_rate": 0.00017846107293873555,
+      "loss": 0.9121,
+      "step": 570
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.4547843706488663,
+      "learning_rate": 0.0001779036613115696,
+      "loss": 0.8875,
+      "step": 575
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.1060535265201301,
+      "learning_rate": 0.00017734002308096014,
+      "loss": 0.9554,
+      "step": 580
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.5055847688809278,
+      "learning_rate": 0.00017677020329680203,
+      "loss": 0.9173,
+      "step": 585
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.1611583032421355,
+      "learning_rate": 0.00017619424750306287,
+      "loss": 0.9086,
+      "step": 590
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.1353445717142476,
+      "learning_rate": 0.00017561220173414297,
+      "loss": 0.967,
+      "step": 595
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.9563792857101335,
+      "learning_rate": 0.00017502411251119586,
+      "loss": 0.9155,
+      "step": 600
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.1337314999879342,
+      "learning_rate": 0.00017443002683841002,
+      "loss": 0.8905,
+      "step": 605
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.0287944149688226,
+      "learning_rate": 0.00017382999219925203,
+      "loss": 0.9092,
+      "step": 610
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.2022884801513027,
+      "learning_rate": 0.00017322405655267122,
+      "loss": 0.8703,
+      "step": 615
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.2018380484271611,
+      "learning_rate": 0.0001726122683292667,
+      "loss": 0.9769,
+      "step": 620
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.2278521149338097,
+      "learning_rate": 0.0001719946764274162,
+      "loss": 0.9632,
+      "step": 625
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.132615620517034,
+      "learning_rate": 0.00017137133020936782,
+      "loss": 0.943,
+      "step": 630
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.0891584621905475,
+      "learning_rate": 0.00017074227949729481,
+      "loss": 0.9249,
+      "step": 635
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.1417202547948113,
+      "learning_rate": 0.00017010757456931334,
+      "loss": 0.9055,
+      "step": 640
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.9346881604970835,
+      "learning_rate": 0.0001694672661554638,
+      "loss": 0.9336,
+      "step": 645
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.1301994877879067,
+      "learning_rate": 0.0001688214054336563,
+      "loss": 0.9062,
+      "step": 650
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2441989183425903,
+      "eval_runtime": 252.9523,
+      "eval_samples_per_second": 9.132,
+      "eval_steps_per_second": 0.573,
+      "step": 651
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.179898601273253,
+      "learning_rate": 0.00016817004402558012,
+      "loss": 0.9027,
+      "step": 655
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.9417315541112381,
+      "learning_rate": 0.0001675132339925776,
+      "loss": 0.9119,
+      "step": 660
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.0837703721555252,
+      "learning_rate": 0.0001668510278314833,
+      "loss": 0.86,
+      "step": 665
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.1272174437541118,
+      "learning_rate": 0.00016618347847042778,
+      "loss": 0.8711,
+      "step": 670
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.1359544360875102,
+      "learning_rate": 0.00016551063926460748,
+      "loss": 0.8776,
+      "step": 675
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.1917117624970714,
+      "learning_rate": 0.00016483256399202006,
+      "loss": 0.9209,
+      "step": 680
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.0249074428213159,
+      "learning_rate": 0.00016414930684916613,
+      "loss": 0.8196,
+      "step": 685
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.1816776012865573,
+      "learning_rate": 0.00016346092244671746,
+      "loss": 0.8279,
+      "step": 690
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.077779310150086,
+      "learning_rate": 0.00016276746580515218,
+      "loss": 0.8997,
+      "step": 695
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.2009550281356633,
+      "learning_rate": 0.00016206899235035702,
+      "loss": 0.8938,
+      "step": 700
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.0925262506944138,
+      "learning_rate": 0.00016136555790919748,
+      "loss": 0.8856,
+      "step": 705
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.0198591870033458,
+      "learning_rate": 0.0001606572187050556,
+      "loss": 0.8978,
+      "step": 710
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.1119975974231693,
+      "learning_rate": 0.0001599440313533363,
+      "loss": 0.8876,
+      "step": 715
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.190419344020418,
+      "learning_rate": 0.00015922605285694215,
+      "loss": 0.904,
+      "step": 720
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 2.1639792062638574,
+      "learning_rate": 0.0001585033406017175,
+      "loss": 0.9021,
+      "step": 725
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.2040942925245026,
+      "learning_rate": 0.0001577759523518616,
+      "loss": 0.9092,
+      "step": 730
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.0704322799863495,
+      "learning_rate": 0.00015704394624531184,
+      "loss": 0.8274,
+      "step": 735
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.0627822260269795,
+      "learning_rate": 0.00015630738078909685,
+      "loss": 0.8776,
+      "step": 740
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.9905597794182393,
+      "learning_rate": 0.00015556631485466027,
+      "loss": 0.9246,
+      "step": 745
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.173456919024251,
+      "learning_rate": 0.00015482080767315528,
+      "loss": 0.9656,
+      "step": 750
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.0873012836166025,
+      "learning_rate": 0.00015407091883071054,
+      "loss": 0.9464,
+      "step": 755
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.1473043693064626,
+      "learning_rate": 0.00015331670826366754,
+      "loss": 0.8496,
+      "step": 760
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.985606297895901,
+      "learning_rate": 0.00015255823625379017,
+      "loss": 0.853,
+      "step": 765
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.231267953892478,
+      "learning_rate": 0.00015179556342344644,
+      "loss": 0.8652,
+      "step": 770
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.0503811328789359,
+      "learning_rate": 0.00015102875073076324,
+      "loss": 0.9447,
+      "step": 775
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.0561678099502803,
+      "learning_rate": 0.00015025785946475408,
+      "loss": 0.879,
+      "step": 780
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.9127623837677151,
+      "learning_rate": 0.00014948295124042057,
+      "loss": 0.9144,
+      "step": 785
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.147282495230838,
+      "learning_rate": 0.00014870408799382752,
+      "loss": 0.9404,
+      "step": 790
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.0608780257484842,
+      "learning_rate": 0.00014792133197715266,
+      "loss": 0.9021,
+      "step": 795
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.2163076965365305,
+      "learning_rate": 0.0001471347457537111,
+      "loss": 0.915,
+      "step": 800
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.1177869922249701,
+      "learning_rate": 0.00014634439219295478,
+      "loss": 0.8648,
+      "step": 805
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.194327396103283,
+      "learning_rate": 0.0001455503344654474,
+      "loss": 0.9526,
+      "step": 810
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.0905268007576805,
+      "learning_rate": 0.00014475263603781554,
+      "loss": 0.8757,
+      "step": 815
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.9152140993127001,
+      "learning_rate": 0.0001439513606676759,
+      "loss": 0.8722,
+      "step": 820
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.9900853991789728,
+      "learning_rate": 0.00014314657239853927,
+      "loss": 0.8669,
+      "step": 825
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.0767926512086485,
+      "learning_rate": 0.000142338335554692,
+      "loss": 0.8841,
+      "step": 830
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.8724281805811482,
+      "learning_rate": 0.00014152671473605428,
+      "loss": 0.826,
+      "step": 835
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.9929237657179188,
+      "learning_rate": 0.0001407117748130174,
+      "loss": 0.8765,
+      "step": 840
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.0329119732814056,
+      "learning_rate": 0.00013989358092125843,
+      "loss": 0.879,
+      "step": 845
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.8699115976888,
+      "learning_rate": 0.00013907219845653442,
+      "loss": 0.8871,
+      "step": 850
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.0311070826655615,
+      "learning_rate": 0.00013824769306945532,
+      "loss": 0.9038,
+      "step": 855
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.2265447591069667,
+      "learning_rate": 0.00013742013066023678,
+      "loss": 0.8918,
+      "step": 860
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.2487739691504016,
+      "learning_rate": 0.00013658957737343298,
+      "loss": 0.8986,
+      "step": 865
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.9999833105305884,
+      "learning_rate": 0.00013575609959264994,
+      "loss": 0.9054,
+      "step": 870
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.0265246633147944,
+      "learning_rate": 0.0001349197639352395,
+      "loss": 0.8781,
+      "step": 875
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.2752835605974824,
+      "learning_rate": 0.00013408063724697499,
+      "loss": 0.9096,
+      "step": 880
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.0841900048671298,
+      "learning_rate": 0.00013323878659670836,
+      "loss": 0.8954,
+      "step": 885
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.9822221386317229,
+      "learning_rate": 0.00013239427927100964,
+      "loss": 0.9197,
+      "step": 890
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.0664898243823369,
+      "learning_rate": 0.00013154718276878872,
+      "loss": 0.8101,
+      "step": 895
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.9728647259678721,
+      "learning_rate": 0.00013069756479590065,
+      "loss": 0.8808,
+      "step": 900
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.233908623068181,
+      "learning_rate": 0.00012984549325973394,
+      "loss": 0.8942,
+      "step": 905
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0507024710570716,
+      "learning_rate": 0.000128991036263783,
+      "loss": 0.8719,
+      "step": 910
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.9728551705551435,
+      "learning_rate": 0.0001281342621022048,
+      "loss": 0.8734,
+      "step": 915
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.9127829023580948,
+      "learning_rate": 0.00012727523925436026,
+      "loss": 0.8641,
+      "step": 920
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.114858935845165,
+      "learning_rate": 0.00012641403637934112,
+      "loss": 0.8989,
+      "step": 925
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.0021048323600934,
+      "learning_rate": 0.00012555072231048192,
+      "loss": 0.8757,
+      "step": 930
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9850639243326992,
+      "learning_rate": 0.00012468536604985867,
+      "loss": 0.8595,
+      "step": 935
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.8816979962904702,
+      "learning_rate": 0.00012381803676277345,
+      "loss": 0.8854,
+      "step": 940
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.052381113604825,
+      "learning_rate": 0.00012294880377222649,
+      "loss": 0.8966,
+      "step": 945
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.9471916607767124,
+      "learning_rate": 0.0001220777365533751,
+      "loss": 0.8977,
+      "step": 950
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.9648558401941475,
+      "learning_rate": 0.00012120490472798112,
+      "loss": 0.8837,
+      "step": 955
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.9430401800136761,
+      "learning_rate": 0.0001203303780588458,
+      "loss": 0.9009,
+      "step": 960
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.9624197502185033,
+      "learning_rate": 0.00011945422644423425,
+      "loss": 0.8645,
+      "step": 965
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.9260810941977449,
+      "learning_rate": 0.00011857651991228855,
+      "loss": 0.8243,
+      "step": 970
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.9362374564085868,
+      "learning_rate": 0.00011769732861543057,
+      "loss": 0.886,
+      "step": 975
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.9293220698915177,
+      "learning_rate": 0.00011681672282475495,
+      "loss": 0.9028,
+      "step": 980
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.9026227086147156,
+      "learning_rate": 0.00011593477292441251,
+      "loss": 0.8253,
+      "step": 985
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.9630673700134131,
+      "learning_rate": 0.00011505154940598468,
+      "loss": 0.8686,
+      "step": 990
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.8694549872905225,
+      "learning_rate": 0.00011416712286284943,
+      "loss": 0.8782,
+      "step": 995
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.007303227993832,
+      "learning_rate": 0.00011328156398453864,
+      "loss": 0.8633,
+      "step": 1000
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.9083011875988664,
+      "learning_rate": 0.00011239494355108848,
+      "loss": 0.9039,
+      "step": 1005
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.0177463749412752,
+      "learning_rate": 0.00011150733242738198,
+      "loss": 0.9029,
+      "step": 1010
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.8631790225704151,
+      "learning_rate": 0.00011061880155748497,
+      "loss": 0.8385,
+      "step": 1015
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.5230132007370771,
+      "learning_rate": 0.00010972942195897582,
+      "loss": 0.9055,
+      "step": 1020
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.8997463696925398,
+      "learning_rate": 0.00010883926471726926,
+      "loss": 0.8656,
+      "step": 1025
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.9377827912335661,
+      "learning_rate": 0.00010794840097993466,
+      "loss": 0.9163,
+      "step": 1030
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.9887953595089554,
+      "learning_rate": 0.00010705690195100939,
+      "loss": 0.8789,
+      "step": 1035
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0514798236504352,
+      "learning_rate": 0.00010616483888530781,
+      "loss": 0.9027,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.9297004988976107,
+      "learning_rate": 0.00010527228308272605,
+      "loss": 0.9473,
+      "step": 1045
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.079573310483164,
+      "learning_rate": 0.0001043793058825431,
+      "loss": 0.8308,
+      "step": 1050
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.8437863342382683,
+      "learning_rate": 0.00010348597865771909,
+      "loss": 0.9183,
+      "step": 1055
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.0314459958872142,
+      "learning_rate": 0.00010259237280919054,
+      "loss": 0.8965,
+      "step": 1060
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.1300004513750828,
+      "learning_rate": 0.00010169855976016345,
+      "loss": 0.9058,
+      "step": 1065
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.915751139105661,
+      "learning_rate": 0.00010080461095040476,
+      "loss": 0.8522,
+      "step": 1070
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.9282214710118697,
+      "learning_rate": 9.991059783053244e-05,
+      "loss": 0.893,
+      "step": 1075
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.9267500154050946,
+      "learning_rate": 9.901659185630445e-05,
+      "loss": 0.9187,
+      "step": 1080
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.9851835947209288,
+      "learning_rate": 9.812266448290767e-05,
+      "loss": 0.8489,
+      "step": 1085
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.9072684185548759,
+      "learning_rate": 9.722888715924664e-05,
+      "loss": 0.8598,
+      "step": 1090
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.0793886242538702,
+      "learning_rate": 9.633533132223293e-05,
+      "loss": 0.9136,
+      "step": 1095
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.9100627615470746,
+      "learning_rate": 9.54420683910753e-05,
+      "loss": 0.894,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.053250149548252,
+      "learning_rate": 9.454916976157144e-05,
+      "loss": 0.8604,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.054945196161409,
+      "learning_rate": 9.365670680040157e-05,
+      "loss": 0.8875,
+      "step": 1110
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.0344061549877617,
+      "learning_rate": 9.276475083942416e-05,
+      "loss": 0.8612,
+      "step": 1115
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.034253018495178,
+      "learning_rate": 9.187337316997476e-05,
+      "loss": 0.8901,
+      "step": 1120
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.9350006681122276,
+      "learning_rate": 9.09826450371678e-05,
+      "loss": 0.8913,
+      "step": 1125
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.1854492776108583,
+      "learning_rate": 9.009263763420228e-05,
+      "loss": 0.9029,
+      "step": 1130
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.0313744595376304,
+      "learning_rate": 8.920342209667136e-05,
+      "loss": 0.8399,
+      "step": 1135
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.109169472722535,
+      "learning_rate": 8.831506949687685e-05,
+      "loss": 0.8517,
+      "step": 1140
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.9979711332157661,
+      "learning_rate": 8.74276508381486e-05,
+      "loss": 0.8773,
+      "step": 1145
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.9829724669057092,
+      "learning_rate": 8.654123704916927e-05,
+      "loss": 0.879,
+      "step": 1150
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.99994654751558,
+      "learning_rate": 8.565589897830543e-05,
+      "loss": 0.8523,
+      "step": 1155
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.9478833521675312,
+      "learning_rate": 8.47717073879447e-05,
+      "loss": 0.8541,
+      "step": 1160
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.9034620075081865,
+      "learning_rate": 8.388873294884e-05,
+      "loss": 0.8918,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.8723934369993729,
+      "learning_rate": 8.300704623446111e-05,
+      "loss": 0.9133,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.9525657563232377,
+      "learning_rate": 8.212671771535379e-05,
+      "loss": 0.8997,
+      "step": 1175
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.9812331418181283,
+      "learning_rate": 8.124781775350741e-05,
+      "loss": 0.8877,
+      "step": 1180
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.0880452788188242,
+      "learning_rate": 8.037041659673105e-05,
+      "loss": 0.9202,
+      "step": 1185
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.097063406936711,
+      "learning_rate": 7.949458437303891e-05,
+      "loss": 0.9068,
+      "step": 1190
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.816681343841031,
+      "learning_rate": 7.862039108504513e-05,
+      "loss": 0.8774,
+      "step": 1195
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.0107773305774448,
+      "learning_rate": 7.774790660436858e-05,
+      "loss": 0.8973,
+      "step": 1200
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.151927912306129,
+      "learning_rate": 7.687720066604844e-05,
+      "loss": 0.8857,
+      "step": 1205
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.7574607659244479,
+      "learning_rate": 7.600834286297035e-05,
+      "loss": 0.8681,
+      "step": 1210
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.993971472764409,
+      "learning_rate": 7.514140264030413e-05,
+      "loss": 0.9421,
+      "step": 1215
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.1317554618491943,
+      "learning_rate": 7.427644928995326e-05,
+      "loss": 0.9151,
+      "step": 1220
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.9946380695782382,
+      "learning_rate": 7.341355194501638e-05,
+      "loss": 0.9331,
+      "step": 1225
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.9307204785839222,
+      "learning_rate": 7.2552779574262e-05,
+      "loss": 0.9324,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.8165240509268185,
+      "learning_rate": 7.16942009766159e-05,
+      "loss": 0.8972,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.0160286848183477,
+      "learning_rate": 7.083788477566206e-05,
+      "loss": 0.888,
+      "step": 1240
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.9588077943326088,
+      "learning_rate": 6.998389941415811e-05,
+      "loss": 0.8776,
+      "step": 1245
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.0027507311971422,
+      "learning_rate": 6.913231314856467e-05,
+      "loss": 0.892,
+      "step": 1250
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.8157424560652131,
+      "learning_rate": 6.828319404358998e-05,
+      "loss": 0.8611,
+      "step": 1255
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.9346949667365426,
+      "learning_rate": 6.74366099667495e-05,
+      "loss": 0.9041,
+      "step": 1260
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.9310369545608014,
+      "learning_rate": 6.659262858294167e-05,
+      "loss": 0.8348,
+      "step": 1265
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.916679588029739,
+      "learning_rate": 6.575131734903952e-05,
+      "loss": 0.8665,
+      "step": 1270
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.8335818591855719,
+      "learning_rate": 6.491274350849914e-05,
+      "loss": 0.8892,
+      "step": 1275
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.9831047716703826,
+      "learning_rate": 6.407697408598497e-05,
+      "loss": 0.8944,
+      "step": 1280
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.9268335345851625,
+      "learning_rate": 6.324407588201292e-05,
+      "loss": 0.8536,
+      "step": 1285
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.0086597114412514,
+      "learning_rate": 6.241411546761109e-05,
+      "loss": 0.8983,
+      "step": 1290
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.9455374773689034,
+      "learning_rate": 6.158715917899893e-05,
+      "loss": 0.8638,
+      "step": 1295
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.9581394982958279,
+      "learning_rate": 6.076327311228522e-05,
+      "loss": 0.907,
+      "step": 1300
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1707841157913208,
+      "eval_runtime": 252.7994,
+      "eval_samples_per_second": 9.138,
+      "eval_steps_per_second": 0.574,
+      "step": 1303
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.0117168856081882,
+      "learning_rate": 5.99425231181853e-05,
+      "loss": 0.8658,
+      "step": 1305
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.8313758221887009,
+      "learning_rate": 5.9124974796757614e-05,
+      "loss": 0.803,
+      "step": 1310
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.1429742233636946,
+      "learning_rate": 5.831069349216069e-05,
+      "loss": 0.854,
+      "step": 1315
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 1.2005681342348582,
+      "learning_rate": 5.7499744287430366e-05,
+      "loss": 0.8209,
+      "step": 1320
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 1.0882362071151326,
+      "learning_rate": 5.6692191999277614e-05,
+      "loss": 0.8182,
+      "step": 1325
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.0316332870317524,
+      "learning_rate": 5.588810117290843e-05,
+      "loss": 0.888,
+      "step": 1330
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 0.8415700670907011,
+      "learning_rate": 5.508753607686452e-05,
+      "loss": 0.8274,
+      "step": 1335
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.19035695841182,
+      "learning_rate": 5.429056069788663e-05,
+      "loss": 0.8587,
+      "step": 1340
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.8282530747648139,
+      "learning_rate": 5.3497238735800456e-05,
+      "loss": 0.8582,
+      "step": 1345
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 0.8236642419616373,
+      "learning_rate": 5.2707633598425023e-05,
+      "loss": 0.8242,
+      "step": 1350
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.9499844942270589,
+      "learning_rate": 5.192180839650482e-05,
+      "loss": 0.8419,
+      "step": 1355
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 0.9665172203162608,
+      "learning_rate": 5.1139825938665706e-05,
+      "loss": 0.8168,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.9570953639965174,
+      "learning_rate": 5.036174872639443e-05,
+      "loss": 0.7975,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.1769715973472896,
+      "learning_rate": 4.95876389490435e-05,
+      "loss": 0.8803,
+      "step": 1370
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 1.0737767072312128,
+      "learning_rate": 4.8817558478860316e-05,
+      "loss": 0.8392,
+      "step": 1375
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.0356976294210105,
+      "learning_rate": 4.805156886604192e-05,
+      "loss": 0.8427,
+      "step": 1380
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 0.9731310972525462,
+      "learning_rate": 4.728973133381557e-05,
+      "loss": 0.8422,
+      "step": 1385
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 1.0915183222326856,
+      "learning_rate": 4.6532106773545356e-05,
+      "loss": 0.8002,
+      "step": 1390
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 1.0419392375432739,
+      "learning_rate": 4.5778755739865234e-05,
+      "loss": 0.8035,
+      "step": 1395
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.034845804276428,
+      "learning_rate": 4.5029738445839143e-05,
+      "loss": 0.8633,
+      "step": 1400
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.9808221912256501,
+      "learning_rate": 4.4285114758148385e-05,
+      "loss": 0.8238,
+      "step": 1405
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.9518518180660357,
+      "learning_rate": 4.3544944192306536e-05,
+      "loss": 0.821,
+      "step": 1410
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 1.0002459852548866,
+      "learning_rate": 4.2809285907902804e-05,
+      "loss": 0.8153,
+      "step": 1415
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.9090185258143434,
+      "learning_rate": 4.207819870387331e-05,
+      "loss": 0.8724,
+      "step": 1420
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.8794828030312869,
+      "learning_rate": 4.135174101380154e-05,
+      "loss": 0.7995,
+      "step": 1425
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.8426944935252626,
+      "learning_rate": 4.0629970901248125e-05,
+      "loss": 0.878,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.0954146157200644,
+      "learning_rate": 3.991294605510969e-05,
+      "loss": 0.8605,
+      "step": 1435
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 1.0887405499363092,
+      "learning_rate": 3.920072378500814e-05,
+      "loss": 0.853,
+      "step": 1440
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 1.0547395334098018,
+      "learning_rate": 3.849336101671015e-05,
+      "loss": 0.8921,
+      "step": 1445
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 1.069888150135721,
+      "learning_rate": 3.779091428757692e-05,
+      "loss": 0.8161,
+      "step": 1450
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 0.9353341753814841,
+      "learning_rate": 3.709343974204577e-05,
+      "loss": 0.8179,
+      "step": 1455
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 1.0108851490383193,
+      "learning_rate": 3.640099312714235e-05,
+      "loss": 0.8385,
+      "step": 1460
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.064940293841458,
+      "learning_rate": 3.5713629788025036e-05,
+      "loss": 0.8135,
+      "step": 1465
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 1.1487280639099855,
+      "learning_rate": 3.503140466356151e-05,
+      "loss": 0.8021,
+      "step": 1470
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.9701769325400604,
+      "learning_rate": 3.435437228193741e-05,
+      "loss": 0.8399,
+      "step": 1475
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 0.9942305006661968,
+      "learning_rate": 3.3682586756298185e-05,
+      "loss": 0.8427,
+      "step": 1480
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 1.0099321403910597,
+      "learning_rate": 3.3016101780424146e-05,
+      "loss": 0.8693,
+      "step": 1485
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 1.169305983316981,
+      "learning_rate": 3.235497062443852e-05,
+      "loss": 0.8545,
+      "step": 1490
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 0.9289433553993577,
+      "learning_rate": 3.169924613055003e-05,
+      "loss": 0.7912,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 1.1184033184773026,
+      "learning_rate": 3.10489807088294e-05,
+      "loss": 0.8488,
+      "step": 1500
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 0.8651320149450171,
+      "learning_rate": 3.0404226333020114e-05,
+      "loss": 0.8063,
+      "step": 1505
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 1.015380797479418,
+      "learning_rate": 2.976503453638452e-05,
+      "loss": 0.8281,
+      "step": 1510
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 0.9105715510091448,
+      "learning_rate": 2.9131456407584912e-05,
+      "loss": 0.8396,
+      "step": 1515
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 0.8953944449056994,
+      "learning_rate": 2.8503542586600095e-05,
+      "loss": 0.8059,
+      "step": 1520
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.9445583773531011,
+      "learning_rate": 2.7881343260677938e-05,
+      "loss": 0.7555,
+      "step": 1525
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.9760863299166253,
+      "learning_rate": 2.7264908160324044e-05,
+      "loss": 0.872,
+      "step": 1530
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.9217062841634305,
+      "learning_rate": 2.66542865553269e-05,
+      "loss": 0.8772,
+      "step": 1535
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.9862000270745448,
+      "learning_rate": 2.6049527250820048e-05,
+      "loss": 0.8042,
+      "step": 1540
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 0.9372789886762245,
+      "learning_rate": 2.5450678583381037e-05,
+      "loss": 0.8373,
+      "step": 1545
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.9849583322670422,
+      "learning_rate": 2.4857788417168082e-05,
+      "loss": 0.8449,
+      "step": 1550
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 0.9531252163149414,
+      "learning_rate": 2.4270904140094597e-05,
+      "loss": 0.8204,
+      "step": 1555
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 1.1160019622624047,
+      "learning_rate": 2.3690072660041373e-05,
+      "loss": 0.857,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.0574865907973916,
+      "learning_rate": 2.3115340401107487e-05,
+      "loss": 0.8154,
+      "step": 1565
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 1.0014445052570997,
+      "learning_rate": 2.254675329989988e-05,
+      "loss": 0.8526,
+      "step": 1570
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 1.1337662282318417,
+      "learning_rate": 2.1984356801861506e-05,
+      "loss": 0.7529,
+      "step": 1575
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 1.0240125708817747,
+      "learning_rate": 2.1428195857639256e-05,
+      "loss": 0.8252,
+      "step": 1580
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 0.8777879796341344,
+      "learning_rate": 2.0878314919491183e-05,
+      "loss": 0.8349,
+      "step": 1585
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.9749914994991105,
+      "learning_rate": 2.0334757937733374e-05,
+      "loss": 0.8147,
+      "step": 1590
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 1.0047182891795483,
+      "learning_rate": 1.9797568357227293e-05,
+      "loss": 0.8225,
+      "step": 1595
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 1.102505598943093,
+      "learning_rate": 1.92667891139074e-05,
+      "loss": 0.8356,
+      "step": 1600
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.9052644924404957,
+      "learning_rate": 1.8742462631349246e-05,
+      "loss": 0.8509,
+      "step": 1605
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 0.9927305294402076,
+      "learning_rate": 1.822463081737883e-05,
+      "loss": 0.8182,
+      "step": 1610
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.0563552318088962,
+      "learning_rate": 1.7713335060722946e-05,
+      "loss": 0.7578,
+      "step": 1615
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 1.0170016112678426,
+      "learning_rate": 1.720861622770116e-05,
+      "loss": 0.7562,
+      "step": 1620
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 0.9404587306406071,
+      "learning_rate": 1.671051465895953e-05,
+      "loss": 0.786,
+      "step": 1625
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.0171241452919628,
+      "learning_rate": 1.6219070166246154e-05,
+      "loss": 0.8616,
+      "step": 1630
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 1.0322115593813674,
+      "learning_rate": 1.5734322029229253e-05,
+      "loss": 0.8592,
+      "step": 1635
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 1.0192391783063095,
+      "learning_rate": 1.5256308992357716e-05,
+      "loss": 0.8372,
+      "step": 1640
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 1.17212669377524,
+      "learning_rate": 1.4785069261764184e-05,
+      "loss": 0.7713,
+      "step": 1645
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 1.0439043355220827,
+      "learning_rate": 1.4320640502211536e-05,
+      "loss": 0.8379,
+      "step": 1650
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.9140306646774222,
+      "learning_rate": 1.386305983408236e-05,
+      "loss": 0.8041,
+      "step": 1655
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 1.061796317725875,
+      "learning_rate": 1.3412363830412078e-05,
+      "loss": 0.8318,
+      "step": 1660
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.0834691203563807,
+      "learning_rate": 1.2968588513965706e-05,
+      "loss": 0.851,
+      "step": 1665
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.9887481512630522,
+      "learning_rate": 1.2531769354358825e-05,
+      "loss": 0.8728,
+      "step": 1670
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 1.058272033476656,
+      "learning_rate": 1.2101941265222373e-05,
+      "loss": 0.8016,
+      "step": 1675
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 1.072187651154863,
+      "learning_rate": 1.1679138601412255e-05,
+      "loss": 0.8505,
+      "step": 1680
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 1.2017961960493708,
+      "learning_rate": 1.126339515626349e-05,
+      "loss": 0.8601,
+      "step": 1685
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 0.9334846571871207,
+      "learning_rate": 1.0854744158889085e-05,
+      "loss": 0.8178,
+      "step": 1690
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.9385801565830063,
+      "learning_rate": 1.0453218271524224e-05,
+      "loss": 0.8155,
+      "step": 1695
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 1.0026372920490085,
+      "learning_rate": 1.0058849586915653e-05,
+      "loss": 0.8463,
+      "step": 1700
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.0094111622202961,
+      "learning_rate": 9.671669625756574e-06,
+      "loss": 0.8291,
+      "step": 1705
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.0498204906448678,
+      "learning_rate": 9.291709334167397e-06,
+      "loss": 0.8694,
+      "step": 1710
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.9108848375007711,
+      "learning_rate": 8.918999081222156e-06,
+      "loss": 0.8154,
+      "step": 1715
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 1.1045628537182532,
+      "learning_rate": 8.553568656521293e-06,
+      "loss": 0.8542,
+      "step": 1720
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 0.9187751707089163,
+      "learning_rate": 8.195447267810686e-06,
+      "loss": 0.8847,
+      "step": 1725
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.0976708579458017,
+      "learning_rate": 7.844663538647101e-06,
+      "loss": 0.8168,
+      "step": 1730
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.0277757622039416,
+      "learning_rate": 7.501245506110433e-06,
+      "loss": 0.824,
+      "step": 1735
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.9800736010664038,
+      "learning_rate": 7.165220618562751e-06,
+      "loss": 0.8499,
+      "step": 1740
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 1.1066605466255004,
+      "learning_rate": 6.83661573345451e-06,
+      "loss": 0.869,
+      "step": 1745
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 1.0217949991902664,
+      "learning_rate": 6.515457115177803e-06,
+      "loss": 0.8308,
+      "step": 1750
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 1.0285372086807927,
+      "learning_rate": 6.20177043296728e-06,
+      "loss": 0.8513,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.9228807374530837,
+      "learning_rate": 5.895580758848318e-06,
+      "loss": 0.8359,
+      "step": 1760
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 1.042562235261924,
+      "learning_rate": 5.596912565633184e-06,
+      "loss": 0.8144,
+      "step": 1765
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 1.098352136508281,
+      "learning_rate": 5.305789724965038e-06,
+      "loss": 0.8253,
+      "step": 1770
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.9667182261715308,
+      "learning_rate": 5.022235505409823e-06,
+      "loss": 0.8672,
+      "step": 1775
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 1.019562250529469,
+      "learning_rate": 4.746272570596555e-06,
+      "loss": 0.8214,
+      "step": 1780
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 1.2161564555157138,
+      "learning_rate": 4.477922977405913e-06,
+      "loss": 0.8113,
+      "step": 1785
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.0015900069881538,
+      "learning_rate": 4.217208174207199e-06,
+      "loss": 0.8484,
+      "step": 1790
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.9631542498199687,
+      "learning_rate": 3.964148999144202e-06,
+      "loss": 0.853,
+      "step": 1795
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.9926634902794105,
+      "learning_rate": 3.71876567846946e-06,
+      "loss": 0.8488,
+      "step": 1800
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 0.8438960246283821,
+      "learning_rate": 3.481077824927792e-06,
+      "loss": 0.829,
+      "step": 1805
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 1.0027991142986536,
+      "learning_rate": 3.251104436188679e-06,
+      "loss": 0.8416,
+      "step": 1810
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 1.0903635867609474,
+      "learning_rate": 3.0288638933277934e-06,
+      "loss": 0.8065,
+      "step": 1815
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 1.0073217221247301,
+      "learning_rate": 2.8143739593578856e-06,
+      "loss": 0.7876,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.8171096137144128,
+      "learning_rate": 2.607651777809039e-06,
+      "loss": 0.7574,
+      "step": 1825
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 0.9374489147765585,
+      "learning_rate": 2.4087138713584367e-06,
+      "loss": 0.8652,
+      "step": 1830
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 1.054601585067846,
+      "learning_rate": 2.2175761405097584e-06,
+      "loss": 0.8471,
+      "step": 1835
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 1.064632352850785,
+      "learning_rate": 2.0342538623222997e-06,
+      "loss": 0.8189,
+      "step": 1840
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.8546672110619797,
+      "learning_rate": 1.8587616891899363e-06,
+      "loss": 0.7818,
+      "step": 1845
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.9645255792031215,
+      "learning_rate": 1.6911136476699508e-06,
+      "loss": 0.7776,
+      "step": 1850
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.068610540574793,
+      "learning_rate": 1.5313231373619952e-06,
+      "loss": 0.7857,
+      "step": 1855
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.100889943747955,
+      "learning_rate": 1.3794029298370814e-06,
+      "loss": 0.8141,
+      "step": 1860
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.9783722379990912,
+      "learning_rate": 1.2353651676167643e-06,
+      "loss": 0.8358,
+      "step": 1865
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 0.8803200725105156,
+      "learning_rate": 1.0992213632026517e-06,
+      "loss": 0.8678,
+      "step": 1870
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.1249544133659088,
+      "learning_rate": 9.709823981562282e-07,
+      "loss": 0.8078,
+      "step": 1875
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 1.056395258554381,
+      "learning_rate": 8.506585222291752e-07,
+      "loss": 0.7749,
+      "step": 1880
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 0.9253933119169012,
+      "learning_rate": 7.382593525440573e-07,
+      "loss": 0.8008,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.9212088582083587,
+      "learning_rate": 6.337938728257054e-07,
+      "loss": 0.8333,
+      "step": 1890
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 1.100818174265163,
+      "learning_rate": 5.372704326831901e-07,
+      "loss": 0.8342,
+      "step": 1895
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.8164157006379008,
+      "learning_rate": 4.486967469424008e-07,
+      "loss": 0.8583,
+      "step": 1900
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.0674250153511942,
+      "learning_rate": 3.6807989502949394e-07,
+      "loss": 0.8213,
+      "step": 1905
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.9095567427952401,
+      "learning_rate": 2.954263204050123e-07,
+      "loss": 0.8252,
+      "step": 1910
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.0738638532567681,
+      "learning_rate": 2.3074183004887505e-07,
+      "loss": 0.8448,
+      "step": 1915
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.1019761677075681,
+      "learning_rate": 1.7403159399629332e-07,
+      "loss": 0.8088,
+      "step": 1920
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 0.9934974979812884,
+      "learning_rate": 1.2530014492446728e-07,
+      "loss": 0.8029,
+      "step": 1925
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 1.0433381527945313,
+      "learning_rate": 8.455137779038724e-08,
+      "loss": 0.8209,
+      "step": 1930
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 1.032700859857025,
+      "learning_rate": 5.1788549519438124e-08,
+      "loss": 0.7365,
+      "step": 1935
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.9408894102132038,
+      "learning_rate": 2.7014278745163268e-08,
+      "loss": 0.8451,
+      "step": 1940
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 1.0657773929136594,
+      "learning_rate": 1.0230545599909658e-08,
+      "loss": 0.8246,
+      "step": 1945
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 1.0107360164317165,
+      "learning_rate": 1.438691556565619e-09,
+      "loss": 0.8209,
+      "step": 1950
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.1822351217269897,
+      "eval_runtime": 252.7889,
+      "eval_samples_per_second": 9.138,
+      "eval_steps_per_second": 0.574,
+      "step": 1953
+    },
+    {
+      "epoch": 3.0,
+      "step": 1953,
+      "total_flos": 4109935435055104.0,
+      "train_loss": 1.1336081770219621,
+      "train_runtime": 21652.8936,
+      "train_samples_per_second": 2.888,
+      "train_steps_per_second": 0.09
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1953,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "total_flos": 4109935435055104.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fb6c59092bfc2a53cbab03013e22bef025c879fec236faee27f03a7847365e0
+size 6200