diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf061e1a117d4f5474bf251f58ae163ae971dbe7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,218 @@
+---
+base_model: alpindale/Mistral-7B-v0.2-hf
+tags:
+- axolotl
+- generated_from_trainer
+model-index:
+- name: Einstein-v6-7B
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+
+axolotl version: `0.4.0`
+```yaml
+base_model: alpindale/Mistral-7B-v0.2-hf
+model_type: MistralForCausalLM
+tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+chat_template: chatml
+datasets:
+  - path: data/merged_all.json
+    ds_type: json
+    type: alpaca
+    conversation: chatml
+
+  - path: data/gpteacher-instruct-special-alpaca.json
+    ds_type: json
+    type: gpteacher
+    conversation: chatml
+
+  - path: data/wizardlm_evol_instruct_70k_random_half.json
+    ds_type: json
+    type: alpaca
+    conversation: chatml
+
+  - path: data/capybara_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml
+
+  - path: data/synthia-v1.3_sharegpt_12500.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml  
+
+  - path: data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml
+
+  - path: data/slimorca_dedup_filtered_95k_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml  
+
+  - path: data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml  
+
+  - path: data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    strict: false
+    conversation: chatml  
+
+  - path: data/pippa_bagel_repo_3k_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml  
+
+  - path: data/gpt4_data_lmys_1m_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml  
+
+  - path: data/sharegpt_gpt4_english.json
+    ds_type: json
+    type: sharegpt
+    conversation: chatml
+
+  - path: data/no_robots_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    strict: false
+    conversation: chatml
+
+  - path: data/oasst_top1_from_fusechatmixture_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    strict: false
+    conversation: chatml
+
+  - path: data/everythinglm-data-v3_sharegpt.json
+    ds_type: json
+    type: sharegpt
+    strict: false
+    conversation: chatml
+
+dataset_prepared_path: last_run_prepared
+# val_set_size: 0.005
+val_set_size: 0.0
+
+do_bench_eval: true
+
+output_dir: ./Einstein-v6-7B-model
+
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
+eval_sample_packing: false
+
+wandb_project: Einstein
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+hub_model_id: Weyaxi/Einstein-v6-7B
+
+save_safetensors: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 2
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.000005
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 3 # changed
+eval_table_size:
+eval_table_max_new_tokens: 128
+saves_per_epoch: 2 # changed
+debug:
+
+deepspeed: zero3_bf16.json
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "<|im_end|>"
+  unk_token: "<unk>"
+tokens:
+  - "<|im_start|>"
+
+```
+
+</details><br>
+
+# Einstein-v6-7B
+
+This model is a fine-tuned version of [alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf) on the datasets listed under `datasets` in the axolotl config above.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
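+The model was trained on the 15 local JSON datasets listed under `datasets` in the axolotl config above (in `alpaca`, `gpteacher`, and `sharegpt` formats, all rendered with the ChatML conversation template). No validation split was held out (`val_set_size: 0.0`).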
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 9
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 36
+- total_eval_batch_size: 9
+- optimizer: adamw_bnb_8bit (8-bit AdamW, as set in the axolotl config) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 2
+
+### Training results
+
+
+
+### Framework versions
+
+- Transformers 4.38.2
+- Pytorch 2.1.2+cu118
+- Datasets 2.18.0
+- Tokenizers 0.15.0
diff --git a/added_tokens.json b/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..e36863df2bc13b20909d6711019409e777802fb5
--- /dev/null
+++ b/added_tokens.json
@@ -0,0 +1,4 @@
+{
+  "<|im_end|>": 32000,
+  "<|im_start|>": 32001
+}
diff --git a/checkpoint-1206/config.json b/checkpoint-1206/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76
--- /dev/null
+++ b/checkpoint-1206/config.json
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "alpindale/Mistral-7B-v0.2-hf",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32002
+}
diff --git a/checkpoint-1206/generation_config.json b/checkpoint-1206/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc
--- /dev/null
+++ b/checkpoint-1206/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 32000,
+  "transformers_version": "4.38.2"
+}
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db306e9f37c4b097d13acbc2c79fb683244aaab
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:640cdfc27b9b4c7dc135ffc8be7e345ea4e78175fbfaf92f848f3305de92d913
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..192d6cd5e2524869a59ad20e9d6d3a0e4ae31b51
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f456b41894b294656b00f40899f37b016b0cbd3fc5f6cfece1ed66c2d1fa1c6
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..beb3ad7e6f5dfa8e9d930c194906aa5191abf588
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aee6c39cdf7c911a3c96cf9f19437fe89c97b88a018ec0ed510d422a92d97a20
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f79f34d4580e557afd7a07e65d1c1e42b620d79f
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:618e19b231e8d29988022a17a08623a3a47bd17ef0e366fef7e13be132a2dfba
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6ce8abbd91ba039f58c55feb8828b4df987e714a
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0c72752e3f1d839baa84bace95f0baa137721100f7abf9eb6f278ad3d91fd2c
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c21bb4fd1a539cac416bc2962bee8bb02b4be2b6
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb23bce469bd6b6cf2c4700225f7be1b53f11f4ea466a3240ee4e8eb6dbb02f
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cce426895d3fdee311af30caead54979ffc3faa6
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03f48c713177be0b2a6284b27725b4bd58e3a071f12505c23d4044e9e4145384
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6648c0d8f7a1ded89f6bf201fd2ae691dd90362d
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8db0c6517392bd44bbb3117e07ef75a7f9163fd4bb1bd249a812ec159706e4ce
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bc89bc50151290b748198b98b9690d2b718200ed
--- /dev/null
+++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc32e4c829baba0825f6c473240f85e37140a4a3d00bab68dce0a3d4ac83769d
+size 4831623435
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a023057ba9193fae89248cd8ee4ae0f293858be9
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c9afb76d2016a6260a8effe707b6bccedce48834675b1eab9818a761595b352
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..60d9f5144856eb42a9f84d6aa325247cd0a3ef4f
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24a2d07e7e08f1d95551f765621032687695cdb304d85023fc9f72d3174a7b5e
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b5c94358bea170387cd903d10ae4a4ec4999b830
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1781a0983c1da9b1dbe29b99cc54f29e5509633808ff406ce58c6dba525eb461
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ad2da2a9ef86c60c213fb81ab43bccbc2ce71f78
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a83e1db401ecb9437fbe9a1e0b6eac0ea109f94eef5ff21465b73decb906c72d
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..673b461dadbb336b1931ff5a733ddfd4f43319ff
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0cbd3c92ba583215c08495704bd146204e79775b020b7fea83ac7681c043d1f
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9f00b141691c99df05bc5ffb1b52eceb3f301b29
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d840c4bd1e9c9875435f80bc1caf93a443c2c195c8e2e3ec0695aded1a1a8456
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..96fcbdd43e803c80e9b54ac5e53841bb8b62a1f6
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:043ae2dded1785e74eae756ce3606c5429fdee52fa6db54d8a3f227d081b3904
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..db1fa7b54abadef25da18242868b2bc4bffbeaf9
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e6898409ae39821f228d85e5315f2bdc3f01288c5201bccdf1ca9b2cc9cb984
+size 153829
diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..af430f4bebe1aa35c715d84c5623c64f322feb7e
--- /dev/null
+++ b/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b96194551e4c7d97f8be9a35968388ea20559c2e6b909bbbf382e9e4c21a5279
+size 153829
diff --git a/checkpoint-1206/latest b/checkpoint-1206/latest
new file mode 100644
index 0000000000000000000000000000000000000000..34d4c1304a2f32052898ef011354ffe438bb60ad
--- /dev/null
+++ b/checkpoint-1206/latest
@@ -0,0 +1 @@
+global_step1206
\ No newline at end of file
diff --git a/checkpoint-1206/model-00001-of-00003.safetensors b/checkpoint-1206/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e18149c479d72bb1418c8afe6daf57768e27bada
--- /dev/null
+++ b/checkpoint-1206/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e1195ff62d6499fd137e1c9f051ecb5c8cc4ebd0936800ce10aa42250f5570a
+size 4943178720
diff --git a/checkpoint-1206/model-00002-of-00003.safetensors b/checkpoint-1206/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5c6184aff12c7d63b7b50a1b95084fc9c3ca67ba
--- /dev/null
+++ b/checkpoint-1206/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa17f326c5f2c97b6575de75571734430eb97a441035542b71107ff6a9e094fc
+size 4999819336
diff --git a/checkpoint-1206/model-00003-of-00003.safetensors b/checkpoint-1206/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..585273dc2f4b1bd3129ad3c2336f45777a401a6b
--- /dev/null
+++ b/checkpoint-1206/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd7d94ada855cea53fb537d5bc1a78219d57a094fe3d381501aee08b7a7d9ad4
+size 4540532728
diff --git a/checkpoint-1206/model.safetensors.index.json b/checkpoint-1206/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2
--- /dev/null
+++ b/checkpoint-1206/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483496960
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/checkpoint-1206/rng_state_0.pth b/checkpoint-1206/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7ae643fef71bb5468722e041971c4fd10143dcde
--- /dev/null
+++ b/checkpoint-1206/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d78df38122b8b51b69a3cce1a8d8cb0f7d8684196dde8fb6d174ef0fd3440d89
+size 16240
diff --git a/checkpoint-1206/rng_state_1.pth b/checkpoint-1206/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0dec857bd06d8263dc0d1f195ea4d4288bad4641
--- /dev/null
+++ b/checkpoint-1206/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:499f46e15237a5856de1a8f0582d02e4319721d83140e01c31e9e1db92da7108
+size 16240
diff --git a/checkpoint-1206/rng_state_2.pth b/checkpoint-1206/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d57f4b1f904b392ef605de094c7e5171fced622
--- /dev/null
+++ b/checkpoint-1206/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b32ec8b414a3886bf179af827449dee557e95bfa64a7c20f26c186df2659c9f
+size 16240
diff --git a/checkpoint-1206/rng_state_3.pth b/checkpoint-1206/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4c8bebc9d459d1ed2d1ab4f27d7ec2da721d0445
--- /dev/null
+++ b/checkpoint-1206/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82765e3b8fb57ca7779e75617b51182226eed278593e6441a31510115950353d
+size 16240
diff --git a/checkpoint-1206/rng_state_4.pth b/checkpoint-1206/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..71f7ca7b0554bc7702f1e276ae0cd3924ffba0d2
--- /dev/null
+++ b/checkpoint-1206/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dd2c24e041054b45b5bf8c50512ea8c4552e5f2e877fe798759dec7a7f3aae1
+size 16240
diff --git a/checkpoint-1206/rng_state_5.pth b/checkpoint-1206/rng_state_5.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2393f7d616bfb4cf0ab81957f29d35b455685a54
--- /dev/null
+++ b/checkpoint-1206/rng_state_5.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92b3e1210264272a2020cbcb79f6ade48528f5682dadcecb7a94805779548161
+size 16240
diff --git a/checkpoint-1206/rng_state_6.pth b/checkpoint-1206/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..46f8e8cc8551391d67e345af829445ad610b17a4
--- /dev/null
+++ b/checkpoint-1206/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:556ec0b910e14a1a5ab8fb6a1a16d525b89e31c69dd9b6cd8d4a4cccad65b546
+size 16240
diff --git a/checkpoint-1206/rng_state_7.pth b/checkpoint-1206/rng_state_7.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b0723b7d69eb2d78f3ee4bdd7f838269f3f845d1
--- /dev/null
+++ b/checkpoint-1206/rng_state_7.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e830dc416886fe1aafeacfa75da6baacdbe9a61c66d2f1fbc11417753a516513
+size 16240
diff --git a/checkpoint-1206/rng_state_8.pth b/checkpoint-1206/rng_state_8.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b9da906954a171d52c0afc8baea75914a9bb9a62
--- /dev/null
+++ b/checkpoint-1206/rng_state_8.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80d7cb0002af3e22c063c6751b91836d7e06c4267f7ba8e1912c42d6867e4885
+size 16240
diff --git a/checkpoint-1206/scheduler.pt b/checkpoint-1206/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..730aa0679b05c54594576e05c8b57359ad913b4d
--- /dev/null
+++ b/checkpoint-1206/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c74bca99465dbb777fb965aad2291c5beb95242415512d168559d65103eccb89
+size 1064
diff --git a/checkpoint-1206/trainer_state.json b/checkpoint-1206/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0aae078d47ea8ed8d31f200f728b870cd9b1094
--- /dev/null
+++ b/checkpoint-1206/trainer_state.json
@@ -0,0 +1,8463 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997927461139896,
+  "eval_steps": 500,
+  "global_step": 1206,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 27.81778461909011,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.7993,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 28.63833175363421,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.9056,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 25.646828828014854,
+      "learning_rate": 1.5e-06,
+      "loss": 0.8473,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.834124771941388,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.8192,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.558095859980105,
+      "learning_rate": 2.5e-06,
+      "loss": 0.7943,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 7.905789045775758,
+      "learning_rate": 3e-06,
+      "loss": 0.7075,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.259519170268483,
+      "learning_rate": 3.5e-06,
+      "loss": 0.7537,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.639042051048664,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.7471,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.515070932390074,
+      "learning_rate": 4.5e-06,
+      "loss": 0.7689,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.916410424632533,
+      "learning_rate": 5e-06,
+      "loss": 0.7194,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.835046497413255,
+      "learning_rate": 4.9999978617243506e-06,
+      "loss": 0.6949,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 10.065648500649479,
+      "learning_rate": 4.9999914469010585e-06,
+      "loss": 0.7039,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.299372887839679,
+      "learning_rate": 4.999980755541098e-06,
+      "loss": 0.7067,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.693110837094718,
+      "learning_rate": 4.999965787662758e-06,
+      "loss": 0.7126,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.983869635716314,
+      "learning_rate": 4.999946543291642e-06,
+      "loss": 0.6496,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.2561193962441175,
+      "learning_rate": 4.999923022460671e-06,
+      "loss": 0.7036,
+      "step": 16
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.011772824968437,
+      "learning_rate": 4.999895225210079e-06,
+      "loss": 0.7009,
+      "step": 17
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.386638415717137,
+      "learning_rate": 4.9998631515874165e-06,
+      "loss": 0.6624,
+      "step": 18
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.764658092125165,
+      "learning_rate": 4.999826801647551e-06,
+      "loss": 0.6687,
+      "step": 19
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.3982096117966614,
+      "learning_rate": 4.999786175452662e-06,
+      "loss": 0.706,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.8051633678260193,
+      "learning_rate": 4.999741273072246e-06,
+      "loss": 0.7031,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.1177784624332614,
+      "learning_rate": 4.999692094583114e-06,
+      "loss": 0.7525,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.2533819675617806,
+      "learning_rate": 4.9996386400693906e-06,
+      "loss": 0.6767,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.61893793162573,
+      "learning_rate": 4.999580909622518e-06,
+      "loss": 0.6432,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.76057623723569,
+      "learning_rate": 4.999518903341251e-06,
+      "loss": 0.6809,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.27983032069553,
+      "learning_rate": 4.999452621331657e-06,
+      "loss": 0.6798,
+      "step": 26
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.501904568120582,
+      "learning_rate": 4.99938206370712e-06,
+      "loss": 0.6412,
+      "step": 27
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.819229290729669,
+      "learning_rate": 4.999307230588338e-06,
+      "loss": 0.6188,
+      "step": 28
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.1233212322022212,
+      "learning_rate": 4.9992281221033224e-06,
+      "loss": 0.6378,
+      "step": 29
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.7806911906686755,
+      "learning_rate": 4.999144738387396e-06,
+      "loss": 0.6653,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4045490257014563,
+      "learning_rate": 4.999057079583199e-06,
+      "loss": 0.6377,
+      "step": 31
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3803717769210446,
+      "learning_rate": 4.998965145840681e-06,
+      "loss": 0.6855,
+      "step": 32
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3976652879633473,
+      "learning_rate": 4.998868937317106e-06,
+      "loss": 0.6284,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2958541157119727,
+      "learning_rate": 4.998768454177051e-06,
+      "loss": 0.6521,
+      "step": 34
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1925196833696154,
+      "learning_rate": 4.998663696592403e-06,
+      "loss": 0.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.361006042901851,
+      "learning_rate": 4.998554664742362e-06,
+      "loss": 0.6155,
+      "step": 36
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1577758143653614,
+      "learning_rate": 4.998441358813443e-06,
+      "loss": 0.6398,
+      "step": 37
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.219872074512664,
+      "learning_rate": 4.998323778999467e-06,
+      "loss": 0.6051,
+      "step": 38
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2907501521408546,
+      "learning_rate": 4.9982019255015705e-06,
+      "loss": 0.6337,
+      "step": 39
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1769862324666183,
+      "learning_rate": 4.9980757985281955e-06,
+      "loss": 0.6606,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4252479779661607,
+      "learning_rate": 4.997945398295101e-06,
+      "loss": 0.6685,
+      "step": 41
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3929541982084657,
+      "learning_rate": 4.99781072502535e-06,
+      "loss": 0.6084,
+      "step": 42
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.932539969840091,
+      "learning_rate": 4.997671778949318e-06,
+      "loss": 0.6123,
+      "step": 43
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.191742541327873,
+      "learning_rate": 4.997528560304688e-06,
+      "loss": 0.6247,
+      "step": 44
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.423376784566499,
+      "learning_rate": 4.997381069336455e-06,
+      "loss": 0.7024,
+      "step": 45
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0599055392481076,
+      "learning_rate": 4.997229306296918e-06,
+      "loss": 0.6612,
+      "step": 46
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.16832922087532,
+      "learning_rate": 4.997073271445686e-06,
+      "loss": 0.5949,
+      "step": 47
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0483598654319453,
+      "learning_rate": 4.9969129650496775e-06,
+      "loss": 0.6406,
+      "step": 48
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.963056609139284,
+      "learning_rate": 4.996748387383113e-06,
+      "loss": 0.6361,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.2094923844269307,
+      "learning_rate": 4.996579538727527e-06,
+      "loss": 0.5901,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.1088153449411857,
+      "learning_rate": 4.996406419371749e-06,
+      "loss": 0.6458,
+      "step": 51
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.093448940617732,
+      "learning_rate": 4.996229029611926e-06,
+      "loss": 0.6509,
+      "step": 52
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.075116207412987,
+      "learning_rate": 4.996047369751502e-06,
+      "loss": 0.6295,
+      "step": 53
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.138141165277684,
+      "learning_rate": 4.995861440101229e-06,
+      "loss": 0.6088,
+      "step": 54
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.186316382848445,
+      "learning_rate": 4.995671240979161e-06,
+      "loss": 0.6307,
+      "step": 55
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.2513741083982195,
+      "learning_rate": 4.995476772710657e-06,
+      "loss": 0.6175,
+      "step": 56
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0827167336870596,
+      "learning_rate": 4.995278035628379e-06,
+      "loss": 0.5935,
+      "step": 57
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.117977588574442,
+      "learning_rate": 4.995075030072291e-06,
+      "loss": 0.5998,
+      "step": 58
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0996940200235485,
+      "learning_rate": 4.994867756389658e-06,
+      "loss": 0.6159,
+      "step": 59
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.141096165691323,
+      "learning_rate": 4.994656214935045e-06,
+      "loss": 0.6294,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.022748830058395,
+      "learning_rate": 4.994440406070323e-06,
+      "loss": 0.6315,
+      "step": 61
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.209132168720991,
+      "learning_rate": 4.994220330164654e-06,
+      "loss": 0.5645,
+      "step": 62
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0994557317862674,
+      "learning_rate": 4.993995987594509e-06,
+      "loss": 0.6272,
+      "step": 63
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.204220831053169,
+      "learning_rate": 4.99376737874365e-06,
+      "loss": 0.6379,
+      "step": 64
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.127733932186697,
+      "learning_rate": 4.993534504003141e-06,
+      "loss": 0.622,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.1338506582034316,
+      "learning_rate": 4.993297363771342e-06,
+      "loss": 0.6259,
+      "step": 66
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.104802764460729,
+      "learning_rate": 4.993055958453912e-06,
+      "loss": 0.6414,
+      "step": 67
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0889535347771675,
+      "learning_rate": 4.9928102884638004e-06,
+      "loss": 0.6466,
+      "step": 68
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.252225316694296,
+      "learning_rate": 4.992560354221258e-06,
+      "loss": 0.6167,
+      "step": 69
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015392533516649,
+      "learning_rate": 4.992306156153827e-06,
+      "loss": 0.5958,
+      "step": 70
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.151741408948778,
+      "learning_rate": 4.992047694696343e-06,
+      "loss": 0.5875,
+      "step": 71
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0351299117412696,
+      "learning_rate": 4.991784970290935e-06,
+      "loss": 0.5935,
+      "step": 72
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0000962363827983,
+      "learning_rate": 4.991517983387026e-06,
+      "loss": 0.6091,
+      "step": 73
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.202881736102415,
+      "learning_rate": 4.99124673444133e-06,
+      "loss": 0.6122,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015074773396151,
+      "learning_rate": 4.990971223917848e-06,
+      "loss": 0.6134,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.009305960567766,
+      "learning_rate": 4.990691452287877e-06,
+      "loss": 0.6308,
+      "step": 76
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.9967884756310221,
+      "learning_rate": 4.990407420029999e-06,
+      "loss": 0.6098,
+      "step": 77
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0858738033925905,
+      "learning_rate": 4.990119127630085e-06,
+      "loss": 0.6344,
+      "step": 78
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9427707561903895,
+      "learning_rate": 4.989826575581295e-06,
+      "loss": 0.6049,
+      "step": 79
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.157150584766853,
+      "learning_rate": 4.989529764384073e-06,
+      "loss": 0.5965,
+      "step": 80
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0303527419352583,
+      "learning_rate": 4.989228694546151e-06,
+      "loss": 0.6524,
+      "step": 81
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.128799919475717,
+      "learning_rate": 4.988923366582546e-06,
+      "loss": 0.5524,
+      "step": 82
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0122786280510696,
+      "learning_rate": 4.988613781015557e-06,
+      "loss": 0.6268,
+      "step": 83
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.104580177719229,
+      "learning_rate": 4.988299938374769e-06,
+      "loss": 0.6229,
+      "step": 84
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.3894843860356834,
+      "learning_rate": 4.9879818391970455e-06,
+      "loss": 0.6194,
+      "step": 85
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9615211372441477,
+      "learning_rate": 4.9876594840265355e-06,
+      "loss": 0.6355,
+      "step": 86
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.4509852093141937,
+      "learning_rate": 4.987332873414666e-06,
+      "loss": 0.6405,
+      "step": 87
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.178942375285086,
+      "learning_rate": 4.987002007920142e-06,
+      "loss": 0.5593,
+      "step": 88
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.2625634345900445,
+      "learning_rate": 4.9866668881089515e-06,
+      "loss": 0.6133,
+      "step": 89
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.363092638811143,
+      "learning_rate": 4.986327514554356e-06,
+      "loss": 0.6298,
+      "step": 90
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0401982492138546,
+      "learning_rate": 4.985983887836894e-06,
+      "loss": 0.6276,
+      "step": 91
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.276956647922478,
+      "learning_rate": 4.985636008544381e-06,
+      "loss": 0.5691,
+      "step": 92
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1072762844110233,
+      "learning_rate": 4.985283877271908e-06,
+      "loss": 0.6175,
+      "step": 93
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.2931866879442637,
+      "learning_rate": 4.984927494621836e-06,
+      "loss": 0.6419,
+      "step": 94
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.112474101166308,
+      "learning_rate": 4.984566861203801e-06,
+      "loss": 0.607,
+      "step": 95
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1816059679212634,
+      "learning_rate": 4.984201977634711e-06,
+      "loss": 0.6136,
+      "step": 96
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0620776369966554,
+      "learning_rate": 4.9838328445387415e-06,
+      "loss": 0.6372,
+      "step": 97
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.147592836641578,
+      "learning_rate": 4.983459462547341e-06,
+      "loss": 0.606,
+      "step": 98
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1808001877062453,
+      "learning_rate": 4.983081832299224e-06,
+      "loss": 0.6019,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3751999527114087,
+      "learning_rate": 4.98269995444037e-06,
+      "loss": 0.6021,
+      "step": 100
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.8769470206406913,
+      "learning_rate": 4.98231382962403e-06,
+      "loss": 0.6082,
+      "step": 101
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3060925784921347,
+      "learning_rate": 4.981923458510717e-06,
+      "loss": 0.6174,
+      "step": 102
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1543176832473683,
+      "learning_rate": 4.981528841768206e-06,
+      "loss": 0.6092,
+      "step": 103
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1558689520522547,
+      "learning_rate": 4.981129980071538e-06,
+      "loss": 0.587,
+      "step": 104
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3830532005188383,
+      "learning_rate": 4.980726874103014e-06,
+      "loss": 0.6518,
+      "step": 105
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3333119576634767,
+      "learning_rate": 4.980319524552195e-06,
+      "loss": 0.6096,
+      "step": 106
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1135146855324214,
+      "learning_rate": 4.9799079321159e-06,
+      "loss": 0.5728,
+      "step": 107
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.2300463384326394,
+      "learning_rate": 4.9794920974982095e-06,
+      "loss": 0.6563,
+      "step": 108
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1745234017525443,
+      "learning_rate": 4.979072021410458e-06,
+      "loss": 0.5968,
+      "step": 109
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1536586182562334,
+      "learning_rate": 4.978647704571237e-06,
+      "loss": 0.6189,
+      "step": 110
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.193809374687326,
+      "learning_rate": 4.97821914770639e-06,
+      "loss": 0.5864,
+      "step": 111
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.0525896373682047,
+      "learning_rate": 4.977786351549017e-06,
+      "loss": 0.6101,
+      "step": 112
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.216099286618384,
+      "learning_rate": 4.977349316839467e-06,
+      "loss": 0.5984,
+      "step": 113
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.155122255962579,
+      "learning_rate": 4.97690804432534e-06,
+      "loss": 0.6311,
+      "step": 114
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2972101190291374,
+      "learning_rate": 4.976462534761487e-06,
+      "loss": 0.5813,
+      "step": 115
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.9925413745245948,
+      "learning_rate": 4.9760127889100044e-06,
+      "loss": 0.6157,
+      "step": 116
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2802548684036568,
+      "learning_rate": 4.975558807540238e-06,
+      "loss": 0.6079,
+      "step": 117
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.048888007394621,
+      "learning_rate": 4.9751005914287775e-06,
+      "loss": 0.6467,
+      "step": 118
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.28661640438254,
+      "learning_rate": 4.974638141359456e-06,
+      "loss": 0.6029,
+      "step": 119
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.004056683755783,
+      "learning_rate": 4.974171458123351e-06,
+      "loss": 0.6289,
+      "step": 120
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1628470048067667,
+      "learning_rate": 4.97370054251878e-06,
+      "loss": 0.6139,
+      "step": 121
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.056119895466544,
+      "learning_rate": 4.9732253953513e-06,
+      "loss": 0.5798,
+      "step": 122
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1716513163164275,
+      "learning_rate": 4.972746017433709e-06,
+      "loss": 0.6085,
+      "step": 123
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.255856676525811,
+      "learning_rate": 4.97226240958604e-06,
+      "loss": 0.6342,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1049280498075373,
+      "learning_rate": 4.971774572635563e-06,
+      "loss": 0.6197,
+      "step": 125
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.133349390995361,
+      "learning_rate": 4.97128250741678e-06,
+      "loss": 0.5751,
+      "step": 126
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.2044887467317578,
+      "learning_rate": 4.97078621477143e-06,
+      "loss": 0.6611,
+      "step": 127
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1413863795698145,
+      "learning_rate": 4.970285695548481e-06,
+      "loss": 0.625,
+      "step": 128
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0229587336296615,
+      "learning_rate": 4.969780950604132e-06,
+      "loss": 0.5989,
+      "step": 129
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0983599595244247,
+      "learning_rate": 4.969271980801808e-06,
+      "loss": 0.5747,
+      "step": 130
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1059041140010786,
+      "learning_rate": 4.9687587870121645e-06,
+      "loss": 0.5869,
+      "step": 131
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8967441614595046,
+      "learning_rate": 4.9682413701130815e-06,
+      "loss": 0.6272,
+      "step": 132
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9976164993621088,
+      "learning_rate": 4.967719730989663e-06,
+      "loss": 0.6282,
+      "step": 133
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8719131324952145,
+      "learning_rate": 4.967193870534235e-06,
+      "loss": 0.6052,
+      "step": 134
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.071702997476533,
+      "learning_rate": 4.9666637896463455e-06,
+      "loss": 0.5785,
+      "step": 135
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9549455320048341,
+      "learning_rate": 4.966129489232762e-06,
+      "loss": 0.5739,
+      "step": 136
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0656898626759315,
+      "learning_rate": 4.9655909702074684e-06,
+      "loss": 0.6651,
+      "step": 137
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1185948604203038,
+      "learning_rate": 4.965048233491669e-06,
+      "loss": 0.5759,
+      "step": 138
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.08566019272993,
+      "learning_rate": 4.964501280013777e-06,
+      "loss": 0.6271,
+      "step": 139
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.117420903965419,
+      "learning_rate": 4.963950110709425e-06,
+      "loss": 0.5968,
+      "step": 140
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9784944143818486,
+      "learning_rate": 4.963394726521453e-06,
+      "loss": 0.6112,
+      "step": 141
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.077292948039572,
+      "learning_rate": 4.9628351283999144e-06,
+      "loss": 0.5636,
+      "step": 142
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.223803520245629,
+      "learning_rate": 4.962271317302068e-06,
+      "loss": 0.6658,
+      "step": 143
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.039369072186367,
+      "learning_rate": 4.9617032941923796e-06,
+      "loss": 0.5853,
+      "step": 144
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.071470113085907,
+      "learning_rate": 4.961131060042522e-06,
+      "loss": 0.601,
+      "step": 145
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.437470272347474,
+      "learning_rate": 4.960554615831372e-06,
+      "loss": 0.6593,
+      "step": 146
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.178684122927139,
+      "learning_rate": 4.959973962545005e-06,
+      "loss": 0.607,
+      "step": 147
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.097006749956471,
+      "learning_rate": 4.9593891011767e-06,
+      "loss": 0.5873,
+      "step": 148
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9801202541822784,
+      "learning_rate": 4.958800032726931e-06,
+      "loss": 0.5877,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.30001951085656,
+      "learning_rate": 4.958206758203373e-06,
+      "loss": 0.6368,
+      "step": 150
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.990094260131078,
+      "learning_rate": 4.957609278620891e-06,
+      "loss": 0.59,
+      "step": 151
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.262163752076628,
+      "learning_rate": 4.957007595001548e-06,
+      "loss": 0.5779,
+      "step": 152
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.1970152093220983,
+      "learning_rate": 4.956401708374595e-06,
+      "loss": 0.5894,
+      "step": 153
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.220825872684071,
+      "learning_rate": 4.9557916197764745e-06,
+      "loss": 0.6528,
+      "step": 154
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.099472677591387,
+      "learning_rate": 4.955177330250817e-06,
+      "loss": 0.5798,
+      "step": 155
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.159203936881569,
+      "learning_rate": 4.954558840848437e-06,
+      "loss": 0.6206,
+      "step": 156
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.185152414039555,
+      "learning_rate": 4.953936152627338e-06,
+      "loss": 0.5624,
+      "step": 157
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.0679748168992624,
+      "learning_rate": 4.953309266652701e-06,
+      "loss": 0.5859,
+      "step": 158
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.327237187255128,
+      "learning_rate": 4.952678183996891e-06,
+      "loss": 0.5632,
+      "step": 159
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.2865519679977417,
+      "learning_rate": 4.952042905739451e-06,
+      "loss": 0.6965,
+      "step": 160
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.523435408018699,
+      "learning_rate": 4.9514034329671e-06,
+      "loss": 0.6217,
+      "step": 161
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.4992653226709636,
+      "learning_rate": 4.950759766773734e-06,
+      "loss": 0.6175,
+      "step": 162
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.432752824777114,
+      "learning_rate": 4.950111908260423e-06,
+      "loss": 0.5862,
+      "step": 163
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.137500912204061,
+      "learning_rate": 4.949459858535404e-06,
+      "loss": 0.6124,
+      "step": 164
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.2226376224120474,
+      "learning_rate": 4.94880361871409e-06,
+      "loss": 0.5891,
+      "step": 165
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3821839805775165,
+      "learning_rate": 4.9481431899190544e-06,
+      "loss": 0.6008,
+      "step": 166
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.306242834684614,
+      "learning_rate": 4.947478573280044e-06,
+      "loss": 0.6159,
+      "step": 167
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3298092236851518,
+      "learning_rate": 4.946809769933963e-06,
+      "loss": 0.5809,
+      "step": 168
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.364296499621558,
+      "learning_rate": 4.946136781024883e-06,
+      "loss": 0.5895,
+      "step": 169
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.237241095609228,
+      "learning_rate": 4.945459607704029e-06,
+      "loss": 0.6144,
+      "step": 170
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.4027419761972264,
+      "learning_rate": 4.9447782511297905e-06,
+      "loss": 0.5985,
+      "step": 171
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1547059182244284,
+      "learning_rate": 4.944092712467709e-06,
+      "loss": 0.5763,
+      "step": 172
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1530221667047984,
+      "learning_rate": 4.9434029928904805e-06,
+      "loss": 0.5692,
+      "step": 173
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.228588593294869,
+      "learning_rate": 4.942709093577954e-06,
+      "loss": 0.5896,
+      "step": 174
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1597295307130198,
+      "learning_rate": 4.942011015717129e-06,
+      "loss": 0.5864,
+      "step": 175
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.321140955498194,
+      "learning_rate": 4.941308760502149e-06,
+      "loss": 0.6089,
+      "step": 176
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.220124736460707,
+      "learning_rate": 4.940602329134309e-06,
+      "loss": 0.5786,
+      "step": 177
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1698038563080417,
+      "learning_rate": 4.939891722822043e-06,
+      "loss": 0.5749,
+      "step": 178
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.244425969121411,
+      "learning_rate": 4.93917694278093e-06,
+      "loss": 0.5877,
+      "step": 179
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.143920008069458,
+      "learning_rate": 4.938457990233687e-06,
+      "loss": 0.6024,
+      "step": 180
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1786040820345813,
+      "learning_rate": 4.937734866410169e-06,
+      "loss": 0.5845,
+      "step": 181
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.301832824481007,
+      "learning_rate": 4.9370075725473665e-06,
+      "loss": 0.6182,
+      "step": 182
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.3748033727083997,
+      "learning_rate": 4.936276109889403e-06,
+      "loss": 0.6073,
+      "step": 183
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.476334487382023,
+      "learning_rate": 4.935540479687534e-06,
+      "loss": 0.5793,
+      "step": 184
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.2509466352322494,
+      "learning_rate": 4.934800683200143e-06,
+      "loss": 0.6133,
+      "step": 185
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.8391697547684873,
+      "learning_rate": 4.934056721692742e-06,
+      "loss": 0.5967,
+      "step": 186
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.4492364225391765,
+      "learning_rate": 4.933308596437965e-06,
+      "loss": 0.5676,
+      "step": 187
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.685548141821295,
+      "learning_rate": 4.932556308715573e-06,
+      "loss": 0.6069,
+      "step": 188
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.261217637824808,
+      "learning_rate": 4.931799859812443e-06,
+      "loss": 0.6411,
+      "step": 189
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3838284395200966,
+      "learning_rate": 4.931039251022573e-06,
+      "loss": 0.5745,
+      "step": 190
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.2550921344466164,
+      "learning_rate": 4.930274483647074e-06,
+      "loss": 0.5989,
+      "step": 191
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.078406234527636,
+      "learning_rate": 4.929505558994175e-06,
+      "loss": 0.5998,
+      "step": 192
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.592864566091496,
+      "learning_rate": 4.928732478379214e-06,
+      "loss": 0.5842,
+      "step": 193
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.092752299259724,
+      "learning_rate": 4.927955243124638e-06,
+      "loss": 0.5789,
+      "step": 194
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3799311595696966,
+      "learning_rate": 4.927173854560002e-06,
+      "loss": 0.6265,
+      "step": 195
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.246876688010602,
+      "learning_rate": 4.926388314021964e-06,
+      "loss": 0.6126,
+      "step": 196
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1409898276704578,
+      "learning_rate": 4.925598622854287e-06,
+      "loss": 0.6073,
+      "step": 197
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.5946158421875385,
+      "learning_rate": 4.924804782407834e-06,
+      "loss": 0.6154,
+      "step": 198
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1225494320427982,
+      "learning_rate": 4.924006794040562e-06,
+      "loss": 0.583,
+      "step": 199
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1971323526291338,
+      "learning_rate": 4.923204659117528e-06,
+      "loss": 0.6078,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.289185506404785,
+      "learning_rate": 4.92239837901088e-06,
+      "loss": 0.6127,
+      "step": 201
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0071007751625354,
+      "learning_rate": 4.921587955099858e-06,
+      "loss": 0.5804,
+      "step": 202
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.2981840149068247,
+      "learning_rate": 4.920773388770789e-06,
+      "loss": 0.6027,
+      "step": 203
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.236179116886702,
+      "learning_rate": 4.919954681417087e-06,
+      "loss": 0.6179,
+      "step": 204
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.007422589251611,
+      "learning_rate": 4.91913183443925e-06,
+      "loss": 0.5647,
+      "step": 205
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1402813555735483,
+      "learning_rate": 4.918304849244857e-06,
+      "loss": 0.5841,
+      "step": 206
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0456415785177104,
+      "learning_rate": 4.917473727248565e-06,
+      "loss": 0.5524,
+      "step": 207
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9673558126020942,
+      "learning_rate": 4.916638469872109e-06,
+      "loss": 0.5698,
+      "step": 208
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.015111672496819,
+      "learning_rate": 4.9157990785442964e-06,
+      "loss": 0.5957,
+      "step": 209
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9502065547578398,
+      "learning_rate": 4.9149555547010086e-06,
+      "loss": 0.5592,
+      "step": 210
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.167936522558899,
+      "learning_rate": 4.9141078997851945e-06,
+      "loss": 0.5705,
+      "step": 211
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.2066587458997935,
+      "learning_rate": 4.91325611524687e-06,
+      "loss": 0.5526,
+      "step": 212
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9132995625903553,
+      "learning_rate": 4.9124002025431136e-06,
+      "loss": 0.5767,
+      "step": 213
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.0097281107801277,
+      "learning_rate": 4.91154016313807e-06,
+      "loss": 0.6185,
+      "step": 214
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.023532008241332,
+      "learning_rate": 4.910675998502938e-06,
+      "loss": 0.6005,
+      "step": 215
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9253831001776973,
+      "learning_rate": 4.909807710115977e-06,
+      "loss": 0.5769,
+      "step": 216
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.066862408842564,
+      "learning_rate": 4.908935299462497e-06,
+      "loss": 0.5671,
+      "step": 217
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9412704290792853,
+      "learning_rate": 4.908058768034862e-06,
+      "loss": 0.5568,
+      "step": 218
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.185994457097553,
+      "learning_rate": 4.907178117332487e-06,
+      "loss": 0.5621,
+      "step": 219
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.021517127546353,
+      "learning_rate": 4.906293348861829e-06,
+      "loss": 0.5672,
+      "step": 220
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.099703967072734,
+      "learning_rate": 4.905404464136391e-06,
+      "loss": 0.5366,
+      "step": 221
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.030197056583618,
+      "learning_rate": 4.904511464676718e-06,
+      "loss": 0.6064,
+      "step": 222
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.4170102988954896,
+      "learning_rate": 4.903614352010393e-06,
+      "loss": 0.5919,
+      "step": 223
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0819468873015476,
+      "learning_rate": 4.9027131276720355e-06,
+      "loss": 0.5366,
+      "step": 224
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.148008018153629,
+      "learning_rate": 4.901807793203299e-06,
+      "loss": 0.597,
+      "step": 225
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0303725862017186,
+      "learning_rate": 4.900898350152866e-06,
+      "loss": 0.6394,
+      "step": 226
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1598989214704334,
+      "learning_rate": 4.899984800076449e-06,
+      "loss": 0.5932,
+      "step": 227
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0816312637185255,
+      "learning_rate": 4.899067144536786e-06,
+      "loss": 0.5909,
+      "step": 228
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9024067197329315,
+      "learning_rate": 4.8981453851036365e-06,
+      "loss": 0.5463,
+      "step": 229
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1830926868871043,
+      "learning_rate": 4.897219523353781e-06,
+      "loss": 0.5821,
+      "step": 230
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1156269612794016,
+      "learning_rate": 4.8962895608710195e-06,
+      "loss": 0.5993,
+      "step": 231
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9653407654210864,
+      "learning_rate": 4.895355499246162e-06,
+      "loss": 0.5525,
+      "step": 232
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.367769051061897,
+      "learning_rate": 4.894417340077036e-06,
+      "loss": 0.5683,
+      "step": 233
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.078327064466567,
+      "learning_rate": 4.893475084968474e-06,
+      "loss": 0.6184,
+      "step": 234
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1661882731589475,
+      "learning_rate": 4.8925287355323195e-06,
+      "loss": 0.6321,
+      "step": 235
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.182760952002799,
+      "learning_rate": 4.891578293387413e-06,
+      "loss": 0.6254,
+      "step": 236
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.998723579962691,
+      "learning_rate": 4.890623760159605e-06,
+      "loss": 0.5371,
+      "step": 237
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.319922346931926,
+      "learning_rate": 4.8896651374817365e-06,
+      "loss": 0.5941,
+      "step": 238
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.090735197217999,
+      "learning_rate": 4.888702426993648e-06,
+      "loss": 0.577,
+      "step": 239
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1247199987228558,
+      "learning_rate": 4.887735630342173e-06,
+      "loss": 0.5928,
+      "step": 240
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.33151114429804,
+      "learning_rate": 4.8867647491811315e-06,
+      "loss": 0.5838,
+      "step": 241
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1570026356289147,
+      "learning_rate": 4.885789785171334e-06,
+      "loss": 0.5642,
+      "step": 242
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.049571197047368,
+      "learning_rate": 4.884810739980575e-06,
+      "loss": 0.6684,
+      "step": 243
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.9810062424466381,
+      "learning_rate": 4.883827615283626e-06,
+      "loss": 0.5942,
+      "step": 244
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.145869663660159,
+      "learning_rate": 4.882840412762244e-06,
+      "loss": 0.6356,
+      "step": 245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.19290302186514,
+      "learning_rate": 4.881849134105156e-06,
+      "loss": 0.6189,
+      "step": 246
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.0561043419872984,
+      "learning_rate": 4.880853781008062e-06,
+      "loss": 0.5563,
+      "step": 247
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.8831183793224635,
+      "learning_rate": 4.879854355173638e-06,
+      "loss": 0.5522,
+      "step": 248
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.020981606684741,
+      "learning_rate": 4.878850858311518e-06,
+      "loss": 0.5548,
+      "step": 249
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.060242570493272,
+      "learning_rate": 4.877843292138307e-06,
+      "loss": 0.5715,
+      "step": 250
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.082455778933014,
+      "learning_rate": 4.8768316583775665e-06,
+      "loss": 0.5959,
+      "step": 251
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9830929719438626,
+      "learning_rate": 4.875815958759819e-06,
+      "loss": 0.5813,
+      "step": 252
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9772267506828567,
+      "learning_rate": 4.8747961950225406e-06,
+      "loss": 0.539,
+      "step": 253
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.1492561995002104,
+      "learning_rate": 4.873772368910161e-06,
+      "loss": 0.6059,
+      "step": 254
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.253757247139787,
+      "learning_rate": 4.872744482174058e-06,
+      "loss": 0.5897,
+      "step": 255
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.3282624851882496,
+      "learning_rate": 4.8717125365725545e-06,
+      "loss": 0.5675,
+      "step": 256
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.15573581133063,
+      "learning_rate": 4.8706765338709185e-06,
+      "loss": 0.5958,
+      "step": 257
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.073289220218241,
+      "learning_rate": 4.869636475841358e-06,
+      "loss": 0.6052,
+      "step": 258
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.293714090249444,
+      "learning_rate": 4.8685923642630165e-06,
+      "loss": 0.5786,
+      "step": 259
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9496544276539172,
+      "learning_rate": 4.867544200921974e-06,
+      "loss": 0.6163,
+      "step": 260
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.5267016753690132,
+      "learning_rate": 4.866491987611239e-06,
+      "loss": 0.6223,
+      "step": 261
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.8731249445320794,
+      "learning_rate": 4.865435726130751e-06,
+      "loss": 0.5632,
+      "step": 262
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.3586331105798863,
+      "learning_rate": 4.86437541828737e-06,
+      "loss": 0.5769,
+      "step": 263
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0258106914510585,
+      "learning_rate": 4.863311065894883e-06,
+      "loss": 0.6103,
+      "step": 264
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.2543614390885955,
+      "learning_rate": 4.862242670773991e-06,
+      "loss": 0.5844,
+      "step": 265
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9440299381244668,
+      "learning_rate": 4.861170234752314e-06,
+      "loss": 0.5559,
+      "step": 266
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.254538268495492,
+      "learning_rate": 4.8600937596643815e-06,
+      "loss": 0.5709,
+      "step": 267
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.007651746385687,
+      "learning_rate": 4.8590132473516346e-06,
+      "loss": 0.573,
+      "step": 268
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0735253118288837,
+      "learning_rate": 4.857928699662421e-06,
+      "loss": 0.5954,
+      "step": 269
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.024775417101569,
+      "learning_rate": 4.856840118451989e-06,
+      "loss": 0.5992,
+      "step": 270
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.1043310699945814,
+      "learning_rate": 4.855747505582488e-06,
+      "loss": 0.6507,
+      "step": 271
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0386353328313214,
+      "learning_rate": 4.854650862922965e-06,
+      "loss": 0.5666,
+      "step": 272
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.978698841367705,
+      "learning_rate": 4.853550192349358e-06,
+      "loss": 0.5593,
+      "step": 273
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.9386534247633986,
+      "learning_rate": 4.852445495744497e-06,
+      "loss": 0.5735,
+      "step": 274
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.049346245018599,
+      "learning_rate": 4.8513367749981e-06,
+      "loss": 0.5415,
+      "step": 275
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1051969521216605,
+      "learning_rate": 4.850224032006765e-06,
+      "loss": 0.5532,
+      "step": 276
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2006792558872315,
+      "learning_rate": 4.849107268673975e-06,
+      "loss": 0.5696,
+      "step": 277
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0460787736353647,
+      "learning_rate": 4.847986486910088e-06,
+      "loss": 0.5658,
+      "step": 278
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1161843259225406,
+      "learning_rate": 4.846861688632336e-06,
+      "loss": 0.583,
+      "step": 279
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.8882198480393542,
+      "learning_rate": 4.8457328757648224e-06,
+      "loss": 0.5693,
+      "step": 280
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1578413701109596,
+      "learning_rate": 4.844600050238517e-06,
+      "loss": 0.5409,
+      "step": 281
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.03912467778954,
+      "learning_rate": 4.843463213991255e-06,
+      "loss": 0.5908,
+      "step": 282
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2333462480826247,
+      "learning_rate": 4.842322368967731e-06,
+      "loss": 0.6088,
+      "step": 283
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.06698702157327,
+      "learning_rate": 4.8411775171194986e-06,
+      "loss": 0.5953,
+      "step": 284
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1433923121572045,
+      "learning_rate": 4.840028660404964e-06,
+      "loss": 0.5851,
+      "step": 285
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.214858780835041,
+      "learning_rate": 4.838875800789386e-06,
+      "loss": 0.5913,
+      "step": 286
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.038128612492624,
+      "learning_rate": 4.837718940244871e-06,
+      "loss": 0.5827,
+      "step": 287
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9894065096959768,
+      "learning_rate": 4.836558080750365e-06,
+      "loss": 0.5769,
+      "step": 288
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1711590153285822,
+      "learning_rate": 4.835393224291662e-06,
+      "loss": 0.654,
+      "step": 289
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.105004451988696,
+      "learning_rate": 4.834224372861386e-06,
+      "loss": 0.6158,
+      "step": 290
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9554568023729102,
+      "learning_rate": 4.833051528459001e-06,
+      "loss": 0.5807,
+      "step": 291
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2693917834500312,
+      "learning_rate": 4.831874693090797e-06,
+      "loss": 0.5557,
+      "step": 292
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9081391627126192,
+      "learning_rate": 4.830693868769892e-06,
+      "loss": 0.6057,
+      "step": 293
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2133664110768585,
+      "learning_rate": 4.82950905751623e-06,
+      "loss": 0.6103,
+      "step": 294
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.015392814211589,
+      "learning_rate": 4.8283202613565735e-06,
+      "loss": 0.5578,
+      "step": 295
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.142124020349717,
+      "learning_rate": 4.8271274823245e-06,
+      "loss": 0.5675,
+      "step": 296
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.981611826462286,
+      "learning_rate": 4.825930722460405e-06,
+      "loss": 0.5696,
+      "step": 297
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.966759748348117,
+      "learning_rate": 4.824729983811486e-06,
+      "loss": 0.58,
+      "step": 298
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0117040369769397,
+      "learning_rate": 4.823525268431754e-06,
+      "loss": 0.6005,
+      "step": 299
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9579664917991193,
+      "learning_rate": 4.822316578382019e-06,
+      "loss": 0.5472,
+      "step": 300
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9075723479635032,
+      "learning_rate": 4.821103915729892e-06,
+      "loss": 0.5834,
+      "step": 301
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.289340229011896,
+      "learning_rate": 4.819887282549777e-06,
+      "loss": 0.6088,
+      "step": 302
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0410700553735235,
+      "learning_rate": 4.818666680922874e-06,
+      "loss": 0.5449,
+      "step": 303
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.074434792511819,
+      "learning_rate": 4.8174421129371675e-06,
+      "loss": 0.5826,
+      "step": 304
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.1377170527698865,
+      "learning_rate": 4.816213580687428e-06,
+      "loss": 0.6262,
+      "step": 305
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.060340839248083,
+      "learning_rate": 4.814981086275209e-06,
+      "loss": 0.5479,
+      "step": 306
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.007036467413588,
+      "learning_rate": 4.813744631808841e-06,
+      "loss": 0.5642,
+      "step": 307
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.016779606220332,
+      "learning_rate": 4.8125042194034285e-06,
+      "loss": 0.5503,
+      "step": 308
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.930004252757651,
+      "learning_rate": 4.811259851180845e-06,
+      "loss": 0.582,
+      "step": 309
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9179477992752856,
+      "learning_rate": 4.810011529269734e-06,
+      "loss": 0.5678,
+      "step": 310
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.023430757276848,
+      "learning_rate": 4.808759255805498e-06,
+      "loss": 0.614,
+      "step": 311
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.8334738409404936,
+      "learning_rate": 4.807503032930306e-06,
+      "loss": 0.5742,
+      "step": 312
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.937332706274502,
+      "learning_rate": 4.806242862793075e-06,
+      "loss": 0.6257,
+      "step": 313
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0265383045700363,
+      "learning_rate": 4.8049787475494786e-06,
+      "loss": 0.5733,
+      "step": 314
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.056444039073761,
+      "learning_rate": 4.803710689361939e-06,
+      "loss": 0.578,
+      "step": 315
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.411132719183335,
+      "learning_rate": 4.802438690399622e-06,
+      "loss": 0.5778,
+      "step": 316
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0233969242222853,
+      "learning_rate": 4.801162752838436e-06,
+      "loss": 0.5649,
+      "step": 317
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.2809121915132815,
+      "learning_rate": 4.799882878861025e-06,
+      "loss": 0.5589,
+      "step": 318
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9806834041020271,
+      "learning_rate": 4.798599070656768e-06,
+      "loss": 0.5753,
+      "step": 319
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.095099671577702,
+      "learning_rate": 4.797311330421773e-06,
+      "loss": 0.5644,
+      "step": 320
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1697606190375764,
+      "learning_rate": 4.796019660358877e-06,
+      "loss": 0.6009,
+      "step": 321
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9549416103216173,
+      "learning_rate": 4.794724062677635e-06,
+      "loss": 0.5429,
+      "step": 322
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9986949357292838,
+      "learning_rate": 4.793424539594323e-06,
+      "loss": 0.5456,
+      "step": 323
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9414831957796765,
+      "learning_rate": 4.792121093331935e-06,
+      "loss": 0.5468,
+      "step": 324
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.100702188933012,
+      "learning_rate": 4.7908137261201685e-06,
+      "loss": 0.5763,
+      "step": 325
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.2747471285831025,
+      "learning_rate": 4.789502440195436e-06,
+      "loss": 0.5637,
+      "step": 326
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8996382919319124,
+      "learning_rate": 4.788187237800849e-06,
+      "loss": 0.5285,
+      "step": 327
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.3451495174978847,
+      "learning_rate": 4.786868121186218e-06,
+      "loss": 0.5638,
+      "step": 328
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.0437536068229565,
+      "learning_rate": 4.7855450926080535e-06,
+      "loss": 0.5282,
+      "step": 329
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1185488514745554,
+      "learning_rate": 4.784218154329555e-06,
+      "loss": 0.5689,
+      "step": 330
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.08745956731504,
+      "learning_rate": 4.78288730862061e-06,
+      "loss": 0.5772,
+      "step": 331
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9479507156354359,
+      "learning_rate": 4.781552557757789e-06,
+      "loss": 0.5419,
+      "step": 332
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0211480847937255,
+      "learning_rate": 4.780213904024346e-06,
+      "loss": 0.5757,
+      "step": 333
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9075335749936069,
+      "learning_rate": 4.7788713497102094e-06,
+      "loss": 0.5693,
+      "step": 334
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9590727137410602,
+      "learning_rate": 4.777524897111979e-06,
+      "loss": 0.5501,
+      "step": 335
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0328480247612752,
+      "learning_rate": 4.776174548532926e-06,
+      "loss": 0.587,
+      "step": 336
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.062540517496736,
+      "learning_rate": 4.774820306282982e-06,
+      "loss": 0.5819,
+      "step": 337
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0054452800156195,
+      "learning_rate": 4.773462172678744e-06,
+      "loss": 0.5529,
+      "step": 338
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9641125644599562,
+      "learning_rate": 4.772100150043462e-06,
+      "loss": 0.5895,
+      "step": 339
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9196744569285298,
+      "learning_rate": 4.77073424070704e-06,
+      "loss": 0.5504,
+      "step": 340
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0002752186146484,
+      "learning_rate": 4.76936444700603e-06,
+      "loss": 0.5307,
+      "step": 341
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.1068919823054344,
+      "learning_rate": 4.76799077128363e-06,
+      "loss": 0.5908,
+      "step": 342
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.919597745459612,
+      "learning_rate": 4.766613215889678e-06,
+      "loss": 0.5423,
+      "step": 343
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0670928578728716,
+      "learning_rate": 4.765231783180648e-06,
+      "loss": 0.5901,
+      "step": 344
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.906116148793229,
+      "learning_rate": 4.763846475519648e-06,
+      "loss": 0.5919,
+      "step": 345
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9133575268702454,
+      "learning_rate": 4.762457295276413e-06,
+      "loss": 0.585,
+      "step": 346
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.133902651855379,
+      "learning_rate": 4.7610642448273025e-06,
+      "loss": 0.5444,
+      "step": 347
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.95222194640397,
+      "learning_rate": 4.7596673265552985e-06,
+      "loss": 0.5941,
+      "step": 348
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.095010268380277,
+      "learning_rate": 4.758266542849997e-06,
+      "loss": 0.6045,
+      "step": 349
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0493864712059655,
+      "learning_rate": 4.756861896107609e-06,
+      "loss": 0.6011,
+      "step": 350
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9222198823064967,
+      "learning_rate": 4.755453388730949e-06,
+      "loss": 0.5521,
+      "step": 351
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.368147154955994,
+      "learning_rate": 4.754041023129442e-06,
+      "loss": 0.6117,
+      "step": 352
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9734596786106697,
+      "learning_rate": 4.752624801719108e-06,
+      "loss": 0.5727,
+      "step": 353
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.151510566977991,
+      "learning_rate": 4.751204726922564e-06,
+      "loss": 0.6085,
+      "step": 354
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9291219072892685,
+      "learning_rate": 4.74978080116902e-06,
+      "loss": 0.5655,
+      "step": 355
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.838592559018919,
+      "learning_rate": 4.748353026894273e-06,
+      "loss": 0.5508,
+      "step": 356
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.069156589116884,
+      "learning_rate": 4.7469214065407e-06,
+      "loss": 0.5942,
+      "step": 357
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8960817746615841,
+      "learning_rate": 4.745485942557264e-06,
+      "loss": 0.5902,
+      "step": 358
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.0606557307859634,
+      "learning_rate": 4.744046637399497e-06,
+      "loss": 0.556,
+      "step": 359
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9660065879130573,
+      "learning_rate": 4.742603493529505e-06,
+      "loss": 0.5364,
+      "step": 360
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9647921383638112,
+      "learning_rate": 4.741156513415958e-06,
+      "loss": 0.5601,
+      "step": 361
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.049074688423064,
+      "learning_rate": 4.739705699534092e-06,
+      "loss": 0.556,
+      "step": 362
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.962593945802751,
+      "learning_rate": 4.738251054365697e-06,
+      "loss": 0.5609,
+      "step": 363
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.059675349950347,
+      "learning_rate": 4.736792580399119e-06,
+      "loss": 0.5499,
+      "step": 364
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8479566025134508,
+      "learning_rate": 4.7353302801292555e-06,
+      "loss": 0.5621,
+      "step": 365
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9405450724813613,
+      "learning_rate": 4.733864156057545e-06,
+      "loss": 0.5437,
+      "step": 366
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.122487864033456,
+      "learning_rate": 4.7323942106919715e-06,
+      "loss": 0.5984,
+      "step": 367
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.6822841144123046,
+      "learning_rate": 4.730920446547052e-06,
+      "loss": 0.5951,
+      "step": 368
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.001405394086718,
+      "learning_rate": 4.729442866143838e-06,
+      "loss": 0.5552,
+      "step": 369
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.081154186949651,
+      "learning_rate": 4.72796147200991e-06,
+      "loss": 0.587,
+      "step": 370
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.1196544292473236,
+      "learning_rate": 4.72647626667937e-06,
+      "loss": 0.5882,
+      "step": 371
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.107445583509131,
+      "learning_rate": 4.724987252692841e-06,
+      "loss": 0.5389,
+      "step": 372
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9529785007256542,
+      "learning_rate": 4.723494432597462e-06,
+      "loss": 0.6439,
+      "step": 373
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.11513441515607,
+      "learning_rate": 4.72199780894688e-06,
+      "loss": 0.6089,
+      "step": 374
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9769899713721226,
+      "learning_rate": 4.7204973843012504e-06,
+      "loss": 0.5393,
+      "step": 375
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.063749623036316,
+      "learning_rate": 4.718993161227231e-06,
+      "loss": 0.5987,
+      "step": 376
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.0515862288253883,
+      "learning_rate": 4.717485142297977e-06,
+      "loss": 0.5772,
+      "step": 377
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.8962297741946081,
+      "learning_rate": 4.715973330093135e-06,
+      "loss": 0.5424,
+      "step": 378
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.2210958340400087,
+      "learning_rate": 4.7144577271988435e-06,
+      "loss": 0.6072,
+      "step": 379
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.067113337475314,
+      "learning_rate": 4.712938336207724e-06,
+      "loss": 0.5482,
+      "step": 380
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.8985489253954526,
+      "learning_rate": 4.711415159718876e-06,
+      "loss": 0.5593,
+      "step": 381
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.085236381118245,
+      "learning_rate": 4.709888200337879e-06,
+      "loss": 0.5704,
+      "step": 382
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0967664183909784,
+      "learning_rate": 4.708357460676779e-06,
+      "loss": 0.5997,
+      "step": 383
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0454278026009645,
+      "learning_rate": 4.706822943354092e-06,
+      "loss": 0.5669,
+      "step": 384
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9171673309342674,
+      "learning_rate": 4.705284650994793e-06,
+      "loss": 0.517,
+      "step": 385
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.2003223432761287,
+      "learning_rate": 4.70374258623032e-06,
+      "loss": 0.5957,
+      "step": 386
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.936392519491186,
+      "learning_rate": 4.702196751698557e-06,
+      "loss": 0.5767,
+      "step": 387
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.354272003403086,
+      "learning_rate": 4.700647150043841e-06,
+      "loss": 0.6515,
+      "step": 388
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9115059027323418,
+      "learning_rate": 4.699093783916955e-06,
+      "loss": 0.5579,
+      "step": 389
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9878827587010002,
+      "learning_rate": 4.697536655975115e-06,
+      "loss": 0.572,
+      "step": 390
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9729552535473858,
+      "learning_rate": 4.69597576888198e-06,
+      "loss": 0.5665,
+      "step": 391
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.177634366499155,
+      "learning_rate": 4.694411125307632e-06,
+      "loss": 0.6363,
+      "step": 392
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8955146664976508,
+      "learning_rate": 4.692842727928584e-06,
+      "loss": 0.5682,
+      "step": 393
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.175305874476245,
+      "learning_rate": 4.691270579427769e-06,
+      "loss": 0.5943,
+      "step": 394
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.068140527232831,
+      "learning_rate": 4.689694682494537e-06,
+      "loss": 0.5659,
+      "step": 395
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9112960694448755,
+      "learning_rate": 4.688115039824648e-06,
+      "loss": 0.6048,
+      "step": 396
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9778305624626604,
+      "learning_rate": 4.686531654120272e-06,
+      "loss": 0.5695,
+      "step": 397
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.096904163204813,
+      "learning_rate": 4.684944528089981e-06,
+      "loss": 0.6113,
+      "step": 398
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.0011934144948516,
+      "learning_rate": 4.683353664448745e-06,
+      "loss": 0.5568,
+      "step": 399
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8562851971757464,
+      "learning_rate": 4.681759065917929e-06,
+      "loss": 0.5474,
+      "step": 400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8190547574166316,
+      "learning_rate": 4.680160735225285e-06,
+      "loss": 0.5315,
+      "step": 401
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9247862956929132,
+      "learning_rate": 4.6785586751049505e-06,
+      "loss": 0.5568,
+      "step": 402
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8469793674077621,
+      "learning_rate": 4.676952888297442e-06,
+      "loss": 0.5811,
+      "step": 403
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.946943145198674,
+      "learning_rate": 4.675343377549653e-06,
+      "loss": 0.5475,
+      "step": 404
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.991304422730463,
+      "learning_rate": 4.6737301456148445e-06,
+      "loss": 0.5856,
+      "step": 405
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9168241989446437,
+      "learning_rate": 4.672113195252644e-06,
+      "loss": 0.6069,
+      "step": 406
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9305433665377905,
+      "learning_rate": 4.670492529229039e-06,
+      "loss": 0.5536,
+      "step": 407
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8441008898830742,
+      "learning_rate": 4.668868150316377e-06,
+      "loss": 0.5859,
+      "step": 408
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8879301596961315,
+      "learning_rate": 4.667240061293351e-06,
+      "loss": 0.5483,
+      "step": 409
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.024767417636281,
+      "learning_rate": 4.665608264945004e-06,
+      "loss": 0.5414,
+      "step": 410
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.1331610141797395,
+      "learning_rate": 4.663972764062722e-06,
+      "loss": 0.5811,
+      "step": 411
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8132480265817386,
+      "learning_rate": 4.662333561444226e-06,
+      "loss": 0.5573,
+      "step": 412
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9795813972027145,
+      "learning_rate": 4.6606906598935675e-06,
+      "loss": 0.5814,
+      "step": 413
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8782931074297053,
+      "learning_rate": 4.6590440622211295e-06,
+      "loss": 0.569,
+      "step": 414
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8219945335518706,
+      "learning_rate": 4.657393771243614e-06,
+      "loss": 0.5669,
+      "step": 415
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.4047268604371306,
+      "learning_rate": 4.6557397897840454e-06,
+      "loss": 0.5602,
+      "step": 416
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.064501780523946,
+      "learning_rate": 4.654082120671757e-06,
+      "loss": 0.5699,
+      "step": 417
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9183128854940252,
+      "learning_rate": 4.65242076674239e-06,
+      "loss": 0.6112,
+      "step": 418
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9315698971629633,
+      "learning_rate": 4.650755730837894e-06,
+      "loss": 0.5537,
+      "step": 419
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9527809333659218,
+      "learning_rate": 4.649087015806509e-06,
+      "loss": 0.5423,
+      "step": 420
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8940523915995442,
+      "learning_rate": 4.647414624502777e-06,
+      "loss": 0.5708,
+      "step": 421
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9976964785548623,
+      "learning_rate": 4.645738559787524e-06,
+      "loss": 0.6006,
+      "step": 422
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9098681403283917,
+      "learning_rate": 4.64405882452786e-06,
+      "loss": 0.5591,
+      "step": 423
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8695612182804557,
+      "learning_rate": 4.642375421597175e-06,
+      "loss": 0.5219,
+      "step": 424
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8912077704810082,
+      "learning_rate": 4.6406883538751315e-06,
+      "loss": 0.5224,
+      "step": 425
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9390714726978922,
+      "learning_rate": 4.638997624247664e-06,
+      "loss": 0.5359,
+      "step": 426
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.051545992296337,
+      "learning_rate": 4.637303235606968e-06,
+      "loss": 0.544,
+      "step": 427
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.0657109136265914,
+      "learning_rate": 4.6356051908515e-06,
+      "loss": 0.5429,
+      "step": 428
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0301022307984793,
+      "learning_rate": 4.63390349288597e-06,
+      "loss": 0.5787,
+      "step": 429
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.052515756169346,
+      "learning_rate": 4.632198144621338e-06,
+      "loss": 0.5778,
+      "step": 430
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9741370495474897,
+      "learning_rate": 4.630489148974807e-06,
+      "loss": 0.5142,
+      "step": 431
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9713229498863698,
+      "learning_rate": 4.62877650886982e-06,
+      "loss": 0.6127,
+      "step": 432
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.1609440121306007,
+      "learning_rate": 4.627060227236055e-06,
+      "loss": 0.5886,
+      "step": 433
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.944966445355139,
+      "learning_rate": 4.625340307009418e-06,
+      "loss": 0.5657,
+      "step": 434
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.031003925680835,
+      "learning_rate": 4.623616751132041e-06,
+      "loss": 0.5628,
+      "step": 435
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8774113373137704,
+      "learning_rate": 4.621889562552272e-06,
+      "loss": 0.6068,
+      "step": 436
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0385201543401785,
+      "learning_rate": 4.620158744224677e-06,
+      "loss": 0.5511,
+      "step": 437
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8440750841938207,
+      "learning_rate": 4.618424299110028e-06,
+      "loss": 0.5261,
+      "step": 438
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8978691755923442,
+      "learning_rate": 4.616686230175303e-06,
+      "loss": 0.5862,
+      "step": 439
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8120850246861446,
+      "learning_rate": 4.614944540393679e-06,
+      "loss": 0.5652,
+      "step": 440
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1821084695714914,
+      "learning_rate": 4.613199232744525e-06,
+      "loss": 0.5598,
+      "step": 441
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9626422737625222,
+      "learning_rate": 4.611450310213401e-06,
+      "loss": 0.5267,
+      "step": 442
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9714913234889215,
+      "learning_rate": 4.6096977757920505e-06,
+      "loss": 0.5658,
+      "step": 443
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0179324078198233,
+      "learning_rate": 4.607941632478393e-06,
+      "loss": 0.582,
+      "step": 444
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.8565193856331161,
+      "learning_rate": 4.6061818832765246e-06,
+      "loss": 0.5715,
+      "step": 445
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9798501479599246,
+      "learning_rate": 4.604418531196708e-06,
+      "loss": 0.6007,
+      "step": 446
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0095846956468257,
+      "learning_rate": 4.602651579255369e-06,
+      "loss": 0.5947,
+      "step": 447
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9316541079988245,
+      "learning_rate": 4.600881030475093e-06,
+      "loss": 0.5501,
+      "step": 448
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.080069353365406,
+      "learning_rate": 4.599106887884616e-06,
+      "loss": 0.5631,
+      "step": 449
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.965973137652201,
+      "learning_rate": 4.5973291545188235e-06,
+      "loss": 0.5267,
+      "step": 450
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1082225966704087,
+      "learning_rate": 4.595547833418741e-06,
+      "loss": 0.6418,
+      "step": 451
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0359312594194083,
+      "learning_rate": 4.593762927631536e-06,
+      "loss": 0.5644,
+      "step": 452
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.1254892914109433,
+      "learning_rate": 4.591974440210502e-06,
+      "loss": 0.5693,
+      "step": 453
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9121188587334927,
+      "learning_rate": 4.590182374215064e-06,
+      "loss": 0.5572,
+      "step": 454
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9348642624953207,
+      "learning_rate": 4.588386732710765e-06,
+      "loss": 0.5446,
+      "step": 455
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8667846547370581,
+      "learning_rate": 4.5865875187692695e-06,
+      "loss": 0.5681,
+      "step": 456
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9219061327454674,
+      "learning_rate": 4.5847847354683465e-06,
+      "loss": 0.5508,
+      "step": 457
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8106132369123122,
+      "learning_rate": 4.5829783858918756e-06,
+      "loss": 0.5626,
+      "step": 458
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.7827483964442634,
+      "learning_rate": 4.5811684731298355e-06,
+      "loss": 0.5575,
+      "step": 459
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9284196979863513,
+      "learning_rate": 4.5793550002783e-06,
+      "loss": 0.5363,
+      "step": 460
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.029647468705457,
+      "learning_rate": 4.577537970439433e-06,
+      "loss": 0.5415,
+      "step": 461
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.0997127029950087,
+      "learning_rate": 4.575717386721482e-06,
+      "loss": 0.5814,
+      "step": 462
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9589290300656341,
+      "learning_rate": 4.573893252238777e-06,
+      "loss": 0.5156,
+      "step": 463
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.905237143908251,
+      "learning_rate": 4.572065570111717e-06,
+      "loss": 0.5536,
+      "step": 464
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.929519794935609,
+      "learning_rate": 4.570234343466775e-06,
+      "loss": 0.5879,
+      "step": 465
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.096095808886982,
+      "learning_rate": 4.568399575436484e-06,
+      "loss": 0.6241,
+      "step": 466
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9486118894048778,
+      "learning_rate": 4.566561269159437e-06,
+      "loss": 0.6307,
+      "step": 467
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.0839490306744586,
+      "learning_rate": 4.564719427780276e-06,
+      "loss": 0.5655,
+      "step": 468
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9439525665822102,
+      "learning_rate": 4.562874054449694e-06,
+      "loss": 0.5437,
+      "step": 469
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9409142791465297,
+      "learning_rate": 4.5610251523244244e-06,
+      "loss": 0.6429,
+      "step": 470
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8664574493795525,
+      "learning_rate": 4.559172724567238e-06,
+      "loss": 0.5826,
+      "step": 471
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.80819349503324,
+      "learning_rate": 4.557316774346934e-06,
+      "loss": 0.5372,
+      "step": 472
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8680097526865296,
+      "learning_rate": 4.555457304838341e-06,
+      "loss": 0.5503,
+      "step": 473
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.7466938790815696,
+      "learning_rate": 4.553594319222303e-06,
+      "loss": 0.5425,
+      "step": 474
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9610557658505607,
+      "learning_rate": 4.551727820685684e-06,
+      "loss": 0.5755,
+      "step": 475
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9414839604282412,
+      "learning_rate": 4.549857812421353e-06,
+      "loss": 0.5915,
+      "step": 476
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8484957644576423,
+      "learning_rate": 4.547984297628186e-06,
+      "loss": 0.5676,
+      "step": 477
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.074524028551078,
+      "learning_rate": 4.546107279511055e-06,
+      "loss": 0.6084,
+      "step": 478
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.069692704122282,
+      "learning_rate": 4.544226761280826e-06,
+      "loss": 0.5676,
+      "step": 479
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8975472248317244,
+      "learning_rate": 4.54234274615435e-06,
+      "loss": 0.5904,
+      "step": 480
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0118868982719897,
+      "learning_rate": 4.540455237354466e-06,
+      "loss": 0.5722,
+      "step": 481
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9733105429381828,
+      "learning_rate": 4.5385642381099814e-06,
+      "loss": 0.6112,
+      "step": 482
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.862156914026863,
+      "learning_rate": 4.53666975165568e-06,
+      "loss": 0.5951,
+      "step": 483
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9512940035297868,
+      "learning_rate": 4.53477178123231e-06,
+      "loss": 0.5223,
+      "step": 484
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9202464191558823,
+      "learning_rate": 4.532870330086577e-06,
+      "loss": 0.5638,
+      "step": 485
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9015767656854419,
+      "learning_rate": 4.530965401471143e-06,
+      "loss": 0.5911,
+      "step": 486
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.95190921973106,
+      "learning_rate": 4.529056998644619e-06,
+      "loss": 0.6053,
+      "step": 487
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0058459596081644,
+      "learning_rate": 4.527145124871556e-06,
+      "loss": 0.5466,
+      "step": 488
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8902620959998047,
+      "learning_rate": 4.5252297834224454e-06,
+      "loss": 0.5526,
+      "step": 489
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.985466416169018,
+      "learning_rate": 4.523310977573711e-06,
+      "loss": 0.5958,
+      "step": 490
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.1140148957176415,
+      "learning_rate": 4.521388710607699e-06,
+      "loss": 0.613,
+      "step": 491
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9470601192089525,
+      "learning_rate": 4.51946298581268e-06,
+      "loss": 0.5847,
+      "step": 492
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0227057176069603,
+      "learning_rate": 4.51753380648284e-06,
+      "loss": 0.5784,
+      "step": 493
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.05501863673554,
+      "learning_rate": 4.515601175918269e-06,
+      "loss": 0.5501,
+      "step": 494
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0129325402811715,
+      "learning_rate": 4.513665097424967e-06,
+      "loss": 0.5641,
+      "step": 495
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0322333044110468,
+      "learning_rate": 4.51172557431483e-06,
+      "loss": 0.5422,
+      "step": 496
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9573055659958774,
+      "learning_rate": 4.509782609905644e-06,
+      "loss": 0.516,
+      "step": 497
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8223127451485421,
+      "learning_rate": 4.507836207521085e-06,
+      "loss": 0.5714,
+      "step": 498
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9343089861079434,
+      "learning_rate": 4.50588637049071e-06,
+      "loss": 0.5424,
+      "step": 499
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8940990649350729,
+      "learning_rate": 4.503933102149948e-06,
+      "loss": 0.5832,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.908617301933682,
+      "learning_rate": 4.501976405840101e-06,
+      "loss": 0.5399,
+      "step": 501
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8290259512093785,
+      "learning_rate": 4.500016284908334e-06,
+      "loss": 0.5561,
+      "step": 502
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9840280991844164,
+      "learning_rate": 4.49805274270767e-06,
+      "loss": 0.5645,
+      "step": 503
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9864953051636856,
+      "learning_rate": 4.496085782596984e-06,
+      "loss": 0.5369,
+      "step": 504
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.979387839103732,
+      "learning_rate": 4.494115407940999e-06,
+      "loss": 0.6196,
+      "step": 505
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9266869362165981,
+      "learning_rate": 4.492141622110279e-06,
+      "loss": 0.5687,
+      "step": 506
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9887461782376619,
+      "learning_rate": 4.4901644284812205e-06,
+      "loss": 0.5264,
+      "step": 507
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8717867803152208,
+      "learning_rate": 4.488183830436052e-06,
+      "loss": 0.5612,
+      "step": 508
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.0044226171493,
+      "learning_rate": 4.486199831362828e-06,
+      "loss": 0.5571,
+      "step": 509
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.1075571016617958,
+      "learning_rate": 4.484212434655414e-06,
+      "loss": 0.5642,
+      "step": 510
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8031612547539957,
+      "learning_rate": 4.482221643713494e-06,
+      "loss": 0.5805,
+      "step": 511
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8782516337672304,
+      "learning_rate": 4.480227461942556e-06,
+      "loss": 0.5596,
+      "step": 512
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.075073901596185,
+      "learning_rate": 4.478229892753886e-06,
+      "loss": 0.6124,
+      "step": 513
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0588983460568304,
+      "learning_rate": 4.47622893956457e-06,
+      "loss": 0.5589,
+      "step": 514
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.850248236464706,
+      "learning_rate": 4.474224605797476e-06,
+      "loss": 0.5603,
+      "step": 515
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.932844310652863,
+      "learning_rate": 4.472216894881261e-06,
+      "loss": 0.5571,
+      "step": 516
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.09975454805468,
+      "learning_rate": 4.470205810250357e-06,
+      "loss": 0.5975,
+      "step": 517
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.9694087093010304,
+      "learning_rate": 4.468191355344965e-06,
+      "loss": 0.5698,
+      "step": 518
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8794788153917539,
+      "learning_rate": 4.466173533611053e-06,
+      "loss": 0.5559,
+      "step": 519
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0650455557855434,
+      "learning_rate": 4.46415234850035e-06,
+      "loss": 0.5644,
+      "step": 520
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0062649027982022,
+      "learning_rate": 4.462127803470334e-06,
+      "loss": 0.608,
+      "step": 521
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.043267877462657,
+      "learning_rate": 4.460099901984235e-06,
+      "loss": 0.573,
+      "step": 522
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.056372436619027,
+      "learning_rate": 4.4580686475110235e-06,
+      "loss": 0.5748,
+      "step": 523
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8871033520138176,
+      "learning_rate": 4.456034043525404e-06,
+      "loss": 0.5339,
+      "step": 524
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.889474616209236,
+      "learning_rate": 4.45399609350781e-06,
+      "loss": 0.5185,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9767406217632912,
+      "learning_rate": 4.451954800944405e-06,
+      "loss": 0.5758,
+      "step": 526
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9588695861513832,
+      "learning_rate": 4.449910169327062e-06,
+      "loss": 0.5472,
+      "step": 527
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8852210889000718,
+      "learning_rate": 4.447862202153372e-06,
+      "loss": 0.5917,
+      "step": 528
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0103638871993077,
+      "learning_rate": 4.445810902926629e-06,
+      "loss": 0.5761,
+      "step": 529
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.201836945389513,
+      "learning_rate": 4.443756275155827e-06,
+      "loss": 0.5614,
+      "step": 530
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.900702305836831,
+      "learning_rate": 4.441698322355656e-06,
+      "loss": 0.5254,
+      "step": 531
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.134694583439314,
+      "learning_rate": 4.4396370480464915e-06,
+      "loss": 0.5607,
+      "step": 532
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8073751630381198,
+      "learning_rate": 4.437572455754391e-06,
+      "loss": 0.536,
+      "step": 533
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9607338020142653,
+      "learning_rate": 4.435504549011088e-06,
+      "loss": 0.59,
+      "step": 534
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0756430867435274,
+      "learning_rate": 4.433433331353988e-06,
+      "loss": 0.5538,
+      "step": 535
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8280570853718465,
+      "learning_rate": 4.431358806326158e-06,
+      "loss": 0.5789,
+      "step": 536
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2005143967434977,
+      "learning_rate": 4.429280977476321e-06,
+      "loss": 0.545,
+      "step": 537
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.896479397543979,
+      "learning_rate": 4.4271998483588565e-06,
+      "loss": 0.5791,
+      "step": 538
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.117773381781195,
+      "learning_rate": 4.425115422533785e-06,
+      "loss": 0.5234,
+      "step": 539
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.4438942429566617,
+      "learning_rate": 4.423027703566769e-06,
+      "loss": 0.5692,
+      "step": 540
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.873481152225171,
+      "learning_rate": 4.4209366950291025e-06,
+      "loss": 0.5739,
+      "step": 541
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8655199147974673,
+      "learning_rate": 4.4188424004977085e-06,
+      "loss": 0.5795,
+      "step": 542
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.948840412241188,
+      "learning_rate": 4.416744823555129e-06,
+      "loss": 0.5304,
+      "step": 543
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8389034133315045,
+      "learning_rate": 4.414643967789523e-06,
+      "loss": 0.5076,
+      "step": 544
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8269235720085213,
+      "learning_rate": 4.412539836794657e-06,
+      "loss": 0.5837,
+      "step": 545
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.1298715969759505,
+      "learning_rate": 4.410432434169902e-06,
+      "loss": 0.5694,
+      "step": 546
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0057741366005746,
+      "learning_rate": 4.408321763520223e-06,
+      "loss": 0.557,
+      "step": 547
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.7901331374893255,
+      "learning_rate": 4.406207828456177e-06,
+      "loss": 0.5746,
+      "step": 548
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.1994839889416187,
+      "learning_rate": 4.404090632593904e-06,
+      "loss": 0.5407,
+      "step": 549
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9664921082690268,
+      "learning_rate": 4.401970179555123e-06,
+      "loss": 0.5322,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9933486180243851,
+      "learning_rate": 4.399846472967124e-06,
+      "loss": 0.5798,
+      "step": 551
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.986612256562151,
+      "learning_rate": 4.397719516462765e-06,
+      "loss": 0.5213,
+      "step": 552
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.046550123292336,
+      "learning_rate": 4.395589313680459e-06,
+      "loss": 0.5857,
+      "step": 553
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.7902327250340486,
+      "learning_rate": 4.393455868264176e-06,
+      "loss": 0.555,
+      "step": 554
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.0203627138517146,
+      "learning_rate": 4.391319183863432e-06,
+      "loss": 0.6329,
+      "step": 555
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9373549045181289,
+      "learning_rate": 4.389179264133281e-06,
+      "loss": 0.566,
+      "step": 556
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8936753353678124,
+      "learning_rate": 4.387036112734316e-06,
+      "loss": 0.5555,
+      "step": 557
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8493817575820743,
+      "learning_rate": 4.3848897333326545e-06,
+      "loss": 0.5427,
+      "step": 558
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9119588677783816,
+      "learning_rate": 4.382740129599937e-06,
+      "loss": 0.5157,
+      "step": 559
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8190137094200924,
+      "learning_rate": 4.380587305213321e-06,
+      "loss": 0.503,
+      "step": 560
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9891332712764953,
+      "learning_rate": 4.37843126385547e-06,
+      "loss": 0.5761,
+      "step": 561
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8620896547461154,
+      "learning_rate": 4.376272009214555e-06,
+      "loss": 0.5259,
+      "step": 562
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8896721756477406,
+      "learning_rate": 4.37410954498424e-06,
+      "loss": 0.5632,
+      "step": 563
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8302281976781984,
+      "learning_rate": 4.37194387486368e-06,
+      "loss": 0.5612,
+      "step": 564
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0721820586440165,
+      "learning_rate": 4.369775002557516e-06,
+      "loss": 0.533,
+      "step": 565
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8259926551813157,
+      "learning_rate": 4.367602931775865e-06,
+      "loss": 0.526,
+      "step": 566
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8096334574000785,
+      "learning_rate": 4.3654276662343155e-06,
+      "loss": 0.5306,
+      "step": 567
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9675637591445598,
+      "learning_rate": 4.363249209653922e-06,
+      "loss": 0.5577,
+      "step": 568
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8800389115841605,
+      "learning_rate": 4.361067565761197e-06,
+      "loss": 0.5553,
+      "step": 569
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.827485496395265,
+      "learning_rate": 4.358882738288105e-06,
+      "loss": 0.5587,
+      "step": 570
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.820954908943235,
+      "learning_rate": 4.356694730972056e-06,
+      "loss": 0.6186,
+      "step": 571
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.952072431699686,
+      "learning_rate": 4.3545035475559025e-06,
+      "loss": 0.5488,
+      "step": 572
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8292648968688423,
+      "learning_rate": 4.352309191787924e-06,
+      "loss": 0.5534,
+      "step": 573
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.826293122529813,
+      "learning_rate": 4.350111667421835e-06,
+      "loss": 0.5872,
+      "step": 574
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9251425791166785,
+      "learning_rate": 4.347910978216763e-06,
+      "loss": 0.5298,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8330818196811385,
+      "learning_rate": 4.345707127937253e-06,
+      "loss": 0.5871,
+      "step": 576
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.7842986545873851,
+      "learning_rate": 4.3435001203532555e-06,
+      "loss": 0.4898,
+      "step": 577
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8778666245156521,
+      "learning_rate": 4.341289959240124e-06,
+      "loss": 0.5385,
+      "step": 578
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9300679499181266,
+      "learning_rate": 4.339076648378605e-06,
+      "loss": 0.5698,
+      "step": 579
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9440861965960357,
+      "learning_rate": 4.336860191554833e-06,
+      "loss": 0.5984,
+      "step": 580
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.929951096053947,
+      "learning_rate": 4.3346405925603265e-06,
+      "loss": 0.6222,
+      "step": 581
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9138258400335695,
+      "learning_rate": 4.332417855191974e-06,
+      "loss": 0.5498,
+      "step": 582
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.058548455869675,
+      "learning_rate": 4.330191983252039e-06,
+      "loss": 0.5218,
+      "step": 583
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.243429045583125,
+      "learning_rate": 4.327962980548142e-06,
+      "loss": 0.5768,
+      "step": 584
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9213537104634244,
+      "learning_rate": 4.32573085089326e-06,
+      "loss": 0.5784,
+      "step": 585
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9165291289119128,
+      "learning_rate": 4.32349559810572e-06,
+      "loss": 0.5697,
+      "step": 586
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9674279518735756,
+      "learning_rate": 4.321257226009193e-06,
+      "loss": 0.5104,
+      "step": 587
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9051339015323923,
+      "learning_rate": 4.319015738432683e-06,
+      "loss": 0.5711,
+      "step": 588
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.957357618850765,
+      "learning_rate": 4.3167711392105245e-06,
+      "loss": 0.5854,
+      "step": 589
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9859311708308915,
+      "learning_rate": 4.314523432182376e-06,
+      "loss": 0.547,
+      "step": 590
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.773704456523191,
+      "learning_rate": 4.312272621193209e-06,
+      "loss": 0.5259,
+      "step": 591
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.82988033655793,
+      "learning_rate": 4.31001871009331e-06,
+      "loss": 0.5209,
+      "step": 592
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8925134832060522,
+      "learning_rate": 4.307761702738264e-06,
+      "loss": 0.59,
+      "step": 593
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8477075780641046,
+      "learning_rate": 4.305501602988953e-06,
+      "loss": 0.5714,
+      "step": 594
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8568432886623798,
+      "learning_rate": 4.303238414711552e-06,
+      "loss": 0.5877,
+      "step": 595
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8179798660158206,
+      "learning_rate": 4.3009721417775166e-06,
+      "loss": 0.6029,
+      "step": 596
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8494963193854803,
+      "learning_rate": 4.29870278806358e-06,
+      "loss": 0.5236,
+      "step": 597
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9586017397154731,
+      "learning_rate": 4.296430357451744e-06,
+      "loss": 0.5998,
+      "step": 598
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.926616057974202,
+      "learning_rate": 4.2941548538292765e-06,
+      "loss": 0.5914,
+      "step": 599
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9321738359144827,
+      "learning_rate": 4.291876281088701e-06,
+      "loss": 0.5358,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8229177571361932,
+      "learning_rate": 4.289594643127788e-06,
+      "loss": 0.5284,
+      "step": 601
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.849252449531427,
+      "learning_rate": 4.287309943849558e-06,
+      "loss": 0.5689,
+      "step": 602
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.985343175388319,
+      "learning_rate": 4.285022187162261e-06,
+      "loss": 0.6101,
+      "step": 603
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9437791826489255,
+      "learning_rate": 4.2827313769793835e-06,
+      "loss": 0.5419,
+      "step": 604
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8027421078538746,
+      "learning_rate": 4.28043751721963e-06,
+      "loss": 0.5504,
+      "step": 605
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8221230935939319,
+      "learning_rate": 4.278140611806926e-06,
+      "loss": 0.5284,
+      "step": 606
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8597205853821357,
+      "learning_rate": 4.275840664670403e-06,
+      "loss": 0.623,
+      "step": 607
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.7801370844338822,
+      "learning_rate": 4.2735376797444e-06,
+      "loss": 0.5265,
+      "step": 608
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9028094416250234,
+      "learning_rate": 4.271231660968449e-06,
+      "loss": 0.5764,
+      "step": 609
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9385737581380094,
+      "learning_rate": 4.268922612287273e-06,
+      "loss": 0.6047,
+      "step": 610
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.760006169733744,
+      "learning_rate": 4.266610537650778e-06,
+      "loss": 0.4944,
+      "step": 611
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.857083980479501,
+      "learning_rate": 4.264295441014047e-06,
+      "loss": 0.5174,
+      "step": 612
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8299942480819913,
+      "learning_rate": 4.261977326337332e-06,
+      "loss": 0.5814,
+      "step": 613
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8943903433033418,
+      "learning_rate": 4.259656197586046e-06,
+      "loss": 0.5514,
+      "step": 614
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.7839062839610529,
+      "learning_rate": 4.257332058730761e-06,
+      "loss": 0.5857,
+      "step": 615
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.7188975139736256,
+      "learning_rate": 4.255004913747196e-06,
+      "loss": 0.5509,
+      "step": 616
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8767461602206779,
+      "learning_rate": 4.252674766616212e-06,
+      "loss": 0.5038,
+      "step": 617
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8391588901867753,
+      "learning_rate": 4.250341621323809e-06,
+      "loss": 0.5196,
+      "step": 618
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8106924420187829,
+      "learning_rate": 4.248005481861111e-06,
+      "loss": 0.5458,
+      "step": 619
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9698953511074666,
+      "learning_rate": 4.245666352224367e-06,
+      "loss": 0.5963,
+      "step": 620
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8890424031569348,
+      "learning_rate": 4.243324236414939e-06,
+      "loss": 0.5277,
+      "step": 621
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8537879418167673,
+      "learning_rate": 4.240979138439301e-06,
+      "loss": 0.5407,
+      "step": 622
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9264981771759184,
+      "learning_rate": 4.238631062309023e-06,
+      "loss": 0.5788,
+      "step": 623
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.949693389062837,
+      "learning_rate": 4.236280012040773e-06,
+      "loss": 0.5007,
+      "step": 624
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8845778025905608,
+      "learning_rate": 4.233925991656307e-06,
+      "loss": 0.5905,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8977167810192608,
+      "learning_rate": 4.231569005182459e-06,
+      "loss": 0.5342,
+      "step": 626
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9579196623045914,
+      "learning_rate": 4.229209056651139e-06,
+      "loss": 0.554,
+      "step": 627
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8427820272426025,
+      "learning_rate": 4.226846150099324e-06,
+      "loss": 0.5629,
+      "step": 628
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.865218131227253,
+      "learning_rate": 4.22448028956905e-06,
+      "loss": 0.558,
+      "step": 629
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.7348773966225364,
+      "learning_rate": 4.222111479107406e-06,
+      "loss": 0.5332,
+      "step": 630
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.779367140127678,
+      "learning_rate": 4.219739722766528e-06,
+      "loss": 0.569,
+      "step": 631
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.92860570712595,
+      "learning_rate": 4.217365024603592e-06,
+      "loss": 0.5342,
+      "step": 632
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.946965997476449,
+      "learning_rate": 4.214987388680804e-06,
+      "loss": 0.5482,
+      "step": 633
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7930454990298659,
+      "learning_rate": 4.212606819065399e-06,
+      "loss": 0.5376,
+      "step": 634
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8379498458279013,
+      "learning_rate": 4.210223319829626e-06,
+      "loss": 0.5741,
+      "step": 635
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.742977498596499,
+      "learning_rate": 4.207836895050748e-06,
+      "loss": 0.5569,
+      "step": 636
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.852541709372898,
+      "learning_rate": 4.205447548811032e-06,
+      "loss": 0.578,
+      "step": 637
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8180259569107267,
+      "learning_rate": 4.203055285197745e-06,
+      "loss": 0.5189,
+      "step": 638
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8177842562763082,
+      "learning_rate": 4.20066010830314e-06,
+      "loss": 0.5424,
+      "step": 639
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8068654723170434,
+      "learning_rate": 4.198262022224457e-06,
+      "loss": 0.5336,
+      "step": 640
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.9664843499052276,
+      "learning_rate": 4.195861031063909e-06,
+      "loss": 0.5399,
+      "step": 641
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7812265481792608,
+      "learning_rate": 4.193457138928683e-06,
+      "loss": 0.534,
+      "step": 642
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.908377487778027,
+      "learning_rate": 4.191050349930925e-06,
+      "loss": 0.5831,
+      "step": 643
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8124678634933105,
+      "learning_rate": 4.18864066818774e-06,
+      "loss": 0.5309,
+      "step": 644
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.902443199964304,
+      "learning_rate": 4.186228097821176e-06,
+      "loss": 0.5452,
+      "step": 645
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9694387068719457,
+      "learning_rate": 4.183812642958227e-06,
+      "loss": 0.5462,
+      "step": 646
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945352264767711,
+      "learning_rate": 4.181394307730819e-06,
+      "loss": 0.4853,
+      "step": 647
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7967416728436914,
+      "learning_rate": 4.178973096275806e-06,
+      "loss": 0.5952,
+      "step": 648
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.0602433101771616,
+      "learning_rate": 4.176549012734963e-06,
+      "loss": 0.6346,
+      "step": 649
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9158731498204968,
+      "learning_rate": 4.1741220612549746e-06,
+      "loss": 0.5101,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.951875972207364,
+      "learning_rate": 4.171692245987436e-06,
+      "loss": 0.5718,
+      "step": 651
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.871788727804539,
+      "learning_rate": 4.169259571088839e-06,
+      "loss": 0.5516,
+      "step": 652
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945571804366465,
+      "learning_rate": 4.166824040720566e-06,
+      "loss": 0.5544,
+      "step": 653
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.8975723622706568,
+      "learning_rate": 4.1643856590488866e-06,
+      "loss": 0.5643,
+      "step": 654
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9772846459626554,
+      "learning_rate": 4.161944430244945e-06,
+      "loss": 0.5487,
+      "step": 655
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.036472038769578,
+      "learning_rate": 4.159500358484759e-06,
+      "loss": 0.5232,
+      "step": 656
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7742095436926848,
+      "learning_rate": 4.157053447949206e-06,
+      "loss": 0.4963,
+      "step": 657
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.1819742476725814,
+      "learning_rate": 4.154603702824023e-06,
+      "loss": 0.5416,
+      "step": 658
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9151345309457093,
+      "learning_rate": 4.152151127299794e-06,
+      "loss": 0.5822,
+      "step": 659
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.033640859083771,
+      "learning_rate": 4.149695725571944e-06,
+      "loss": 0.5876,
+      "step": 660
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.8935471013235925,
+      "learning_rate": 4.147237501840734e-06,
+      "loss": 0.548,
+      "step": 661
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7836299476774775,
+      "learning_rate": 4.144776460311253e-06,
+      "loss": 0.5274,
+      "step": 662
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.194666072449123,
+      "learning_rate": 4.142312605193407e-06,
+      "loss": 0.5934,
+      "step": 663
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.988265407508224,
+      "learning_rate": 4.13984594070192e-06,
+      "loss": 0.5539,
+      "step": 664
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7594955740187146,
+      "learning_rate": 4.137376471056317e-06,
+      "loss": 0.5324,
+      "step": 665
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9342530277100989,
+      "learning_rate": 4.1349042004809224e-06,
+      "loss": 0.5902,
+      "step": 666
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9757082453588417,
+      "learning_rate": 4.132429133204856e-06,
+      "loss": 0.5874,
+      "step": 667
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7792467343474774,
+      "learning_rate": 4.129951273462016e-06,
+      "loss": 0.5516,
+      "step": 668
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9010392264817964,
+      "learning_rate": 4.127470625491082e-06,
+      "loss": 0.5793,
+      "step": 669
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.054505290884914,
+      "learning_rate": 4.1249871935355e-06,
+      "loss": 0.5718,
+      "step": 670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8010036617727825,
+      "learning_rate": 4.1225009818434805e-06,
+      "loss": 0.5698,
+      "step": 671
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.975020822034628,
+      "learning_rate": 4.120011994667988e-06,
+      "loss": 0.5739,
+      "step": 672
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9801075045379748,
+      "learning_rate": 4.117520236266734e-06,
+      "loss": 0.5589,
+      "step": 673
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7773808874926829,
+      "learning_rate": 4.115025710902173e-06,
+      "loss": 0.5276,
+      "step": 674
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.890298398205481,
+      "learning_rate": 4.112528422841491e-06,
+      "loss": 0.4914,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9087570296379215,
+      "learning_rate": 4.110028376356599e-06,
+      "loss": 0.5412,
+      "step": 676
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8908271691889404,
+      "learning_rate": 4.1075255757241295e-06,
+      "loss": 0.5618,
+      "step": 677
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.024312170169272,
+      "learning_rate": 4.105020025225423e-06,
+      "loss": 0.5618,
+      "step": 678
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8072403207581518,
+      "learning_rate": 4.102511729146528e-06,
+      "loss": 0.5744,
+      "step": 679
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7750572145097157,
+      "learning_rate": 4.100000691778185e-06,
+      "loss": 0.5716,
+      "step": 680
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8778337896632162,
+      "learning_rate": 4.097486917415827e-06,
+      "loss": 0.5683,
+      "step": 681
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9710167098273688,
+      "learning_rate": 4.094970410359568e-06,
+      "loss": 0.5273,
+      "step": 682
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9136975523972874,
+      "learning_rate": 4.092451174914196e-06,
+      "loss": 0.5239,
+      "step": 683
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.929344793900944,
+      "learning_rate": 4.089929215389167e-06,
+      "loss": 0.5388,
+      "step": 684
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.7211535229712278,
+      "learning_rate": 4.087404536098597e-06,
+      "loss": 0.5068,
+      "step": 685
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8739637749458882,
+      "learning_rate": 4.084877141361254e-06,
+      "loss": 0.5537,
+      "step": 686
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9268469960932768,
+      "learning_rate": 4.082347035500553e-06,
+      "loss": 0.5875,
+      "step": 687
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.896542320004603,
+      "learning_rate": 4.079814222844541e-06,
+      "loss": 0.5314,
+      "step": 688
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.723925126440519,
+      "learning_rate": 4.077278707725904e-06,
+      "loss": 0.5009,
+      "step": 689
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8345210205201996,
+      "learning_rate": 4.074740494481942e-06,
+      "loss": 0.5544,
+      "step": 690
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.766819080519227,
+      "learning_rate": 4.072199587454578e-06,
+      "loss": 0.5393,
+      "step": 691
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9577975399484282,
+      "learning_rate": 4.069655990990337e-06,
+      "loss": 0.5357,
+      "step": 692
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8254761359015224,
+      "learning_rate": 4.06710970944035e-06,
+      "loss": 0.5797,
+      "step": 693
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.1203973374999214,
+      "learning_rate": 4.064560747160337e-06,
+      "loss": 0.5811,
+      "step": 694
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.9066221824053846,
+      "learning_rate": 4.062009108510605e-06,
+      "loss": 0.5014,
+      "step": 695
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.951489716071849,
+      "learning_rate": 4.059454797856039e-06,
+      "loss": 0.529,
+      "step": 696
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8402907113209426,
+      "learning_rate": 4.056897819566096e-06,
+      "loss": 0.4942,
+      "step": 697
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0368715640768498,
+      "learning_rate": 4.0543381780147965e-06,
+      "loss": 0.5245,
+      "step": 698
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8154462049772704,
+      "learning_rate": 4.0517758775807135e-06,
+      "loss": 0.4979,
+      "step": 699
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.890388895335948,
+      "learning_rate": 4.049210922646973e-06,
+      "loss": 0.5212,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0215900504030166,
+      "learning_rate": 4.046643317601237e-06,
+      "loss": 0.5384,
+      "step": 701
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.816997259900234,
+      "learning_rate": 4.0440730668357076e-06,
+      "loss": 0.492,
+      "step": 702
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.968633766153865,
+      "learning_rate": 4.0415001747471036e-06,
+      "loss": 0.5917,
+      "step": 703
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8313487810801756,
+      "learning_rate": 4.0389246457366696e-06,
+      "loss": 0.5561,
+      "step": 704
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.7954421155528784,
+      "learning_rate": 4.036346484210159e-06,
+      "loss": 0.5383,
+      "step": 705
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8517101217315919,
+      "learning_rate": 4.033765694577826e-06,
+      "loss": 0.5368,
+      "step": 706
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8888441616203875,
+      "learning_rate": 4.031182281254423e-06,
+      "loss": 0.5895,
+      "step": 707
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8131436351862782,
+      "learning_rate": 4.028596248659191e-06,
+      "loss": 0.5346,
+      "step": 708
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8803113487311214,
+      "learning_rate": 4.0260076012158486e-06,
+      "loss": 0.4987,
+      "step": 709
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8989122650791335,
+      "learning_rate": 4.023416343352589e-06,
+      "loss": 0.5007,
+      "step": 710
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.9466291969735336,
+      "learning_rate": 4.020822479502074e-06,
+      "loss": 0.5868,
+      "step": 711
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.869533367998661,
+      "learning_rate": 4.018226014101418e-06,
+      "loss": 0.5995,
+      "step": 712
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.93738608926368,
+      "learning_rate": 4.015626951592187e-06,
+      "loss": 0.5625,
+      "step": 713
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8485080870897803,
+      "learning_rate": 4.013025296420394e-06,
+      "loss": 0.5585,
+      "step": 714
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8099669115387913,
+      "learning_rate": 4.010421053036481e-06,
+      "loss": 0.5384,
+      "step": 715
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8810123612010912,
+      "learning_rate": 4.007814225895321e-06,
+      "loss": 0.5589,
+      "step": 716
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8692823610937885,
+      "learning_rate": 4.005204819456205e-06,
+      "loss": 0.5474,
+      "step": 717
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8120887102918588,
+      "learning_rate": 4.00259283818284e-06,
+      "loss": 0.5138,
+      "step": 718
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7933926935301234,
+      "learning_rate": 3.999978286543331e-06,
+      "loss": 0.5235,
+      "step": 719
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8382360731306235,
+      "learning_rate": 3.997361169010187e-06,
+      "loss": 0.5846,
+      "step": 720
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.993925306673069,
+      "learning_rate": 3.994741490060301e-06,
+      "loss": 0.5561,
+      "step": 721
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.900088669959918,
+      "learning_rate": 3.9921192541749505e-06,
+      "loss": 0.5215,
+      "step": 722
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9250072769385074,
+      "learning_rate": 3.989494465839785e-06,
+      "loss": 0.54,
+      "step": 723
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7928905908766457,
+      "learning_rate": 3.986867129544822e-06,
+      "loss": 0.6066,
+      "step": 724
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9474900039545116,
+      "learning_rate": 3.984237249784437e-06,
+      "loss": 0.5173,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9004077336349998,
+      "learning_rate": 3.981604831057357e-06,
+      "loss": 0.5409,
+      "step": 726
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7573843693188624,
+      "learning_rate": 3.97896987786665e-06,
+      "loss": 0.5239,
+      "step": 727
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.899283660379949,
+      "learning_rate": 3.976332394719721e-06,
+      "loss": 0.4977,
+      "step": 728
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8353476568345033,
+      "learning_rate": 3.973692386128304e-06,
+      "loss": 0.5834,
+      "step": 729
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.032325534167748,
+      "learning_rate": 3.971049856608451e-06,
+      "loss": 0.5343,
+      "step": 730
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8161347764383835,
+      "learning_rate": 3.9684048106805286e-06,
+      "loss": 0.585,
+      "step": 731
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.836376388525165,
+      "learning_rate": 3.965757252869204e-06,
+      "loss": 0.5978,
+      "step": 732
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.889118862096067,
+      "learning_rate": 3.963107187703446e-06,
+      "loss": 0.5393,
+      "step": 733
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7772829607776217,
+      "learning_rate": 3.96045461971651e-06,
+      "loss": 0.5164,
+      "step": 734
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7980410807492582,
+      "learning_rate": 3.957799553445932e-06,
+      "loss": 0.5455,
+      "step": 735
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.907936099702467,
+      "learning_rate": 3.955141993433526e-06,
+      "loss": 0.532,
+      "step": 736
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8668064740862462,
+      "learning_rate": 3.9524819442253645e-06,
+      "loss": 0.5578,
+      "step": 737
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.838952740378055,
+      "learning_rate": 3.949819410371785e-06,
+      "loss": 0.5784,
+      "step": 738
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9595767898211005,
+      "learning_rate": 3.947154396427373e-06,
+      "loss": 0.5213,
+      "step": 739
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9422968944070973,
+      "learning_rate": 3.944486906950954e-06,
+      "loss": 0.5709,
+      "step": 740
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.760556693040696,
+      "learning_rate": 3.941816946505592e-06,
+      "loss": 0.5564,
+      "step": 741
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8054841879427592,
+      "learning_rate": 3.939144519658575e-06,
+      "loss": 0.5435,
+      "step": 742
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.1072923992538,
+      "learning_rate": 3.936469630981412e-06,
+      "loss": 0.5622,
+      "step": 743
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.711687978027928,
+      "learning_rate": 3.933792285049821e-06,
+      "loss": 0.5554,
+      "step": 744
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8166543944942228,
+      "learning_rate": 3.931112486443727e-06,
+      "loss": 0.5079,
+      "step": 745
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.7923405334139695,
+      "learning_rate": 3.928430239747246e-06,
+      "loss": 0.5692,
+      "step": 746
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9611773239667012,
+      "learning_rate": 3.925745549548687e-06,
+      "loss": 0.5092,
+      "step": 747
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8440088039871827,
+      "learning_rate": 3.923058420440534e-06,
+      "loss": 0.5369,
+      "step": 748
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9272316571307881,
+      "learning_rate": 3.920368857019447e-06,
+      "loss": 0.5798,
+      "step": 749
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8248503445199376,
+      "learning_rate": 3.917676863886246e-06,
+      "loss": 0.5479,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9200626612083824,
+      "learning_rate": 3.914982445645912e-06,
+      "loss": 0.549,
+      "step": 751
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8585556832275227,
+      "learning_rate": 3.91228560690757e-06,
+      "loss": 0.5283,
+      "step": 752
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.819239895382093,
+      "learning_rate": 3.90958635228449e-06,
+      "loss": 0.535,
+      "step": 753
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7810389942543545,
+      "learning_rate": 3.90688468639407e-06,
+      "loss": 0.5125,
+      "step": 754
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.9614453700373935,
+      "learning_rate": 3.904180613857837e-06,
+      "loss": 0.5406,
+      "step": 755
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.805104940263808,
+      "learning_rate": 3.901474139301433e-06,
+      "loss": 0.5794,
+      "step": 756
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.78756289235025,
+      "learning_rate": 3.898765267354607e-06,
+      "loss": 0.569,
+      "step": 757
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.912300438003516,
+      "learning_rate": 3.896054002651213e-06,
+      "loss": 0.5565,
+      "step": 758
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8148356694353722,
+      "learning_rate": 3.893340349829195e-06,
+      "loss": 0.5471,
+      "step": 759
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.6836223387492706,
+      "learning_rate": 3.890624313530583e-06,
+      "loss": 0.5145,
+      "step": 760
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8389298216964765,
+      "learning_rate": 3.887905898401485e-06,
+      "loss": 0.5441,
+      "step": 761
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7845754057436856,
+      "learning_rate": 3.885185109092078e-06,
+      "loss": 0.5478,
+      "step": 762
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.77076035925993,
+      "learning_rate": 3.882461950256598e-06,
+      "loss": 0.5497,
+      "step": 763
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8011284465286703,
+      "learning_rate": 3.87973642655334e-06,
+      "loss": 0.5039,
+      "step": 764
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7400129481667248,
+      "learning_rate": 3.877008542644637e-06,
+      "loss": 0.5243,
+      "step": 765
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9899565111682327,
+      "learning_rate": 3.874278303196866e-06,
+      "loss": 0.5767,
+      "step": 766
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8345576263874734,
+      "learning_rate": 3.871545712880429e-06,
+      "loss": 0.5262,
+      "step": 767
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8375211207672395,
+      "learning_rate": 3.8688107763697505e-06,
+      "loss": 0.5467,
+      "step": 768
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8068462280574835,
+      "learning_rate": 3.8660734983432715e-06,
+      "loss": 0.5256,
+      "step": 769
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7823522202158735,
+      "learning_rate": 3.863333883483433e-06,
+      "loss": 0.5419,
+      "step": 770
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8881514180214427,
+      "learning_rate": 3.86059193647668e-06,
+      "loss": 0.541,
+      "step": 771
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8311064595650786,
+      "learning_rate": 3.85784766201344e-06,
+      "loss": 0.5455,
+      "step": 772
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9833459774866717,
+      "learning_rate": 3.855101064788126e-06,
+      "loss": 0.5723,
+      "step": 773
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7968096633022903,
+      "learning_rate": 3.852352149499125e-06,
+      "loss": 0.5153,
+      "step": 774
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.775423895652992,
+      "learning_rate": 3.849600920848787e-06,
+      "loss": 0.5134,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7262892998825556,
+      "learning_rate": 3.84684738354342e-06,
+      "loss": 0.5287,
+      "step": 776
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7866135638778051,
+      "learning_rate": 3.84409154229328e-06,
+      "loss": 0.57,
+      "step": 777
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.787377916112687,
+      "learning_rate": 3.841333401812569e-06,
+      "loss": 0.5312,
+      "step": 778
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.684801862246949,
+      "learning_rate": 3.838572966819416e-06,
+      "loss": 0.5822,
+      "step": 779
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.79074773131748,
+      "learning_rate": 3.835810242035879e-06,
+      "loss": 0.5651,
+      "step": 780
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9234904827178134,
+      "learning_rate": 3.8330452321879305e-06,
+      "loss": 0.5527,
+      "step": 781
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.1733402579018186,
+      "learning_rate": 3.830277942005455e-06,
+      "loss": 0.5545,
+      "step": 782
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.112229504682016,
+      "learning_rate": 3.827508376222233e-06,
+      "loss": 0.5766,
+      "step": 783
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.087174122744587,
+      "learning_rate": 3.824736539575944e-06,
+      "loss": 0.549,
+      "step": 784
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9570382810890106,
+      "learning_rate": 3.821962436808145e-06,
+      "loss": 0.4984,
+      "step": 785
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.94720853153738,
+      "learning_rate": 3.819186072664277e-06,
+      "loss": 0.5303,
+      "step": 786
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.21095404069362,
+      "learning_rate": 3.816407451893643e-06,
+      "loss": 0.5674,
+      "step": 787
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.7284336698899117,
+      "learning_rate": 3.8136265792494094e-06,
+      "loss": 0.5952,
+      "step": 788
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.940869697529687,
+      "learning_rate": 3.8108434594885934e-06,
+      "loss": 0.5198,
+      "step": 789
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9282749931884566,
+      "learning_rate": 3.808058097372057e-06,
+      "loss": 0.5499,
+      "step": 790
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0180195532646983,
+      "learning_rate": 3.8052704976644984e-06,
+      "loss": 0.5117,
+      "step": 791
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8303561179366206,
+      "learning_rate": 3.8024806651344424e-06,
+      "loss": 0.5034,
+      "step": 792
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0584295539484754,
+      "learning_rate": 3.7996886045542335e-06,
+      "loss": 0.5391,
+      "step": 793
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7736893833047733,
+      "learning_rate": 3.7968943207000284e-06,
+      "loss": 0.5378,
+      "step": 794
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7840353008162277,
+      "learning_rate": 3.794097818351786e-06,
+      "loss": 0.5091,
+      "step": 795
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0949100717616225,
+      "learning_rate": 3.791299102293261e-06,
+      "loss": 0.5731,
+      "step": 796
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.048353193294094,
+      "learning_rate": 3.7884981773119943e-06,
+      "loss": 0.5576,
+      "step": 797
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9990070284918733,
+      "learning_rate": 3.7856950481993054e-06,
+      "loss": 0.5297,
+      "step": 798
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.859560152641746,
+      "learning_rate": 3.7828897197502856e-06,
+      "loss": 0.5131,
+      "step": 799
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0054802770873916,
+      "learning_rate": 3.780082196763785e-06,
+      "loss": 0.5428,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8985367093585213,
+      "learning_rate": 3.7772724840424126e-06,
+      "loss": 0.5206,
+      "step": 801
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9964704653764362,
+      "learning_rate": 3.774460586392519e-06,
+      "loss": 0.5929,
+      "step": 802
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7572936836574113,
+      "learning_rate": 3.771646508624194e-06,
+      "loss": 0.5428,
+      "step": 803
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9623695483620975,
+      "learning_rate": 3.768830255551258e-06,
+      "loss": 0.5685,
+      "step": 804
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9663290616402378,
+      "learning_rate": 3.76601183199125e-06,
+      "loss": 0.5351,
+      "step": 805
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7876590847889615,
+      "learning_rate": 3.763191242765424e-06,
+      "loss": 0.567,
+      "step": 806
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.8500820456277005,
+      "learning_rate": 3.7603684926987383e-06,
+      "loss": 0.523,
+      "step": 807
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.041973125533567,
+      "learning_rate": 3.757543586619845e-06,
+      "loss": 0.5531,
+      "step": 808
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7440376746222928,
+      "learning_rate": 3.754716529361089e-06,
+      "loss": 0.4913,
+      "step": 809
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7910937306897654,
+      "learning_rate": 3.7518873257584897e-06,
+      "loss": 0.5128,
+      "step": 810
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9334392608388238,
+      "learning_rate": 3.7490559806517434e-06,
+      "loss": 0.5861,
+      "step": 811
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.0003597857127673,
+      "learning_rate": 3.746222498884206e-06,
+      "loss": 0.5535,
+      "step": 812
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7964615198133413,
+      "learning_rate": 3.74338688530289e-06,
+      "loss": 0.5409,
+      "step": 813
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7726488990007383,
+      "learning_rate": 3.740549144758453e-06,
+      "loss": 0.5714,
+      "step": 814
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9080323144095523,
+      "learning_rate": 3.737709282105193e-06,
+      "loss": 0.5534,
+      "step": 815
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9612361354867969,
+      "learning_rate": 3.734867302201038e-06,
+      "loss": 0.5282,
+      "step": 816
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.873254058551618,
+      "learning_rate": 3.7320232099075363e-06,
+      "loss": 0.5422,
+      "step": 817
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383882069199007,
+      "learning_rate": 3.7291770100898508e-06,
+      "loss": 0.5588,
+      "step": 818
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0137053963220835,
+      "learning_rate": 3.726328707616749e-06,
+      "loss": 0.5895,
+      "step": 819
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8207549211692964,
+      "learning_rate": 3.7234783073605957e-06,
+      "loss": 0.5428,
+      "step": 820
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7929761418069659,
+      "learning_rate": 3.7206258141973445e-06,
+      "loss": 0.555,
+      "step": 821
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8863691259545465,
+      "learning_rate": 3.7177712330065285e-06,
+      "loss": 0.5802,
+      "step": 822
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383911000943605,
+      "learning_rate": 3.714914568671252e-06,
+      "loss": 0.4986,
+      "step": 823
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0032777947804044,
+      "learning_rate": 3.7120558260781846e-06,
+      "loss": 0.6456,
+      "step": 824
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.733320874844507,
+      "learning_rate": 3.709195010117551e-06,
+      "loss": 0.5146,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7411187007421471,
+      "learning_rate": 3.7063321256831193e-06,
+      "loss": 0.5297,
+      "step": 826
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8334107493901353,
+      "learning_rate": 3.7034671776722003e-06,
+      "loss": 0.545,
+      "step": 827
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.931467221651553,
+      "learning_rate": 3.7006001709856314e-06,
+      "loss": 0.579,
+      "step": 828
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.799522216655623,
+      "learning_rate": 3.697731110527774e-06,
+      "loss": 0.5453,
+      "step": 829
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8098119388805842,
+      "learning_rate": 3.6948600012065016e-06,
+      "loss": 0.5186,
+      "step": 830
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419013342395714,
+      "learning_rate": 3.6919868479331934e-06,
+      "loss": 0.4833,
+      "step": 831
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419148322752323,
+      "learning_rate": 3.6891116556227234e-06,
+      "loss": 0.5479,
+      "step": 832
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7858200344474908,
+      "learning_rate": 3.6862344291934545e-06,
+      "loss": 0.5264,
+      "step": 833
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8057437623830686,
+      "learning_rate": 3.6833551735672293e-06,
+      "loss": 0.5208,
+      "step": 834
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8570584000334132,
+      "learning_rate": 3.6804738936693617e-06,
+      "loss": 0.5652,
+      "step": 835
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7961732805960369,
+      "learning_rate": 3.677590594428629e-06,
+      "loss": 0.5693,
+      "step": 836
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.954108513879844,
+      "learning_rate": 3.6747052807772614e-06,
+      "loss": 0.5673,
+      "step": 837
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.834152772161213,
+      "learning_rate": 3.671817957650936e-06,
+      "loss": 0.5118,
+      "step": 838
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8035026424969205,
+      "learning_rate": 3.6689286299887663e-06,
+      "loss": 0.5778,
+      "step": 839
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7862771700309947,
+      "learning_rate": 3.666037302733295e-06,
+      "loss": 0.5575,
+      "step": 840
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7398650592861555,
+      "learning_rate": 3.6631439808304874e-06,
+      "loss": 0.5323,
+      "step": 841
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7082885736006344,
+      "learning_rate": 3.6602486692297183e-06,
+      "loss": 0.543,
+      "step": 842
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8242434568233548,
+      "learning_rate": 3.6573513728837685e-06,
+      "loss": 0.5579,
+      "step": 843
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305967806472925,
+      "learning_rate": 3.6544520967488108e-06,
+      "loss": 0.5425,
+      "step": 844
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7126995402462595,
+      "learning_rate": 3.651550845784407e-06,
+      "loss": 0.5399,
+      "step": 845
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.992190051239983,
+      "learning_rate": 3.648647624953496e-06,
+      "loss": 0.5951,
+      "step": 846
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.9362402903409848,
+      "learning_rate": 3.6457424392223885e-06,
+      "loss": 0.5427,
+      "step": 847
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7390586845081806,
+      "learning_rate": 3.642835293560754e-06,
+      "loss": 0.5269,
+      "step": 848
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8601747321693383,
+      "learning_rate": 3.639926192941615e-06,
+      "loss": 0.5246,
+      "step": 849
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305054240762129,
+      "learning_rate": 3.6370151423413396e-06,
+      "loss": 0.562,
+      "step": 850
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8361711553327809,
+      "learning_rate": 3.6341021467396296e-06,
+      "loss": 0.5066,
+      "step": 851
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9202617492772214,
+      "learning_rate": 3.6311872111195163e-06,
+      "loss": 0.5755,
+      "step": 852
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9056266366653432,
+      "learning_rate": 3.628270340467348e-06,
+      "loss": 0.5193,
+      "step": 853
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9700971504271882,
+      "learning_rate": 3.625351539772783e-06,
+      "loss": 0.5499,
+      "step": 854
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7142305580780086,
+      "learning_rate": 3.6224308140287818e-06,
+      "loss": 0.5597,
+      "step": 855
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7897876492593174,
+      "learning_rate": 3.6195081682315972e-06,
+      "loss": 0.5347,
+      "step": 856
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.191923699092432,
+      "learning_rate": 3.616583607380769e-06,
+      "loss": 0.5251,
+      "step": 857
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8582876176666503,
+      "learning_rate": 3.61365713647911e-06,
+      "loss": 0.5067,
+      "step": 858
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.991617360171558,
+      "learning_rate": 3.610728760532701e-06,
+      "loss": 0.6464,
+      "step": 859
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.892621069660817,
+      "learning_rate": 3.607798484550881e-06,
+      "loss": 0.5145,
+      "step": 860
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7592963181570629,
+      "learning_rate": 3.6048663135462423e-06,
+      "loss": 0.5297,
+      "step": 861
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.020192040751123,
+      "learning_rate": 3.6019322525346157e-06,
+      "loss": 0.5709,
+      "step": 862
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8575959680616767,
+      "learning_rate": 3.598996306535067e-06,
+      "loss": 0.5946,
+      "step": 863
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9638758131071599,
+      "learning_rate": 3.5960584805698845e-06,
+      "loss": 0.4833,
+      "step": 864
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7517341191956926,
+      "learning_rate": 3.593118779664574e-06,
+      "loss": 0.5439,
+      "step": 865
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7637144330636925,
+      "learning_rate": 3.590177208847848e-06,
+      "loss": 0.4898,
+      "step": 866
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.107899096934758,
+      "learning_rate": 3.5872337731516186e-06,
+      "loss": 0.5332,
+      "step": 867
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.016493645108941,
+      "learning_rate": 3.5842884776109875e-06,
+      "loss": 0.5313,
+      "step": 868
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8758602544873038,
+      "learning_rate": 3.581341327264236e-06,
+      "loss": 0.554,
+      "step": 869
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8566881639083022,
+      "learning_rate": 3.5783923271528222e-06,
+      "loss": 0.5322,
+      "step": 870
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9151838907738468,
+      "learning_rate": 3.5754414823213647e-06,
+      "loss": 0.5306,
+      "step": 871
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7893407766785276,
+      "learning_rate": 3.572488797817639e-06,
+      "loss": 0.5226,
+      "step": 872
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.908122661974681,
+      "learning_rate": 3.569534278692569e-06,
+      "loss": 0.5132,
+      "step": 873
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9052513037253582,
+      "learning_rate": 3.5665779300002144e-06,
+      "loss": 0.513,
+      "step": 874
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7876914527016339,
+      "learning_rate": 3.563619756797767e-06,
+      "loss": 0.5627,
+      "step": 875
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.9607045801516068,
+      "learning_rate": 3.5606597641455387e-06,
+      "loss": 0.4986,
+      "step": 876
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.701462749441997,
+      "learning_rate": 3.5576979571069527e-06,
+      "loss": 0.5306,
+      "step": 877
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8413701238351416,
+      "learning_rate": 3.554734340748538e-06,
+      "loss": 0.5602,
+      "step": 878
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8762306249541667,
+      "learning_rate": 3.5517689201399162e-06,
+      "loss": 0.5663,
+      "step": 879
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.833164968453507,
+      "learning_rate": 3.5488017003537977e-06,
+      "loss": 0.5264,
+      "step": 880
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.766302763247428,
+      "learning_rate": 3.5458326864659687e-06,
+      "loss": 0.5498,
+      "step": 881
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.821883208129187,
+      "learning_rate": 3.5428618835552867e-06,
+      "loss": 0.5468,
+      "step": 882
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7773758034614335,
+      "learning_rate": 3.5398892967036674e-06,
+      "loss": 0.505,
+      "step": 883
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248820711070537,
+      "learning_rate": 3.5369149309960783e-06,
+      "loss": 0.5679,
+      "step": 884
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248114104788378,
+      "learning_rate": 3.5339387915205305e-06,
+      "loss": 0.5351,
+      "step": 885
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.00472132505421,
+      "learning_rate": 3.53096088336807e-06,
+      "loss": 0.5637,
+      "step": 886
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.0594957277906656,
+      "learning_rate": 3.5279812116327667e-06,
+      "loss": 0.567,
+      "step": 887
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.916227169502353,
+      "learning_rate": 3.5249997814117098e-06,
+      "loss": 0.5733,
+      "step": 888
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7595020268824906,
+      "learning_rate": 3.5220165978049937e-06,
+      "loss": 0.5512,
+      "step": 889
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8259487385184114,
+      "learning_rate": 3.5190316659157126e-06,
+      "loss": 0.5332,
+      "step": 890
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8216813752485344,
+      "learning_rate": 3.5160449908499538e-06,
+      "loss": 0.5718,
+      "step": 891
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8497964997952454,
+      "learning_rate": 3.5130565777167845e-06,
+      "loss": 0.5179,
+      "step": 892
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8242356367817554,
+      "learning_rate": 3.5100664316282464e-06,
+      "loss": 0.5587,
+      "step": 893
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7793507179190546,
+      "learning_rate": 3.5070745576993428e-06,
+      "loss": 0.5924,
+      "step": 894
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.920176905610262,
+      "learning_rate": 3.5040809610480364e-06,
+      "loss": 0.5579,
+      "step": 895
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.954421523744336,
+      "learning_rate": 3.5010856467952335e-06,
+      "loss": 0.5496,
+      "step": 896
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7785169911731862,
+      "learning_rate": 3.4980886200647817e-06,
+      "loss": 0.5383,
+      "step": 897
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.853827977546151,
+      "learning_rate": 3.4950898859834555e-06,
+      "loss": 0.5501,
+      "step": 898
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9882198198152168,
+      "learning_rate": 3.4920894496809515e-06,
+      "loss": 0.5557,
+      "step": 899
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.98090605107646,
+      "learning_rate": 3.489087316289877e-06,
+      "loss": 0.5661,
+      "step": 900
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0027723691714785,
+      "learning_rate": 3.486083490945743e-06,
+      "loss": 0.4791,
+      "step": 901
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0183911897675015,
+      "learning_rate": 3.4830779787869555e-06,
+      "loss": 0.5386,
+      "step": 902
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9385976919386894,
+      "learning_rate": 3.480070784954805e-06,
+      "loss": 0.5351,
+      "step": 903
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.7612550957325825,
+      "learning_rate": 3.4770619145934586e-06,
+      "loss": 0.511,
+      "step": 904
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8677538420589843,
+      "learning_rate": 3.4740513728499515e-06,
+      "loss": 0.5942,
+      "step": 905
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9208446249900946,
+      "learning_rate": 3.4710391648741787e-06,
+      "loss": 0.5146,
+      "step": 906
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8008673055527855,
+      "learning_rate": 3.468025295818885e-06,
+      "loss": 0.5909,
+      "step": 907
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.891052390507894,
+      "learning_rate": 3.465009770839657e-06,
+      "loss": 0.5527,
+      "step": 908
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0521048489395435,
+      "learning_rate": 3.4619925950949126e-06,
+      "loss": 0.5756,
+      "step": 909
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.003295441830653,
+      "learning_rate": 3.4589737737458946e-06,
+      "loss": 0.5299,
+      "step": 910
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7635851435542724,
+      "learning_rate": 3.4559533119566612e-06,
+      "loss": 0.5338,
+      "step": 911
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.834326490517508,
+      "learning_rate": 3.4529312148940763e-06,
+      "loss": 0.56,
+      "step": 912
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8618427761057224,
+      "learning_rate": 3.4499074877278016e-06,
+      "loss": 0.5189,
+      "step": 913
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.04459004844406,
+      "learning_rate": 3.446882135630286e-06,
+      "loss": 0.5765,
+      "step": 914
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7467595732765806,
+      "learning_rate": 3.4438551637767604e-06,
+      "loss": 0.5512,
+      "step": 915
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7952035114217406,
+      "learning_rate": 3.4408265773452226e-06,
+      "loss": 0.5348,
+      "step": 916
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8448198186244822,
+      "learning_rate": 3.4377963815164362e-06,
+      "loss": 0.5187,
+      "step": 917
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7738820116169103,
+      "learning_rate": 3.4347645814739156e-06,
+      "loss": 0.507,
+      "step": 918
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.9699054774415494,
+      "learning_rate": 3.4317311824039216e-06,
+      "loss": 0.5175,
+      "step": 919
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7482905457169124,
+      "learning_rate": 3.4286961894954473e-06,
+      "loss": 0.5188,
+      "step": 920
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8012194296110113,
+      "learning_rate": 3.425659607940215e-06,
+      "loss": 0.5465,
+      "step": 921
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7978097428012587,
+      "learning_rate": 3.422621442932662e-06,
+      "loss": 0.5257,
+      "step": 922
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8534167116514217,
+      "learning_rate": 3.419581699669937e-06,
+      "loss": 0.536,
+      "step": 923
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.7733377878036733,
+      "learning_rate": 3.416540383351888e-06,
+      "loss": 0.5632,
+      "step": 924
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8124786776539388,
+      "learning_rate": 3.4134974991810503e-06,
+      "loss": 0.5471,
+      "step": 925
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8553271859579439,
+      "learning_rate": 3.4104530523626463e-06,
+      "loss": 0.538,
+      "step": 926
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8888926038913822,
+      "learning_rate": 3.4074070481045683e-06,
+      "loss": 0.4868,
+      "step": 927
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.0158609319355505,
+      "learning_rate": 3.404359491617374e-06,
+      "loss": 0.5757,
+      "step": 928
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8376639720078027,
+      "learning_rate": 3.401310388114276e-06,
+      "loss": 0.5377,
+      "step": 929
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.3651883595335232,
+      "learning_rate": 3.3982597428111336e-06,
+      "loss": 0.5536,
+      "step": 930
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.908409388949023,
+      "learning_rate": 3.3952075609264423e-06,
+      "loss": 0.5349,
+      "step": 931
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8261622890952995,
+      "learning_rate": 3.3921538476813278e-06,
+      "loss": 0.4991,
+      "step": 932
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.924034720876031,
+      "learning_rate": 3.3890986082995353e-06,
+      "loss": 0.536,
+      "step": 933
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.829615974230478,
+      "learning_rate": 3.3860418480074188e-06,
+      "loss": 0.5163,
+      "step": 934
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7812992854973535,
+      "learning_rate": 3.3829835720339353e-06,
+      "loss": 0.5412,
+      "step": 935
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8270515542068861,
+      "learning_rate": 3.3799237856106348e-06,
+      "loss": 0.5459,
+      "step": 936
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8336967909163833,
+      "learning_rate": 3.3768624939716506e-06,
+      "loss": 0.5074,
+      "step": 937
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.773892866992307,
+      "learning_rate": 3.373799702353691e-06,
+      "loss": 0.5457,
+      "step": 938
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8605607499004266,
+      "learning_rate": 3.370735415996031e-06,
+      "loss": 0.5691,
+      "step": 939
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7961529805945686,
+      "learning_rate": 3.3676696401405007e-06,
+      "loss": 0.5406,
+      "step": 940
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7406787561376078,
+      "learning_rate": 3.3646023800314792e-06,
+      "loss": 0.5297,
+      "step": 941
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.9794693468141764,
+      "learning_rate": 3.361533640915885e-06,
+      "loss": 0.4765,
+      "step": 942
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.820632707720892,
+      "learning_rate": 3.3584634280431657e-06,
+      "loss": 0.5395,
+      "step": 943
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8478126164835549,
+      "learning_rate": 3.3553917466652915e-06,
+      "loss": 0.5288,
+      "step": 944
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.749509825583459,
+      "learning_rate": 3.352318602036742e-06,
+      "loss": 0.5343,
+      "step": 945
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8034305951190157,
+      "learning_rate": 3.3492439994145033e-06,
+      "loss": 0.5536,
+      "step": 946
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8172591817519397,
+      "learning_rate": 3.346167944058052e-06,
+      "loss": 0.5844,
+      "step": 947
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.749562414198837,
+      "learning_rate": 3.3430904412293526e-06,
+      "loss": 0.4833,
+      "step": 948
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.7243742428927225,
+      "learning_rate": 3.3400114961928444e-06,
+      "loss": 0.4828,
+      "step": 949
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.757242299744874,
+      "learning_rate": 3.3369311142154337e-06,
+      "loss": 0.5282,
+      "step": 950
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.036302581700697,
+      "learning_rate": 3.3338493005664853e-06,
+      "loss": 0.5315,
+      "step": 951
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.886299636672335,
+      "learning_rate": 3.330766060517812e-06,
+      "loss": 0.5244,
+      "step": 952
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.898853787733011,
+      "learning_rate": 3.3276813993436695e-06,
+      "loss": 0.5914,
+      "step": 953
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8359472984671243,
+      "learning_rate": 3.324595322320741e-06,
+      "loss": 0.5488,
+      "step": 954
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8768955168510497,
+      "learning_rate": 3.321507834728134e-06,
+      "loss": 0.5871,
+      "step": 955
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8358033818112791,
+      "learning_rate": 3.3184189418473674e-06,
+      "loss": 0.5632,
+      "step": 956
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.792562502385941,
+      "learning_rate": 3.315328648962364e-06,
+      "loss": 0.4887,
+      "step": 957
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8732702930932368,
+      "learning_rate": 3.312236961359444e-06,
+      "loss": 0.5313,
+      "step": 958
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7708047128885986,
+      "learning_rate": 3.3091438843273115e-06,
+      "loss": 0.5348,
+      "step": 959
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9094434763935804,
+      "learning_rate": 3.3060494231570463e-06,
+      "loss": 0.5027,
+      "step": 960
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.87927564418864,
+      "learning_rate": 3.3029535831420977e-06,
+      "loss": 0.511,
+      "step": 961
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.717365559903535,
+      "learning_rate": 3.299856369578273e-06,
+      "loss": 0.5203,
+      "step": 962
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.770779257052532,
+      "learning_rate": 3.2967577877637296e-06,
+      "loss": 0.5233,
+      "step": 963
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7541392466004568,
+      "learning_rate": 3.2936578429989653e-06,
+      "loss": 0.5013,
+      "step": 964
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7840578280891832,
+      "learning_rate": 3.290556540586809e-06,
+      "loss": 0.4844,
+      "step": 965
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7184305413001233,
+      "learning_rate": 3.287453885832413e-06,
+      "loss": 0.4694,
+      "step": 966
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8671517036325307,
+      "learning_rate": 3.2843498840432403e-06,
+      "loss": 0.4652,
+      "step": 967
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9960847871768508,
+      "learning_rate": 3.2812445405290612e-06,
+      "loss": 0.5906,
+      "step": 968
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7535227575839891,
+      "learning_rate": 3.27813786060194e-06,
+      "loss": 0.5482,
+      "step": 969
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.929231862440999,
+      "learning_rate": 3.2750298495762278e-06,
+      "loss": 0.5334,
+      "step": 970
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7879676366114814,
+      "learning_rate": 3.2719205127685505e-06,
+      "loss": 0.515,
+      "step": 971
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7817120865072218,
+      "learning_rate": 3.2688098554978053e-06,
+      "loss": 0.5045,
+      "step": 972
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8725673808714274,
+      "learning_rate": 3.265697883085145e-06,
+      "loss": 0.5557,
+      "step": 973
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8554796275037901,
+      "learning_rate": 3.262584600853973e-06,
+      "loss": 0.5785,
+      "step": 974
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.77078783324655,
+      "learning_rate": 3.259470014129936e-06,
+      "loss": 0.524,
+      "step": 975
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.820843626030818,
+      "learning_rate": 3.256354128240907e-06,
+      "loss": 0.5144,
+      "step": 976
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9330495063889956,
+      "learning_rate": 3.253236948516987e-06,
+      "loss": 0.5405,
+      "step": 977
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9113413794485425,
+      "learning_rate": 3.2501184802904867e-06,
+      "loss": 0.5212,
+      "step": 978
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.799188386703558,
+      "learning_rate": 3.2469987288959208e-06,
+      "loss": 0.5148,
+      "step": 979
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8610914183588203,
+      "learning_rate": 3.2438776996700023e-06,
+      "loss": 0.5363,
+      "step": 980
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8245263524947073,
+      "learning_rate": 3.240755397951625e-06,
+      "loss": 0.5216,
+      "step": 981
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7863270641417597,
+      "learning_rate": 3.2376318290818643e-06,
+      "loss": 0.5581,
+      "step": 982
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9266115141469626,
+      "learning_rate": 3.23450699840396e-06,
+      "loss": 0.5178,
+      "step": 983
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8044458399187253,
+      "learning_rate": 3.2313809112633133e-06,
+      "loss": 0.5252,
+      "step": 984
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8809392949423562,
+      "learning_rate": 3.2282535730074714e-06,
+      "loss": 0.486,
+      "step": 985
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9487997548787144,
+      "learning_rate": 3.2251249889861237e-06,
+      "loss": 0.5272,
+      "step": 986
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.088279538426057,
+      "learning_rate": 3.2219951645510907e-06,
+      "loss": 0.5426,
+      "step": 987
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8280370745964312,
+      "learning_rate": 3.218864105056313e-06,
+      "loss": 0.5545,
+      "step": 988
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7678201455723743,
+      "learning_rate": 3.2157318158578473e-06,
+      "loss": 0.5476,
+      "step": 989
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.708170466024094,
+      "learning_rate": 3.21259830231385e-06,
+      "loss": 0.5442,
+      "step": 990
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.0427224573251483,
+      "learning_rate": 3.209463569784575e-06,
+      "loss": 0.5501,
+      "step": 991
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8557413526282036,
+      "learning_rate": 3.206327623632359e-06,
+      "loss": 0.5573,
+      "step": 992
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7138810851622357,
+      "learning_rate": 3.2031904692216153e-06,
+      "loss": 0.5267,
+      "step": 993
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9034028799031073,
+      "learning_rate": 3.2000521119188267e-06,
+      "loss": 0.5605,
+      "step": 994
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.994571492675121,
+      "learning_rate": 3.1969125570925303e-06,
+      "loss": 0.53,
+      "step": 995
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.771581881704634,
+      "learning_rate": 3.193771810113313e-06,
+      "loss": 0.6177,
+      "step": 996
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7808220445921694,
+      "learning_rate": 3.1906298763538005e-06,
+      "loss": 0.5215,
+      "step": 997
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8069794706642701,
+      "learning_rate": 3.1874867611886513e-06,
+      "loss": 0.5444,
+      "step": 998
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7806867210889854,
+      "learning_rate": 3.1843424699945403e-06,
+      "loss": 0.5471,
+      "step": 999
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7481554024627886,
+      "learning_rate": 3.1811970081501576e-06,
+      "loss": 0.5159,
+      "step": 1000
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8105318680671914,
+      "learning_rate": 3.1780503810361946e-06,
+      "loss": 0.4985,
+      "step": 1001
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7033701950072382,
+      "learning_rate": 3.1749025940353363e-06,
+      "loss": 0.5594,
+      "step": 1002
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.3799847532384515,
+      "learning_rate": 3.1717536525322512e-06,
+      "loss": 0.5978,
+      "step": 1003
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7427559432173463,
+      "learning_rate": 3.1686035619135845e-06,
+      "loss": 0.5299,
+      "step": 1004
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7454547855925509,
+      "learning_rate": 3.1654523275679453e-06,
+      "loss": 0.5439,
+      "step": 1005
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7130931472340127,
+      "learning_rate": 3.162299954885899e-06,
+      "loss": 0.5379,
+      "step": 1006
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6940357366272063,
+      "learning_rate": 3.15914644925996e-06,
+      "loss": 0.5694,
+      "step": 1007
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8544220651543013,
+      "learning_rate": 3.1559918160845787e-06,
+      "loss": 0.5285,
+      "step": 1008
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8481774433371347,
+      "learning_rate": 3.1528360607561358e-06,
+      "loss": 0.5384,
+      "step": 1009
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8256828659009958,
+      "learning_rate": 3.149679188672932e-06,
+      "loss": 0.4806,
+      "step": 1010
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.9380282822721238,
+      "learning_rate": 3.1465212052351766e-06,
+      "loss": 0.543,
+      "step": 1011
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.985943690469791,
+      "learning_rate": 3.1433621158449807e-06,
+      "loss": 0.5549,
+      "step": 1012
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7038398790061953,
+      "learning_rate": 3.140201925906348e-06,
+      "loss": 0.4682,
+      "step": 1013
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8748481620529394,
+      "learning_rate": 3.1370406408251632e-06,
+      "loss": 0.5046,
+      "step": 1014
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7587036990451181,
+      "learning_rate": 3.133878266009186e-06,
+      "loss": 0.5203,
+      "step": 1015
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7503537433041947,
+      "learning_rate": 3.130714806868041e-06,
+      "loss": 0.5546,
+      "step": 1016
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7701505667314001,
+      "learning_rate": 3.127550268813205e-06,
+      "loss": 0.531,
+      "step": 1017
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.771371589393474,
+      "learning_rate": 3.124384657258001e-06,
+      "loss": 0.5424,
+      "step": 1018
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8016015279719124,
+      "learning_rate": 3.1212179776175905e-06,
+      "loss": 0.5706,
+      "step": 1019
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.810944889002695,
+      "learning_rate": 3.1180502353089598e-06,
+      "loss": 0.5502,
+      "step": 1020
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8062084514449492,
+      "learning_rate": 3.1148814357509147e-06,
+      "loss": 0.5337,
+      "step": 1021
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.669643406466654,
+      "learning_rate": 3.111711584364068e-06,
+      "loss": 0.4802,
+      "step": 1022
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6852245083058144,
+      "learning_rate": 3.1085406865708333e-06,
+      "loss": 0.532,
+      "step": 1023
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8463748056800222,
+      "learning_rate": 3.1053687477954124e-06,
+      "loss": 0.5112,
+      "step": 1024
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7302148909577209,
+      "learning_rate": 3.10219577346379e-06,
+      "loss": 0.5549,
+      "step": 1025
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7752983463714818,
+      "learning_rate": 3.0990217690037206e-06,
+      "loss": 0.5606,
+      "step": 1026
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.695119975844164,
+      "learning_rate": 3.09584673984472e-06,
+      "loss": 0.486,
+      "step": 1027
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.793543444803663,
+      "learning_rate": 3.0926706914180605e-06,
+      "loss": 0.6474,
+      "step": 1028
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6954588940750932,
+      "learning_rate": 3.089493629156755e-06,
+      "loss": 0.5208,
+      "step": 1029
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.9045089074493644,
+      "learning_rate": 3.08631555849555e-06,
+      "loss": 0.5291,
+      "step": 1030
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8481217904786489,
+      "learning_rate": 3.083136484870921e-06,
+      "loss": 0.5212,
+      "step": 1031
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6729420221561044,
+      "learning_rate": 3.0799564137210536e-06,
+      "loss": 0.5024,
+      "step": 1032
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8821832248249077,
+      "learning_rate": 3.076775350485845e-06,
+      "loss": 0.5459,
+      "step": 1033
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.762473350167322,
+      "learning_rate": 3.0735933006068863e-06,
+      "loss": 0.4938,
+      "step": 1034
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7950707678098703,
+      "learning_rate": 3.0704102695274573e-06,
+      "loss": 0.4922,
+      "step": 1035
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6853644769275375,
+      "learning_rate": 3.0672262626925174e-06,
+      "loss": 0.47,
+      "step": 1036
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.809909106997157,
+      "learning_rate": 3.0640412855486922e-06,
+      "loss": 0.5545,
+      "step": 1037
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.019472393876661,
+      "learning_rate": 3.06085534354427e-06,
+      "loss": 0.5616,
+      "step": 1038
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7972785887075076,
+      "learning_rate": 3.057668442129188e-06,
+      "loss": 0.5269,
+      "step": 1039
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.865555820217107,
+      "learning_rate": 3.054480586755026e-06,
+      "loss": 0.5752,
+      "step": 1040
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.792147096098412,
+      "learning_rate": 3.051291782874995e-06,
+      "loss": 0.54,
+      "step": 1041
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8108893550848508,
+      "learning_rate": 3.048102035943927e-06,
+      "loss": 0.5367,
+      "step": 1042
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.0966646553454793,
+      "learning_rate": 3.04491135141827e-06,
+      "loss": 0.5455,
+      "step": 1043
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7357403687049695,
+      "learning_rate": 3.041719734756073e-06,
+      "loss": 0.502,
+      "step": 1044
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8033826162723872,
+      "learning_rate": 3.038527191416982e-06,
+      "loss": 0.5644,
+      "step": 1045
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7822928111630525,
+      "learning_rate": 3.0353337268622267e-06,
+      "loss": 0.4938,
+      "step": 1046
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7910319343463081,
+      "learning_rate": 3.0321393465546134e-06,
+      "loss": 0.5889,
+      "step": 1047
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7457160087273953,
+      "learning_rate": 3.028944055958514e-06,
+      "loss": 0.5022,
+      "step": 1048
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.691379648176161,
+      "learning_rate": 3.0257478605398595e-06,
+      "loss": 0.4841,
+      "step": 1049
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7452186987943483,
+      "learning_rate": 3.0225507657661257e-06,
+      "loss": 0.5626,
+      "step": 1050
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7578678635930594,
+      "learning_rate": 3.0193527771063297e-06,
+      "loss": 0.5115,
+      "step": 1051
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7879798898209605,
+      "learning_rate": 3.016153900031016e-06,
+      "loss": 0.5296,
+      "step": 1052
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6745604796677231,
+      "learning_rate": 3.0129541400122492e-06,
+      "loss": 0.5089,
+      "step": 1053
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8484438696306678,
+      "learning_rate": 3.0097535025236045e-06,
+      "loss": 0.6124,
+      "step": 1054
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8023880068850882,
+      "learning_rate": 3.0065519930401595e-06,
+      "loss": 0.4983,
+      "step": 1055
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.743901583565096,
+      "learning_rate": 3.0033496170384803e-06,
+      "loss": 0.4998,
+      "step": 1056
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9494472820876043,
+      "learning_rate": 3.000146379996617e-06,
+      "loss": 0.537,
+      "step": 1057
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6992995489648048,
+      "learning_rate": 2.996942287394093e-06,
+      "loss": 0.5822,
+      "step": 1058
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8498288139189643,
+      "learning_rate": 2.993737344711895e-06,
+      "loss": 0.5651,
+      "step": 1059
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.755920633785882,
+      "learning_rate": 2.990531557432464e-06,
+      "loss": 0.496,
+      "step": 1060
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7876484928074277,
+      "learning_rate": 2.9873249310396853e-06,
+      "loss": 0.5224,
+      "step": 1061
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7573987279473129,
+      "learning_rate": 2.98411747101888e-06,
+      "loss": 0.5228,
+      "step": 1062
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6995721104857204,
+      "learning_rate": 2.980909182856794e-06,
+      "loss": 0.4758,
+      "step": 1063
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.907464743607936,
+      "learning_rate": 2.9777000720415916e-06,
+      "loss": 0.5254,
+      "step": 1064
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7921365259203703,
+      "learning_rate": 2.974490144062844e-06,
+      "loss": 0.5116,
+      "step": 1065
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9010192849593792,
+      "learning_rate": 2.9712794044115196e-06,
+      "loss": 0.5136,
+      "step": 1066
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.742881813035793,
+      "learning_rate": 2.968067858579975e-06,
+      "loss": 0.5436,
+      "step": 1067
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7135933558215708,
+      "learning_rate": 2.964855512061947e-06,
+      "loss": 0.5268,
+      "step": 1068
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8360025545734582,
+      "learning_rate": 2.9616423703525414e-06,
+      "loss": 0.5238,
+      "step": 1069
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7090421713960848,
+      "learning_rate": 2.9584284389482237e-06,
+      "loss": 0.5051,
+      "step": 1070
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7462732547158757,
+      "learning_rate": 2.9552137233468113e-06,
+      "loss": 0.4838,
+      "step": 1071
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9336108910937513,
+      "learning_rate": 2.951998229047464e-06,
+      "loss": 0.5576,
+      "step": 1072
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.784092660568157,
+      "learning_rate": 2.9487819615506702e-06,
+      "loss": 0.5349,
+      "step": 1073
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.772640354616067,
+      "learning_rate": 2.945564926358245e-06,
+      "loss": 0.5423,
+      "step": 1074
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8491968859591044,
+      "learning_rate": 2.9423471289733125e-06,
+      "loss": 0.5453,
+      "step": 1075
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8283172103770493,
+      "learning_rate": 2.9391285749003046e-06,
+      "loss": 0.5318,
+      "step": 1076
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7802483696828226,
+      "learning_rate": 2.935909269644946e-06,
+      "loss": 0.4954,
+      "step": 1077
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8687809173149,
+      "learning_rate": 2.9326892187142457e-06,
+      "loss": 0.5428,
+      "step": 1078
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9218917868616974,
+      "learning_rate": 2.9294684276164888e-06,
+      "loss": 0.5125,
+      "step": 1079
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8406300824318225,
+      "learning_rate": 2.9262469018612278e-06,
+      "loss": 0.5186,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8153319034513924,
+      "learning_rate": 2.9230246469592695e-06,
+      "loss": 0.4878,
+      "step": 1081
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8381190525343576,
+      "learning_rate": 2.91980166842267e-06,
+      "loss": 0.5455,
+      "step": 1082
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7941629060330144,
+      "learning_rate": 2.9165779717647212e-06,
+      "loss": 0.5425,
+      "step": 1083
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.755950985861856,
+      "learning_rate": 2.9133535624999466e-06,
+      "loss": 0.4992,
+      "step": 1084
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8065716401418646,
+      "learning_rate": 2.9101284461440853e-06,
+      "loss": 0.5569,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8487073865649808,
+      "learning_rate": 2.9069026282140887e-06,
+      "loss": 0.5352,
+      "step": 1086
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.877024524581134,
+      "learning_rate": 2.903676114228107e-06,
+      "loss": 0.5584,
+      "step": 1087
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.812931375367902,
+      "learning_rate": 2.9004489097054807e-06,
+      "loss": 0.5154,
+      "step": 1088
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7729938020658174,
+      "learning_rate": 2.897221020166732e-06,
+      "loss": 0.5386,
+      "step": 1089
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.6991898958250629,
+      "learning_rate": 2.8939924511335555e-06,
+      "loss": 0.5467,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7298323860671052,
+      "learning_rate": 2.890763208128807e-06,
+      "loss": 0.5506,
+      "step": 1091
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.9718362378496106,
+      "learning_rate": 2.887533296676497e-06,
+      "loss": 0.5453,
+      "step": 1092
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7003897379752575,
+      "learning_rate": 2.8843027223017767e-06,
+      "loss": 0.5016,
+      "step": 1093
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7604846690613096,
+      "learning_rate": 2.8810714905309346e-06,
+      "loss": 0.5206,
+      "step": 1094
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.868522047775135,
+      "learning_rate": 2.8778396068913807e-06,
+      "loss": 0.5152,
+      "step": 1095
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.8080911269766844,
+      "learning_rate": 2.874607076911642e-06,
+      "loss": 0.4966,
+      "step": 1096
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7767037245003534,
+      "learning_rate": 2.871373906121351e-06,
+      "loss": 0.5081,
+      "step": 1097
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.733045586658075,
+      "learning_rate": 2.8681401000512356e-06,
+      "loss": 0.5031,
+      "step": 1098
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6767478479637847,
+      "learning_rate": 2.8649056642331103e-06,
+      "loss": 0.4856,
+      "step": 1099
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6820690185704608,
+      "learning_rate": 2.8616706041998686e-06,
+      "loss": 0.5151,
+      "step": 1100
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.840181264549285,
+      "learning_rate": 2.8584349254854693e-06,
+      "loss": 0.5393,
+      "step": 1101
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.827807570004724,
+      "learning_rate": 2.8551986336249322e-06,
+      "loss": 0.5572,
+      "step": 1102
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.711815265099016,
+      "learning_rate": 2.8519617341543233e-06,
+      "loss": 0.5184,
+      "step": 1103
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7460018389221874,
+      "learning_rate": 2.8487242326107495e-06,
+      "loss": 0.5374,
+      "step": 1104
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.985067366728648,
+      "learning_rate": 2.8454861345323475e-06,
+      "loss": 0.538,
+      "step": 1105
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8044567576569952,
+      "learning_rate": 2.8422474454582754e-06,
+      "loss": 0.4947,
+      "step": 1106
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7648712890692506,
+      "learning_rate": 2.8390081709286997e-06,
+      "loss": 0.5584,
+      "step": 1107
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7544905722043518,
+      "learning_rate": 2.8357683164847903e-06,
+      "loss": 0.5696,
+      "step": 1108
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7923136846837993,
+      "learning_rate": 2.8325278876687084e-06,
+      "loss": 0.5502,
+      "step": 1109
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 2.077195937792951,
+      "learning_rate": 2.8292868900235986e-06,
+      "loss": 0.543,
+      "step": 1110
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675854046933754,
+      "learning_rate": 2.826045329093578e-06,
+      "loss": 0.5422,
+      "step": 1111
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8457239401392898,
+      "learning_rate": 2.822803210423727e-06,
+      "loss": 0.5334,
+      "step": 1112
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7426929121470698,
+      "learning_rate": 2.8195605395600804e-06,
+      "loss": 0.4972,
+      "step": 1113
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675216264197045,
+      "learning_rate": 2.8163173220496175e-06,
+      "loss": 0.5442,
+      "step": 1114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7483102565661375,
+      "learning_rate": 2.8130735634402527e-06,
+      "loss": 0.5425,
+      "step": 1115
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.692036399159914,
+      "learning_rate": 2.8098292692808253e-06,
+      "loss": 0.521,
+      "step": 1116
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.799980213437577,
+      "learning_rate": 2.8065844451210933e-06,
+      "loss": 0.5597,
+      "step": 1117
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7666190830884467,
+      "learning_rate": 2.803339096511718e-06,
+      "loss": 0.5612,
+      "step": 1118
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.792129515845057,
+      "learning_rate": 2.8000932290042597e-06,
+      "loss": 0.5334,
+      "step": 1119
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7395715578516604,
+      "learning_rate": 2.7968468481511663e-06,
+      "loss": 0.5545,
+      "step": 1120
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6843830287676704,
+      "learning_rate": 2.7935999595057623e-06,
+      "loss": 0.5659,
+      "step": 1121
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6432688824199502,
+      "learning_rate": 2.790352568622244e-06,
+      "loss": 0.4926,
+      "step": 1122
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7430642435954644,
+      "learning_rate": 2.787104681055663e-06,
+      "loss": 0.4666,
+      "step": 1123
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.8067789882264202,
+      "learning_rate": 2.783856302361923e-06,
+      "loss": 0.5233,
+      "step": 1124
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7685143281757654,
+      "learning_rate": 2.780607438097769e-06,
+      "loss": 0.5506,
+      "step": 1125
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7163110868931304,
+      "learning_rate": 2.7773580938207717e-06,
+      "loss": 0.5044,
+      "step": 1126
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.809036270322799,
+      "learning_rate": 2.7741082750893284e-06,
+      "loss": 0.5206,
+      "step": 1127
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8193898978325846,
+      "learning_rate": 2.770857987462645e-06,
+      "loss": 0.6064,
+      "step": 1128
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.765826426309075,
+      "learning_rate": 2.76760723650073e-06,
+      "loss": 0.4914,
+      "step": 1129
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.046345230237298,
+      "learning_rate": 2.764356027764385e-06,
+      "loss": 0.5938,
+      "step": 1130
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8264697696225647,
+      "learning_rate": 2.7611043668151948e-06,
+      "loss": 0.5476,
+      "step": 1131
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7776043318415495,
+      "learning_rate": 2.7578522592155166e-06,
+      "loss": 0.5318,
+      "step": 1132
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.767284538432005,
+      "learning_rate": 2.7545997105284735e-06,
+      "loss": 0.5197,
+      "step": 1133
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.831190014066027,
+      "learning_rate": 2.75134672631794e-06,
+      "loss": 0.4939,
+      "step": 1134
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7727769641989948,
+      "learning_rate": 2.7480933121485394e-06,
+      "loss": 0.5542,
+      "step": 1135
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7599576706599651,
+      "learning_rate": 2.7448394735856275e-06,
+      "loss": 0.5102,
+      "step": 1136
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7526987759875383,
+      "learning_rate": 2.7415852161952893e-06,
+      "loss": 0.5357,
+      "step": 1137
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7478180377944075,
+      "learning_rate": 2.7383305455443223e-06,
+      "loss": 0.552,
+      "step": 1138
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8026983878339322,
+      "learning_rate": 2.7350754672002334e-06,
+      "loss": 0.5324,
+      "step": 1139
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7539604119960455,
+      "learning_rate": 2.7318199867312267e-06,
+      "loss": 0.4951,
+      "step": 1140
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7060714376533908,
+      "learning_rate": 2.728564109706193e-06,
+      "loss": 0.5044,
+      "step": 1141
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.896732668736906,
+      "learning_rate": 2.725307841694704e-06,
+      "loss": 0.5272,
+      "step": 1142
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9094037542829962,
+      "learning_rate": 2.722051188266998e-06,
+      "loss": 0.5036,
+      "step": 1143
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7529900591353695,
+      "learning_rate": 2.7187941549939723e-06,
+      "loss": 0.4962,
+      "step": 1144
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7652784724721573,
+      "learning_rate": 2.7155367474471763e-06,
+      "loss": 0.5159,
+      "step": 1145
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9070275680276054,
+      "learning_rate": 2.7122789711987964e-06,
+      "loss": 0.5269,
+      "step": 1146
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7630505518040367,
+      "learning_rate": 2.709020831821652e-06,
+      "loss": 0.5286,
+      "step": 1147
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7410138974922291,
+      "learning_rate": 2.7057623348891846e-06,
+      "loss": 0.4902,
+      "step": 1148
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.745842560539345,
+      "learning_rate": 2.7025034859754446e-06,
+      "loss": 0.5178,
+      "step": 1149
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.8498982578771728,
+      "learning_rate": 2.699244290655086e-06,
+      "loss": 0.55,
+      "step": 1150
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.6360369924184164,
+      "learning_rate": 2.6959847545033558e-06,
+      "loss": 0.4988,
+      "step": 1151
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6784833460211517,
+      "learning_rate": 2.692724883096082e-06,
+      "loss": 0.5303,
+      "step": 1152
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7888637226825195,
+      "learning_rate": 2.68946468200967e-06,
+      "loss": 0.542,
+      "step": 1153
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7156031503954616,
+      "learning_rate": 2.686204156821084e-06,
+      "loss": 0.499,
+      "step": 1154
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.802618839032982,
+      "learning_rate": 2.6829433131078464e-06,
+      "loss": 0.5095,
+      "step": 1155
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7018673816457677,
+      "learning_rate": 2.6796821564480237e-06,
+      "loss": 0.4911,
+      "step": 1156
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.939833859373507,
+      "learning_rate": 2.6764206924202173e-06,
+      "loss": 0.5965,
+      "step": 1157
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.757462214596805,
+      "learning_rate": 2.673158926603554e-06,
+      "loss": 0.5119,
+      "step": 1158
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.824906787992325,
+      "learning_rate": 2.669896864577678e-06,
+      "loss": 0.4995,
+      "step": 1159
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6963319988581682,
+      "learning_rate": 2.666634511922739e-06,
+      "loss": 0.499,
+      "step": 1160
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7490967555131538,
+      "learning_rate": 2.6633718742193837e-06,
+      "loss": 0.5045,
+      "step": 1161
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7295387040616608,
+      "learning_rate": 2.660108957048749e-06,
+      "loss": 0.48,
+      "step": 1162
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7062936128447537,
+      "learning_rate": 2.656845765992447e-06,
+      "loss": 0.5024,
+      "step": 1163
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7291223687738257,
+      "learning_rate": 2.6535823066325594e-06,
+      "loss": 0.4965,
+      "step": 1164
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7660018876230184,
+      "learning_rate": 2.650318584551626e-06,
+      "loss": 0.6289,
+      "step": 1165
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6875948695046943,
+      "learning_rate": 2.6470546053326375e-06,
+      "loss": 0.5099,
+      "step": 1166
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7055862895950586,
+      "learning_rate": 2.643790374559023e-06,
+      "loss": 0.4748,
+      "step": 1167
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.8397810404769834,
+      "learning_rate": 2.6405258978146443e-06,
+      "loss": 0.5547,
+      "step": 1168
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6780759297615608,
+      "learning_rate": 2.6372611806837804e-06,
+      "loss": 0.4696,
+      "step": 1169
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7463193906158438,
+      "learning_rate": 2.633996228751125e-06,
+      "loss": 0.5167,
+      "step": 1170
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7682737157303552,
+      "learning_rate": 2.6307310476017705e-06,
+      "loss": 0.5178,
+      "step": 1171
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7759532350573655,
+      "learning_rate": 2.627465642821203e-06,
+      "loss": 0.5411,
+      "step": 1172
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.741742707150691,
+      "learning_rate": 2.624200019995293e-06,
+      "loss": 0.5357,
+      "step": 1173
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7638181255611864,
+      "learning_rate": 2.6209341847102787e-06,
+      "loss": 0.5598,
+      "step": 1174
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6585763596592404,
+      "learning_rate": 2.6176681425527663e-06,
+      "loss": 0.4891,
+      "step": 1175
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7652514703885578,
+      "learning_rate": 2.614401899109716e-06,
+      "loss": 0.5412,
+      "step": 1176
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7646286601286296,
+      "learning_rate": 2.6111354599684287e-06,
+      "loss": 0.4753,
+      "step": 1177
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7933546923906454,
+      "learning_rate": 2.6078688307165436e-06,
+      "loss": 0.5159,
+      "step": 1178
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.8474498352431208,
+      "learning_rate": 2.6046020169420223e-06,
+      "loss": 0.4786,
+      "step": 1179
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.816609500392057,
+      "learning_rate": 2.601335024233145e-06,
+      "loss": 0.5821,
+      "step": 1180
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7603922858788037,
+      "learning_rate": 2.598067858178495e-06,
+      "loss": 0.4749,
+      "step": 1181
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.771168764538133,
+      "learning_rate": 2.594800524366956e-06,
+      "loss": 0.5221,
+      "step": 1182
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7428386931770696,
+      "learning_rate": 2.591533028387694e-06,
+      "loss": 0.5243,
+      "step": 1183
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7354647623517858,
+      "learning_rate": 2.588265375830155e-06,
+      "loss": 0.4665,
+      "step": 1184
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7757829783254058,
+      "learning_rate": 2.5849975722840537e-06,
+      "loss": 0.4713,
+      "step": 1185
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7660698291034924,
+      "learning_rate": 2.58172962333936e-06,
+      "loss": 0.5198,
+      "step": 1186
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7071465020770178,
+      "learning_rate": 2.5784615345862963e-06,
+      "loss": 0.5355,
+      "step": 1187
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.6994920599655763,
+      "learning_rate": 2.5751933116153215e-06,
+      "loss": 0.4867,
+      "step": 1188
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7891977115774562,
+      "learning_rate": 2.5719249600171247e-06,
+      "loss": 0.5071,
+      "step": 1189
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6866451169084888,
+      "learning_rate": 2.568656485382616e-06,
+      "loss": 0.4767,
+      "step": 1190
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.9106444693405875,
+      "learning_rate": 2.5653878933029134e-06,
+      "loss": 0.5063,
+      "step": 1191
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7546015951107552,
+      "learning_rate": 2.56211918936934e-06,
+      "loss": 0.5536,
+      "step": 1192
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7866083346923656,
+      "learning_rate": 2.5588503791734053e-06,
+      "loss": 0.4738,
+      "step": 1193
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6678313975517949,
+      "learning_rate": 2.5555814683068058e-06,
+      "loss": 0.5095,
+      "step": 1194
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.694690087625629,
+      "learning_rate": 2.552312462361405e-06,
+      "loss": 0.5711,
+      "step": 1195
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7583066556547233,
+      "learning_rate": 2.5490433669292337e-06,
+      "loss": 0.5183,
+      "step": 1196
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8259327544569408,
+      "learning_rate": 2.5457741876024716e-06,
+      "loss": 0.5129,
+      "step": 1197
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.743709458286742,
+      "learning_rate": 2.542504929973445e-06,
+      "loss": 0.509,
+      "step": 1198
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8551037168096902,
+      "learning_rate": 2.5392355996346134e-06,
+      "loss": 0.4874,
+      "step": 1199
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7705896553689628,
+      "learning_rate": 2.5359662021785596e-06,
+      "loss": 0.5102,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8456154073029885,
+      "learning_rate": 2.532696743197982e-06,
+      "loss": 0.5363,
+      "step": 1201
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7341454202963031,
+      "learning_rate": 2.529427228285686e-06,
+      "loss": 0.5013,
+      "step": 1202
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7923147732329405,
+      "learning_rate": 2.526157663034568e-06,
+      "loss": 0.5191,
+      "step": 1203
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.731262319220837,
+      "learning_rate": 2.522888053037616e-06,
+      "loss": 0.4889,
+      "step": 1204
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.797800368847369,
+      "learning_rate": 2.5196184038878895e-06,
+      "loss": 0.4868,
+      "step": 1205
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8182272292135089,
+      "learning_rate": 2.5163487211785194e-06,
+      "loss": 0.5159,
+      "step": 1206
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2412,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 603,
+  "total_flos": 568033919631360.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1206/training_args.bin b/checkpoint-1206/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e
--- /dev/null
+++ b/checkpoint-1206/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26
+size 7352
diff --git a/checkpoint-1206/zero_to_fp32.py b/checkpoint-1206/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b
--- /dev/null
+++ b/checkpoint-1206/zero_to_fp32.py
@@ -0,0 +1,592 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# module-wide verbosity switch read by the helpers below; when the file is run as a script it is
+# overwritten with the value of the ``--debug`` command line flag
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (updated in place)
+        - ``world_size``: number of partitions; used for the debug dump and the alignment check
+        - ``fp32_flat_groups``: one entry per partition, each indexable by param group
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first one is used
+
+    Raises:
+        - ``ValueError``: the aligned number of consumed elements doesn't match the aligned size of a group
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for each param group concatenate that group's partition from every entry of fp32_flat_groups
+    # 2. walk the group's ``name -> shape`` mapping, slicing consecutive runs off the merged vector
+    # 3. check that the consumed and available element counts agree after alignment
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts at 0 for every group because each group has its own merged vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may or may not provide numel(); fall back to the product of its entries
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() produce a view into the merged vector rather than a copy
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.
+
+    For every frozen parameter the fragments held by all model states are concatenated, cut to
+    the unpartitioned element count and reshaped to the recorded shape. Does nothing when the
+    first model state has no frozen parameter shapes.
+
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (updated in place)
+        - ``world_size``: number of partitions; used for the debug output and the partition info
+        - ``zero_model_states``: parsed model states, one per rank
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        # estimate: the element count of the first model state's fragments times world_size
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # gather this parameter's fragment from every model state, in list order
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() drops whatever follows the first unpartitioned_numel elements before reshaping
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        # only used by the debug print below
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (updated in place)
+        - ``world_size``: number of partitions
+        - ``fp32_flat_groups``: one flat tensor per partition, indexed ``0..world_size-1``
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first one is used
+
+    Raises:
+        - ``ValueError``: the number of consumed elements doesn't match the number of available ones
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # total element count, taking the first flat tensor's size as the size of every partition
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset counts elements inside a single partition; it is scaled by world_size after the loop
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        # take the same [offset, offset + partitioned_numel) slice from every partition, join the
+        # slices, then cut the result to the unpartitioned element count before reshaping
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    # convert the per-partition offset into a total element count comparable with avail_numel
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
diff --git a/checkpoint-1809/config.json b/checkpoint-1809/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76
--- /dev/null
+++ b/checkpoint-1809/config.json
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "alpindale/Mistral-7B-v0.2-hf",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32002
+}
diff --git a/checkpoint-1809/generation_config.json b/checkpoint-1809/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc
--- /dev/null
+++ b/checkpoint-1809/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.38.2"
+}
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ec2bb87aa840c4f3e39f00243d40d225c8aacc6a
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b761fd04713774a95e8f55cd1512423d9b60e2f7d1067e957dc4e58d7c1aee5
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1332c73f66ac989a229a64fc4ed2a6f429b1298c
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1da59e8825401cf7aaea87c02121a8975ef5f635d67de2862ef3afdfad8d1efc
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..82a2c46eef9a5c40de66e477f837855f0f88217d
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e8bd1199d401a427bc6ab3b0346969377314f13a04d0c44b6974dadeecb0297
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a1548c0ee9b64e20cf46cc668368e7e994ca804a
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b240e3057006281bb7c0412acf601226c8c038a3a0e68b6e95257be0a08d5ceb
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f6ccd117435d70689b52e4a411e32d82c4fab024
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc16be622ac689221d02ff32361dea668dfa54a3962f29f23b1d3fbe7fbd603f
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..971d9630a4c632966f231163619685f075d796c2
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a31f0153235fb37447cb31d015478d03b41cbc646dbb0c5d908eea301aef039b
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..672f632651c0cf14adead1079e64ea596ba74743
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f8a1eb5c697a46b107c09fcd567344229ca763eef9fd99b2b55f96093e8c83e
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0b811ee40cada0019ea8ba14c219f957f514f054
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1accd07344e616294aa51015f46d2e872115ae1bba0a305fa544af215d76bb69
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..37041e00873f94ab25436dc1549092be1dbd9a02
--- /dev/null
+++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be77fe5c721c18aa3684678f1c47a77d3094b2d950cc83f13282dd740ed64b61
+size 4831623435
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..292ec98f06c3f10540c0d9cb8dbfc144d2e4c877
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd31ba10755723f3d557ea9d00db6b1c1e800660204bfe845e0ffcf74d789c83
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f0eed4345efd988d9831dfc723f9f95285c2b0ab
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4624485375edf79b90d9bf1c4daa6da34f3494ae148bfc4b24bd73b9997bc0c1
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..10ccacf21a705964b9e9334830fd5175cd9b3141
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07a26376f5885a10c708f9a117f274d6ccaee0da9ccfeb1ac15213aa1509830c
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c4f4632248727b10fb10f41d0b1f7ed34c735ca2
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48226f62e4bb0a5db0ce34c43c572a25d6f0da19415f838cfe65e4702302a662
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a1afef7dde30e1e4784d6ccc68bc284949c6a6b4
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d4608cee058708405735822e13a938e3b3c944f9a7c1d220ded5f2f06b572c3
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..201b8fa9d94f83920ccbe88aad273b22f6c99c53
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5423f97e3f0e471a6ea41652cd8ec31ae166df737f3fcc7ec1a3ac2958274718
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..163127281d910c3f6bf90fa4cd42d4854a2a1e97
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95ee1216c4418444b48374ee04393e19379060c968a8b0ef16b86c3b411e54c8
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..35a749e0a0d3fe39c2c720e8c886d3893d1b6793
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2726e76b841826bbe108a1bed34734a9250b20b051559fee1d53d7300a2d0e8
+size 153829
diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..766c35e84d3cc839458c28c25c83337a954e3ca4
--- /dev/null
+++ b/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7c1471f91b15beccbbcb3c355b08a1e31f0841ecbdf88d26267ab30fd5b4697
+size 153829
diff --git a/checkpoint-1809/latest b/checkpoint-1809/latest
new file mode 100644
index 0000000000000000000000000000000000000000..40509fea8c03b0331f0d689e84c0191961ecc7c3
--- /dev/null
+++ b/checkpoint-1809/latest
@@ -0,0 +1 @@
+global_step1809
\ No newline at end of file
diff --git a/checkpoint-1809/model-00001-of-00003.safetensors b/checkpoint-1809/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..61710203dc37adf5e29e02a035dd805965011aea
--- /dev/null
+++ b/checkpoint-1809/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4aa419e433b185323444a3b8350d979b45a038e6887330b3a1edaacf48ac9f2d
+size 4943178720
diff --git a/checkpoint-1809/model-00002-of-00003.safetensors b/checkpoint-1809/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7842a9fedcbbc6e8bc6d6791d1f99b8aed523b34
--- /dev/null
+++ b/checkpoint-1809/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73c5a608fc2645deb20b706f73174b5ddc9df7a86e31b670b4ea896c064afb27
+size 4999819336
diff --git a/checkpoint-1809/model-00003-of-00003.safetensors b/checkpoint-1809/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e1e5307207e0a065ef57ec97d3dad29dc5197319
--- /dev/null
+++ b/checkpoint-1809/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91a448af004507aa23616541e844c83722dc86610112b69ad59f13b4dc59b466
+size 4540532728
diff --git a/checkpoint-1809/model.safetensors.index.json b/checkpoint-1809/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2
--- /dev/null
+++ b/checkpoint-1809/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483496960
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/checkpoint-1809/rng_state_0.pth b/checkpoint-1809/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ed9c956014a637b9d3ccb494c387c7452ae938e0
--- /dev/null
+++ b/checkpoint-1809/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40b7907b6e8bbc0deaf9b6cadef63205dade64f9fbf74f9a4dca9c34792d7aab
+size 16240
diff --git a/checkpoint-1809/rng_state_1.pth b/checkpoint-1809/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a2452cb1ac950d724f0559bab3e53e6a671da5ba
--- /dev/null
+++ b/checkpoint-1809/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a4ca3302c930a1b49ced40d5e2133aedc4c5857930d92deb8c6496a317958d8
+size 16240
diff --git a/checkpoint-1809/rng_state_2.pth b/checkpoint-1809/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..30ca1e0fbf8047c1cd0606a37b02d545623d4a67
--- /dev/null
+++ b/checkpoint-1809/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbbf2364108e70a0ac183356d1693182b452bb464271c3d2f4ade972244d710d
+size 16240
diff --git a/checkpoint-1809/rng_state_3.pth b/checkpoint-1809/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a342cc40db30db7d18c31cffe2a2e1b1d2f3b084
--- /dev/null
+++ b/checkpoint-1809/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9269c171a7948127faa588109a1fb8043194b407d2dfbeda2e25ed8b35126a5
+size 16240
diff --git a/checkpoint-1809/rng_state_4.pth b/checkpoint-1809/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ca08e0f4a907b0b1649b7bc3537dd48c83723830
--- /dev/null
+++ b/checkpoint-1809/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f02625e4547fbacdb164e484867f76d5024a007c22c297f8ecbef13fc6aa3202
+size 16240
diff --git a/checkpoint-1809/rng_state_5.pth b/checkpoint-1809/rng_state_5.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1aeba77fabdef8a232c2785991d798bd3f84afd3
--- /dev/null
+++ b/checkpoint-1809/rng_state_5.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51eb0286c1f14a2c09c443d8c606951c3debeb25f9ba4f71e0aea90ae2f0786e
+size 16240
diff --git a/checkpoint-1809/rng_state_6.pth b/checkpoint-1809/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..499c459dc2af4317a2a23f7877927bf7c586e439
--- /dev/null
+++ b/checkpoint-1809/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:080bbd36834b7a1623430efdd9f598b791f466541d25b545ca410ec4a930a0f3
+size 16240
diff --git a/checkpoint-1809/rng_state_7.pth b/checkpoint-1809/rng_state_7.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cdfb9b9f9f3356413f6755deb29a84b7b4e360a2
--- /dev/null
+++ b/checkpoint-1809/rng_state_7.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54aa959bf290908dfe1fc65c2591b99982e9fdce5caf276626d0084ccffa7e95
+size 16240
diff --git a/checkpoint-1809/rng_state_8.pth b/checkpoint-1809/rng_state_8.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6533db02002842edcb0c9b2a6dd89506e90ac8c8
--- /dev/null
+++ b/checkpoint-1809/rng_state_8.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85f8554f99e72a1c251b463a30088dd49afece6deb61c5ad09834d35ff89308b
+size 16240
diff --git a/checkpoint-1809/scheduler.pt b/checkpoint-1809/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6305be6b3a4171fe11369d2578fc7945741c40d5
--- /dev/null
+++ b/checkpoint-1809/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ef3ed14afeb23e7559e1ece00ec5a5ba48527918d9a770399a0f1d431d2f9b0
+size 1064
diff --git a/checkpoint-1809/trainer_state.json b/checkpoint-1809/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ec8d4384deb2d7c2bc47244f56f5ecd0ca866f5
--- /dev/null
+++ b/checkpoint-1809/trainer_state.json
@@ -0,0 +1,12684 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.4820725388601037,
+  "eval_steps": 500,
+  "global_step": 1809,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 27.81778461909011,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.7993,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 28.63833175363421,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.9056,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 25.646828828014854,
+      "learning_rate": 1.5e-06,
+      "loss": 0.8473,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.834124771941388,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.8192,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.558095859980105,
+      "learning_rate": 2.5e-06,
+      "loss": 0.7943,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 7.905789045775758,
+      "learning_rate": 3e-06,
+      "loss": 0.7075,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.259519170268483,
+      "learning_rate": 3.5e-06,
+      "loss": 0.7537,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.639042051048664,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.7471,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.515070932390074,
+      "learning_rate": 4.5e-06,
+      "loss": 0.7689,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.916410424632533,
+      "learning_rate": 5e-06,
+      "loss": 0.7194,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.835046497413255,
+      "learning_rate": 4.9999978617243506e-06,
+      "loss": 0.6949,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 10.065648500649479,
+      "learning_rate": 4.9999914469010585e-06,
+      "loss": 0.7039,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.299372887839679,
+      "learning_rate": 4.999980755541098e-06,
+      "loss": 0.7067,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.693110837094718,
+      "learning_rate": 4.999965787662758e-06,
+      "loss": 0.7126,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.983869635716314,
+      "learning_rate": 4.999946543291642e-06,
+      "loss": 0.6496,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.2561193962441175,
+      "learning_rate": 4.999923022460671e-06,
+      "loss": 0.7036,
+      "step": 16
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.011772824968437,
+      "learning_rate": 4.999895225210079e-06,
+      "loss": 0.7009,
+      "step": 17
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.386638415717137,
+      "learning_rate": 4.9998631515874165e-06,
+      "loss": 0.6624,
+      "step": 18
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.764658092125165,
+      "learning_rate": 4.999826801647551e-06,
+      "loss": 0.6687,
+      "step": 19
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.3982096117966614,
+      "learning_rate": 4.999786175452662e-06,
+      "loss": 0.706,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.8051633678260193,
+      "learning_rate": 4.999741273072246e-06,
+      "loss": 0.7031,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.1177784624332614,
+      "learning_rate": 4.999692094583114e-06,
+      "loss": 0.7525,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.2533819675617806,
+      "learning_rate": 4.9996386400693906e-06,
+      "loss": 0.6767,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.61893793162573,
+      "learning_rate": 4.999580909622518e-06,
+      "loss": 0.6432,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.76057623723569,
+      "learning_rate": 4.999518903341251e-06,
+      "loss": 0.6809,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.27983032069553,
+      "learning_rate": 4.999452621331657e-06,
+      "loss": 0.6798,
+      "step": 26
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.501904568120582,
+      "learning_rate": 4.99938206370712e-06,
+      "loss": 0.6412,
+      "step": 27
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.819229290729669,
+      "learning_rate": 4.999307230588338e-06,
+      "loss": 0.6188,
+      "step": 28
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.1233212322022212,
+      "learning_rate": 4.9992281221033224e-06,
+      "loss": 0.6378,
+      "step": 29
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.7806911906686755,
+      "learning_rate": 4.999144738387396e-06,
+      "loss": 0.6653,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4045490257014563,
+      "learning_rate": 4.999057079583199e-06,
+      "loss": 0.6377,
+      "step": 31
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3803717769210446,
+      "learning_rate": 4.998965145840681e-06,
+      "loss": 0.6855,
+      "step": 32
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3976652879633473,
+      "learning_rate": 4.998868937317106e-06,
+      "loss": 0.6284,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2958541157119727,
+      "learning_rate": 4.998768454177051e-06,
+      "loss": 0.6521,
+      "step": 34
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1925196833696154,
+      "learning_rate": 4.998663696592403e-06,
+      "loss": 0.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.361006042901851,
+      "learning_rate": 4.998554664742362e-06,
+      "loss": 0.6155,
+      "step": 36
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1577758143653614,
+      "learning_rate": 4.998441358813443e-06,
+      "loss": 0.6398,
+      "step": 37
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.219872074512664,
+      "learning_rate": 4.998323778999467e-06,
+      "loss": 0.6051,
+      "step": 38
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2907501521408546,
+      "learning_rate": 4.9982019255015705e-06,
+      "loss": 0.6337,
+      "step": 39
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1769862324666183,
+      "learning_rate": 4.9980757985281955e-06,
+      "loss": 0.6606,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4252479779661607,
+      "learning_rate": 4.997945398295101e-06,
+      "loss": 0.6685,
+      "step": 41
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3929541982084657,
+      "learning_rate": 4.99781072502535e-06,
+      "loss": 0.6084,
+      "step": 42
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.932539969840091,
+      "learning_rate": 4.997671778949318e-06,
+      "loss": 0.6123,
+      "step": 43
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.191742541327873,
+      "learning_rate": 4.997528560304688e-06,
+      "loss": 0.6247,
+      "step": 44
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.423376784566499,
+      "learning_rate": 4.997381069336455e-06,
+      "loss": 0.7024,
+      "step": 45
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0599055392481076,
+      "learning_rate": 4.997229306296918e-06,
+      "loss": 0.6612,
+      "step": 46
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.16832922087532,
+      "learning_rate": 4.997073271445686e-06,
+      "loss": 0.5949,
+      "step": 47
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0483598654319453,
+      "learning_rate": 4.9969129650496775e-06,
+      "loss": 0.6406,
+      "step": 48
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.963056609139284,
+      "learning_rate": 4.996748387383113e-06,
+      "loss": 0.6361,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.2094923844269307,
+      "learning_rate": 4.996579538727527e-06,
+      "loss": 0.5901,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.1088153449411857,
+      "learning_rate": 4.996406419371749e-06,
+      "loss": 0.6458,
+      "step": 51
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.093448940617732,
+      "learning_rate": 4.996229029611926e-06,
+      "loss": 0.6509,
+      "step": 52
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.075116207412987,
+      "learning_rate": 4.996047369751502e-06,
+      "loss": 0.6295,
+      "step": 53
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.138141165277684,
+      "learning_rate": 4.995861440101229e-06,
+      "loss": 0.6088,
+      "step": 54
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.186316382848445,
+      "learning_rate": 4.995671240979161e-06,
+      "loss": 0.6307,
+      "step": 55
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.2513741083982195,
+      "learning_rate": 4.995476772710657e-06,
+      "loss": 0.6175,
+      "step": 56
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0827167336870596,
+      "learning_rate": 4.995278035628379e-06,
+      "loss": 0.5935,
+      "step": 57
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.117977588574442,
+      "learning_rate": 4.995075030072291e-06,
+      "loss": 0.5998,
+      "step": 58
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0996940200235485,
+      "learning_rate": 4.994867756389658e-06,
+      "loss": 0.6159,
+      "step": 59
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.141096165691323,
+      "learning_rate": 4.994656214935045e-06,
+      "loss": 0.6294,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.022748830058395,
+      "learning_rate": 4.994440406070323e-06,
+      "loss": 0.6315,
+      "step": 61
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.209132168720991,
+      "learning_rate": 4.994220330164654e-06,
+      "loss": 0.5645,
+      "step": 62
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0994557317862674,
+      "learning_rate": 4.993995987594509e-06,
+      "loss": 0.6272,
+      "step": 63
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.204220831053169,
+      "learning_rate": 4.99376737874365e-06,
+      "loss": 0.6379,
+      "step": 64
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.127733932186697,
+      "learning_rate": 4.993534504003141e-06,
+      "loss": 0.622,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.1338506582034316,
+      "learning_rate": 4.993297363771342e-06,
+      "loss": 0.6259,
+      "step": 66
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.104802764460729,
+      "learning_rate": 4.993055958453912e-06,
+      "loss": 0.6414,
+      "step": 67
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0889535347771675,
+      "learning_rate": 4.9928102884638004e-06,
+      "loss": 0.6466,
+      "step": 68
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.252225316694296,
+      "learning_rate": 4.992560354221258e-06,
+      "loss": 0.6167,
+      "step": 69
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015392533516649,
+      "learning_rate": 4.992306156153827e-06,
+      "loss": 0.5958,
+      "step": 70
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.151741408948778,
+      "learning_rate": 4.992047694696343e-06,
+      "loss": 0.5875,
+      "step": 71
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0351299117412696,
+      "learning_rate": 4.991784970290935e-06,
+      "loss": 0.5935,
+      "step": 72
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0000962363827983,
+      "learning_rate": 4.991517983387026e-06,
+      "loss": 0.6091,
+      "step": 73
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.202881736102415,
+      "learning_rate": 4.99124673444133e-06,
+      "loss": 0.6122,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015074773396151,
+      "learning_rate": 4.990971223917848e-06,
+      "loss": 0.6134,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.009305960567766,
+      "learning_rate": 4.990691452287877e-06,
+      "loss": 0.6308,
+      "step": 76
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.9967884756310221,
+      "learning_rate": 4.990407420029999e-06,
+      "loss": 0.6098,
+      "step": 77
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0858738033925905,
+      "learning_rate": 4.990119127630085e-06,
+      "loss": 0.6344,
+      "step": 78
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9427707561903895,
+      "learning_rate": 4.989826575581295e-06,
+      "loss": 0.6049,
+      "step": 79
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.157150584766853,
+      "learning_rate": 4.989529764384073e-06,
+      "loss": 0.5965,
+      "step": 80
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0303527419352583,
+      "learning_rate": 4.989228694546151e-06,
+      "loss": 0.6524,
+      "step": 81
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.128799919475717,
+      "learning_rate": 4.988923366582546e-06,
+      "loss": 0.5524,
+      "step": 82
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0122786280510696,
+      "learning_rate": 4.988613781015557e-06,
+      "loss": 0.6268,
+      "step": 83
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.104580177719229,
+      "learning_rate": 4.988299938374769e-06,
+      "loss": 0.6229,
+      "step": 84
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.3894843860356834,
+      "learning_rate": 4.9879818391970455e-06,
+      "loss": 0.6194,
+      "step": 85
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9615211372441477,
+      "learning_rate": 4.9876594840265355e-06,
+      "loss": 0.6355,
+      "step": 86
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.4509852093141937,
+      "learning_rate": 4.987332873414666e-06,
+      "loss": 0.6405,
+      "step": 87
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.178942375285086,
+      "learning_rate": 4.987002007920142e-06,
+      "loss": 0.5593,
+      "step": 88
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.2625634345900445,
+      "learning_rate": 4.9866668881089515e-06,
+      "loss": 0.6133,
+      "step": 89
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.363092638811143,
+      "learning_rate": 4.986327514554356e-06,
+      "loss": 0.6298,
+      "step": 90
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0401982492138546,
+      "learning_rate": 4.985983887836894e-06,
+      "loss": 0.6276,
+      "step": 91
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.276956647922478,
+      "learning_rate": 4.985636008544381e-06,
+      "loss": 0.5691,
+      "step": 92
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1072762844110233,
+      "learning_rate": 4.985283877271908e-06,
+      "loss": 0.6175,
+      "step": 93
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.2931866879442637,
+      "learning_rate": 4.984927494621836e-06,
+      "loss": 0.6419,
+      "step": 94
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.112474101166308,
+      "learning_rate": 4.984566861203801e-06,
+      "loss": 0.607,
+      "step": 95
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1816059679212634,
+      "learning_rate": 4.984201977634711e-06,
+      "loss": 0.6136,
+      "step": 96
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0620776369966554,
+      "learning_rate": 4.9838328445387415e-06,
+      "loss": 0.6372,
+      "step": 97
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.147592836641578,
+      "learning_rate": 4.983459462547341e-06,
+      "loss": 0.606,
+      "step": 98
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1808001877062453,
+      "learning_rate": 4.983081832299224e-06,
+      "loss": 0.6019,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3751999527114087,
+      "learning_rate": 4.98269995444037e-06,
+      "loss": 0.6021,
+      "step": 100
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.8769470206406913,
+      "learning_rate": 4.98231382962403e-06,
+      "loss": 0.6082,
+      "step": 101
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3060925784921347,
+      "learning_rate": 4.981923458510717e-06,
+      "loss": 0.6174,
+      "step": 102
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1543176832473683,
+      "learning_rate": 4.981528841768206e-06,
+      "loss": 0.6092,
+      "step": 103
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1558689520522547,
+      "learning_rate": 4.981129980071538e-06,
+      "loss": 0.587,
+      "step": 104
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3830532005188383,
+      "learning_rate": 4.980726874103014e-06,
+      "loss": 0.6518,
+      "step": 105
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3333119576634767,
+      "learning_rate": 4.980319524552195e-06,
+      "loss": 0.6096,
+      "step": 106
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1135146855324214,
+      "learning_rate": 4.9799079321159e-06,
+      "loss": 0.5728,
+      "step": 107
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.2300463384326394,
+      "learning_rate": 4.9794920974982095e-06,
+      "loss": 0.6563,
+      "step": 108
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1745234017525443,
+      "learning_rate": 4.979072021410458e-06,
+      "loss": 0.5968,
+      "step": 109
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1536586182562334,
+      "learning_rate": 4.978647704571237e-06,
+      "loss": 0.6189,
+      "step": 110
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.193809374687326,
+      "learning_rate": 4.97821914770639e-06,
+      "loss": 0.5864,
+      "step": 111
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.0525896373682047,
+      "learning_rate": 4.977786351549017e-06,
+      "loss": 0.6101,
+      "step": 112
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.216099286618384,
+      "learning_rate": 4.977349316839467e-06,
+      "loss": 0.5984,
+      "step": 113
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.155122255962579,
+      "learning_rate": 4.97690804432534e-06,
+      "loss": 0.6311,
+      "step": 114
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2972101190291374,
+      "learning_rate": 4.976462534761487e-06,
+      "loss": 0.5813,
+      "step": 115
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.9925413745245948,
+      "learning_rate": 4.9760127889100044e-06,
+      "loss": 0.6157,
+      "step": 116
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2802548684036568,
+      "learning_rate": 4.975558807540238e-06,
+      "loss": 0.6079,
+      "step": 117
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.048888007394621,
+      "learning_rate": 4.9751005914287775e-06,
+      "loss": 0.6467,
+      "step": 118
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.28661640438254,
+      "learning_rate": 4.974638141359456e-06,
+      "loss": 0.6029,
+      "step": 119
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.004056683755783,
+      "learning_rate": 4.974171458123351e-06,
+      "loss": 0.6289,
+      "step": 120
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1628470048067667,
+      "learning_rate": 4.97370054251878e-06,
+      "loss": 0.6139,
+      "step": 121
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.056119895466544,
+      "learning_rate": 4.9732253953513e-06,
+      "loss": 0.5798,
+      "step": 122
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1716513163164275,
+      "learning_rate": 4.972746017433709e-06,
+      "loss": 0.6085,
+      "step": 123
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.255856676525811,
+      "learning_rate": 4.97226240958604e-06,
+      "loss": 0.6342,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1049280498075373,
+      "learning_rate": 4.971774572635563e-06,
+      "loss": 0.6197,
+      "step": 125
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.133349390995361,
+      "learning_rate": 4.97128250741678e-06,
+      "loss": 0.5751,
+      "step": 126
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.2044887467317578,
+      "learning_rate": 4.97078621477143e-06,
+      "loss": 0.6611,
+      "step": 127
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1413863795698145,
+      "learning_rate": 4.970285695548481e-06,
+      "loss": 0.625,
+      "step": 128
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0229587336296615,
+      "learning_rate": 4.969780950604132e-06,
+      "loss": 0.5989,
+      "step": 129
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0983599595244247,
+      "learning_rate": 4.969271980801808e-06,
+      "loss": 0.5747,
+      "step": 130
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1059041140010786,
+      "learning_rate": 4.9687587870121645e-06,
+      "loss": 0.5869,
+      "step": 131
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8967441614595046,
+      "learning_rate": 4.9682413701130815e-06,
+      "loss": 0.6272,
+      "step": 132
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9976164993621088,
+      "learning_rate": 4.967719730989663e-06,
+      "loss": 0.6282,
+      "step": 133
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8719131324952145,
+      "learning_rate": 4.967193870534235e-06,
+      "loss": 0.6052,
+      "step": 134
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.071702997476533,
+      "learning_rate": 4.9666637896463455e-06,
+      "loss": 0.5785,
+      "step": 135
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9549455320048341,
+      "learning_rate": 4.966129489232762e-06,
+      "loss": 0.5739,
+      "step": 136
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0656898626759315,
+      "learning_rate": 4.9655909702074684e-06,
+      "loss": 0.6651,
+      "step": 137
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1185948604203038,
+      "learning_rate": 4.965048233491669e-06,
+      "loss": 0.5759,
+      "step": 138
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.08566019272993,
+      "learning_rate": 4.964501280013777e-06,
+      "loss": 0.6271,
+      "step": 139
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.117420903965419,
+      "learning_rate": 4.963950110709425e-06,
+      "loss": 0.5968,
+      "step": 140
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9784944143818486,
+      "learning_rate": 4.963394726521453e-06,
+      "loss": 0.6112,
+      "step": 141
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.077292948039572,
+      "learning_rate": 4.9628351283999144e-06,
+      "loss": 0.5636,
+      "step": 142
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.223803520245629,
+      "learning_rate": 4.962271317302068e-06,
+      "loss": 0.6658,
+      "step": 143
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.039369072186367,
+      "learning_rate": 4.9617032941923796e-06,
+      "loss": 0.5853,
+      "step": 144
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.071470113085907,
+      "learning_rate": 4.961131060042522e-06,
+      "loss": 0.601,
+      "step": 145
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.437470272347474,
+      "learning_rate": 4.960554615831372e-06,
+      "loss": 0.6593,
+      "step": 146
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.178684122927139,
+      "learning_rate": 4.959973962545005e-06,
+      "loss": 0.607,
+      "step": 147
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.097006749956471,
+      "learning_rate": 4.9593891011767e-06,
+      "loss": 0.5873,
+      "step": 148
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9801202541822784,
+      "learning_rate": 4.958800032726931e-06,
+      "loss": 0.5877,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.30001951085656,
+      "learning_rate": 4.958206758203373e-06,
+      "loss": 0.6368,
+      "step": 150
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.990094260131078,
+      "learning_rate": 4.957609278620891e-06,
+      "loss": 0.59,
+      "step": 151
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.262163752076628,
+      "learning_rate": 4.957007595001548e-06,
+      "loss": 0.5779,
+      "step": 152
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.1970152093220983,
+      "learning_rate": 4.956401708374595e-06,
+      "loss": 0.5894,
+      "step": 153
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.220825872684071,
+      "learning_rate": 4.9557916197764745e-06,
+      "loss": 0.6528,
+      "step": 154
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.099472677591387,
+      "learning_rate": 4.955177330250817e-06,
+      "loss": 0.5798,
+      "step": 155
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.159203936881569,
+      "learning_rate": 4.954558840848437e-06,
+      "loss": 0.6206,
+      "step": 156
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.185152414039555,
+      "learning_rate": 4.953936152627338e-06,
+      "loss": 0.5624,
+      "step": 157
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.0679748168992624,
+      "learning_rate": 4.953309266652701e-06,
+      "loss": 0.5859,
+      "step": 158
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.327237187255128,
+      "learning_rate": 4.952678183996891e-06,
+      "loss": 0.5632,
+      "step": 159
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.2865519679977417,
+      "learning_rate": 4.952042905739451e-06,
+      "loss": 0.6965,
+      "step": 160
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.523435408018699,
+      "learning_rate": 4.9514034329671e-06,
+      "loss": 0.6217,
+      "step": 161
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.4992653226709636,
+      "learning_rate": 4.950759766773734e-06,
+      "loss": 0.6175,
+      "step": 162
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.432752824777114,
+      "learning_rate": 4.950111908260423e-06,
+      "loss": 0.5862,
+      "step": 163
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.137500912204061,
+      "learning_rate": 4.949459858535404e-06,
+      "loss": 0.6124,
+      "step": 164
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.2226376224120474,
+      "learning_rate": 4.94880361871409e-06,
+      "loss": 0.5891,
+      "step": 165
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3821839805775165,
+      "learning_rate": 4.9481431899190544e-06,
+      "loss": 0.6008,
+      "step": 166
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.306242834684614,
+      "learning_rate": 4.947478573280044e-06,
+      "loss": 0.6159,
+      "step": 167
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3298092236851518,
+      "learning_rate": 4.946809769933963e-06,
+      "loss": 0.5809,
+      "step": 168
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.364296499621558,
+      "learning_rate": 4.946136781024883e-06,
+      "loss": 0.5895,
+      "step": 169
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.237241095609228,
+      "learning_rate": 4.945459607704029e-06,
+      "loss": 0.6144,
+      "step": 170
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.4027419761972264,
+      "learning_rate": 4.9447782511297905e-06,
+      "loss": 0.5985,
+      "step": 171
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1547059182244284,
+      "learning_rate": 4.944092712467709e-06,
+      "loss": 0.5763,
+      "step": 172
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1530221667047984,
+      "learning_rate": 4.9434029928904805e-06,
+      "loss": 0.5692,
+      "step": 173
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.228588593294869,
+      "learning_rate": 4.942709093577954e-06,
+      "loss": 0.5896,
+      "step": 174
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1597295307130198,
+      "learning_rate": 4.942011015717129e-06,
+      "loss": 0.5864,
+      "step": 175
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.321140955498194,
+      "learning_rate": 4.941308760502149e-06,
+      "loss": 0.6089,
+      "step": 176
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.220124736460707,
+      "learning_rate": 4.940602329134309e-06,
+      "loss": 0.5786,
+      "step": 177
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1698038563080417,
+      "learning_rate": 4.939891722822043e-06,
+      "loss": 0.5749,
+      "step": 178
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.244425969121411,
+      "learning_rate": 4.93917694278093e-06,
+      "loss": 0.5877,
+      "step": 179
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.143920008069458,
+      "learning_rate": 4.938457990233687e-06,
+      "loss": 0.6024,
+      "step": 180
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1786040820345813,
+      "learning_rate": 4.937734866410169e-06,
+      "loss": 0.5845,
+      "step": 181
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.301832824481007,
+      "learning_rate": 4.9370075725473665e-06,
+      "loss": 0.6182,
+      "step": 182
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.3748033727083997,
+      "learning_rate": 4.936276109889403e-06,
+      "loss": 0.6073,
+      "step": 183
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.476334487382023,
+      "learning_rate": 4.935540479687534e-06,
+      "loss": 0.5793,
+      "step": 184
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.2509466352322494,
+      "learning_rate": 4.934800683200143e-06,
+      "loss": 0.6133,
+      "step": 185
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.8391697547684873,
+      "learning_rate": 4.934056721692742e-06,
+      "loss": 0.5967,
+      "step": 186
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.4492364225391765,
+      "learning_rate": 4.933308596437965e-06,
+      "loss": 0.5676,
+      "step": 187
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.685548141821295,
+      "learning_rate": 4.932556308715573e-06,
+      "loss": 0.6069,
+      "step": 188
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.261217637824808,
+      "learning_rate": 4.931799859812443e-06,
+      "loss": 0.6411,
+      "step": 189
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3838284395200966,
+      "learning_rate": 4.931039251022573e-06,
+      "loss": 0.5745,
+      "step": 190
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.2550921344466164,
+      "learning_rate": 4.930274483647074e-06,
+      "loss": 0.5989,
+      "step": 191
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.078406234527636,
+      "learning_rate": 4.929505558994175e-06,
+      "loss": 0.5998,
+      "step": 192
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.592864566091496,
+      "learning_rate": 4.928732478379214e-06,
+      "loss": 0.5842,
+      "step": 193
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.092752299259724,
+      "learning_rate": 4.927955243124638e-06,
+      "loss": 0.5789,
+      "step": 194
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3799311595696966,
+      "learning_rate": 4.927173854560002e-06,
+      "loss": 0.6265,
+      "step": 195
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.246876688010602,
+      "learning_rate": 4.926388314021964e-06,
+      "loss": 0.6126,
+      "step": 196
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1409898276704578,
+      "learning_rate": 4.925598622854287e-06,
+      "loss": 0.6073,
+      "step": 197
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.5946158421875385,
+      "learning_rate": 4.924804782407834e-06,
+      "loss": 0.6154,
+      "step": 198
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1225494320427982,
+      "learning_rate": 4.924006794040562e-06,
+      "loss": 0.583,
+      "step": 199
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1971323526291338,
+      "learning_rate": 4.923204659117528e-06,
+      "loss": 0.6078,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.289185506404785,
+      "learning_rate": 4.92239837901088e-06,
+      "loss": 0.6127,
+      "step": 201
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0071007751625354,
+      "learning_rate": 4.921587955099858e-06,
+      "loss": 0.5804,
+      "step": 202
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.2981840149068247,
+      "learning_rate": 4.920773388770789e-06,
+      "loss": 0.6027,
+      "step": 203
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.236179116886702,
+      "learning_rate": 4.919954681417087e-06,
+      "loss": 0.6179,
+      "step": 204
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.007422589251611,
+      "learning_rate": 4.91913183443925e-06,
+      "loss": 0.5647,
+      "step": 205
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1402813555735483,
+      "learning_rate": 4.918304849244857e-06,
+      "loss": 0.5841,
+      "step": 206
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0456415785177104,
+      "learning_rate": 4.917473727248565e-06,
+      "loss": 0.5524,
+      "step": 207
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9673558126020942,
+      "learning_rate": 4.916638469872109e-06,
+      "loss": 0.5698,
+      "step": 208
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.015111672496819,
+      "learning_rate": 4.9157990785442964e-06,
+      "loss": 0.5957,
+      "step": 209
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9502065547578398,
+      "learning_rate": 4.9149555547010086e-06,
+      "loss": 0.5592,
+      "step": 210
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.167936522558899,
+      "learning_rate": 4.9141078997851945e-06,
+      "loss": 0.5705,
+      "step": 211
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.2066587458997935,
+      "learning_rate": 4.91325611524687e-06,
+      "loss": 0.5526,
+      "step": 212
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9132995625903553,
+      "learning_rate": 4.9124002025431136e-06,
+      "loss": 0.5767,
+      "step": 213
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.0097281107801277,
+      "learning_rate": 4.91154016313807e-06,
+      "loss": 0.6185,
+      "step": 214
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.023532008241332,
+      "learning_rate": 4.910675998502938e-06,
+      "loss": 0.6005,
+      "step": 215
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9253831001776973,
+      "learning_rate": 4.909807710115977e-06,
+      "loss": 0.5769,
+      "step": 216
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.066862408842564,
+      "learning_rate": 4.908935299462497e-06,
+      "loss": 0.5671,
+      "step": 217
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9412704290792853,
+      "learning_rate": 4.908058768034862e-06,
+      "loss": 0.5568,
+      "step": 218
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.185994457097553,
+      "learning_rate": 4.907178117332487e-06,
+      "loss": 0.5621,
+      "step": 219
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.021517127546353,
+      "learning_rate": 4.906293348861829e-06,
+      "loss": 0.5672,
+      "step": 220
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.099703967072734,
+      "learning_rate": 4.905404464136391e-06,
+      "loss": 0.5366,
+      "step": 221
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.030197056583618,
+      "learning_rate": 4.904511464676718e-06,
+      "loss": 0.6064,
+      "step": 222
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.4170102988954896,
+      "learning_rate": 4.903614352010393e-06,
+      "loss": 0.5919,
+      "step": 223
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0819468873015476,
+      "learning_rate": 4.9027131276720355e-06,
+      "loss": 0.5366,
+      "step": 224
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.148008018153629,
+      "learning_rate": 4.901807793203299e-06,
+      "loss": 0.597,
+      "step": 225
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0303725862017186,
+      "learning_rate": 4.900898350152866e-06,
+      "loss": 0.6394,
+      "step": 226
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1598989214704334,
+      "learning_rate": 4.899984800076449e-06,
+      "loss": 0.5932,
+      "step": 227
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0816312637185255,
+      "learning_rate": 4.899067144536786e-06,
+      "loss": 0.5909,
+      "step": 228
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9024067197329315,
+      "learning_rate": 4.8981453851036365e-06,
+      "loss": 0.5463,
+      "step": 229
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1830926868871043,
+      "learning_rate": 4.897219523353781e-06,
+      "loss": 0.5821,
+      "step": 230
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1156269612794016,
+      "learning_rate": 4.8962895608710195e-06,
+      "loss": 0.5993,
+      "step": 231
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9653407654210864,
+      "learning_rate": 4.895355499246162e-06,
+      "loss": 0.5525,
+      "step": 232
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.367769051061897,
+      "learning_rate": 4.894417340077036e-06,
+      "loss": 0.5683,
+      "step": 233
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.078327064466567,
+      "learning_rate": 4.893475084968474e-06,
+      "loss": 0.6184,
+      "step": 234
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1661882731589475,
+      "learning_rate": 4.8925287355323195e-06,
+      "loss": 0.6321,
+      "step": 235
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.182760952002799,
+      "learning_rate": 4.891578293387413e-06,
+      "loss": 0.6254,
+      "step": 236
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.998723579962691,
+      "learning_rate": 4.890623760159605e-06,
+      "loss": 0.5371,
+      "step": 237
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.319922346931926,
+      "learning_rate": 4.8896651374817365e-06,
+      "loss": 0.5941,
+      "step": 238
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.090735197217999,
+      "learning_rate": 4.888702426993648e-06,
+      "loss": 0.577,
+      "step": 239
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1247199987228558,
+      "learning_rate": 4.887735630342173e-06,
+      "loss": 0.5928,
+      "step": 240
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.33151114429804,
+      "learning_rate": 4.8867647491811315e-06,
+      "loss": 0.5838,
+      "step": 241
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1570026356289147,
+      "learning_rate": 4.885789785171334e-06,
+      "loss": 0.5642,
+      "step": 242
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.049571197047368,
+      "learning_rate": 4.884810739980575e-06,
+      "loss": 0.6684,
+      "step": 243
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.9810062424466381,
+      "learning_rate": 4.883827615283626e-06,
+      "loss": 0.5942,
+      "step": 244
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.145869663660159,
+      "learning_rate": 4.882840412762244e-06,
+      "loss": 0.6356,
+      "step": 245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.19290302186514,
+      "learning_rate": 4.881849134105156e-06,
+      "loss": 0.6189,
+      "step": 246
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.0561043419872984,
+      "learning_rate": 4.880853781008062e-06,
+      "loss": 0.5563,
+      "step": 247
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.8831183793224635,
+      "learning_rate": 4.879854355173638e-06,
+      "loss": 0.5522,
+      "step": 248
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.020981606684741,
+      "learning_rate": 4.878850858311518e-06,
+      "loss": 0.5548,
+      "step": 249
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.060242570493272,
+      "learning_rate": 4.877843292138307e-06,
+      "loss": 0.5715,
+      "step": 250
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.082455778933014,
+      "learning_rate": 4.8768316583775665e-06,
+      "loss": 0.5959,
+      "step": 251
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9830929719438626,
+      "learning_rate": 4.875815958759819e-06,
+      "loss": 0.5813,
+      "step": 252
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9772267506828567,
+      "learning_rate": 4.8747961950225406e-06,
+      "loss": 0.539,
+      "step": 253
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.1492561995002104,
+      "learning_rate": 4.873772368910161e-06,
+      "loss": 0.6059,
+      "step": 254
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.253757247139787,
+      "learning_rate": 4.872744482174058e-06,
+      "loss": 0.5897,
+      "step": 255
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.3282624851882496,
+      "learning_rate": 4.8717125365725545e-06,
+      "loss": 0.5675,
+      "step": 256
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.15573581133063,
+      "learning_rate": 4.8706765338709185e-06,
+      "loss": 0.5958,
+      "step": 257
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.073289220218241,
+      "learning_rate": 4.869636475841358e-06,
+      "loss": 0.6052,
+      "step": 258
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.293714090249444,
+      "learning_rate": 4.8685923642630165e-06,
+      "loss": 0.5786,
+      "step": 259
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9496544276539172,
+      "learning_rate": 4.867544200921974e-06,
+      "loss": 0.6163,
+      "step": 260
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.5267016753690132,
+      "learning_rate": 4.866491987611239e-06,
+      "loss": 0.6223,
+      "step": 261
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.8731249445320794,
+      "learning_rate": 4.865435726130751e-06,
+      "loss": 0.5632,
+      "step": 262
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.3586331105798863,
+      "learning_rate": 4.86437541828737e-06,
+      "loss": 0.5769,
+      "step": 263
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0258106914510585,
+      "learning_rate": 4.863311065894883e-06,
+      "loss": 0.6103,
+      "step": 264
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.2543614390885955,
+      "learning_rate": 4.862242670773991e-06,
+      "loss": 0.5844,
+      "step": 265
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9440299381244668,
+      "learning_rate": 4.861170234752314e-06,
+      "loss": 0.5559,
+      "step": 266
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.254538268495492,
+      "learning_rate": 4.8600937596643815e-06,
+      "loss": 0.5709,
+      "step": 267
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.007651746385687,
+      "learning_rate": 4.8590132473516346e-06,
+      "loss": 0.573,
+      "step": 268
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0735253118288837,
+      "learning_rate": 4.857928699662421e-06,
+      "loss": 0.5954,
+      "step": 269
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.024775417101569,
+      "learning_rate": 4.856840118451989e-06,
+      "loss": 0.5992,
+      "step": 270
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.1043310699945814,
+      "learning_rate": 4.855747505582488e-06,
+      "loss": 0.6507,
+      "step": 271
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0386353328313214,
+      "learning_rate": 4.854650862922965e-06,
+      "loss": 0.5666,
+      "step": 272
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.978698841367705,
+      "learning_rate": 4.853550192349358e-06,
+      "loss": 0.5593,
+      "step": 273
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.9386534247633986,
+      "learning_rate": 4.852445495744497e-06,
+      "loss": 0.5735,
+      "step": 274
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.049346245018599,
+      "learning_rate": 4.8513367749981e-06,
+      "loss": 0.5415,
+      "step": 275
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1051969521216605,
+      "learning_rate": 4.850224032006765e-06,
+      "loss": 0.5532,
+      "step": 276
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2006792558872315,
+      "learning_rate": 4.849107268673975e-06,
+      "loss": 0.5696,
+      "step": 277
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0460787736353647,
+      "learning_rate": 4.847986486910088e-06,
+      "loss": 0.5658,
+      "step": 278
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1161843259225406,
+      "learning_rate": 4.846861688632336e-06,
+      "loss": 0.583,
+      "step": 279
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.8882198480393542,
+      "learning_rate": 4.8457328757648224e-06,
+      "loss": 0.5693,
+      "step": 280
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1578413701109596,
+      "learning_rate": 4.844600050238517e-06,
+      "loss": 0.5409,
+      "step": 281
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.03912467778954,
+      "learning_rate": 4.843463213991255e-06,
+      "loss": 0.5908,
+      "step": 282
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2333462480826247,
+      "learning_rate": 4.842322368967731e-06,
+      "loss": 0.6088,
+      "step": 283
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.06698702157327,
+      "learning_rate": 4.8411775171194986e-06,
+      "loss": 0.5953,
+      "step": 284
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1433923121572045,
+      "learning_rate": 4.840028660404964e-06,
+      "loss": 0.5851,
+      "step": 285
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.214858780835041,
+      "learning_rate": 4.838875800789386e-06,
+      "loss": 0.5913,
+      "step": 286
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.038128612492624,
+      "learning_rate": 4.837718940244871e-06,
+      "loss": 0.5827,
+      "step": 287
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9894065096959768,
+      "learning_rate": 4.836558080750365e-06,
+      "loss": 0.5769,
+      "step": 288
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1711590153285822,
+      "learning_rate": 4.835393224291662e-06,
+      "loss": 0.654,
+      "step": 289
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.105004451988696,
+      "learning_rate": 4.834224372861386e-06,
+      "loss": 0.6158,
+      "step": 290
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9554568023729102,
+      "learning_rate": 4.833051528459001e-06,
+      "loss": 0.5807,
+      "step": 291
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2693917834500312,
+      "learning_rate": 4.831874693090797e-06,
+      "loss": 0.5557,
+      "step": 292
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9081391627126192,
+      "learning_rate": 4.830693868769892e-06,
+      "loss": 0.6057,
+      "step": 293
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2133664110768585,
+      "learning_rate": 4.82950905751623e-06,
+      "loss": 0.6103,
+      "step": 294
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.015392814211589,
+      "learning_rate": 4.8283202613565735e-06,
+      "loss": 0.5578,
+      "step": 295
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.142124020349717,
+      "learning_rate": 4.8271274823245e-06,
+      "loss": 0.5675,
+      "step": 296
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.981611826462286,
+      "learning_rate": 4.825930722460405e-06,
+      "loss": 0.5696,
+      "step": 297
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.966759748348117,
+      "learning_rate": 4.824729983811486e-06,
+      "loss": 0.58,
+      "step": 298
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0117040369769397,
+      "learning_rate": 4.823525268431754e-06,
+      "loss": 0.6005,
+      "step": 299
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9579664917991193,
+      "learning_rate": 4.822316578382019e-06,
+      "loss": 0.5472,
+      "step": 300
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9075723479635032,
+      "learning_rate": 4.821103915729892e-06,
+      "loss": 0.5834,
+      "step": 301
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.289340229011896,
+      "learning_rate": 4.819887282549777e-06,
+      "loss": 0.6088,
+      "step": 302
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0410700553735235,
+      "learning_rate": 4.818666680922874e-06,
+      "loss": 0.5449,
+      "step": 303
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.074434792511819,
+      "learning_rate": 4.8174421129371675e-06,
+      "loss": 0.5826,
+      "step": 304
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.1377170527698865,
+      "learning_rate": 4.816213580687428e-06,
+      "loss": 0.6262,
+      "step": 305
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.060340839248083,
+      "learning_rate": 4.814981086275209e-06,
+      "loss": 0.5479,
+      "step": 306
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.007036467413588,
+      "learning_rate": 4.813744631808841e-06,
+      "loss": 0.5642,
+      "step": 307
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.016779606220332,
+      "learning_rate": 4.8125042194034285e-06,
+      "loss": 0.5503,
+      "step": 308
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.930004252757651,
+      "learning_rate": 4.811259851180845e-06,
+      "loss": 0.582,
+      "step": 309
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9179477992752856,
+      "learning_rate": 4.810011529269734e-06,
+      "loss": 0.5678,
+      "step": 310
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.023430757276848,
+      "learning_rate": 4.808759255805498e-06,
+      "loss": 0.614,
+      "step": 311
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.8334738409404936,
+      "learning_rate": 4.807503032930306e-06,
+      "loss": 0.5742,
+      "step": 312
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.937332706274502,
+      "learning_rate": 4.806242862793075e-06,
+      "loss": 0.6257,
+      "step": 313
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0265383045700363,
+      "learning_rate": 4.8049787475494786e-06,
+      "loss": 0.5733,
+      "step": 314
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.056444039073761,
+      "learning_rate": 4.803710689361939e-06,
+      "loss": 0.578,
+      "step": 315
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.411132719183335,
+      "learning_rate": 4.802438690399622e-06,
+      "loss": 0.5778,
+      "step": 316
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0233969242222853,
+      "learning_rate": 4.801162752838436e-06,
+      "loss": 0.5649,
+      "step": 317
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.2809121915132815,
+      "learning_rate": 4.799882878861025e-06,
+      "loss": 0.5589,
+      "step": 318
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9806834041020271,
+      "learning_rate": 4.798599070656768e-06,
+      "loss": 0.5753,
+      "step": 319
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.095099671577702,
+      "learning_rate": 4.797311330421773e-06,
+      "loss": 0.5644,
+      "step": 320
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1697606190375764,
+      "learning_rate": 4.796019660358877e-06,
+      "loss": 0.6009,
+      "step": 321
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9549416103216173,
+      "learning_rate": 4.794724062677635e-06,
+      "loss": 0.5429,
+      "step": 322
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9986949357292838,
+      "learning_rate": 4.793424539594323e-06,
+      "loss": 0.5456,
+      "step": 323
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9414831957796765,
+      "learning_rate": 4.792121093331935e-06,
+      "loss": 0.5468,
+      "step": 324
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.100702188933012,
+      "learning_rate": 4.7908137261201685e-06,
+      "loss": 0.5763,
+      "step": 325
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.2747471285831025,
+      "learning_rate": 4.789502440195436e-06,
+      "loss": 0.5637,
+      "step": 326
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8996382919319124,
+      "learning_rate": 4.788187237800849e-06,
+      "loss": 0.5285,
+      "step": 327
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.3451495174978847,
+      "learning_rate": 4.786868121186218e-06,
+      "loss": 0.5638,
+      "step": 328
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.0437536068229565,
+      "learning_rate": 4.7855450926080535e-06,
+      "loss": 0.5282,
+      "step": 329
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1185488514745554,
+      "learning_rate": 4.784218154329555e-06,
+      "loss": 0.5689,
+      "step": 330
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.08745956731504,
+      "learning_rate": 4.78288730862061e-06,
+      "loss": 0.5772,
+      "step": 331
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9479507156354359,
+      "learning_rate": 4.781552557757789e-06,
+      "loss": 0.5419,
+      "step": 332
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0211480847937255,
+      "learning_rate": 4.780213904024346e-06,
+      "loss": 0.5757,
+      "step": 333
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9075335749936069,
+      "learning_rate": 4.7788713497102094e-06,
+      "loss": 0.5693,
+      "step": 334
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9590727137410602,
+      "learning_rate": 4.777524897111979e-06,
+      "loss": 0.5501,
+      "step": 335
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0328480247612752,
+      "learning_rate": 4.776174548532926e-06,
+      "loss": 0.587,
+      "step": 336
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.062540517496736,
+      "learning_rate": 4.774820306282982e-06,
+      "loss": 0.5819,
+      "step": 337
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0054452800156195,
+      "learning_rate": 4.773462172678744e-06,
+      "loss": 0.5529,
+      "step": 338
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9641125644599562,
+      "learning_rate": 4.772100150043462e-06,
+      "loss": 0.5895,
+      "step": 339
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9196744569285298,
+      "learning_rate": 4.77073424070704e-06,
+      "loss": 0.5504,
+      "step": 340
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0002752186146484,
+      "learning_rate": 4.76936444700603e-06,
+      "loss": 0.5307,
+      "step": 341
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.1068919823054344,
+      "learning_rate": 4.76799077128363e-06,
+      "loss": 0.5908,
+      "step": 342
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.919597745459612,
+      "learning_rate": 4.766613215889678e-06,
+      "loss": 0.5423,
+      "step": 343
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0670928578728716,
+      "learning_rate": 4.765231783180648e-06,
+      "loss": 0.5901,
+      "step": 344
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.906116148793229,
+      "learning_rate": 4.763846475519648e-06,
+      "loss": 0.5919,
+      "step": 345
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9133575268702454,
+      "learning_rate": 4.762457295276413e-06,
+      "loss": 0.585,
+      "step": 346
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.133902651855379,
+      "learning_rate": 4.7610642448273025e-06,
+      "loss": 0.5444,
+      "step": 347
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.95222194640397,
+      "learning_rate": 4.7596673265552985e-06,
+      "loss": 0.5941,
+      "step": 348
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.095010268380277,
+      "learning_rate": 4.758266542849997e-06,
+      "loss": 0.6045,
+      "step": 349
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0493864712059655,
+      "learning_rate": 4.756861896107609e-06,
+      "loss": 0.6011,
+      "step": 350
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9222198823064967,
+      "learning_rate": 4.755453388730949e-06,
+      "loss": 0.5521,
+      "step": 351
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.368147154955994,
+      "learning_rate": 4.754041023129442e-06,
+      "loss": 0.6117,
+      "step": 352
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9734596786106697,
+      "learning_rate": 4.752624801719108e-06,
+      "loss": 0.5727,
+      "step": 353
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.151510566977991,
+      "learning_rate": 4.751204726922564e-06,
+      "loss": 0.6085,
+      "step": 354
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9291219072892685,
+      "learning_rate": 4.74978080116902e-06,
+      "loss": 0.5655,
+      "step": 355
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.838592559018919,
+      "learning_rate": 4.748353026894273e-06,
+      "loss": 0.5508,
+      "step": 356
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.069156589116884,
+      "learning_rate": 4.7469214065407e-06,
+      "loss": 0.5942,
+      "step": 357
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8960817746615841,
+      "learning_rate": 4.745485942557264e-06,
+      "loss": 0.5902,
+      "step": 358
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.0606557307859634,
+      "learning_rate": 4.744046637399497e-06,
+      "loss": 0.556,
+      "step": 359
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9660065879130573,
+      "learning_rate": 4.742603493529505e-06,
+      "loss": 0.5364,
+      "step": 360
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9647921383638112,
+      "learning_rate": 4.741156513415958e-06,
+      "loss": 0.5601,
+      "step": 361
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.049074688423064,
+      "learning_rate": 4.739705699534092e-06,
+      "loss": 0.556,
+      "step": 362
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.962593945802751,
+      "learning_rate": 4.738251054365697e-06,
+      "loss": 0.5609,
+      "step": 363
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.059675349950347,
+      "learning_rate": 4.736792580399119e-06,
+      "loss": 0.5499,
+      "step": 364
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8479566025134508,
+      "learning_rate": 4.7353302801292555e-06,
+      "loss": 0.5621,
+      "step": 365
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9405450724813613,
+      "learning_rate": 4.733864156057545e-06,
+      "loss": 0.5437,
+      "step": 366
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.122487864033456,
+      "learning_rate": 4.7323942106919715e-06,
+      "loss": 0.5984,
+      "step": 367
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.6822841144123046,
+      "learning_rate": 4.730920446547052e-06,
+      "loss": 0.5951,
+      "step": 368
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.001405394086718,
+      "learning_rate": 4.729442866143838e-06,
+      "loss": 0.5552,
+      "step": 369
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.081154186949651,
+      "learning_rate": 4.72796147200991e-06,
+      "loss": 0.587,
+      "step": 370
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.1196544292473236,
+      "learning_rate": 4.72647626667937e-06,
+      "loss": 0.5882,
+      "step": 371
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.107445583509131,
+      "learning_rate": 4.724987252692841e-06,
+      "loss": 0.5389,
+      "step": 372
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9529785007256542,
+      "learning_rate": 4.723494432597462e-06,
+      "loss": 0.6439,
+      "step": 373
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.11513441515607,
+      "learning_rate": 4.72199780894688e-06,
+      "loss": 0.6089,
+      "step": 374
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9769899713721226,
+      "learning_rate": 4.7204973843012504e-06,
+      "loss": 0.5393,
+      "step": 375
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.063749623036316,
+      "learning_rate": 4.718993161227231e-06,
+      "loss": 0.5987,
+      "step": 376
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.0515862288253883,
+      "learning_rate": 4.717485142297977e-06,
+      "loss": 0.5772,
+      "step": 377
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.8962297741946081,
+      "learning_rate": 4.715973330093135e-06,
+      "loss": 0.5424,
+      "step": 378
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.2210958340400087,
+      "learning_rate": 4.7144577271988435e-06,
+      "loss": 0.6072,
+      "step": 379
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.067113337475314,
+      "learning_rate": 4.712938336207724e-06,
+      "loss": 0.5482,
+      "step": 380
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.8985489253954526,
+      "learning_rate": 4.711415159718876e-06,
+      "loss": 0.5593,
+      "step": 381
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.085236381118245,
+      "learning_rate": 4.709888200337879e-06,
+      "loss": 0.5704,
+      "step": 382
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0967664183909784,
+      "learning_rate": 4.708357460676779e-06,
+      "loss": 0.5997,
+      "step": 383
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0454278026009645,
+      "learning_rate": 4.706822943354092e-06,
+      "loss": 0.5669,
+      "step": 384
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9171673309342674,
+      "learning_rate": 4.705284650994793e-06,
+      "loss": 0.517,
+      "step": 385
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.2003223432761287,
+      "learning_rate": 4.70374258623032e-06,
+      "loss": 0.5957,
+      "step": 386
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.936392519491186,
+      "learning_rate": 4.702196751698557e-06,
+      "loss": 0.5767,
+      "step": 387
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.354272003403086,
+      "learning_rate": 4.700647150043841e-06,
+      "loss": 0.6515,
+      "step": 388
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9115059027323418,
+      "learning_rate": 4.699093783916955e-06,
+      "loss": 0.5579,
+      "step": 389
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9878827587010002,
+      "learning_rate": 4.697536655975115e-06,
+      "loss": 0.572,
+      "step": 390
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9729552535473858,
+      "learning_rate": 4.69597576888198e-06,
+      "loss": 0.5665,
+      "step": 391
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.177634366499155,
+      "learning_rate": 4.694411125307632e-06,
+      "loss": 0.6363,
+      "step": 392
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8955146664976508,
+      "learning_rate": 4.692842727928584e-06,
+      "loss": 0.5682,
+      "step": 393
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.175305874476245,
+      "learning_rate": 4.691270579427769e-06,
+      "loss": 0.5943,
+      "step": 394
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.068140527232831,
+      "learning_rate": 4.689694682494537e-06,
+      "loss": 0.5659,
+      "step": 395
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9112960694448755,
+      "learning_rate": 4.688115039824648e-06,
+      "loss": 0.6048,
+      "step": 396
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9778305624626604,
+      "learning_rate": 4.686531654120272e-06,
+      "loss": 0.5695,
+      "step": 397
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.096904163204813,
+      "learning_rate": 4.684944528089981e-06,
+      "loss": 0.6113,
+      "step": 398
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.0011934144948516,
+      "learning_rate": 4.683353664448745e-06,
+      "loss": 0.5568,
+      "step": 399
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8562851971757464,
+      "learning_rate": 4.681759065917929e-06,
+      "loss": 0.5474,
+      "step": 400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8190547574166316,
+      "learning_rate": 4.680160735225285e-06,
+      "loss": 0.5315,
+      "step": 401
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9247862956929132,
+      "learning_rate": 4.6785586751049505e-06,
+      "loss": 0.5568,
+      "step": 402
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8469793674077621,
+      "learning_rate": 4.676952888297442e-06,
+      "loss": 0.5811,
+      "step": 403
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.946943145198674,
+      "learning_rate": 4.675343377549653e-06,
+      "loss": 0.5475,
+      "step": 404
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.991304422730463,
+      "learning_rate": 4.6737301456148445e-06,
+      "loss": 0.5856,
+      "step": 405
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9168241989446437,
+      "learning_rate": 4.672113195252644e-06,
+      "loss": 0.6069,
+      "step": 406
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9305433665377905,
+      "learning_rate": 4.670492529229039e-06,
+      "loss": 0.5536,
+      "step": 407
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8441008898830742,
+      "learning_rate": 4.668868150316377e-06,
+      "loss": 0.5859,
+      "step": 408
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8879301596961315,
+      "learning_rate": 4.667240061293351e-06,
+      "loss": 0.5483,
+      "step": 409
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.024767417636281,
+      "learning_rate": 4.665608264945004e-06,
+      "loss": 0.5414,
+      "step": 410
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.1331610141797395,
+      "learning_rate": 4.663972764062722e-06,
+      "loss": 0.5811,
+      "step": 411
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8132480265817386,
+      "learning_rate": 4.662333561444226e-06,
+      "loss": 0.5573,
+      "step": 412
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9795813972027145,
+      "learning_rate": 4.6606906598935675e-06,
+      "loss": 0.5814,
+      "step": 413
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8782931074297053,
+      "learning_rate": 4.6590440622211295e-06,
+      "loss": 0.569,
+      "step": 414
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8219945335518706,
+      "learning_rate": 4.657393771243614e-06,
+      "loss": 0.5669,
+      "step": 415
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.4047268604371306,
+      "learning_rate": 4.6557397897840454e-06,
+      "loss": 0.5602,
+      "step": 416
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.064501780523946,
+      "learning_rate": 4.654082120671757e-06,
+      "loss": 0.5699,
+      "step": 417
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9183128854940252,
+      "learning_rate": 4.65242076674239e-06,
+      "loss": 0.6112,
+      "step": 418
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9315698971629633,
+      "learning_rate": 4.650755730837894e-06,
+      "loss": 0.5537,
+      "step": 419
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9527809333659218,
+      "learning_rate": 4.649087015806509e-06,
+      "loss": 0.5423,
+      "step": 420
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8940523915995442,
+      "learning_rate": 4.647414624502777e-06,
+      "loss": 0.5708,
+      "step": 421
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9976964785548623,
+      "learning_rate": 4.645738559787524e-06,
+      "loss": 0.6006,
+      "step": 422
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9098681403283917,
+      "learning_rate": 4.64405882452786e-06,
+      "loss": 0.5591,
+      "step": 423
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8695612182804557,
+      "learning_rate": 4.642375421597175e-06,
+      "loss": 0.5219,
+      "step": 424
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8912077704810082,
+      "learning_rate": 4.6406883538751315e-06,
+      "loss": 0.5224,
+      "step": 425
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9390714726978922,
+      "learning_rate": 4.638997624247664e-06,
+      "loss": 0.5359,
+      "step": 426
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.051545992296337,
+      "learning_rate": 4.637303235606968e-06,
+      "loss": 0.544,
+      "step": 427
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.0657109136265914,
+      "learning_rate": 4.6356051908515e-06,
+      "loss": 0.5429,
+      "step": 428
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0301022307984793,
+      "learning_rate": 4.63390349288597e-06,
+      "loss": 0.5787,
+      "step": 429
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.052515756169346,
+      "learning_rate": 4.632198144621338e-06,
+      "loss": 0.5778,
+      "step": 430
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9741370495474897,
+      "learning_rate": 4.630489148974807e-06,
+      "loss": 0.5142,
+      "step": 431
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9713229498863698,
+      "learning_rate": 4.62877650886982e-06,
+      "loss": 0.6127,
+      "step": 432
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.1609440121306007,
+      "learning_rate": 4.627060227236055e-06,
+      "loss": 0.5886,
+      "step": 433
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.944966445355139,
+      "learning_rate": 4.625340307009418e-06,
+      "loss": 0.5657,
+      "step": 434
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.031003925680835,
+      "learning_rate": 4.623616751132041e-06,
+      "loss": 0.5628,
+      "step": 435
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8774113373137704,
+      "learning_rate": 4.621889562552272e-06,
+      "loss": 0.6068,
+      "step": 436
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0385201543401785,
+      "learning_rate": 4.620158744224677e-06,
+      "loss": 0.5511,
+      "step": 437
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8440750841938207,
+      "learning_rate": 4.618424299110028e-06,
+      "loss": 0.5261,
+      "step": 438
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8978691755923442,
+      "learning_rate": 4.616686230175303e-06,
+      "loss": 0.5862,
+      "step": 439
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8120850246861446,
+      "learning_rate": 4.614944540393679e-06,
+      "loss": 0.5652,
+      "step": 440
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1821084695714914,
+      "learning_rate": 4.613199232744525e-06,
+      "loss": 0.5598,
+      "step": 441
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9626422737625222,
+      "learning_rate": 4.611450310213401e-06,
+      "loss": 0.5267,
+      "step": 442
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9714913234889215,
+      "learning_rate": 4.6096977757920505e-06,
+      "loss": 0.5658,
+      "step": 443
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0179324078198233,
+      "learning_rate": 4.607941632478393e-06,
+      "loss": 0.582,
+      "step": 444
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.8565193856331161,
+      "learning_rate": 4.6061818832765246e-06,
+      "loss": 0.5715,
+      "step": 445
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9798501479599246,
+      "learning_rate": 4.604418531196708e-06,
+      "loss": 0.6007,
+      "step": 446
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0095846956468257,
+      "learning_rate": 4.602651579255369e-06,
+      "loss": 0.5947,
+      "step": 447
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9316541079988245,
+      "learning_rate": 4.600881030475093e-06,
+      "loss": 0.5501,
+      "step": 448
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.080069353365406,
+      "learning_rate": 4.599106887884616e-06,
+      "loss": 0.5631,
+      "step": 449
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.965973137652201,
+      "learning_rate": 4.5973291545188235e-06,
+      "loss": 0.5267,
+      "step": 450
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1082225966704087,
+      "learning_rate": 4.595547833418741e-06,
+      "loss": 0.6418,
+      "step": 451
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0359312594194083,
+      "learning_rate": 4.593762927631536e-06,
+      "loss": 0.5644,
+      "step": 452
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.1254892914109433,
+      "learning_rate": 4.591974440210502e-06,
+      "loss": 0.5693,
+      "step": 453
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9121188587334927,
+      "learning_rate": 4.590182374215064e-06,
+      "loss": 0.5572,
+      "step": 454
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9348642624953207,
+      "learning_rate": 4.588386732710765e-06,
+      "loss": 0.5446,
+      "step": 455
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8667846547370581,
+      "learning_rate": 4.5865875187692695e-06,
+      "loss": 0.5681,
+      "step": 456
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9219061327454674,
+      "learning_rate": 4.5847847354683465e-06,
+      "loss": 0.5508,
+      "step": 457
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8106132369123122,
+      "learning_rate": 4.5829783858918756e-06,
+      "loss": 0.5626,
+      "step": 458
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.7827483964442634,
+      "learning_rate": 4.5811684731298355e-06,
+      "loss": 0.5575,
+      "step": 459
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9284196979863513,
+      "learning_rate": 4.5793550002783e-06,
+      "loss": 0.5363,
+      "step": 460
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.029647468705457,
+      "learning_rate": 4.577537970439433e-06,
+      "loss": 0.5415,
+      "step": 461
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.0997127029950087,
+      "learning_rate": 4.575717386721482e-06,
+      "loss": 0.5814,
+      "step": 462
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9589290300656341,
+      "learning_rate": 4.573893252238777e-06,
+      "loss": 0.5156,
+      "step": 463
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.905237143908251,
+      "learning_rate": 4.572065570111717e-06,
+      "loss": 0.5536,
+      "step": 464
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.929519794935609,
+      "learning_rate": 4.570234343466775e-06,
+      "loss": 0.5879,
+      "step": 465
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.096095808886982,
+      "learning_rate": 4.568399575436484e-06,
+      "loss": 0.6241,
+      "step": 466
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9486118894048778,
+      "learning_rate": 4.566561269159437e-06,
+      "loss": 0.6307,
+      "step": 467
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.0839490306744586,
+      "learning_rate": 4.564719427780276e-06,
+      "loss": 0.5655,
+      "step": 468
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9439525665822102,
+      "learning_rate": 4.562874054449694e-06,
+      "loss": 0.5437,
+      "step": 469
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9409142791465297,
+      "learning_rate": 4.5610251523244244e-06,
+      "loss": 0.6429,
+      "step": 470
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8664574493795525,
+      "learning_rate": 4.559172724567238e-06,
+      "loss": 0.5826,
+      "step": 471
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.80819349503324,
+      "learning_rate": 4.557316774346934e-06,
+      "loss": 0.5372,
+      "step": 472
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8680097526865296,
+      "learning_rate": 4.555457304838341e-06,
+      "loss": 0.5503,
+      "step": 473
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.7466938790815696,
+      "learning_rate": 4.553594319222303e-06,
+      "loss": 0.5425,
+      "step": 474
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9610557658505607,
+      "learning_rate": 4.551727820685684e-06,
+      "loss": 0.5755,
+      "step": 475
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9414839604282412,
+      "learning_rate": 4.549857812421353e-06,
+      "loss": 0.5915,
+      "step": 476
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8484957644576423,
+      "learning_rate": 4.547984297628186e-06,
+      "loss": 0.5676,
+      "step": 477
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.074524028551078,
+      "learning_rate": 4.546107279511055e-06,
+      "loss": 0.6084,
+      "step": 478
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.069692704122282,
+      "learning_rate": 4.544226761280826e-06,
+      "loss": 0.5676,
+      "step": 479
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8975472248317244,
+      "learning_rate": 4.54234274615435e-06,
+      "loss": 0.5904,
+      "step": 480
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0118868982719897,
+      "learning_rate": 4.540455237354466e-06,
+      "loss": 0.5722,
+      "step": 481
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9733105429381828,
+      "learning_rate": 4.5385642381099814e-06,
+      "loss": 0.6112,
+      "step": 482
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.862156914026863,
+      "learning_rate": 4.53666975165568e-06,
+      "loss": 0.5951,
+      "step": 483
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9512940035297868,
+      "learning_rate": 4.53477178123231e-06,
+      "loss": 0.5223,
+      "step": 484
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9202464191558823,
+      "learning_rate": 4.532870330086577e-06,
+      "loss": 0.5638,
+      "step": 485
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9015767656854419,
+      "learning_rate": 4.530965401471143e-06,
+      "loss": 0.5911,
+      "step": 486
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.95190921973106,
+      "learning_rate": 4.529056998644619e-06,
+      "loss": 0.6053,
+      "step": 487
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0058459596081644,
+      "learning_rate": 4.527145124871556e-06,
+      "loss": 0.5466,
+      "step": 488
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8902620959998047,
+      "learning_rate": 4.5252297834224454e-06,
+      "loss": 0.5526,
+      "step": 489
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.985466416169018,
+      "learning_rate": 4.523310977573711e-06,
+      "loss": 0.5958,
+      "step": 490
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.1140148957176415,
+      "learning_rate": 4.521388710607699e-06,
+      "loss": 0.613,
+      "step": 491
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9470601192089525,
+      "learning_rate": 4.51946298581268e-06,
+      "loss": 0.5847,
+      "step": 492
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0227057176069603,
+      "learning_rate": 4.51753380648284e-06,
+      "loss": 0.5784,
+      "step": 493
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.05501863673554,
+      "learning_rate": 4.515601175918269e-06,
+      "loss": 0.5501,
+      "step": 494
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0129325402811715,
+      "learning_rate": 4.513665097424967e-06,
+      "loss": 0.5641,
+      "step": 495
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0322333044110468,
+      "learning_rate": 4.51172557431483e-06,
+      "loss": 0.5422,
+      "step": 496
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9573055659958774,
+      "learning_rate": 4.509782609905644e-06,
+      "loss": 0.516,
+      "step": 497
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8223127451485421,
+      "learning_rate": 4.507836207521085e-06,
+      "loss": 0.5714,
+      "step": 498
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9343089861079434,
+      "learning_rate": 4.50588637049071e-06,
+      "loss": 0.5424,
+      "step": 499
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8940990649350729,
+      "learning_rate": 4.503933102149948e-06,
+      "loss": 0.5832,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.908617301933682,
+      "learning_rate": 4.501976405840101e-06,
+      "loss": 0.5399,
+      "step": 501
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8290259512093785,
+      "learning_rate": 4.500016284908334e-06,
+      "loss": 0.5561,
+      "step": 502
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9840280991844164,
+      "learning_rate": 4.49805274270767e-06,
+      "loss": 0.5645,
+      "step": 503
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9864953051636856,
+      "learning_rate": 4.496085782596984e-06,
+      "loss": 0.5369,
+      "step": 504
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.979387839103732,
+      "learning_rate": 4.494115407940999e-06,
+      "loss": 0.6196,
+      "step": 505
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9266869362165981,
+      "learning_rate": 4.492141622110279e-06,
+      "loss": 0.5687,
+      "step": 506
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9887461782376619,
+      "learning_rate": 4.4901644284812205e-06,
+      "loss": 0.5264,
+      "step": 507
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8717867803152208,
+      "learning_rate": 4.488183830436052e-06,
+      "loss": 0.5612,
+      "step": 508
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.0044226171493,
+      "learning_rate": 4.486199831362828e-06,
+      "loss": 0.5571,
+      "step": 509
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.1075571016617958,
+      "learning_rate": 4.484212434655414e-06,
+      "loss": 0.5642,
+      "step": 510
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8031612547539957,
+      "learning_rate": 4.482221643713494e-06,
+      "loss": 0.5805,
+      "step": 511
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8782516337672304,
+      "learning_rate": 4.480227461942556e-06,
+      "loss": 0.5596,
+      "step": 512
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.075073901596185,
+      "learning_rate": 4.478229892753886e-06,
+      "loss": 0.6124,
+      "step": 513
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0588983460568304,
+      "learning_rate": 4.47622893956457e-06,
+      "loss": 0.5589,
+      "step": 514
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.850248236464706,
+      "learning_rate": 4.474224605797476e-06,
+      "loss": 0.5603,
+      "step": 515
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.932844310652863,
+      "learning_rate": 4.472216894881261e-06,
+      "loss": 0.5571,
+      "step": 516
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.09975454805468,
+      "learning_rate": 4.470205810250357e-06,
+      "loss": 0.5975,
+      "step": 517
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.9694087093010304,
+      "learning_rate": 4.468191355344965e-06,
+      "loss": 0.5698,
+      "step": 518
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8794788153917539,
+      "learning_rate": 4.466173533611053e-06,
+      "loss": 0.5559,
+      "step": 519
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0650455557855434,
+      "learning_rate": 4.46415234850035e-06,
+      "loss": 0.5644,
+      "step": 520
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0062649027982022,
+      "learning_rate": 4.462127803470334e-06,
+      "loss": 0.608,
+      "step": 521
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.043267877462657,
+      "learning_rate": 4.460099901984235e-06,
+      "loss": 0.573,
+      "step": 522
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.056372436619027,
+      "learning_rate": 4.4580686475110235e-06,
+      "loss": 0.5748,
+      "step": 523
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8871033520138176,
+      "learning_rate": 4.456034043525404e-06,
+      "loss": 0.5339,
+      "step": 524
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.889474616209236,
+      "learning_rate": 4.45399609350781e-06,
+      "loss": 0.5185,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9767406217632912,
+      "learning_rate": 4.451954800944405e-06,
+      "loss": 0.5758,
+      "step": 526
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9588695861513832,
+      "learning_rate": 4.449910169327062e-06,
+      "loss": 0.5472,
+      "step": 527
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8852210889000718,
+      "learning_rate": 4.447862202153372e-06,
+      "loss": 0.5917,
+      "step": 528
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0103638871993077,
+      "learning_rate": 4.445810902926629e-06,
+      "loss": 0.5761,
+      "step": 529
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.201836945389513,
+      "learning_rate": 4.443756275155827e-06,
+      "loss": 0.5614,
+      "step": 530
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.900702305836831,
+      "learning_rate": 4.441698322355656e-06,
+      "loss": 0.5254,
+      "step": 531
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.134694583439314,
+      "learning_rate": 4.4396370480464915e-06,
+      "loss": 0.5607,
+      "step": 532
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8073751630381198,
+      "learning_rate": 4.437572455754391e-06,
+      "loss": 0.536,
+      "step": 533
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9607338020142653,
+      "learning_rate": 4.435504549011088e-06,
+      "loss": 0.59,
+      "step": 534
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0756430867435274,
+      "learning_rate": 4.433433331353988e-06,
+      "loss": 0.5538,
+      "step": 535
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8280570853718465,
+      "learning_rate": 4.431358806326158e-06,
+      "loss": 0.5789,
+      "step": 536
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2005143967434977,
+      "learning_rate": 4.429280977476321e-06,
+      "loss": 0.545,
+      "step": 537
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.896479397543979,
+      "learning_rate": 4.4271998483588565e-06,
+      "loss": 0.5791,
+      "step": 538
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.117773381781195,
+      "learning_rate": 4.425115422533785e-06,
+      "loss": 0.5234,
+      "step": 539
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.4438942429566617,
+      "learning_rate": 4.423027703566769e-06,
+      "loss": 0.5692,
+      "step": 540
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.873481152225171,
+      "learning_rate": 4.4209366950291025e-06,
+      "loss": 0.5739,
+      "step": 541
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8655199147974673,
+      "learning_rate": 4.4188424004977085e-06,
+      "loss": 0.5795,
+      "step": 542
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.948840412241188,
+      "learning_rate": 4.416744823555129e-06,
+      "loss": 0.5304,
+      "step": 543
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8389034133315045,
+      "learning_rate": 4.414643967789523e-06,
+      "loss": 0.5076,
+      "step": 544
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8269235720085213,
+      "learning_rate": 4.412539836794657e-06,
+      "loss": 0.5837,
+      "step": 545
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.1298715969759505,
+      "learning_rate": 4.410432434169902e-06,
+      "loss": 0.5694,
+      "step": 546
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0057741366005746,
+      "learning_rate": 4.408321763520223e-06,
+      "loss": 0.557,
+      "step": 547
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.7901331374893255,
+      "learning_rate": 4.406207828456177e-06,
+      "loss": 0.5746,
+      "step": 548
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.1994839889416187,
+      "learning_rate": 4.404090632593904e-06,
+      "loss": 0.5407,
+      "step": 549
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9664921082690268,
+      "learning_rate": 4.401970179555123e-06,
+      "loss": 0.5322,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9933486180243851,
+      "learning_rate": 4.399846472967124e-06,
+      "loss": 0.5798,
+      "step": 551
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.986612256562151,
+      "learning_rate": 4.397719516462765e-06,
+      "loss": 0.5213,
+      "step": 552
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.046550123292336,
+      "learning_rate": 4.395589313680459e-06,
+      "loss": 0.5857,
+      "step": 553
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.7902327250340486,
+      "learning_rate": 4.393455868264176e-06,
+      "loss": 0.555,
+      "step": 554
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.0203627138517146,
+      "learning_rate": 4.391319183863432e-06,
+      "loss": 0.6329,
+      "step": 555
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9373549045181289,
+      "learning_rate": 4.389179264133281e-06,
+      "loss": 0.566,
+      "step": 556
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8936753353678124,
+      "learning_rate": 4.387036112734316e-06,
+      "loss": 0.5555,
+      "step": 557
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8493817575820743,
+      "learning_rate": 4.3848897333326545e-06,
+      "loss": 0.5427,
+      "step": 558
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9119588677783816,
+      "learning_rate": 4.382740129599937e-06,
+      "loss": 0.5157,
+      "step": 559
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8190137094200924,
+      "learning_rate": 4.380587305213321e-06,
+      "loss": 0.503,
+      "step": 560
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9891332712764953,
+      "learning_rate": 4.37843126385547e-06,
+      "loss": 0.5761,
+      "step": 561
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8620896547461154,
+      "learning_rate": 4.376272009214555e-06,
+      "loss": 0.5259,
+      "step": 562
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8896721756477406,
+      "learning_rate": 4.37410954498424e-06,
+      "loss": 0.5632,
+      "step": 563
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8302281976781984,
+      "learning_rate": 4.37194387486368e-06,
+      "loss": 0.5612,
+      "step": 564
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0721820586440165,
+      "learning_rate": 4.369775002557516e-06,
+      "loss": 0.533,
+      "step": 565
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8259926551813157,
+      "learning_rate": 4.367602931775865e-06,
+      "loss": 0.526,
+      "step": 566
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8096334574000785,
+      "learning_rate": 4.3654276662343155e-06,
+      "loss": 0.5306,
+      "step": 567
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9675637591445598,
+      "learning_rate": 4.363249209653922e-06,
+      "loss": 0.5577,
+      "step": 568
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8800389115841605,
+      "learning_rate": 4.361067565761197e-06,
+      "loss": 0.5553,
+      "step": 569
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.827485496395265,
+      "learning_rate": 4.358882738288105e-06,
+      "loss": 0.5587,
+      "step": 570
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.820954908943235,
+      "learning_rate": 4.356694730972056e-06,
+      "loss": 0.6186,
+      "step": 571
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.952072431699686,
+      "learning_rate": 4.3545035475559025e-06,
+      "loss": 0.5488,
+      "step": 572
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8292648968688423,
+      "learning_rate": 4.352309191787924e-06,
+      "loss": 0.5534,
+      "step": 573
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.826293122529813,
+      "learning_rate": 4.350111667421835e-06,
+      "loss": 0.5872,
+      "step": 574
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9251425791166785,
+      "learning_rate": 4.347910978216763e-06,
+      "loss": 0.5298,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8330818196811385,
+      "learning_rate": 4.345707127937253e-06,
+      "loss": 0.5871,
+      "step": 576
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.7842986545873851,
+      "learning_rate": 4.3435001203532555e-06,
+      "loss": 0.4898,
+      "step": 577
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8778666245156521,
+      "learning_rate": 4.341289959240124e-06,
+      "loss": 0.5385,
+      "step": 578
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9300679499181266,
+      "learning_rate": 4.339076648378605e-06,
+      "loss": 0.5698,
+      "step": 579
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9440861965960357,
+      "learning_rate": 4.336860191554833e-06,
+      "loss": 0.5984,
+      "step": 580
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.929951096053947,
+      "learning_rate": 4.3346405925603265e-06,
+      "loss": 0.6222,
+      "step": 581
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9138258400335695,
+      "learning_rate": 4.332417855191974e-06,
+      "loss": 0.5498,
+      "step": 582
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.058548455869675,
+      "learning_rate": 4.330191983252039e-06,
+      "loss": 0.5218,
+      "step": 583
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.243429045583125,
+      "learning_rate": 4.327962980548142e-06,
+      "loss": 0.5768,
+      "step": 584
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9213537104634244,
+      "learning_rate": 4.32573085089326e-06,
+      "loss": 0.5784,
+      "step": 585
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9165291289119128,
+      "learning_rate": 4.32349559810572e-06,
+      "loss": 0.5697,
+      "step": 586
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9674279518735756,
+      "learning_rate": 4.321257226009193e-06,
+      "loss": 0.5104,
+      "step": 587
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9051339015323923,
+      "learning_rate": 4.319015738432683e-06,
+      "loss": 0.5711,
+      "step": 588
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.957357618850765,
+      "learning_rate": 4.3167711392105245e-06,
+      "loss": 0.5854,
+      "step": 589
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9859311708308915,
+      "learning_rate": 4.314523432182376e-06,
+      "loss": 0.547,
+      "step": 590
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.773704456523191,
+      "learning_rate": 4.312272621193209e-06,
+      "loss": 0.5259,
+      "step": 591
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.82988033655793,
+      "learning_rate": 4.31001871009331e-06,
+      "loss": 0.5209,
+      "step": 592
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8925134832060522,
+      "learning_rate": 4.307761702738264e-06,
+      "loss": 0.59,
+      "step": 593
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8477075780641046,
+      "learning_rate": 4.305501602988953e-06,
+      "loss": 0.5714,
+      "step": 594
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8568432886623798,
+      "learning_rate": 4.303238414711552e-06,
+      "loss": 0.5877,
+      "step": 595
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8179798660158206,
+      "learning_rate": 4.3009721417775166e-06,
+      "loss": 0.6029,
+      "step": 596
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8494963193854803,
+      "learning_rate": 4.29870278806358e-06,
+      "loss": 0.5236,
+      "step": 597
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9586017397154731,
+      "learning_rate": 4.296430357451744e-06,
+      "loss": 0.5998,
+      "step": 598
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.926616057974202,
+      "learning_rate": 4.2941548538292765e-06,
+      "loss": 0.5914,
+      "step": 599
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9321738359144827,
+      "learning_rate": 4.291876281088701e-06,
+      "loss": 0.5358,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8229177571361932,
+      "learning_rate": 4.289594643127788e-06,
+      "loss": 0.5284,
+      "step": 601
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.849252449531427,
+      "learning_rate": 4.287309943849558e-06,
+      "loss": 0.5689,
+      "step": 602
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.985343175388319,
+      "learning_rate": 4.285022187162261e-06,
+      "loss": 0.6101,
+      "step": 603
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9437791826489255,
+      "learning_rate": 4.2827313769793835e-06,
+      "loss": 0.5419,
+      "step": 604
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8027421078538746,
+      "learning_rate": 4.28043751721963e-06,
+      "loss": 0.5504,
+      "step": 605
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8221230935939319,
+      "learning_rate": 4.278140611806926e-06,
+      "loss": 0.5284,
+      "step": 606
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8597205853821357,
+      "learning_rate": 4.275840664670403e-06,
+      "loss": 0.623,
+      "step": 607
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.7801370844338822,
+      "learning_rate": 4.2735376797444e-06,
+      "loss": 0.5265,
+      "step": 608
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9028094416250234,
+      "learning_rate": 4.271231660968449e-06,
+      "loss": 0.5764,
+      "step": 609
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9385737581380094,
+      "learning_rate": 4.268922612287273e-06,
+      "loss": 0.6047,
+      "step": 610
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.760006169733744,
+      "learning_rate": 4.266610537650778e-06,
+      "loss": 0.4944,
+      "step": 611
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.857083980479501,
+      "learning_rate": 4.264295441014047e-06,
+      "loss": 0.5174,
+      "step": 612
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8299942480819913,
+      "learning_rate": 4.261977326337332e-06,
+      "loss": 0.5814,
+      "step": 613
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8943903433033418,
+      "learning_rate": 4.259656197586046e-06,
+      "loss": 0.5514,
+      "step": 614
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.7839062839610529,
+      "learning_rate": 4.257332058730761e-06,
+      "loss": 0.5857,
+      "step": 615
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.7188975139736256,
+      "learning_rate": 4.255004913747196e-06,
+      "loss": 0.5509,
+      "step": 616
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8767461602206779,
+      "learning_rate": 4.252674766616212e-06,
+      "loss": 0.5038,
+      "step": 617
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8391588901867753,
+      "learning_rate": 4.250341621323809e-06,
+      "loss": 0.5196,
+      "step": 618
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8106924420187829,
+      "learning_rate": 4.248005481861111e-06,
+      "loss": 0.5458,
+      "step": 619
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9698953511074666,
+      "learning_rate": 4.245666352224367e-06,
+      "loss": 0.5963,
+      "step": 620
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8890424031569348,
+      "learning_rate": 4.243324236414939e-06,
+      "loss": 0.5277,
+      "step": 621
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8537879418167673,
+      "learning_rate": 4.240979138439301e-06,
+      "loss": 0.5407,
+      "step": 622
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9264981771759184,
+      "learning_rate": 4.238631062309023e-06,
+      "loss": 0.5788,
+      "step": 623
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.949693389062837,
+      "learning_rate": 4.236280012040773e-06,
+      "loss": 0.5007,
+      "step": 624
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8845778025905608,
+      "learning_rate": 4.233925991656307e-06,
+      "loss": 0.5905,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8977167810192608,
+      "learning_rate": 4.231569005182459e-06,
+      "loss": 0.5342,
+      "step": 626
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9579196623045914,
+      "learning_rate": 4.229209056651139e-06,
+      "loss": 0.554,
+      "step": 627
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8427820272426025,
+      "learning_rate": 4.226846150099324e-06,
+      "loss": 0.5629,
+      "step": 628
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.865218131227253,
+      "learning_rate": 4.22448028956905e-06,
+      "loss": 0.558,
+      "step": 629
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.7348773966225364,
+      "learning_rate": 4.222111479107406e-06,
+      "loss": 0.5332,
+      "step": 630
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.779367140127678,
+      "learning_rate": 4.219739722766528e-06,
+      "loss": 0.569,
+      "step": 631
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.92860570712595,
+      "learning_rate": 4.217365024603592e-06,
+      "loss": 0.5342,
+      "step": 632
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.946965997476449,
+      "learning_rate": 4.214987388680804e-06,
+      "loss": 0.5482,
+      "step": 633
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7930454990298659,
+      "learning_rate": 4.212606819065399e-06,
+      "loss": 0.5376,
+      "step": 634
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8379498458279013,
+      "learning_rate": 4.210223319829626e-06,
+      "loss": 0.5741,
+      "step": 635
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.742977498596499,
+      "learning_rate": 4.207836895050748e-06,
+      "loss": 0.5569,
+      "step": 636
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.852541709372898,
+      "learning_rate": 4.205447548811032e-06,
+      "loss": 0.578,
+      "step": 637
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8180259569107267,
+      "learning_rate": 4.203055285197745e-06,
+      "loss": 0.5189,
+      "step": 638
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8177842562763082,
+      "learning_rate": 4.20066010830314e-06,
+      "loss": 0.5424,
+      "step": 639
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8068654723170434,
+      "learning_rate": 4.198262022224457e-06,
+      "loss": 0.5336,
+      "step": 640
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.9664843499052276,
+      "learning_rate": 4.195861031063909e-06,
+      "loss": 0.5399,
+      "step": 641
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7812265481792608,
+      "learning_rate": 4.193457138928683e-06,
+      "loss": 0.534,
+      "step": 642
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.908377487778027,
+      "learning_rate": 4.191050349930925e-06,
+      "loss": 0.5831,
+      "step": 643
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8124678634933105,
+      "learning_rate": 4.18864066818774e-06,
+      "loss": 0.5309,
+      "step": 644
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.902443199964304,
+      "learning_rate": 4.186228097821176e-06,
+      "loss": 0.5452,
+      "step": 645
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9694387068719457,
+      "learning_rate": 4.183812642958227e-06,
+      "loss": 0.5462,
+      "step": 646
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945352264767711,
+      "learning_rate": 4.181394307730819e-06,
+      "loss": 0.4853,
+      "step": 647
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7967416728436914,
+      "learning_rate": 4.178973096275806e-06,
+      "loss": 0.5952,
+      "step": 648
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.0602433101771616,
+      "learning_rate": 4.176549012734963e-06,
+      "loss": 0.6346,
+      "step": 649
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9158731498204968,
+      "learning_rate": 4.1741220612549746e-06,
+      "loss": 0.5101,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.951875972207364,
+      "learning_rate": 4.171692245987436e-06,
+      "loss": 0.5718,
+      "step": 651
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.871788727804539,
+      "learning_rate": 4.169259571088839e-06,
+      "loss": 0.5516,
+      "step": 652
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945571804366465,
+      "learning_rate": 4.166824040720566e-06,
+      "loss": 0.5544,
+      "step": 653
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.8975723622706568,
+      "learning_rate": 4.1643856590488866e-06,
+      "loss": 0.5643,
+      "step": 654
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9772846459626554,
+      "learning_rate": 4.161944430244945e-06,
+      "loss": 0.5487,
+      "step": 655
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.036472038769578,
+      "learning_rate": 4.159500358484759e-06,
+      "loss": 0.5232,
+      "step": 656
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7742095436926848,
+      "learning_rate": 4.157053447949206e-06,
+      "loss": 0.4963,
+      "step": 657
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.1819742476725814,
+      "learning_rate": 4.154603702824023e-06,
+      "loss": 0.5416,
+      "step": 658
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9151345309457093,
+      "learning_rate": 4.152151127299794e-06,
+      "loss": 0.5822,
+      "step": 659
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.033640859083771,
+      "learning_rate": 4.149695725571944e-06,
+      "loss": 0.5876,
+      "step": 660
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.8935471013235925,
+      "learning_rate": 4.147237501840734e-06,
+      "loss": 0.548,
+      "step": 661
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7836299476774775,
+      "learning_rate": 4.144776460311253e-06,
+      "loss": 0.5274,
+      "step": 662
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.194666072449123,
+      "learning_rate": 4.142312605193407e-06,
+      "loss": 0.5934,
+      "step": 663
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.988265407508224,
+      "learning_rate": 4.13984594070192e-06,
+      "loss": 0.5539,
+      "step": 664
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7594955740187146,
+      "learning_rate": 4.137376471056317e-06,
+      "loss": 0.5324,
+      "step": 665
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9342530277100989,
+      "learning_rate": 4.1349042004809224e-06,
+      "loss": 0.5902,
+      "step": 666
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9757082453588417,
+      "learning_rate": 4.132429133204856e-06,
+      "loss": 0.5874,
+      "step": 667
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7792467343474774,
+      "learning_rate": 4.129951273462016e-06,
+      "loss": 0.5516,
+      "step": 668
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9010392264817964,
+      "learning_rate": 4.127470625491082e-06,
+      "loss": 0.5793,
+      "step": 669
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.054505290884914,
+      "learning_rate": 4.1249871935355e-06,
+      "loss": 0.5718,
+      "step": 670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8010036617727825,
+      "learning_rate": 4.1225009818434805e-06,
+      "loss": 0.5698,
+      "step": 671
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.975020822034628,
+      "learning_rate": 4.120011994667988e-06,
+      "loss": 0.5739,
+      "step": 672
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9801075045379748,
+      "learning_rate": 4.117520236266734e-06,
+      "loss": 0.5589,
+      "step": 673
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7773808874926829,
+      "learning_rate": 4.115025710902173e-06,
+      "loss": 0.5276,
+      "step": 674
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.890298398205481,
+      "learning_rate": 4.112528422841491e-06,
+      "loss": 0.4914,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9087570296379215,
+      "learning_rate": 4.110028376356599e-06,
+      "loss": 0.5412,
+      "step": 676
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8908271691889404,
+      "learning_rate": 4.1075255757241295e-06,
+      "loss": 0.5618,
+      "step": 677
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.024312170169272,
+      "learning_rate": 4.105020025225423e-06,
+      "loss": 0.5618,
+      "step": 678
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8072403207581518,
+      "learning_rate": 4.102511729146528e-06,
+      "loss": 0.5744,
+      "step": 679
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7750572145097157,
+      "learning_rate": 4.100000691778185e-06,
+      "loss": 0.5716,
+      "step": 680
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8778337896632162,
+      "learning_rate": 4.097486917415827e-06,
+      "loss": 0.5683,
+      "step": 681
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9710167098273688,
+      "learning_rate": 4.094970410359568e-06,
+      "loss": 0.5273,
+      "step": 682
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9136975523972874,
+      "learning_rate": 4.092451174914196e-06,
+      "loss": 0.5239,
+      "step": 683
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.929344793900944,
+      "learning_rate": 4.089929215389167e-06,
+      "loss": 0.5388,
+      "step": 684
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.7211535229712278,
+      "learning_rate": 4.087404536098597e-06,
+      "loss": 0.5068,
+      "step": 685
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8739637749458882,
+      "learning_rate": 4.084877141361254e-06,
+      "loss": 0.5537,
+      "step": 686
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9268469960932768,
+      "learning_rate": 4.082347035500553e-06,
+      "loss": 0.5875,
+      "step": 687
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.896542320004603,
+      "learning_rate": 4.079814222844541e-06,
+      "loss": 0.5314,
+      "step": 688
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.723925126440519,
+      "learning_rate": 4.077278707725904e-06,
+      "loss": 0.5009,
+      "step": 689
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8345210205201996,
+      "learning_rate": 4.074740494481942e-06,
+      "loss": 0.5544,
+      "step": 690
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.766819080519227,
+      "learning_rate": 4.072199587454578e-06,
+      "loss": 0.5393,
+      "step": 691
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9577975399484282,
+      "learning_rate": 4.069655990990337e-06,
+      "loss": 0.5357,
+      "step": 692
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8254761359015224,
+      "learning_rate": 4.06710970944035e-06,
+      "loss": 0.5797,
+      "step": 693
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.1203973374999214,
+      "learning_rate": 4.064560747160337e-06,
+      "loss": 0.5811,
+      "step": 694
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.9066221824053846,
+      "learning_rate": 4.062009108510605e-06,
+      "loss": 0.5014,
+      "step": 695
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.951489716071849,
+      "learning_rate": 4.059454797856039e-06,
+      "loss": 0.529,
+      "step": 696
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8402907113209426,
+      "learning_rate": 4.056897819566096e-06,
+      "loss": 0.4942,
+      "step": 697
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0368715640768498,
+      "learning_rate": 4.0543381780147965e-06,
+      "loss": 0.5245,
+      "step": 698
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8154462049772704,
+      "learning_rate": 4.0517758775807135e-06,
+      "loss": 0.4979,
+      "step": 699
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.890388895335948,
+      "learning_rate": 4.049210922646973e-06,
+      "loss": 0.5212,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0215900504030166,
+      "learning_rate": 4.046643317601237e-06,
+      "loss": 0.5384,
+      "step": 701
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.816997259900234,
+      "learning_rate": 4.0440730668357076e-06,
+      "loss": 0.492,
+      "step": 702
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.968633766153865,
+      "learning_rate": 4.0415001747471036e-06,
+      "loss": 0.5917,
+      "step": 703
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8313487810801756,
+      "learning_rate": 4.0389246457366696e-06,
+      "loss": 0.5561,
+      "step": 704
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.7954421155528784,
+      "learning_rate": 4.036346484210159e-06,
+      "loss": 0.5383,
+      "step": 705
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8517101217315919,
+      "learning_rate": 4.033765694577826e-06,
+      "loss": 0.5368,
+      "step": 706
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8888441616203875,
+      "learning_rate": 4.031182281254423e-06,
+      "loss": 0.5895,
+      "step": 707
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8131436351862782,
+      "learning_rate": 4.028596248659191e-06,
+      "loss": 0.5346,
+      "step": 708
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8803113487311214,
+      "learning_rate": 4.0260076012158486e-06,
+      "loss": 0.4987,
+      "step": 709
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8989122650791335,
+      "learning_rate": 4.023416343352589e-06,
+      "loss": 0.5007,
+      "step": 710
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.9466291969735336,
+      "learning_rate": 4.020822479502074e-06,
+      "loss": 0.5868,
+      "step": 711
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.869533367998661,
+      "learning_rate": 4.018226014101418e-06,
+      "loss": 0.5995,
+      "step": 712
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.93738608926368,
+      "learning_rate": 4.015626951592187e-06,
+      "loss": 0.5625,
+      "step": 713
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8485080870897803,
+      "learning_rate": 4.013025296420394e-06,
+      "loss": 0.5585,
+      "step": 714
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8099669115387913,
+      "learning_rate": 4.010421053036481e-06,
+      "loss": 0.5384,
+      "step": 715
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8810123612010912,
+      "learning_rate": 4.007814225895321e-06,
+      "loss": 0.5589,
+      "step": 716
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8692823610937885,
+      "learning_rate": 4.005204819456205e-06,
+      "loss": 0.5474,
+      "step": 717
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8120887102918588,
+      "learning_rate": 4.00259283818284e-06,
+      "loss": 0.5138,
+      "step": 718
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7933926935301234,
+      "learning_rate": 3.999978286543331e-06,
+      "loss": 0.5235,
+      "step": 719
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8382360731306235,
+      "learning_rate": 3.997361169010187e-06,
+      "loss": 0.5846,
+      "step": 720
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.993925306673069,
+      "learning_rate": 3.994741490060301e-06,
+      "loss": 0.5561,
+      "step": 721
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.900088669959918,
+      "learning_rate": 3.9921192541749505e-06,
+      "loss": 0.5215,
+      "step": 722
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9250072769385074,
+      "learning_rate": 3.989494465839785e-06,
+      "loss": 0.54,
+      "step": 723
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7928905908766457,
+      "learning_rate": 3.986867129544822e-06,
+      "loss": 0.6066,
+      "step": 724
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9474900039545116,
+      "learning_rate": 3.984237249784437e-06,
+      "loss": 0.5173,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9004077336349998,
+      "learning_rate": 3.981604831057357e-06,
+      "loss": 0.5409,
+      "step": 726
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7573843693188624,
+      "learning_rate": 3.97896987786665e-06,
+      "loss": 0.5239,
+      "step": 727
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.899283660379949,
+      "learning_rate": 3.976332394719721e-06,
+      "loss": 0.4977,
+      "step": 728
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8353476568345033,
+      "learning_rate": 3.973692386128304e-06,
+      "loss": 0.5834,
+      "step": 729
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.032325534167748,
+      "learning_rate": 3.971049856608451e-06,
+      "loss": 0.5343,
+      "step": 730
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8161347764383835,
+      "learning_rate": 3.9684048106805286e-06,
+      "loss": 0.585,
+      "step": 731
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.836376388525165,
+      "learning_rate": 3.965757252869204e-06,
+      "loss": 0.5978,
+      "step": 732
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.889118862096067,
+      "learning_rate": 3.963107187703446e-06,
+      "loss": 0.5393,
+      "step": 733
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7772829607776217,
+      "learning_rate": 3.96045461971651e-06,
+      "loss": 0.5164,
+      "step": 734
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7980410807492582,
+      "learning_rate": 3.957799553445932e-06,
+      "loss": 0.5455,
+      "step": 735
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.907936099702467,
+      "learning_rate": 3.955141993433526e-06,
+      "loss": 0.532,
+      "step": 736
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8668064740862462,
+      "learning_rate": 3.9524819442253645e-06,
+      "loss": 0.5578,
+      "step": 737
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.838952740378055,
+      "learning_rate": 3.949819410371785e-06,
+      "loss": 0.5784,
+      "step": 738
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9595767898211005,
+      "learning_rate": 3.947154396427373e-06,
+      "loss": 0.5213,
+      "step": 739
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9422968944070973,
+      "learning_rate": 3.944486906950954e-06,
+      "loss": 0.5709,
+      "step": 740
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.760556693040696,
+      "learning_rate": 3.941816946505592e-06,
+      "loss": 0.5564,
+      "step": 741
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8054841879427592,
+      "learning_rate": 3.939144519658575e-06,
+      "loss": 0.5435,
+      "step": 742
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.1072923992538,
+      "learning_rate": 3.936469630981412e-06,
+      "loss": 0.5622,
+      "step": 743
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.711687978027928,
+      "learning_rate": 3.933792285049821e-06,
+      "loss": 0.5554,
+      "step": 744
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8166543944942228,
+      "learning_rate": 3.931112486443727e-06,
+      "loss": 0.5079,
+      "step": 745
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.7923405334139695,
+      "learning_rate": 3.928430239747246e-06,
+      "loss": 0.5692,
+      "step": 746
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9611773239667012,
+      "learning_rate": 3.925745549548687e-06,
+      "loss": 0.5092,
+      "step": 747
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8440088039871827,
+      "learning_rate": 3.923058420440534e-06,
+      "loss": 0.5369,
+      "step": 748
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9272316571307881,
+      "learning_rate": 3.920368857019447e-06,
+      "loss": 0.5798,
+      "step": 749
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8248503445199376,
+      "learning_rate": 3.917676863886246e-06,
+      "loss": 0.5479,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9200626612083824,
+      "learning_rate": 3.914982445645912e-06,
+      "loss": 0.549,
+      "step": 751
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8585556832275227,
+      "learning_rate": 3.91228560690757e-06,
+      "loss": 0.5283,
+      "step": 752
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.819239895382093,
+      "learning_rate": 3.90958635228449e-06,
+      "loss": 0.535,
+      "step": 753
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7810389942543545,
+      "learning_rate": 3.90688468639407e-06,
+      "loss": 0.5125,
+      "step": 754
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.9614453700373935,
+      "learning_rate": 3.904180613857837e-06,
+      "loss": 0.5406,
+      "step": 755
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.805104940263808,
+      "learning_rate": 3.901474139301433e-06,
+      "loss": 0.5794,
+      "step": 756
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.78756289235025,
+      "learning_rate": 3.898765267354607e-06,
+      "loss": 0.569,
+      "step": 757
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.912300438003516,
+      "learning_rate": 3.896054002651213e-06,
+      "loss": 0.5565,
+      "step": 758
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8148356694353722,
+      "learning_rate": 3.893340349829195e-06,
+      "loss": 0.5471,
+      "step": 759
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.6836223387492706,
+      "learning_rate": 3.890624313530583e-06,
+      "loss": 0.5145,
+      "step": 760
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8389298216964765,
+      "learning_rate": 3.887905898401485e-06,
+      "loss": 0.5441,
+      "step": 761
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7845754057436856,
+      "learning_rate": 3.885185109092078e-06,
+      "loss": 0.5478,
+      "step": 762
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.77076035925993,
+      "learning_rate": 3.882461950256598e-06,
+      "loss": 0.5497,
+      "step": 763
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8011284465286703,
+      "learning_rate": 3.87973642655334e-06,
+      "loss": 0.5039,
+      "step": 764
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7400129481667248,
+      "learning_rate": 3.877008542644637e-06,
+      "loss": 0.5243,
+      "step": 765
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9899565111682327,
+      "learning_rate": 3.874278303196866e-06,
+      "loss": 0.5767,
+      "step": 766
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8345576263874734,
+      "learning_rate": 3.871545712880429e-06,
+      "loss": 0.5262,
+      "step": 767
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8375211207672395,
+      "learning_rate": 3.8688107763697505e-06,
+      "loss": 0.5467,
+      "step": 768
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8068462280574835,
+      "learning_rate": 3.8660734983432715e-06,
+      "loss": 0.5256,
+      "step": 769
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7823522202158735,
+      "learning_rate": 3.863333883483433e-06,
+      "loss": 0.5419,
+      "step": 770
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8881514180214427,
+      "learning_rate": 3.86059193647668e-06,
+      "loss": 0.541,
+      "step": 771
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8311064595650786,
+      "learning_rate": 3.85784766201344e-06,
+      "loss": 0.5455,
+      "step": 772
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9833459774866717,
+      "learning_rate": 3.855101064788126e-06,
+      "loss": 0.5723,
+      "step": 773
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7968096633022903,
+      "learning_rate": 3.852352149499125e-06,
+      "loss": 0.5153,
+      "step": 774
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.775423895652992,
+      "learning_rate": 3.849600920848787e-06,
+      "loss": 0.5134,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7262892998825556,
+      "learning_rate": 3.84684738354342e-06,
+      "loss": 0.5287,
+      "step": 776
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7866135638778051,
+      "learning_rate": 3.84409154229328e-06,
+      "loss": 0.57,
+      "step": 777
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.787377916112687,
+      "learning_rate": 3.841333401812569e-06,
+      "loss": 0.5312,
+      "step": 778
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.684801862246949,
+      "learning_rate": 3.838572966819416e-06,
+      "loss": 0.5822,
+      "step": 779
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.79074773131748,
+      "learning_rate": 3.835810242035879e-06,
+      "loss": 0.5651,
+      "step": 780
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9234904827178134,
+      "learning_rate": 3.8330452321879305e-06,
+      "loss": 0.5527,
+      "step": 781
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.1733402579018186,
+      "learning_rate": 3.830277942005455e-06,
+      "loss": 0.5545,
+      "step": 782
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.112229504682016,
+      "learning_rate": 3.827508376222233e-06,
+      "loss": 0.5766,
+      "step": 783
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.087174122744587,
+      "learning_rate": 3.824736539575944e-06,
+      "loss": 0.549,
+      "step": 784
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9570382810890106,
+      "learning_rate": 3.821962436808145e-06,
+      "loss": 0.4984,
+      "step": 785
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.94720853153738,
+      "learning_rate": 3.819186072664277e-06,
+      "loss": 0.5303,
+      "step": 786
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.21095404069362,
+      "learning_rate": 3.816407451893643e-06,
+      "loss": 0.5674,
+      "step": 787
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.7284336698899117,
+      "learning_rate": 3.8136265792494094e-06,
+      "loss": 0.5952,
+      "step": 788
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.940869697529687,
+      "learning_rate": 3.8108434594885934e-06,
+      "loss": 0.5198,
+      "step": 789
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9282749931884566,
+      "learning_rate": 3.808058097372057e-06,
+      "loss": 0.5499,
+      "step": 790
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0180195532646983,
+      "learning_rate": 3.8052704976644984e-06,
+      "loss": 0.5117,
+      "step": 791
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8303561179366206,
+      "learning_rate": 3.8024806651344424e-06,
+      "loss": 0.5034,
+      "step": 792
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0584295539484754,
+      "learning_rate": 3.7996886045542335e-06,
+      "loss": 0.5391,
+      "step": 793
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7736893833047733,
+      "learning_rate": 3.7968943207000284e-06,
+      "loss": 0.5378,
+      "step": 794
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7840353008162277,
+      "learning_rate": 3.794097818351786e-06,
+      "loss": 0.5091,
+      "step": 795
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0949100717616225,
+      "learning_rate": 3.791299102293261e-06,
+      "loss": 0.5731,
+      "step": 796
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.048353193294094,
+      "learning_rate": 3.7884981773119943e-06,
+      "loss": 0.5576,
+      "step": 797
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9990070284918733,
+      "learning_rate": 3.7856950481993054e-06,
+      "loss": 0.5297,
+      "step": 798
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.859560152641746,
+      "learning_rate": 3.7828897197502856e-06,
+      "loss": 0.5131,
+      "step": 799
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0054802770873916,
+      "learning_rate": 3.780082196763785e-06,
+      "loss": 0.5428,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8985367093585213,
+      "learning_rate": 3.7772724840424126e-06,
+      "loss": 0.5206,
+      "step": 801
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9964704653764362,
+      "learning_rate": 3.774460586392519e-06,
+      "loss": 0.5929,
+      "step": 802
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7572936836574113,
+      "learning_rate": 3.771646508624194e-06,
+      "loss": 0.5428,
+      "step": 803
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9623695483620975,
+      "learning_rate": 3.768830255551258e-06,
+      "loss": 0.5685,
+      "step": 804
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9663290616402378,
+      "learning_rate": 3.76601183199125e-06,
+      "loss": 0.5351,
+      "step": 805
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7876590847889615,
+      "learning_rate": 3.763191242765424e-06,
+      "loss": 0.567,
+      "step": 806
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.8500820456277005,
+      "learning_rate": 3.7603684926987383e-06,
+      "loss": 0.523,
+      "step": 807
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.041973125533567,
+      "learning_rate": 3.757543586619845e-06,
+      "loss": 0.5531,
+      "step": 808
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7440376746222928,
+      "learning_rate": 3.754716529361089e-06,
+      "loss": 0.4913,
+      "step": 809
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7910937306897654,
+      "learning_rate": 3.7518873257584897e-06,
+      "loss": 0.5128,
+      "step": 810
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9334392608388238,
+      "learning_rate": 3.7490559806517434e-06,
+      "loss": 0.5861,
+      "step": 811
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.0003597857127673,
+      "learning_rate": 3.746222498884206e-06,
+      "loss": 0.5535,
+      "step": 812
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7964615198133413,
+      "learning_rate": 3.74338688530289e-06,
+      "loss": 0.5409,
+      "step": 813
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7726488990007383,
+      "learning_rate": 3.740549144758453e-06,
+      "loss": 0.5714,
+      "step": 814
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9080323144095523,
+      "learning_rate": 3.737709282105193e-06,
+      "loss": 0.5534,
+      "step": 815
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9612361354867969,
+      "learning_rate": 3.734867302201038e-06,
+      "loss": 0.5282,
+      "step": 816
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.873254058551618,
+      "learning_rate": 3.7320232099075363e-06,
+      "loss": 0.5422,
+      "step": 817
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383882069199007,
+      "learning_rate": 3.7291770100898508e-06,
+      "loss": 0.5588,
+      "step": 818
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0137053963220835,
+      "learning_rate": 3.726328707616749e-06,
+      "loss": 0.5895,
+      "step": 819
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8207549211692964,
+      "learning_rate": 3.7234783073605957e-06,
+      "loss": 0.5428,
+      "step": 820
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7929761418069659,
+      "learning_rate": 3.7206258141973445e-06,
+      "loss": 0.555,
+      "step": 821
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8863691259545465,
+      "learning_rate": 3.7177712330065285e-06,
+      "loss": 0.5802,
+      "step": 822
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383911000943605,
+      "learning_rate": 3.714914568671252e-06,
+      "loss": 0.4986,
+      "step": 823
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0032777947804044,
+      "learning_rate": 3.7120558260781846e-06,
+      "loss": 0.6456,
+      "step": 824
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.733320874844507,
+      "learning_rate": 3.709195010117551e-06,
+      "loss": 0.5146,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7411187007421471,
+      "learning_rate": 3.7063321256831193e-06,
+      "loss": 0.5297,
+      "step": 826
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8334107493901353,
+      "learning_rate": 3.7034671776722003e-06,
+      "loss": 0.545,
+      "step": 827
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.931467221651553,
+      "learning_rate": 3.7006001709856314e-06,
+      "loss": 0.579,
+      "step": 828
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.799522216655623,
+      "learning_rate": 3.697731110527774e-06,
+      "loss": 0.5453,
+      "step": 829
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8098119388805842,
+      "learning_rate": 3.6948600012065016e-06,
+      "loss": 0.5186,
+      "step": 830
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419013342395714,
+      "learning_rate": 3.6919868479331934e-06,
+      "loss": 0.4833,
+      "step": 831
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419148322752323,
+      "learning_rate": 3.6891116556227234e-06,
+      "loss": 0.5479,
+      "step": 832
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7858200344474908,
+      "learning_rate": 3.6862344291934545e-06,
+      "loss": 0.5264,
+      "step": 833
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8057437623830686,
+      "learning_rate": 3.6833551735672293e-06,
+      "loss": 0.5208,
+      "step": 834
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8570584000334132,
+      "learning_rate": 3.6804738936693617e-06,
+      "loss": 0.5652,
+      "step": 835
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7961732805960369,
+      "learning_rate": 3.677590594428629e-06,
+      "loss": 0.5693,
+      "step": 836
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.954108513879844,
+      "learning_rate": 3.6747052807772614e-06,
+      "loss": 0.5673,
+      "step": 837
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.834152772161213,
+      "learning_rate": 3.671817957650936e-06,
+      "loss": 0.5118,
+      "step": 838
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8035026424969205,
+      "learning_rate": 3.6689286299887663e-06,
+      "loss": 0.5778,
+      "step": 839
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7862771700309947,
+      "learning_rate": 3.666037302733295e-06,
+      "loss": 0.5575,
+      "step": 840
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7398650592861555,
+      "learning_rate": 3.6631439808304874e-06,
+      "loss": 0.5323,
+      "step": 841
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7082885736006344,
+      "learning_rate": 3.6602486692297183e-06,
+      "loss": 0.543,
+      "step": 842
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8242434568233548,
+      "learning_rate": 3.6573513728837685e-06,
+      "loss": 0.5579,
+      "step": 843
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305967806472925,
+      "learning_rate": 3.6544520967488108e-06,
+      "loss": 0.5425,
+      "step": 844
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7126995402462595,
+      "learning_rate": 3.651550845784407e-06,
+      "loss": 0.5399,
+      "step": 845
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.992190051239983,
+      "learning_rate": 3.648647624953496e-06,
+      "loss": 0.5951,
+      "step": 846
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.9362402903409848,
+      "learning_rate": 3.6457424392223885e-06,
+      "loss": 0.5427,
+      "step": 847
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7390586845081806,
+      "learning_rate": 3.642835293560754e-06,
+      "loss": 0.5269,
+      "step": 848
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8601747321693383,
+      "learning_rate": 3.639926192941615e-06,
+      "loss": 0.5246,
+      "step": 849
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305054240762129,
+      "learning_rate": 3.6370151423413396e-06,
+      "loss": 0.562,
+      "step": 850
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8361711553327809,
+      "learning_rate": 3.6341021467396296e-06,
+      "loss": 0.5066,
+      "step": 851
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9202617492772214,
+      "learning_rate": 3.6311872111195163e-06,
+      "loss": 0.5755,
+      "step": 852
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9056266366653432,
+      "learning_rate": 3.628270340467348e-06,
+      "loss": 0.5193,
+      "step": 853
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9700971504271882,
+      "learning_rate": 3.625351539772783e-06,
+      "loss": 0.5499,
+      "step": 854
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7142305580780086,
+      "learning_rate": 3.6224308140287818e-06,
+      "loss": 0.5597,
+      "step": 855
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7897876492593174,
+      "learning_rate": 3.6195081682315972e-06,
+      "loss": 0.5347,
+      "step": 856
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.191923699092432,
+      "learning_rate": 3.616583607380769e-06,
+      "loss": 0.5251,
+      "step": 857
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8582876176666503,
+      "learning_rate": 3.61365713647911e-06,
+      "loss": 0.5067,
+      "step": 858
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.991617360171558,
+      "learning_rate": 3.610728760532701e-06,
+      "loss": 0.6464,
+      "step": 859
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.892621069660817,
+      "learning_rate": 3.607798484550881e-06,
+      "loss": 0.5145,
+      "step": 860
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7592963181570629,
+      "learning_rate": 3.6048663135462423e-06,
+      "loss": 0.5297,
+      "step": 861
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.020192040751123,
+      "learning_rate": 3.6019322525346157e-06,
+      "loss": 0.5709,
+      "step": 862
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8575959680616767,
+      "learning_rate": 3.598996306535067e-06,
+      "loss": 0.5946,
+      "step": 863
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9638758131071599,
+      "learning_rate": 3.5960584805698845e-06,
+      "loss": 0.4833,
+      "step": 864
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7517341191956926,
+      "learning_rate": 3.593118779664574e-06,
+      "loss": 0.5439,
+      "step": 865
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7637144330636925,
+      "learning_rate": 3.590177208847848e-06,
+      "loss": 0.4898,
+      "step": 866
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.107899096934758,
+      "learning_rate": 3.5872337731516186e-06,
+      "loss": 0.5332,
+      "step": 867
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.016493645108941,
+      "learning_rate": 3.5842884776109875e-06,
+      "loss": 0.5313,
+      "step": 868
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8758602544873038,
+      "learning_rate": 3.581341327264236e-06,
+      "loss": 0.554,
+      "step": 869
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8566881639083022,
+      "learning_rate": 3.5783923271528222e-06,
+      "loss": 0.5322,
+      "step": 870
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9151838907738468,
+      "learning_rate": 3.5754414823213647e-06,
+      "loss": 0.5306,
+      "step": 871
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7893407766785276,
+      "learning_rate": 3.572488797817639e-06,
+      "loss": 0.5226,
+      "step": 872
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.908122661974681,
+      "learning_rate": 3.569534278692569e-06,
+      "loss": 0.5132,
+      "step": 873
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9052513037253582,
+      "learning_rate": 3.5665779300002144e-06,
+      "loss": 0.513,
+      "step": 874
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7876914527016339,
+      "learning_rate": 3.563619756797767e-06,
+      "loss": 0.5627,
+      "step": 875
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.9607045801516068,
+      "learning_rate": 3.5606597641455387e-06,
+      "loss": 0.4986,
+      "step": 876
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.701462749441997,
+      "learning_rate": 3.5576979571069527e-06,
+      "loss": 0.5306,
+      "step": 877
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8413701238351416,
+      "learning_rate": 3.554734340748538e-06,
+      "loss": 0.5602,
+      "step": 878
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8762306249541667,
+      "learning_rate": 3.5517689201399162e-06,
+      "loss": 0.5663,
+      "step": 879
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.833164968453507,
+      "learning_rate": 3.5488017003537977e-06,
+      "loss": 0.5264,
+      "step": 880
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.766302763247428,
+      "learning_rate": 3.5458326864659687e-06,
+      "loss": 0.5498,
+      "step": 881
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.821883208129187,
+      "learning_rate": 3.5428618835552867e-06,
+      "loss": 0.5468,
+      "step": 882
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7773758034614335,
+      "learning_rate": 3.5398892967036674e-06,
+      "loss": 0.505,
+      "step": 883
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248820711070537,
+      "learning_rate": 3.5369149309960783e-06,
+      "loss": 0.5679,
+      "step": 884
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248114104788378,
+      "learning_rate": 3.5339387915205305e-06,
+      "loss": 0.5351,
+      "step": 885
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.00472132505421,
+      "learning_rate": 3.53096088336807e-06,
+      "loss": 0.5637,
+      "step": 886
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.0594957277906656,
+      "learning_rate": 3.5279812116327667e-06,
+      "loss": 0.567,
+      "step": 887
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.916227169502353,
+      "learning_rate": 3.5249997814117098e-06,
+      "loss": 0.5733,
+      "step": 888
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7595020268824906,
+      "learning_rate": 3.5220165978049937e-06,
+      "loss": 0.5512,
+      "step": 889
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8259487385184114,
+      "learning_rate": 3.5190316659157126e-06,
+      "loss": 0.5332,
+      "step": 890
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8216813752485344,
+      "learning_rate": 3.5160449908499538e-06,
+      "loss": 0.5718,
+      "step": 891
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8497964997952454,
+      "learning_rate": 3.5130565777167845e-06,
+      "loss": 0.5179,
+      "step": 892
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8242356367817554,
+      "learning_rate": 3.5100664316282464e-06,
+      "loss": 0.5587,
+      "step": 893
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7793507179190546,
+      "learning_rate": 3.5070745576993428e-06,
+      "loss": 0.5924,
+      "step": 894
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.920176905610262,
+      "learning_rate": 3.5040809610480364e-06,
+      "loss": 0.5579,
+      "step": 895
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.954421523744336,
+      "learning_rate": 3.5010856467952335e-06,
+      "loss": 0.5496,
+      "step": 896
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7785169911731862,
+      "learning_rate": 3.4980886200647817e-06,
+      "loss": 0.5383,
+      "step": 897
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.853827977546151,
+      "learning_rate": 3.4950898859834555e-06,
+      "loss": 0.5501,
+      "step": 898
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9882198198152168,
+      "learning_rate": 3.4920894496809515e-06,
+      "loss": 0.5557,
+      "step": 899
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.98090605107646,
+      "learning_rate": 3.489087316289877e-06,
+      "loss": 0.5661,
+      "step": 900
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0027723691714785,
+      "learning_rate": 3.486083490945743e-06,
+      "loss": 0.4791,
+      "step": 901
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0183911897675015,
+      "learning_rate": 3.4830779787869555e-06,
+      "loss": 0.5386,
+      "step": 902
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9385976919386894,
+      "learning_rate": 3.480070784954805e-06,
+      "loss": 0.5351,
+      "step": 903
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.7612550957325825,
+      "learning_rate": 3.4770619145934586e-06,
+      "loss": 0.511,
+      "step": 904
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8677538420589843,
+      "learning_rate": 3.4740513728499515e-06,
+      "loss": 0.5942,
+      "step": 905
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9208446249900946,
+      "learning_rate": 3.4710391648741787e-06,
+      "loss": 0.5146,
+      "step": 906
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8008673055527855,
+      "learning_rate": 3.468025295818885e-06,
+      "loss": 0.5909,
+      "step": 907
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.891052390507894,
+      "learning_rate": 3.465009770839657e-06,
+      "loss": 0.5527,
+      "step": 908
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0521048489395435,
+      "learning_rate": 3.4619925950949126e-06,
+      "loss": 0.5756,
+      "step": 909
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.003295441830653,
+      "learning_rate": 3.4589737737458946e-06,
+      "loss": 0.5299,
+      "step": 910
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7635851435542724,
+      "learning_rate": 3.4559533119566612e-06,
+      "loss": 0.5338,
+      "step": 911
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.834326490517508,
+      "learning_rate": 3.4529312148940763e-06,
+      "loss": 0.56,
+      "step": 912
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8618427761057224,
+      "learning_rate": 3.4499074877278016e-06,
+      "loss": 0.5189,
+      "step": 913
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.04459004844406,
+      "learning_rate": 3.446882135630286e-06,
+      "loss": 0.5765,
+      "step": 914
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7467595732765806,
+      "learning_rate": 3.4438551637767604e-06,
+      "loss": 0.5512,
+      "step": 915
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7952035114217406,
+      "learning_rate": 3.4408265773452226e-06,
+      "loss": 0.5348,
+      "step": 916
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8448198186244822,
+      "learning_rate": 3.4377963815164362e-06,
+      "loss": 0.5187,
+      "step": 917
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7738820116169103,
+      "learning_rate": 3.4347645814739156e-06,
+      "loss": 0.507,
+      "step": 918
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.9699054774415494,
+      "learning_rate": 3.4317311824039216e-06,
+      "loss": 0.5175,
+      "step": 919
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7482905457169124,
+      "learning_rate": 3.4286961894954473e-06,
+      "loss": 0.5188,
+      "step": 920
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8012194296110113,
+      "learning_rate": 3.425659607940215e-06,
+      "loss": 0.5465,
+      "step": 921
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7978097428012587,
+      "learning_rate": 3.422621442932662e-06,
+      "loss": 0.5257,
+      "step": 922
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8534167116514217,
+      "learning_rate": 3.419581699669937e-06,
+      "loss": 0.536,
+      "step": 923
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.7733377878036733,
+      "learning_rate": 3.416540383351888e-06,
+      "loss": 0.5632,
+      "step": 924
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8124786776539388,
+      "learning_rate": 3.4134974991810503e-06,
+      "loss": 0.5471,
+      "step": 925
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8553271859579439,
+      "learning_rate": 3.4104530523626463e-06,
+      "loss": 0.538,
+      "step": 926
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8888926038913822,
+      "learning_rate": 3.4074070481045683e-06,
+      "loss": 0.4868,
+      "step": 927
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.0158609319355505,
+      "learning_rate": 3.404359491617374e-06,
+      "loss": 0.5757,
+      "step": 928
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8376639720078027,
+      "learning_rate": 3.401310388114276e-06,
+      "loss": 0.5377,
+      "step": 929
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.3651883595335232,
+      "learning_rate": 3.3982597428111336e-06,
+      "loss": 0.5536,
+      "step": 930
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.908409388949023,
+      "learning_rate": 3.3952075609264423e-06,
+      "loss": 0.5349,
+      "step": 931
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8261622890952995,
+      "learning_rate": 3.3921538476813278e-06,
+      "loss": 0.4991,
+      "step": 932
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.924034720876031,
+      "learning_rate": 3.3890986082995353e-06,
+      "loss": 0.536,
+      "step": 933
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.829615974230478,
+      "learning_rate": 3.3860418480074188e-06,
+      "loss": 0.5163,
+      "step": 934
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7812992854973535,
+      "learning_rate": 3.3829835720339353e-06,
+      "loss": 0.5412,
+      "step": 935
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8270515542068861,
+      "learning_rate": 3.3799237856106348e-06,
+      "loss": 0.5459,
+      "step": 936
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8336967909163833,
+      "learning_rate": 3.3768624939716506e-06,
+      "loss": 0.5074,
+      "step": 937
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.773892866992307,
+      "learning_rate": 3.373799702353691e-06,
+      "loss": 0.5457,
+      "step": 938
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8605607499004266,
+      "learning_rate": 3.370735415996031e-06,
+      "loss": 0.5691,
+      "step": 939
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7961529805945686,
+      "learning_rate": 3.3676696401405007e-06,
+      "loss": 0.5406,
+      "step": 940
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7406787561376078,
+      "learning_rate": 3.3646023800314792e-06,
+      "loss": 0.5297,
+      "step": 941
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.9794693468141764,
+      "learning_rate": 3.361533640915885e-06,
+      "loss": 0.4765,
+      "step": 942
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.820632707720892,
+      "learning_rate": 3.3584634280431657e-06,
+      "loss": 0.5395,
+      "step": 943
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8478126164835549,
+      "learning_rate": 3.3553917466652915e-06,
+      "loss": 0.5288,
+      "step": 944
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.749509825583459,
+      "learning_rate": 3.352318602036742e-06,
+      "loss": 0.5343,
+      "step": 945
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8034305951190157,
+      "learning_rate": 3.3492439994145033e-06,
+      "loss": 0.5536,
+      "step": 946
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8172591817519397,
+      "learning_rate": 3.346167944058052e-06,
+      "loss": 0.5844,
+      "step": 947
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.749562414198837,
+      "learning_rate": 3.3430904412293526e-06,
+      "loss": 0.4833,
+      "step": 948
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.7243742428927225,
+      "learning_rate": 3.3400114961928444e-06,
+      "loss": 0.4828,
+      "step": 949
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.757242299744874,
+      "learning_rate": 3.3369311142154337e-06,
+      "loss": 0.5282,
+      "step": 950
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.036302581700697,
+      "learning_rate": 3.3338493005664853e-06,
+      "loss": 0.5315,
+      "step": 951
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.886299636672335,
+      "learning_rate": 3.330766060517812e-06,
+      "loss": 0.5244,
+      "step": 952
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.898853787733011,
+      "learning_rate": 3.3276813993436695e-06,
+      "loss": 0.5914,
+      "step": 953
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8359472984671243,
+      "learning_rate": 3.324595322320741e-06,
+      "loss": 0.5488,
+      "step": 954
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8768955168510497,
+      "learning_rate": 3.321507834728134e-06,
+      "loss": 0.5871,
+      "step": 955
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8358033818112791,
+      "learning_rate": 3.3184189418473674e-06,
+      "loss": 0.5632,
+      "step": 956
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.792562502385941,
+      "learning_rate": 3.315328648962364e-06,
+      "loss": 0.4887,
+      "step": 957
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8732702930932368,
+      "learning_rate": 3.312236961359444e-06,
+      "loss": 0.5313,
+      "step": 958
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7708047128885986,
+      "learning_rate": 3.3091438843273115e-06,
+      "loss": 0.5348,
+      "step": 959
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9094434763935804,
+      "learning_rate": 3.3060494231570463e-06,
+      "loss": 0.5027,
+      "step": 960
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.87927564418864,
+      "learning_rate": 3.3029535831420977e-06,
+      "loss": 0.511,
+      "step": 961
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.717365559903535,
+      "learning_rate": 3.299856369578273e-06,
+      "loss": 0.5203,
+      "step": 962
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.770779257052532,
+      "learning_rate": 3.2967577877637296e-06,
+      "loss": 0.5233,
+      "step": 963
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7541392466004568,
+      "learning_rate": 3.2936578429989653e-06,
+      "loss": 0.5013,
+      "step": 964
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7840578280891832,
+      "learning_rate": 3.290556540586809e-06,
+      "loss": 0.4844,
+      "step": 965
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7184305413001233,
+      "learning_rate": 3.287453885832413e-06,
+      "loss": 0.4694,
+      "step": 966
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8671517036325307,
+      "learning_rate": 3.2843498840432403e-06,
+      "loss": 0.4652,
+      "step": 967
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9960847871768508,
+      "learning_rate": 3.2812445405290612e-06,
+      "loss": 0.5906,
+      "step": 968
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7535227575839891,
+      "learning_rate": 3.27813786060194e-06,
+      "loss": 0.5482,
+      "step": 969
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.929231862440999,
+      "learning_rate": 3.2750298495762278e-06,
+      "loss": 0.5334,
+      "step": 970
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7879676366114814,
+      "learning_rate": 3.2719205127685505e-06,
+      "loss": 0.515,
+      "step": 971
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7817120865072218,
+      "learning_rate": 3.2688098554978053e-06,
+      "loss": 0.5045,
+      "step": 972
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8725673808714274,
+      "learning_rate": 3.265697883085145e-06,
+      "loss": 0.5557,
+      "step": 973
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8554796275037901,
+      "learning_rate": 3.262584600853973e-06,
+      "loss": 0.5785,
+      "step": 974
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.77078783324655,
+      "learning_rate": 3.259470014129936e-06,
+      "loss": 0.524,
+      "step": 975
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.820843626030818,
+      "learning_rate": 3.256354128240907e-06,
+      "loss": 0.5144,
+      "step": 976
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9330495063889956,
+      "learning_rate": 3.253236948516987e-06,
+      "loss": 0.5405,
+      "step": 977
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9113413794485425,
+      "learning_rate": 3.2501184802904867e-06,
+      "loss": 0.5212,
+      "step": 978
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.799188386703558,
+      "learning_rate": 3.2469987288959208e-06,
+      "loss": 0.5148,
+      "step": 979
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8610914183588203,
+      "learning_rate": 3.2438776996700023e-06,
+      "loss": 0.5363,
+      "step": 980
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8245263524947073,
+      "learning_rate": 3.240755397951625e-06,
+      "loss": 0.5216,
+      "step": 981
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7863270641417597,
+      "learning_rate": 3.2376318290818643e-06,
+      "loss": 0.5581,
+      "step": 982
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9266115141469626,
+      "learning_rate": 3.23450699840396e-06,
+      "loss": 0.5178,
+      "step": 983
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8044458399187253,
+      "learning_rate": 3.2313809112633133e-06,
+      "loss": 0.5252,
+      "step": 984
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8809392949423562,
+      "learning_rate": 3.2282535730074714e-06,
+      "loss": 0.486,
+      "step": 985
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9487997548787144,
+      "learning_rate": 3.2251249889861237e-06,
+      "loss": 0.5272,
+      "step": 986
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.088279538426057,
+      "learning_rate": 3.2219951645510907e-06,
+      "loss": 0.5426,
+      "step": 987
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8280370745964312,
+      "learning_rate": 3.218864105056313e-06,
+      "loss": 0.5545,
+      "step": 988
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7678201455723743,
+      "learning_rate": 3.2157318158578473e-06,
+      "loss": 0.5476,
+      "step": 989
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.708170466024094,
+      "learning_rate": 3.21259830231385e-06,
+      "loss": 0.5442,
+      "step": 990
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.0427224573251483,
+      "learning_rate": 3.209463569784575e-06,
+      "loss": 0.5501,
+      "step": 991
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8557413526282036,
+      "learning_rate": 3.206327623632359e-06,
+      "loss": 0.5573,
+      "step": 992
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7138810851622357,
+      "learning_rate": 3.2031904692216153e-06,
+      "loss": 0.5267,
+      "step": 993
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9034028799031073,
+      "learning_rate": 3.2000521119188267e-06,
+      "loss": 0.5605,
+      "step": 994
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.994571492675121,
+      "learning_rate": 3.1969125570925303e-06,
+      "loss": 0.53,
+      "step": 995
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.771581881704634,
+      "learning_rate": 3.193771810113313e-06,
+      "loss": 0.6177,
+      "step": 996
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7808220445921694,
+      "learning_rate": 3.1906298763538005e-06,
+      "loss": 0.5215,
+      "step": 997
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8069794706642701,
+      "learning_rate": 3.1874867611886513e-06,
+      "loss": 0.5444,
+      "step": 998
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7806867210889854,
+      "learning_rate": 3.1843424699945403e-06,
+      "loss": 0.5471,
+      "step": 999
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7481554024627886,
+      "learning_rate": 3.1811970081501576e-06,
+      "loss": 0.5159,
+      "step": 1000
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8105318680671914,
+      "learning_rate": 3.1780503810361946e-06,
+      "loss": 0.4985,
+      "step": 1001
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7033701950072382,
+      "learning_rate": 3.1749025940353363e-06,
+      "loss": 0.5594,
+      "step": 1002
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.3799847532384515,
+      "learning_rate": 3.1717536525322512e-06,
+      "loss": 0.5978,
+      "step": 1003
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7427559432173463,
+      "learning_rate": 3.1686035619135845e-06,
+      "loss": 0.5299,
+      "step": 1004
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7454547855925509,
+      "learning_rate": 3.1654523275679453e-06,
+      "loss": 0.5439,
+      "step": 1005
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7130931472340127,
+      "learning_rate": 3.162299954885899e-06,
+      "loss": 0.5379,
+      "step": 1006
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6940357366272063,
+      "learning_rate": 3.15914644925996e-06,
+      "loss": 0.5694,
+      "step": 1007
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8544220651543013,
+      "learning_rate": 3.1559918160845787e-06,
+      "loss": 0.5285,
+      "step": 1008
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8481774433371347,
+      "learning_rate": 3.1528360607561358e-06,
+      "loss": 0.5384,
+      "step": 1009
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8256828659009958,
+      "learning_rate": 3.149679188672932e-06,
+      "loss": 0.4806,
+      "step": 1010
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.9380282822721238,
+      "learning_rate": 3.1465212052351766e-06,
+      "loss": 0.543,
+      "step": 1011
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.985943690469791,
+      "learning_rate": 3.1433621158449807e-06,
+      "loss": 0.5549,
+      "step": 1012
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7038398790061953,
+      "learning_rate": 3.140201925906348e-06,
+      "loss": 0.4682,
+      "step": 1013
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8748481620529394,
+      "learning_rate": 3.1370406408251632e-06,
+      "loss": 0.5046,
+      "step": 1014
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7587036990451181,
+      "learning_rate": 3.133878266009186e-06,
+      "loss": 0.5203,
+      "step": 1015
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7503537433041947,
+      "learning_rate": 3.130714806868041e-06,
+      "loss": 0.5546,
+      "step": 1016
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7701505667314001,
+      "learning_rate": 3.127550268813205e-06,
+      "loss": 0.531,
+      "step": 1017
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.771371589393474,
+      "learning_rate": 3.124384657258001e-06,
+      "loss": 0.5424,
+      "step": 1018
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8016015279719124,
+      "learning_rate": 3.1212179776175905e-06,
+      "loss": 0.5706,
+      "step": 1019
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.810944889002695,
+      "learning_rate": 3.1180502353089598e-06,
+      "loss": 0.5502,
+      "step": 1020
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8062084514449492,
+      "learning_rate": 3.1148814357509147e-06,
+      "loss": 0.5337,
+      "step": 1021
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.669643406466654,
+      "learning_rate": 3.111711584364068e-06,
+      "loss": 0.4802,
+      "step": 1022
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6852245083058144,
+      "learning_rate": 3.1085406865708333e-06,
+      "loss": 0.532,
+      "step": 1023
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8463748056800222,
+      "learning_rate": 3.1053687477954124e-06,
+      "loss": 0.5112,
+      "step": 1024
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7302148909577209,
+      "learning_rate": 3.10219577346379e-06,
+      "loss": 0.5549,
+      "step": 1025
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7752983463714818,
+      "learning_rate": 3.0990217690037206e-06,
+      "loss": 0.5606,
+      "step": 1026
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.695119975844164,
+      "learning_rate": 3.09584673984472e-06,
+      "loss": 0.486,
+      "step": 1027
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.793543444803663,
+      "learning_rate": 3.0926706914180605e-06,
+      "loss": 0.6474,
+      "step": 1028
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6954588940750932,
+      "learning_rate": 3.089493629156755e-06,
+      "loss": 0.5208,
+      "step": 1029
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.9045089074493644,
+      "learning_rate": 3.08631555849555e-06,
+      "loss": 0.5291,
+      "step": 1030
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8481217904786489,
+      "learning_rate": 3.083136484870921e-06,
+      "loss": 0.5212,
+      "step": 1031
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6729420221561044,
+      "learning_rate": 3.0799564137210536e-06,
+      "loss": 0.5024,
+      "step": 1032
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8821832248249077,
+      "learning_rate": 3.076775350485845e-06,
+      "loss": 0.5459,
+      "step": 1033
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.762473350167322,
+      "learning_rate": 3.0735933006068863e-06,
+      "loss": 0.4938,
+      "step": 1034
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7950707678098703,
+      "learning_rate": 3.0704102695274573e-06,
+      "loss": 0.4922,
+      "step": 1035
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6853644769275375,
+      "learning_rate": 3.0672262626925174e-06,
+      "loss": 0.47,
+      "step": 1036
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.809909106997157,
+      "learning_rate": 3.0640412855486922e-06,
+      "loss": 0.5545,
+      "step": 1037
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.019472393876661,
+      "learning_rate": 3.06085534354427e-06,
+      "loss": 0.5616,
+      "step": 1038
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7972785887075076,
+      "learning_rate": 3.057668442129188e-06,
+      "loss": 0.5269,
+      "step": 1039
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.865555820217107,
+      "learning_rate": 3.054480586755026e-06,
+      "loss": 0.5752,
+      "step": 1040
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.792147096098412,
+      "learning_rate": 3.051291782874995e-06,
+      "loss": 0.54,
+      "step": 1041
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8108893550848508,
+      "learning_rate": 3.048102035943927e-06,
+      "loss": 0.5367,
+      "step": 1042
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.0966646553454793,
+      "learning_rate": 3.04491135141827e-06,
+      "loss": 0.5455,
+      "step": 1043
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7357403687049695,
+      "learning_rate": 3.041719734756073e-06,
+      "loss": 0.502,
+      "step": 1044
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8033826162723872,
+      "learning_rate": 3.038527191416982e-06,
+      "loss": 0.5644,
+      "step": 1045
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7822928111630525,
+      "learning_rate": 3.0353337268622267e-06,
+      "loss": 0.4938,
+      "step": 1046
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7910319343463081,
+      "learning_rate": 3.0321393465546134e-06,
+      "loss": 0.5889,
+      "step": 1047
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7457160087273953,
+      "learning_rate": 3.028944055958514e-06,
+      "loss": 0.5022,
+      "step": 1048
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.691379648176161,
+      "learning_rate": 3.0257478605398595e-06,
+      "loss": 0.4841,
+      "step": 1049
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7452186987943483,
+      "learning_rate": 3.0225507657661257e-06,
+      "loss": 0.5626,
+      "step": 1050
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7578678635930594,
+      "learning_rate": 3.0193527771063297e-06,
+      "loss": 0.5115,
+      "step": 1051
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7879798898209605,
+      "learning_rate": 3.016153900031016e-06,
+      "loss": 0.5296,
+      "step": 1052
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6745604796677231,
+      "learning_rate": 3.0129541400122492e-06,
+      "loss": 0.5089,
+      "step": 1053
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8484438696306678,
+      "learning_rate": 3.0097535025236045e-06,
+      "loss": 0.6124,
+      "step": 1054
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8023880068850882,
+      "learning_rate": 3.0065519930401595e-06,
+      "loss": 0.4983,
+      "step": 1055
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.743901583565096,
+      "learning_rate": 3.0033496170384803e-06,
+      "loss": 0.4998,
+      "step": 1056
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9494472820876043,
+      "learning_rate": 3.000146379996617e-06,
+      "loss": 0.537,
+      "step": 1057
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6992995489648048,
+      "learning_rate": 2.996942287394093e-06,
+      "loss": 0.5822,
+      "step": 1058
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8498288139189643,
+      "learning_rate": 2.993737344711895e-06,
+      "loss": 0.5651,
+      "step": 1059
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.755920633785882,
+      "learning_rate": 2.990531557432464e-06,
+      "loss": 0.496,
+      "step": 1060
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7876484928074277,
+      "learning_rate": 2.9873249310396853e-06,
+      "loss": 0.5224,
+      "step": 1061
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7573987279473129,
+      "learning_rate": 2.98411747101888e-06,
+      "loss": 0.5228,
+      "step": 1062
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6995721104857204,
+      "learning_rate": 2.980909182856794e-06,
+      "loss": 0.4758,
+      "step": 1063
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.907464743607936,
+      "learning_rate": 2.9777000720415916e-06,
+      "loss": 0.5254,
+      "step": 1064
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7921365259203703,
+      "learning_rate": 2.974490144062844e-06,
+      "loss": 0.5116,
+      "step": 1065
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9010192849593792,
+      "learning_rate": 2.9712794044115196e-06,
+      "loss": 0.5136,
+      "step": 1066
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.742881813035793,
+      "learning_rate": 2.968067858579975e-06,
+      "loss": 0.5436,
+      "step": 1067
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7135933558215708,
+      "learning_rate": 2.964855512061947e-06,
+      "loss": 0.5268,
+      "step": 1068
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8360025545734582,
+      "learning_rate": 2.9616423703525414e-06,
+      "loss": 0.5238,
+      "step": 1069
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7090421713960848,
+      "learning_rate": 2.9584284389482237e-06,
+      "loss": 0.5051,
+      "step": 1070
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7462732547158757,
+      "learning_rate": 2.9552137233468113e-06,
+      "loss": 0.4838,
+      "step": 1071
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9336108910937513,
+      "learning_rate": 2.951998229047464e-06,
+      "loss": 0.5576,
+      "step": 1072
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.784092660568157,
+      "learning_rate": 2.9487819615506702e-06,
+      "loss": 0.5349,
+      "step": 1073
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.772640354616067,
+      "learning_rate": 2.945564926358245e-06,
+      "loss": 0.5423,
+      "step": 1074
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8491968859591044,
+      "learning_rate": 2.9423471289733125e-06,
+      "loss": 0.5453,
+      "step": 1075
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8283172103770493,
+      "learning_rate": 2.9391285749003046e-06,
+      "loss": 0.5318,
+      "step": 1076
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7802483696828226,
+      "learning_rate": 2.935909269644946e-06,
+      "loss": 0.4954,
+      "step": 1077
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8687809173149,
+      "learning_rate": 2.9326892187142457e-06,
+      "loss": 0.5428,
+      "step": 1078
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9218917868616974,
+      "learning_rate": 2.9294684276164888e-06,
+      "loss": 0.5125,
+      "step": 1079
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8406300824318225,
+      "learning_rate": 2.9262469018612278e-06,
+      "loss": 0.5186,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8153319034513924,
+      "learning_rate": 2.9230246469592695e-06,
+      "loss": 0.4878,
+      "step": 1081
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8381190525343576,
+      "learning_rate": 2.91980166842267e-06,
+      "loss": 0.5455,
+      "step": 1082
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7941629060330144,
+      "learning_rate": 2.9165779717647212e-06,
+      "loss": 0.5425,
+      "step": 1083
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.755950985861856,
+      "learning_rate": 2.9133535624999466e-06,
+      "loss": 0.4992,
+      "step": 1084
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8065716401418646,
+      "learning_rate": 2.9101284461440853e-06,
+      "loss": 0.5569,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8487073865649808,
+      "learning_rate": 2.9069026282140887e-06,
+      "loss": 0.5352,
+      "step": 1086
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.877024524581134,
+      "learning_rate": 2.903676114228107e-06,
+      "loss": 0.5584,
+      "step": 1087
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.812931375367902,
+      "learning_rate": 2.9004489097054807e-06,
+      "loss": 0.5154,
+      "step": 1088
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7729938020658174,
+      "learning_rate": 2.897221020166732e-06,
+      "loss": 0.5386,
+      "step": 1089
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.6991898958250629,
+      "learning_rate": 2.8939924511335555e-06,
+      "loss": 0.5467,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7298323860671052,
+      "learning_rate": 2.890763208128807e-06,
+      "loss": 0.5506,
+      "step": 1091
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.9718362378496106,
+      "learning_rate": 2.887533296676497e-06,
+      "loss": 0.5453,
+      "step": 1092
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7003897379752575,
+      "learning_rate": 2.8843027223017767e-06,
+      "loss": 0.5016,
+      "step": 1093
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7604846690613096,
+      "learning_rate": 2.8810714905309346e-06,
+      "loss": 0.5206,
+      "step": 1094
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.868522047775135,
+      "learning_rate": 2.8778396068913807e-06,
+      "loss": 0.5152,
+      "step": 1095
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.8080911269766844,
+      "learning_rate": 2.874607076911642e-06,
+      "loss": 0.4966,
+      "step": 1096
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7767037245003534,
+      "learning_rate": 2.871373906121351e-06,
+      "loss": 0.5081,
+      "step": 1097
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.733045586658075,
+      "learning_rate": 2.8681401000512356e-06,
+      "loss": 0.5031,
+      "step": 1098
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6767478479637847,
+      "learning_rate": 2.8649056642331103e-06,
+      "loss": 0.4856,
+      "step": 1099
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6820690185704608,
+      "learning_rate": 2.8616706041998686e-06,
+      "loss": 0.5151,
+      "step": 1100
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.840181264549285,
+      "learning_rate": 2.8584349254854693e-06,
+      "loss": 0.5393,
+      "step": 1101
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.827807570004724,
+      "learning_rate": 2.8551986336249322e-06,
+      "loss": 0.5572,
+      "step": 1102
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.711815265099016,
+      "learning_rate": 2.8519617341543233e-06,
+      "loss": 0.5184,
+      "step": 1103
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7460018389221874,
+      "learning_rate": 2.8487242326107495e-06,
+      "loss": 0.5374,
+      "step": 1104
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.985067366728648,
+      "learning_rate": 2.8454861345323475e-06,
+      "loss": 0.538,
+      "step": 1105
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8044567576569952,
+      "learning_rate": 2.8422474454582754e-06,
+      "loss": 0.4947,
+      "step": 1106
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7648712890692506,
+      "learning_rate": 2.8390081709286997e-06,
+      "loss": 0.5584,
+      "step": 1107
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7544905722043518,
+      "learning_rate": 2.8357683164847903e-06,
+      "loss": 0.5696,
+      "step": 1108
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7923136846837993,
+      "learning_rate": 2.8325278876687084e-06,
+      "loss": 0.5502,
+      "step": 1109
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 2.077195937792951,
+      "learning_rate": 2.8292868900235986e-06,
+      "loss": 0.543,
+      "step": 1110
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675854046933754,
+      "learning_rate": 2.826045329093578e-06,
+      "loss": 0.5422,
+      "step": 1111
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8457239401392898,
+      "learning_rate": 2.822803210423727e-06,
+      "loss": 0.5334,
+      "step": 1112
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7426929121470698,
+      "learning_rate": 2.8195605395600804e-06,
+      "loss": 0.4972,
+      "step": 1113
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675216264197045,
+      "learning_rate": 2.8163173220496175e-06,
+      "loss": 0.5442,
+      "step": 1114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7483102565661375,
+      "learning_rate": 2.8130735634402527e-06,
+      "loss": 0.5425,
+      "step": 1115
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.692036399159914,
+      "learning_rate": 2.8098292692808253e-06,
+      "loss": 0.521,
+      "step": 1116
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.799980213437577,
+      "learning_rate": 2.8065844451210933e-06,
+      "loss": 0.5597,
+      "step": 1117
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7666190830884467,
+      "learning_rate": 2.803339096511718e-06,
+      "loss": 0.5612,
+      "step": 1118
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.792129515845057,
+      "learning_rate": 2.8000932290042597e-06,
+      "loss": 0.5334,
+      "step": 1119
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7395715578516604,
+      "learning_rate": 2.7968468481511663e-06,
+      "loss": 0.5545,
+      "step": 1120
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6843830287676704,
+      "learning_rate": 2.7935999595057623e-06,
+      "loss": 0.5659,
+      "step": 1121
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6432688824199502,
+      "learning_rate": 2.790352568622244e-06,
+      "loss": 0.4926,
+      "step": 1122
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7430642435954644,
+      "learning_rate": 2.787104681055663e-06,
+      "loss": 0.4666,
+      "step": 1123
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.8067789882264202,
+      "learning_rate": 2.783856302361923e-06,
+      "loss": 0.5233,
+      "step": 1124
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7685143281757654,
+      "learning_rate": 2.780607438097769e-06,
+      "loss": 0.5506,
+      "step": 1125
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7163110868931304,
+      "learning_rate": 2.7773580938207717e-06,
+      "loss": 0.5044,
+      "step": 1126
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.809036270322799,
+      "learning_rate": 2.7741082750893284e-06,
+      "loss": 0.5206,
+      "step": 1127
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8193898978325846,
+      "learning_rate": 2.770857987462645e-06,
+      "loss": 0.6064,
+      "step": 1128
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.765826426309075,
+      "learning_rate": 2.76760723650073e-06,
+      "loss": 0.4914,
+      "step": 1129
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.046345230237298,
+      "learning_rate": 2.764356027764385e-06,
+      "loss": 0.5938,
+      "step": 1130
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8264697696225647,
+      "learning_rate": 2.7611043668151948e-06,
+      "loss": 0.5476,
+      "step": 1131
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7776043318415495,
+      "learning_rate": 2.7578522592155166e-06,
+      "loss": 0.5318,
+      "step": 1132
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.767284538432005,
+      "learning_rate": 2.7545997105284735e-06,
+      "loss": 0.5197,
+      "step": 1133
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.831190014066027,
+      "learning_rate": 2.75134672631794e-06,
+      "loss": 0.4939,
+      "step": 1134
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7727769641989948,
+      "learning_rate": 2.7480933121485394e-06,
+      "loss": 0.5542,
+      "step": 1135
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7599576706599651,
+      "learning_rate": 2.7448394735856275e-06,
+      "loss": 0.5102,
+      "step": 1136
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7526987759875383,
+      "learning_rate": 2.7415852161952893e-06,
+      "loss": 0.5357,
+      "step": 1137
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7478180377944075,
+      "learning_rate": 2.7383305455443223e-06,
+      "loss": 0.552,
+      "step": 1138
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8026983878339322,
+      "learning_rate": 2.7350754672002334e-06,
+      "loss": 0.5324,
+      "step": 1139
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7539604119960455,
+      "learning_rate": 2.7318199867312267e-06,
+      "loss": 0.4951,
+      "step": 1140
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7060714376533908,
+      "learning_rate": 2.728564109706193e-06,
+      "loss": 0.5044,
+      "step": 1141
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.896732668736906,
+      "learning_rate": 2.725307841694704e-06,
+      "loss": 0.5272,
+      "step": 1142
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9094037542829962,
+      "learning_rate": 2.722051188266998e-06,
+      "loss": 0.5036,
+      "step": 1143
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7529900591353695,
+      "learning_rate": 2.7187941549939723e-06,
+      "loss": 0.4962,
+      "step": 1144
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7652784724721573,
+      "learning_rate": 2.7155367474471763e-06,
+      "loss": 0.5159,
+      "step": 1145
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9070275680276054,
+      "learning_rate": 2.7122789711987964e-06,
+      "loss": 0.5269,
+      "step": 1146
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7630505518040367,
+      "learning_rate": 2.709020831821652e-06,
+      "loss": 0.5286,
+      "step": 1147
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7410138974922291,
+      "learning_rate": 2.7057623348891846e-06,
+      "loss": 0.4902,
+      "step": 1148
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.745842560539345,
+      "learning_rate": 2.7025034859754446e-06,
+      "loss": 0.5178,
+      "step": 1149
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.8498982578771728,
+      "learning_rate": 2.699244290655086e-06,
+      "loss": 0.55,
+      "step": 1150
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.6360369924184164,
+      "learning_rate": 2.6959847545033558e-06,
+      "loss": 0.4988,
+      "step": 1151
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6784833460211517,
+      "learning_rate": 2.692724883096082e-06,
+      "loss": 0.5303,
+      "step": 1152
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7888637226825195,
+      "learning_rate": 2.68946468200967e-06,
+      "loss": 0.542,
+      "step": 1153
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7156031503954616,
+      "learning_rate": 2.686204156821084e-06,
+      "loss": 0.499,
+      "step": 1154
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.802618839032982,
+      "learning_rate": 2.6829433131078464e-06,
+      "loss": 0.5095,
+      "step": 1155
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7018673816457677,
+      "learning_rate": 2.6796821564480237e-06,
+      "loss": 0.4911,
+      "step": 1156
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.939833859373507,
+      "learning_rate": 2.6764206924202173e-06,
+      "loss": 0.5965,
+      "step": 1157
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.757462214596805,
+      "learning_rate": 2.673158926603554e-06,
+      "loss": 0.5119,
+      "step": 1158
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.824906787992325,
+      "learning_rate": 2.669896864577678e-06,
+      "loss": 0.4995,
+      "step": 1159
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6963319988581682,
+      "learning_rate": 2.666634511922739e-06,
+      "loss": 0.499,
+      "step": 1160
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7490967555131538,
+      "learning_rate": 2.6633718742193837e-06,
+      "loss": 0.5045,
+      "step": 1161
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7295387040616608,
+      "learning_rate": 2.660108957048749e-06,
+      "loss": 0.48,
+      "step": 1162
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7062936128447537,
+      "learning_rate": 2.656845765992447e-06,
+      "loss": 0.5024,
+      "step": 1163
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7291223687738257,
+      "learning_rate": 2.6535823066325594e-06,
+      "loss": 0.4965,
+      "step": 1164
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7660018876230184,
+      "learning_rate": 2.650318584551626e-06,
+      "loss": 0.6289,
+      "step": 1165
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6875948695046943,
+      "learning_rate": 2.6470546053326375e-06,
+      "loss": 0.5099,
+      "step": 1166
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7055862895950586,
+      "learning_rate": 2.643790374559023e-06,
+      "loss": 0.4748,
+      "step": 1167
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.8397810404769834,
+      "learning_rate": 2.6405258978146443e-06,
+      "loss": 0.5547,
+      "step": 1168
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6780759297615608,
+      "learning_rate": 2.6372611806837804e-06,
+      "loss": 0.4696,
+      "step": 1169
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7463193906158438,
+      "learning_rate": 2.633996228751125e-06,
+      "loss": 0.5167,
+      "step": 1170
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7682737157303552,
+      "learning_rate": 2.6307310476017705e-06,
+      "loss": 0.5178,
+      "step": 1171
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7759532350573655,
+      "learning_rate": 2.627465642821203e-06,
+      "loss": 0.5411,
+      "step": 1172
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.741742707150691,
+      "learning_rate": 2.624200019995293e-06,
+      "loss": 0.5357,
+      "step": 1173
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7638181255611864,
+      "learning_rate": 2.6209341847102787e-06,
+      "loss": 0.5598,
+      "step": 1174
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6585763596592404,
+      "learning_rate": 2.6176681425527663e-06,
+      "loss": 0.4891,
+      "step": 1175
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7652514703885578,
+      "learning_rate": 2.614401899109716e-06,
+      "loss": 0.5412,
+      "step": 1176
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7646286601286296,
+      "learning_rate": 2.6111354599684287e-06,
+      "loss": 0.4753,
+      "step": 1177
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7933546923906454,
+      "learning_rate": 2.6078688307165436e-06,
+      "loss": 0.5159,
+      "step": 1178
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.8474498352431208,
+      "learning_rate": 2.6046020169420223e-06,
+      "loss": 0.4786,
+      "step": 1179
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.816609500392057,
+      "learning_rate": 2.601335024233145e-06,
+      "loss": 0.5821,
+      "step": 1180
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7603922858788037,
+      "learning_rate": 2.598067858178495e-06,
+      "loss": 0.4749,
+      "step": 1181
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.771168764538133,
+      "learning_rate": 2.594800524366956e-06,
+      "loss": 0.5221,
+      "step": 1182
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7428386931770696,
+      "learning_rate": 2.591533028387694e-06,
+      "loss": 0.5243,
+      "step": 1183
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7354647623517858,
+      "learning_rate": 2.588265375830155e-06,
+      "loss": 0.4665,
+      "step": 1184
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7757829783254058,
+      "learning_rate": 2.5849975722840537e-06,
+      "loss": 0.4713,
+      "step": 1185
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7660698291034924,
+      "learning_rate": 2.58172962333936e-06,
+      "loss": 0.5198,
+      "step": 1186
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7071465020770178,
+      "learning_rate": 2.5784615345862963e-06,
+      "loss": 0.5355,
+      "step": 1187
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.6994920599655763,
+      "learning_rate": 2.5751933116153215e-06,
+      "loss": 0.4867,
+      "step": 1188
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7891977115774562,
+      "learning_rate": 2.5719249600171247e-06,
+      "loss": 0.5071,
+      "step": 1189
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6866451169084888,
+      "learning_rate": 2.568656485382616e-06,
+      "loss": 0.4767,
+      "step": 1190
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.9106444693405875,
+      "learning_rate": 2.5653878933029134e-06,
+      "loss": 0.5063,
+      "step": 1191
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7546015951107552,
+      "learning_rate": 2.56211918936934e-06,
+      "loss": 0.5536,
+      "step": 1192
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7866083346923656,
+      "learning_rate": 2.5588503791734053e-06,
+      "loss": 0.4738,
+      "step": 1193
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6678313975517949,
+      "learning_rate": 2.5555814683068058e-06,
+      "loss": 0.5095,
+      "step": 1194
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.694690087625629,
+      "learning_rate": 2.552312462361405e-06,
+      "loss": 0.5711,
+      "step": 1195
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7583066556547233,
+      "learning_rate": 2.5490433669292337e-06,
+      "loss": 0.5183,
+      "step": 1196
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8259327544569408,
+      "learning_rate": 2.5457741876024716e-06,
+      "loss": 0.5129,
+      "step": 1197
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.743709458286742,
+      "learning_rate": 2.542504929973445e-06,
+      "loss": 0.509,
+      "step": 1198
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8551037168096902,
+      "learning_rate": 2.5392355996346134e-06,
+      "loss": 0.4874,
+      "step": 1199
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7705896553689628,
+      "learning_rate": 2.5359662021785596e-06,
+      "loss": 0.5102,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8456154073029885,
+      "learning_rate": 2.532696743197982e-06,
+      "loss": 0.5363,
+      "step": 1201
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7341454202963031,
+      "learning_rate": 2.529427228285686e-06,
+      "loss": 0.5013,
+      "step": 1202
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7923147732329405,
+      "learning_rate": 2.526157663034568e-06,
+      "loss": 0.5191,
+      "step": 1203
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.731262319220837,
+      "learning_rate": 2.522888053037616e-06,
+      "loss": 0.4889,
+      "step": 1204
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.797800368847369,
+      "learning_rate": 2.5196184038878895e-06,
+      "loss": 0.4868,
+      "step": 1205
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8182272292135089,
+      "learning_rate": 2.5163487211785194e-06,
+      "loss": 0.5159,
+      "step": 1206
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.9699143840893472,
+      "learning_rate": 2.5130790105026908e-06,
+      "loss": 0.543,
+      "step": 1207
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.805587879000798,
+      "learning_rate": 2.5098092774536397e-06,
+      "loss": 0.5162,
+      "step": 1208
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.966538834153111,
+      "learning_rate": 2.506539527624637e-06,
+      "loss": 0.4973,
+      "step": 1209
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7007116827865891,
+      "learning_rate": 2.5032697666089833e-06,
+      "loss": 0.5337,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8200190388383481,
+      "learning_rate": 2.5e-06,
+      "loss": 0.492,
+      "step": 1211
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7811733389101785,
+      "learning_rate": 2.496730233391017e-06,
+      "loss": 0.533,
+      "step": 1212
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7692852455085013,
+      "learning_rate": 2.4934604723753636e-06,
+      "loss": 0.5151,
+      "step": 1213
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0118407638136726,
+      "learning_rate": 2.4901907225463607e-06,
+      "loss": 0.566,
+      "step": 1214
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9919699597672162,
+      "learning_rate": 2.486920989497309e-06,
+      "loss": 0.5296,
+      "step": 1215
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7399123797451834,
+      "learning_rate": 2.483651278821481e-06,
+      "loss": 0.5535,
+      "step": 1216
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0162050634113617,
+      "learning_rate": 2.4803815961121117e-06,
+      "loss": 0.5105,
+      "step": 1217
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9472302767468135,
+      "learning_rate": 2.4771119469623856e-06,
+      "loss": 0.4829,
+      "step": 1218
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9358326178363474,
+      "learning_rate": 2.4738423369654327e-06,
+      "loss": 0.5895,
+      "step": 1219
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8202396491898063,
+      "learning_rate": 2.470572771714315e-06,
+      "loss": 0.5159,
+      "step": 1220
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0705540084815652,
+      "learning_rate": 2.4673032568020183e-06,
+      "loss": 0.5375,
+      "step": 1221
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9290016818033147,
+      "learning_rate": 2.464033797821441e-06,
+      "loss": 0.5328,
+      "step": 1222
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.858876842427081,
+      "learning_rate": 2.460764400365387e-06,
+      "loss": 0.5246,
+      "step": 1223
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7372257522644121,
+      "learning_rate": 2.457495070026555e-06,
+      "loss": 0.5557,
+      "step": 1224
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.042578607858068,
+      "learning_rate": 2.454225812397529e-06,
+      "loss": 0.5493,
+      "step": 1225
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.80578953353184,
+      "learning_rate": 2.450956633070767e-06,
+      "loss": 0.4722,
+      "step": 1226
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.6245117501883604,
+      "learning_rate": 2.4476875376385954e-06,
+      "loss": 0.4861,
+      "step": 1227
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.3717275673814986,
+      "learning_rate": 2.4444185316931955e-06,
+      "loss": 0.4955,
+      "step": 1228
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.789230426976571,
+      "learning_rate": 2.441149620826595e-06,
+      "loss": 0.401,
+      "step": 1229
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.3165196574538163,
+      "learning_rate": 2.437880810630661e-06,
+      "loss": 0.391,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 3.7748119497874244,
+      "learning_rate": 2.434612106697087e-06,
+      "loss": 0.3971,
+      "step": 1231
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.516708769328096,
+      "learning_rate": 2.4313435146173845e-06,
+      "loss": 0.3677,
+      "step": 1232
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.0383812730416593,
+      "learning_rate": 2.4280750399828757e-06,
+      "loss": 0.3834,
+      "step": 1233
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.388274870254754,
+      "learning_rate": 2.424806688384679e-06,
+      "loss": 0.38,
+      "step": 1234
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.428758767469847,
+      "learning_rate": 2.4215384654137037e-06,
+      "loss": 0.3557,
+      "step": 1235
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9871015940327752,
+      "learning_rate": 2.41827037666064e-06,
+      "loss": 0.3742,
+      "step": 1236
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0490853630896595,
+      "learning_rate": 2.415002427715948e-06,
+      "loss": 0.4077,
+      "step": 1237
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.36022057857035,
+      "learning_rate": 2.4117346241698457e-06,
+      "loss": 0.4079,
+      "step": 1238
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.4014397498962974,
+      "learning_rate": 2.408466971612307e-06,
+      "loss": 0.3783,
+      "step": 1239
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.1970209263326246,
+      "learning_rate": 2.405199475633045e-06,
+      "loss": 0.4019,
+      "step": 1240
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8747804397851657,
+      "learning_rate": 2.4019321418215053e-06,
+      "loss": 0.3657,
+      "step": 1241
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0377029592503666,
+      "learning_rate": 2.398664975766856e-06,
+      "loss": 0.3575,
+      "step": 1242
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.2162687478729133,
+      "learning_rate": 2.3953979830579785e-06,
+      "loss": 0.3891,
+      "step": 1243
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0736112974636605,
+      "learning_rate": 2.3921311692834577e-06,
+      "loss": 0.3872,
+      "step": 1244
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8065329023464558,
+      "learning_rate": 2.3888645400315717e-06,
+      "loss": 0.3684,
+      "step": 1245
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.144863722944226,
+      "learning_rate": 2.385598100890285e-06,
+      "loss": 0.3781,
+      "step": 1246
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.245173550848138,
+      "learning_rate": 2.382331857447234e-06,
+      "loss": 0.3906,
+      "step": 1247
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.0580037557233806,
+      "learning_rate": 2.379065815289723e-06,
+      "loss": 0.3461,
+      "step": 1248
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.754328637936701,
+      "learning_rate": 2.3757999800047088e-06,
+      "loss": 0.3626,
+      "step": 1249
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.8749369460952616,
+      "learning_rate": 2.3725343571787974e-06,
+      "loss": 0.3723,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9635590762348785,
+      "learning_rate": 2.36926895239823e-06,
+      "loss": 0.3506,
+      "step": 1251
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9091295881177242,
+      "learning_rate": 2.3660037712488758e-06,
+      "loss": 0.3705,
+      "step": 1252
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.0807822077632445,
+      "learning_rate": 2.36273881931622e-06,
+      "loss": 0.4083,
+      "step": 1253
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9247801946548893,
+      "learning_rate": 2.3594741021853565e-06,
+      "loss": 0.3896,
+      "step": 1254
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.003234826375957,
+      "learning_rate": 2.356209625440977e-06,
+      "loss": 0.3928,
+      "step": 1255
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9601094488156638,
+      "learning_rate": 2.352945394667363e-06,
+      "loss": 0.346,
+      "step": 1256
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.835912356231795,
+      "learning_rate": 2.3496814154483754e-06,
+      "loss": 0.3268,
+      "step": 1257
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.851616138864044,
+      "learning_rate": 2.346417693367442e-06,
+      "loss": 0.395,
+      "step": 1258
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 2.017511453982363,
+      "learning_rate": 2.3431542340075535e-06,
+      "loss": 0.3989,
+      "step": 1259
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.9337327085061278,
+      "learning_rate": 2.3398910429512516e-06,
+      "loss": 0.4168,
+      "step": 1260
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8957440589808827,
+      "learning_rate": 2.3366281257806167e-06,
+      "loss": 0.3626,
+      "step": 1261
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.819897111464585,
+      "learning_rate": 2.3333654880772622e-06,
+      "loss": 0.3737,
+      "step": 1262
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.9283607336926767,
+      "learning_rate": 2.3301031354223226e-06,
+      "loss": 0.3595,
+      "step": 1263
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8049670593502345,
+      "learning_rate": 2.3268410733964463e-06,
+      "loss": 0.3645,
+      "step": 1264
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.866103990559354,
+      "learning_rate": 2.3235793075797835e-06,
+      "loss": 0.391,
+      "step": 1265
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.774992664072412,
+      "learning_rate": 2.3203178435519767e-06,
+      "loss": 0.3863,
+      "step": 1266
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8431093658964484,
+      "learning_rate": 2.3170566868921553e-06,
+      "loss": 0.4175,
+      "step": 1267
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.7731154009482526,
+      "learning_rate": 2.3137958431789175e-06,
+      "loss": 0.3651,
+      "step": 1268
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.980392583405916,
+      "learning_rate": 2.3105353179903313e-06,
+      "loss": 0.3919,
+      "step": 1269
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8435910751312221,
+      "learning_rate": 2.3072751169039183e-06,
+      "loss": 0.3466,
+      "step": 1270
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.88150621693115,
+      "learning_rate": 2.304015245496645e-06,
+      "loss": 0.3991,
+      "step": 1271
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.9365960105712363,
+      "learning_rate": 2.300755709344915e-06,
+      "loss": 0.3675,
+      "step": 1272
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8120924423380202,
+      "learning_rate": 2.297496514024556e-06,
+      "loss": 0.389,
+      "step": 1273
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.822066570446833,
+      "learning_rate": 2.2942376651108158e-06,
+      "loss": 0.3355,
+      "step": 1274
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.968043494993567,
+      "learning_rate": 2.290979168178348e-06,
+      "loss": 0.3909,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8571689944285859,
+      "learning_rate": 2.287721028801204e-06,
+      "loss": 0.376,
+      "step": 1276
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.003415605331929,
+      "learning_rate": 2.2844632525528245e-06,
+      "loss": 0.3439,
+      "step": 1277
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.248040597881556,
+      "learning_rate": 2.2812058450060285e-06,
+      "loss": 0.3789,
+      "step": 1278
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8018969815730068,
+      "learning_rate": 2.2779488117330032e-06,
+      "loss": 0.3756,
+      "step": 1279
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.90374397055853,
+      "learning_rate": 2.2746921583052967e-06,
+      "loss": 0.4126,
+      "step": 1280
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8558365521624263,
+      "learning_rate": 2.2714358902938073e-06,
+      "loss": 0.3959,
+      "step": 1281
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8375175796231433,
+      "learning_rate": 2.268180013268774e-06,
+      "loss": 0.4048,
+      "step": 1282
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.984205865069469,
+      "learning_rate": 2.2649245327997674e-06,
+      "loss": 0.4039,
+      "step": 1283
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8933532928718015,
+      "learning_rate": 2.261669454455679e-06,
+      "loss": 0.3781,
+      "step": 1284
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9740915743952114,
+      "learning_rate": 2.2584147838047116e-06,
+      "loss": 0.4003,
+      "step": 1285
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8808844925592019,
+      "learning_rate": 2.2551605264143725e-06,
+      "loss": 0.3449,
+      "step": 1286
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9307797122579196,
+      "learning_rate": 2.251906687851461e-06,
+      "loss": 0.4182,
+      "step": 1287
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8492505145939904,
+      "learning_rate": 2.2486532736820614e-06,
+      "loss": 0.3736,
+      "step": 1288
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8826597143825838,
+      "learning_rate": 2.245400289471528e-06,
+      "loss": 0.3987,
+      "step": 1289
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8696499317715565,
+      "learning_rate": 2.242147740784484e-06,
+      "loss": 0.3725,
+      "step": 1290
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 2.0572316139676463,
+      "learning_rate": 2.2388956331848057e-06,
+      "loss": 0.3777,
+      "step": 1291
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9916048666817696,
+      "learning_rate": 2.2356439722356154e-06,
+      "loss": 0.3435,
+      "step": 1292
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.7903849297787813,
+      "learning_rate": 2.2323927634992706e-06,
+      "loss": 0.3691,
+      "step": 1293
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8840722711485807,
+      "learning_rate": 2.2291420125373555e-06,
+      "loss": 0.3619,
+      "step": 1294
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.853222255447046,
+      "learning_rate": 2.225891724910672e-06,
+      "loss": 0.3406,
+      "step": 1295
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8075515802139996,
+      "learning_rate": 2.2226419061792282e-06,
+      "loss": 0.3775,
+      "step": 1296
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8220733253527324,
+      "learning_rate": 2.2193925619022323e-06,
+      "loss": 0.3652,
+      "step": 1297
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.9758397782161456,
+      "learning_rate": 2.2161436976380774e-06,
+      "loss": 0.3825,
+      "step": 1298
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 2.0469053125573202,
+      "learning_rate": 2.212895318944338e-06,
+      "loss": 0.4162,
+      "step": 1299
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8037669439194224,
+      "learning_rate": 2.2096474313777574e-06,
+      "loss": 0.3584,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8852980241376032,
+      "learning_rate": 2.206400040494238e-06,
+      "loss": 0.3786,
+      "step": 1301
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8014277477129081,
+      "learning_rate": 2.2031531518488345e-06,
+      "loss": 0.4126,
+      "step": 1302
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.844230526856602,
+      "learning_rate": 2.1999067709957407e-06,
+      "loss": 0.4005,
+      "step": 1303
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.9775624321749639,
+      "learning_rate": 2.1966609034882825e-06,
+      "loss": 0.4279,
+      "step": 1304
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.7752280618538778,
+      "learning_rate": 2.193415554878907e-06,
+      "loss": 0.3512,
+      "step": 1305
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8490455260047038,
+      "learning_rate": 2.1901707307191743e-06,
+      "loss": 0.3828,
+      "step": 1306
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 5.328150832014928,
+      "learning_rate": 2.1869264365597477e-06,
+      "loss": 0.3909,
+      "step": 1307
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8437062886123319,
+      "learning_rate": 2.1836826779503838e-06,
+      "loss": 0.37,
+      "step": 1308
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 2.008796830412121,
+      "learning_rate": 2.1804394604399204e-06,
+      "loss": 0.4077,
+      "step": 1309
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.800679268264127,
+      "learning_rate": 2.1771967895762736e-06,
+      "loss": 0.3679,
+      "step": 1310
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8462133413299637,
+      "learning_rate": 2.173954670906423e-06,
+      "loss": 0.3602,
+      "step": 1311
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.809976917930169,
+      "learning_rate": 2.1707131099764022e-06,
+      "loss": 0.3899,
+      "step": 1312
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8544861012991105,
+      "learning_rate": 2.1674721123312924e-06,
+      "loss": 0.3747,
+      "step": 1313
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8852269898368,
+      "learning_rate": 2.1642316835152106e-06,
+      "loss": 0.4467,
+      "step": 1314
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.9122728391881445,
+      "learning_rate": 2.1609918290713007e-06,
+      "loss": 0.3402,
+      "step": 1315
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.9590310432156601,
+      "learning_rate": 2.1577525545417254e-06,
+      "loss": 0.3732,
+      "step": 1316
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8276147883157745,
+      "learning_rate": 2.1545138654676525e-06,
+      "loss": 0.3953,
+      "step": 1317
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8133703409989375,
+      "learning_rate": 2.151275767389252e-06,
+      "loss": 0.3539,
+      "step": 1318
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8006183709975836,
+      "learning_rate": 2.148038265845678e-06,
+      "loss": 0.4006,
+      "step": 1319
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8947220090164194,
+      "learning_rate": 2.144801366375069e-06,
+      "loss": 0.4406,
+      "step": 1320
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8280103512099313,
+      "learning_rate": 2.141565074514531e-06,
+      "loss": 0.3815,
+      "step": 1321
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8706012819390525,
+      "learning_rate": 2.138329395800132e-06,
+      "loss": 0.3445,
+      "step": 1322
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9063701163877025,
+      "learning_rate": 2.1350943357668905e-06,
+      "loss": 0.3983,
+      "step": 1323
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.033333592395131,
+      "learning_rate": 2.131859899948765e-06,
+      "loss": 0.3686,
+      "step": 1324
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.0894724502176425,
+      "learning_rate": 2.1286260938786497e-06,
+      "loss": 0.3811,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9145691870270913,
+      "learning_rate": 2.125392923088358e-06,
+      "loss": 0.3783,
+      "step": 1326
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.941699323344672,
+      "learning_rate": 2.1221603931086193e-06,
+      "loss": 0.3842,
+      "step": 1327
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.0079800551627565,
+      "learning_rate": 2.118928509469066e-06,
+      "loss": 0.3885,
+      "step": 1328
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.851351482771633,
+      "learning_rate": 2.1156972776982238e-06,
+      "loss": 0.3281,
+      "step": 1329
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9104937018736412,
+      "learning_rate": 2.112466703323504e-06,
+      "loss": 0.4231,
+      "step": 1330
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.92374307717419,
+      "learning_rate": 2.1092367918711935e-06,
+      "loss": 0.3702,
+      "step": 1331
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8725737952655952,
+      "learning_rate": 2.1060075488664453e-06,
+      "loss": 0.3591,
+      "step": 1332
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.850042908610832,
+      "learning_rate": 2.1027789798332688e-06,
+      "loss": 0.3368,
+      "step": 1333
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9324592525287807,
+      "learning_rate": 2.0995510902945197e-06,
+      "loss": 0.3676,
+      "step": 1334
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9116116557564555,
+      "learning_rate": 2.0963238857718934e-06,
+      "loss": 0.3817,
+      "step": 1335
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9148726445140338,
+      "learning_rate": 2.0930973717859117e-06,
+      "loss": 0.3704,
+      "step": 1336
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8376871831619126,
+      "learning_rate": 2.089871553855915e-06,
+      "loss": 0.3521,
+      "step": 1337
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 2.069303925978208,
+      "learning_rate": 2.086646437500054e-06,
+      "loss": 0.3848,
+      "step": 1338
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.876178784774616,
+      "learning_rate": 2.08342202823528e-06,
+      "loss": 0.3697,
+      "step": 1339
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8981757166548485,
+      "learning_rate": 2.0801983315773317e-06,
+      "loss": 0.3864,
+      "step": 1340
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8313223303972075,
+      "learning_rate": 2.0769753530407317e-06,
+      "loss": 0.3768,
+      "step": 1341
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9073767874852925,
+      "learning_rate": 2.073753098138773e-06,
+      "loss": 0.3991,
+      "step": 1342
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.837313805268737,
+      "learning_rate": 2.0705315723835116e-06,
+      "loss": 0.3959,
+      "step": 1343
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9539946764244502,
+      "learning_rate": 2.067310781285755e-06,
+      "loss": 0.4305,
+      "step": 1344
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.019270181770809,
+      "learning_rate": 2.0640907303550545e-06,
+      "loss": 0.3601,
+      "step": 1345
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.406213238917182,
+      "learning_rate": 2.0608714250996954e-06,
+      "loss": 0.4426,
+      "step": 1346
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9236578073704644,
+      "learning_rate": 2.0576528710266875e-06,
+      "loss": 0.4038,
+      "step": 1347
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.048182172212149,
+      "learning_rate": 2.054435073641756e-06,
+      "loss": 0.3746,
+      "step": 1348
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.928863945427719,
+      "learning_rate": 2.0512180384493306e-06,
+      "loss": 0.3894,
+      "step": 1349
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.8335551339682872,
+      "learning_rate": 2.0480017709525372e-06,
+      "loss": 0.3693,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9647819756067608,
+      "learning_rate": 2.044786276653189e-06,
+      "loss": 0.3781,
+      "step": 1351
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.12907859222308,
+      "learning_rate": 2.041571561051777e-06,
+      "loss": 0.4171,
+      "step": 1352
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9030554994611362,
+      "learning_rate": 2.0383576296474595e-06,
+      "loss": 0.3871,
+      "step": 1353
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.8482128197200014,
+      "learning_rate": 2.0351444879380533e-06,
+      "loss": 0.3801,
+      "step": 1354
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9237098856083394,
+      "learning_rate": 2.031932141420026e-06,
+      "loss": 0.397,
+      "step": 1355
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9292461604759314,
+      "learning_rate": 2.0287205955884812e-06,
+      "loss": 0.3808,
+      "step": 1356
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.905891034454967,
+      "learning_rate": 2.025509855937156e-06,
+      "loss": 0.3991,
+      "step": 1357
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8451385574242787,
+      "learning_rate": 2.0222999279584084e-06,
+      "loss": 0.3801,
+      "step": 1358
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.949400009057099,
+      "learning_rate": 2.0190908171432073e-06,
+      "loss": 0.3892,
+      "step": 1359
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9605363810464835,
+      "learning_rate": 2.0158825289811214e-06,
+      "loss": 0.3965,
+      "step": 1360
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8606173348780064,
+      "learning_rate": 2.012675068960315e-06,
+      "loss": 0.3954,
+      "step": 1361
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.894555038278285,
+      "learning_rate": 2.009468442567537e-06,
+      "loss": 0.3872,
+      "step": 1362
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8879641436732342,
+      "learning_rate": 2.006262655288106e-06,
+      "loss": 0.381,
+      "step": 1363
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 6.804463123370788,
+      "learning_rate": 2.003057712605908e-06,
+      "loss": 0.3598,
+      "step": 1364
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9484231062475323,
+      "learning_rate": 1.9998536200033843e-06,
+      "loss": 0.387,
+      "step": 1365
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9430636182866459,
+      "learning_rate": 1.996650382961521e-06,
+      "loss": 0.3815,
+      "step": 1366
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8099872908810362,
+      "learning_rate": 1.9934480069598418e-06,
+      "loss": 0.3931,
+      "step": 1367
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.0871498559503583,
+      "learning_rate": 1.990246497476396e-06,
+      "loss": 0.3946,
+      "step": 1368
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9534152521538926,
+      "learning_rate": 1.9870458599877524e-06,
+      "loss": 0.3998,
+      "step": 1369
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9712355359168434,
+      "learning_rate": 1.9838460999689854e-06,
+      "loss": 0.3741,
+      "step": 1370
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8831191819719022,
+      "learning_rate": 1.980647222893671e-06,
+      "loss": 0.3758,
+      "step": 1371
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.03493312021646,
+      "learning_rate": 1.977449234233875e-06,
+      "loss": 0.4066,
+      "step": 1372
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9837157371609282,
+      "learning_rate": 1.9742521394601413e-06,
+      "loss": 0.3757,
+      "step": 1373
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9871704920253919,
+      "learning_rate": 1.9710559440414867e-06,
+      "loss": 0.3811,
+      "step": 1374
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8609975534569105,
+      "learning_rate": 1.9678606534453874e-06,
+      "loss": 0.3709,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8599855946550903,
+      "learning_rate": 1.9646662731377737e-06,
+      "loss": 0.3589,
+      "step": 1376
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.0183183444158224,
+      "learning_rate": 1.9614728085830185e-06,
+      "loss": 0.3521,
+      "step": 1377
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9976152320569405,
+      "learning_rate": 1.958280265243927e-06,
+      "loss": 0.3757,
+      "step": 1378
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9951401325370672,
+      "learning_rate": 1.9550886485817313e-06,
+      "loss": 0.3947,
+      "step": 1379
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9553672687038417,
+      "learning_rate": 1.9518979640560737e-06,
+      "loss": 0.3473,
+      "step": 1380
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9340367763443969,
+      "learning_rate": 1.9487082171250057e-06,
+      "loss": 0.37,
+      "step": 1381
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.8996712185125788,
+      "learning_rate": 1.9455194132449745e-06,
+      "loss": 0.3924,
+      "step": 1382
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9351658663427442,
+      "learning_rate": 1.9423315578708126e-06,
+      "loss": 0.3959,
+      "step": 1383
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 2.0174109611058504,
+      "learning_rate": 1.939144656455731e-06,
+      "loss": 0.3987,
+      "step": 1384
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.76886531168205,
+      "learning_rate": 1.9359587144513086e-06,
+      "loss": 0.4277,
+      "step": 1385
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 2.1774228741508455,
+      "learning_rate": 1.9327737373074834e-06,
+      "loss": 0.4474,
+      "step": 1386
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.8335022286037221,
+      "learning_rate": 1.929589730472543e-06,
+      "loss": 0.3586,
+      "step": 1387
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.944762597816562,
+      "learning_rate": 1.926406699393114e-06,
+      "loss": 0.3916,
+      "step": 1388
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9158836718088024,
+      "learning_rate": 1.9232246495141554e-06,
+      "loss": 0.3471,
+      "step": 1389
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9546368466405357,
+      "learning_rate": 1.920043586278947e-06,
+      "loss": 0.3747,
+      "step": 1390
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9070019014660136,
+      "learning_rate": 1.9168635151290803e-06,
+      "loss": 0.3524,
+      "step": 1391
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.023146490194608,
+      "learning_rate": 1.9136844415044502e-06,
+      "loss": 0.3707,
+      "step": 1392
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.8809251159178713,
+      "learning_rate": 1.910506370843246e-06,
+      "loss": 0.3801,
+      "step": 1393
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0409011175956784,
+      "learning_rate": 1.9073293085819402e-06,
+      "loss": 0.373,
+      "step": 1394
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0117643519136315,
+      "learning_rate": 1.9041532601552804e-06,
+      "loss": 0.3645,
+      "step": 1395
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9716378326274158,
+      "learning_rate": 1.9009782309962805e-06,
+      "loss": 0.3614,
+      "step": 1396
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9329872273189466,
+      "learning_rate": 1.8978042265362103e-06,
+      "loss": 0.3551,
+      "step": 1397
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9199554634763143,
+      "learning_rate": 1.8946312522045874e-06,
+      "loss": 0.3902,
+      "step": 1398
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9590655710866773,
+      "learning_rate": 1.891459313429167e-06,
+      "loss": 0.4142,
+      "step": 1399
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0331664011816972,
+      "learning_rate": 1.8882884156359324e-06,
+      "loss": 0.3656,
+      "step": 1400
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0472909494424583,
+      "learning_rate": 1.8851185642490863e-06,
+      "loss": 0.3886,
+      "step": 1401
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9929489595454677,
+      "learning_rate": 1.8819497646910408e-06,
+      "loss": 0.3672,
+      "step": 1402
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9438211462442658,
+      "learning_rate": 1.87878202238241e-06,
+      "loss": 0.3713,
+      "step": 1403
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9090031612890588,
+      "learning_rate": 1.8756153427419996e-06,
+      "loss": 0.3806,
+      "step": 1404
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8225379267675694,
+      "learning_rate": 1.872449731186796e-06,
+      "loss": 0.3412,
+      "step": 1405
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.7944071121109437,
+      "learning_rate": 1.86928519313196e-06,
+      "loss": 0.3642,
+      "step": 1406
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9414616279338623,
+      "learning_rate": 1.8661217339908142e-06,
+      "loss": 0.3806,
+      "step": 1407
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.944356212181711,
+      "learning_rate": 1.8629593591748374e-06,
+      "loss": 0.3987,
+      "step": 1408
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.857841085738498,
+      "learning_rate": 1.8597980740936528e-06,
+      "loss": 0.3899,
+      "step": 1409
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8710356295384132,
+      "learning_rate": 1.8566378841550205e-06,
+      "loss": 0.3784,
+      "step": 1410
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8728296119496737,
+      "learning_rate": 1.8534787947648247e-06,
+      "loss": 0.3867,
+      "step": 1411
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8738844694805654,
+      "learning_rate": 1.8503208113270687e-06,
+      "loss": 0.3696,
+      "step": 1412
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9649370685779552,
+      "learning_rate": 1.8471639392438648e-06,
+      "loss": 0.3986,
+      "step": 1413
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.7859555369523812,
+      "learning_rate": 1.8440081839154222e-06,
+      "loss": 0.3871,
+      "step": 1414
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8610430021362592,
+      "learning_rate": 1.840853550740041e-06,
+      "loss": 0.333,
+      "step": 1415
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9871037672382785,
+      "learning_rate": 1.8377000451141013e-06,
+      "loss": 0.3655,
+      "step": 1416
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0510993717790544,
+      "learning_rate": 1.8345476724320549e-06,
+      "loss": 0.3345,
+      "step": 1417
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.022865297999793,
+      "learning_rate": 1.8313964380864157e-06,
+      "loss": 0.4238,
+      "step": 1418
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0272213314003786,
+      "learning_rate": 1.8282463474677485e-06,
+      "loss": 0.3775,
+      "step": 1419
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.006744012043913,
+      "learning_rate": 1.825097405964665e-06,
+      "loss": 0.3886,
+      "step": 1420
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0596399522136406,
+      "learning_rate": 1.8219496189638065e-06,
+      "loss": 0.4091,
+      "step": 1421
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8816895162930982,
+      "learning_rate": 1.8188029918498434e-06,
+      "loss": 0.4065,
+      "step": 1422
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9988370328142775,
+      "learning_rate": 1.8156575300054607e-06,
+      "loss": 0.3968,
+      "step": 1423
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0379288149529216,
+      "learning_rate": 1.8125132388113497e-06,
+      "loss": 0.3893,
+      "step": 1424
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8764951987892278,
+      "learning_rate": 1.8093701236461999e-06,
+      "loss": 0.3757,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9911843473469748,
+      "learning_rate": 1.806228189886688e-06,
+      "loss": 0.3891,
+      "step": 1426
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9631453513585595,
+      "learning_rate": 1.8030874429074701e-06,
+      "loss": 0.3969,
+      "step": 1427
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8998526626952037,
+      "learning_rate": 1.7999478880811735e-06,
+      "loss": 0.3919,
+      "step": 1428
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8805553933080315,
+      "learning_rate": 1.7968095307783845e-06,
+      "loss": 0.3767,
+      "step": 1429
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9958093732421776,
+      "learning_rate": 1.7936723763676426e-06,
+      "loss": 0.3861,
+      "step": 1430
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8587137598489651,
+      "learning_rate": 1.7905364302154264e-06,
+      "loss": 0.3289,
+      "step": 1431
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 2.0380004642313785,
+      "learning_rate": 1.7874016976861504e-06,
+      "loss": 0.3531,
+      "step": 1432
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9171820086465794,
+      "learning_rate": 1.784268184142154e-06,
+      "loss": 0.3986,
+      "step": 1433
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.95855879390137,
+      "learning_rate": 1.7811358949436874e-06,
+      "loss": 0.3402,
+      "step": 1434
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9995990338040457,
+      "learning_rate": 1.7780048354489101e-06,
+      "loss": 0.3599,
+      "step": 1435
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9243145774410442,
+      "learning_rate": 1.7748750110138768e-06,
+      "loss": 0.4399,
+      "step": 1436
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 2.279285862974166,
+      "learning_rate": 1.7717464269925288e-06,
+      "loss": 0.3614,
+      "step": 1437
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9005095716347011,
+      "learning_rate": 1.7686190887366875e-06,
+      "loss": 0.3665,
+      "step": 1438
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8076423185524721,
+      "learning_rate": 1.7654930015960401e-06,
+      "loss": 0.3408,
+      "step": 1439
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8762893879880087,
+      "learning_rate": 1.762368170918136e-06,
+      "loss": 0.39,
+      "step": 1440
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.0153368993119556,
+      "learning_rate": 1.7592446020483762e-06,
+      "loss": 0.3539,
+      "step": 1441
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9585515808006808,
+      "learning_rate": 1.7561223003299994e-06,
+      "loss": 0.3956,
+      "step": 1442
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.124848103864915,
+      "learning_rate": 1.7530012711040794e-06,
+      "loss": 0.4119,
+      "step": 1443
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.012402459921111,
+      "learning_rate": 1.749881519709514e-06,
+      "loss": 0.408,
+      "step": 1444
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9649268732755643,
+      "learning_rate": 1.7467630514830136e-06,
+      "loss": 0.3283,
+      "step": 1445
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8596310758669552,
+      "learning_rate": 1.7436458717590931e-06,
+      "loss": 0.4354,
+      "step": 1446
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9102148486337966,
+      "learning_rate": 1.7405299858700648e-06,
+      "loss": 0.3954,
+      "step": 1447
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8553487771224224,
+      "learning_rate": 1.737415399146027e-06,
+      "loss": 0.3668,
+      "step": 1448
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.1142472778200756,
+      "learning_rate": 1.7343021169148554e-06,
+      "loss": 0.3745,
+      "step": 1449
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9058887276269199,
+      "learning_rate": 1.7311901445021955e-06,
+      "loss": 0.3818,
+      "step": 1450
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 2.0622661899571666,
+      "learning_rate": 1.7280794872314499e-06,
+      "loss": 0.3961,
+      "step": 1451
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8962754770592172,
+      "learning_rate": 1.7249701504237737e-06,
+      "loss": 0.3586,
+      "step": 1452
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8165490259194481,
+      "learning_rate": 1.7218621393980606e-06,
+      "loss": 0.3311,
+      "step": 1453
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9977375977133494,
+      "learning_rate": 1.7187554594709396e-06,
+      "loss": 0.3674,
+      "step": 1454
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8504323227168384,
+      "learning_rate": 1.7156501159567607e-06,
+      "loss": 0.3743,
+      "step": 1455
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9541250949627105,
+      "learning_rate": 1.7125461141675881e-06,
+      "loss": 0.3812,
+      "step": 1456
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.993766367538168,
+      "learning_rate": 1.7094434594131914e-06,
+      "loss": 0.355,
+      "step": 1457
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.851815452351873,
+      "learning_rate": 1.7063421570010349e-06,
+      "loss": 0.3792,
+      "step": 1458
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8699896985814497,
+      "learning_rate": 1.7032422122362704e-06,
+      "loss": 0.345,
+      "step": 1459
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.941362367589001,
+      "learning_rate": 1.700143630421727e-06,
+      "loss": 0.3735,
+      "step": 1460
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.844833441576945,
+      "learning_rate": 1.6970464168579034e-06,
+      "loss": 0.3883,
+      "step": 1461
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9382330200940399,
+      "learning_rate": 1.6939505768429548e-06,
+      "loss": 0.3451,
+      "step": 1462
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9404379114850492,
+      "learning_rate": 1.6908561156726894e-06,
+      "loss": 0.3886,
+      "step": 1463
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.89967752240511,
+      "learning_rate": 1.6877630386405567e-06,
+      "loss": 0.4322,
+      "step": 1464
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9542258627644085,
+      "learning_rate": 1.6846713510376363e-06,
+      "loss": 0.4143,
+      "step": 1465
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 2.0224476812069305,
+      "learning_rate": 1.6815810581526337e-06,
+      "loss": 0.3885,
+      "step": 1466
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9984358815769925,
+      "learning_rate": 1.6784921652718666e-06,
+      "loss": 0.326,
+      "step": 1467
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9112545672749313,
+      "learning_rate": 1.675404677679259e-06,
+      "loss": 0.3818,
+      "step": 1468
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.8535662369823578,
+      "learning_rate": 1.6723186006563309e-06,
+      "loss": 0.348,
+      "step": 1469
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9484817526163822,
+      "learning_rate": 1.6692339394821877e-06,
+      "loss": 0.3357,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.898163029912662,
+      "learning_rate": 1.6661506994335164e-06,
+      "loss": 0.3755,
+      "step": 1471
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.8795795559493234,
+      "learning_rate": 1.6630688857845678e-06,
+      "loss": 0.3616,
+      "step": 1472
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9167503410588418,
+      "learning_rate": 1.6599885038071566e-06,
+      "loss": 0.3592,
+      "step": 1473
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9765253259894953,
+      "learning_rate": 1.6569095587706485e-06,
+      "loss": 0.3953,
+      "step": 1474
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9352433621405845,
+      "learning_rate": 1.6538320559419488e-06,
+      "loss": 0.3528,
+      "step": 1475
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 2.0111021011512125,
+      "learning_rate": 1.6507560005854977e-06,
+      "loss": 0.407,
+      "step": 1476
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8339393905209536,
+      "learning_rate": 1.6476813979632589e-06,
+      "loss": 0.3668,
+      "step": 1477
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9309495145983575,
+      "learning_rate": 1.6446082533347096e-06,
+      "loss": 0.4106,
+      "step": 1478
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8708341753950297,
+      "learning_rate": 1.641536571956835e-06,
+      "loss": 0.3749,
+      "step": 1479
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8244009733234272,
+      "learning_rate": 1.6384663590841154e-06,
+      "loss": 0.3832,
+      "step": 1480
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8878853394194013,
+      "learning_rate": 1.6353976199685222e-06,
+      "loss": 0.3539,
+      "step": 1481
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8830734244466278,
+      "learning_rate": 1.6323303598595006e-06,
+      "loss": 0.3852,
+      "step": 1482
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.866253132730359,
+      "learning_rate": 1.6292645840039697e-06,
+      "loss": 0.364,
+      "step": 1483
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.977321954101075,
+      "learning_rate": 1.6262002976463098e-06,
+      "loss": 0.3866,
+      "step": 1484
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9753878011905568,
+      "learning_rate": 1.62313750602835e-06,
+      "loss": 0.3999,
+      "step": 1485
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9461948334927384,
+      "learning_rate": 1.6200762143893659e-06,
+      "loss": 0.3769,
+      "step": 1486
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9597078370114984,
+      "learning_rate": 1.6170164279660656e-06,
+      "loss": 0.3546,
+      "step": 1487
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0333727955548735,
+      "learning_rate": 1.6139581519925818e-06,
+      "loss": 0.3631,
+      "step": 1488
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8957200128798963,
+      "learning_rate": 1.6109013917004657e-06,
+      "loss": 0.3738,
+      "step": 1489
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8758015207075704,
+      "learning_rate": 1.6078461523186722e-06,
+      "loss": 0.3511,
+      "step": 1490
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9539261883496823,
+      "learning_rate": 1.6047924390735587e-06,
+      "loss": 0.4074,
+      "step": 1491
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.046216911945662,
+      "learning_rate": 1.6017402571888677e-06,
+      "loss": 0.3729,
+      "step": 1492
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0334239477316194,
+      "learning_rate": 1.5986896118857247e-06,
+      "loss": 0.3999,
+      "step": 1493
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0768274033669556,
+      "learning_rate": 1.5956405083826266e-06,
+      "loss": 0.3982,
+      "step": 1494
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9997134218487143,
+      "learning_rate": 1.592592951895432e-06,
+      "loss": 0.4319,
+      "step": 1495
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9000589337955354,
+      "learning_rate": 1.5895469476373545e-06,
+      "loss": 0.3813,
+      "step": 1496
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8787692854188953,
+      "learning_rate": 1.5865025008189501e-06,
+      "loss": 0.3801,
+      "step": 1497
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8346902202639779,
+      "learning_rate": 1.5834596166481132e-06,
+      "loss": 0.3533,
+      "step": 1498
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8993496821666367,
+      "learning_rate": 1.5804183003300627e-06,
+      "loss": 0.429,
+      "step": 1499
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.342530229905022,
+      "learning_rate": 1.5773785570673378e-06,
+      "loss": 0.3356,
+      "step": 1500
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.1048882391009127,
+      "learning_rate": 1.5743403920597856e-06,
+      "loss": 0.3896,
+      "step": 1501
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8528209728378324,
+      "learning_rate": 1.5713038105045535e-06,
+      "loss": 0.3307,
+      "step": 1502
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.9057632190431548,
+      "learning_rate": 1.5682688175960797e-06,
+      "loss": 0.3806,
+      "step": 1503
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8724905465304538,
+      "learning_rate": 1.5652354185260848e-06,
+      "loss": 0.3637,
+      "step": 1504
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8484069152287292,
+      "learning_rate": 1.5622036184835648e-06,
+      "loss": 0.3161,
+      "step": 1505
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8399814687678377,
+      "learning_rate": 1.559173422654778e-06,
+      "loss": 0.3745,
+      "step": 1506
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8838641942793775,
+      "learning_rate": 1.5561448362232404e-06,
+      "loss": 0.3537,
+      "step": 1507
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8623848433104377,
+      "learning_rate": 1.5531178643697142e-06,
+      "loss": 0.3624,
+      "step": 1508
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8997144759052735,
+      "learning_rate": 1.5500925122721988e-06,
+      "loss": 0.3679,
+      "step": 1509
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8976582272389906,
+      "learning_rate": 1.5470687851059235e-06,
+      "loss": 0.3736,
+      "step": 1510
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8750760623537808,
+      "learning_rate": 1.5440466880433388e-06,
+      "loss": 0.3735,
+      "step": 1511
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.990180186983658,
+      "learning_rate": 1.5410262262541065e-06,
+      "loss": 0.3797,
+      "step": 1512
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8820633605632435,
+      "learning_rate": 1.538007404905089e-06,
+      "loss": 0.3659,
+      "step": 1513
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9458293982836543,
+      "learning_rate": 1.5349902291603441e-06,
+      "loss": 0.4092,
+      "step": 1514
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.822097097325058,
+      "learning_rate": 1.5319747041811158e-06,
+      "loss": 0.3276,
+      "step": 1515
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 2.0516824372881457,
+      "learning_rate": 1.528960835125822e-06,
+      "loss": 0.4232,
+      "step": 1516
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 2.0624060387577816,
+      "learning_rate": 1.5259486271500489e-06,
+      "loss": 0.3996,
+      "step": 1517
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9158764361943028,
+      "learning_rate": 1.522938085406542e-06,
+      "loss": 0.3728,
+      "step": 1518
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9071590654189663,
+      "learning_rate": 1.5199292150451956e-06,
+      "loss": 0.3459,
+      "step": 1519
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9532115896688163,
+      "learning_rate": 1.5169220212130449e-06,
+      "loss": 0.3513,
+      "step": 1520
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9901825773245059,
+      "learning_rate": 1.5139165090542574e-06,
+      "loss": 0.3468,
+      "step": 1521
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.7913388603914477,
+      "learning_rate": 1.510912683710124e-06,
+      "loss": 0.3381,
+      "step": 1522
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8270379040698477,
+      "learning_rate": 1.5079105503190497e-06,
+      "loss": 0.3873,
+      "step": 1523
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9259224146444094,
+      "learning_rate": 1.5049101140165453e-06,
+      "loss": 0.3553,
+      "step": 1524
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.7933642267566716,
+      "learning_rate": 1.501911379935219e-06,
+      "loss": 0.3928,
+      "step": 1525
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.859002957520952,
+      "learning_rate": 1.498914353204767e-06,
+      "loss": 0.3331,
+      "step": 1526
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9280095918192017,
+      "learning_rate": 1.4959190389519646e-06,
+      "loss": 0.3902,
+      "step": 1527
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9929705610530277,
+      "learning_rate": 1.492925442300658e-06,
+      "loss": 0.3765,
+      "step": 1528
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.02617558936789,
+      "learning_rate": 1.4899335683717546e-06,
+      "loss": 0.3815,
+      "step": 1529
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8532248246777345,
+      "learning_rate": 1.4869434222832157e-06,
+      "loss": 0.3998,
+      "step": 1530
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8616511215661515,
+      "learning_rate": 1.4839550091500464e-06,
+      "loss": 0.4005,
+      "step": 1531
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9696593290003677,
+      "learning_rate": 1.4809683340842885e-06,
+      "loss": 0.4136,
+      "step": 1532
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9439323576237217,
+      "learning_rate": 1.477983402195008e-06,
+      "loss": 0.3674,
+      "step": 1533
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8858064066643994,
+      "learning_rate": 1.475000218588291e-06,
+      "loss": 0.3505,
+      "step": 1534
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9565923900750009,
+      "learning_rate": 1.4720187883672337e-06,
+      "loss": 0.379,
+      "step": 1535
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9482950589580994,
+      "learning_rate": 1.4690391166319307e-06,
+      "loss": 0.3962,
+      "step": 1536
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.979462387227227,
+      "learning_rate": 1.4660612084794701e-06,
+      "loss": 0.3662,
+      "step": 1537
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.894203355197371,
+      "learning_rate": 1.4630850690039221e-06,
+      "loss": 0.3703,
+      "step": 1538
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.8798042105520323,
+      "learning_rate": 1.460110703296333e-06,
+      "loss": 0.3631,
+      "step": 1539
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9687008779986372,
+      "learning_rate": 1.4571381164447137e-06,
+      "loss": 0.4081,
+      "step": 1540
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 2.043706332156422,
+      "learning_rate": 1.454167313534031e-06,
+      "loss": 0.3629,
+      "step": 1541
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9336401989651433,
+      "learning_rate": 1.4511982996462038e-06,
+      "loss": 0.4042,
+      "step": 1542
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9550529998108908,
+      "learning_rate": 1.4482310798600852e-06,
+      "loss": 0.3768,
+      "step": 1543
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.874147928818456,
+      "learning_rate": 1.4452656592514633e-06,
+      "loss": 0.4125,
+      "step": 1544
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.848295970105597,
+      "learning_rate": 1.442302042893048e-06,
+      "loss": 0.3646,
+      "step": 1545
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.991422406332833,
+      "learning_rate": 1.439340235854462e-06,
+      "loss": 0.3885,
+      "step": 1546
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.89855710617557,
+      "learning_rate": 1.436380243202233e-06,
+      "loss": 0.3658,
+      "step": 1547
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8657910310229384,
+      "learning_rate": 1.4334220699997856e-06,
+      "loss": 0.3659,
+      "step": 1548
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9035891506078888,
+      "learning_rate": 1.4304657213074314e-06,
+      "loss": 0.3662,
+      "step": 1549
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9026573701280374,
+      "learning_rate": 1.4275112021823618e-06,
+      "loss": 0.3712,
+      "step": 1550
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9342408780305267,
+      "learning_rate": 1.4245585176786363e-06,
+      "loss": 0.355,
+      "step": 1551
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8785254217068754,
+      "learning_rate": 1.4216076728471794e-06,
+      "loss": 0.3985,
+      "step": 1552
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9602955113202258,
+      "learning_rate": 1.4186586727357649e-06,
+      "loss": 0.4063,
+      "step": 1553
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 2.083823151902659,
+      "learning_rate": 1.4157115223890136e-06,
+      "loss": 0.4121,
+      "step": 1554
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8676871403375772,
+      "learning_rate": 1.4127662268483818e-06,
+      "loss": 0.3912,
+      "step": 1555
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9120128683776039,
+      "learning_rate": 1.4098227911521523e-06,
+      "loss": 0.3453,
+      "step": 1556
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.844790264464269,
+      "learning_rate": 1.4068812203354264e-06,
+      "loss": 0.3666,
+      "step": 1557
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8477236162312085,
+      "learning_rate": 1.4039415194301159e-06,
+      "loss": 0.3652,
+      "step": 1558
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9200270211079769,
+      "learning_rate": 1.4010036934649334e-06,
+      "loss": 0.3755,
+      "step": 1559
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8353558471804892,
+      "learning_rate": 1.3980677474653838e-06,
+      "loss": 0.3653,
+      "step": 1560
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9621989060334357,
+      "learning_rate": 1.3951336864537572e-06,
+      "loss": 0.4104,
+      "step": 1561
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8245538722983388,
+      "learning_rate": 1.3922015154491194e-06,
+      "loss": 0.3991,
+      "step": 1562
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.933539870056334,
+      "learning_rate": 1.3892712394673002e-06,
+      "loss": 0.3877,
+      "step": 1563
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8275785324682217,
+      "learning_rate": 1.3863428635208915e-06,
+      "loss": 0.3546,
+      "step": 1564
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.0450836317829215,
+      "learning_rate": 1.3834163926192318e-06,
+      "loss": 0.3847,
+      "step": 1565
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 3.523986698344347,
+      "learning_rate": 1.380491831768403e-06,
+      "loss": 0.3502,
+      "step": 1566
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9164812764116064,
+      "learning_rate": 1.3775691859712193e-06,
+      "loss": 0.309,
+      "step": 1567
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.0951493120042604,
+      "learning_rate": 1.3746484602272178e-06,
+      "loss": 0.3678,
+      "step": 1568
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8843177010635455,
+      "learning_rate": 1.3717296595326527e-06,
+      "loss": 0.358,
+      "step": 1569
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9562282189438478,
+      "learning_rate": 1.3688127888804837e-06,
+      "loss": 0.4021,
+      "step": 1570
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.997781626544885,
+      "learning_rate": 1.36589785326037e-06,
+      "loss": 0.4158,
+      "step": 1571
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.8805954764404564,
+      "learning_rate": 1.3629848576586604e-06,
+      "loss": 0.3678,
+      "step": 1572
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.037723153555198,
+      "learning_rate": 1.3600738070583858e-06,
+      "loss": 0.3611,
+      "step": 1573
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9504207408498462,
+      "learning_rate": 1.3571647064392467e-06,
+      "loss": 0.4096,
+      "step": 1574
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0573869926356494,
+      "learning_rate": 1.3542575607776117e-06,
+      "loss": 0.3698,
+      "step": 1575
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9648011988919714,
+      "learning_rate": 1.3513523750465049e-06,
+      "loss": 0.3557,
+      "step": 1576
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0566628239070077,
+      "learning_rate": 1.3484491542155941e-06,
+      "loss": 0.4099,
+      "step": 1577
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.840088910062188,
+      "learning_rate": 1.3455479032511903e-06,
+      "loss": 0.3759,
+      "step": 1578
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.916068103431673,
+      "learning_rate": 1.3426486271162326e-06,
+      "loss": 0.36,
+      "step": 1579
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.932989091441797,
+      "learning_rate": 1.3397513307702817e-06,
+      "loss": 0.3658,
+      "step": 1580
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.8629067871512175,
+      "learning_rate": 1.3368560191695126e-06,
+      "loss": 0.3562,
+      "step": 1581
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0118302341661307,
+      "learning_rate": 1.3339626972667048e-06,
+      "loss": 0.3878,
+      "step": 1582
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9124583307461076,
+      "learning_rate": 1.3310713700112348e-06,
+      "loss": 0.3809,
+      "step": 1583
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9774861213509043,
+      "learning_rate": 1.328182042349065e-06,
+      "loss": 0.4137,
+      "step": 1584
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9114216906066048,
+      "learning_rate": 1.3252947192227388e-06,
+      "loss": 0.3837,
+      "step": 1585
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8560468375199388,
+      "learning_rate": 1.3224094055713713e-06,
+      "loss": 0.3603,
+      "step": 1586
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9212128604014926,
+      "learning_rate": 1.3195261063306381e-06,
+      "loss": 0.3458,
+      "step": 1587
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9251208352537634,
+      "learning_rate": 1.316644826432772e-06,
+      "loss": 0.3844,
+      "step": 1588
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.883081065391109,
+      "learning_rate": 1.313765570806547e-06,
+      "loss": 0.4208,
+      "step": 1589
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8564972529452957,
+      "learning_rate": 1.3108883443772779e-06,
+      "loss": 0.3622,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9725309818034906,
+      "learning_rate": 1.3080131520668075e-06,
+      "loss": 0.3489,
+      "step": 1591
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9747828638689664,
+      "learning_rate": 1.3051399987934988e-06,
+      "loss": 0.38,
+      "step": 1592
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8498395134731278,
+      "learning_rate": 1.3022688894722271e-06,
+      "loss": 0.3797,
+      "step": 1593
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8845414148933772,
+      "learning_rate": 1.2993998290143698e-06,
+      "loss": 0.3335,
+      "step": 1594
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9610318168301932,
+      "learning_rate": 1.296532822327801e-06,
+      "loss": 0.3769,
+      "step": 1595
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8917429842068785,
+      "learning_rate": 1.2936678743168813e-06,
+      "loss": 0.3981,
+      "step": 1596
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.005525949740854,
+      "learning_rate": 1.29080498988245e-06,
+      "loss": 0.3789,
+      "step": 1597
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.891996578027132,
+      "learning_rate": 1.2879441739218152e-06,
+      "loss": 0.3906,
+      "step": 1598
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.0224573297517114,
+      "learning_rate": 1.285085431328748e-06,
+      "loss": 0.3852,
+      "step": 1599
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9933585122271171,
+      "learning_rate": 1.282228766993472e-06,
+      "loss": 0.3811,
+      "step": 1600
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9655771579152717,
+      "learning_rate": 1.2793741858026565e-06,
+      "loss": 0.3799,
+      "step": 1601
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8953068551718162,
+      "learning_rate": 1.2765216926394047e-06,
+      "loss": 0.3508,
+      "step": 1602
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8702448937265155,
+      "learning_rate": 1.2736712923832526e-06,
+      "loss": 0.3427,
+      "step": 1603
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9279047888369216,
+      "learning_rate": 1.2708229899101505e-06,
+      "loss": 0.3755,
+      "step": 1604
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8867926377124098,
+      "learning_rate": 1.2679767900924647e-06,
+      "loss": 0.3366,
+      "step": 1605
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8256946570291102,
+      "learning_rate": 1.2651326977989629e-06,
+      "loss": 0.3419,
+      "step": 1606
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8845001674022432,
+      "learning_rate": 1.2622907178948074e-06,
+      "loss": 0.3593,
+      "step": 1607
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.8041550297275601,
+      "learning_rate": 1.2594508552415474e-06,
+      "loss": 0.3565,
+      "step": 1608
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.929162466271085,
+      "learning_rate": 1.2566131146971105e-06,
+      "loss": 0.346,
+      "step": 1609
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9783530922620556,
+      "learning_rate": 1.2537775011157943e-06,
+      "loss": 0.3655,
+      "step": 1610
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9493980516637623,
+      "learning_rate": 1.2509440193482564e-06,
+      "loss": 0.417,
+      "step": 1611
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.8895548928491517,
+      "learning_rate": 1.2481126742415098e-06,
+      "loss": 0.3731,
+      "step": 1612
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.874868433424839,
+      "learning_rate": 1.2452834706389122e-06,
+      "loss": 0.3743,
+      "step": 1613
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.917114604759422,
+      "learning_rate": 1.2424564133801553e-06,
+      "loss": 0.3412,
+      "step": 1614
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9354723425395528,
+      "learning_rate": 1.2396315073012636e-06,
+      "loss": 0.3564,
+      "step": 1615
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9621850514310992,
+      "learning_rate": 1.2368087572345772e-06,
+      "loss": 0.348,
+      "step": 1616
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 2.058589411316211,
+      "learning_rate": 1.233988168008751e-06,
+      "loss": 0.3679,
+      "step": 1617
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9516795286397743,
+      "learning_rate": 1.2311697444487431e-06,
+      "loss": 0.3635,
+      "step": 1618
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9233248775745249,
+      "learning_rate": 1.2283534913758066e-06,
+      "loss": 0.3957,
+      "step": 1619
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9303786560618386,
+      "learning_rate": 1.225539413607482e-06,
+      "loss": 0.3806,
+      "step": 1620
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.030744520145863,
+      "learning_rate": 1.222727515957588e-06,
+      "loss": 0.4023,
+      "step": 1621
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9537051918570292,
+      "learning_rate": 1.2199178032362149e-06,
+      "loss": 0.3808,
+      "step": 1622
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.8928085054817043,
+      "learning_rate": 1.2171102802497148e-06,
+      "loss": 0.3982,
+      "step": 1623
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.0571192296380296,
+      "learning_rate": 1.2143049518006952e-06,
+      "loss": 0.4044,
+      "step": 1624
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.856402590326006,
+      "learning_rate": 1.2115018226880063e-06,
+      "loss": 0.3977,
+      "step": 1625
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.927548078890778,
+      "learning_rate": 1.208700897706739e-06,
+      "loss": 0.4048,
+      "step": 1626
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9400375481531664,
+      "learning_rate": 1.205902181648215e-06,
+      "loss": 0.3605,
+      "step": 1627
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.873775466516257,
+      "learning_rate": 1.2031056792999726e-06,
+      "loss": 0.3375,
+      "step": 1628
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9913863168589552,
+      "learning_rate": 1.2003113954457673e-06,
+      "loss": 0.3964,
+      "step": 1629
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9685736172926571,
+      "learning_rate": 1.1975193348655584e-06,
+      "loss": 0.3587,
+      "step": 1630
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.8698671252931964,
+      "learning_rate": 1.1947295023355022e-06,
+      "loss": 0.3568,
+      "step": 1631
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9615330930141146,
+      "learning_rate": 1.1919419026279434e-06,
+      "loss": 0.385,
+      "step": 1632
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8699401980633292,
+      "learning_rate": 1.189156540511407e-06,
+      "loss": 0.362,
+      "step": 1633
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.054845402143213,
+      "learning_rate": 1.186373420750592e-06,
+      "loss": 0.3746,
+      "step": 1634
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.828582523525085,
+      "learning_rate": 1.1835925481063575e-06,
+      "loss": 0.3915,
+      "step": 1635
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9369510226251998,
+      "learning_rate": 1.1808139273357232e-06,
+      "loss": 0.3736,
+      "step": 1636
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9623351823945685,
+      "learning_rate": 1.1780375631918544e-06,
+      "loss": 0.3861,
+      "step": 1637
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.057951803903781,
+      "learning_rate": 1.1752634604240565e-06,
+      "loss": 0.3988,
+      "step": 1638
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.926766218075179,
+      "learning_rate": 1.1724916237777675e-06,
+      "loss": 0.3526,
+      "step": 1639
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8312750701887877,
+      "learning_rate": 1.1697220579945466e-06,
+      "loss": 0.3518,
+      "step": 1640
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.025004547929062,
+      "learning_rate": 1.1669547678120701e-06,
+      "loss": 0.3651,
+      "step": 1641
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.3363123335351874,
+      "learning_rate": 1.1641897579641221e-06,
+      "loss": 0.4033,
+      "step": 1642
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8749245234784346,
+      "learning_rate": 1.1614270331805844e-06,
+      "loss": 0.3701,
+      "step": 1643
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8332454151429327,
+      "learning_rate": 1.1586665981874323e-06,
+      "loss": 0.3911,
+      "step": 1644
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.217946505455479,
+      "learning_rate": 1.1559084577067206e-06,
+      "loss": 0.3346,
+      "step": 1645
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.799776695931742,
+      "learning_rate": 1.1531526164565816e-06,
+      "loss": 0.3489,
+      "step": 1646
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.9376934559686718,
+      "learning_rate": 1.150399079151214e-06,
+      "loss": 0.3721,
+      "step": 1647
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.826040524283735,
+      "learning_rate": 1.1476478505008753e-06,
+      "loss": 0.3464,
+      "step": 1648
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.9007570045973046,
+      "learning_rate": 1.144898935211874e-06,
+      "loss": 0.3859,
+      "step": 1649
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.1474984005060334,
+      "learning_rate": 1.1421523379865603e-06,
+      "loss": 0.3456,
+      "step": 1650
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.843189989485683,
+      "learning_rate": 1.1394080635233204e-06,
+      "loss": 0.3052,
+      "step": 1651
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.009903889503656,
+      "learning_rate": 1.136666116516567e-06,
+      "loss": 0.4498,
+      "step": 1652
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 3.0285468769549473,
+      "learning_rate": 1.1339265016567294e-06,
+      "loss": 0.3532,
+      "step": 1653
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8725192886740858,
+      "learning_rate": 1.1311892236302508e-06,
+      "loss": 0.3685,
+      "step": 1654
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8726862166869487,
+      "learning_rate": 1.128454287119573e-06,
+      "loss": 0.3761,
+      "step": 1655
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8883766624211467,
+      "learning_rate": 1.1257216968031357e-06,
+      "loss": 0.3574,
+      "step": 1656
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9004020165185547,
+      "learning_rate": 1.1229914573553641e-06,
+      "loss": 0.3638,
+      "step": 1657
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8723325311418417,
+      "learning_rate": 1.1202635734466612e-06,
+      "loss": 0.3468,
+      "step": 1658
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.804021127084218,
+      "learning_rate": 1.1175380497434022e-06,
+      "loss": 0.3534,
+      "step": 1659
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8962566852248846,
+      "learning_rate": 1.1148148909079229e-06,
+      "loss": 0.3943,
+      "step": 1660
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9982200928541012,
+      "learning_rate": 1.1120941015985152e-06,
+      "loss": 0.4224,
+      "step": 1661
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8053179049263286,
+      "learning_rate": 1.109375686469417e-06,
+      "loss": 0.3389,
+      "step": 1662
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.888467793597335,
+      "learning_rate": 1.106659650170805e-06,
+      "loss": 0.387,
+      "step": 1663
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8685159814187862,
+      "learning_rate": 1.1039459973487876e-06,
+      "loss": 0.3428,
+      "step": 1664
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.972180811818148,
+      "learning_rate": 1.101234732645393e-06,
+      "loss": 0.3587,
+      "step": 1665
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 2.252459557872569,
+      "learning_rate": 1.0985258606985683e-06,
+      "loss": 0.3684,
+      "step": 1666
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9679034729828595,
+      "learning_rate": 1.0958193861421634e-06,
+      "loss": 0.338,
+      "step": 1667
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9117564762331398,
+      "learning_rate": 1.0931153136059304e-06,
+      "loss": 0.4016,
+      "step": 1668
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.915297031471561,
+      "learning_rate": 1.0904136477155112e-06,
+      "loss": 0.3629,
+      "step": 1669
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8376703588677337,
+      "learning_rate": 1.0877143930924306e-06,
+      "loss": 0.371,
+      "step": 1670
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9070422380758454,
+      "learning_rate": 1.085017554354089e-06,
+      "loss": 0.3533,
+      "step": 1671
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9752631861235486,
+      "learning_rate": 1.0823231361137543e-06,
+      "loss": 0.4164,
+      "step": 1672
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.885197204563304,
+      "learning_rate": 1.0796311429805536e-06,
+      "loss": 0.3929,
+      "step": 1673
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9090106863841916,
+      "learning_rate": 1.0769415795594659e-06,
+      "loss": 0.3449,
+      "step": 1674
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 2.022637519082336,
+      "learning_rate": 1.074254450451314e-06,
+      "loss": 0.3553,
+      "step": 1675
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.942217527277708,
+      "learning_rate": 1.0715697602527542e-06,
+      "loss": 0.3936,
+      "step": 1676
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8809306152215932,
+      "learning_rate": 1.0688875135562738e-06,
+      "loss": 0.3481,
+      "step": 1677
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 2.0969194462234513,
+      "learning_rate": 1.0662077149501798e-06,
+      "loss": 0.3864,
+      "step": 1678
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8365124296835973,
+      "learning_rate": 1.0635303690185894e-06,
+      "loss": 0.3778,
+      "step": 1679
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9221630207347382,
+      "learning_rate": 1.0608554803414256e-06,
+      "loss": 0.3443,
+      "step": 1680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9319799829762891,
+      "learning_rate": 1.0581830534944084e-06,
+      "loss": 0.3759,
+      "step": 1681
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 2.00532761754314,
+      "learning_rate": 1.055513093049046e-06,
+      "loss": 0.373,
+      "step": 1682
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8361577324130107,
+      "learning_rate": 1.052845603572627e-06,
+      "loss": 0.3671,
+      "step": 1683
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9246365496147386,
+      "learning_rate": 1.0501805896282144e-06,
+      "loss": 0.3888,
+      "step": 1684
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.933677406014513,
+      "learning_rate": 1.047518055774636e-06,
+      "loss": 0.428,
+      "step": 1685
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8497481971894003,
+      "learning_rate": 1.0448580065664754e-06,
+      "loss": 0.339,
+      "step": 1686
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9674163310656592,
+      "learning_rate": 1.042200446554068e-06,
+      "loss": 0.3933,
+      "step": 1687
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8703345670634528,
+      "learning_rate": 1.039545380283491e-06,
+      "loss": 0.3805,
+      "step": 1688
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8996794102359933,
+      "learning_rate": 1.0368928122965547e-06,
+      "loss": 0.3612,
+      "step": 1689
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8163372630466865,
+      "learning_rate": 1.0342427471307973e-06,
+      "loss": 0.3631,
+      "step": 1690
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8990581755942872,
+      "learning_rate": 1.031595189319473e-06,
+      "loss": 0.4539,
+      "step": 1691
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.9101558963616596,
+      "learning_rate": 1.0289501433915493e-06,
+      "loss": 0.4649,
+      "step": 1692
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8873611659348446,
+      "learning_rate": 1.0263076138716962e-06,
+      "loss": 0.3649,
+      "step": 1693
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.823482013352725,
+      "learning_rate": 1.0236676052802791e-06,
+      "loss": 0.3648,
+      "step": 1694
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8931382792204232,
+      "learning_rate": 1.0210301221333512e-06,
+      "loss": 0.3589,
+      "step": 1695
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 2.0713580311911355,
+      "learning_rate": 1.0183951689426438e-06,
+      "loss": 0.3474,
+      "step": 1696
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8607620741027457,
+      "learning_rate": 1.0157627502155632e-06,
+      "loss": 0.3773,
+      "step": 1697
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8645944548746636,
+      "learning_rate": 1.0131328704551782e-06,
+      "loss": 0.3457,
+      "step": 1698
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.852711235772826,
+      "learning_rate": 1.0105055341602153e-06,
+      "loss": 0.3559,
+      "step": 1699
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.969084245230365,
+      "learning_rate": 1.00788074582505e-06,
+      "loss": 0.3786,
+      "step": 1700
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.939185809703108,
+      "learning_rate": 1.005258509939699e-06,
+      "loss": 0.3649,
+      "step": 1701
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.9104875321871906,
+      "learning_rate": 1.0026388309898132e-06,
+      "loss": 0.388,
+      "step": 1702
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 2.161662535348609,
+      "learning_rate": 1.0000217134566694e-06,
+      "loss": 0.3692,
+      "step": 1703
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8024704245485432,
+      "learning_rate": 9.974071618171613e-07,
+      "loss": 0.3751,
+      "step": 1704
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.7739470701867779,
+      "learning_rate": 9.94795180543796e-07,
+      "loss": 0.3373,
+      "step": 1705
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8188576734630457,
+      "learning_rate": 9.921857741046806e-07,
+      "loss": 0.3945,
+      "step": 1706
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.9054961265186567,
+      "learning_rate": 9.895789469635204e-07,
+      "loss": 0.3518,
+      "step": 1707
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8782724635395873,
+      "learning_rate": 9.869747035796071e-07,
+      "loss": 0.37,
+      "step": 1708
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.838615529167183,
+      "learning_rate": 9.843730484078128e-07,
+      "loss": 0.3376,
+      "step": 1709
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.785535753238471,
+      "learning_rate": 9.817739858985828e-07,
+      "loss": 0.337,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8535882977550358,
+      "learning_rate": 9.791775204979263e-07,
+      "loss": 0.3391,
+      "step": 1711
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.882614515071742,
+      "learning_rate": 9.765836566474105e-07,
+      "loss": 0.391,
+      "step": 1712
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8285960302994975,
+      "learning_rate": 9.739923987841518e-07,
+      "loss": 0.356,
+      "step": 1713
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8054856197120326,
+      "learning_rate": 9.714037513408093e-07,
+      "loss": 0.3623,
+      "step": 1714
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8671208649893825,
+      "learning_rate": 9.68817718745577e-07,
+      "loss": 0.3693,
+      "step": 1715
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.9004503058230886,
+      "learning_rate": 9.662343054221743e-07,
+      "loss": 0.3327,
+      "step": 1716
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9148999919712566,
+      "learning_rate": 9.636535157898422e-07,
+      "loss": 0.3618,
+      "step": 1717
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8635582232372712,
+      "learning_rate": 9.610753542633309e-07,
+      "loss": 0.3884,
+      "step": 1718
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9383472683274976,
+      "learning_rate": 9.58499825252897e-07,
+      "loss": 0.3953,
+      "step": 1719
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.946035726357351,
+      "learning_rate": 9.559269331642937e-07,
+      "loss": 0.3292,
+      "step": 1720
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8700217872447233,
+      "learning_rate": 9.533566823987628e-07,
+      "loss": 0.361,
+      "step": 1721
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8900223904453795,
+      "learning_rate": 9.507890773530276e-07,
+      "loss": 0.3349,
+      "step": 1722
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9125828500996216,
+      "learning_rate": 9.482241224192867e-07,
+      "loss": 0.3641,
+      "step": 1723
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.940533327906808,
+      "learning_rate": 9.456618219852042e-07,
+      "loss": 0.4036,
+      "step": 1724
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.0712298544333687,
+      "learning_rate": 9.431021804339047e-07,
+      "loss": 0.3934,
+      "step": 1725
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8791027421557622,
+      "learning_rate": 9.40545202143962e-07,
+      "loss": 0.3507,
+      "step": 1726
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9686923479849525,
+      "learning_rate": 9.379908914893962e-07,
+      "loss": 0.3497,
+      "step": 1727
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.0437193308441253,
+      "learning_rate": 9.354392528396638e-07,
+      "loss": 0.395,
+      "step": 1728
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.864988214025856,
+      "learning_rate": 9.328902905596512e-07,
+      "loss": 0.379,
+      "step": 1729
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.963062444850751,
+      "learning_rate": 9.303440090096633e-07,
+      "loss": 0.3565,
+      "step": 1730
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9399170798660286,
+      "learning_rate": 9.278004125454232e-07,
+      "loss": 0.415,
+      "step": 1731
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.874726297624515,
+      "learning_rate": 9.252595055180585e-07,
+      "loss": 0.3606,
+      "step": 1732
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.834934005776965,
+      "learning_rate": 9.227212922740971e-07,
+      "loss": 0.4104,
+      "step": 1733
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.8726418919835732,
+      "learning_rate": 9.20185777155459e-07,
+      "loss": 0.3325,
+      "step": 1734
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9432074923657174,
+      "learning_rate": 9.176529644994481e-07,
+      "loss": 0.3663,
+      "step": 1735
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.7937207452405413,
+      "learning_rate": 9.151228586387464e-07,
+      "loss": 0.3225,
+      "step": 1736
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.911607719176901,
+      "learning_rate": 9.125954639014037e-07,
+      "loss": 0.3491,
+      "step": 1737
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.8954594851178048,
+      "learning_rate": 9.100707846108337e-07,
+      "loss": 0.3474,
+      "step": 1738
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9081066235083353,
+      "learning_rate": 9.075488250858047e-07,
+      "loss": 0.3654,
+      "step": 1739
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9384836973235149,
+      "learning_rate": 9.050295896404326e-07,
+      "loss": 0.3519,
+      "step": 1740
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.9655302768136176,
+      "learning_rate": 9.02513082584173e-07,
+      "loss": 0.3482,
+      "step": 1741
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.900218584161994,
+      "learning_rate": 8.999993082218156e-07,
+      "loss": 0.3576,
+      "step": 1742
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 2.030742409886431,
+      "learning_rate": 8.974882708534724e-07,
+      "loss": 0.3055,
+      "step": 1743
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.865959678567607,
+      "learning_rate": 8.949799747745766e-07,
+      "loss": 0.3485,
+      "step": 1744
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8300998571759115,
+      "learning_rate": 8.924744242758707e-07,
+      "loss": 0.3412,
+      "step": 1745
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 2.3841641123937514,
+      "learning_rate": 8.899716236434019e-07,
+      "loss": 0.3484,
+      "step": 1746
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.844271076789803,
+      "learning_rate": 8.874715771585105e-07,
+      "loss": 0.3762,
+      "step": 1747
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8687696131042617,
+      "learning_rate": 8.84974289097828e-07,
+      "loss": 0.402,
+      "step": 1748
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.889973499535232,
+      "learning_rate": 8.824797637332669e-07,
+      "loss": 0.3566,
+      "step": 1749
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8681107208205963,
+      "learning_rate": 8.799880053320131e-07,
+      "loss": 0.4057,
+      "step": 1750
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8928327876139377,
+      "learning_rate": 8.774990181565201e-07,
+      "loss": 0.3784,
+      "step": 1751
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.931089236577729,
+      "learning_rate": 8.750128064645002e-07,
+      "loss": 0.4008,
+      "step": 1752
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9573581859995763,
+      "learning_rate": 8.725293745089181e-07,
+      "loss": 0.3486,
+      "step": 1753
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9164746693234396,
+      "learning_rate": 8.700487265379845e-07,
+      "loss": 0.3634,
+      "step": 1754
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.812159570787973,
+      "learning_rate": 8.675708667951446e-07,
+      "loss": 0.3476,
+      "step": 1755
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 2.0355096473340146,
+      "learning_rate": 8.650957995190784e-07,
+      "loss": 0.3562,
+      "step": 1756
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8995538618272807,
+      "learning_rate": 8.626235289436846e-07,
+      "loss": 0.3767,
+      "step": 1757
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8751894629115184,
+      "learning_rate": 8.601540592980812e-07,
+      "loss": 0.3709,
+      "step": 1758
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8772906072081945,
+      "learning_rate": 8.576873948065931e-07,
+      "loss": 0.3692,
+      "step": 1759
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.855725719743314,
+      "learning_rate": 8.552235396887479e-07,
+      "loss": 0.3461,
+      "step": 1760
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9058932387569096,
+      "learning_rate": 8.52762498159266e-07,
+      "loss": 0.4035,
+      "step": 1761
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8155999399280405,
+      "learning_rate": 8.503042744280565e-07,
+      "loss": 0.3821,
+      "step": 1762
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9191184065214926,
+      "learning_rate": 8.478488727002062e-07,
+      "loss": 0.4182,
+      "step": 1763
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8660511914055784,
+      "learning_rate": 8.453962971759766e-07,
+      "loss": 0.3936,
+      "step": 1764
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8559359079620885,
+      "learning_rate": 8.429465520507932e-07,
+      "loss": 0.3555,
+      "step": 1765
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.871625930259135,
+      "learning_rate": 8.404996415152414e-07,
+      "loss": 0.3336,
+      "step": 1766
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9146405985810966,
+      "learning_rate": 8.38055569755055e-07,
+      "loss": 0.3595,
+      "step": 1767
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8172916285896499,
+      "learning_rate": 8.356143409511145e-07,
+      "loss": 0.3763,
+      "step": 1768
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9045338434685268,
+      "learning_rate": 8.331759592794344e-07,
+      "loss": 0.3454,
+      "step": 1769
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9019450574908656,
+      "learning_rate": 8.307404289111618e-07,
+      "loss": 0.3782,
+      "step": 1770
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8040956687408418,
+      "learning_rate": 8.283077540125642e-07,
+      "loss": 0.3397,
+      "step": 1771
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8854623689371994,
+      "learning_rate": 8.258779387450258e-07,
+      "loss": 0.3632,
+      "step": 1772
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8703628366355571,
+      "learning_rate": 8.234509872650381e-07,
+      "loss": 0.3796,
+      "step": 1773
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8974382562927672,
+      "learning_rate": 8.210269037241945e-07,
+      "loss": 0.3577,
+      "step": 1774
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8041564148309792,
+      "learning_rate": 8.186056922691816e-07,
+      "loss": 0.3423,
+      "step": 1775
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8871513088592733,
+      "learning_rate": 8.161873570417742e-07,
+      "loss": 0.3724,
+      "step": 1776
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.7959090299202567,
+      "learning_rate": 8.137719021788248e-07,
+      "loss": 0.3514,
+      "step": 1777
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.77414937614363,
+      "learning_rate": 8.113593318122609e-07,
+      "loss": 0.3655,
+      "step": 1778
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8415138040355723,
+      "learning_rate": 8.089496500690747e-07,
+      "loss": 0.3469,
+      "step": 1779
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.943916626029921,
+      "learning_rate": 8.06542861071318e-07,
+      "loss": 0.3626,
+      "step": 1780
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.9699325195709307,
+      "learning_rate": 8.041389689360921e-07,
+      "loss": 0.3897,
+      "step": 1781
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8300758832916175,
+      "learning_rate": 8.01737977775545e-07,
+      "loss": 0.3528,
+      "step": 1782
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8854405268423242,
+      "learning_rate": 7.993398916968609e-07,
+      "loss": 0.3458,
+      "step": 1783
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8610707367327934,
+      "learning_rate": 7.969447148022555e-07,
+      "loss": 0.3825,
+      "step": 1784
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8761158349166456,
+      "learning_rate": 7.945524511889676e-07,
+      "loss": 0.361,
+      "step": 1785
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8316905966902863,
+      "learning_rate": 7.921631049492526e-07,
+      "loss": 0.3791,
+      "step": 1786
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8815617462853849,
+      "learning_rate": 7.897766801703754e-07,
+      "loss": 0.3334,
+      "step": 1787
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8069850793814037,
+      "learning_rate": 7.873931809346022e-07,
+      "loss": 0.3063,
+      "step": 1788
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.877897596569181,
+      "learning_rate": 7.850126113191961e-07,
+      "loss": 0.3551,
+      "step": 1789
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.933100704380605,
+      "learning_rate": 7.826349753964083e-07,
+      "loss": 0.4,
+      "step": 1790
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8588317568608963,
+      "learning_rate": 7.802602772334719e-07,
+      "loss": 0.3695,
+      "step": 1791
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.75903586927703,
+      "learning_rate": 7.778885208925943e-07,
+      "loss": 0.3334,
+      "step": 1792
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.847597726088611,
+      "learning_rate": 7.755197104309512e-07,
+      "loss": 0.3508,
+      "step": 1793
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8730373365521515,
+      "learning_rate": 7.731538499006767e-07,
+      "loss": 0.3727,
+      "step": 1794
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8696875894594878,
+      "learning_rate": 7.707909433488611e-07,
+      "loss": 0.3694,
+      "step": 1795
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8224097896476315,
+      "learning_rate": 7.684309948175414e-07,
+      "loss": 0.3682,
+      "step": 1796
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8896591788553188,
+      "learning_rate": 7.660740083436943e-07,
+      "loss": 0.353,
+      "step": 1797
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8622597363460462,
+      "learning_rate": 7.637199879592275e-07,
+      "loss": 0.3835,
+      "step": 1798
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8261440807434144,
+      "learning_rate": 7.61368937690978e-07,
+      "loss": 0.3673,
+      "step": 1799
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.86324753247062,
+      "learning_rate": 7.590208615607001e-07,
+      "loss": 0.3613,
+      "step": 1800
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8704051001710107,
+      "learning_rate": 7.566757635850608e-07,
+      "loss": 0.3756,
+      "step": 1801
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8547689419526656,
+      "learning_rate": 7.543336477756336e-07,
+      "loss": 0.3557,
+      "step": 1802
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8970591656145008,
+      "learning_rate": 7.519945181388893e-07,
+      "loss": 0.3713,
+      "step": 1803
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 2.034710049647413,
+      "learning_rate": 7.496583786761911e-07,
+      "loss": 0.379,
+      "step": 1804
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.7207339510591724,
+      "learning_rate": 7.47325233383788e-07,
+      "loss": 0.324,
+      "step": 1805
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8353430031672993,
+      "learning_rate": 7.449950862528046e-07,
+      "loss": 0.3688,
+      "step": 1806
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8248952138910253,
+      "learning_rate": 7.426679412692403e-07,
+      "loss": 0.3744,
+      "step": 1807
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8581710166024752,
+      "learning_rate": 7.403438024139547e-07,
+      "loss": 0.3591,
+      "step": 1808
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.972956887111899,
+      "learning_rate": 7.380226736626692e-07,
+      "loss": 0.3786,
+      "step": 1809
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2412,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 603,
+  "total_flos": 852109767475200.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1809/training_args.bin b/checkpoint-1809/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e
--- /dev/null
+++ b/checkpoint-1809/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26
+size 7352
diff --git a/checkpoint-1809/zero_to_fp32.py b/checkpoint-1809/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b
--- /dev/null
+++ b/checkpoint-1809/zero_to_fp32.py
@@ -0,0 +1,592 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# set to a truthy value to make the functions below print extra diagnostics
+debug = 0
+
+# load to cpu (passed as ``map_location`` to every ``torch.load`` call in this script)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a zero stage <= 2 checkpoint into ``state_dict`` (modified in place).
+
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` for every trainable param
+        - ``world_size``: number of partitions; also drives the alignment used by the sanity check
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``, each entry a flat tensor
+        - ``zero_model_states``: parsed model states; only ``[0].param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: for some param group the aligned consumed size differs from the aligned available size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    # for each param group, concatenate that group's flat tensor from every rank into one vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through this group's merged vector, one param at a time
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes without a callable numel() are treated as plain sequences of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice this param out of the merged vector and reshape it
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a zero stage 3 checkpoint into ``state_dict`` (modified in place).
+
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` for every trainable param
+        - ``world_size``: number of ranks whose flat tensors are zipped together
+        - ``fp32_flat_groups``: one flat tensor per rank
+        - ``zero_model_states``: parsed model states; only ``[0].param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: the total consumed element count differs from ``fp32_flat_groups[0].numel() * world_size``
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is a position inside each rank's flat tensor (the same position is read on every rank)
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        # take this param's slice from every rank, join the slices, cut off the tail beyond the
+        # real element count, then reshape
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    # offset counted per-rank elements; scale it to compare against the total across ranks
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
diff --git a/checkpoint-2412/config.json b/checkpoint-2412/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76
--- /dev/null
+++ b/checkpoint-2412/config.json
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "alpindale/Mistral-7B-v0.2-hf",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32002
+}
diff --git a/checkpoint-2412/generation_config.json b/checkpoint-2412/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc
--- /dev/null
+++ b/checkpoint-2412/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.38.2"
+}
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8a59ff7a72f9df7a68e7248c7399fd213b080dd5
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e111d64b9ed3b4d12e7789f3192543373535aa92ee6c263ea786d7b23522bd49
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6ac560f6e5321dbce193cc92e13b9baff4470062
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b4141a3fa6d69f16e220f7b8c5d2cd3082a9523abf78acf9b27bbb7bb2d886
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0d193120ef6ed6b5d0eb87e15df87b771e9fb65d
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3501f7efc9a0a2bfa63c0615155be565644fd9d8177e0aa748d17d19df9987f9
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..51d8ee6f63f7ab6751b97b92bbcb81de4edd4d42
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f42a454bc0043a5a21051c0a6cde9be4b129e43271934efe035e421ce7048855
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1e208605d73864af1217ae03dd78c332a6a8a26f
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5af8493f413af7dd5769c90efd786dacb4e6288379692e0e4d8fa8e6069071
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9d41711f76d39f04d8abe041dbc972b32c61ad89
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcf6269cf2020dcfcff7333f3d4507e1a924ed80000200f90172b97709be09ed
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f36e24043786da5b21dcf8c77bffd6c143ccbe35
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39d766462269221e2711490a61a779a5b0353d922145a3bad0b53fd31f240a25
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..427c21d64831fcf5abddb0c3b950ea9bef3ce771
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e646565376c91ee882145d965b468d65d730d2d11fef7b4979f6d8b7c66ad29
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..116880a693a1dd676d5291ea62f95e501d65db88
--- /dev/null
+++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa1ee0975fb89f5419c41d5bc2e2f62fef1b141132a7125b7a92896641cbb001
+size 4831623435
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dd9d3734b8410e9aca4de6be1cfe056321e2fe62
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5721478ea543d8dd78b059def5d48ba9b5a08820b4d524c8bd30ac284e466c4
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..774949779d756d7ac6644f0438b01207093a5bba
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2004ecf30872828841b44b9cad984d2fb4049e40e90d86f1cdf34c0c8af4cf28
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e856e3b7c3db51d54629b44a65db43e135baeb42
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92568d2f2270b067395a7744b0e8e59530291155adc571485e510e5881453efb
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1c80d18171a71b37e6e8c1632689efe5bd4c4df7
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2cc635532d6fce6a79b4935c5c637cd3a7e22a4a09353821285e20040086b76
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9297703656fc3df2fb912c3e8364b40f7acb2150
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6937465a4c2af5b1415fd2afcb341257b94593dd351e04d85a15e207912b72b
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6ecfe45e1e980d872619605aa7ba4da1edc26509
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a64004407be05cca8398dc72883f365b0947dc9dd2a983e6d943edc998c9e04
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9553a8d1a362495b7bd575fc191edad0bac73ab2
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8474ab6e0891601b6f95e61afdd6dcdf1085b7df706655f714825bee2ab72a4b
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bdefca753bb52a1a131c05be59fb545b9ed03f2b
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e295c5022a8e8800aa98a2b9745c62d4648292fe4ea02bc5f154090bea6adc38
+size 153829
diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b382b7eb0495e302ef9241b3f6e7c519bad3df6e
--- /dev/null
+++ b/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39fde8a1dcf3d0db192cd199515ea53b7cf0de2864570f8b3e1c599c3b94ed9e
+size 153829
diff --git a/checkpoint-2412/latest b/checkpoint-2412/latest
new file mode 100644
index 0000000000000000000000000000000000000000..75087eef97c8712d556b81f66e003de493e93c96
--- /dev/null
+++ b/checkpoint-2412/latest
@@ -0,0 +1 @@
+global_step2412
\ No newline at end of file
diff --git a/checkpoint-2412/model-00001-of-00003.safetensors b/checkpoint-2412/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0f1c522c741bc956a541d5544734d12ff3a71b33
--- /dev/null
+++ b/checkpoint-2412/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89fd0fface188ca3f7988aa53f25e087292d72ca99cd52ef8cb52cf180ad2ff
+size 4943178720
diff --git a/checkpoint-2412/model-00002-of-00003.safetensors b/checkpoint-2412/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6a1c7f2c1a284a17e9b7a9124040ee4bb6680b67
--- /dev/null
+++ b/checkpoint-2412/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49dd97160e0a8ff75303f02969df38307407c8800ce94aaa86611ceb6727bca0
+size 4999819336
diff --git a/checkpoint-2412/model-00003-of-00003.safetensors b/checkpoint-2412/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3f8cc928e41a10674f627e9a238420111f974bb7
--- /dev/null
+++ b/checkpoint-2412/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03098a839ef612f1efe325b376aa90bc8311a01c1236120d9ca7934eb9b12fed
+size 4540532728
diff --git a/checkpoint-2412/model.safetensors.index.json b/checkpoint-2412/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2
--- /dev/null
+++ b/checkpoint-2412/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483496960
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/checkpoint-2412/rng_state_0.pth b/checkpoint-2412/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ed9c956014a637b9d3ccb494c387c7452ae938e0
--- /dev/null
+++ b/checkpoint-2412/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40b7907b6e8bbc0deaf9b6cadef63205dade64f9fbf74f9a4dca9c34792d7aab
+size 16240
diff --git a/checkpoint-2412/rng_state_1.pth b/checkpoint-2412/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a2452cb1ac950d724f0559bab3e53e6a671da5ba
--- /dev/null
+++ b/checkpoint-2412/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a4ca3302c930a1b49ced40d5e2133aedc4c5857930d92deb8c6496a317958d8
+size 16240
diff --git a/checkpoint-2412/rng_state_2.pth b/checkpoint-2412/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..30ca1e0fbf8047c1cd0606a37b02d545623d4a67
--- /dev/null
+++ b/checkpoint-2412/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbbf2364108e70a0ac183356d1693182b452bb464271c3d2f4ade972244d710d
+size 16240
diff --git a/checkpoint-2412/rng_state_3.pth b/checkpoint-2412/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a342cc40db30db7d18c31cffe2a2e1b1d2f3b084
--- /dev/null
+++ b/checkpoint-2412/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9269c171a7948127faa588109a1fb8043194b407d2dfbeda2e25ed8b35126a5
+size 16240
diff --git a/checkpoint-2412/rng_state_4.pth b/checkpoint-2412/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ca08e0f4a907b0b1649b7bc3537dd48c83723830
--- /dev/null
+++ b/checkpoint-2412/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f02625e4547fbacdb164e484867f76d5024a007c22c297f8ecbef13fc6aa3202
+size 16240
diff --git a/checkpoint-2412/rng_state_5.pth b/checkpoint-2412/rng_state_5.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1aeba77fabdef8a232c2785991d798bd3f84afd3
--- /dev/null
+++ b/checkpoint-2412/rng_state_5.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51eb0286c1f14a2c09c443d8c606951c3debeb25f9ba4f71e0aea90ae2f0786e
+size 16240
diff --git a/checkpoint-2412/rng_state_6.pth b/checkpoint-2412/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..499c459dc2af4317a2a23f7877927bf7c586e439
--- /dev/null
+++ b/checkpoint-2412/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:080bbd36834b7a1623430efdd9f598b791f466541d25b545ca410ec4a930a0f3
+size 16240
diff --git a/checkpoint-2412/rng_state_7.pth b/checkpoint-2412/rng_state_7.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cdfb9b9f9f3356413f6755deb29a84b7b4e360a2
--- /dev/null
+++ b/checkpoint-2412/rng_state_7.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54aa959bf290908dfe1fc65c2591b99982e9fdce5caf276626d0084ccffa7e95
+size 16240
diff --git a/checkpoint-2412/rng_state_8.pth b/checkpoint-2412/rng_state_8.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6533db02002842edcb0c9b2a6dd89506e90ac8c8
--- /dev/null
+++ b/checkpoint-2412/rng_state_8.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85f8554f99e72a1c251b463a30088dd49afece6deb61c5ad09834d35ff89308b
+size 16240
diff --git a/checkpoint-2412/scheduler.pt b/checkpoint-2412/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bb24d02e289560291fce88a5d78a2810c68f08f6
--- /dev/null
+++ b/checkpoint-2412/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2380d2748a4c48cacb4bc12df77e29fd92e9aef87c62d8b17fbf348a1afa8525
+size 1064
diff --git a/checkpoint-2412/trainer_state.json b/checkpoint-2412/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a8a3040b58cb80dcff351c71d92691ff1ac9c20
--- /dev/null
+++ b/checkpoint-2412/trainer_state.json
@@ -0,0 +1,16905 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9819689119170985,
+  "eval_steps": 500,
+  "global_step": 2412,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 27.81778461909011,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.7993,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 28.63833175363421,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.9056,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 25.646828828014854,
+      "learning_rate": 1.5e-06,
+      "loss": 0.8473,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.834124771941388,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.8192,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.558095859980105,
+      "learning_rate": 2.5e-06,
+      "loss": 0.7943,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 7.905789045775758,
+      "learning_rate": 3e-06,
+      "loss": 0.7075,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.259519170268483,
+      "learning_rate": 3.5e-06,
+      "loss": 0.7537,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.639042051048664,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.7471,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.515070932390074,
+      "learning_rate": 4.5e-06,
+      "loss": 0.7689,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.916410424632533,
+      "learning_rate": 5e-06,
+      "loss": 0.7194,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.835046497413255,
+      "learning_rate": 4.9999978617243506e-06,
+      "loss": 0.6949,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 10.065648500649479,
+      "learning_rate": 4.9999914469010585e-06,
+      "loss": 0.7039,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.299372887839679,
+      "learning_rate": 4.999980755541098e-06,
+      "loss": 0.7067,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.693110837094718,
+      "learning_rate": 4.999965787662758e-06,
+      "loss": 0.7126,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.983869635716314,
+      "learning_rate": 4.999946543291642e-06,
+      "loss": 0.6496,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.2561193962441175,
+      "learning_rate": 4.999923022460671e-06,
+      "loss": 0.7036,
+      "step": 16
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.011772824968437,
+      "learning_rate": 4.999895225210079e-06,
+      "loss": 0.7009,
+      "step": 17
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.386638415717137,
+      "learning_rate": 4.9998631515874165e-06,
+      "loss": 0.6624,
+      "step": 18
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.764658092125165,
+      "learning_rate": 4.999826801647551e-06,
+      "loss": 0.6687,
+      "step": 19
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.3982096117966614,
+      "learning_rate": 4.999786175452662e-06,
+      "loss": 0.706,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.8051633678260193,
+      "learning_rate": 4.999741273072246e-06,
+      "loss": 0.7031,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.1177784624332614,
+      "learning_rate": 4.999692094583114e-06,
+      "loss": 0.7525,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.2533819675617806,
+      "learning_rate": 4.9996386400693906e-06,
+      "loss": 0.6767,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.61893793162573,
+      "learning_rate": 4.999580909622518e-06,
+      "loss": 0.6432,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.76057623723569,
+      "learning_rate": 4.999518903341251e-06,
+      "loss": 0.6809,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.27983032069553,
+      "learning_rate": 4.999452621331657e-06,
+      "loss": 0.6798,
+      "step": 26
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.501904568120582,
+      "learning_rate": 4.99938206370712e-06,
+      "loss": 0.6412,
+      "step": 27
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.819229290729669,
+      "learning_rate": 4.999307230588338e-06,
+      "loss": 0.6188,
+      "step": 28
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.1233212322022212,
+      "learning_rate": 4.9992281221033224e-06,
+      "loss": 0.6378,
+      "step": 29
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.7806911906686755,
+      "learning_rate": 4.999144738387396e-06,
+      "loss": 0.6653,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4045490257014563,
+      "learning_rate": 4.999057079583199e-06,
+      "loss": 0.6377,
+      "step": 31
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3803717769210446,
+      "learning_rate": 4.998965145840681e-06,
+      "loss": 0.6855,
+      "step": 32
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3976652879633473,
+      "learning_rate": 4.998868937317106e-06,
+      "loss": 0.6284,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2958541157119727,
+      "learning_rate": 4.998768454177051e-06,
+      "loss": 0.6521,
+      "step": 34
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1925196833696154,
+      "learning_rate": 4.998663696592403e-06,
+      "loss": 0.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.361006042901851,
+      "learning_rate": 4.998554664742362e-06,
+      "loss": 0.6155,
+      "step": 36
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1577758143653614,
+      "learning_rate": 4.998441358813443e-06,
+      "loss": 0.6398,
+      "step": 37
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.219872074512664,
+      "learning_rate": 4.998323778999467e-06,
+      "loss": 0.6051,
+      "step": 38
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2907501521408546,
+      "learning_rate": 4.9982019255015705e-06,
+      "loss": 0.6337,
+      "step": 39
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1769862324666183,
+      "learning_rate": 4.9980757985281955e-06,
+      "loss": 0.6606,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4252479779661607,
+      "learning_rate": 4.997945398295101e-06,
+      "loss": 0.6685,
+      "step": 41
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3929541982084657,
+      "learning_rate": 4.99781072502535e-06,
+      "loss": 0.6084,
+      "step": 42
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.932539969840091,
+      "learning_rate": 4.997671778949318e-06,
+      "loss": 0.6123,
+      "step": 43
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.191742541327873,
+      "learning_rate": 4.997528560304688e-06,
+      "loss": 0.6247,
+      "step": 44
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.423376784566499,
+      "learning_rate": 4.997381069336455e-06,
+      "loss": 0.7024,
+      "step": 45
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0599055392481076,
+      "learning_rate": 4.997229306296918e-06,
+      "loss": 0.6612,
+      "step": 46
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.16832922087532,
+      "learning_rate": 4.997073271445686e-06,
+      "loss": 0.5949,
+      "step": 47
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0483598654319453,
+      "learning_rate": 4.9969129650496775e-06,
+      "loss": 0.6406,
+      "step": 48
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.963056609139284,
+      "learning_rate": 4.996748387383113e-06,
+      "loss": 0.6361,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.2094923844269307,
+      "learning_rate": 4.996579538727527e-06,
+      "loss": 0.5901,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.1088153449411857,
+      "learning_rate": 4.996406419371749e-06,
+      "loss": 0.6458,
+      "step": 51
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.093448940617732,
+      "learning_rate": 4.996229029611926e-06,
+      "loss": 0.6509,
+      "step": 52
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.075116207412987,
+      "learning_rate": 4.996047369751502e-06,
+      "loss": 0.6295,
+      "step": 53
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.138141165277684,
+      "learning_rate": 4.995861440101229e-06,
+      "loss": 0.6088,
+      "step": 54
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.186316382848445,
+      "learning_rate": 4.995671240979161e-06,
+      "loss": 0.6307,
+      "step": 55
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.2513741083982195,
+      "learning_rate": 4.995476772710657e-06,
+      "loss": 0.6175,
+      "step": 56
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0827167336870596,
+      "learning_rate": 4.995278035628379e-06,
+      "loss": 0.5935,
+      "step": 57
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.117977588574442,
+      "learning_rate": 4.995075030072291e-06,
+      "loss": 0.5998,
+      "step": 58
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0996940200235485,
+      "learning_rate": 4.994867756389658e-06,
+      "loss": 0.6159,
+      "step": 59
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.141096165691323,
+      "learning_rate": 4.994656214935045e-06,
+      "loss": 0.6294,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.022748830058395,
+      "learning_rate": 4.994440406070323e-06,
+      "loss": 0.6315,
+      "step": 61
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.209132168720991,
+      "learning_rate": 4.994220330164654e-06,
+      "loss": 0.5645,
+      "step": 62
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0994557317862674,
+      "learning_rate": 4.993995987594509e-06,
+      "loss": 0.6272,
+      "step": 63
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.204220831053169,
+      "learning_rate": 4.99376737874365e-06,
+      "loss": 0.6379,
+      "step": 64
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.127733932186697,
+      "learning_rate": 4.993534504003141e-06,
+      "loss": 0.622,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.1338506582034316,
+      "learning_rate": 4.993297363771342e-06,
+      "loss": 0.6259,
+      "step": 66
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.104802764460729,
+      "learning_rate": 4.993055958453912e-06,
+      "loss": 0.6414,
+      "step": 67
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0889535347771675,
+      "learning_rate": 4.9928102884638004e-06,
+      "loss": 0.6466,
+      "step": 68
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.252225316694296,
+      "learning_rate": 4.992560354221258e-06,
+      "loss": 0.6167,
+      "step": 69
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015392533516649,
+      "learning_rate": 4.992306156153827e-06,
+      "loss": 0.5958,
+      "step": 70
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.151741408948778,
+      "learning_rate": 4.992047694696343e-06,
+      "loss": 0.5875,
+      "step": 71
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0351299117412696,
+      "learning_rate": 4.991784970290935e-06,
+      "loss": 0.5935,
+      "step": 72
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0000962363827983,
+      "learning_rate": 4.991517983387026e-06,
+      "loss": 0.6091,
+      "step": 73
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.202881736102415,
+      "learning_rate": 4.99124673444133e-06,
+      "loss": 0.6122,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015074773396151,
+      "learning_rate": 4.990971223917848e-06,
+      "loss": 0.6134,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.009305960567766,
+      "learning_rate": 4.990691452287877e-06,
+      "loss": 0.6308,
+      "step": 76
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.9967884756310221,
+      "learning_rate": 4.990407420029999e-06,
+      "loss": 0.6098,
+      "step": 77
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0858738033925905,
+      "learning_rate": 4.990119127630085e-06,
+      "loss": 0.6344,
+      "step": 78
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9427707561903895,
+      "learning_rate": 4.989826575581295e-06,
+      "loss": 0.6049,
+      "step": 79
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.157150584766853,
+      "learning_rate": 4.989529764384073e-06,
+      "loss": 0.5965,
+      "step": 80
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0303527419352583,
+      "learning_rate": 4.989228694546151e-06,
+      "loss": 0.6524,
+      "step": 81
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.128799919475717,
+      "learning_rate": 4.988923366582546e-06,
+      "loss": 0.5524,
+      "step": 82
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0122786280510696,
+      "learning_rate": 4.988613781015557e-06,
+      "loss": 0.6268,
+      "step": 83
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.104580177719229,
+      "learning_rate": 4.988299938374769e-06,
+      "loss": 0.6229,
+      "step": 84
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.3894843860356834,
+      "learning_rate": 4.9879818391970455e-06,
+      "loss": 0.6194,
+      "step": 85
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9615211372441477,
+      "learning_rate": 4.9876594840265355e-06,
+      "loss": 0.6355,
+      "step": 86
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.4509852093141937,
+      "learning_rate": 4.987332873414666e-06,
+      "loss": 0.6405,
+      "step": 87
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.178942375285086,
+      "learning_rate": 4.987002007920142e-06,
+      "loss": 0.5593,
+      "step": 88
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.2625634345900445,
+      "learning_rate": 4.9866668881089515e-06,
+      "loss": 0.6133,
+      "step": 89
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.363092638811143,
+      "learning_rate": 4.986327514554356e-06,
+      "loss": 0.6298,
+      "step": 90
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0401982492138546,
+      "learning_rate": 4.985983887836894e-06,
+      "loss": 0.6276,
+      "step": 91
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.276956647922478,
+      "learning_rate": 4.985636008544381e-06,
+      "loss": 0.5691,
+      "step": 92
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1072762844110233,
+      "learning_rate": 4.985283877271908e-06,
+      "loss": 0.6175,
+      "step": 93
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.2931866879442637,
+      "learning_rate": 4.984927494621836e-06,
+      "loss": 0.6419,
+      "step": 94
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.112474101166308,
+      "learning_rate": 4.984566861203801e-06,
+      "loss": 0.607,
+      "step": 95
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1816059679212634,
+      "learning_rate": 4.984201977634711e-06,
+      "loss": 0.6136,
+      "step": 96
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0620776369966554,
+      "learning_rate": 4.9838328445387415e-06,
+      "loss": 0.6372,
+      "step": 97
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.147592836641578,
+      "learning_rate": 4.983459462547341e-06,
+      "loss": 0.606,
+      "step": 98
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1808001877062453,
+      "learning_rate": 4.983081832299224e-06,
+      "loss": 0.6019,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3751999527114087,
+      "learning_rate": 4.98269995444037e-06,
+      "loss": 0.6021,
+      "step": 100
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.8769470206406913,
+      "learning_rate": 4.98231382962403e-06,
+      "loss": 0.6082,
+      "step": 101
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3060925784921347,
+      "learning_rate": 4.981923458510717e-06,
+      "loss": 0.6174,
+      "step": 102
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1543176832473683,
+      "learning_rate": 4.981528841768206e-06,
+      "loss": 0.6092,
+      "step": 103
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1558689520522547,
+      "learning_rate": 4.981129980071538e-06,
+      "loss": 0.587,
+      "step": 104
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3830532005188383,
+      "learning_rate": 4.980726874103014e-06,
+      "loss": 0.6518,
+      "step": 105
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3333119576634767,
+      "learning_rate": 4.980319524552195e-06,
+      "loss": 0.6096,
+      "step": 106
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1135146855324214,
+      "learning_rate": 4.9799079321159e-06,
+      "loss": 0.5728,
+      "step": 107
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.2300463384326394,
+      "learning_rate": 4.9794920974982095e-06,
+      "loss": 0.6563,
+      "step": 108
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1745234017525443,
+      "learning_rate": 4.979072021410458e-06,
+      "loss": 0.5968,
+      "step": 109
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1536586182562334,
+      "learning_rate": 4.978647704571237e-06,
+      "loss": 0.6189,
+      "step": 110
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.193809374687326,
+      "learning_rate": 4.97821914770639e-06,
+      "loss": 0.5864,
+      "step": 111
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.0525896373682047,
+      "learning_rate": 4.977786351549017e-06,
+      "loss": 0.6101,
+      "step": 112
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.216099286618384,
+      "learning_rate": 4.977349316839467e-06,
+      "loss": 0.5984,
+      "step": 113
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.155122255962579,
+      "learning_rate": 4.97690804432534e-06,
+      "loss": 0.6311,
+      "step": 114
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2972101190291374,
+      "learning_rate": 4.976462534761487e-06,
+      "loss": 0.5813,
+      "step": 115
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.9925413745245948,
+      "learning_rate": 4.9760127889100044e-06,
+      "loss": 0.6157,
+      "step": 116
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2802548684036568,
+      "learning_rate": 4.975558807540238e-06,
+      "loss": 0.6079,
+      "step": 117
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.048888007394621,
+      "learning_rate": 4.9751005914287775e-06,
+      "loss": 0.6467,
+      "step": 118
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.28661640438254,
+      "learning_rate": 4.974638141359456e-06,
+      "loss": 0.6029,
+      "step": 119
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.004056683755783,
+      "learning_rate": 4.974171458123351e-06,
+      "loss": 0.6289,
+      "step": 120
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1628470048067667,
+      "learning_rate": 4.97370054251878e-06,
+      "loss": 0.6139,
+      "step": 121
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.056119895466544,
+      "learning_rate": 4.9732253953513e-06,
+      "loss": 0.5798,
+      "step": 122
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1716513163164275,
+      "learning_rate": 4.972746017433709e-06,
+      "loss": 0.6085,
+      "step": 123
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.255856676525811,
+      "learning_rate": 4.97226240958604e-06,
+      "loss": 0.6342,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1049280498075373,
+      "learning_rate": 4.971774572635563e-06,
+      "loss": 0.6197,
+      "step": 125
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.133349390995361,
+      "learning_rate": 4.97128250741678e-06,
+      "loss": 0.5751,
+      "step": 126
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.2044887467317578,
+      "learning_rate": 4.97078621477143e-06,
+      "loss": 0.6611,
+      "step": 127
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1413863795698145,
+      "learning_rate": 4.970285695548481e-06,
+      "loss": 0.625,
+      "step": 128
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0229587336296615,
+      "learning_rate": 4.969780950604132e-06,
+      "loss": 0.5989,
+      "step": 129
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0983599595244247,
+      "learning_rate": 4.969271980801808e-06,
+      "loss": 0.5747,
+      "step": 130
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1059041140010786,
+      "learning_rate": 4.9687587870121645e-06,
+      "loss": 0.5869,
+      "step": 131
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8967441614595046,
+      "learning_rate": 4.9682413701130815e-06,
+      "loss": 0.6272,
+      "step": 132
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9976164993621088,
+      "learning_rate": 4.967719730989663e-06,
+      "loss": 0.6282,
+      "step": 133
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8719131324952145,
+      "learning_rate": 4.967193870534235e-06,
+      "loss": 0.6052,
+      "step": 134
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.071702997476533,
+      "learning_rate": 4.9666637896463455e-06,
+      "loss": 0.5785,
+      "step": 135
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9549455320048341,
+      "learning_rate": 4.966129489232762e-06,
+      "loss": 0.5739,
+      "step": 136
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0656898626759315,
+      "learning_rate": 4.9655909702074684e-06,
+      "loss": 0.6651,
+      "step": 137
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1185948604203038,
+      "learning_rate": 4.965048233491669e-06,
+      "loss": 0.5759,
+      "step": 138
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.08566019272993,
+      "learning_rate": 4.964501280013777e-06,
+      "loss": 0.6271,
+      "step": 139
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.117420903965419,
+      "learning_rate": 4.963950110709425e-06,
+      "loss": 0.5968,
+      "step": 140
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9784944143818486,
+      "learning_rate": 4.963394726521453e-06,
+      "loss": 0.6112,
+      "step": 141
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.077292948039572,
+      "learning_rate": 4.9628351283999144e-06,
+      "loss": 0.5636,
+      "step": 142
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.223803520245629,
+      "learning_rate": 4.962271317302068e-06,
+      "loss": 0.6658,
+      "step": 143
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.039369072186367,
+      "learning_rate": 4.9617032941923796e-06,
+      "loss": 0.5853,
+      "step": 144
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.071470113085907,
+      "learning_rate": 4.961131060042522e-06,
+      "loss": 0.601,
+      "step": 145
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.437470272347474,
+      "learning_rate": 4.960554615831372e-06,
+      "loss": 0.6593,
+      "step": 146
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.178684122927139,
+      "learning_rate": 4.959973962545005e-06,
+      "loss": 0.607,
+      "step": 147
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.097006749956471,
+      "learning_rate": 4.9593891011767e-06,
+      "loss": 0.5873,
+      "step": 148
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9801202541822784,
+      "learning_rate": 4.958800032726931e-06,
+      "loss": 0.5877,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.30001951085656,
+      "learning_rate": 4.958206758203373e-06,
+      "loss": 0.6368,
+      "step": 150
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.990094260131078,
+      "learning_rate": 4.957609278620891e-06,
+      "loss": 0.59,
+      "step": 151
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.262163752076628,
+      "learning_rate": 4.957007595001548e-06,
+      "loss": 0.5779,
+      "step": 152
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.1970152093220983,
+      "learning_rate": 4.956401708374595e-06,
+      "loss": 0.5894,
+      "step": 153
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.220825872684071,
+      "learning_rate": 4.9557916197764745e-06,
+      "loss": 0.6528,
+      "step": 154
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.099472677591387,
+      "learning_rate": 4.955177330250817e-06,
+      "loss": 0.5798,
+      "step": 155
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.159203936881569,
+      "learning_rate": 4.954558840848437e-06,
+      "loss": 0.6206,
+      "step": 156
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.185152414039555,
+      "learning_rate": 4.953936152627338e-06,
+      "loss": 0.5624,
+      "step": 157
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.0679748168992624,
+      "learning_rate": 4.953309266652701e-06,
+      "loss": 0.5859,
+      "step": 158
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.327237187255128,
+      "learning_rate": 4.952678183996891e-06,
+      "loss": 0.5632,
+      "step": 159
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.2865519679977417,
+      "learning_rate": 4.952042905739451e-06,
+      "loss": 0.6965,
+      "step": 160
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.523435408018699,
+      "learning_rate": 4.9514034329671e-06,
+      "loss": 0.6217,
+      "step": 161
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.4992653226709636,
+      "learning_rate": 4.950759766773734e-06,
+      "loss": 0.6175,
+      "step": 162
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.432752824777114,
+      "learning_rate": 4.950111908260423e-06,
+      "loss": 0.5862,
+      "step": 163
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.137500912204061,
+      "learning_rate": 4.949459858535404e-06,
+      "loss": 0.6124,
+      "step": 164
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.2226376224120474,
+      "learning_rate": 4.94880361871409e-06,
+      "loss": 0.5891,
+      "step": 165
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3821839805775165,
+      "learning_rate": 4.9481431899190544e-06,
+      "loss": 0.6008,
+      "step": 166
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.306242834684614,
+      "learning_rate": 4.947478573280044e-06,
+      "loss": 0.6159,
+      "step": 167
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3298092236851518,
+      "learning_rate": 4.946809769933963e-06,
+      "loss": 0.5809,
+      "step": 168
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.364296499621558,
+      "learning_rate": 4.946136781024883e-06,
+      "loss": 0.5895,
+      "step": 169
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.237241095609228,
+      "learning_rate": 4.945459607704029e-06,
+      "loss": 0.6144,
+      "step": 170
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.4027419761972264,
+      "learning_rate": 4.9447782511297905e-06,
+      "loss": 0.5985,
+      "step": 171
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1547059182244284,
+      "learning_rate": 4.944092712467709e-06,
+      "loss": 0.5763,
+      "step": 172
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1530221667047984,
+      "learning_rate": 4.9434029928904805e-06,
+      "loss": 0.5692,
+      "step": 173
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.228588593294869,
+      "learning_rate": 4.942709093577954e-06,
+      "loss": 0.5896,
+      "step": 174
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1597295307130198,
+      "learning_rate": 4.942011015717129e-06,
+      "loss": 0.5864,
+      "step": 175
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.321140955498194,
+      "learning_rate": 4.941308760502149e-06,
+      "loss": 0.6089,
+      "step": 176
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.220124736460707,
+      "learning_rate": 4.940602329134309e-06,
+      "loss": 0.5786,
+      "step": 177
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1698038563080417,
+      "learning_rate": 4.939891722822043e-06,
+      "loss": 0.5749,
+      "step": 178
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.244425969121411,
+      "learning_rate": 4.93917694278093e-06,
+      "loss": 0.5877,
+      "step": 179
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.143920008069458,
+      "learning_rate": 4.938457990233687e-06,
+      "loss": 0.6024,
+      "step": 180
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1786040820345813,
+      "learning_rate": 4.937734866410169e-06,
+      "loss": 0.5845,
+      "step": 181
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.301832824481007,
+      "learning_rate": 4.9370075725473665e-06,
+      "loss": 0.6182,
+      "step": 182
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.3748033727083997,
+      "learning_rate": 4.936276109889403e-06,
+      "loss": 0.6073,
+      "step": 183
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.476334487382023,
+      "learning_rate": 4.935540479687534e-06,
+      "loss": 0.5793,
+      "step": 184
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.2509466352322494,
+      "learning_rate": 4.934800683200143e-06,
+      "loss": 0.6133,
+      "step": 185
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.8391697547684873,
+      "learning_rate": 4.934056721692742e-06,
+      "loss": 0.5967,
+      "step": 186
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.4492364225391765,
+      "learning_rate": 4.933308596437965e-06,
+      "loss": 0.5676,
+      "step": 187
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.685548141821295,
+      "learning_rate": 4.932556308715573e-06,
+      "loss": 0.6069,
+      "step": 188
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.261217637824808,
+      "learning_rate": 4.931799859812443e-06,
+      "loss": 0.6411,
+      "step": 189
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3838284395200966,
+      "learning_rate": 4.931039251022573e-06,
+      "loss": 0.5745,
+      "step": 190
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.2550921344466164,
+      "learning_rate": 4.930274483647074e-06,
+      "loss": 0.5989,
+      "step": 191
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.078406234527636,
+      "learning_rate": 4.929505558994175e-06,
+      "loss": 0.5998,
+      "step": 192
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.592864566091496,
+      "learning_rate": 4.928732478379214e-06,
+      "loss": 0.5842,
+      "step": 193
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.092752299259724,
+      "learning_rate": 4.927955243124638e-06,
+      "loss": 0.5789,
+      "step": 194
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3799311595696966,
+      "learning_rate": 4.927173854560002e-06,
+      "loss": 0.6265,
+      "step": 195
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.246876688010602,
+      "learning_rate": 4.926388314021964e-06,
+      "loss": 0.6126,
+      "step": 196
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1409898276704578,
+      "learning_rate": 4.925598622854287e-06,
+      "loss": 0.6073,
+      "step": 197
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.5946158421875385,
+      "learning_rate": 4.924804782407834e-06,
+      "loss": 0.6154,
+      "step": 198
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1225494320427982,
+      "learning_rate": 4.924006794040562e-06,
+      "loss": 0.583,
+      "step": 199
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1971323526291338,
+      "learning_rate": 4.923204659117528e-06,
+      "loss": 0.6078,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.289185506404785,
+      "learning_rate": 4.92239837901088e-06,
+      "loss": 0.6127,
+      "step": 201
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0071007751625354,
+      "learning_rate": 4.921587955099858e-06,
+      "loss": 0.5804,
+      "step": 202
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.2981840149068247,
+      "learning_rate": 4.920773388770789e-06,
+      "loss": 0.6027,
+      "step": 203
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.236179116886702,
+      "learning_rate": 4.919954681417087e-06,
+      "loss": 0.6179,
+      "step": 204
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.007422589251611,
+      "learning_rate": 4.91913183443925e-06,
+      "loss": 0.5647,
+      "step": 205
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1402813555735483,
+      "learning_rate": 4.918304849244857e-06,
+      "loss": 0.5841,
+      "step": 206
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0456415785177104,
+      "learning_rate": 4.917473727248565e-06,
+      "loss": 0.5524,
+      "step": 207
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9673558126020942,
+      "learning_rate": 4.916638469872109e-06,
+      "loss": 0.5698,
+      "step": 208
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.015111672496819,
+      "learning_rate": 4.9157990785442964e-06,
+      "loss": 0.5957,
+      "step": 209
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9502065547578398,
+      "learning_rate": 4.9149555547010086e-06,
+      "loss": 0.5592,
+      "step": 210
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.167936522558899,
+      "learning_rate": 4.9141078997851945e-06,
+      "loss": 0.5705,
+      "step": 211
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.2066587458997935,
+      "learning_rate": 4.91325611524687e-06,
+      "loss": 0.5526,
+      "step": 212
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9132995625903553,
+      "learning_rate": 4.9124002025431136e-06,
+      "loss": 0.5767,
+      "step": 213
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.0097281107801277,
+      "learning_rate": 4.91154016313807e-06,
+      "loss": 0.6185,
+      "step": 214
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.023532008241332,
+      "learning_rate": 4.910675998502938e-06,
+      "loss": 0.6005,
+      "step": 215
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9253831001776973,
+      "learning_rate": 4.909807710115977e-06,
+      "loss": 0.5769,
+      "step": 216
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.066862408842564,
+      "learning_rate": 4.908935299462497e-06,
+      "loss": 0.5671,
+      "step": 217
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9412704290792853,
+      "learning_rate": 4.908058768034862e-06,
+      "loss": 0.5568,
+      "step": 218
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.185994457097553,
+      "learning_rate": 4.907178117332487e-06,
+      "loss": 0.5621,
+      "step": 219
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.021517127546353,
+      "learning_rate": 4.906293348861829e-06,
+      "loss": 0.5672,
+      "step": 220
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.099703967072734,
+      "learning_rate": 4.905404464136391e-06,
+      "loss": 0.5366,
+      "step": 221
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.030197056583618,
+      "learning_rate": 4.904511464676718e-06,
+      "loss": 0.6064,
+      "step": 222
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.4170102988954896,
+      "learning_rate": 4.903614352010393e-06,
+      "loss": 0.5919,
+      "step": 223
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0819468873015476,
+      "learning_rate": 4.9027131276720355e-06,
+      "loss": 0.5366,
+      "step": 224
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.148008018153629,
+      "learning_rate": 4.901807793203299e-06,
+      "loss": 0.597,
+      "step": 225
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0303725862017186,
+      "learning_rate": 4.900898350152866e-06,
+      "loss": 0.6394,
+      "step": 226
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1598989214704334,
+      "learning_rate": 4.899984800076449e-06,
+      "loss": 0.5932,
+      "step": 227
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0816312637185255,
+      "learning_rate": 4.899067144536786e-06,
+      "loss": 0.5909,
+      "step": 228
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9024067197329315,
+      "learning_rate": 4.8981453851036365e-06,
+      "loss": 0.5463,
+      "step": 229
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1830926868871043,
+      "learning_rate": 4.897219523353781e-06,
+      "loss": 0.5821,
+      "step": 230
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1156269612794016,
+      "learning_rate": 4.8962895608710195e-06,
+      "loss": 0.5993,
+      "step": 231
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9653407654210864,
+      "learning_rate": 4.895355499246162e-06,
+      "loss": 0.5525,
+      "step": 232
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.367769051061897,
+      "learning_rate": 4.894417340077036e-06,
+      "loss": 0.5683,
+      "step": 233
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.078327064466567,
+      "learning_rate": 4.893475084968474e-06,
+      "loss": 0.6184,
+      "step": 234
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1661882731589475,
+      "learning_rate": 4.8925287355323195e-06,
+      "loss": 0.6321,
+      "step": 235
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.182760952002799,
+      "learning_rate": 4.891578293387413e-06,
+      "loss": 0.6254,
+      "step": 236
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.998723579962691,
+      "learning_rate": 4.890623760159605e-06,
+      "loss": 0.5371,
+      "step": 237
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.319922346931926,
+      "learning_rate": 4.8896651374817365e-06,
+      "loss": 0.5941,
+      "step": 238
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.090735197217999,
+      "learning_rate": 4.888702426993648e-06,
+      "loss": 0.577,
+      "step": 239
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1247199987228558,
+      "learning_rate": 4.887735630342173e-06,
+      "loss": 0.5928,
+      "step": 240
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.33151114429804,
+      "learning_rate": 4.8867647491811315e-06,
+      "loss": 0.5838,
+      "step": 241
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1570026356289147,
+      "learning_rate": 4.885789785171334e-06,
+      "loss": 0.5642,
+      "step": 242
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.049571197047368,
+      "learning_rate": 4.884810739980575e-06,
+      "loss": 0.6684,
+      "step": 243
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.9810062424466381,
+      "learning_rate": 4.883827615283626e-06,
+      "loss": 0.5942,
+      "step": 244
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.145869663660159,
+      "learning_rate": 4.882840412762244e-06,
+      "loss": 0.6356,
+      "step": 245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.19290302186514,
+      "learning_rate": 4.881849134105156e-06,
+      "loss": 0.6189,
+      "step": 246
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.0561043419872984,
+      "learning_rate": 4.880853781008062e-06,
+      "loss": 0.5563,
+      "step": 247
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.8831183793224635,
+      "learning_rate": 4.879854355173638e-06,
+      "loss": 0.5522,
+      "step": 248
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.020981606684741,
+      "learning_rate": 4.878850858311518e-06,
+      "loss": 0.5548,
+      "step": 249
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.060242570493272,
+      "learning_rate": 4.877843292138307e-06,
+      "loss": 0.5715,
+      "step": 250
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.082455778933014,
+      "learning_rate": 4.8768316583775665e-06,
+      "loss": 0.5959,
+      "step": 251
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9830929719438626,
+      "learning_rate": 4.875815958759819e-06,
+      "loss": 0.5813,
+      "step": 252
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9772267506828567,
+      "learning_rate": 4.8747961950225406e-06,
+      "loss": 0.539,
+      "step": 253
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.1492561995002104,
+      "learning_rate": 4.873772368910161e-06,
+      "loss": 0.6059,
+      "step": 254
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.253757247139787,
+      "learning_rate": 4.872744482174058e-06,
+      "loss": 0.5897,
+      "step": 255
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.3282624851882496,
+      "learning_rate": 4.8717125365725545e-06,
+      "loss": 0.5675,
+      "step": 256
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.15573581133063,
+      "learning_rate": 4.8706765338709185e-06,
+      "loss": 0.5958,
+      "step": 257
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.073289220218241,
+      "learning_rate": 4.869636475841358e-06,
+      "loss": 0.6052,
+      "step": 258
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.293714090249444,
+      "learning_rate": 4.8685923642630165e-06,
+      "loss": 0.5786,
+      "step": 259
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9496544276539172,
+      "learning_rate": 4.867544200921974e-06,
+      "loss": 0.6163,
+      "step": 260
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.5267016753690132,
+      "learning_rate": 4.866491987611239e-06,
+      "loss": 0.6223,
+      "step": 261
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.8731249445320794,
+      "learning_rate": 4.865435726130751e-06,
+      "loss": 0.5632,
+      "step": 262
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.3586331105798863,
+      "learning_rate": 4.86437541828737e-06,
+      "loss": 0.5769,
+      "step": 263
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0258106914510585,
+      "learning_rate": 4.863311065894883e-06,
+      "loss": 0.6103,
+      "step": 264
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.2543614390885955,
+      "learning_rate": 4.862242670773991e-06,
+      "loss": 0.5844,
+      "step": 265
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9440299381244668,
+      "learning_rate": 4.861170234752314e-06,
+      "loss": 0.5559,
+      "step": 266
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.254538268495492,
+      "learning_rate": 4.8600937596643815e-06,
+      "loss": 0.5709,
+      "step": 267
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.007651746385687,
+      "learning_rate": 4.8590132473516346e-06,
+      "loss": 0.573,
+      "step": 268
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0735253118288837,
+      "learning_rate": 4.857928699662421e-06,
+      "loss": 0.5954,
+      "step": 269
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.024775417101569,
+      "learning_rate": 4.856840118451989e-06,
+      "loss": 0.5992,
+      "step": 270
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.1043310699945814,
+      "learning_rate": 4.855747505582488e-06,
+      "loss": 0.6507,
+      "step": 271
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0386353328313214,
+      "learning_rate": 4.854650862922965e-06,
+      "loss": 0.5666,
+      "step": 272
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.978698841367705,
+      "learning_rate": 4.853550192349358e-06,
+      "loss": 0.5593,
+      "step": 273
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.9386534247633986,
+      "learning_rate": 4.852445495744497e-06,
+      "loss": 0.5735,
+      "step": 274
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.049346245018599,
+      "learning_rate": 4.8513367749981e-06,
+      "loss": 0.5415,
+      "step": 275
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1051969521216605,
+      "learning_rate": 4.850224032006765e-06,
+      "loss": 0.5532,
+      "step": 276
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2006792558872315,
+      "learning_rate": 4.849107268673975e-06,
+      "loss": 0.5696,
+      "step": 277
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0460787736353647,
+      "learning_rate": 4.847986486910088e-06,
+      "loss": 0.5658,
+      "step": 278
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1161843259225406,
+      "learning_rate": 4.846861688632336e-06,
+      "loss": 0.583,
+      "step": 279
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.8882198480393542,
+      "learning_rate": 4.8457328757648224e-06,
+      "loss": 0.5693,
+      "step": 280
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1578413701109596,
+      "learning_rate": 4.844600050238517e-06,
+      "loss": 0.5409,
+      "step": 281
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.03912467778954,
+      "learning_rate": 4.843463213991255e-06,
+      "loss": 0.5908,
+      "step": 282
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2333462480826247,
+      "learning_rate": 4.842322368967731e-06,
+      "loss": 0.6088,
+      "step": 283
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.06698702157327,
+      "learning_rate": 4.8411775171194986e-06,
+      "loss": 0.5953,
+      "step": 284
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1433923121572045,
+      "learning_rate": 4.840028660404964e-06,
+      "loss": 0.5851,
+      "step": 285
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.214858780835041,
+      "learning_rate": 4.838875800789386e-06,
+      "loss": 0.5913,
+      "step": 286
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.038128612492624,
+      "learning_rate": 4.837718940244871e-06,
+      "loss": 0.5827,
+      "step": 287
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9894065096959768,
+      "learning_rate": 4.836558080750365e-06,
+      "loss": 0.5769,
+      "step": 288
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1711590153285822,
+      "learning_rate": 4.835393224291662e-06,
+      "loss": 0.654,
+      "step": 289
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.105004451988696,
+      "learning_rate": 4.834224372861386e-06,
+      "loss": 0.6158,
+      "step": 290
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9554568023729102,
+      "learning_rate": 4.833051528459001e-06,
+      "loss": 0.5807,
+      "step": 291
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2693917834500312,
+      "learning_rate": 4.831874693090797e-06,
+      "loss": 0.5557,
+      "step": 292
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9081391627126192,
+      "learning_rate": 4.830693868769892e-06,
+      "loss": 0.6057,
+      "step": 293
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2133664110768585,
+      "learning_rate": 4.82950905751623e-06,
+      "loss": 0.6103,
+      "step": 294
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.015392814211589,
+      "learning_rate": 4.8283202613565735e-06,
+      "loss": 0.5578,
+      "step": 295
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.142124020349717,
+      "learning_rate": 4.8271274823245e-06,
+      "loss": 0.5675,
+      "step": 296
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.981611826462286,
+      "learning_rate": 4.825930722460405e-06,
+      "loss": 0.5696,
+      "step": 297
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.966759748348117,
+      "learning_rate": 4.824729983811486e-06,
+      "loss": 0.58,
+      "step": 298
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0117040369769397,
+      "learning_rate": 4.823525268431754e-06,
+      "loss": 0.6005,
+      "step": 299
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9579664917991193,
+      "learning_rate": 4.822316578382019e-06,
+      "loss": 0.5472,
+      "step": 300
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9075723479635032,
+      "learning_rate": 4.821103915729892e-06,
+      "loss": 0.5834,
+      "step": 301
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.289340229011896,
+      "learning_rate": 4.819887282549777e-06,
+      "loss": 0.6088,
+      "step": 302
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0410700553735235,
+      "learning_rate": 4.818666680922874e-06,
+      "loss": 0.5449,
+      "step": 303
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.074434792511819,
+      "learning_rate": 4.8174421129371675e-06,
+      "loss": 0.5826,
+      "step": 304
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.1377170527698865,
+      "learning_rate": 4.816213580687428e-06,
+      "loss": 0.6262,
+      "step": 305
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.060340839248083,
+      "learning_rate": 4.814981086275209e-06,
+      "loss": 0.5479,
+      "step": 306
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.007036467413588,
+      "learning_rate": 4.813744631808841e-06,
+      "loss": 0.5642,
+      "step": 307
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.016779606220332,
+      "learning_rate": 4.8125042194034285e-06,
+      "loss": 0.5503,
+      "step": 308
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.930004252757651,
+      "learning_rate": 4.811259851180845e-06,
+      "loss": 0.582,
+      "step": 309
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9179477992752856,
+      "learning_rate": 4.810011529269734e-06,
+      "loss": 0.5678,
+      "step": 310
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.023430757276848,
+      "learning_rate": 4.808759255805498e-06,
+      "loss": 0.614,
+      "step": 311
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.8334738409404936,
+      "learning_rate": 4.807503032930306e-06,
+      "loss": 0.5742,
+      "step": 312
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.937332706274502,
+      "learning_rate": 4.806242862793075e-06,
+      "loss": 0.6257,
+      "step": 313
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0265383045700363,
+      "learning_rate": 4.8049787475494786e-06,
+      "loss": 0.5733,
+      "step": 314
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.056444039073761,
+      "learning_rate": 4.803710689361939e-06,
+      "loss": 0.578,
+      "step": 315
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.411132719183335,
+      "learning_rate": 4.802438690399622e-06,
+      "loss": 0.5778,
+      "step": 316
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0233969242222853,
+      "learning_rate": 4.801162752838436e-06,
+      "loss": 0.5649,
+      "step": 317
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.2809121915132815,
+      "learning_rate": 4.799882878861025e-06,
+      "loss": 0.5589,
+      "step": 318
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9806834041020271,
+      "learning_rate": 4.798599070656768e-06,
+      "loss": 0.5753,
+      "step": 319
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.095099671577702,
+      "learning_rate": 4.797311330421773e-06,
+      "loss": 0.5644,
+      "step": 320
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1697606190375764,
+      "learning_rate": 4.796019660358877e-06,
+      "loss": 0.6009,
+      "step": 321
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9549416103216173,
+      "learning_rate": 4.794724062677635e-06,
+      "loss": 0.5429,
+      "step": 322
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9986949357292838,
+      "learning_rate": 4.793424539594323e-06,
+      "loss": 0.5456,
+      "step": 323
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9414831957796765,
+      "learning_rate": 4.792121093331935e-06,
+      "loss": 0.5468,
+      "step": 324
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.100702188933012,
+      "learning_rate": 4.7908137261201685e-06,
+      "loss": 0.5763,
+      "step": 325
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.2747471285831025,
+      "learning_rate": 4.789502440195436e-06,
+      "loss": 0.5637,
+      "step": 326
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8996382919319124,
+      "learning_rate": 4.788187237800849e-06,
+      "loss": 0.5285,
+      "step": 327
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.3451495174978847,
+      "learning_rate": 4.786868121186218e-06,
+      "loss": 0.5638,
+      "step": 328
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.0437536068229565,
+      "learning_rate": 4.7855450926080535e-06,
+      "loss": 0.5282,
+      "step": 329
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1185488514745554,
+      "learning_rate": 4.784218154329555e-06,
+      "loss": 0.5689,
+      "step": 330
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.08745956731504,
+      "learning_rate": 4.78288730862061e-06,
+      "loss": 0.5772,
+      "step": 331
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9479507156354359,
+      "learning_rate": 4.781552557757789e-06,
+      "loss": 0.5419,
+      "step": 332
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0211480847937255,
+      "learning_rate": 4.780213904024346e-06,
+      "loss": 0.5757,
+      "step": 333
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9075335749936069,
+      "learning_rate": 4.7788713497102094e-06,
+      "loss": 0.5693,
+      "step": 334
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9590727137410602,
+      "learning_rate": 4.777524897111979e-06,
+      "loss": 0.5501,
+      "step": 335
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0328480247612752,
+      "learning_rate": 4.776174548532926e-06,
+      "loss": 0.587,
+      "step": 336
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.062540517496736,
+      "learning_rate": 4.774820306282982e-06,
+      "loss": 0.5819,
+      "step": 337
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0054452800156195,
+      "learning_rate": 4.773462172678744e-06,
+      "loss": 0.5529,
+      "step": 338
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9641125644599562,
+      "learning_rate": 4.772100150043462e-06,
+      "loss": 0.5895,
+      "step": 339
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9196744569285298,
+      "learning_rate": 4.77073424070704e-06,
+      "loss": 0.5504,
+      "step": 340
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0002752186146484,
+      "learning_rate": 4.76936444700603e-06,
+      "loss": 0.5307,
+      "step": 341
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.1068919823054344,
+      "learning_rate": 4.76799077128363e-06,
+      "loss": 0.5908,
+      "step": 342
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.919597745459612,
+      "learning_rate": 4.766613215889678e-06,
+      "loss": 0.5423,
+      "step": 343
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0670928578728716,
+      "learning_rate": 4.765231783180648e-06,
+      "loss": 0.5901,
+      "step": 344
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.906116148793229,
+      "learning_rate": 4.763846475519648e-06,
+      "loss": 0.5919,
+      "step": 345
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9133575268702454,
+      "learning_rate": 4.762457295276413e-06,
+      "loss": 0.585,
+      "step": 346
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.133902651855379,
+      "learning_rate": 4.7610642448273025e-06,
+      "loss": 0.5444,
+      "step": 347
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.95222194640397,
+      "learning_rate": 4.7596673265552985e-06,
+      "loss": 0.5941,
+      "step": 348
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.095010268380277,
+      "learning_rate": 4.758266542849997e-06,
+      "loss": 0.6045,
+      "step": 349
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0493864712059655,
+      "learning_rate": 4.756861896107609e-06,
+      "loss": 0.6011,
+      "step": 350
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9222198823064967,
+      "learning_rate": 4.755453388730949e-06,
+      "loss": 0.5521,
+      "step": 351
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.368147154955994,
+      "learning_rate": 4.754041023129442e-06,
+      "loss": 0.6117,
+      "step": 352
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9734596786106697,
+      "learning_rate": 4.752624801719108e-06,
+      "loss": 0.5727,
+      "step": 353
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.151510566977991,
+      "learning_rate": 4.751204726922564e-06,
+      "loss": 0.6085,
+      "step": 354
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9291219072892685,
+      "learning_rate": 4.74978080116902e-06,
+      "loss": 0.5655,
+      "step": 355
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.838592559018919,
+      "learning_rate": 4.748353026894273e-06,
+      "loss": 0.5508,
+      "step": 356
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.069156589116884,
+      "learning_rate": 4.7469214065407e-06,
+      "loss": 0.5942,
+      "step": 357
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8960817746615841,
+      "learning_rate": 4.745485942557264e-06,
+      "loss": 0.5902,
+      "step": 358
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.0606557307859634,
+      "learning_rate": 4.744046637399497e-06,
+      "loss": 0.556,
+      "step": 359
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9660065879130573,
+      "learning_rate": 4.742603493529505e-06,
+      "loss": 0.5364,
+      "step": 360
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9647921383638112,
+      "learning_rate": 4.741156513415958e-06,
+      "loss": 0.5601,
+      "step": 361
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.049074688423064,
+      "learning_rate": 4.739705699534092e-06,
+      "loss": 0.556,
+      "step": 362
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.962593945802751,
+      "learning_rate": 4.738251054365697e-06,
+      "loss": 0.5609,
+      "step": 363
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.059675349950347,
+      "learning_rate": 4.736792580399119e-06,
+      "loss": 0.5499,
+      "step": 364
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8479566025134508,
+      "learning_rate": 4.7353302801292555e-06,
+      "loss": 0.5621,
+      "step": 365
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9405450724813613,
+      "learning_rate": 4.733864156057545e-06,
+      "loss": 0.5437,
+      "step": 366
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.122487864033456,
+      "learning_rate": 4.7323942106919715e-06,
+      "loss": 0.5984,
+      "step": 367
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.6822841144123046,
+      "learning_rate": 4.730920446547052e-06,
+      "loss": 0.5951,
+      "step": 368
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.001405394086718,
+      "learning_rate": 4.729442866143838e-06,
+      "loss": 0.5552,
+      "step": 369
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.081154186949651,
+      "learning_rate": 4.72796147200991e-06,
+      "loss": 0.587,
+      "step": 370
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.1196544292473236,
+      "learning_rate": 4.72647626667937e-06,
+      "loss": 0.5882,
+      "step": 371
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.107445583509131,
+      "learning_rate": 4.724987252692841e-06,
+      "loss": 0.5389,
+      "step": 372
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9529785007256542,
+      "learning_rate": 4.723494432597462e-06,
+      "loss": 0.6439,
+      "step": 373
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.11513441515607,
+      "learning_rate": 4.72199780894688e-06,
+      "loss": 0.6089,
+      "step": 374
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9769899713721226,
+      "learning_rate": 4.7204973843012504e-06,
+      "loss": 0.5393,
+      "step": 375
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.063749623036316,
+      "learning_rate": 4.718993161227231e-06,
+      "loss": 0.5987,
+      "step": 376
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.0515862288253883,
+      "learning_rate": 4.717485142297977e-06,
+      "loss": 0.5772,
+      "step": 377
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.8962297741946081,
+      "learning_rate": 4.715973330093135e-06,
+      "loss": 0.5424,
+      "step": 378
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.2210958340400087,
+      "learning_rate": 4.7144577271988435e-06,
+      "loss": 0.6072,
+      "step": 379
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.067113337475314,
+      "learning_rate": 4.712938336207724e-06,
+      "loss": 0.5482,
+      "step": 380
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.8985489253954526,
+      "learning_rate": 4.711415159718876e-06,
+      "loss": 0.5593,
+      "step": 381
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.085236381118245,
+      "learning_rate": 4.709888200337879e-06,
+      "loss": 0.5704,
+      "step": 382
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0967664183909784,
+      "learning_rate": 4.708357460676779e-06,
+      "loss": 0.5997,
+      "step": 383
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0454278026009645,
+      "learning_rate": 4.706822943354092e-06,
+      "loss": 0.5669,
+      "step": 384
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9171673309342674,
+      "learning_rate": 4.705284650994793e-06,
+      "loss": 0.517,
+      "step": 385
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.2003223432761287,
+      "learning_rate": 4.70374258623032e-06,
+      "loss": 0.5957,
+      "step": 386
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.936392519491186,
+      "learning_rate": 4.702196751698557e-06,
+      "loss": 0.5767,
+      "step": 387
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.354272003403086,
+      "learning_rate": 4.700647150043841e-06,
+      "loss": 0.6515,
+      "step": 388
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9115059027323418,
+      "learning_rate": 4.699093783916955e-06,
+      "loss": 0.5579,
+      "step": 389
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9878827587010002,
+      "learning_rate": 4.697536655975115e-06,
+      "loss": 0.572,
+      "step": 390
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9729552535473858,
+      "learning_rate": 4.69597576888198e-06,
+      "loss": 0.5665,
+      "step": 391
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.177634366499155,
+      "learning_rate": 4.694411125307632e-06,
+      "loss": 0.6363,
+      "step": 392
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8955146664976508,
+      "learning_rate": 4.692842727928584e-06,
+      "loss": 0.5682,
+      "step": 393
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.175305874476245,
+      "learning_rate": 4.691270579427769e-06,
+      "loss": 0.5943,
+      "step": 394
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.068140527232831,
+      "learning_rate": 4.689694682494537e-06,
+      "loss": 0.5659,
+      "step": 395
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9112960694448755,
+      "learning_rate": 4.688115039824648e-06,
+      "loss": 0.6048,
+      "step": 396
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9778305624626604,
+      "learning_rate": 4.686531654120272e-06,
+      "loss": 0.5695,
+      "step": 397
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.096904163204813,
+      "learning_rate": 4.684944528089981e-06,
+      "loss": 0.6113,
+      "step": 398
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.0011934144948516,
+      "learning_rate": 4.683353664448745e-06,
+      "loss": 0.5568,
+      "step": 399
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8562851971757464,
+      "learning_rate": 4.681759065917929e-06,
+      "loss": 0.5474,
+      "step": 400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8190547574166316,
+      "learning_rate": 4.680160735225285e-06,
+      "loss": 0.5315,
+      "step": 401
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9247862956929132,
+      "learning_rate": 4.6785586751049505e-06,
+      "loss": 0.5568,
+      "step": 402
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8469793674077621,
+      "learning_rate": 4.676952888297442e-06,
+      "loss": 0.5811,
+      "step": 403
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.946943145198674,
+      "learning_rate": 4.675343377549653e-06,
+      "loss": 0.5475,
+      "step": 404
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.991304422730463,
+      "learning_rate": 4.6737301456148445e-06,
+      "loss": 0.5856,
+      "step": 405
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9168241989446437,
+      "learning_rate": 4.672113195252644e-06,
+      "loss": 0.6069,
+      "step": 406
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9305433665377905,
+      "learning_rate": 4.670492529229039e-06,
+      "loss": 0.5536,
+      "step": 407
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8441008898830742,
+      "learning_rate": 4.668868150316377e-06,
+      "loss": 0.5859,
+      "step": 408
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8879301596961315,
+      "learning_rate": 4.667240061293351e-06,
+      "loss": 0.5483,
+      "step": 409
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.024767417636281,
+      "learning_rate": 4.665608264945004e-06,
+      "loss": 0.5414,
+      "step": 410
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.1331610141797395,
+      "learning_rate": 4.663972764062722e-06,
+      "loss": 0.5811,
+      "step": 411
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8132480265817386,
+      "learning_rate": 4.662333561444226e-06,
+      "loss": 0.5573,
+      "step": 412
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9795813972027145,
+      "learning_rate": 4.6606906598935675e-06,
+      "loss": 0.5814,
+      "step": 413
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8782931074297053,
+      "learning_rate": 4.6590440622211295e-06,
+      "loss": 0.569,
+      "step": 414
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8219945335518706,
+      "learning_rate": 4.657393771243614e-06,
+      "loss": 0.5669,
+      "step": 415
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.4047268604371306,
+      "learning_rate": 4.6557397897840454e-06,
+      "loss": 0.5602,
+      "step": 416
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.064501780523946,
+      "learning_rate": 4.654082120671757e-06,
+      "loss": 0.5699,
+      "step": 417
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9183128854940252,
+      "learning_rate": 4.65242076674239e-06,
+      "loss": 0.6112,
+      "step": 418
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9315698971629633,
+      "learning_rate": 4.650755730837894e-06,
+      "loss": 0.5537,
+      "step": 419
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9527809333659218,
+      "learning_rate": 4.649087015806509e-06,
+      "loss": 0.5423,
+      "step": 420
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8940523915995442,
+      "learning_rate": 4.647414624502777e-06,
+      "loss": 0.5708,
+      "step": 421
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9976964785548623,
+      "learning_rate": 4.645738559787524e-06,
+      "loss": 0.6006,
+      "step": 422
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9098681403283917,
+      "learning_rate": 4.64405882452786e-06,
+      "loss": 0.5591,
+      "step": 423
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8695612182804557,
+      "learning_rate": 4.642375421597175e-06,
+      "loss": 0.5219,
+      "step": 424
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8912077704810082,
+      "learning_rate": 4.6406883538751315e-06,
+      "loss": 0.5224,
+      "step": 425
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9390714726978922,
+      "learning_rate": 4.638997624247664e-06,
+      "loss": 0.5359,
+      "step": 426
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.051545992296337,
+      "learning_rate": 4.637303235606968e-06,
+      "loss": 0.544,
+      "step": 427
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.0657109136265914,
+      "learning_rate": 4.6356051908515e-06,
+      "loss": 0.5429,
+      "step": 428
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0301022307984793,
+      "learning_rate": 4.63390349288597e-06,
+      "loss": 0.5787,
+      "step": 429
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.052515756169346,
+      "learning_rate": 4.632198144621338e-06,
+      "loss": 0.5778,
+      "step": 430
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9741370495474897,
+      "learning_rate": 4.630489148974807e-06,
+      "loss": 0.5142,
+      "step": 431
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9713229498863698,
+      "learning_rate": 4.62877650886982e-06,
+      "loss": 0.6127,
+      "step": 432
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.1609440121306007,
+      "learning_rate": 4.627060227236055e-06,
+      "loss": 0.5886,
+      "step": 433
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.944966445355139,
+      "learning_rate": 4.625340307009418e-06,
+      "loss": 0.5657,
+      "step": 434
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.031003925680835,
+      "learning_rate": 4.623616751132041e-06,
+      "loss": 0.5628,
+      "step": 435
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8774113373137704,
+      "learning_rate": 4.621889562552272e-06,
+      "loss": 0.6068,
+      "step": 436
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0385201543401785,
+      "learning_rate": 4.620158744224677e-06,
+      "loss": 0.5511,
+      "step": 437
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8440750841938207,
+      "learning_rate": 4.618424299110028e-06,
+      "loss": 0.5261,
+      "step": 438
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8978691755923442,
+      "learning_rate": 4.616686230175303e-06,
+      "loss": 0.5862,
+      "step": 439
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8120850246861446,
+      "learning_rate": 4.614944540393679e-06,
+      "loss": 0.5652,
+      "step": 440
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1821084695714914,
+      "learning_rate": 4.613199232744525e-06,
+      "loss": 0.5598,
+      "step": 441
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9626422737625222,
+      "learning_rate": 4.611450310213401e-06,
+      "loss": 0.5267,
+      "step": 442
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9714913234889215,
+      "learning_rate": 4.6096977757920505e-06,
+      "loss": 0.5658,
+      "step": 443
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0179324078198233,
+      "learning_rate": 4.607941632478393e-06,
+      "loss": 0.582,
+      "step": 444
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.8565193856331161,
+      "learning_rate": 4.6061818832765246e-06,
+      "loss": 0.5715,
+      "step": 445
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9798501479599246,
+      "learning_rate": 4.604418531196708e-06,
+      "loss": 0.6007,
+      "step": 446
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0095846956468257,
+      "learning_rate": 4.602651579255369e-06,
+      "loss": 0.5947,
+      "step": 447
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9316541079988245,
+      "learning_rate": 4.600881030475093e-06,
+      "loss": 0.5501,
+      "step": 448
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.080069353365406,
+      "learning_rate": 4.599106887884616e-06,
+      "loss": 0.5631,
+      "step": 449
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.965973137652201,
+      "learning_rate": 4.5973291545188235e-06,
+      "loss": 0.5267,
+      "step": 450
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1082225966704087,
+      "learning_rate": 4.595547833418741e-06,
+      "loss": 0.6418,
+      "step": 451
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0359312594194083,
+      "learning_rate": 4.593762927631536e-06,
+      "loss": 0.5644,
+      "step": 452
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.1254892914109433,
+      "learning_rate": 4.591974440210502e-06,
+      "loss": 0.5693,
+      "step": 453
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9121188587334927,
+      "learning_rate": 4.590182374215064e-06,
+      "loss": 0.5572,
+      "step": 454
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9348642624953207,
+      "learning_rate": 4.588386732710765e-06,
+      "loss": 0.5446,
+      "step": 455
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8667846547370581,
+      "learning_rate": 4.5865875187692695e-06,
+      "loss": 0.5681,
+      "step": 456
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9219061327454674,
+      "learning_rate": 4.5847847354683465e-06,
+      "loss": 0.5508,
+      "step": 457
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8106132369123122,
+      "learning_rate": 4.5829783858918756e-06,
+      "loss": 0.5626,
+      "step": 458
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.7827483964442634,
+      "learning_rate": 4.5811684731298355e-06,
+      "loss": 0.5575,
+      "step": 459
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9284196979863513,
+      "learning_rate": 4.5793550002783e-06,
+      "loss": 0.5363,
+      "step": 460
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.029647468705457,
+      "learning_rate": 4.577537970439433e-06,
+      "loss": 0.5415,
+      "step": 461
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.0997127029950087,
+      "learning_rate": 4.575717386721482e-06,
+      "loss": 0.5814,
+      "step": 462
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9589290300656341,
+      "learning_rate": 4.573893252238777e-06,
+      "loss": 0.5156,
+      "step": 463
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.905237143908251,
+      "learning_rate": 4.572065570111717e-06,
+      "loss": 0.5536,
+      "step": 464
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.929519794935609,
+      "learning_rate": 4.570234343466775e-06,
+      "loss": 0.5879,
+      "step": 465
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.096095808886982,
+      "learning_rate": 4.568399575436484e-06,
+      "loss": 0.6241,
+      "step": 466
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9486118894048778,
+      "learning_rate": 4.566561269159437e-06,
+      "loss": 0.6307,
+      "step": 467
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.0839490306744586,
+      "learning_rate": 4.564719427780276e-06,
+      "loss": 0.5655,
+      "step": 468
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9439525665822102,
+      "learning_rate": 4.562874054449694e-06,
+      "loss": 0.5437,
+      "step": 469
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9409142791465297,
+      "learning_rate": 4.5610251523244244e-06,
+      "loss": 0.6429,
+      "step": 470
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8664574493795525,
+      "learning_rate": 4.559172724567238e-06,
+      "loss": 0.5826,
+      "step": 471
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.80819349503324,
+      "learning_rate": 4.557316774346934e-06,
+      "loss": 0.5372,
+      "step": 472
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8680097526865296,
+      "learning_rate": 4.555457304838341e-06,
+      "loss": 0.5503,
+      "step": 473
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.7466938790815696,
+      "learning_rate": 4.553594319222303e-06,
+      "loss": 0.5425,
+      "step": 474
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9610557658505607,
+      "learning_rate": 4.551727820685684e-06,
+      "loss": 0.5755,
+      "step": 475
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9414839604282412,
+      "learning_rate": 4.549857812421353e-06,
+      "loss": 0.5915,
+      "step": 476
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8484957644576423,
+      "learning_rate": 4.547984297628186e-06,
+      "loss": 0.5676,
+      "step": 477
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.074524028551078,
+      "learning_rate": 4.546107279511055e-06,
+      "loss": 0.6084,
+      "step": 478
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.069692704122282,
+      "learning_rate": 4.544226761280826e-06,
+      "loss": 0.5676,
+      "step": 479
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8975472248317244,
+      "learning_rate": 4.54234274615435e-06,
+      "loss": 0.5904,
+      "step": 480
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0118868982719897,
+      "learning_rate": 4.540455237354466e-06,
+      "loss": 0.5722,
+      "step": 481
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9733105429381828,
+      "learning_rate": 4.5385642381099814e-06,
+      "loss": 0.6112,
+      "step": 482
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.862156914026863,
+      "learning_rate": 4.53666975165568e-06,
+      "loss": 0.5951,
+      "step": 483
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9512940035297868,
+      "learning_rate": 4.53477178123231e-06,
+      "loss": 0.5223,
+      "step": 484
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9202464191558823,
+      "learning_rate": 4.532870330086577e-06,
+      "loss": 0.5638,
+      "step": 485
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9015767656854419,
+      "learning_rate": 4.530965401471143e-06,
+      "loss": 0.5911,
+      "step": 486
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.95190921973106,
+      "learning_rate": 4.529056998644619e-06,
+      "loss": 0.6053,
+      "step": 487
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0058459596081644,
+      "learning_rate": 4.527145124871556e-06,
+      "loss": 0.5466,
+      "step": 488
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8902620959998047,
+      "learning_rate": 4.5252297834224454e-06,
+      "loss": 0.5526,
+      "step": 489
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.985466416169018,
+      "learning_rate": 4.523310977573711e-06,
+      "loss": 0.5958,
+      "step": 490
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.1140148957176415,
+      "learning_rate": 4.521388710607699e-06,
+      "loss": 0.613,
+      "step": 491
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9470601192089525,
+      "learning_rate": 4.51946298581268e-06,
+      "loss": 0.5847,
+      "step": 492
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0227057176069603,
+      "learning_rate": 4.51753380648284e-06,
+      "loss": 0.5784,
+      "step": 493
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.05501863673554,
+      "learning_rate": 4.515601175918269e-06,
+      "loss": 0.5501,
+      "step": 494
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0129325402811715,
+      "learning_rate": 4.513665097424967e-06,
+      "loss": 0.5641,
+      "step": 495
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0322333044110468,
+      "learning_rate": 4.51172557431483e-06,
+      "loss": 0.5422,
+      "step": 496
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9573055659958774,
+      "learning_rate": 4.509782609905644e-06,
+      "loss": 0.516,
+      "step": 497
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8223127451485421,
+      "learning_rate": 4.507836207521085e-06,
+      "loss": 0.5714,
+      "step": 498
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9343089861079434,
+      "learning_rate": 4.50588637049071e-06,
+      "loss": 0.5424,
+      "step": 499
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8940990649350729,
+      "learning_rate": 4.503933102149948e-06,
+      "loss": 0.5832,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.908617301933682,
+      "learning_rate": 4.501976405840101e-06,
+      "loss": 0.5399,
+      "step": 501
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8290259512093785,
+      "learning_rate": 4.500016284908334e-06,
+      "loss": 0.5561,
+      "step": 502
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9840280991844164,
+      "learning_rate": 4.49805274270767e-06,
+      "loss": 0.5645,
+      "step": 503
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9864953051636856,
+      "learning_rate": 4.496085782596984e-06,
+      "loss": 0.5369,
+      "step": 504
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.979387839103732,
+      "learning_rate": 4.494115407940999e-06,
+      "loss": 0.6196,
+      "step": 505
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9266869362165981,
+      "learning_rate": 4.492141622110279e-06,
+      "loss": 0.5687,
+      "step": 506
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9887461782376619,
+      "learning_rate": 4.4901644284812205e-06,
+      "loss": 0.5264,
+      "step": 507
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8717867803152208,
+      "learning_rate": 4.488183830436052e-06,
+      "loss": 0.5612,
+      "step": 508
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.0044226171493,
+      "learning_rate": 4.486199831362828e-06,
+      "loss": 0.5571,
+      "step": 509
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.1075571016617958,
+      "learning_rate": 4.484212434655414e-06,
+      "loss": 0.5642,
+      "step": 510
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8031612547539957,
+      "learning_rate": 4.482221643713494e-06,
+      "loss": 0.5805,
+      "step": 511
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8782516337672304,
+      "learning_rate": 4.480227461942556e-06,
+      "loss": 0.5596,
+      "step": 512
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.075073901596185,
+      "learning_rate": 4.478229892753886e-06,
+      "loss": 0.6124,
+      "step": 513
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0588983460568304,
+      "learning_rate": 4.47622893956457e-06,
+      "loss": 0.5589,
+      "step": 514
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.850248236464706,
+      "learning_rate": 4.474224605797476e-06,
+      "loss": 0.5603,
+      "step": 515
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.932844310652863,
+      "learning_rate": 4.472216894881261e-06,
+      "loss": 0.5571,
+      "step": 516
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.09975454805468,
+      "learning_rate": 4.470205810250357e-06,
+      "loss": 0.5975,
+      "step": 517
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.9694087093010304,
+      "learning_rate": 4.468191355344965e-06,
+      "loss": 0.5698,
+      "step": 518
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8794788153917539,
+      "learning_rate": 4.466173533611053e-06,
+      "loss": 0.5559,
+      "step": 519
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0650455557855434,
+      "learning_rate": 4.46415234850035e-06,
+      "loss": 0.5644,
+      "step": 520
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0062649027982022,
+      "learning_rate": 4.462127803470334e-06,
+      "loss": 0.608,
+      "step": 521
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.043267877462657,
+      "learning_rate": 4.460099901984235e-06,
+      "loss": 0.573,
+      "step": 522
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.056372436619027,
+      "learning_rate": 4.4580686475110235e-06,
+      "loss": 0.5748,
+      "step": 523
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8871033520138176,
+      "learning_rate": 4.456034043525404e-06,
+      "loss": 0.5339,
+      "step": 524
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.889474616209236,
+      "learning_rate": 4.45399609350781e-06,
+      "loss": 0.5185,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9767406217632912,
+      "learning_rate": 4.451954800944405e-06,
+      "loss": 0.5758,
+      "step": 526
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9588695861513832,
+      "learning_rate": 4.449910169327062e-06,
+      "loss": 0.5472,
+      "step": 527
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8852210889000718,
+      "learning_rate": 4.447862202153372e-06,
+      "loss": 0.5917,
+      "step": 528
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0103638871993077,
+      "learning_rate": 4.445810902926629e-06,
+      "loss": 0.5761,
+      "step": 529
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.201836945389513,
+      "learning_rate": 4.443756275155827e-06,
+      "loss": 0.5614,
+      "step": 530
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.900702305836831,
+      "learning_rate": 4.441698322355656e-06,
+      "loss": 0.5254,
+      "step": 531
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.134694583439314,
+      "learning_rate": 4.4396370480464915e-06,
+      "loss": 0.5607,
+      "step": 532
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8073751630381198,
+      "learning_rate": 4.437572455754391e-06,
+      "loss": 0.536,
+      "step": 533
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9607338020142653,
+      "learning_rate": 4.435504549011088e-06,
+      "loss": 0.59,
+      "step": 534
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0756430867435274,
+      "learning_rate": 4.433433331353988e-06,
+      "loss": 0.5538,
+      "step": 535
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8280570853718465,
+      "learning_rate": 4.431358806326158e-06,
+      "loss": 0.5789,
+      "step": 536
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2005143967434977,
+      "learning_rate": 4.429280977476321e-06,
+      "loss": 0.545,
+      "step": 537
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.896479397543979,
+      "learning_rate": 4.4271998483588565e-06,
+      "loss": 0.5791,
+      "step": 538
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.117773381781195,
+      "learning_rate": 4.425115422533785e-06,
+      "loss": 0.5234,
+      "step": 539
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.4438942429566617,
+      "learning_rate": 4.423027703566769e-06,
+      "loss": 0.5692,
+      "step": 540
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.873481152225171,
+      "learning_rate": 4.4209366950291025e-06,
+      "loss": 0.5739,
+      "step": 541
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8655199147974673,
+      "learning_rate": 4.4188424004977085e-06,
+      "loss": 0.5795,
+      "step": 542
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.948840412241188,
+      "learning_rate": 4.416744823555129e-06,
+      "loss": 0.5304,
+      "step": 543
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8389034133315045,
+      "learning_rate": 4.414643967789523e-06,
+      "loss": 0.5076,
+      "step": 544
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8269235720085213,
+      "learning_rate": 4.412539836794657e-06,
+      "loss": 0.5837,
+      "step": 545
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.1298715969759505,
+      "learning_rate": 4.410432434169902e-06,
+      "loss": 0.5694,
+      "step": 546
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0057741366005746,
+      "learning_rate": 4.408321763520223e-06,
+      "loss": 0.557,
+      "step": 547
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.7901331374893255,
+      "learning_rate": 4.406207828456177e-06,
+      "loss": 0.5746,
+      "step": 548
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.1994839889416187,
+      "learning_rate": 4.404090632593904e-06,
+      "loss": 0.5407,
+      "step": 549
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9664921082690268,
+      "learning_rate": 4.401970179555123e-06,
+      "loss": 0.5322,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9933486180243851,
+      "learning_rate": 4.399846472967124e-06,
+      "loss": 0.5798,
+      "step": 551
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.986612256562151,
+      "learning_rate": 4.397719516462765e-06,
+      "loss": 0.5213,
+      "step": 552
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.046550123292336,
+      "learning_rate": 4.395589313680459e-06,
+      "loss": 0.5857,
+      "step": 553
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.7902327250340486,
+      "learning_rate": 4.393455868264176e-06,
+      "loss": 0.555,
+      "step": 554
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.0203627138517146,
+      "learning_rate": 4.391319183863432e-06,
+      "loss": 0.6329,
+      "step": 555
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9373549045181289,
+      "learning_rate": 4.389179264133281e-06,
+      "loss": 0.566,
+      "step": 556
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8936753353678124,
+      "learning_rate": 4.387036112734316e-06,
+      "loss": 0.5555,
+      "step": 557
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8493817575820743,
+      "learning_rate": 4.3848897333326545e-06,
+      "loss": 0.5427,
+      "step": 558
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9119588677783816,
+      "learning_rate": 4.382740129599937e-06,
+      "loss": 0.5157,
+      "step": 559
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8190137094200924,
+      "learning_rate": 4.380587305213321e-06,
+      "loss": 0.503,
+      "step": 560
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9891332712764953,
+      "learning_rate": 4.37843126385547e-06,
+      "loss": 0.5761,
+      "step": 561
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8620896547461154,
+      "learning_rate": 4.376272009214555e-06,
+      "loss": 0.5259,
+      "step": 562
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8896721756477406,
+      "learning_rate": 4.37410954498424e-06,
+      "loss": 0.5632,
+      "step": 563
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8302281976781984,
+      "learning_rate": 4.37194387486368e-06,
+      "loss": 0.5612,
+      "step": 564
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0721820586440165,
+      "learning_rate": 4.369775002557516e-06,
+      "loss": 0.533,
+      "step": 565
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8259926551813157,
+      "learning_rate": 4.367602931775865e-06,
+      "loss": 0.526,
+      "step": 566
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8096334574000785,
+      "learning_rate": 4.3654276662343155e-06,
+      "loss": 0.5306,
+      "step": 567
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9675637591445598,
+      "learning_rate": 4.363249209653922e-06,
+      "loss": 0.5577,
+      "step": 568
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8800389115841605,
+      "learning_rate": 4.361067565761197e-06,
+      "loss": 0.5553,
+      "step": 569
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.827485496395265,
+      "learning_rate": 4.358882738288105e-06,
+      "loss": 0.5587,
+      "step": 570
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.820954908943235,
+      "learning_rate": 4.356694730972056e-06,
+      "loss": 0.6186,
+      "step": 571
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.952072431699686,
+      "learning_rate": 4.3545035475559025e-06,
+      "loss": 0.5488,
+      "step": 572
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8292648968688423,
+      "learning_rate": 4.352309191787924e-06,
+      "loss": 0.5534,
+      "step": 573
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.826293122529813,
+      "learning_rate": 4.350111667421835e-06,
+      "loss": 0.5872,
+      "step": 574
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9251425791166785,
+      "learning_rate": 4.347910978216763e-06,
+      "loss": 0.5298,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8330818196811385,
+      "learning_rate": 4.345707127937253e-06,
+      "loss": 0.5871,
+      "step": 576
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.7842986545873851,
+      "learning_rate": 4.3435001203532555e-06,
+      "loss": 0.4898,
+      "step": 577
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8778666245156521,
+      "learning_rate": 4.341289959240124e-06,
+      "loss": 0.5385,
+      "step": 578
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9300679499181266,
+      "learning_rate": 4.339076648378605e-06,
+      "loss": 0.5698,
+      "step": 579
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9440861965960357,
+      "learning_rate": 4.336860191554833e-06,
+      "loss": 0.5984,
+      "step": 580
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.929951096053947,
+      "learning_rate": 4.3346405925603265e-06,
+      "loss": 0.6222,
+      "step": 581
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9138258400335695,
+      "learning_rate": 4.332417855191974e-06,
+      "loss": 0.5498,
+      "step": 582
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.058548455869675,
+      "learning_rate": 4.330191983252039e-06,
+      "loss": 0.5218,
+      "step": 583
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.243429045583125,
+      "learning_rate": 4.327962980548142e-06,
+      "loss": 0.5768,
+      "step": 584
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9213537104634244,
+      "learning_rate": 4.32573085089326e-06,
+      "loss": 0.5784,
+      "step": 585
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9165291289119128,
+      "learning_rate": 4.32349559810572e-06,
+      "loss": 0.5697,
+      "step": 586
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9674279518735756,
+      "learning_rate": 4.321257226009193e-06,
+      "loss": 0.5104,
+      "step": 587
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9051339015323923,
+      "learning_rate": 4.319015738432683e-06,
+      "loss": 0.5711,
+      "step": 588
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.957357618850765,
+      "learning_rate": 4.3167711392105245e-06,
+      "loss": 0.5854,
+      "step": 589
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9859311708308915,
+      "learning_rate": 4.314523432182376e-06,
+      "loss": 0.547,
+      "step": 590
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.773704456523191,
+      "learning_rate": 4.312272621193209e-06,
+      "loss": 0.5259,
+      "step": 591
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.82988033655793,
+      "learning_rate": 4.31001871009331e-06,
+      "loss": 0.5209,
+      "step": 592
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8925134832060522,
+      "learning_rate": 4.307761702738264e-06,
+      "loss": 0.59,
+      "step": 593
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8477075780641046,
+      "learning_rate": 4.305501602988953e-06,
+      "loss": 0.5714,
+      "step": 594
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8568432886623798,
+      "learning_rate": 4.303238414711552e-06,
+      "loss": 0.5877,
+      "step": 595
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8179798660158206,
+      "learning_rate": 4.3009721417775166e-06,
+      "loss": 0.6029,
+      "step": 596
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8494963193854803,
+      "learning_rate": 4.29870278806358e-06,
+      "loss": 0.5236,
+      "step": 597
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9586017397154731,
+      "learning_rate": 4.296430357451744e-06,
+      "loss": 0.5998,
+      "step": 598
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.926616057974202,
+      "learning_rate": 4.2941548538292765e-06,
+      "loss": 0.5914,
+      "step": 599
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9321738359144827,
+      "learning_rate": 4.291876281088701e-06,
+      "loss": 0.5358,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8229177571361932,
+      "learning_rate": 4.289594643127788e-06,
+      "loss": 0.5284,
+      "step": 601
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.849252449531427,
+      "learning_rate": 4.287309943849558e-06,
+      "loss": 0.5689,
+      "step": 602
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.985343175388319,
+      "learning_rate": 4.285022187162261e-06,
+      "loss": 0.6101,
+      "step": 603
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9437791826489255,
+      "learning_rate": 4.2827313769793835e-06,
+      "loss": 0.5419,
+      "step": 604
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8027421078538746,
+      "learning_rate": 4.28043751721963e-06,
+      "loss": 0.5504,
+      "step": 605
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8221230935939319,
+      "learning_rate": 4.278140611806926e-06,
+      "loss": 0.5284,
+      "step": 606
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8597205853821357,
+      "learning_rate": 4.275840664670403e-06,
+      "loss": 0.623,
+      "step": 607
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.7801370844338822,
+      "learning_rate": 4.2735376797444e-06,
+      "loss": 0.5265,
+      "step": 608
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9028094416250234,
+      "learning_rate": 4.271231660968449e-06,
+      "loss": 0.5764,
+      "step": 609
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9385737581380094,
+      "learning_rate": 4.268922612287273e-06,
+      "loss": 0.6047,
+      "step": 610
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.760006169733744,
+      "learning_rate": 4.266610537650778e-06,
+      "loss": 0.4944,
+      "step": 611
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.857083980479501,
+      "learning_rate": 4.264295441014047e-06,
+      "loss": 0.5174,
+      "step": 612
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8299942480819913,
+      "learning_rate": 4.261977326337332e-06,
+      "loss": 0.5814,
+      "step": 613
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8943903433033418,
+      "learning_rate": 4.259656197586046e-06,
+      "loss": 0.5514,
+      "step": 614
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.7839062839610529,
+      "learning_rate": 4.257332058730761e-06,
+      "loss": 0.5857,
+      "step": 615
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.7188975139736256,
+      "learning_rate": 4.255004913747196e-06,
+      "loss": 0.5509,
+      "step": 616
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8767461602206779,
+      "learning_rate": 4.252674766616212e-06,
+      "loss": 0.5038,
+      "step": 617
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8391588901867753,
+      "learning_rate": 4.250341621323809e-06,
+      "loss": 0.5196,
+      "step": 618
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8106924420187829,
+      "learning_rate": 4.248005481861111e-06,
+      "loss": 0.5458,
+      "step": 619
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9698953511074666,
+      "learning_rate": 4.245666352224367e-06,
+      "loss": 0.5963,
+      "step": 620
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.8890424031569348,
+      "learning_rate": 4.243324236414939e-06,
+      "loss": 0.5277,
+      "step": 621
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8537879418167673,
+      "learning_rate": 4.240979138439301e-06,
+      "loss": 0.5407,
+      "step": 622
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9264981771759184,
+      "learning_rate": 4.238631062309023e-06,
+      "loss": 0.5788,
+      "step": 623
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.949693389062837,
+      "learning_rate": 4.236280012040773e-06,
+      "loss": 0.5007,
+      "step": 624
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8845778025905608,
+      "learning_rate": 4.233925991656307e-06,
+      "loss": 0.5905,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8977167810192608,
+      "learning_rate": 4.231569005182459e-06,
+      "loss": 0.5342,
+      "step": 626
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.9579196623045914,
+      "learning_rate": 4.229209056651139e-06,
+      "loss": 0.554,
+      "step": 627
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.8427820272426025,
+      "learning_rate": 4.226846150099324e-06,
+      "loss": 0.5629,
+      "step": 628
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.865218131227253,
+      "learning_rate": 4.22448028956905e-06,
+      "loss": 0.558,
+      "step": 629
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.7348773966225364,
+      "learning_rate": 4.222111479107406e-06,
+      "loss": 0.5332,
+      "step": 630
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.779367140127678,
+      "learning_rate": 4.219739722766528e-06,
+      "loss": 0.569,
+      "step": 631
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.92860570712595,
+      "learning_rate": 4.217365024603592e-06,
+      "loss": 0.5342,
+      "step": 632
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.946965997476449,
+      "learning_rate": 4.214987388680804e-06,
+      "loss": 0.5482,
+      "step": 633
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7930454990298659,
+      "learning_rate": 4.212606819065399e-06,
+      "loss": 0.5376,
+      "step": 634
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8379498458279013,
+      "learning_rate": 4.210223319829626e-06,
+      "loss": 0.5741,
+      "step": 635
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.742977498596499,
+      "learning_rate": 4.207836895050748e-06,
+      "loss": 0.5569,
+      "step": 636
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.852541709372898,
+      "learning_rate": 4.205447548811032e-06,
+      "loss": 0.578,
+      "step": 637
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8180259569107267,
+      "learning_rate": 4.203055285197745e-06,
+      "loss": 0.5189,
+      "step": 638
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8177842562763082,
+      "learning_rate": 4.20066010830314e-06,
+      "loss": 0.5424,
+      "step": 639
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8068654723170434,
+      "learning_rate": 4.198262022224457e-06,
+      "loss": 0.5336,
+      "step": 640
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.9664843499052276,
+      "learning_rate": 4.195861031063909e-06,
+      "loss": 0.5399,
+      "step": 641
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.7812265481792608,
+      "learning_rate": 4.193457138928683e-06,
+      "loss": 0.534,
+      "step": 642
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.908377487778027,
+      "learning_rate": 4.191050349930925e-06,
+      "loss": 0.5831,
+      "step": 643
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8124678634933105,
+      "learning_rate": 4.18864066818774e-06,
+      "loss": 0.5309,
+      "step": 644
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.902443199964304,
+      "learning_rate": 4.186228097821176e-06,
+      "loss": 0.5452,
+      "step": 645
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9694387068719457,
+      "learning_rate": 4.183812642958227e-06,
+      "loss": 0.5462,
+      "step": 646
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945352264767711,
+      "learning_rate": 4.181394307730819e-06,
+      "loss": 0.4853,
+      "step": 647
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7967416728436914,
+      "learning_rate": 4.178973096275806e-06,
+      "loss": 0.5952,
+      "step": 648
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.0602433101771616,
+      "learning_rate": 4.176549012734963e-06,
+      "loss": 0.6346,
+      "step": 649
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9158731498204968,
+      "learning_rate": 4.1741220612549746e-06,
+      "loss": 0.5101,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.951875972207364,
+      "learning_rate": 4.171692245987436e-06,
+      "loss": 0.5718,
+      "step": 651
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.871788727804539,
+      "learning_rate": 4.169259571088839e-06,
+      "loss": 0.5516,
+      "step": 652
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.945571804366465,
+      "learning_rate": 4.166824040720566e-06,
+      "loss": 0.5544,
+      "step": 653
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.8975723622706568,
+      "learning_rate": 4.1643856590488866e-06,
+      "loss": 0.5643,
+      "step": 654
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9772846459626554,
+      "learning_rate": 4.161944430244945e-06,
+      "loss": 0.5487,
+      "step": 655
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.036472038769578,
+      "learning_rate": 4.159500358484759e-06,
+      "loss": 0.5232,
+      "step": 656
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7742095436926848,
+      "learning_rate": 4.157053447949206e-06,
+      "loss": 0.4963,
+      "step": 657
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.1819742476725814,
+      "learning_rate": 4.154603702824023e-06,
+      "loss": 0.5416,
+      "step": 658
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9151345309457093,
+      "learning_rate": 4.152151127299794e-06,
+      "loss": 0.5822,
+      "step": 659
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.033640859083771,
+      "learning_rate": 4.149695725571944e-06,
+      "loss": 0.5876,
+      "step": 660
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.8935471013235925,
+      "learning_rate": 4.147237501840734e-06,
+      "loss": 0.548,
+      "step": 661
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7836299476774775,
+      "learning_rate": 4.144776460311253e-06,
+      "loss": 0.5274,
+      "step": 662
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.194666072449123,
+      "learning_rate": 4.142312605193407e-06,
+      "loss": 0.5934,
+      "step": 663
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.988265407508224,
+      "learning_rate": 4.13984594070192e-06,
+      "loss": 0.5539,
+      "step": 664
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7594955740187146,
+      "learning_rate": 4.137376471056317e-06,
+      "loss": 0.5324,
+      "step": 665
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9342530277100989,
+      "learning_rate": 4.1349042004809224e-06,
+      "loss": 0.5902,
+      "step": 666
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9757082453588417,
+      "learning_rate": 4.132429133204856e-06,
+      "loss": 0.5874,
+      "step": 667
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.7792467343474774,
+      "learning_rate": 4.129951273462016e-06,
+      "loss": 0.5516,
+      "step": 668
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.9010392264817964,
+      "learning_rate": 4.127470625491082e-06,
+      "loss": 0.5793,
+      "step": 669
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.054505290884914,
+      "learning_rate": 4.1249871935355e-06,
+      "loss": 0.5718,
+      "step": 670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8010036617727825,
+      "learning_rate": 4.1225009818434805e-06,
+      "loss": 0.5698,
+      "step": 671
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.975020822034628,
+      "learning_rate": 4.120011994667988e-06,
+      "loss": 0.5739,
+      "step": 672
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9801075045379748,
+      "learning_rate": 4.117520236266734e-06,
+      "loss": 0.5589,
+      "step": 673
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7773808874926829,
+      "learning_rate": 4.115025710902173e-06,
+      "loss": 0.5276,
+      "step": 674
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.890298398205481,
+      "learning_rate": 4.112528422841491e-06,
+      "loss": 0.4914,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.9087570296379215,
+      "learning_rate": 4.110028376356599e-06,
+      "loss": 0.5412,
+      "step": 676
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8908271691889404,
+      "learning_rate": 4.1075255757241295e-06,
+      "loss": 0.5618,
+      "step": 677
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.024312170169272,
+      "learning_rate": 4.105020025225423e-06,
+      "loss": 0.5618,
+      "step": 678
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8072403207581518,
+      "learning_rate": 4.102511729146528e-06,
+      "loss": 0.5744,
+      "step": 679
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7750572145097157,
+      "learning_rate": 4.100000691778185e-06,
+      "loss": 0.5716,
+      "step": 680
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.8778337896632162,
+      "learning_rate": 4.097486917415827e-06,
+      "loss": 0.5683,
+      "step": 681
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9710167098273688,
+      "learning_rate": 4.094970410359568e-06,
+      "loss": 0.5273,
+      "step": 682
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9136975523972874,
+      "learning_rate": 4.092451174914196e-06,
+      "loss": 0.5239,
+      "step": 683
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.929344793900944,
+      "learning_rate": 4.089929215389167e-06,
+      "loss": 0.5388,
+      "step": 684
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.7211535229712278,
+      "learning_rate": 4.087404536098597e-06,
+      "loss": 0.5068,
+      "step": 685
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8739637749458882,
+      "learning_rate": 4.084877141361254e-06,
+      "loss": 0.5537,
+      "step": 686
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9268469960932768,
+      "learning_rate": 4.082347035500553e-06,
+      "loss": 0.5875,
+      "step": 687
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.896542320004603,
+      "learning_rate": 4.079814222844541e-06,
+      "loss": 0.5314,
+      "step": 688
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.723925126440519,
+      "learning_rate": 4.077278707725904e-06,
+      "loss": 0.5009,
+      "step": 689
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8345210205201996,
+      "learning_rate": 4.074740494481942e-06,
+      "loss": 0.5544,
+      "step": 690
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.766819080519227,
+      "learning_rate": 4.072199587454578e-06,
+      "loss": 0.5393,
+      "step": 691
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9577975399484282,
+      "learning_rate": 4.069655990990337e-06,
+      "loss": 0.5357,
+      "step": 692
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.8254761359015224,
+      "learning_rate": 4.06710970944035e-06,
+      "loss": 0.5797,
+      "step": 693
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.1203973374999214,
+      "learning_rate": 4.064560747160337e-06,
+      "loss": 0.5811,
+      "step": 694
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.9066221824053846,
+      "learning_rate": 4.062009108510605e-06,
+      "loss": 0.5014,
+      "step": 695
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.951489716071849,
+      "learning_rate": 4.059454797856039e-06,
+      "loss": 0.529,
+      "step": 696
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8402907113209426,
+      "learning_rate": 4.056897819566096e-06,
+      "loss": 0.4942,
+      "step": 697
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0368715640768498,
+      "learning_rate": 4.0543381780147965e-06,
+      "loss": 0.5245,
+      "step": 698
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8154462049772704,
+      "learning_rate": 4.0517758775807135e-06,
+      "loss": 0.4979,
+      "step": 699
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.890388895335948,
+      "learning_rate": 4.049210922646973e-06,
+      "loss": 0.5212,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0215900504030166,
+      "learning_rate": 4.046643317601237e-06,
+      "loss": 0.5384,
+      "step": 701
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.816997259900234,
+      "learning_rate": 4.0440730668357076e-06,
+      "loss": 0.492,
+      "step": 702
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.968633766153865,
+      "learning_rate": 4.0415001747471036e-06,
+      "loss": 0.5917,
+      "step": 703
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.8313487810801756,
+      "learning_rate": 4.0389246457366696e-06,
+      "loss": 0.5561,
+      "step": 704
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.7954421155528784,
+      "learning_rate": 4.036346484210159e-06,
+      "loss": 0.5383,
+      "step": 705
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8517101217315919,
+      "learning_rate": 4.033765694577826e-06,
+      "loss": 0.5368,
+      "step": 706
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8888441616203875,
+      "learning_rate": 4.031182281254423e-06,
+      "loss": 0.5895,
+      "step": 707
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8131436351862782,
+      "learning_rate": 4.028596248659191e-06,
+      "loss": 0.5346,
+      "step": 708
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8803113487311214,
+      "learning_rate": 4.0260076012158486e-06,
+      "loss": 0.4987,
+      "step": 709
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8989122650791335,
+      "learning_rate": 4.023416343352589e-06,
+      "loss": 0.5007,
+      "step": 710
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.9466291969735336,
+      "learning_rate": 4.020822479502074e-06,
+      "loss": 0.5868,
+      "step": 711
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.869533367998661,
+      "learning_rate": 4.018226014101418e-06,
+      "loss": 0.5995,
+      "step": 712
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.93738608926368,
+      "learning_rate": 4.015626951592187e-06,
+      "loss": 0.5625,
+      "step": 713
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8485080870897803,
+      "learning_rate": 4.013025296420394e-06,
+      "loss": 0.5585,
+      "step": 714
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8099669115387913,
+      "learning_rate": 4.010421053036481e-06,
+      "loss": 0.5384,
+      "step": 715
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8810123612010912,
+      "learning_rate": 4.007814225895321e-06,
+      "loss": 0.5589,
+      "step": 716
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8692823610937885,
+      "learning_rate": 4.005204819456205e-06,
+      "loss": 0.5474,
+      "step": 717
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8120887102918588,
+      "learning_rate": 4.00259283818284e-06,
+      "loss": 0.5138,
+      "step": 718
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7933926935301234,
+      "learning_rate": 3.999978286543331e-06,
+      "loss": 0.5235,
+      "step": 719
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8382360731306235,
+      "learning_rate": 3.997361169010187e-06,
+      "loss": 0.5846,
+      "step": 720
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.993925306673069,
+      "learning_rate": 3.994741490060301e-06,
+      "loss": 0.5561,
+      "step": 721
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.900088669959918,
+      "learning_rate": 3.9921192541749505e-06,
+      "loss": 0.5215,
+      "step": 722
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9250072769385074,
+      "learning_rate": 3.989494465839785e-06,
+      "loss": 0.54,
+      "step": 723
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7928905908766457,
+      "learning_rate": 3.986867129544822e-06,
+      "loss": 0.6066,
+      "step": 724
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9474900039545116,
+      "learning_rate": 3.984237249784437e-06,
+      "loss": 0.5173,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9004077336349998,
+      "learning_rate": 3.981604831057357e-06,
+      "loss": 0.5409,
+      "step": 726
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.7573843693188624,
+      "learning_rate": 3.97896987786665e-06,
+      "loss": 0.5239,
+      "step": 727
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.899283660379949,
+      "learning_rate": 3.976332394719721e-06,
+      "loss": 0.4977,
+      "step": 728
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.8353476568345033,
+      "learning_rate": 3.973692386128304e-06,
+      "loss": 0.5834,
+      "step": 729
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.032325534167748,
+      "learning_rate": 3.971049856608451e-06,
+      "loss": 0.5343,
+      "step": 730
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8161347764383835,
+      "learning_rate": 3.9684048106805286e-06,
+      "loss": 0.585,
+      "step": 731
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.836376388525165,
+      "learning_rate": 3.965757252869204e-06,
+      "loss": 0.5978,
+      "step": 732
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.889118862096067,
+      "learning_rate": 3.963107187703446e-06,
+      "loss": 0.5393,
+      "step": 733
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7772829607776217,
+      "learning_rate": 3.96045461971651e-06,
+      "loss": 0.5164,
+      "step": 734
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.7980410807492582,
+      "learning_rate": 3.957799553445932e-06,
+      "loss": 0.5455,
+      "step": 735
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.907936099702467,
+      "learning_rate": 3.955141993433526e-06,
+      "loss": 0.532,
+      "step": 736
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.8668064740862462,
+      "learning_rate": 3.9524819442253645e-06,
+      "loss": 0.5578,
+      "step": 737
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.838952740378055,
+      "learning_rate": 3.949819410371785e-06,
+      "loss": 0.5784,
+      "step": 738
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9595767898211005,
+      "learning_rate": 3.947154396427373e-06,
+      "loss": 0.5213,
+      "step": 739
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9422968944070973,
+      "learning_rate": 3.944486906950954e-06,
+      "loss": 0.5709,
+      "step": 740
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.760556693040696,
+      "learning_rate": 3.941816946505592e-06,
+      "loss": 0.5564,
+      "step": 741
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8054841879427592,
+      "learning_rate": 3.939144519658575e-06,
+      "loss": 0.5435,
+      "step": 742
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.1072923992538,
+      "learning_rate": 3.936469630981412e-06,
+      "loss": 0.5622,
+      "step": 743
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.711687978027928,
+      "learning_rate": 3.933792285049821e-06,
+      "loss": 0.5554,
+      "step": 744
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8166543944942228,
+      "learning_rate": 3.931112486443727e-06,
+      "loss": 0.5079,
+      "step": 745
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.7923405334139695,
+      "learning_rate": 3.928430239747246e-06,
+      "loss": 0.5692,
+      "step": 746
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9611773239667012,
+      "learning_rate": 3.925745549548687e-06,
+      "loss": 0.5092,
+      "step": 747
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8440088039871827,
+      "learning_rate": 3.923058420440534e-06,
+      "loss": 0.5369,
+      "step": 748
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9272316571307881,
+      "learning_rate": 3.920368857019447e-06,
+      "loss": 0.5798,
+      "step": 749
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8248503445199376,
+      "learning_rate": 3.917676863886246e-06,
+      "loss": 0.5479,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9200626612083824,
+      "learning_rate": 3.914982445645912e-06,
+      "loss": 0.549,
+      "step": 751
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.8585556832275227,
+      "learning_rate": 3.91228560690757e-06,
+      "loss": 0.5283,
+      "step": 752
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.819239895382093,
+      "learning_rate": 3.90958635228449e-06,
+      "loss": 0.535,
+      "step": 753
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7810389942543545,
+      "learning_rate": 3.90688468639407e-06,
+      "loss": 0.5125,
+      "step": 754
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.9614453700373935,
+      "learning_rate": 3.904180613857837e-06,
+      "loss": 0.5406,
+      "step": 755
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.805104940263808,
+      "learning_rate": 3.901474139301433e-06,
+      "loss": 0.5794,
+      "step": 756
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.78756289235025,
+      "learning_rate": 3.898765267354607e-06,
+      "loss": 0.569,
+      "step": 757
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.912300438003516,
+      "learning_rate": 3.896054002651213e-06,
+      "loss": 0.5565,
+      "step": 758
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8148356694353722,
+      "learning_rate": 3.893340349829195e-06,
+      "loss": 0.5471,
+      "step": 759
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.6836223387492706,
+      "learning_rate": 3.890624313530583e-06,
+      "loss": 0.5145,
+      "step": 760
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8389298216964765,
+      "learning_rate": 3.887905898401485e-06,
+      "loss": 0.5441,
+      "step": 761
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7845754057436856,
+      "learning_rate": 3.885185109092078e-06,
+      "loss": 0.5478,
+      "step": 762
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.77076035925993,
+      "learning_rate": 3.882461950256598e-06,
+      "loss": 0.5497,
+      "step": 763
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8011284465286703,
+      "learning_rate": 3.87973642655334e-06,
+      "loss": 0.5039,
+      "step": 764
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.7400129481667248,
+      "learning_rate": 3.877008542644637e-06,
+      "loss": 0.5243,
+      "step": 765
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9899565111682327,
+      "learning_rate": 3.874278303196866e-06,
+      "loss": 0.5767,
+      "step": 766
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8345576263874734,
+      "learning_rate": 3.871545712880429e-06,
+      "loss": 0.5262,
+      "step": 767
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8375211207672395,
+      "learning_rate": 3.8688107763697505e-06,
+      "loss": 0.5467,
+      "step": 768
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8068462280574835,
+      "learning_rate": 3.8660734983432715e-06,
+      "loss": 0.5256,
+      "step": 769
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7823522202158735,
+      "learning_rate": 3.863333883483433e-06,
+      "loss": 0.5419,
+      "step": 770
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8881514180214427,
+      "learning_rate": 3.86059193647668e-06,
+      "loss": 0.541,
+      "step": 771
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.8311064595650786,
+      "learning_rate": 3.85784766201344e-06,
+      "loss": 0.5455,
+      "step": 772
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.9833459774866717,
+      "learning_rate": 3.855101064788126e-06,
+      "loss": 0.5723,
+      "step": 773
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7968096633022903,
+      "learning_rate": 3.852352149499125e-06,
+      "loss": 0.5153,
+      "step": 774
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.775423895652992,
+      "learning_rate": 3.849600920848787e-06,
+      "loss": 0.5134,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7262892998825556,
+      "learning_rate": 3.84684738354342e-06,
+      "loss": 0.5287,
+      "step": 776
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7866135638778051,
+      "learning_rate": 3.84409154229328e-06,
+      "loss": 0.57,
+      "step": 777
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.787377916112687,
+      "learning_rate": 3.841333401812569e-06,
+      "loss": 0.5312,
+      "step": 778
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.684801862246949,
+      "learning_rate": 3.838572966819416e-06,
+      "loss": 0.5822,
+      "step": 779
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.79074773131748,
+      "learning_rate": 3.835810242035879e-06,
+      "loss": 0.5651,
+      "step": 780
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9234904827178134,
+      "learning_rate": 3.8330452321879305e-06,
+      "loss": 0.5527,
+      "step": 781
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.1733402579018186,
+      "learning_rate": 3.830277942005455e-06,
+      "loss": 0.5545,
+      "step": 782
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.112229504682016,
+      "learning_rate": 3.827508376222233e-06,
+      "loss": 0.5766,
+      "step": 783
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.087174122744587,
+      "learning_rate": 3.824736539575944e-06,
+      "loss": 0.549,
+      "step": 784
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9570382810890106,
+      "learning_rate": 3.821962436808145e-06,
+      "loss": 0.4984,
+      "step": 785
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.94720853153738,
+      "learning_rate": 3.819186072664277e-06,
+      "loss": 0.5303,
+      "step": 786
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.21095404069362,
+      "learning_rate": 3.816407451893643e-06,
+      "loss": 0.5674,
+      "step": 787
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.7284336698899117,
+      "learning_rate": 3.8136265792494094e-06,
+      "loss": 0.5952,
+      "step": 788
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.940869697529687,
+      "learning_rate": 3.8108434594885934e-06,
+      "loss": 0.5198,
+      "step": 789
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9282749931884566,
+      "learning_rate": 3.808058097372057e-06,
+      "loss": 0.5499,
+      "step": 790
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0180195532646983,
+      "learning_rate": 3.8052704976644984e-06,
+      "loss": 0.5117,
+      "step": 791
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8303561179366206,
+      "learning_rate": 3.8024806651344424e-06,
+      "loss": 0.5034,
+      "step": 792
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0584295539484754,
+      "learning_rate": 3.7996886045542335e-06,
+      "loss": 0.5391,
+      "step": 793
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7736893833047733,
+      "learning_rate": 3.7968943207000284e-06,
+      "loss": 0.5378,
+      "step": 794
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.7840353008162277,
+      "learning_rate": 3.794097818351786e-06,
+      "loss": 0.5091,
+      "step": 795
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0949100717616225,
+      "learning_rate": 3.791299102293261e-06,
+      "loss": 0.5731,
+      "step": 796
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.048353193294094,
+      "learning_rate": 3.7884981773119943e-06,
+      "loss": 0.5576,
+      "step": 797
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9990070284918733,
+      "learning_rate": 3.7856950481993054e-06,
+      "loss": 0.5297,
+      "step": 798
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.859560152641746,
+      "learning_rate": 3.7828897197502856e-06,
+      "loss": 0.5131,
+      "step": 799
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.0054802770873916,
+      "learning_rate": 3.780082196763785e-06,
+      "loss": 0.5428,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.8985367093585213,
+      "learning_rate": 3.7772724840424126e-06,
+      "loss": 0.5206,
+      "step": 801
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.9964704653764362,
+      "learning_rate": 3.774460586392519e-06,
+      "loss": 0.5929,
+      "step": 802
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7572936836574113,
+      "learning_rate": 3.771646508624194e-06,
+      "loss": 0.5428,
+      "step": 803
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9623695483620975,
+      "learning_rate": 3.768830255551258e-06,
+      "loss": 0.5685,
+      "step": 804
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9663290616402378,
+      "learning_rate": 3.76601183199125e-06,
+      "loss": 0.5351,
+      "step": 805
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7876590847889615,
+      "learning_rate": 3.763191242765424e-06,
+      "loss": 0.567,
+      "step": 806
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.8500820456277005,
+      "learning_rate": 3.7603684926987383e-06,
+      "loss": 0.523,
+      "step": 807
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.041973125533567,
+      "learning_rate": 3.757543586619845e-06,
+      "loss": 0.5531,
+      "step": 808
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7440376746222928,
+      "learning_rate": 3.754716529361089e-06,
+      "loss": 0.4913,
+      "step": 809
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7910937306897654,
+      "learning_rate": 3.7518873257584897e-06,
+      "loss": 0.5128,
+      "step": 810
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9334392608388238,
+      "learning_rate": 3.7490559806517434e-06,
+      "loss": 0.5861,
+      "step": 811
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.0003597857127673,
+      "learning_rate": 3.746222498884206e-06,
+      "loss": 0.5535,
+      "step": 812
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7964615198133413,
+      "learning_rate": 3.74338688530289e-06,
+      "loss": 0.5409,
+      "step": 813
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.7726488990007383,
+      "learning_rate": 3.740549144758453e-06,
+      "loss": 0.5714,
+      "step": 814
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9080323144095523,
+      "learning_rate": 3.737709282105193e-06,
+      "loss": 0.5534,
+      "step": 815
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.9612361354867969,
+      "learning_rate": 3.734867302201038e-06,
+      "loss": 0.5282,
+      "step": 816
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.873254058551618,
+      "learning_rate": 3.7320232099075363e-06,
+      "loss": 0.5422,
+      "step": 817
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383882069199007,
+      "learning_rate": 3.7291770100898508e-06,
+      "loss": 0.5588,
+      "step": 818
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0137053963220835,
+      "learning_rate": 3.726328707616749e-06,
+      "loss": 0.5895,
+      "step": 819
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8207549211692964,
+      "learning_rate": 3.7234783073605957e-06,
+      "loss": 0.5428,
+      "step": 820
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7929761418069659,
+      "learning_rate": 3.7206258141973445e-06,
+      "loss": 0.555,
+      "step": 821
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8863691259545465,
+      "learning_rate": 3.7177712330065285e-06,
+      "loss": 0.5802,
+      "step": 822
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8383911000943605,
+      "learning_rate": 3.714914568671252e-06,
+      "loss": 0.4986,
+      "step": 823
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0032777947804044,
+      "learning_rate": 3.7120558260781846e-06,
+      "loss": 0.6456,
+      "step": 824
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.733320874844507,
+      "learning_rate": 3.709195010117551e-06,
+      "loss": 0.5146,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.7411187007421471,
+      "learning_rate": 3.7063321256831193e-06,
+      "loss": 0.5297,
+      "step": 826
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8334107493901353,
+      "learning_rate": 3.7034671776722003e-06,
+      "loss": 0.545,
+      "step": 827
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.931467221651553,
+      "learning_rate": 3.7006001709856314e-06,
+      "loss": 0.579,
+      "step": 828
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.799522216655623,
+      "learning_rate": 3.697731110527774e-06,
+      "loss": 0.5453,
+      "step": 829
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8098119388805842,
+      "learning_rate": 3.6948600012065016e-06,
+      "loss": 0.5186,
+      "step": 830
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419013342395714,
+      "learning_rate": 3.6919868479331934e-06,
+      "loss": 0.4833,
+      "step": 831
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8419148322752323,
+      "learning_rate": 3.6891116556227234e-06,
+      "loss": 0.5479,
+      "step": 832
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7858200344474908,
+      "learning_rate": 3.6862344291934545e-06,
+      "loss": 0.5264,
+      "step": 833
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8057437623830686,
+      "learning_rate": 3.6833551735672293e-06,
+      "loss": 0.5208,
+      "step": 834
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8570584000334132,
+      "learning_rate": 3.6804738936693617e-06,
+      "loss": 0.5652,
+      "step": 835
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.7961732805960369,
+      "learning_rate": 3.677590594428629e-06,
+      "loss": 0.5693,
+      "step": 836
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.954108513879844,
+      "learning_rate": 3.6747052807772614e-06,
+      "loss": 0.5673,
+      "step": 837
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.834152772161213,
+      "learning_rate": 3.671817957650936e-06,
+      "loss": 0.5118,
+      "step": 838
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8035026424969205,
+      "learning_rate": 3.6689286299887663e-06,
+      "loss": 0.5778,
+      "step": 839
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7862771700309947,
+      "learning_rate": 3.666037302733295e-06,
+      "loss": 0.5575,
+      "step": 840
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7398650592861555,
+      "learning_rate": 3.6631439808304874e-06,
+      "loss": 0.5323,
+      "step": 841
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7082885736006344,
+      "learning_rate": 3.6602486692297183e-06,
+      "loss": 0.543,
+      "step": 842
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8242434568233548,
+      "learning_rate": 3.6573513728837685e-06,
+      "loss": 0.5579,
+      "step": 843
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305967806472925,
+      "learning_rate": 3.6544520967488108e-06,
+      "loss": 0.5425,
+      "step": 844
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7126995402462595,
+      "learning_rate": 3.651550845784407e-06,
+      "loss": 0.5399,
+      "step": 845
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.992190051239983,
+      "learning_rate": 3.648647624953496e-06,
+      "loss": 0.5951,
+      "step": 846
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.9362402903409848,
+      "learning_rate": 3.6457424392223885e-06,
+      "loss": 0.5427,
+      "step": 847
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.7390586845081806,
+      "learning_rate": 3.642835293560754e-06,
+      "loss": 0.5269,
+      "step": 848
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8601747321693383,
+      "learning_rate": 3.639926192941615e-06,
+      "loss": 0.5246,
+      "step": 849
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.8305054240762129,
+      "learning_rate": 3.6370151423413396e-06,
+      "loss": 0.562,
+      "step": 850
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8361711553327809,
+      "learning_rate": 3.6341021467396296e-06,
+      "loss": 0.5066,
+      "step": 851
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9202617492772214,
+      "learning_rate": 3.6311872111195163e-06,
+      "loss": 0.5755,
+      "step": 852
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9056266366653432,
+      "learning_rate": 3.628270340467348e-06,
+      "loss": 0.5193,
+      "step": 853
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9700971504271882,
+      "learning_rate": 3.625351539772783e-06,
+      "loss": 0.5499,
+      "step": 854
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7142305580780086,
+      "learning_rate": 3.6224308140287818e-06,
+      "loss": 0.5597,
+      "step": 855
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7897876492593174,
+      "learning_rate": 3.6195081682315972e-06,
+      "loss": 0.5347,
+      "step": 856
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.191923699092432,
+      "learning_rate": 3.616583607380769e-06,
+      "loss": 0.5251,
+      "step": 857
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.8582876176666503,
+      "learning_rate": 3.61365713647911e-06,
+      "loss": 0.5067,
+      "step": 858
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.991617360171558,
+      "learning_rate": 3.610728760532701e-06,
+      "loss": 0.6464,
+      "step": 859
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.892621069660817,
+      "learning_rate": 3.607798484550881e-06,
+      "loss": 0.5145,
+      "step": 860
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.7592963181570629,
+      "learning_rate": 3.6048663135462423e-06,
+      "loss": 0.5297,
+      "step": 861
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.020192040751123,
+      "learning_rate": 3.6019322525346157e-06,
+      "loss": 0.5709,
+      "step": 862
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8575959680616767,
+      "learning_rate": 3.598996306535067e-06,
+      "loss": 0.5946,
+      "step": 863
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9638758131071599,
+      "learning_rate": 3.5960584805698845e-06,
+      "loss": 0.4833,
+      "step": 864
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7517341191956926,
+      "learning_rate": 3.593118779664574e-06,
+      "loss": 0.5439,
+      "step": 865
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7637144330636925,
+      "learning_rate": 3.590177208847848e-06,
+      "loss": 0.4898,
+      "step": 866
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.107899096934758,
+      "learning_rate": 3.5872337731516186e-06,
+      "loss": 0.5332,
+      "step": 867
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.016493645108941,
+      "learning_rate": 3.5842884776109875e-06,
+      "loss": 0.5313,
+      "step": 868
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8758602544873038,
+      "learning_rate": 3.581341327264236e-06,
+      "loss": 0.554,
+      "step": 869
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8566881639083022,
+      "learning_rate": 3.5783923271528222e-06,
+      "loss": 0.5322,
+      "step": 870
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9151838907738468,
+      "learning_rate": 3.5754414823213647e-06,
+      "loss": 0.5306,
+      "step": 871
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.7893407766785276,
+      "learning_rate": 3.572488797817639e-06,
+      "loss": 0.5226,
+      "step": 872
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.908122661974681,
+      "learning_rate": 3.569534278692569e-06,
+      "loss": 0.5132,
+      "step": 873
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9052513037253582,
+      "learning_rate": 3.5665779300002144e-06,
+      "loss": 0.513,
+      "step": 874
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7876914527016339,
+      "learning_rate": 3.563619756797767e-06,
+      "loss": 0.5627,
+      "step": 875
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.9607045801516068,
+      "learning_rate": 3.5606597641455387e-06,
+      "loss": 0.4986,
+      "step": 876
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.701462749441997,
+      "learning_rate": 3.5576979571069527e-06,
+      "loss": 0.5306,
+      "step": 877
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8413701238351416,
+      "learning_rate": 3.554734340748538e-06,
+      "loss": 0.5602,
+      "step": 878
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8762306249541667,
+      "learning_rate": 3.5517689201399162e-06,
+      "loss": 0.5663,
+      "step": 879
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.833164968453507,
+      "learning_rate": 3.5488017003537977e-06,
+      "loss": 0.5264,
+      "step": 880
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.766302763247428,
+      "learning_rate": 3.5458326864659687e-06,
+      "loss": 0.5498,
+      "step": 881
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.821883208129187,
+      "learning_rate": 3.5428618835552867e-06,
+      "loss": 0.5468,
+      "step": 882
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.7773758034614335,
+      "learning_rate": 3.5398892967036674e-06,
+      "loss": 0.505,
+      "step": 883
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248820711070537,
+      "learning_rate": 3.5369149309960783e-06,
+      "loss": 0.5679,
+      "step": 884
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8248114104788378,
+      "learning_rate": 3.5339387915205305e-06,
+      "loss": 0.5351,
+      "step": 885
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.00472132505421,
+      "learning_rate": 3.53096088336807e-06,
+      "loss": 0.5637,
+      "step": 886
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.0594957277906656,
+      "learning_rate": 3.5279812116327667e-06,
+      "loss": 0.567,
+      "step": 887
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.916227169502353,
+      "learning_rate": 3.5249997814117098e-06,
+      "loss": 0.5733,
+      "step": 888
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7595020268824906,
+      "learning_rate": 3.5220165978049937e-06,
+      "loss": 0.5512,
+      "step": 889
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8259487385184114,
+      "learning_rate": 3.5190316659157126e-06,
+      "loss": 0.5332,
+      "step": 890
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8216813752485344,
+      "learning_rate": 3.5160449908499538e-06,
+      "loss": 0.5718,
+      "step": 891
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8497964997952454,
+      "learning_rate": 3.5130565777167845e-06,
+      "loss": 0.5179,
+      "step": 892
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8242356367817554,
+      "learning_rate": 3.5100664316282464e-06,
+      "loss": 0.5587,
+      "step": 893
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7793507179190546,
+      "learning_rate": 3.5070745576993428e-06,
+      "loss": 0.5924,
+      "step": 894
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.920176905610262,
+      "learning_rate": 3.5040809610480364e-06,
+      "loss": 0.5579,
+      "step": 895
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.954421523744336,
+      "learning_rate": 3.5010856467952335e-06,
+      "loss": 0.5496,
+      "step": 896
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7785169911731862,
+      "learning_rate": 3.4980886200647817e-06,
+      "loss": 0.5383,
+      "step": 897
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.853827977546151,
+      "learning_rate": 3.4950898859834555e-06,
+      "loss": 0.5501,
+      "step": 898
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9882198198152168,
+      "learning_rate": 3.4920894496809515e-06,
+      "loss": 0.5557,
+      "step": 899
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.98090605107646,
+      "learning_rate": 3.489087316289877e-06,
+      "loss": 0.5661,
+      "step": 900
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0027723691714785,
+      "learning_rate": 3.486083490945743e-06,
+      "loss": 0.4791,
+      "step": 901
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0183911897675015,
+      "learning_rate": 3.4830779787869555e-06,
+      "loss": 0.5386,
+      "step": 902
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9385976919386894,
+      "learning_rate": 3.480070784954805e-06,
+      "loss": 0.5351,
+      "step": 903
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.7612550957325825,
+      "learning_rate": 3.4770619145934586e-06,
+      "loss": 0.511,
+      "step": 904
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8677538420589843,
+      "learning_rate": 3.4740513728499515e-06,
+      "loss": 0.5942,
+      "step": 905
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9208446249900946,
+      "learning_rate": 3.4710391648741787e-06,
+      "loss": 0.5146,
+      "step": 906
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8008673055527855,
+      "learning_rate": 3.468025295818885e-06,
+      "loss": 0.5909,
+      "step": 907
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.891052390507894,
+      "learning_rate": 3.465009770839657e-06,
+      "loss": 0.5527,
+      "step": 908
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0521048489395435,
+      "learning_rate": 3.4619925950949126e-06,
+      "loss": 0.5756,
+      "step": 909
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.003295441830653,
+      "learning_rate": 3.4589737737458946e-06,
+      "loss": 0.5299,
+      "step": 910
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7635851435542724,
+      "learning_rate": 3.4559533119566612e-06,
+      "loss": 0.5338,
+      "step": 911
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.834326490517508,
+      "learning_rate": 3.4529312148940763e-06,
+      "loss": 0.56,
+      "step": 912
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8618427761057224,
+      "learning_rate": 3.4499074877278016e-06,
+      "loss": 0.5189,
+      "step": 913
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.04459004844406,
+      "learning_rate": 3.446882135630286e-06,
+      "loss": 0.5765,
+      "step": 914
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7467595732765806,
+      "learning_rate": 3.4438551637767604e-06,
+      "loss": 0.5512,
+      "step": 915
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7952035114217406,
+      "learning_rate": 3.4408265773452226e-06,
+      "loss": 0.5348,
+      "step": 916
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8448198186244822,
+      "learning_rate": 3.4377963815164362e-06,
+      "loss": 0.5187,
+      "step": 917
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7738820116169103,
+      "learning_rate": 3.4347645814739156e-06,
+      "loss": 0.507,
+      "step": 918
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.9699054774415494,
+      "learning_rate": 3.4317311824039216e-06,
+      "loss": 0.5175,
+      "step": 919
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7482905457169124,
+      "learning_rate": 3.4286961894954473e-06,
+      "loss": 0.5188,
+      "step": 920
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8012194296110113,
+      "learning_rate": 3.425659607940215e-06,
+      "loss": 0.5465,
+      "step": 921
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.7978097428012587,
+      "learning_rate": 3.422621442932662e-06,
+      "loss": 0.5257,
+      "step": 922
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8534167116514217,
+      "learning_rate": 3.419581699669937e-06,
+      "loss": 0.536,
+      "step": 923
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.7733377878036733,
+      "learning_rate": 3.416540383351888e-06,
+      "loss": 0.5632,
+      "step": 924
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8124786776539388,
+      "learning_rate": 3.4134974991810503e-06,
+      "loss": 0.5471,
+      "step": 925
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8553271859579439,
+      "learning_rate": 3.4104530523626463e-06,
+      "loss": 0.538,
+      "step": 926
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8888926038913822,
+      "learning_rate": 3.4074070481045683e-06,
+      "loss": 0.4868,
+      "step": 927
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.0158609319355505,
+      "learning_rate": 3.404359491617374e-06,
+      "loss": 0.5757,
+      "step": 928
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8376639720078027,
+      "learning_rate": 3.401310388114276e-06,
+      "loss": 0.5377,
+      "step": 929
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.3651883595335232,
+      "learning_rate": 3.3982597428111336e-06,
+      "loss": 0.5536,
+      "step": 930
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.908409388949023,
+      "learning_rate": 3.3952075609264423e-06,
+      "loss": 0.5349,
+      "step": 931
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.8261622890952995,
+      "learning_rate": 3.3921538476813278e-06,
+      "loss": 0.4991,
+      "step": 932
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.924034720876031,
+      "learning_rate": 3.3890986082995353e-06,
+      "loss": 0.536,
+      "step": 933
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.829615974230478,
+      "learning_rate": 3.3860418480074188e-06,
+      "loss": 0.5163,
+      "step": 934
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7812992854973535,
+      "learning_rate": 3.3829835720339353e-06,
+      "loss": 0.5412,
+      "step": 935
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8270515542068861,
+      "learning_rate": 3.3799237856106348e-06,
+      "loss": 0.5459,
+      "step": 936
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8336967909163833,
+      "learning_rate": 3.3768624939716506e-06,
+      "loss": 0.5074,
+      "step": 937
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.773892866992307,
+      "learning_rate": 3.373799702353691e-06,
+      "loss": 0.5457,
+      "step": 938
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8605607499004266,
+      "learning_rate": 3.370735415996031e-06,
+      "loss": 0.5691,
+      "step": 939
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7961529805945686,
+      "learning_rate": 3.3676696401405007e-06,
+      "loss": 0.5406,
+      "step": 940
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.7406787561376078,
+      "learning_rate": 3.3646023800314792e-06,
+      "loss": 0.5297,
+      "step": 941
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.9794693468141764,
+      "learning_rate": 3.361533640915885e-06,
+      "loss": 0.4765,
+      "step": 942
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.820632707720892,
+      "learning_rate": 3.3584634280431657e-06,
+      "loss": 0.5395,
+      "step": 943
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8478126164835549,
+      "learning_rate": 3.3553917466652915e-06,
+      "loss": 0.5288,
+      "step": 944
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.749509825583459,
+      "learning_rate": 3.352318602036742e-06,
+      "loss": 0.5343,
+      "step": 945
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.8034305951190157,
+      "learning_rate": 3.3492439994145033e-06,
+      "loss": 0.5536,
+      "step": 946
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8172591817519397,
+      "learning_rate": 3.346167944058052e-06,
+      "loss": 0.5844,
+      "step": 947
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.749562414198837,
+      "learning_rate": 3.3430904412293526e-06,
+      "loss": 0.4833,
+      "step": 948
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.7243742428927225,
+      "learning_rate": 3.3400114961928444e-06,
+      "loss": 0.4828,
+      "step": 949
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.757242299744874,
+      "learning_rate": 3.3369311142154337e-06,
+      "loss": 0.5282,
+      "step": 950
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.036302581700697,
+      "learning_rate": 3.3338493005664853e-06,
+      "loss": 0.5315,
+      "step": 951
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.886299636672335,
+      "learning_rate": 3.330766060517812e-06,
+      "loss": 0.5244,
+      "step": 952
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.898853787733011,
+      "learning_rate": 3.3276813993436695e-06,
+      "loss": 0.5914,
+      "step": 953
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8359472984671243,
+      "learning_rate": 3.324595322320741e-06,
+      "loss": 0.5488,
+      "step": 954
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8768955168510497,
+      "learning_rate": 3.321507834728134e-06,
+      "loss": 0.5871,
+      "step": 955
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8358033818112791,
+      "learning_rate": 3.3184189418473674e-06,
+      "loss": 0.5632,
+      "step": 956
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.792562502385941,
+      "learning_rate": 3.315328648962364e-06,
+      "loss": 0.4887,
+      "step": 957
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8732702930932368,
+      "learning_rate": 3.312236961359444e-06,
+      "loss": 0.5313,
+      "step": 958
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7708047128885986,
+      "learning_rate": 3.3091438843273115e-06,
+      "loss": 0.5348,
+      "step": 959
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9094434763935804,
+      "learning_rate": 3.3060494231570463e-06,
+      "loss": 0.5027,
+      "step": 960
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.87927564418864,
+      "learning_rate": 3.3029535831420977e-06,
+      "loss": 0.511,
+      "step": 961
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.717365559903535,
+      "learning_rate": 3.299856369578273e-06,
+      "loss": 0.5203,
+      "step": 962
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.770779257052532,
+      "learning_rate": 3.2967577877637296e-06,
+      "loss": 0.5233,
+      "step": 963
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7541392466004568,
+      "learning_rate": 3.2936578429989653e-06,
+      "loss": 0.5013,
+      "step": 964
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7840578280891832,
+      "learning_rate": 3.290556540586809e-06,
+      "loss": 0.4844,
+      "step": 965
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7184305413001233,
+      "learning_rate": 3.287453885832413e-06,
+      "loss": 0.4694,
+      "step": 966
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8671517036325307,
+      "learning_rate": 3.2843498840432403e-06,
+      "loss": 0.4652,
+      "step": 967
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.9960847871768508,
+      "learning_rate": 3.2812445405290612e-06,
+      "loss": 0.5906,
+      "step": 968
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7535227575839891,
+      "learning_rate": 3.27813786060194e-06,
+      "loss": 0.5482,
+      "step": 969
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.929231862440999,
+      "learning_rate": 3.2750298495762278e-06,
+      "loss": 0.5334,
+      "step": 970
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7879676366114814,
+      "learning_rate": 3.2719205127685505e-06,
+      "loss": 0.515,
+      "step": 971
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7817120865072218,
+      "learning_rate": 3.2688098554978053e-06,
+      "loss": 0.5045,
+      "step": 972
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8725673808714274,
+      "learning_rate": 3.265697883085145e-06,
+      "loss": 0.5557,
+      "step": 973
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8554796275037901,
+      "learning_rate": 3.262584600853973e-06,
+      "loss": 0.5785,
+      "step": 974
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.77078783324655,
+      "learning_rate": 3.259470014129936e-06,
+      "loss": 0.524,
+      "step": 975
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.820843626030818,
+      "learning_rate": 3.256354128240907e-06,
+      "loss": 0.5144,
+      "step": 976
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9330495063889956,
+      "learning_rate": 3.253236948516987e-06,
+      "loss": 0.5405,
+      "step": 977
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9113413794485425,
+      "learning_rate": 3.2501184802904867e-06,
+      "loss": 0.5212,
+      "step": 978
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.799188386703558,
+      "learning_rate": 3.2469987288959208e-06,
+      "loss": 0.5148,
+      "step": 979
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8610914183588203,
+      "learning_rate": 3.2438776996700023e-06,
+      "loss": 0.5363,
+      "step": 980
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8245263524947073,
+      "learning_rate": 3.240755397951625e-06,
+      "loss": 0.5216,
+      "step": 981
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7863270641417597,
+      "learning_rate": 3.2376318290818643e-06,
+      "loss": 0.5581,
+      "step": 982
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9266115141469626,
+      "learning_rate": 3.23450699840396e-06,
+      "loss": 0.5178,
+      "step": 983
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8044458399187253,
+      "learning_rate": 3.2313809112633133e-06,
+      "loss": 0.5252,
+      "step": 984
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8809392949423562,
+      "learning_rate": 3.2282535730074714e-06,
+      "loss": 0.486,
+      "step": 985
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9487997548787144,
+      "learning_rate": 3.2251249889861237e-06,
+      "loss": 0.5272,
+      "step": 986
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.088279538426057,
+      "learning_rate": 3.2219951645510907e-06,
+      "loss": 0.5426,
+      "step": 987
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8280370745964312,
+      "learning_rate": 3.218864105056313e-06,
+      "loss": 0.5545,
+      "step": 988
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7678201455723743,
+      "learning_rate": 3.2157318158578473e-06,
+      "loss": 0.5476,
+      "step": 989
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.708170466024094,
+      "learning_rate": 3.21259830231385e-06,
+      "loss": 0.5442,
+      "step": 990
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.0427224573251483,
+      "learning_rate": 3.209463569784575e-06,
+      "loss": 0.5501,
+      "step": 991
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8557413526282036,
+      "learning_rate": 3.206327623632359e-06,
+      "loss": 0.5573,
+      "step": 992
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7138810851622357,
+      "learning_rate": 3.2031904692216153e-06,
+      "loss": 0.5267,
+      "step": 993
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9034028799031073,
+      "learning_rate": 3.2000521119188267e-06,
+      "loss": 0.5605,
+      "step": 994
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.994571492675121,
+      "learning_rate": 3.1969125570925303e-06,
+      "loss": 0.53,
+      "step": 995
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.771581881704634,
+      "learning_rate": 3.193771810113313e-06,
+      "loss": 0.6177,
+      "step": 996
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7808220445921694,
+      "learning_rate": 3.1906298763538005e-06,
+      "loss": 0.5215,
+      "step": 997
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8069794706642701,
+      "learning_rate": 3.1874867611886513e-06,
+      "loss": 0.5444,
+      "step": 998
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7806867210889854,
+      "learning_rate": 3.1843424699945403e-06,
+      "loss": 0.5471,
+      "step": 999
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7481554024627886,
+      "learning_rate": 3.1811970081501576e-06,
+      "loss": 0.5159,
+      "step": 1000
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8105318680671914,
+      "learning_rate": 3.1780503810361946e-06,
+      "loss": 0.4985,
+      "step": 1001
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7033701950072382,
+      "learning_rate": 3.1749025940353363e-06,
+      "loss": 0.5594,
+      "step": 1002
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.3799847532384515,
+      "learning_rate": 3.1717536525322512e-06,
+      "loss": 0.5978,
+      "step": 1003
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7427559432173463,
+      "learning_rate": 3.1686035619135845e-06,
+      "loss": 0.5299,
+      "step": 1004
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7454547855925509,
+      "learning_rate": 3.1654523275679453e-06,
+      "loss": 0.5439,
+      "step": 1005
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7130931472340127,
+      "learning_rate": 3.162299954885899e-06,
+      "loss": 0.5379,
+      "step": 1006
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6940357366272063,
+      "learning_rate": 3.15914644925996e-06,
+      "loss": 0.5694,
+      "step": 1007
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8544220651543013,
+      "learning_rate": 3.1559918160845787e-06,
+      "loss": 0.5285,
+      "step": 1008
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8481774433371347,
+      "learning_rate": 3.1528360607561358e-06,
+      "loss": 0.5384,
+      "step": 1009
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8256828659009958,
+      "learning_rate": 3.149679188672932e-06,
+      "loss": 0.4806,
+      "step": 1010
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.9380282822721238,
+      "learning_rate": 3.1465212052351766e-06,
+      "loss": 0.543,
+      "step": 1011
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.985943690469791,
+      "learning_rate": 3.1433621158449807e-06,
+      "loss": 0.5549,
+      "step": 1012
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7038398790061953,
+      "learning_rate": 3.140201925906348e-06,
+      "loss": 0.4682,
+      "step": 1013
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8748481620529394,
+      "learning_rate": 3.1370406408251632e-06,
+      "loss": 0.5046,
+      "step": 1014
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7587036990451181,
+      "learning_rate": 3.133878266009186e-06,
+      "loss": 0.5203,
+      "step": 1015
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7503537433041947,
+      "learning_rate": 3.130714806868041e-06,
+      "loss": 0.5546,
+      "step": 1016
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7701505667314001,
+      "learning_rate": 3.127550268813205e-06,
+      "loss": 0.531,
+      "step": 1017
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.771371589393474,
+      "learning_rate": 3.124384657258001e-06,
+      "loss": 0.5424,
+      "step": 1018
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8016015279719124,
+      "learning_rate": 3.1212179776175905e-06,
+      "loss": 0.5706,
+      "step": 1019
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.810944889002695,
+      "learning_rate": 3.1180502353089598e-06,
+      "loss": 0.5502,
+      "step": 1020
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8062084514449492,
+      "learning_rate": 3.1148814357509147e-06,
+      "loss": 0.5337,
+      "step": 1021
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.669643406466654,
+      "learning_rate": 3.111711584364068e-06,
+      "loss": 0.4802,
+      "step": 1022
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6852245083058144,
+      "learning_rate": 3.1085406865708333e-06,
+      "loss": 0.532,
+      "step": 1023
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8463748056800222,
+      "learning_rate": 3.1053687477954124e-06,
+      "loss": 0.5112,
+      "step": 1024
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7302148909577209,
+      "learning_rate": 3.10219577346379e-06,
+      "loss": 0.5549,
+      "step": 1025
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7752983463714818,
+      "learning_rate": 3.0990217690037206e-06,
+      "loss": 0.5606,
+      "step": 1026
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.695119975844164,
+      "learning_rate": 3.09584673984472e-06,
+      "loss": 0.486,
+      "step": 1027
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.793543444803663,
+      "learning_rate": 3.0926706914180605e-06,
+      "loss": 0.6474,
+      "step": 1028
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6954588940750932,
+      "learning_rate": 3.089493629156755e-06,
+      "loss": 0.5208,
+      "step": 1029
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.9045089074493644,
+      "learning_rate": 3.08631555849555e-06,
+      "loss": 0.5291,
+      "step": 1030
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8481217904786489,
+      "learning_rate": 3.083136484870921e-06,
+      "loss": 0.5212,
+      "step": 1031
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6729420221561044,
+      "learning_rate": 3.0799564137210536e-06,
+      "loss": 0.5024,
+      "step": 1032
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8821832248249077,
+      "learning_rate": 3.076775350485845e-06,
+      "loss": 0.5459,
+      "step": 1033
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.762473350167322,
+      "learning_rate": 3.0735933006068863e-06,
+      "loss": 0.4938,
+      "step": 1034
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7950707678098703,
+      "learning_rate": 3.0704102695274573e-06,
+      "loss": 0.4922,
+      "step": 1035
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6853644769275375,
+      "learning_rate": 3.0672262626925174e-06,
+      "loss": 0.47,
+      "step": 1036
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.809909106997157,
+      "learning_rate": 3.0640412855486922e-06,
+      "loss": 0.5545,
+      "step": 1037
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.019472393876661,
+      "learning_rate": 3.06085534354427e-06,
+      "loss": 0.5616,
+      "step": 1038
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7972785887075076,
+      "learning_rate": 3.057668442129188e-06,
+      "loss": 0.5269,
+      "step": 1039
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.865555820217107,
+      "learning_rate": 3.054480586755026e-06,
+      "loss": 0.5752,
+      "step": 1040
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.792147096098412,
+      "learning_rate": 3.051291782874995e-06,
+      "loss": 0.54,
+      "step": 1041
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8108893550848508,
+      "learning_rate": 3.048102035943927e-06,
+      "loss": 0.5367,
+      "step": 1042
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.0966646553454793,
+      "learning_rate": 3.04491135141827e-06,
+      "loss": 0.5455,
+      "step": 1043
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7357403687049695,
+      "learning_rate": 3.041719734756073e-06,
+      "loss": 0.502,
+      "step": 1044
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8033826162723872,
+      "learning_rate": 3.038527191416982e-06,
+      "loss": 0.5644,
+      "step": 1045
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7822928111630525,
+      "learning_rate": 3.0353337268622267e-06,
+      "loss": 0.4938,
+      "step": 1046
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7910319343463081,
+      "learning_rate": 3.0321393465546134e-06,
+      "loss": 0.5889,
+      "step": 1047
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7457160087273953,
+      "learning_rate": 3.028944055958514e-06,
+      "loss": 0.5022,
+      "step": 1048
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.691379648176161,
+      "learning_rate": 3.0257478605398595e-06,
+      "loss": 0.4841,
+      "step": 1049
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7452186987943483,
+      "learning_rate": 3.0225507657661257e-06,
+      "loss": 0.5626,
+      "step": 1050
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7578678635930594,
+      "learning_rate": 3.0193527771063297e-06,
+      "loss": 0.5115,
+      "step": 1051
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7879798898209605,
+      "learning_rate": 3.016153900031016e-06,
+      "loss": 0.5296,
+      "step": 1052
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6745604796677231,
+      "learning_rate": 3.0129541400122492e-06,
+      "loss": 0.5089,
+      "step": 1053
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8484438696306678,
+      "learning_rate": 3.0097535025236045e-06,
+      "loss": 0.6124,
+      "step": 1054
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8023880068850882,
+      "learning_rate": 3.0065519930401595e-06,
+      "loss": 0.4983,
+      "step": 1055
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.743901583565096,
+      "learning_rate": 3.0033496170384803e-06,
+      "loss": 0.4998,
+      "step": 1056
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9494472820876043,
+      "learning_rate": 3.000146379996617e-06,
+      "loss": 0.537,
+      "step": 1057
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6992995489648048,
+      "learning_rate": 2.996942287394093e-06,
+      "loss": 0.5822,
+      "step": 1058
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8498288139189643,
+      "learning_rate": 2.993737344711895e-06,
+      "loss": 0.5651,
+      "step": 1059
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.755920633785882,
+      "learning_rate": 2.990531557432464e-06,
+      "loss": 0.496,
+      "step": 1060
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7876484928074277,
+      "learning_rate": 2.9873249310396853e-06,
+      "loss": 0.5224,
+      "step": 1061
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7573987279473129,
+      "learning_rate": 2.98411747101888e-06,
+      "loss": 0.5228,
+      "step": 1062
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6995721104857204,
+      "learning_rate": 2.980909182856794e-06,
+      "loss": 0.4758,
+      "step": 1063
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.907464743607936,
+      "learning_rate": 2.9777000720415916e-06,
+      "loss": 0.5254,
+      "step": 1064
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7921365259203703,
+      "learning_rate": 2.974490144062844e-06,
+      "loss": 0.5116,
+      "step": 1065
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9010192849593792,
+      "learning_rate": 2.9712794044115196e-06,
+      "loss": 0.5136,
+      "step": 1066
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.742881813035793,
+      "learning_rate": 2.968067858579975e-06,
+      "loss": 0.5436,
+      "step": 1067
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7135933558215708,
+      "learning_rate": 2.964855512061947e-06,
+      "loss": 0.5268,
+      "step": 1068
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8360025545734582,
+      "learning_rate": 2.9616423703525414e-06,
+      "loss": 0.5238,
+      "step": 1069
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7090421713960848,
+      "learning_rate": 2.9584284389482237e-06,
+      "loss": 0.5051,
+      "step": 1070
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7462732547158757,
+      "learning_rate": 2.9552137233468113e-06,
+      "loss": 0.4838,
+      "step": 1071
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9336108910937513,
+      "learning_rate": 2.951998229047464e-06,
+      "loss": 0.5576,
+      "step": 1072
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.784092660568157,
+      "learning_rate": 2.9487819615506702e-06,
+      "loss": 0.5349,
+      "step": 1073
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.772640354616067,
+      "learning_rate": 2.945564926358245e-06,
+      "loss": 0.5423,
+      "step": 1074
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8491968859591044,
+      "learning_rate": 2.9423471289733125e-06,
+      "loss": 0.5453,
+      "step": 1075
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8283172103770493,
+      "learning_rate": 2.9391285749003046e-06,
+      "loss": 0.5318,
+      "step": 1076
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.7802483696828226,
+      "learning_rate": 2.935909269644946e-06,
+      "loss": 0.4954,
+      "step": 1077
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8687809173149,
+      "learning_rate": 2.9326892187142457e-06,
+      "loss": 0.5428,
+      "step": 1078
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9218917868616974,
+      "learning_rate": 2.9294684276164888e-06,
+      "loss": 0.5125,
+      "step": 1079
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8406300824318225,
+      "learning_rate": 2.9262469018612278e-06,
+      "loss": 0.5186,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8153319034513924,
+      "learning_rate": 2.9230246469592695e-06,
+      "loss": 0.4878,
+      "step": 1081
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8381190525343576,
+      "learning_rate": 2.91980166842267e-06,
+      "loss": 0.5455,
+      "step": 1082
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7941629060330144,
+      "learning_rate": 2.9165779717647212e-06,
+      "loss": 0.5425,
+      "step": 1083
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.755950985861856,
+      "learning_rate": 2.9133535624999466e-06,
+      "loss": 0.4992,
+      "step": 1084
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8065716401418646,
+      "learning_rate": 2.9101284461440853e-06,
+      "loss": 0.5569,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8487073865649808,
+      "learning_rate": 2.9069026282140887e-06,
+      "loss": 0.5352,
+      "step": 1086
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.877024524581134,
+      "learning_rate": 2.903676114228107e-06,
+      "loss": 0.5584,
+      "step": 1087
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.812931375367902,
+      "learning_rate": 2.9004489097054807e-06,
+      "loss": 0.5154,
+      "step": 1088
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7729938020658174,
+      "learning_rate": 2.897221020166732e-06,
+      "loss": 0.5386,
+      "step": 1089
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.6991898958250629,
+      "learning_rate": 2.8939924511335555e-06,
+      "loss": 0.5467,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7298323860671052,
+      "learning_rate": 2.890763208128807e-06,
+      "loss": 0.5506,
+      "step": 1091
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.9718362378496106,
+      "learning_rate": 2.887533296676497e-06,
+      "loss": 0.5453,
+      "step": 1092
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7003897379752575,
+      "learning_rate": 2.8843027223017767e-06,
+      "loss": 0.5016,
+      "step": 1093
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7604846690613096,
+      "learning_rate": 2.8810714905309346e-06,
+      "loss": 0.5206,
+      "step": 1094
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.868522047775135,
+      "learning_rate": 2.8778396068913807e-06,
+      "loss": 0.5152,
+      "step": 1095
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.8080911269766844,
+      "learning_rate": 2.874607076911642e-06,
+      "loss": 0.4966,
+      "step": 1096
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.7767037245003534,
+      "learning_rate": 2.871373906121351e-06,
+      "loss": 0.5081,
+      "step": 1097
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.733045586658075,
+      "learning_rate": 2.8681401000512356e-06,
+      "loss": 0.5031,
+      "step": 1098
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6767478479637847,
+      "learning_rate": 2.8649056642331103e-06,
+      "loss": 0.4856,
+      "step": 1099
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.6820690185704608,
+      "learning_rate": 2.8616706041998686e-06,
+      "loss": 0.5151,
+      "step": 1100
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.840181264549285,
+      "learning_rate": 2.8584349254854693e-06,
+      "loss": 0.5393,
+      "step": 1101
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.827807570004724,
+      "learning_rate": 2.8551986336249322e-06,
+      "loss": 0.5572,
+      "step": 1102
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.711815265099016,
+      "learning_rate": 2.8519617341543233e-06,
+      "loss": 0.5184,
+      "step": 1103
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7460018389221874,
+      "learning_rate": 2.8487242326107495e-06,
+      "loss": 0.5374,
+      "step": 1104
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.985067366728648,
+      "learning_rate": 2.8454861345323475e-06,
+      "loss": 0.538,
+      "step": 1105
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8044567576569952,
+      "learning_rate": 2.8422474454582754e-06,
+      "loss": 0.4947,
+      "step": 1106
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7648712890692506,
+      "learning_rate": 2.8390081709286997e-06,
+      "loss": 0.5584,
+      "step": 1107
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7544905722043518,
+      "learning_rate": 2.8357683164847903e-06,
+      "loss": 0.5696,
+      "step": 1108
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7923136846837993,
+      "learning_rate": 2.8325278876687084e-06,
+      "loss": 0.5502,
+      "step": 1109
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 2.077195937792951,
+      "learning_rate": 2.8292868900235986e-06,
+      "loss": 0.543,
+      "step": 1110
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675854046933754,
+      "learning_rate": 2.826045329093578e-06,
+      "loss": 0.5422,
+      "step": 1111
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8457239401392898,
+      "learning_rate": 2.822803210423727e-06,
+      "loss": 0.5334,
+      "step": 1112
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7426929121470698,
+      "learning_rate": 2.8195605395600804e-06,
+      "loss": 0.4972,
+      "step": 1113
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7675216264197045,
+      "learning_rate": 2.8163173220496175e-06,
+      "loss": 0.5442,
+      "step": 1114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.7483102565661375,
+      "learning_rate": 2.8130735634402527e-06,
+      "loss": 0.5425,
+      "step": 1115
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.692036399159914,
+      "learning_rate": 2.8098292692808253e-06,
+      "loss": 0.521,
+      "step": 1116
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.799980213437577,
+      "learning_rate": 2.8065844451210933e-06,
+      "loss": 0.5597,
+      "step": 1117
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7666190830884467,
+      "learning_rate": 2.803339096511718e-06,
+      "loss": 0.5612,
+      "step": 1118
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.792129515845057,
+      "learning_rate": 2.8000932290042597e-06,
+      "loss": 0.5334,
+      "step": 1119
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7395715578516604,
+      "learning_rate": 2.7968468481511663e-06,
+      "loss": 0.5545,
+      "step": 1120
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6843830287676704,
+      "learning_rate": 2.7935999595057623e-06,
+      "loss": 0.5659,
+      "step": 1121
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.6432688824199502,
+      "learning_rate": 2.790352568622244e-06,
+      "loss": 0.4926,
+      "step": 1122
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7430642435954644,
+      "learning_rate": 2.787104681055663e-06,
+      "loss": 0.4666,
+      "step": 1123
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.8067789882264202,
+      "learning_rate": 2.783856302361923e-06,
+      "loss": 0.5233,
+      "step": 1124
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7685143281757654,
+      "learning_rate": 2.780607438097769e-06,
+      "loss": 0.5506,
+      "step": 1125
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.7163110868931304,
+      "learning_rate": 2.7773580938207717e-06,
+      "loss": 0.5044,
+      "step": 1126
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.809036270322799,
+      "learning_rate": 2.7741082750893284e-06,
+      "loss": 0.5206,
+      "step": 1127
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8193898978325846,
+      "learning_rate": 2.770857987462645e-06,
+      "loss": 0.6064,
+      "step": 1128
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.765826426309075,
+      "learning_rate": 2.76760723650073e-06,
+      "loss": 0.4914,
+      "step": 1129
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.046345230237298,
+      "learning_rate": 2.764356027764385e-06,
+      "loss": 0.5938,
+      "step": 1130
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8264697696225647,
+      "learning_rate": 2.7611043668151948e-06,
+      "loss": 0.5476,
+      "step": 1131
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7776043318415495,
+      "learning_rate": 2.7578522592155166e-06,
+      "loss": 0.5318,
+      "step": 1132
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.767284538432005,
+      "learning_rate": 2.7545997105284735e-06,
+      "loss": 0.5197,
+      "step": 1133
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.831190014066027,
+      "learning_rate": 2.75134672631794e-06,
+      "loss": 0.4939,
+      "step": 1134
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7727769641989948,
+      "learning_rate": 2.7480933121485394e-06,
+      "loss": 0.5542,
+      "step": 1135
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7599576706599651,
+      "learning_rate": 2.7448394735856275e-06,
+      "loss": 0.5102,
+      "step": 1136
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7526987759875383,
+      "learning_rate": 2.7415852161952893e-06,
+      "loss": 0.5357,
+      "step": 1137
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7478180377944075,
+      "learning_rate": 2.7383305455443223e-06,
+      "loss": 0.552,
+      "step": 1138
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8026983878339322,
+      "learning_rate": 2.7350754672002334e-06,
+      "loss": 0.5324,
+      "step": 1139
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7539604119960455,
+      "learning_rate": 2.7318199867312267e-06,
+      "loss": 0.4951,
+      "step": 1140
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7060714376533908,
+      "learning_rate": 2.728564109706193e-06,
+      "loss": 0.5044,
+      "step": 1141
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.896732668736906,
+      "learning_rate": 2.725307841694704e-06,
+      "loss": 0.5272,
+      "step": 1142
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9094037542829962,
+      "learning_rate": 2.722051188266998e-06,
+      "loss": 0.5036,
+      "step": 1143
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7529900591353695,
+      "learning_rate": 2.7187941549939723e-06,
+      "loss": 0.4962,
+      "step": 1144
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7652784724721573,
+      "learning_rate": 2.7155367474471763e-06,
+      "loss": 0.5159,
+      "step": 1145
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.9070275680276054,
+      "learning_rate": 2.7122789711987964e-06,
+      "loss": 0.5269,
+      "step": 1146
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7630505518040367,
+      "learning_rate": 2.709020831821652e-06,
+      "loss": 0.5286,
+      "step": 1147
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.7410138974922291,
+      "learning_rate": 2.7057623348891846e-06,
+      "loss": 0.4902,
+      "step": 1148
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.745842560539345,
+      "learning_rate": 2.7025034859754446e-06,
+      "loss": 0.5178,
+      "step": 1149
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.8498982578771728,
+      "learning_rate": 2.699244290655086e-06,
+      "loss": 0.55,
+      "step": 1150
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.6360369924184164,
+      "learning_rate": 2.6959847545033558e-06,
+      "loss": 0.4988,
+      "step": 1151
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6784833460211517,
+      "learning_rate": 2.692724883096082e-06,
+      "loss": 0.5303,
+      "step": 1152
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7888637226825195,
+      "learning_rate": 2.68946468200967e-06,
+      "loss": 0.542,
+      "step": 1153
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7156031503954616,
+      "learning_rate": 2.686204156821084e-06,
+      "loss": 0.499,
+      "step": 1154
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.802618839032982,
+      "learning_rate": 2.6829433131078464e-06,
+      "loss": 0.5095,
+      "step": 1155
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7018673816457677,
+      "learning_rate": 2.6796821564480237e-06,
+      "loss": 0.4911,
+      "step": 1156
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.939833859373507,
+      "learning_rate": 2.6764206924202173e-06,
+      "loss": 0.5965,
+      "step": 1157
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.757462214596805,
+      "learning_rate": 2.673158926603554e-06,
+      "loss": 0.5119,
+      "step": 1158
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.824906787992325,
+      "learning_rate": 2.669896864577678e-06,
+      "loss": 0.4995,
+      "step": 1159
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.6963319988581682,
+      "learning_rate": 2.666634511922739e-06,
+      "loss": 0.499,
+      "step": 1160
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7490967555131538,
+      "learning_rate": 2.6633718742193837e-06,
+      "loss": 0.5045,
+      "step": 1161
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7295387040616608,
+      "learning_rate": 2.660108957048749e-06,
+      "loss": 0.48,
+      "step": 1162
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7062936128447537,
+      "learning_rate": 2.656845765992447e-06,
+      "loss": 0.5024,
+      "step": 1163
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.7291223687738257,
+      "learning_rate": 2.6535823066325594e-06,
+      "loss": 0.4965,
+      "step": 1164
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7660018876230184,
+      "learning_rate": 2.650318584551626e-06,
+      "loss": 0.6289,
+      "step": 1165
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6875948695046943,
+      "learning_rate": 2.6470546053326375e-06,
+      "loss": 0.5099,
+      "step": 1166
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7055862895950586,
+      "learning_rate": 2.643790374559023e-06,
+      "loss": 0.4748,
+      "step": 1167
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.8397810404769834,
+      "learning_rate": 2.6405258978146443e-06,
+      "loss": 0.5547,
+      "step": 1168
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6780759297615608,
+      "learning_rate": 2.6372611806837804e-06,
+      "loss": 0.4696,
+      "step": 1169
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7463193906158438,
+      "learning_rate": 2.633996228751125e-06,
+      "loss": 0.5167,
+      "step": 1170
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7682737157303552,
+      "learning_rate": 2.6307310476017705e-06,
+      "loss": 0.5178,
+      "step": 1171
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7759532350573655,
+      "learning_rate": 2.627465642821203e-06,
+      "loss": 0.5411,
+      "step": 1172
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.741742707150691,
+      "learning_rate": 2.624200019995293e-06,
+      "loss": 0.5357,
+      "step": 1173
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7638181255611864,
+      "learning_rate": 2.6209341847102787e-06,
+      "loss": 0.5598,
+      "step": 1174
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6585763596592404,
+      "learning_rate": 2.6176681425527663e-06,
+      "loss": 0.4891,
+      "step": 1175
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.7652514703885578,
+      "learning_rate": 2.614401899109716e-06,
+      "loss": 0.5412,
+      "step": 1176
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7646286601286296,
+      "learning_rate": 2.6111354599684287e-06,
+      "loss": 0.4753,
+      "step": 1177
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7933546923906454,
+      "learning_rate": 2.6078688307165436e-06,
+      "loss": 0.5159,
+      "step": 1178
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.8474498352431208,
+      "learning_rate": 2.6046020169420223e-06,
+      "loss": 0.4786,
+      "step": 1179
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.816609500392057,
+      "learning_rate": 2.601335024233145e-06,
+      "loss": 0.5821,
+      "step": 1180
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7603922858788037,
+      "learning_rate": 2.598067858178495e-06,
+      "loss": 0.4749,
+      "step": 1181
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.771168764538133,
+      "learning_rate": 2.594800524366956e-06,
+      "loss": 0.5221,
+      "step": 1182
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7428386931770696,
+      "learning_rate": 2.591533028387694e-06,
+      "loss": 0.5243,
+      "step": 1183
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7354647623517858,
+      "learning_rate": 2.588265375830155e-06,
+      "loss": 0.4665,
+      "step": 1184
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7757829783254058,
+      "learning_rate": 2.5849975722840537e-06,
+      "loss": 0.4713,
+      "step": 1185
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7660698291034924,
+      "learning_rate": 2.58172962333936e-06,
+      "loss": 0.5198,
+      "step": 1186
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.7071465020770178,
+      "learning_rate": 2.5784615345862963e-06,
+      "loss": 0.5355,
+      "step": 1187
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.6994920599655763,
+      "learning_rate": 2.5751933116153215e-06,
+      "loss": 0.4867,
+      "step": 1188
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7891977115774562,
+      "learning_rate": 2.5719249600171247e-06,
+      "loss": 0.5071,
+      "step": 1189
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6866451169084888,
+      "learning_rate": 2.568656485382616e-06,
+      "loss": 0.4767,
+      "step": 1190
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.9106444693405875,
+      "learning_rate": 2.5653878933029134e-06,
+      "loss": 0.5063,
+      "step": 1191
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7546015951107552,
+      "learning_rate": 2.56211918936934e-06,
+      "loss": 0.5536,
+      "step": 1192
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7866083346923656,
+      "learning_rate": 2.5588503791734053e-06,
+      "loss": 0.4738,
+      "step": 1193
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6678313975517949,
+      "learning_rate": 2.5555814683068058e-06,
+      "loss": 0.5095,
+      "step": 1194
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.694690087625629,
+      "learning_rate": 2.552312462361405e-06,
+      "loss": 0.5711,
+      "step": 1195
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7583066556547233,
+      "learning_rate": 2.5490433669292337e-06,
+      "loss": 0.5183,
+      "step": 1196
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8259327544569408,
+      "learning_rate": 2.5457741876024716e-06,
+      "loss": 0.5129,
+      "step": 1197
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.743709458286742,
+      "learning_rate": 2.542504929973445e-06,
+      "loss": 0.509,
+      "step": 1198
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8551037168096902,
+      "learning_rate": 2.5392355996346134e-06,
+      "loss": 0.4874,
+      "step": 1199
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7705896553689628,
+      "learning_rate": 2.5359662021785596e-06,
+      "loss": 0.5102,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8456154073029885,
+      "learning_rate": 2.532696743197982e-06,
+      "loss": 0.5363,
+      "step": 1201
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7341454202963031,
+      "learning_rate": 2.529427228285686e-06,
+      "loss": 0.5013,
+      "step": 1202
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7923147732329405,
+      "learning_rate": 2.526157663034568e-06,
+      "loss": 0.5191,
+      "step": 1203
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.731262319220837,
+      "learning_rate": 2.522888053037616e-06,
+      "loss": 0.4889,
+      "step": 1204
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.797800368847369,
+      "learning_rate": 2.5196184038878895e-06,
+      "loss": 0.4868,
+      "step": 1205
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8182272292135089,
+      "learning_rate": 2.5163487211785194e-06,
+      "loss": 0.5159,
+      "step": 1206
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.9699143840893472,
+      "learning_rate": 2.5130790105026908e-06,
+      "loss": 0.543,
+      "step": 1207
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.805587879000798,
+      "learning_rate": 2.5098092774536397e-06,
+      "loss": 0.5162,
+      "step": 1208
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.966538834153111,
+      "learning_rate": 2.506539527624637e-06,
+      "loss": 0.4973,
+      "step": 1209
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7007116827865891,
+      "learning_rate": 2.5032697666089833e-06,
+      "loss": 0.5337,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8200190388383481,
+      "learning_rate": 2.5e-06,
+      "loss": 0.492,
+      "step": 1211
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7811733389101785,
+      "learning_rate": 2.496730233391017e-06,
+      "loss": 0.533,
+      "step": 1212
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7692852455085013,
+      "learning_rate": 2.4934604723753636e-06,
+      "loss": 0.5151,
+      "step": 1213
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0118407638136726,
+      "learning_rate": 2.4901907225463607e-06,
+      "loss": 0.566,
+      "step": 1214
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9919699597672162,
+      "learning_rate": 2.486920989497309e-06,
+      "loss": 0.5296,
+      "step": 1215
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7399123797451834,
+      "learning_rate": 2.483651278821481e-06,
+      "loss": 0.5535,
+      "step": 1216
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0162050634113617,
+      "learning_rate": 2.4803815961121117e-06,
+      "loss": 0.5105,
+      "step": 1217
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9472302767468135,
+      "learning_rate": 2.4771119469623856e-06,
+      "loss": 0.4829,
+      "step": 1218
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9358326178363474,
+      "learning_rate": 2.4738423369654327e-06,
+      "loss": 0.5895,
+      "step": 1219
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8202396491898063,
+      "learning_rate": 2.470572771714315e-06,
+      "loss": 0.5159,
+      "step": 1220
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0705540084815652,
+      "learning_rate": 2.4673032568020183e-06,
+      "loss": 0.5375,
+      "step": 1221
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9290016818033147,
+      "learning_rate": 2.464033797821441e-06,
+      "loss": 0.5328,
+      "step": 1222
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.858876842427081,
+      "learning_rate": 2.460764400365387e-06,
+      "loss": 0.5246,
+      "step": 1223
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.7372257522644121,
+      "learning_rate": 2.457495070026555e-06,
+      "loss": 0.5557,
+      "step": 1224
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.042578607858068,
+      "learning_rate": 2.454225812397529e-06,
+      "loss": 0.5493,
+      "step": 1225
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.80578953353184,
+      "learning_rate": 2.450956633070767e-06,
+      "loss": 0.4722,
+      "step": 1226
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.6245117501883604,
+      "learning_rate": 2.4476875376385954e-06,
+      "loss": 0.4861,
+      "step": 1227
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.3717275673814986,
+      "learning_rate": 2.4444185316931955e-06,
+      "loss": 0.4955,
+      "step": 1228
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.789230426976571,
+      "learning_rate": 2.441149620826595e-06,
+      "loss": 0.401,
+      "step": 1229
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.3165196574538163,
+      "learning_rate": 2.437880810630661e-06,
+      "loss": 0.391,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 3.7748119497874244,
+      "learning_rate": 2.434612106697087e-06,
+      "loss": 0.3971,
+      "step": 1231
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.516708769328096,
+      "learning_rate": 2.4313435146173845e-06,
+      "loss": 0.3677,
+      "step": 1232
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.0383812730416593,
+      "learning_rate": 2.4280750399828757e-06,
+      "loss": 0.3834,
+      "step": 1233
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.388274870254754,
+      "learning_rate": 2.424806688384679e-06,
+      "loss": 0.38,
+      "step": 1234
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.428758767469847,
+      "learning_rate": 2.4215384654137037e-06,
+      "loss": 0.3557,
+      "step": 1235
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.9871015940327752,
+      "learning_rate": 2.41827037666064e-06,
+      "loss": 0.3742,
+      "step": 1236
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0490853630896595,
+      "learning_rate": 2.415002427715948e-06,
+      "loss": 0.4077,
+      "step": 1237
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.36022057857035,
+      "learning_rate": 2.4117346241698457e-06,
+      "loss": 0.4079,
+      "step": 1238
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.4014397498962974,
+      "learning_rate": 2.408466971612307e-06,
+      "loss": 0.3783,
+      "step": 1239
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.1970209263326246,
+      "learning_rate": 2.405199475633045e-06,
+      "loss": 0.4019,
+      "step": 1240
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8747804397851657,
+      "learning_rate": 2.4019321418215053e-06,
+      "loss": 0.3657,
+      "step": 1241
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0377029592503666,
+      "learning_rate": 2.398664975766856e-06,
+      "loss": 0.3575,
+      "step": 1242
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.2162687478729133,
+      "learning_rate": 2.3953979830579785e-06,
+      "loss": 0.3891,
+      "step": 1243
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.0736112974636605,
+      "learning_rate": 2.3921311692834577e-06,
+      "loss": 0.3872,
+      "step": 1244
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.8065329023464558,
+      "learning_rate": 2.3888645400315717e-06,
+      "loss": 0.3684,
+      "step": 1245
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.144863722944226,
+      "learning_rate": 2.385598100890285e-06,
+      "loss": 0.3781,
+      "step": 1246
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.245173550848138,
+      "learning_rate": 2.382331857447234e-06,
+      "loss": 0.3906,
+      "step": 1247
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.0580037557233806,
+      "learning_rate": 2.379065815289723e-06,
+      "loss": 0.3461,
+      "step": 1248
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.754328637936701,
+      "learning_rate": 2.3757999800047088e-06,
+      "loss": 0.3626,
+      "step": 1249
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.8749369460952616,
+      "learning_rate": 2.3725343571787974e-06,
+      "loss": 0.3723,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9635590762348785,
+      "learning_rate": 2.36926895239823e-06,
+      "loss": 0.3506,
+      "step": 1251
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9091295881177242,
+      "learning_rate": 2.3660037712488758e-06,
+      "loss": 0.3705,
+      "step": 1252
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.0807822077632445,
+      "learning_rate": 2.36273881931622e-06,
+      "loss": 0.4083,
+      "step": 1253
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9247801946548893,
+      "learning_rate": 2.3594741021853565e-06,
+      "loss": 0.3896,
+      "step": 1254
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.003234826375957,
+      "learning_rate": 2.356209625440977e-06,
+      "loss": 0.3928,
+      "step": 1255
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.9601094488156638,
+      "learning_rate": 2.352945394667363e-06,
+      "loss": 0.346,
+      "step": 1256
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.835912356231795,
+      "learning_rate": 2.3496814154483754e-06,
+      "loss": 0.3268,
+      "step": 1257
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.851616138864044,
+      "learning_rate": 2.346417693367442e-06,
+      "loss": 0.395,
+      "step": 1258
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 2.017511453982363,
+      "learning_rate": 2.3431542340075535e-06,
+      "loss": 0.3989,
+      "step": 1259
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.9337327085061278,
+      "learning_rate": 2.3398910429512516e-06,
+      "loss": 0.4168,
+      "step": 1260
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8957440589808827,
+      "learning_rate": 2.3366281257806167e-06,
+      "loss": 0.3626,
+      "step": 1261
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.819897111464585,
+      "learning_rate": 2.3333654880772622e-06,
+      "loss": 0.3737,
+      "step": 1262
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.9283607336926767,
+      "learning_rate": 2.3301031354223226e-06,
+      "loss": 0.3595,
+      "step": 1263
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8049670593502345,
+      "learning_rate": 2.3268410733964463e-06,
+      "loss": 0.3645,
+      "step": 1264
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.866103990559354,
+      "learning_rate": 2.3235793075797835e-06,
+      "loss": 0.391,
+      "step": 1265
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.774992664072412,
+      "learning_rate": 2.3203178435519767e-06,
+      "loss": 0.3863,
+      "step": 1266
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.8431093658964484,
+      "learning_rate": 2.3170566868921553e-06,
+      "loss": 0.4175,
+      "step": 1267
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.7731154009482526,
+      "learning_rate": 2.3137958431789175e-06,
+      "loss": 0.3651,
+      "step": 1268
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.980392583405916,
+      "learning_rate": 2.3105353179903313e-06,
+      "loss": 0.3919,
+      "step": 1269
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8435910751312221,
+      "learning_rate": 2.3072751169039183e-06,
+      "loss": 0.3466,
+      "step": 1270
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.88150621693115,
+      "learning_rate": 2.304015245496645e-06,
+      "loss": 0.3991,
+      "step": 1271
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.9365960105712363,
+      "learning_rate": 2.300755709344915e-06,
+      "loss": 0.3675,
+      "step": 1272
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8120924423380202,
+      "learning_rate": 2.297496514024556e-06,
+      "loss": 0.389,
+      "step": 1273
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.822066570446833,
+      "learning_rate": 2.2942376651108158e-06,
+      "loss": 0.3355,
+      "step": 1274
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.968043494993567,
+      "learning_rate": 2.290979168178348e-06,
+      "loss": 0.3909,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8571689944285859,
+      "learning_rate": 2.287721028801204e-06,
+      "loss": 0.376,
+      "step": 1276
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.003415605331929,
+      "learning_rate": 2.2844632525528245e-06,
+      "loss": 0.3439,
+      "step": 1277
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.248040597881556,
+      "learning_rate": 2.2812058450060285e-06,
+      "loss": 0.3789,
+      "step": 1278
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8018969815730068,
+      "learning_rate": 2.2779488117330032e-06,
+      "loss": 0.3756,
+      "step": 1279
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.90374397055853,
+      "learning_rate": 2.2746921583052967e-06,
+      "loss": 0.4126,
+      "step": 1280
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.8558365521624263,
+      "learning_rate": 2.2714358902938073e-06,
+      "loss": 0.3959,
+      "step": 1281
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8375175796231433,
+      "learning_rate": 2.268180013268774e-06,
+      "loss": 0.4048,
+      "step": 1282
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.984205865069469,
+      "learning_rate": 2.2649245327997674e-06,
+      "loss": 0.4039,
+      "step": 1283
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8933532928718015,
+      "learning_rate": 2.261669454455679e-06,
+      "loss": 0.3781,
+      "step": 1284
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9740915743952114,
+      "learning_rate": 2.2584147838047116e-06,
+      "loss": 0.4003,
+      "step": 1285
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8808844925592019,
+      "learning_rate": 2.2551605264143725e-06,
+      "loss": 0.3449,
+      "step": 1286
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9307797122579196,
+      "learning_rate": 2.251906687851461e-06,
+      "loss": 0.4182,
+      "step": 1287
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8492505145939904,
+      "learning_rate": 2.2486532736820614e-06,
+      "loss": 0.3736,
+      "step": 1288
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8826597143825838,
+      "learning_rate": 2.245400289471528e-06,
+      "loss": 0.3987,
+      "step": 1289
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.8696499317715565,
+      "learning_rate": 2.242147740784484e-06,
+      "loss": 0.3725,
+      "step": 1290
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 2.0572316139676463,
+      "learning_rate": 2.2388956331848057e-06,
+      "loss": 0.3777,
+      "step": 1291
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.9916048666817696,
+      "learning_rate": 2.2356439722356154e-06,
+      "loss": 0.3435,
+      "step": 1292
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.7903849297787813,
+      "learning_rate": 2.2323927634992706e-06,
+      "loss": 0.3691,
+      "step": 1293
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8840722711485807,
+      "learning_rate": 2.2291420125373555e-06,
+      "loss": 0.3619,
+      "step": 1294
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.853222255447046,
+      "learning_rate": 2.225891724910672e-06,
+      "loss": 0.3406,
+      "step": 1295
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8075515802139996,
+      "learning_rate": 2.2226419061792282e-06,
+      "loss": 0.3775,
+      "step": 1296
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8220733253527324,
+      "learning_rate": 2.2193925619022323e-06,
+      "loss": 0.3652,
+      "step": 1297
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.9758397782161456,
+      "learning_rate": 2.2161436976380774e-06,
+      "loss": 0.3825,
+      "step": 1298
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 2.0469053125573202,
+      "learning_rate": 2.212895318944338e-06,
+      "loss": 0.4162,
+      "step": 1299
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8037669439194224,
+      "learning_rate": 2.2096474313777574e-06,
+      "loss": 0.3584,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8852980241376032,
+      "learning_rate": 2.206400040494238e-06,
+      "loss": 0.3786,
+      "step": 1301
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.8014277477129081,
+      "learning_rate": 2.2031531518488345e-06,
+      "loss": 0.4126,
+      "step": 1302
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.844230526856602,
+      "learning_rate": 2.1999067709957407e-06,
+      "loss": 0.4005,
+      "step": 1303
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.9775624321749639,
+      "learning_rate": 2.1966609034882825e-06,
+      "loss": 0.4279,
+      "step": 1304
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.7752280618538778,
+      "learning_rate": 2.193415554878907e-06,
+      "loss": 0.3512,
+      "step": 1305
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8490455260047038,
+      "learning_rate": 2.1901707307191743e-06,
+      "loss": 0.3828,
+      "step": 1306
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 5.328150832014928,
+      "learning_rate": 2.1869264365597477e-06,
+      "loss": 0.3909,
+      "step": 1307
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8437062886123319,
+      "learning_rate": 2.1836826779503838e-06,
+      "loss": 0.37,
+      "step": 1308
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 2.008796830412121,
+      "learning_rate": 2.1804394604399204e-06,
+      "loss": 0.4077,
+      "step": 1309
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.800679268264127,
+      "learning_rate": 2.1771967895762736e-06,
+      "loss": 0.3679,
+      "step": 1310
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8462133413299637,
+      "learning_rate": 2.173954670906423e-06,
+      "loss": 0.3602,
+      "step": 1311
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.809976917930169,
+      "learning_rate": 2.1707131099764022e-06,
+      "loss": 0.3899,
+      "step": 1312
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8544861012991105,
+      "learning_rate": 2.1674721123312924e-06,
+      "loss": 0.3747,
+      "step": 1313
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8852269898368,
+      "learning_rate": 2.1642316835152106e-06,
+      "loss": 0.4467,
+      "step": 1314
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.9122728391881445,
+      "learning_rate": 2.1609918290713007e-06,
+      "loss": 0.3402,
+      "step": 1315
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.9590310432156601,
+      "learning_rate": 2.1577525545417254e-06,
+      "loss": 0.3732,
+      "step": 1316
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8276147883157745,
+      "learning_rate": 2.1545138654676525e-06,
+      "loss": 0.3953,
+      "step": 1317
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8133703409989375,
+      "learning_rate": 2.151275767389252e-06,
+      "loss": 0.3539,
+      "step": 1318
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8006183709975836,
+      "learning_rate": 2.148038265845678e-06,
+      "loss": 0.4006,
+      "step": 1319
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8947220090164194,
+      "learning_rate": 2.144801366375069e-06,
+      "loss": 0.4406,
+      "step": 1320
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8280103512099313,
+      "learning_rate": 2.141565074514531e-06,
+      "loss": 0.3815,
+      "step": 1321
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.8706012819390525,
+      "learning_rate": 2.138329395800132e-06,
+      "loss": 0.3445,
+      "step": 1322
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9063701163877025,
+      "learning_rate": 2.1350943357668905e-06,
+      "loss": 0.3983,
+      "step": 1323
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.033333592395131,
+      "learning_rate": 2.131859899948765e-06,
+      "loss": 0.3686,
+      "step": 1324
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.0894724502176425,
+      "learning_rate": 2.1286260938786497e-06,
+      "loss": 0.3811,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9145691870270913,
+      "learning_rate": 2.125392923088358e-06,
+      "loss": 0.3783,
+      "step": 1326
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.941699323344672,
+      "learning_rate": 2.1221603931086193e-06,
+      "loss": 0.3842,
+      "step": 1327
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.0079800551627565,
+      "learning_rate": 2.118928509469066e-06,
+      "loss": 0.3885,
+      "step": 1328
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.851351482771633,
+      "learning_rate": 2.1156972776982238e-06,
+      "loss": 0.3281,
+      "step": 1329
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.9104937018736412,
+      "learning_rate": 2.112466703323504e-06,
+      "loss": 0.4231,
+      "step": 1330
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.92374307717419,
+      "learning_rate": 2.1092367918711935e-06,
+      "loss": 0.3702,
+      "step": 1331
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8725737952655952,
+      "learning_rate": 2.1060075488664453e-06,
+      "loss": 0.3591,
+      "step": 1332
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.850042908610832,
+      "learning_rate": 2.1027789798332688e-06,
+      "loss": 0.3368,
+      "step": 1333
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9324592525287807,
+      "learning_rate": 2.0995510902945197e-06,
+      "loss": 0.3676,
+      "step": 1334
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9116116557564555,
+      "learning_rate": 2.0963238857718934e-06,
+      "loss": 0.3817,
+      "step": 1335
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9148726445140338,
+      "learning_rate": 2.0930973717859117e-06,
+      "loss": 0.3704,
+      "step": 1336
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8376871831619126,
+      "learning_rate": 2.089871553855915e-06,
+      "loss": 0.3521,
+      "step": 1337
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 2.069303925978208,
+      "learning_rate": 2.086646437500054e-06,
+      "loss": 0.3848,
+      "step": 1338
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.876178784774616,
+      "learning_rate": 2.08342202823528e-06,
+      "loss": 0.3697,
+      "step": 1339
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8981757166548485,
+      "learning_rate": 2.0801983315773317e-06,
+      "loss": 0.3864,
+      "step": 1340
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8313223303972075,
+      "learning_rate": 2.0769753530407317e-06,
+      "loss": 0.3768,
+      "step": 1341
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.9073767874852925,
+      "learning_rate": 2.073753098138773e-06,
+      "loss": 0.3991,
+      "step": 1342
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.837313805268737,
+      "learning_rate": 2.0705315723835116e-06,
+      "loss": 0.3959,
+      "step": 1343
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9539946764244502,
+      "learning_rate": 2.067310781285755e-06,
+      "loss": 0.4305,
+      "step": 1344
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.019270181770809,
+      "learning_rate": 2.0640907303550545e-06,
+      "loss": 0.3601,
+      "step": 1345
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.406213238917182,
+      "learning_rate": 2.0608714250996954e-06,
+      "loss": 0.4426,
+      "step": 1346
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9236578073704644,
+      "learning_rate": 2.0576528710266875e-06,
+      "loss": 0.4038,
+      "step": 1347
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.048182172212149,
+      "learning_rate": 2.054435073641756e-06,
+      "loss": 0.3746,
+      "step": 1348
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.928863945427719,
+      "learning_rate": 2.0512180384493306e-06,
+      "loss": 0.3894,
+      "step": 1349
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.8335551339682872,
+      "learning_rate": 2.0480017709525372e-06,
+      "loss": 0.3693,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9647819756067608,
+      "learning_rate": 2.044786276653189e-06,
+      "loss": 0.3781,
+      "step": 1351
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.12907859222308,
+      "learning_rate": 2.041571561051777e-06,
+      "loss": 0.4171,
+      "step": 1352
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.9030554994611362,
+      "learning_rate": 2.0383576296474595e-06,
+      "loss": 0.3871,
+      "step": 1353
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.8482128197200014,
+      "learning_rate": 2.0351444879380533e-06,
+      "loss": 0.3801,
+      "step": 1354
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9237098856083394,
+      "learning_rate": 2.031932141420026e-06,
+      "loss": 0.397,
+      "step": 1355
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9292461604759314,
+      "learning_rate": 2.0287205955884812e-06,
+      "loss": 0.3808,
+      "step": 1356
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.905891034454967,
+      "learning_rate": 2.025509855937156e-06,
+      "loss": 0.3991,
+      "step": 1357
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8451385574242787,
+      "learning_rate": 2.0222999279584084e-06,
+      "loss": 0.3801,
+      "step": 1358
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.949400009057099,
+      "learning_rate": 2.0190908171432073e-06,
+      "loss": 0.3892,
+      "step": 1359
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9605363810464835,
+      "learning_rate": 2.0158825289811214e-06,
+      "loss": 0.3965,
+      "step": 1360
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8606173348780064,
+      "learning_rate": 2.012675068960315e-06,
+      "loss": 0.3954,
+      "step": 1361
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.894555038278285,
+      "learning_rate": 2.009468442567537e-06,
+      "loss": 0.3872,
+      "step": 1362
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.8879641436732342,
+      "learning_rate": 2.006262655288106e-06,
+      "loss": 0.381,
+      "step": 1363
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 6.804463123370788,
+      "learning_rate": 2.003057712605908e-06,
+      "loss": 0.3598,
+      "step": 1364
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9484231062475323,
+      "learning_rate": 1.9998536200033843e-06,
+      "loss": 0.387,
+      "step": 1365
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9430636182866459,
+      "learning_rate": 1.996650382961521e-06,
+      "loss": 0.3815,
+      "step": 1366
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8099872908810362,
+      "learning_rate": 1.9934480069598418e-06,
+      "loss": 0.3931,
+      "step": 1367
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.0871498559503583,
+      "learning_rate": 1.990246497476396e-06,
+      "loss": 0.3946,
+      "step": 1368
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9534152521538926,
+      "learning_rate": 1.9870458599877524e-06,
+      "loss": 0.3998,
+      "step": 1369
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9712355359168434,
+      "learning_rate": 1.9838460999689854e-06,
+      "loss": 0.3741,
+      "step": 1370
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8831191819719022,
+      "learning_rate": 1.980647222893671e-06,
+      "loss": 0.3758,
+      "step": 1371
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.03493312021646,
+      "learning_rate": 1.977449234233875e-06,
+      "loss": 0.4066,
+      "step": 1372
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9837157371609282,
+      "learning_rate": 1.9742521394601413e-06,
+      "loss": 0.3757,
+      "step": 1373
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9871704920253919,
+      "learning_rate": 1.9710559440414867e-06,
+      "loss": 0.3811,
+      "step": 1374
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8609975534569105,
+      "learning_rate": 1.9678606534453874e-06,
+      "loss": 0.3709,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.8599855946550903,
+      "learning_rate": 1.9646662731377737e-06,
+      "loss": 0.3589,
+      "step": 1376
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.0183183444158224,
+      "learning_rate": 1.9614728085830185e-06,
+      "loss": 0.3521,
+      "step": 1377
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9976152320569405,
+      "learning_rate": 1.958280265243927e-06,
+      "loss": 0.3757,
+      "step": 1378
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9951401325370672,
+      "learning_rate": 1.9550886485817313e-06,
+      "loss": 0.3947,
+      "step": 1379
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9553672687038417,
+      "learning_rate": 1.9518979640560737e-06,
+      "loss": 0.3473,
+      "step": 1380
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9340367763443969,
+      "learning_rate": 1.9487082171250057e-06,
+      "loss": 0.37,
+      "step": 1381
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.8996712185125788,
+      "learning_rate": 1.9455194132449745e-06,
+      "loss": 0.3924,
+      "step": 1382
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9351658663427442,
+      "learning_rate": 1.9423315578708126e-06,
+      "loss": 0.3959,
+      "step": 1383
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 2.0174109611058504,
+      "learning_rate": 1.939144656455731e-06,
+      "loss": 0.3987,
+      "step": 1384
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.76886531168205,
+      "learning_rate": 1.9359587144513086e-06,
+      "loss": 0.4277,
+      "step": 1385
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 2.1774228741508455,
+      "learning_rate": 1.9327737373074834e-06,
+      "loss": 0.4474,
+      "step": 1386
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.8335022286037221,
+      "learning_rate": 1.929589730472543e-06,
+      "loss": 0.3586,
+      "step": 1387
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.944762597816562,
+      "learning_rate": 1.926406699393114e-06,
+      "loss": 0.3916,
+      "step": 1388
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9158836718088024,
+      "learning_rate": 1.9232246495141554e-06,
+      "loss": 0.3471,
+      "step": 1389
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9546368466405357,
+      "learning_rate": 1.920043586278947e-06,
+      "loss": 0.3747,
+      "step": 1390
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9070019014660136,
+      "learning_rate": 1.9168635151290803e-06,
+      "loss": 0.3524,
+      "step": 1391
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.023146490194608,
+      "learning_rate": 1.9136844415044502e-06,
+      "loss": 0.3707,
+      "step": 1392
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.8809251159178713,
+      "learning_rate": 1.910506370843246e-06,
+      "loss": 0.3801,
+      "step": 1393
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0409011175956784,
+      "learning_rate": 1.9073293085819402e-06,
+      "loss": 0.373,
+      "step": 1394
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0117643519136315,
+      "learning_rate": 1.9041532601552804e-06,
+      "loss": 0.3645,
+      "step": 1395
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9716378326274158,
+      "learning_rate": 1.9009782309962805e-06,
+      "loss": 0.3614,
+      "step": 1396
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9329872273189466,
+      "learning_rate": 1.8978042265362103e-06,
+      "loss": 0.3551,
+      "step": 1397
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9199554634763143,
+      "learning_rate": 1.8946312522045874e-06,
+      "loss": 0.3902,
+      "step": 1398
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9590655710866773,
+      "learning_rate": 1.891459313429167e-06,
+      "loss": 0.4142,
+      "step": 1399
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0331664011816972,
+      "learning_rate": 1.8882884156359324e-06,
+      "loss": 0.3656,
+      "step": 1400
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.0472909494424583,
+      "learning_rate": 1.8851185642490863e-06,
+      "loss": 0.3886,
+      "step": 1401
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.9929489595454677,
+      "learning_rate": 1.8819497646910408e-06,
+      "loss": 0.3672,
+      "step": 1402
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9438211462442658,
+      "learning_rate": 1.87878202238241e-06,
+      "loss": 0.3713,
+      "step": 1403
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9090031612890588,
+      "learning_rate": 1.8756153427419996e-06,
+      "loss": 0.3806,
+      "step": 1404
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8225379267675694,
+      "learning_rate": 1.872449731186796e-06,
+      "loss": 0.3412,
+      "step": 1405
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.7944071121109437,
+      "learning_rate": 1.86928519313196e-06,
+      "loss": 0.3642,
+      "step": 1406
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9414616279338623,
+      "learning_rate": 1.8661217339908142e-06,
+      "loss": 0.3806,
+      "step": 1407
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.944356212181711,
+      "learning_rate": 1.8629593591748374e-06,
+      "loss": 0.3987,
+      "step": 1408
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.857841085738498,
+      "learning_rate": 1.8597980740936528e-06,
+      "loss": 0.3899,
+      "step": 1409
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8710356295384132,
+      "learning_rate": 1.8566378841550205e-06,
+      "loss": 0.3784,
+      "step": 1410
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8728296119496737,
+      "learning_rate": 1.8534787947648247e-06,
+      "loss": 0.3867,
+      "step": 1411
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.8738844694805654,
+      "learning_rate": 1.8503208113270687e-06,
+      "loss": 0.3696,
+      "step": 1412
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9649370685779552,
+      "learning_rate": 1.8471639392438648e-06,
+      "loss": 0.3986,
+      "step": 1413
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.7859555369523812,
+      "learning_rate": 1.8440081839154222e-06,
+      "loss": 0.3871,
+      "step": 1414
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8610430021362592,
+      "learning_rate": 1.840853550740041e-06,
+      "loss": 0.333,
+      "step": 1415
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9871037672382785,
+      "learning_rate": 1.8377000451141013e-06,
+      "loss": 0.3655,
+      "step": 1416
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0510993717790544,
+      "learning_rate": 1.8345476724320549e-06,
+      "loss": 0.3345,
+      "step": 1417
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.022865297999793,
+      "learning_rate": 1.8313964380864157e-06,
+      "loss": 0.4238,
+      "step": 1418
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0272213314003786,
+      "learning_rate": 1.8282463474677485e-06,
+      "loss": 0.3775,
+      "step": 1419
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.006744012043913,
+      "learning_rate": 1.825097405964665e-06,
+      "loss": 0.3886,
+      "step": 1420
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0596399522136406,
+      "learning_rate": 1.8219496189638065e-06,
+      "loss": 0.4091,
+      "step": 1421
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8816895162930982,
+      "learning_rate": 1.8188029918498434e-06,
+      "loss": 0.4065,
+      "step": 1422
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9988370328142775,
+      "learning_rate": 1.8156575300054607e-06,
+      "loss": 0.3968,
+      "step": 1423
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.0379288149529216,
+      "learning_rate": 1.8125132388113497e-06,
+      "loss": 0.3893,
+      "step": 1424
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.8764951987892278,
+      "learning_rate": 1.8093701236461999e-06,
+      "loss": 0.3757,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.9911843473469748,
+      "learning_rate": 1.806228189886688e-06,
+      "loss": 0.3891,
+      "step": 1426
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9631453513585595,
+      "learning_rate": 1.8030874429074701e-06,
+      "loss": 0.3969,
+      "step": 1427
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8998526626952037,
+      "learning_rate": 1.7999478880811735e-06,
+      "loss": 0.3919,
+      "step": 1428
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8805553933080315,
+      "learning_rate": 1.7968095307783845e-06,
+      "loss": 0.3767,
+      "step": 1429
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9958093732421776,
+      "learning_rate": 1.7936723763676426e-06,
+      "loss": 0.3861,
+      "step": 1430
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.8587137598489651,
+      "learning_rate": 1.7905364302154264e-06,
+      "loss": 0.3289,
+      "step": 1431
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 2.0380004642313785,
+      "learning_rate": 1.7874016976861504e-06,
+      "loss": 0.3531,
+      "step": 1432
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9171820086465794,
+      "learning_rate": 1.784268184142154e-06,
+      "loss": 0.3986,
+      "step": 1433
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.95855879390137,
+      "learning_rate": 1.7811358949436874e-06,
+      "loss": 0.3402,
+      "step": 1434
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9995990338040457,
+      "learning_rate": 1.7780048354489101e-06,
+      "loss": 0.3599,
+      "step": 1435
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9243145774410442,
+      "learning_rate": 1.7748750110138768e-06,
+      "loss": 0.4399,
+      "step": 1436
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 2.279285862974166,
+      "learning_rate": 1.7717464269925288e-06,
+      "loss": 0.3614,
+      "step": 1437
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.9005095716347011,
+      "learning_rate": 1.7686190887366875e-06,
+      "loss": 0.3665,
+      "step": 1438
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8076423185524721,
+      "learning_rate": 1.7654930015960401e-06,
+      "loss": 0.3408,
+      "step": 1439
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8762893879880087,
+      "learning_rate": 1.762368170918136e-06,
+      "loss": 0.39,
+      "step": 1440
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.0153368993119556,
+      "learning_rate": 1.7592446020483762e-06,
+      "loss": 0.3539,
+      "step": 1441
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9585515808006808,
+      "learning_rate": 1.7561223003299994e-06,
+      "loss": 0.3956,
+      "step": 1442
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.124848103864915,
+      "learning_rate": 1.7530012711040794e-06,
+      "loss": 0.4119,
+      "step": 1443
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.012402459921111,
+      "learning_rate": 1.749881519709514e-06,
+      "loss": 0.408,
+      "step": 1444
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9649268732755643,
+      "learning_rate": 1.7467630514830136e-06,
+      "loss": 0.3283,
+      "step": 1445
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8596310758669552,
+      "learning_rate": 1.7436458717590931e-06,
+      "loss": 0.4354,
+      "step": 1446
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9102148486337966,
+      "learning_rate": 1.7405299858700648e-06,
+      "loss": 0.3954,
+      "step": 1447
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.8553487771224224,
+      "learning_rate": 1.737415399146027e-06,
+      "loss": 0.3668,
+      "step": 1448
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.1142472778200756,
+      "learning_rate": 1.7343021169148554e-06,
+      "loss": 0.3745,
+      "step": 1449
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.9058887276269199,
+      "learning_rate": 1.7311901445021955e-06,
+      "loss": 0.3818,
+      "step": 1450
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 2.0622661899571666,
+      "learning_rate": 1.7280794872314499e-06,
+      "loss": 0.3961,
+      "step": 1451
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8962754770592172,
+      "learning_rate": 1.7249701504237737e-06,
+      "loss": 0.3586,
+      "step": 1452
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8165490259194481,
+      "learning_rate": 1.7218621393980606e-06,
+      "loss": 0.3311,
+      "step": 1453
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9977375977133494,
+      "learning_rate": 1.7187554594709396e-06,
+      "loss": 0.3674,
+      "step": 1454
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8504323227168384,
+      "learning_rate": 1.7156501159567607e-06,
+      "loss": 0.3743,
+      "step": 1455
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9541250949627105,
+      "learning_rate": 1.7125461141675881e-06,
+      "loss": 0.3812,
+      "step": 1456
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.993766367538168,
+      "learning_rate": 1.7094434594131914e-06,
+      "loss": 0.355,
+      "step": 1457
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.851815452351873,
+      "learning_rate": 1.7063421570010349e-06,
+      "loss": 0.3792,
+      "step": 1458
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.8699896985814497,
+      "learning_rate": 1.7032422122362704e-06,
+      "loss": 0.345,
+      "step": 1459
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.941362367589001,
+      "learning_rate": 1.700143630421727e-06,
+      "loss": 0.3735,
+      "step": 1460
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.844833441576945,
+      "learning_rate": 1.6970464168579034e-06,
+      "loss": 0.3883,
+      "step": 1461
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.9382330200940399,
+      "learning_rate": 1.6939505768429548e-06,
+      "loss": 0.3451,
+      "step": 1462
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9404379114850492,
+      "learning_rate": 1.6908561156726894e-06,
+      "loss": 0.3886,
+      "step": 1463
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.89967752240511,
+      "learning_rate": 1.6877630386405567e-06,
+      "loss": 0.4322,
+      "step": 1464
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9542258627644085,
+      "learning_rate": 1.6846713510376363e-06,
+      "loss": 0.4143,
+      "step": 1465
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 2.0224476812069305,
+      "learning_rate": 1.6815810581526337e-06,
+      "loss": 0.3885,
+      "step": 1466
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9984358815769925,
+      "learning_rate": 1.6784921652718666e-06,
+      "loss": 0.326,
+      "step": 1467
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9112545672749313,
+      "learning_rate": 1.675404677679259e-06,
+      "loss": 0.3818,
+      "step": 1468
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.8535662369823578,
+      "learning_rate": 1.6723186006563309e-06,
+      "loss": 0.348,
+      "step": 1469
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9484817526163822,
+      "learning_rate": 1.6692339394821877e-06,
+      "loss": 0.3357,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.898163029912662,
+      "learning_rate": 1.6661506994335164e-06,
+      "loss": 0.3755,
+      "step": 1471
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.8795795559493234,
+      "learning_rate": 1.6630688857845678e-06,
+      "loss": 0.3616,
+      "step": 1472
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9167503410588418,
+      "learning_rate": 1.6599885038071566e-06,
+      "loss": 0.3592,
+      "step": 1473
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.9765253259894953,
+      "learning_rate": 1.6569095587706485e-06,
+      "loss": 0.3953,
+      "step": 1474
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9352433621405845,
+      "learning_rate": 1.6538320559419488e-06,
+      "loss": 0.3528,
+      "step": 1475
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 2.0111021011512125,
+      "learning_rate": 1.6507560005854977e-06,
+      "loss": 0.407,
+      "step": 1476
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8339393905209536,
+      "learning_rate": 1.6476813979632589e-06,
+      "loss": 0.3668,
+      "step": 1477
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9309495145983575,
+      "learning_rate": 1.6446082533347096e-06,
+      "loss": 0.4106,
+      "step": 1478
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8708341753950297,
+      "learning_rate": 1.641536571956835e-06,
+      "loss": 0.3749,
+      "step": 1479
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8244009733234272,
+      "learning_rate": 1.6384663590841154e-06,
+      "loss": 0.3832,
+      "step": 1480
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8878853394194013,
+      "learning_rate": 1.6353976199685222e-06,
+      "loss": 0.3539,
+      "step": 1481
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8830734244466278,
+      "learning_rate": 1.6323303598595006e-06,
+      "loss": 0.3852,
+      "step": 1482
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.866253132730359,
+      "learning_rate": 1.6292645840039697e-06,
+      "loss": 0.364,
+      "step": 1483
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.977321954101075,
+      "learning_rate": 1.6262002976463098e-06,
+      "loss": 0.3866,
+      "step": 1484
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9753878011905568,
+      "learning_rate": 1.62313750602835e-06,
+      "loss": 0.3999,
+      "step": 1485
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.9461948334927384,
+      "learning_rate": 1.6200762143893659e-06,
+      "loss": 0.3769,
+      "step": 1486
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9597078370114984,
+      "learning_rate": 1.6170164279660656e-06,
+      "loss": 0.3546,
+      "step": 1487
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0333727955548735,
+      "learning_rate": 1.6139581519925818e-06,
+      "loss": 0.3631,
+      "step": 1488
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8957200128798963,
+      "learning_rate": 1.6109013917004657e-06,
+      "loss": 0.3738,
+      "step": 1489
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8758015207075704,
+      "learning_rate": 1.6078461523186722e-06,
+      "loss": 0.3511,
+      "step": 1490
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9539261883496823,
+      "learning_rate": 1.6047924390735587e-06,
+      "loss": 0.4074,
+      "step": 1491
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.046216911945662,
+      "learning_rate": 1.6017402571888677e-06,
+      "loss": 0.3729,
+      "step": 1492
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0334239477316194,
+      "learning_rate": 1.5986896118857247e-06,
+      "loss": 0.3999,
+      "step": 1493
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.0768274033669556,
+      "learning_rate": 1.5956405083826266e-06,
+      "loss": 0.3982,
+      "step": 1494
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9997134218487143,
+      "learning_rate": 1.592592951895432e-06,
+      "loss": 0.4319,
+      "step": 1495
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.9000589337955354,
+      "learning_rate": 1.5895469476373545e-06,
+      "loss": 0.3813,
+      "step": 1496
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8787692854188953,
+      "learning_rate": 1.5865025008189501e-06,
+      "loss": 0.3801,
+      "step": 1497
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.8346902202639779,
+      "learning_rate": 1.5834596166481132e-06,
+      "loss": 0.3533,
+      "step": 1498
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8993496821666367,
+      "learning_rate": 1.5804183003300627e-06,
+      "loss": 0.429,
+      "step": 1499
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.342530229905022,
+      "learning_rate": 1.5773785570673378e-06,
+      "loss": 0.3356,
+      "step": 1500
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.1048882391009127,
+      "learning_rate": 1.5743403920597856e-06,
+      "loss": 0.3896,
+      "step": 1501
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8528209728378324,
+      "learning_rate": 1.5713038105045535e-06,
+      "loss": 0.3307,
+      "step": 1502
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.9057632190431548,
+      "learning_rate": 1.5682688175960797e-06,
+      "loss": 0.3806,
+      "step": 1503
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8724905465304538,
+      "learning_rate": 1.5652354185260848e-06,
+      "loss": 0.3637,
+      "step": 1504
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8484069152287292,
+      "learning_rate": 1.5622036184835648e-06,
+      "loss": 0.3161,
+      "step": 1505
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8399814687678377,
+      "learning_rate": 1.559173422654778e-06,
+      "loss": 0.3745,
+      "step": 1506
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8838641942793775,
+      "learning_rate": 1.5561448362232404e-06,
+      "loss": 0.3537,
+      "step": 1507
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8623848433104377,
+      "learning_rate": 1.5531178643697142e-06,
+      "loss": 0.3624,
+      "step": 1508
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8997144759052735,
+      "learning_rate": 1.5500925122721988e-06,
+      "loss": 0.3679,
+      "step": 1509
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.8976582272389906,
+      "learning_rate": 1.5470687851059235e-06,
+      "loss": 0.3736,
+      "step": 1510
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8750760623537808,
+      "learning_rate": 1.5440466880433388e-06,
+      "loss": 0.3735,
+      "step": 1511
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.990180186983658,
+      "learning_rate": 1.5410262262541065e-06,
+      "loss": 0.3797,
+      "step": 1512
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8820633605632435,
+      "learning_rate": 1.538007404905089e-06,
+      "loss": 0.3659,
+      "step": 1513
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9458293982836543,
+      "learning_rate": 1.5349902291603441e-06,
+      "loss": 0.4092,
+      "step": 1514
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.822097097325058,
+      "learning_rate": 1.5319747041811158e-06,
+      "loss": 0.3276,
+      "step": 1515
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 2.0516824372881457,
+      "learning_rate": 1.528960835125822e-06,
+      "loss": 0.4232,
+      "step": 1516
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 2.0624060387577816,
+      "learning_rate": 1.5259486271500489e-06,
+      "loss": 0.3996,
+      "step": 1517
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9158764361943028,
+      "learning_rate": 1.522938085406542e-06,
+      "loss": 0.3728,
+      "step": 1518
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9071590654189663,
+      "learning_rate": 1.5199292150451956e-06,
+      "loss": 0.3459,
+      "step": 1519
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9532115896688163,
+      "learning_rate": 1.5169220212130449e-06,
+      "loss": 0.3513,
+      "step": 1520
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.9901825773245059,
+      "learning_rate": 1.5139165090542574e-06,
+      "loss": 0.3468,
+      "step": 1521
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.7913388603914477,
+      "learning_rate": 1.510912683710124e-06,
+      "loss": 0.3381,
+      "step": 1522
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.8270379040698477,
+      "learning_rate": 1.5079105503190497e-06,
+      "loss": 0.3873,
+      "step": 1523
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9259224146444094,
+      "learning_rate": 1.5049101140165453e-06,
+      "loss": 0.3553,
+      "step": 1524
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.7933642267566716,
+      "learning_rate": 1.501911379935219e-06,
+      "loss": 0.3928,
+      "step": 1525
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.859002957520952,
+      "learning_rate": 1.498914353204767e-06,
+      "loss": 0.3331,
+      "step": 1526
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9280095918192017,
+      "learning_rate": 1.4959190389519646e-06,
+      "loss": 0.3902,
+      "step": 1527
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9929705610530277,
+      "learning_rate": 1.492925442300658e-06,
+      "loss": 0.3765,
+      "step": 1528
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.02617558936789,
+      "learning_rate": 1.4899335683717546e-06,
+      "loss": 0.3815,
+      "step": 1529
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8532248246777345,
+      "learning_rate": 1.4869434222832157e-06,
+      "loss": 0.3998,
+      "step": 1530
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8616511215661515,
+      "learning_rate": 1.4839550091500464e-06,
+      "loss": 0.4005,
+      "step": 1531
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9696593290003677,
+      "learning_rate": 1.4809683340842885e-06,
+      "loss": 0.4136,
+      "step": 1532
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9439323576237217,
+      "learning_rate": 1.477983402195008e-06,
+      "loss": 0.3674,
+      "step": 1533
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.8858064066643994,
+      "learning_rate": 1.475000218588291e-06,
+      "loss": 0.3505,
+      "step": 1534
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9565923900750009,
+      "learning_rate": 1.4720187883672337e-06,
+      "loss": 0.379,
+      "step": 1535
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9482950589580994,
+      "learning_rate": 1.4690391166319307e-06,
+      "loss": 0.3962,
+      "step": 1536
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.979462387227227,
+      "learning_rate": 1.4660612084794701e-06,
+      "loss": 0.3662,
+      "step": 1537
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.894203355197371,
+      "learning_rate": 1.4630850690039221e-06,
+      "loss": 0.3703,
+      "step": 1538
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.8798042105520323,
+      "learning_rate": 1.460110703296333e-06,
+      "loss": 0.3631,
+      "step": 1539
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9687008779986372,
+      "learning_rate": 1.4571381164447137e-06,
+      "loss": 0.4081,
+      "step": 1540
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 2.043706332156422,
+      "learning_rate": 1.454167313534031e-06,
+      "loss": 0.3629,
+      "step": 1541
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9336401989651433,
+      "learning_rate": 1.4511982996462038e-06,
+      "loss": 0.4042,
+      "step": 1542
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.9550529998108908,
+      "learning_rate": 1.4482310798600852e-06,
+      "loss": 0.3768,
+      "step": 1543
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.874147928818456,
+      "learning_rate": 1.4452656592514633e-06,
+      "loss": 0.4125,
+      "step": 1544
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.848295970105597,
+      "learning_rate": 1.442302042893048e-06,
+      "loss": 0.3646,
+      "step": 1545
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.991422406332833,
+      "learning_rate": 1.439340235854462e-06,
+      "loss": 0.3885,
+      "step": 1546
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.89855710617557,
+      "learning_rate": 1.436380243202233e-06,
+      "loss": 0.3658,
+      "step": 1547
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8657910310229384,
+      "learning_rate": 1.4334220699997856e-06,
+      "loss": 0.3659,
+      "step": 1548
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9035891506078888,
+      "learning_rate": 1.4304657213074314e-06,
+      "loss": 0.3662,
+      "step": 1549
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9026573701280374,
+      "learning_rate": 1.4275112021823618e-06,
+      "loss": 0.3712,
+      "step": 1550
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9342408780305267,
+      "learning_rate": 1.4245585176786363e-06,
+      "loss": 0.355,
+      "step": 1551
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8785254217068754,
+      "learning_rate": 1.4216076728471794e-06,
+      "loss": 0.3985,
+      "step": 1552
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9602955113202258,
+      "learning_rate": 1.4186586727357649e-06,
+      "loss": 0.4063,
+      "step": 1553
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 2.083823151902659,
+      "learning_rate": 1.4157115223890136e-06,
+      "loss": 0.4121,
+      "step": 1554
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8676871403375772,
+      "learning_rate": 1.4127662268483818e-06,
+      "loss": 0.3912,
+      "step": 1555
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9120128683776039,
+      "learning_rate": 1.4098227911521523e-06,
+      "loss": 0.3453,
+      "step": 1556
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.844790264464269,
+      "learning_rate": 1.4068812203354264e-06,
+      "loss": 0.3666,
+      "step": 1557
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8477236162312085,
+      "learning_rate": 1.4039415194301159e-06,
+      "loss": 0.3652,
+      "step": 1558
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.9200270211079769,
+      "learning_rate": 1.4010036934649334e-06,
+      "loss": 0.3755,
+      "step": 1559
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8353558471804892,
+      "learning_rate": 1.3980677474653838e-06,
+      "loss": 0.3653,
+      "step": 1560
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9621989060334357,
+      "learning_rate": 1.3951336864537572e-06,
+      "loss": 0.4104,
+      "step": 1561
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8245538722983388,
+      "learning_rate": 1.3922015154491194e-06,
+      "loss": 0.3991,
+      "step": 1562
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.933539870056334,
+      "learning_rate": 1.3892712394673002e-06,
+      "loss": 0.3877,
+      "step": 1563
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8275785324682217,
+      "learning_rate": 1.3863428635208915e-06,
+      "loss": 0.3546,
+      "step": 1564
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.0450836317829215,
+      "learning_rate": 1.3834163926192318e-06,
+      "loss": 0.3847,
+      "step": 1565
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 3.523986698344347,
+      "learning_rate": 1.380491831768403e-06,
+      "loss": 0.3502,
+      "step": 1566
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9164812764116064,
+      "learning_rate": 1.3775691859712193e-06,
+      "loss": 0.309,
+      "step": 1567
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.0951493120042604,
+      "learning_rate": 1.3746484602272178e-06,
+      "loss": 0.3678,
+      "step": 1568
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.8843177010635455,
+      "learning_rate": 1.3717296595326527e-06,
+      "loss": 0.358,
+      "step": 1569
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9562282189438478,
+      "learning_rate": 1.3688127888804837e-06,
+      "loss": 0.4021,
+      "step": 1570
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.997781626544885,
+      "learning_rate": 1.36589785326037e-06,
+      "loss": 0.4158,
+      "step": 1571
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.8805954764404564,
+      "learning_rate": 1.3629848576586604e-06,
+      "loss": 0.3678,
+      "step": 1572
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.037723153555198,
+      "learning_rate": 1.3600738070583858e-06,
+      "loss": 0.3611,
+      "step": 1573
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9504207408498462,
+      "learning_rate": 1.3571647064392467e-06,
+      "loss": 0.4096,
+      "step": 1574
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0573869926356494,
+      "learning_rate": 1.3542575607776117e-06,
+      "loss": 0.3698,
+      "step": 1575
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9648011988919714,
+      "learning_rate": 1.3513523750465049e-06,
+      "loss": 0.3557,
+      "step": 1576
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0566628239070077,
+      "learning_rate": 1.3484491542155941e-06,
+      "loss": 0.4099,
+      "step": 1577
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.840088910062188,
+      "learning_rate": 1.3455479032511903e-06,
+      "loss": 0.3759,
+      "step": 1578
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.916068103431673,
+      "learning_rate": 1.3426486271162326e-06,
+      "loss": 0.36,
+      "step": 1579
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.932989091441797,
+      "learning_rate": 1.3397513307702817e-06,
+      "loss": 0.3658,
+      "step": 1580
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.8629067871512175,
+      "learning_rate": 1.3368560191695126e-06,
+      "loss": 0.3562,
+      "step": 1581
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.0118302341661307,
+      "learning_rate": 1.3339626972667048e-06,
+      "loss": 0.3878,
+      "step": 1582
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.9124583307461076,
+      "learning_rate": 1.3310713700112348e-06,
+      "loss": 0.3809,
+      "step": 1583
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9774861213509043,
+      "learning_rate": 1.328182042349065e-06,
+      "loss": 0.4137,
+      "step": 1584
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9114216906066048,
+      "learning_rate": 1.3252947192227388e-06,
+      "loss": 0.3837,
+      "step": 1585
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8560468375199388,
+      "learning_rate": 1.3224094055713713e-06,
+      "loss": 0.3603,
+      "step": 1586
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9212128604014926,
+      "learning_rate": 1.3195261063306381e-06,
+      "loss": 0.3458,
+      "step": 1587
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9251208352537634,
+      "learning_rate": 1.316644826432772e-06,
+      "loss": 0.3844,
+      "step": 1588
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.883081065391109,
+      "learning_rate": 1.313765570806547e-06,
+      "loss": 0.4208,
+      "step": 1589
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8564972529452957,
+      "learning_rate": 1.3108883443772779e-06,
+      "loss": 0.3622,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9725309818034906,
+      "learning_rate": 1.3080131520668075e-06,
+      "loss": 0.3489,
+      "step": 1591
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9747828638689664,
+      "learning_rate": 1.3051399987934988e-06,
+      "loss": 0.38,
+      "step": 1592
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8498395134731278,
+      "learning_rate": 1.3022688894722271e-06,
+      "loss": 0.3797,
+      "step": 1593
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.8845414148933772,
+      "learning_rate": 1.2993998290143698e-06,
+      "loss": 0.3335,
+      "step": 1594
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.9610318168301932,
+      "learning_rate": 1.296532822327801e-06,
+      "loss": 0.3769,
+      "step": 1595
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8917429842068785,
+      "learning_rate": 1.2936678743168813e-06,
+      "loss": 0.3981,
+      "step": 1596
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.005525949740854,
+      "learning_rate": 1.29080498988245e-06,
+      "loss": 0.3789,
+      "step": 1597
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.891996578027132,
+      "learning_rate": 1.2879441739218152e-06,
+      "loss": 0.3906,
+      "step": 1598
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.0224573297517114,
+      "learning_rate": 1.285085431328748e-06,
+      "loss": 0.3852,
+      "step": 1599
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9933585122271171,
+      "learning_rate": 1.282228766993472e-06,
+      "loss": 0.3811,
+      "step": 1600
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9655771579152717,
+      "learning_rate": 1.2793741858026565e-06,
+      "loss": 0.3799,
+      "step": 1601
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8953068551718162,
+      "learning_rate": 1.2765216926394047e-06,
+      "loss": 0.3508,
+      "step": 1602
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8702448937265155,
+      "learning_rate": 1.2736712923832526e-06,
+      "loss": 0.3427,
+      "step": 1603
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.9279047888369216,
+      "learning_rate": 1.2708229899101505e-06,
+      "loss": 0.3755,
+      "step": 1604
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8867926377124098,
+      "learning_rate": 1.2679767900924647e-06,
+      "loss": 0.3366,
+      "step": 1605
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8256946570291102,
+      "learning_rate": 1.2651326977989629e-06,
+      "loss": 0.3419,
+      "step": 1606
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.8845001674022432,
+      "learning_rate": 1.2622907178948074e-06,
+      "loss": 0.3593,
+      "step": 1607
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.8041550297275601,
+      "learning_rate": 1.2594508552415474e-06,
+      "loss": 0.3565,
+      "step": 1608
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.929162466271085,
+      "learning_rate": 1.2566131146971105e-06,
+      "loss": 0.346,
+      "step": 1609
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9783530922620556,
+      "learning_rate": 1.2537775011157943e-06,
+      "loss": 0.3655,
+      "step": 1610
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9493980516637623,
+      "learning_rate": 1.2509440193482564e-06,
+      "loss": 0.417,
+      "step": 1611
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.8895548928491517,
+      "learning_rate": 1.2481126742415098e-06,
+      "loss": 0.3731,
+      "step": 1612
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.874868433424839,
+      "learning_rate": 1.2452834706389122e-06,
+      "loss": 0.3743,
+      "step": 1613
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.917114604759422,
+      "learning_rate": 1.2424564133801553e-06,
+      "loss": 0.3412,
+      "step": 1614
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9354723425395528,
+      "learning_rate": 1.2396315073012636e-06,
+      "loss": 0.3564,
+      "step": 1615
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9621850514310992,
+      "learning_rate": 1.2368087572345772e-06,
+      "loss": 0.348,
+      "step": 1616
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 2.058589411316211,
+      "learning_rate": 1.233988168008751e-06,
+      "loss": 0.3679,
+      "step": 1617
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9516795286397743,
+      "learning_rate": 1.2311697444487431e-06,
+      "loss": 0.3635,
+      "step": 1618
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.9233248775745249,
+      "learning_rate": 1.2283534913758066e-06,
+      "loss": 0.3957,
+      "step": 1619
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9303786560618386,
+      "learning_rate": 1.225539413607482e-06,
+      "loss": 0.3806,
+      "step": 1620
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.030744520145863,
+      "learning_rate": 1.222727515957588e-06,
+      "loss": 0.4023,
+      "step": 1621
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9537051918570292,
+      "learning_rate": 1.2199178032362149e-06,
+      "loss": 0.3808,
+      "step": 1622
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.8928085054817043,
+      "learning_rate": 1.2171102802497148e-06,
+      "loss": 0.3982,
+      "step": 1623
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.0571192296380296,
+      "learning_rate": 1.2143049518006952e-06,
+      "loss": 0.4044,
+      "step": 1624
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.856402590326006,
+      "learning_rate": 1.2115018226880063e-06,
+      "loss": 0.3977,
+      "step": 1625
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.927548078890778,
+      "learning_rate": 1.208700897706739e-06,
+      "loss": 0.4048,
+      "step": 1626
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9400375481531664,
+      "learning_rate": 1.205902181648215e-06,
+      "loss": 0.3605,
+      "step": 1627
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.873775466516257,
+      "learning_rate": 1.2031056792999726e-06,
+      "loss": 0.3375,
+      "step": 1628
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9913863168589552,
+      "learning_rate": 1.2003113954457673e-06,
+      "loss": 0.3964,
+      "step": 1629
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.9685736172926571,
+      "learning_rate": 1.1975193348655584e-06,
+      "loss": 0.3587,
+      "step": 1630
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.8698671252931964,
+      "learning_rate": 1.1947295023355022e-06,
+      "loss": 0.3568,
+      "step": 1631
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9615330930141146,
+      "learning_rate": 1.1919419026279434e-06,
+      "loss": 0.385,
+      "step": 1632
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8699401980633292,
+      "learning_rate": 1.189156540511407e-06,
+      "loss": 0.362,
+      "step": 1633
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.054845402143213,
+      "learning_rate": 1.186373420750592e-06,
+      "loss": 0.3746,
+      "step": 1634
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.828582523525085,
+      "learning_rate": 1.1835925481063575e-06,
+      "loss": 0.3915,
+      "step": 1635
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9369510226251998,
+      "learning_rate": 1.1808139273357232e-06,
+      "loss": 0.3736,
+      "step": 1636
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.9623351823945685,
+      "learning_rate": 1.1780375631918544e-06,
+      "loss": 0.3861,
+      "step": 1637
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.057951803903781,
+      "learning_rate": 1.1752634604240565e-06,
+      "loss": 0.3988,
+      "step": 1638
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.926766218075179,
+      "learning_rate": 1.1724916237777675e-06,
+      "loss": 0.3526,
+      "step": 1639
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8312750701887877,
+      "learning_rate": 1.1697220579945466e-06,
+      "loss": 0.3518,
+      "step": 1640
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.025004547929062,
+      "learning_rate": 1.1669547678120701e-06,
+      "loss": 0.3651,
+      "step": 1641
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.3363123335351874,
+      "learning_rate": 1.1641897579641221e-06,
+      "loss": 0.4033,
+      "step": 1642
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.8749245234784346,
+      "learning_rate": 1.1614270331805844e-06,
+      "loss": 0.3701,
+      "step": 1643
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8332454151429327,
+      "learning_rate": 1.1586665981874323e-06,
+      "loss": 0.3911,
+      "step": 1644
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.217946505455479,
+      "learning_rate": 1.1559084577067206e-06,
+      "loss": 0.3346,
+      "step": 1645
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.799776695931742,
+      "learning_rate": 1.1531526164565816e-06,
+      "loss": 0.3489,
+      "step": 1646
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.9376934559686718,
+      "learning_rate": 1.150399079151214e-06,
+      "loss": 0.3721,
+      "step": 1647
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.826040524283735,
+      "learning_rate": 1.1476478505008753e-06,
+      "loss": 0.3464,
+      "step": 1648
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.9007570045973046,
+      "learning_rate": 1.144898935211874e-06,
+      "loss": 0.3859,
+      "step": 1649
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.1474984005060334,
+      "learning_rate": 1.1421523379865603e-06,
+      "loss": 0.3456,
+      "step": 1650
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.843189989485683,
+      "learning_rate": 1.1394080635233204e-06,
+      "loss": 0.3052,
+      "step": 1651
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.009903889503656,
+      "learning_rate": 1.136666116516567e-06,
+      "loss": 0.4498,
+      "step": 1652
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 3.0285468769549473,
+      "learning_rate": 1.1339265016567294e-06,
+      "loss": 0.3532,
+      "step": 1653
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8725192886740858,
+      "learning_rate": 1.1311892236302508e-06,
+      "loss": 0.3685,
+      "step": 1654
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.8726862166869487,
+      "learning_rate": 1.128454287119573e-06,
+      "loss": 0.3761,
+      "step": 1655
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8883766624211467,
+      "learning_rate": 1.1257216968031357e-06,
+      "loss": 0.3574,
+      "step": 1656
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9004020165185547,
+      "learning_rate": 1.1229914573553641e-06,
+      "loss": 0.3638,
+      "step": 1657
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8723325311418417,
+      "learning_rate": 1.1202635734466612e-06,
+      "loss": 0.3468,
+      "step": 1658
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.804021127084218,
+      "learning_rate": 1.1175380497434022e-06,
+      "loss": 0.3534,
+      "step": 1659
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8962566852248846,
+      "learning_rate": 1.1148148909079229e-06,
+      "loss": 0.3943,
+      "step": 1660
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9982200928541012,
+      "learning_rate": 1.1120941015985152e-06,
+      "loss": 0.4224,
+      "step": 1661
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8053179049263286,
+      "learning_rate": 1.109375686469417e-06,
+      "loss": 0.3389,
+      "step": 1662
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.888467793597335,
+      "learning_rate": 1.106659650170805e-06,
+      "loss": 0.387,
+      "step": 1663
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.8685159814187862,
+      "learning_rate": 1.1039459973487876e-06,
+      "loss": 0.3428,
+      "step": 1664
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.972180811818148,
+      "learning_rate": 1.101234732645393e-06,
+      "loss": 0.3587,
+      "step": 1665
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 2.252459557872569,
+      "learning_rate": 1.0985258606985683e-06,
+      "loss": 0.3684,
+      "step": 1666
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.9679034729828595,
+      "learning_rate": 1.0958193861421634e-06,
+      "loss": 0.338,
+      "step": 1667
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9117564762331398,
+      "learning_rate": 1.0931153136059304e-06,
+      "loss": 0.4016,
+      "step": 1668
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.915297031471561,
+      "learning_rate": 1.0904136477155112e-06,
+      "loss": 0.3629,
+      "step": 1669
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8376703588677337,
+      "learning_rate": 1.0877143930924306e-06,
+      "loss": 0.371,
+      "step": 1670
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9070422380758454,
+      "learning_rate": 1.085017554354089e-06,
+      "loss": 0.3533,
+      "step": 1671
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9752631861235486,
+      "learning_rate": 1.0823231361137543e-06,
+      "loss": 0.4164,
+      "step": 1672
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.885197204563304,
+      "learning_rate": 1.0796311429805536e-06,
+      "loss": 0.3929,
+      "step": 1673
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.9090106863841916,
+      "learning_rate": 1.0769415795594659e-06,
+      "loss": 0.3449,
+      "step": 1674
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 2.022637519082336,
+      "learning_rate": 1.074254450451314e-06,
+      "loss": 0.3553,
+      "step": 1675
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.942217527277708,
+      "learning_rate": 1.0715697602527542e-06,
+      "loss": 0.3936,
+      "step": 1676
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8809306152215932,
+      "learning_rate": 1.0688875135562738e-06,
+      "loss": 0.3481,
+      "step": 1677
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 2.0969194462234513,
+      "learning_rate": 1.0662077149501798e-06,
+      "loss": 0.3864,
+      "step": 1678
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.8365124296835973,
+      "learning_rate": 1.0635303690185894e-06,
+      "loss": 0.3778,
+      "step": 1679
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9221630207347382,
+      "learning_rate": 1.0608554803414256e-06,
+      "loss": 0.3443,
+      "step": 1680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9319799829762891,
+      "learning_rate": 1.0581830534944084e-06,
+      "loss": 0.3759,
+      "step": 1681
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 2.00532761754314,
+      "learning_rate": 1.055513093049046e-06,
+      "loss": 0.373,
+      "step": 1682
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8361577324130107,
+      "learning_rate": 1.052845603572627e-06,
+      "loss": 0.3671,
+      "step": 1683
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9246365496147386,
+      "learning_rate": 1.0501805896282144e-06,
+      "loss": 0.3888,
+      "step": 1684
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.933677406014513,
+      "learning_rate": 1.047518055774636e-06,
+      "loss": 0.428,
+      "step": 1685
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8497481971894003,
+      "learning_rate": 1.0448580065664754e-06,
+      "loss": 0.339,
+      "step": 1686
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.9674163310656592,
+      "learning_rate": 1.042200446554068e-06,
+      "loss": 0.3933,
+      "step": 1687
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8703345670634528,
+      "learning_rate": 1.039545380283491e-06,
+      "loss": 0.3805,
+      "step": 1688
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8996794102359933,
+      "learning_rate": 1.0368928122965547e-06,
+      "loss": 0.3612,
+      "step": 1689
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8163372630466865,
+      "learning_rate": 1.0342427471307973e-06,
+      "loss": 0.3631,
+      "step": 1690
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.8990581755942872,
+      "learning_rate": 1.031595189319473e-06,
+      "loss": 0.4539,
+      "step": 1691
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.9101558963616596,
+      "learning_rate": 1.0289501433915493e-06,
+      "loss": 0.4649,
+      "step": 1692
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8873611659348446,
+      "learning_rate": 1.0263076138716962e-06,
+      "loss": 0.3649,
+      "step": 1693
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.823482013352725,
+      "learning_rate": 1.0236676052802791e-06,
+      "loss": 0.3648,
+      "step": 1694
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8931382792204232,
+      "learning_rate": 1.0210301221333512e-06,
+      "loss": 0.3589,
+      "step": 1695
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 2.0713580311911355,
+      "learning_rate": 1.0183951689426438e-06,
+      "loss": 0.3474,
+      "step": 1696
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8607620741027457,
+      "learning_rate": 1.0157627502155632e-06,
+      "loss": 0.3773,
+      "step": 1697
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.8645944548746636,
+      "learning_rate": 1.0131328704551782e-06,
+      "loss": 0.3457,
+      "step": 1698
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.852711235772826,
+      "learning_rate": 1.0105055341602153e-06,
+      "loss": 0.3559,
+      "step": 1699
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.969084245230365,
+      "learning_rate": 1.00788074582505e-06,
+      "loss": 0.3786,
+      "step": 1700
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.939185809703108,
+      "learning_rate": 1.005258509939699e-06,
+      "loss": 0.3649,
+      "step": 1701
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.9104875321871906,
+      "learning_rate": 1.0026388309898132e-06,
+      "loss": 0.388,
+      "step": 1702
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 2.161662535348609,
+      "learning_rate": 1.0000217134566694e-06,
+      "loss": 0.3692,
+      "step": 1703
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8024704245485432,
+      "learning_rate": 9.974071618171613e-07,
+      "loss": 0.3751,
+      "step": 1704
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.7739470701867779,
+      "learning_rate": 9.94795180543796e-07,
+      "loss": 0.3373,
+      "step": 1705
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8188576734630457,
+      "learning_rate": 9.921857741046806e-07,
+      "loss": 0.3945,
+      "step": 1706
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.9054961265186567,
+      "learning_rate": 9.895789469635204e-07,
+      "loss": 0.3518,
+      "step": 1707
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8782724635395873,
+      "learning_rate": 9.869747035796071e-07,
+      "loss": 0.37,
+      "step": 1708
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.838615529167183,
+      "learning_rate": 9.843730484078128e-07,
+      "loss": 0.3376,
+      "step": 1709
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.785535753238471,
+      "learning_rate": 9.817739858985828e-07,
+      "loss": 0.337,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8535882977550358,
+      "learning_rate": 9.791775204979263e-07,
+      "loss": 0.3391,
+      "step": 1711
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.882614515071742,
+      "learning_rate": 9.765836566474105e-07,
+      "loss": 0.391,
+      "step": 1712
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8285960302994975,
+      "learning_rate": 9.739923987841518e-07,
+      "loss": 0.356,
+      "step": 1713
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8054856197120326,
+      "learning_rate": 9.714037513408093e-07,
+      "loss": 0.3623,
+      "step": 1714
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8671208649893825,
+      "learning_rate": 9.68817718745577e-07,
+      "loss": 0.3693,
+      "step": 1715
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.9004503058230886,
+      "learning_rate": 9.662343054221743e-07,
+      "loss": 0.3327,
+      "step": 1716
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9148999919712566,
+      "learning_rate": 9.636535157898422e-07,
+      "loss": 0.3618,
+      "step": 1717
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8635582232372712,
+      "learning_rate": 9.610753542633309e-07,
+      "loss": 0.3884,
+      "step": 1718
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9383472683274976,
+      "learning_rate": 9.58499825252897e-07,
+      "loss": 0.3953,
+      "step": 1719
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.946035726357351,
+      "learning_rate": 9.559269331642937e-07,
+      "loss": 0.3292,
+      "step": 1720
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8700217872447233,
+      "learning_rate": 9.533566823987628e-07,
+      "loss": 0.361,
+      "step": 1721
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8900223904453795,
+      "learning_rate": 9.507890773530276e-07,
+      "loss": 0.3349,
+      "step": 1722
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9125828500996216,
+      "learning_rate": 9.482241224192867e-07,
+      "loss": 0.3641,
+      "step": 1723
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.940533327906808,
+      "learning_rate": 9.456618219852042e-07,
+      "loss": 0.4036,
+      "step": 1724
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.0712298544333687,
+      "learning_rate": 9.431021804339047e-07,
+      "loss": 0.3934,
+      "step": 1725
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8791027421557622,
+      "learning_rate": 9.40545202143962e-07,
+      "loss": 0.3507,
+      "step": 1726
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.9686923479849525,
+      "learning_rate": 9.379908914893962e-07,
+      "loss": 0.3497,
+      "step": 1727
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.0437193308441253,
+      "learning_rate": 9.354392528396638e-07,
+      "loss": 0.395,
+      "step": 1728
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.864988214025856,
+      "learning_rate": 9.328902905596512e-07,
+      "loss": 0.379,
+      "step": 1729
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.963062444850751,
+      "learning_rate": 9.303440090096633e-07,
+      "loss": 0.3565,
+      "step": 1730
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9399170798660286,
+      "learning_rate": 9.278004125454232e-07,
+      "loss": 0.415,
+      "step": 1731
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.874726297624515,
+      "learning_rate": 9.252595055180585e-07,
+      "loss": 0.3606,
+      "step": 1732
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.834934005776965,
+      "learning_rate": 9.227212922740971e-07,
+      "loss": 0.4104,
+      "step": 1733
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.8726418919835732,
+      "learning_rate": 9.20185777155459e-07,
+      "loss": 0.3325,
+      "step": 1734
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9432074923657174,
+      "learning_rate": 9.176529644994481e-07,
+      "loss": 0.3663,
+      "step": 1735
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.7937207452405413,
+      "learning_rate": 9.151228586387464e-07,
+      "loss": 0.3225,
+      "step": 1736
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.911607719176901,
+      "learning_rate": 9.125954639014037e-07,
+      "loss": 0.3491,
+      "step": 1737
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.8954594851178048,
+      "learning_rate": 9.100707846108337e-07,
+      "loss": 0.3474,
+      "step": 1738
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9081066235083353,
+      "learning_rate": 9.075488250858047e-07,
+      "loss": 0.3654,
+      "step": 1739
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9384836973235149,
+      "learning_rate": 9.050295896404326e-07,
+      "loss": 0.3519,
+      "step": 1740
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.9655302768136176,
+      "learning_rate": 9.02513082584173e-07,
+      "loss": 0.3482,
+      "step": 1741
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.900218584161994,
+      "learning_rate": 8.999993082218156e-07,
+      "loss": 0.3576,
+      "step": 1742
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 2.030742409886431,
+      "learning_rate": 8.974882708534724e-07,
+      "loss": 0.3055,
+      "step": 1743
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.865959678567607,
+      "learning_rate": 8.949799747745766e-07,
+      "loss": 0.3485,
+      "step": 1744
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8300998571759115,
+      "learning_rate": 8.924744242758707e-07,
+      "loss": 0.3412,
+      "step": 1745
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 2.3841641123937514,
+      "learning_rate": 8.899716236434019e-07,
+      "loss": 0.3484,
+      "step": 1746
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.844271076789803,
+      "learning_rate": 8.874715771585105e-07,
+      "loss": 0.3762,
+      "step": 1747
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8687696131042617,
+      "learning_rate": 8.84974289097828e-07,
+      "loss": 0.402,
+      "step": 1748
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.889973499535232,
+      "learning_rate": 8.824797637332669e-07,
+      "loss": 0.3566,
+      "step": 1749
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8681107208205963,
+      "learning_rate": 8.799880053320131e-07,
+      "loss": 0.4057,
+      "step": 1750
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8928327876139377,
+      "learning_rate": 8.774990181565201e-07,
+      "loss": 0.3784,
+      "step": 1751
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.931089236577729,
+      "learning_rate": 8.750128064645002e-07,
+      "loss": 0.4008,
+      "step": 1752
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9573581859995763,
+      "learning_rate": 8.725293745089181e-07,
+      "loss": 0.3486,
+      "step": 1753
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9164746693234396,
+      "learning_rate": 8.700487265379845e-07,
+      "loss": 0.3634,
+      "step": 1754
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.812159570787973,
+      "learning_rate": 8.675708667951446e-07,
+      "loss": 0.3476,
+      "step": 1755
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 2.0355096473340146,
+      "learning_rate": 8.650957995190784e-07,
+      "loss": 0.3562,
+      "step": 1756
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8995538618272807,
+      "learning_rate": 8.626235289436846e-07,
+      "loss": 0.3767,
+      "step": 1757
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8751894629115184,
+      "learning_rate": 8.601540592980812e-07,
+      "loss": 0.3709,
+      "step": 1758
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8772906072081945,
+      "learning_rate": 8.576873948065931e-07,
+      "loss": 0.3692,
+      "step": 1759
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.855725719743314,
+      "learning_rate": 8.552235396887479e-07,
+      "loss": 0.3461,
+      "step": 1760
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9058932387569096,
+      "learning_rate": 8.52762498159266e-07,
+      "loss": 0.4035,
+      "step": 1761
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8155999399280405,
+      "learning_rate": 8.503042744280565e-07,
+      "loss": 0.3821,
+      "step": 1762
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.9191184065214926,
+      "learning_rate": 8.478488727002062e-07,
+      "loss": 0.4182,
+      "step": 1763
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.8660511914055784,
+      "learning_rate": 8.453962971759766e-07,
+      "loss": 0.3936,
+      "step": 1764
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8559359079620885,
+      "learning_rate": 8.429465520507932e-07,
+      "loss": 0.3555,
+      "step": 1765
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.871625930259135,
+      "learning_rate": 8.404996415152414e-07,
+      "loss": 0.3336,
+      "step": 1766
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9146405985810966,
+      "learning_rate": 8.38055569755055e-07,
+      "loss": 0.3595,
+      "step": 1767
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8172916285896499,
+      "learning_rate": 8.356143409511145e-07,
+      "loss": 0.3763,
+      "step": 1768
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9045338434685268,
+      "learning_rate": 8.331759592794344e-07,
+      "loss": 0.3454,
+      "step": 1769
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.9019450574908656,
+      "learning_rate": 8.307404289111618e-07,
+      "loss": 0.3782,
+      "step": 1770
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8040956687408418,
+      "learning_rate": 8.283077540125642e-07,
+      "loss": 0.3397,
+      "step": 1771
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8854623689371994,
+      "learning_rate": 8.258779387450258e-07,
+      "loss": 0.3632,
+      "step": 1772
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8703628366355571,
+      "learning_rate": 8.234509872650381e-07,
+      "loss": 0.3796,
+      "step": 1773
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8974382562927672,
+      "learning_rate": 8.210269037241945e-07,
+      "loss": 0.3577,
+      "step": 1774
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8041564148309792,
+      "learning_rate": 8.186056922691816e-07,
+      "loss": 0.3423,
+      "step": 1775
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.8871513088592733,
+      "learning_rate": 8.161873570417742e-07,
+      "loss": 0.3724,
+      "step": 1776
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.7959090299202567,
+      "learning_rate": 8.137719021788248e-07,
+      "loss": 0.3514,
+      "step": 1777
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.77414937614363,
+      "learning_rate": 8.113593318122609e-07,
+      "loss": 0.3655,
+      "step": 1778
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8415138040355723,
+      "learning_rate": 8.089496500690747e-07,
+      "loss": 0.3469,
+      "step": 1779
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.943916626029921,
+      "learning_rate": 8.06542861071318e-07,
+      "loss": 0.3626,
+      "step": 1780
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.9699325195709307,
+      "learning_rate": 8.041389689360921e-07,
+      "loss": 0.3897,
+      "step": 1781
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8300758832916175,
+      "learning_rate": 8.01737977775545e-07,
+      "loss": 0.3528,
+      "step": 1782
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8854405268423242,
+      "learning_rate": 7.993398916968609e-07,
+      "loss": 0.3458,
+      "step": 1783
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8610707367327934,
+      "learning_rate": 7.969447148022555e-07,
+      "loss": 0.3825,
+      "step": 1784
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8761158349166456,
+      "learning_rate": 7.945524511889676e-07,
+      "loss": 0.361,
+      "step": 1785
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8316905966902863,
+      "learning_rate": 7.921631049492526e-07,
+      "loss": 0.3791,
+      "step": 1786
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8815617462853849,
+      "learning_rate": 7.897766801703754e-07,
+      "loss": 0.3334,
+      "step": 1787
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.8069850793814037,
+      "learning_rate": 7.873931809346022e-07,
+      "loss": 0.3063,
+      "step": 1788
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.877897596569181,
+      "learning_rate": 7.850126113191961e-07,
+      "loss": 0.3551,
+      "step": 1789
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.933100704380605,
+      "learning_rate": 7.826349753964083e-07,
+      "loss": 0.4,
+      "step": 1790
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8588317568608963,
+      "learning_rate": 7.802602772334719e-07,
+      "loss": 0.3695,
+      "step": 1791
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.75903586927703,
+      "learning_rate": 7.778885208925943e-07,
+      "loss": 0.3334,
+      "step": 1792
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.847597726088611,
+      "learning_rate": 7.755197104309512e-07,
+      "loss": 0.3508,
+      "step": 1793
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8730373365521515,
+      "learning_rate": 7.731538499006767e-07,
+      "loss": 0.3727,
+      "step": 1794
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8696875894594878,
+      "learning_rate": 7.707909433488611e-07,
+      "loss": 0.3694,
+      "step": 1795
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8224097896476315,
+      "learning_rate": 7.684309948175414e-07,
+      "loss": 0.3682,
+      "step": 1796
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8896591788553188,
+      "learning_rate": 7.660740083436943e-07,
+      "loss": 0.353,
+      "step": 1797
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8622597363460462,
+      "learning_rate": 7.637199879592275e-07,
+      "loss": 0.3835,
+      "step": 1798
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.8261440807434144,
+      "learning_rate": 7.61368937690978e-07,
+      "loss": 0.3673,
+      "step": 1799
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.86324753247062,
+      "learning_rate": 7.590208615607001e-07,
+      "loss": 0.3613,
+      "step": 1800
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8704051001710107,
+      "learning_rate": 7.566757635850608e-07,
+      "loss": 0.3756,
+      "step": 1801
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8547689419526656,
+      "learning_rate": 7.543336477756336e-07,
+      "loss": 0.3557,
+      "step": 1802
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8970591656145008,
+      "learning_rate": 7.519945181388893e-07,
+      "loss": 0.3713,
+      "step": 1803
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 2.034710049647413,
+      "learning_rate": 7.496583786761911e-07,
+      "loss": 0.379,
+      "step": 1804
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.7207339510591724,
+      "learning_rate": 7.47325233383788e-07,
+      "loss": 0.324,
+      "step": 1805
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8353430031672993,
+      "learning_rate": 7.449950862528046e-07,
+      "loss": 0.3688,
+      "step": 1806
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8248952138910253,
+      "learning_rate": 7.426679412692403e-07,
+      "loss": 0.3744,
+      "step": 1807
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8581710166024752,
+      "learning_rate": 7.403438024139547e-07,
+      "loss": 0.3591,
+      "step": 1808
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.972956887111899,
+      "learning_rate": 7.380226736626692e-07,
+      "loss": 0.3786,
+      "step": 1809
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.875119757327761,
+      "learning_rate": 7.357045589859535e-07,
+      "loss": 0.3924,
+      "step": 1810
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.829358260084735,
+      "learning_rate": 7.333894623492222e-07,
+      "loss": 0.3489,
+      "step": 1811
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8778690881245192,
+      "learning_rate": 7.310773877127275e-07,
+      "loss": 0.3906,
+      "step": 1812
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8807953543932978,
+      "learning_rate": 7.287683390315514e-07,
+      "loss": 0.3388,
+      "step": 1813
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8813619639740409,
+      "learning_rate": 7.264623202556001e-07,
+      "loss": 0.3678,
+      "step": 1814
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8588744507841983,
+      "learning_rate": 7.241593353295967e-07,
+      "loss": 0.3628,
+      "step": 1815
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.854298814001063,
+      "learning_rate": 7.218593881930744e-07,
+      "loss": 0.3719,
+      "step": 1816
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.9792414995864196,
+      "learning_rate": 7.195624827803704e-07,
+      "loss": 0.3954,
+      "step": 1817
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.9020874510906967,
+      "learning_rate": 7.172686230206174e-07,
+      "loss": 0.3501,
+      "step": 1818
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8501064231209523,
+      "learning_rate": 7.1497781283774e-07,
+      "loss": 0.3733,
+      "step": 1819
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8611190389122063,
+      "learning_rate": 7.126900561504435e-07,
+      "loss": 0.3883,
+      "step": 1820
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8961852046693481,
+      "learning_rate": 7.104053568722128e-07,
+      "loss": 0.3524,
+      "step": 1821
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.8752890672773106,
+      "learning_rate": 7.081237189113005e-07,
+      "loss": 0.3524,
+      "step": 1822
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.9058307416670415,
+      "learning_rate": 7.058451461707239e-07,
+      "loss": 0.3653,
+      "step": 1823
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.9173580029522834,
+      "learning_rate": 7.035696425482563e-07,
+      "loss": 0.4105,
+      "step": 1824
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.1303031903916674,
+      "learning_rate": 7.012972119364206e-07,
+      "loss": 0.354,
+      "step": 1825
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.9010891424886316,
+      "learning_rate": 6.990278582224835e-07,
+      "loss": 0.3662,
+      "step": 1826
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.877460865866335,
+      "learning_rate": 6.967615852884485e-07,
+      "loss": 0.3898,
+      "step": 1827
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8440076993566192,
+      "learning_rate": 6.944983970110475e-07,
+      "loss": 0.3582,
+      "step": 1828
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8515961219329589,
+      "learning_rate": 6.922382972617372e-07,
+      "loss": 0.3653,
+      "step": 1829
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.830548627340717,
+      "learning_rate": 6.899812899066907e-07,
+      "loss": 0.3387,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8380839219635814,
+      "learning_rate": 6.877273788067918e-07,
+      "loss": 0.352,
+      "step": 1831
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.013553404360448,
+      "learning_rate": 6.854765678176256e-07,
+      "loss": 0.3702,
+      "step": 1832
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8665688769908853,
+      "learning_rate": 6.832288607894766e-07,
+      "loss": 0.3733,
+      "step": 1833
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8644441622970171,
+      "learning_rate": 6.809842615673179e-07,
+      "loss": 0.3799,
+      "step": 1834
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8383420889549984,
+      "learning_rate": 6.787427739908079e-07,
+      "loss": 0.3667,
+      "step": 1835
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8631784953833668,
+      "learning_rate": 6.765044018942804e-07,
+      "loss": 0.3297,
+      "step": 1836
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.9010037089689114,
+      "learning_rate": 6.742691491067419e-07,
+      "loss": 0.3692,
+      "step": 1837
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.8921348077299611,
+      "learning_rate": 6.720370194518599e-07,
+      "loss": 0.3456,
+      "step": 1838
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.8727807839161668,
+      "learning_rate": 6.698080167479621e-07,
+      "loss": 0.3385,
+      "step": 1839
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.8815655707023475,
+      "learning_rate": 6.675821448080261e-07,
+      "loss": 0.3439,
+      "step": 1840
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.9290903087621594,
+      "learning_rate": 6.653594074396744e-07,
+      "loss": 0.3283,
+      "step": 1841
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.839315528952413,
+      "learning_rate": 6.631398084451671e-07,
+      "loss": 0.383,
+      "step": 1842
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.888172853861002,
+      "learning_rate": 6.609233516213955e-07,
+      "loss": 0.3326,
+      "step": 1843
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.9530270141902042,
+      "learning_rate": 6.58710040759877e-07,
+      "loss": 0.3629,
+      "step": 1844
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.8350634300423698,
+      "learning_rate": 6.564998796467453e-07,
+      "loss": 0.3378,
+      "step": 1845
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.8651246191841713,
+      "learning_rate": 6.542928720627478e-07,
+      "loss": 0.327,
+      "step": 1846
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.9096045764816365,
+      "learning_rate": 6.520890217832373e-07,
+      "loss": 0.3255,
+      "step": 1847
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.919058821000027,
+      "learning_rate": 6.498883325781658e-07,
+      "loss": 0.3722,
+      "step": 1848
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.7941064379223837,
+      "learning_rate": 6.476908082120758e-07,
+      "loss": 0.4079,
+      "step": 1849
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.9058204893972528,
+      "learning_rate": 6.454964524440988e-07,
+      "loss": 0.3447,
+      "step": 1850
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.7622865373821626,
+      "learning_rate": 6.433052690279443e-07,
+      "loss": 0.328,
+      "step": 1851
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 2.496163673119719,
+      "learning_rate": 6.411172617118958e-07,
+      "loss": 0.3457,
+      "step": 1852
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.952829151233476,
+      "learning_rate": 6.389324342388034e-07,
+      "loss": 0.3757,
+      "step": 1853
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.8737496346585327,
+      "learning_rate": 6.367507903460782e-07,
+      "loss": 0.3499,
+      "step": 1854
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.789743920699054,
+      "learning_rate": 6.345723337656845e-07,
+      "loss": 0.3686,
+      "step": 1855
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.814108105986017,
+      "learning_rate": 6.32397068224136e-07,
+      "loss": 0.3456,
+      "step": 1856
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.8844464079872227,
+      "learning_rate": 6.302249974424848e-07,
+      "loss": 0.3408,
+      "step": 1857
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.8123689124000075,
+      "learning_rate": 6.280561251363212e-07,
+      "loss": 0.3415,
+      "step": 1858
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.8581667911676272,
+      "learning_rate": 6.258904550157616e-07,
+      "loss": 0.3598,
+      "step": 1859
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.8951304560085334,
+      "learning_rate": 6.23727990785446e-07,
+      "loss": 0.388,
+      "step": 1860
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.9149385859826418,
+      "learning_rate": 6.215687361445305e-07,
+      "loss": 0.3218,
+      "step": 1861
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.9124781784882163,
+      "learning_rate": 6.194126947866799e-07,
+      "loss": 0.3754,
+      "step": 1862
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.834455896221774,
+      "learning_rate": 6.172598704000632e-07,
+      "loss": 0.3437,
+      "step": 1863
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.8282538931136811,
+      "learning_rate": 6.151102666673461e-07,
+      "loss": 0.4017,
+      "step": 1864
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.9403955088374527,
+      "learning_rate": 6.129638872656842e-07,
+      "loss": 0.3945,
+      "step": 1865
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.9233473407163304,
+      "learning_rate": 6.108207358667189e-07,
+      "loss": 0.3461,
+      "step": 1866
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.8541305863089477,
+      "learning_rate": 6.086808161365685e-07,
+      "loss": 0.3832,
+      "step": 1867
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.9957679209687724,
+      "learning_rate": 6.065441317358245e-07,
+      "loss": 0.3689,
+      "step": 1868
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.8808330149547094,
+      "learning_rate": 6.044106863195415e-07,
+      "loss": 0.3695,
+      "step": 1869
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.8104036881057104,
+      "learning_rate": 6.022804835372364e-07,
+      "loss": 0.3483,
+      "step": 1870
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.862734707805995,
+      "learning_rate": 6.001535270328768e-07,
+      "loss": 0.3631,
+      "step": 1871
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.7886949265475778,
+      "learning_rate": 5.980298204448781e-07,
+      "loss": 0.3427,
+      "step": 1872
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 2.0013909122499585,
+      "learning_rate": 5.959093674060973e-07,
+      "loss": 0.3227,
+      "step": 1873
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8454555132680566,
+      "learning_rate": 5.937921715438242e-07,
+      "loss": 0.3712,
+      "step": 1874
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.7955230940580693,
+      "learning_rate": 5.916782364797774e-07,
+      "loss": 0.358,
+      "step": 1875
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.981678881628013,
+      "learning_rate": 5.895675658300981e-07,
+      "loss": 0.3436,
+      "step": 1876
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8838900630160964,
+      "learning_rate": 5.874601632053426e-07,
+      "loss": 0.3672,
+      "step": 1877
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.9199361104982893,
+      "learning_rate": 5.853560322104778e-07,
+      "loss": 0.369,
+      "step": 1878
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8961860490578715,
+      "learning_rate": 5.832551764448719e-07,
+      "loss": 0.3376,
+      "step": 1879
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.9335243111005527,
+      "learning_rate": 5.811575995022925e-07,
+      "loss": 0.3544,
+      "step": 1880
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.9130052171381386,
+      "learning_rate": 5.790633049708979e-07,
+      "loss": 0.3491,
+      "step": 1881
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8114767475508076,
+      "learning_rate": 5.76972296433232e-07,
+      "loss": 0.349,
+      "step": 1882
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.869088387575796,
+      "learning_rate": 5.748845774662154e-07,
+      "loss": 0.339,
+      "step": 1883
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8823416706934062,
+      "learning_rate": 5.728001516411441e-07,
+      "loss": 0.3344,
+      "step": 1884
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 2.028993532989763,
+      "learning_rate": 5.707190225236791e-07,
+      "loss": 0.3315,
+      "step": 1885
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8841725898032713,
+      "learning_rate": 5.686411936738428e-07,
+      "loss": 0.3391,
+      "step": 1886
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8925869708717353,
+      "learning_rate": 5.665666686460119e-07,
+      "loss": 0.3856,
+      "step": 1887
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.7967049620852429,
+      "learning_rate": 5.644954509889125e-07,
+      "loss": 0.327,
+      "step": 1888
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8430042776590718,
+      "learning_rate": 5.624275442456101e-07,
+      "loss": 0.3796,
+      "step": 1889
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.9114914845740119,
+      "learning_rate": 5.603629519535092e-07,
+      "loss": 0.3232,
+      "step": 1890
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.926168461469419,
+      "learning_rate": 5.583016776443443e-07,
+      "loss": 0.3707,
+      "step": 1891
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8385673934837172,
+      "learning_rate": 5.562437248441727e-07,
+      "loss": 0.3178,
+      "step": 1892
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.822703583469435,
+      "learning_rate": 5.54189097073371e-07,
+      "loss": 0.3435,
+      "step": 1893
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.9179700429647883,
+      "learning_rate": 5.52137797846628e-07,
+      "loss": 0.3822,
+      "step": 1894
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8022936248540935,
+      "learning_rate": 5.500898306729385e-07,
+      "loss": 0.3326,
+      "step": 1895
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.8093411058422164,
+      "learning_rate": 5.48045199055596e-07,
+      "loss": 0.369,
+      "step": 1896
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.9644587652808347,
+      "learning_rate": 5.460039064921901e-07,
+      "loss": 0.3642,
+      "step": 1897
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.894471765138494,
+      "learning_rate": 5.439659564745975e-07,
+      "loss": 0.3433,
+      "step": 1898
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.813778968949277,
+      "learning_rate": 5.41931352488978e-07,
+      "loss": 0.3451,
+      "step": 1899
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.912342915517377,
+      "learning_rate": 5.399000980157657e-07,
+      "loss": 0.3531,
+      "step": 1900
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.9171703337147434,
+      "learning_rate": 5.378721965296665e-07,
+      "loss": 0.3319,
+      "step": 1901
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 2.130973697729064,
+      "learning_rate": 5.35847651499651e-07,
+      "loss": 0.3455,
+      "step": 1902
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.8703910349961417,
+      "learning_rate": 5.33826466388947e-07,
+      "loss": 0.3749,
+      "step": 1903
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.8325169995594843,
+      "learning_rate": 5.318086446550352e-07,
+      "loss": 0.3663,
+      "step": 1904
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.9036889163080901,
+      "learning_rate": 5.297941897496428e-07,
+      "loss": 0.3592,
+      "step": 1905
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 2.0123341027513892,
+      "learning_rate": 5.277831051187382e-07,
+      "loss": 0.4212,
+      "step": 1906
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.862718607721721,
+      "learning_rate": 5.257753942025243e-07,
+      "loss": 0.3739,
+      "step": 1907
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.8541370296695225,
+      "learning_rate": 5.237710604354313e-07,
+      "loss": 0.4162,
+      "step": 1908
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.8209725179155203,
+      "learning_rate": 5.217701072461149e-07,
+      "loss": 0.3288,
+      "step": 1909
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.9438902543671965,
+      "learning_rate": 5.197725380574456e-07,
+      "loss": 0.3753,
+      "step": 1910
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8631654581464705,
+      "learning_rate": 5.177783562865066e-07,
+      "loss": 0.3437,
+      "step": 1911
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8396840995640549,
+      "learning_rate": 5.157875653445866e-07,
+      "loss": 0.3669,
+      "step": 1912
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.7855189269312173,
+      "learning_rate": 5.138001686371729e-07,
+      "loss": 0.3593,
+      "step": 1913
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8497984943634604,
+      "learning_rate": 5.118161695639479e-07,
+      "loss": 0.3295,
+      "step": 1914
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8450016226883872,
+      "learning_rate": 5.0983557151878e-07,
+      "loss": 0.3514,
+      "step": 1915
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8115984473579492,
+      "learning_rate": 5.078583778897216e-07,
+      "loss": 0.3599,
+      "step": 1916
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8768743283745053,
+      "learning_rate": 5.058845920590008e-07,
+      "loss": 0.336,
+      "step": 1917
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.898132261271154,
+      "learning_rate": 5.039142174030159e-07,
+      "loss": 0.3895,
+      "step": 1918
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8621712512957447,
+      "learning_rate": 5.019472572923307e-07,
+      "loss": 0.3718,
+      "step": 1919
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.9340085582222217,
+      "learning_rate": 4.999837150916664e-07,
+      "loss": 0.3633,
+      "step": 1920
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.8812999232707277,
+      "learning_rate": 4.980235941598999e-07,
+      "loss": 0.3666,
+      "step": 1921
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.783816493860109,
+      "learning_rate": 4.960668978500529e-07,
+      "loss": 0.3487,
+      "step": 1922
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8845458416987717,
+      "learning_rate": 4.94113629509291e-07,
+      "loss": 0.379,
+      "step": 1923
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 2.039608954599768,
+      "learning_rate": 4.921637924789153e-07,
+      "loss": 0.3652,
+      "step": 1924
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8867923218874316,
+      "learning_rate": 4.902173900943564e-07,
+      "loss": 0.3561,
+      "step": 1925
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8133572570953833,
+      "learning_rate": 4.882744256851707e-07,
+      "loss": 0.3718,
+      "step": 1926
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.9247010567225666,
+      "learning_rate": 4.86334902575033e-07,
+      "loss": 0.3692,
+      "step": 1927
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.879349133553964,
+      "learning_rate": 4.84398824081731e-07,
+      "loss": 0.3639,
+      "step": 1928
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8500016574014475,
+      "learning_rate": 4.824661935171613e-07,
+      "loss": 0.3351,
+      "step": 1929
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8591348368444711,
+      "learning_rate": 4.805370141873198e-07,
+      "loss": 0.3901,
+      "step": 1930
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8661345765477586,
+      "learning_rate": 4.786112893923011e-07,
+      "loss": 0.3875,
+      "step": 1931
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.8849185643314905,
+      "learning_rate": 4.766890224262896e-07,
+      "loss": 0.382,
+      "step": 1932
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.913717676777297,
+      "learning_rate": 4.747702165775542e-07,
+      "loss": 0.3636,
+      "step": 1933
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8488810215590799,
+      "learning_rate": 4.728548751284448e-07,
+      "loss": 0.3725,
+      "step": 1934
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8776942493725268,
+      "learning_rate": 4.7094300135538203e-07,
+      "loss": 0.345,
+      "step": 1935
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.947623892429973,
+      "learning_rate": 4.690345985288572e-07,
+      "loss": 0.3454,
+      "step": 1936
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8541251767713411,
+      "learning_rate": 4.671296699134234e-07,
+      "loss": 0.3698,
+      "step": 1937
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.9190013210275396,
+      "learning_rate": 4.652282187676907e-07,
+      "loss": 0.3586,
+      "step": 1938
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.975817777540498,
+      "learning_rate": 4.6333024834432086e-07,
+      "loss": 0.3656,
+      "step": 1939
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.894711831278788,
+      "learning_rate": 4.6143576189001977e-07,
+      "loss": 0.3404,
+      "step": 1940
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8817727906924275,
+      "learning_rate": 4.595447626455354e-07,
+      "loss": 0.3438,
+      "step": 1941
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.926259453987348,
+      "learning_rate": 4.576572538456503e-07,
+      "loss": 0.4277,
+      "step": 1942
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8789373169400134,
+      "learning_rate": 4.557732387191752e-07,
+      "loss": 0.356,
+      "step": 1943
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8546547967236782,
+      "learning_rate": 4.5389272048894566e-07,
+      "loss": 0.3646,
+      "step": 1944
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.7952836118848592,
+      "learning_rate": 4.5201570237181413e-07,
+      "loss": 0.3527,
+      "step": 1945
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8451716234273645,
+      "learning_rate": 4.5014218757864714e-07,
+      "loss": 0.3581,
+      "step": 1946
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 2.057595831083482,
+      "learning_rate": 4.482721793143166e-07,
+      "loss": 0.3895,
+      "step": 1947
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.807840858302715,
+      "learning_rate": 4.464056807776973e-07,
+      "loss": 0.3269,
+      "step": 1948
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 2.056535275565557,
+      "learning_rate": 4.445426951616605e-07,
+      "loss": 0.4116,
+      "step": 1949
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8423208982758352,
+      "learning_rate": 4.4268322565306663e-07,
+      "loss": 0.3315,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8618265222451655,
+      "learning_rate": 4.4082727543276303e-07,
+      "loss": 0.3594,
+      "step": 1951
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.798959848387909,
+      "learning_rate": 4.3897484767557593e-07,
+      "loss": 0.4065,
+      "step": 1952
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8468408859615186,
+      "learning_rate": 4.3712594555030656e-07,
+      "loss": 0.3339,
+      "step": 1953
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8960514245664648,
+      "learning_rate": 4.352805722197248e-07,
+      "loss": 0.3619,
+      "step": 1954
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8201726610868778,
+      "learning_rate": 4.334387308405641e-07,
+      "loss": 0.3652,
+      "step": 1955
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8960787128219156,
+      "learning_rate": 4.316004245635158e-07,
+      "loss": 0.3482,
+      "step": 1956
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.8962249531145783,
+      "learning_rate": 4.297656565332248e-07,
+      "loss": 0.3542,
+      "step": 1957
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.9355222361840436,
+      "learning_rate": 4.279344298882834e-07,
+      "loss": 0.3531,
+      "step": 1958
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.8619155316505231,
+      "learning_rate": 4.2610674776122406e-07,
+      "loss": 0.3659,
+      "step": 1959
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.9040620290695986,
+      "learning_rate": 4.242826132785188e-07,
+      "loss": 0.349,
+      "step": 1960
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.8614871067348076,
+      "learning_rate": 4.224620295605683e-07,
+      "loss": 0.3851,
+      "step": 1961
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.92540215370278,
+      "learning_rate": 4.2064499972170073e-07,
+      "loss": 0.379,
+      "step": 1962
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.7674536324643837,
+      "learning_rate": 4.188315268701651e-07,
+      "loss": 0.3226,
+      "step": 1963
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 2.0097846655442146,
+      "learning_rate": 4.170216141081246e-07,
+      "loss": 0.3924,
+      "step": 1964
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.9454008124232989,
+      "learning_rate": 4.1521526453165374e-07,
+      "loss": 0.3599,
+      "step": 1965
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.8974492665299996,
+      "learning_rate": 4.134124812307311e-07,
+      "loss": 0.3606,
+      "step": 1966
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.9289392453232093,
+      "learning_rate": 4.116132672892345e-07,
+      "loss": 0.3733,
+      "step": 1967
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.9100167242810886,
+      "learning_rate": 4.098176257849365e-07,
+      "loss": 0.3502,
+      "step": 1968
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.8550217698156015,
+      "learning_rate": 4.0802555978949804e-07,
+      "loss": 0.3433,
+      "step": 1969
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.8671430861470548,
+      "learning_rate": 4.06237072368465e-07,
+      "loss": 0.3739,
+      "step": 1970
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.8622163328469568,
+      "learning_rate": 4.0445216658125896e-07,
+      "loss": 0.3565,
+      "step": 1971
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.957221135499962,
+      "learning_rate": 4.0267084548117786e-07,
+      "loss": 0.4121,
+      "step": 1972
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.9308316014476776,
+      "learning_rate": 4.0089311211538473e-07,
+      "loss": 0.3375,
+      "step": 1973
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 2.0943893420426685,
+      "learning_rate": 3.9911896952490786e-07,
+      "loss": 0.3447,
+      "step": 1974
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.8555953393062348,
+      "learning_rate": 3.9734842074463125e-07,
+      "loss": 0.3423,
+      "step": 1975
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.8787468374607186,
+      "learning_rate": 3.9558146880329246e-07,
+      "loss": 0.3629,
+      "step": 1976
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.9054645054466235,
+      "learning_rate": 3.9381811672347584e-07,
+      "loss": 0.347,
+      "step": 1977
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.843062330976259,
+      "learning_rate": 3.920583675216072e-07,
+      "loss": 0.377,
+      "step": 1978
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.867154897712441,
+      "learning_rate": 3.903022242079499e-07,
+      "loss": 0.3896,
+      "step": 1979
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.9141706940850274,
+      "learning_rate": 3.885496897865992e-07,
+      "loss": 0.3807,
+      "step": 1980
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.7693227259004407,
+      "learning_rate": 3.868007672554755e-07,
+      "loss": 0.3074,
+      "step": 1981
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.9095853727200514,
+      "learning_rate": 3.850554596063219e-07,
+      "loss": 0.3716,
+      "step": 1982
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.9430202747777054,
+      "learning_rate": 3.833137698246975e-07,
+      "loss": 0.3624,
+      "step": 1983
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.9653761109332166,
+      "learning_rate": 3.8157570088997257e-07,
+      "loss": 0.3526,
+      "step": 1984
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.7667551472503558,
+      "learning_rate": 3.798412557753245e-07,
+      "loss": 0.3568,
+      "step": 1985
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.8604050001183414,
+      "learning_rate": 3.78110437447729e-07,
+      "loss": 0.3477,
+      "step": 1986
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.8184815785613908,
+      "learning_rate": 3.7638324886796e-07,
+      "loss": 0.3181,
+      "step": 1987
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.8521943071413647,
+      "learning_rate": 3.7465969299058215e-07,
+      "loss": 0.317,
+      "step": 1988
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.8922496050409547,
+      "learning_rate": 3.729397727639453e-07,
+      "loss": 0.3685,
+      "step": 1989
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.8919544150457748,
+      "learning_rate": 3.712234911301807e-07,
+      "loss": 0.3811,
+      "step": 1990
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.7571917950373177,
+      "learning_rate": 3.6951085102519377e-07,
+      "loss": 0.3069,
+      "step": 1991
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.835944343826793,
+      "learning_rate": 3.6780185537866275e-07,
+      "loss": 0.3559,
+      "step": 1992
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.924915839626172,
+      "learning_rate": 3.6609650711403044e-07,
+      "loss": 0.3736,
+      "step": 1993
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.7729901621180486,
+      "learning_rate": 3.6439480914850057e-07,
+      "loss": 0.3202,
+      "step": 1994
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.7439650920581296,
+      "learning_rate": 3.6269676439303234e-07,
+      "loss": 0.3138,
+      "step": 1995
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.888200639953336,
+      "learning_rate": 3.6100237575233647e-07,
+      "loss": 0.3137,
+      "step": 1996
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.9215140175987564,
+      "learning_rate": 3.593116461248691e-07,
+      "loss": 0.3618,
+      "step": 1997
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.8621195398901151,
+      "learning_rate": 3.576245784028262e-07,
+      "loss": 0.3381,
+      "step": 1998
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.813427333802716,
+      "learning_rate": 3.5594117547214064e-07,
+      "loss": 0.3159,
+      "step": 1999
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.8109572860251124,
+      "learning_rate": 3.542614402124769e-07,
+      "loss": 0.3431,
+      "step": 2000
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.8438243884785575,
+      "learning_rate": 3.5258537549722334e-07,
+      "loss": 0.3404,
+      "step": 2001
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.8637548681582927,
+      "learning_rate": 3.5091298419349137e-07,
+      "loss": 0.3558,
+      "step": 2002
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.9054188507583447,
+      "learning_rate": 3.492442691621073e-07,
+      "loss": 0.3514,
+      "step": 2003
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.8508749827944715,
+      "learning_rate": 3.4757923325761e-07,
+      "loss": 0.3919,
+      "step": 2004
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.9381031698615256,
+      "learning_rate": 3.459178793282439e-07,
+      "loss": 0.3445,
+      "step": 2005
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8965478043748623,
+      "learning_rate": 3.442602102159548e-07,
+      "loss": 0.3686,
+      "step": 2006
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8508387871408407,
+      "learning_rate": 3.4260622875638554e-07,
+      "loss": 0.3579,
+      "step": 2007
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8672067538829913,
+      "learning_rate": 3.4095593777887097e-07,
+      "loss": 0.3311,
+      "step": 2008
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.889545394249109,
+      "learning_rate": 3.393093401064335e-07,
+      "loss": 0.3903,
+      "step": 2009
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.9058059259656173,
+      "learning_rate": 3.3766643855577514e-07,
+      "loss": 0.3382,
+      "step": 2010
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.827046925655219,
+      "learning_rate": 3.360272359372785e-07,
+      "loss": 0.4026,
+      "step": 2011
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8937619782731472,
+      "learning_rate": 3.3439173505499606e-07,
+      "loss": 0.3908,
+      "step": 2012
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8634842581354367,
+      "learning_rate": 3.327599387066499e-07,
+      "loss": 0.3317,
+      "step": 2013
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8390150223835013,
+      "learning_rate": 3.3113184968362384e-07,
+      "loss": 0.3556,
+      "step": 2014
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8764780958956686,
+      "learning_rate": 3.2950747077096084e-07,
+      "loss": 0.3517,
+      "step": 2015
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8020390033097442,
+      "learning_rate": 3.2788680474735687e-07,
+      "loss": 0.3417,
+      "step": 2016
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.9477716924859427,
+      "learning_rate": 3.262698543851561e-07,
+      "loss": 0.3596,
+      "step": 2017
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.894137770283384,
+      "learning_rate": 3.2465662245034696e-07,
+      "loss": 0.4035,
+      "step": 2018
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.783142135883036,
+      "learning_rate": 3.230471117025577e-07,
+      "loss": 0.3529,
+      "step": 2019
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.803387656385472,
+      "learning_rate": 3.214413248950496e-07,
+      "loss": 0.3509,
+      "step": 2020
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8805063690885642,
+      "learning_rate": 3.198392647747159e-07,
+      "loss": 0.3749,
+      "step": 2021
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 2.0107348664022147,
+      "learning_rate": 3.182409340820719e-07,
+      "loss": 0.3485,
+      "step": 2022
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8288895736395419,
+      "learning_rate": 3.1664633555125615e-07,
+      "loss": 0.3757,
+      "step": 2023
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8466692854518314,
+      "learning_rate": 3.1505547191002017e-07,
+      "loss": 0.3465,
+      "step": 2024
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8190080826897819,
+      "learning_rate": 3.1346834587972915e-07,
+      "loss": 0.374,
+      "step": 2025
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.741058949057854,
+      "learning_rate": 3.118849601753529e-07,
+      "loss": 0.3628,
+      "step": 2026
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.9154631726200175,
+      "learning_rate": 3.1030531750546377e-07,
+      "loss": 0.3823,
+      "step": 2027
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8222826028897707,
+      "learning_rate": 3.0872942057223105e-07,
+      "loss": 0.3857,
+      "step": 2028
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8691052236878156,
+      "learning_rate": 3.071572720714161e-07,
+      "loss": 0.3687,
+      "step": 2029
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8329290245063994,
+      "learning_rate": 3.0558887469236824e-07,
+      "loss": 0.3459,
+      "step": 2030
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 2.105450216487807,
+      "learning_rate": 3.040242311180211e-07,
+      "loss": 0.3497,
+      "step": 2031
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8682164915371384,
+      "learning_rate": 3.02463344024885e-07,
+      "loss": 0.3631,
+      "step": 2032
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.870413375372597,
+      "learning_rate": 3.0090621608304586e-07,
+      "loss": 0.337,
+      "step": 2033
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.831129712998651,
+      "learning_rate": 2.9935284995615874e-07,
+      "loss": 0.3223,
+      "step": 2034
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.9934638606782258,
+      "learning_rate": 2.978032483014434e-07,
+      "loss": 0.3776,
+      "step": 2035
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.9217667136109262,
+      "learning_rate": 2.9625741376968107e-07,
+      "loss": 0.3615,
+      "step": 2036
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8233621789333103,
+      "learning_rate": 2.947153490052068e-07,
+      "loss": 0.3745,
+      "step": 2037
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8933236552388286,
+      "learning_rate": 2.9317705664590857e-07,
+      "loss": 0.3521,
+      "step": 2038
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8877528361725595,
+      "learning_rate": 2.9164253932322114e-07,
+      "loss": 0.3753,
+      "step": 2039
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8701101519376129,
+      "learning_rate": 2.901117996621214e-07,
+      "loss": 0.3385,
+      "step": 2040
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 2.022104989564633,
+      "learning_rate": 2.885848402811242e-07,
+      "loss": 0.3604,
+      "step": 2041
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.981072876489218,
+      "learning_rate": 2.8706166379227685e-07,
+      "loss": 0.3844,
+      "step": 2042
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.8524158018731827,
+      "learning_rate": 2.8554227280115673e-07,
+      "loss": 0.35,
+      "step": 2043
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.938845970147165,
+      "learning_rate": 2.8402666990686526e-07,
+      "loss": 0.3396,
+      "step": 2044
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.830032191851363,
+      "learning_rate": 2.825148577020237e-07,
+      "loss": 0.3376,
+      "step": 2045
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.815946726661388,
+      "learning_rate": 2.8100683877276935e-07,
+      "loss": 0.3186,
+      "step": 2046
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.9148486946074896,
+      "learning_rate": 2.7950261569874987e-07,
+      "loss": 0.3818,
+      "step": 2047
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 2.0362020767823688,
+      "learning_rate": 2.7800219105312107e-07,
+      "loss": 0.3692,
+      "step": 2048
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.9516570398785693,
+      "learning_rate": 2.765055674025388e-07,
+      "loss": 0.3761,
+      "step": 2049
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.878773060905854,
+      "learning_rate": 2.75012747307159e-07,
+      "loss": 0.3683,
+      "step": 2050
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.7913050498897873,
+      "learning_rate": 2.735237333206306e-07,
+      "loss": 0.3561,
+      "step": 2051
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.9868033431365109,
+      "learning_rate": 2.720385279900908e-07,
+      "loss": 0.4107,
+      "step": 2052
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.8593813692288108,
+      "learning_rate": 2.7055713385616246e-07,
+      "loss": 0.3705,
+      "step": 2053
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.8698518318714625,
+      "learning_rate": 2.6907955345294864e-07,
+      "loss": 0.3566,
+      "step": 2054
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9690930287334685,
+      "learning_rate": 2.6760578930802917e-07,
+      "loss": 0.3855,
+      "step": 2055
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.8559735887870026,
+      "learning_rate": 2.661358439424552e-07,
+      "loss": 0.3442,
+      "step": 2056
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.7913911949444496,
+      "learning_rate": 2.6466971987074514e-07,
+      "loss": 0.3424,
+      "step": 2057
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.8619135470758108,
+      "learning_rate": 2.6320741960088104e-07,
+      "loss": 0.3666,
+      "step": 2058
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9162607121997142,
+      "learning_rate": 2.6174894563430365e-07,
+      "loss": 0.3398,
+      "step": 2059
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.8850582422937106,
+      "learning_rate": 2.602943004659092e-07,
+      "loss": 0.3267,
+      "step": 2060
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.8451237530567572,
+      "learning_rate": 2.588434865840425e-07,
+      "loss": 0.3535,
+      "step": 2061
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9560351999911851,
+      "learning_rate": 2.573965064704964e-07,
+      "loss": 0.3927,
+      "step": 2062
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9375022045866468,
+      "learning_rate": 2.5595336260050367e-07,
+      "loss": 0.345,
+      "step": 2063
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9389910057698705,
+      "learning_rate": 2.5451405744273684e-07,
+      "loss": 0.3632,
+      "step": 2064
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.9464447650421863,
+      "learning_rate": 2.5307859345930025e-07,
+      "loss": 0.3732,
+      "step": 2065
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.901436015964627,
+      "learning_rate": 2.516469731057286e-07,
+      "loss": 0.3624,
+      "step": 2066
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.861052476130716,
+      "learning_rate": 2.5021919883098043e-07,
+      "loss": 0.3465,
+      "step": 2067
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8061147604719698,
+      "learning_rate": 2.4879527307743624e-07,
+      "loss": 0.3271,
+      "step": 2068
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8600512074627422,
+      "learning_rate": 2.473751982808925e-07,
+      "loss": 0.3524,
+      "step": 2069
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.9116929096391897,
+      "learning_rate": 2.459589768705581e-07,
+      "loss": 0.4268,
+      "step": 2070
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.9375533647948024,
+      "learning_rate": 2.445466112690506e-07,
+      "loss": 0.4426,
+      "step": 2071
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8187896872609466,
+      "learning_rate": 2.431381038923922e-07,
+      "loss": 0.4264,
+      "step": 2072
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8722345082962843,
+      "learning_rate": 2.4173345715000326e-07,
+      "loss": 0.3428,
+      "step": 2073
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.807831006800025,
+      "learning_rate": 2.4033267344470256e-07,
+      "loss": 0.3593,
+      "step": 2074
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8381163414971127,
+      "learning_rate": 2.389357551726981e-07,
+      "loss": 0.336,
+      "step": 2075
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8053442770688852,
+      "learning_rate": 2.3754270472358786e-07,
+      "loss": 0.36,
+      "step": 2076
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8102492703550743,
+      "learning_rate": 2.3615352448035228e-07,
+      "loss": 0.3373,
+      "step": 2077
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8640600559614149,
+      "learning_rate": 2.3476821681935185e-07,
+      "loss": 0.3713,
+      "step": 2078
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8259178467876043,
+      "learning_rate": 2.3338678411032184e-07,
+      "loss": 0.3812,
+      "step": 2079
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8572135345525957,
+      "learning_rate": 2.3200922871636973e-07,
+      "loss": 0.3564,
+      "step": 2080
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8808861682893652,
+      "learning_rate": 2.3063555299396994e-07,
+      "loss": 0.344,
+      "step": 2081
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 2.082790897941687,
+      "learning_rate": 2.292657592929609e-07,
+      "loss": 0.4149,
+      "step": 2082
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8154127177582888,
+      "learning_rate": 2.278998499565388e-07,
+      "loss": 0.3309,
+      "step": 2083
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.9281000674444286,
+      "learning_rate": 2.265378273212565e-07,
+      "loss": 0.35,
+      "step": 2084
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.787806749105191,
+      "learning_rate": 2.2517969371701808e-07,
+      "loss": 0.3332,
+      "step": 2085
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8495975031177656,
+      "learning_rate": 2.2382545146707485e-07,
+      "loss": 0.3859,
+      "step": 2086
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 2.0959876699169335,
+      "learning_rate": 2.224751028880215e-07,
+      "loss": 0.3441,
+      "step": 2087
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8576485516029098,
+      "learning_rate": 2.2112865028979135e-07,
+      "loss": 0.4039,
+      "step": 2088
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.7715289202721822,
+      "learning_rate": 2.1978609597565425e-07,
+      "loss": 0.3379,
+      "step": 2089
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.9403112506850888,
+      "learning_rate": 2.1844744224221115e-07,
+      "loss": 0.3668,
+      "step": 2090
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.8854691153731733,
+      "learning_rate": 2.1711269137939083e-07,
+      "loss": 0.3612,
+      "step": 2091
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.9371643123801792,
+      "learning_rate": 2.1578184567044552e-07,
+      "loss": 0.4192,
+      "step": 2092
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.7867188340579265,
+      "learning_rate": 2.1445490739194663e-07,
+      "loss": 0.3679,
+      "step": 2093
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.7966033645477641,
+      "learning_rate": 2.1313187881378205e-07,
+      "loss": 0.3849,
+      "step": 2094
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.7756177099499322,
+      "learning_rate": 2.1181276219915224e-07,
+      "loss": 0.3277,
+      "step": 2095
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.8346072475530897,
+      "learning_rate": 2.104975598045647e-07,
+      "loss": 0.3565,
+      "step": 2096
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 2.011276790677077,
+      "learning_rate": 2.091862738798317e-07,
+      "loss": 0.3498,
+      "step": 2097
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.8618849009320824,
+      "learning_rate": 2.0787890666806588e-07,
+      "loss": 0.3457,
+      "step": 2098
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.9013266132795015,
+      "learning_rate": 2.0657546040567688e-07,
+      "loss": 0.4026,
+      "step": 2099
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.8662915733623995,
+      "learning_rate": 2.0527593732236563e-07,
+      "loss": 0.3239,
+      "step": 2100
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.862742931005612,
+      "learning_rate": 2.0398033964112386e-07,
+      "loss": 0.3534,
+      "step": 2101
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.9697615787915368,
+      "learning_rate": 2.0268866957822737e-07,
+      "loss": 0.4434,
+      "step": 2102
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.928940882803443,
+      "learning_rate": 2.0140092934323286e-07,
+      "loss": 0.378,
+      "step": 2103
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 2.316878206169073,
+      "learning_rate": 2.00117121138976e-07,
+      "loss": 0.3245,
+      "step": 2104
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.8290409402278658,
+      "learning_rate": 1.9883724716156488e-07,
+      "loss": 0.3167,
+      "step": 2105
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.8470715714524868,
+      "learning_rate": 1.975613096003784e-07,
+      "loss": 0.3692,
+      "step": 2106
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.7589018750118917,
+      "learning_rate": 1.9628931063806145e-07,
+      "loss": 0.3428,
+      "step": 2107
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.8386934947013096,
+      "learning_rate": 1.9502125245052184e-07,
+      "loss": 0.302,
+      "step": 2108
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 2.013634549023632,
+      "learning_rate": 1.9375713720692578e-07,
+      "loss": 0.374,
+      "step": 2109
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.9630403258780862,
+      "learning_rate": 1.9249696706969468e-07,
+      "loss": 0.3639,
+      "step": 2110
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.815851928434014,
+      "learning_rate": 1.9124074419450188e-07,
+      "loss": 0.3911,
+      "step": 2111
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.7873258577827946,
+      "learning_rate": 1.899884707302671e-07,
+      "loss": 0.3587,
+      "step": 2112
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.7437701243798065,
+      "learning_rate": 1.8874014881915592e-07,
+      "loss": 0.3259,
+      "step": 2113
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.7944960642642218,
+      "learning_rate": 1.8749578059657269e-07,
+      "loss": 0.3244,
+      "step": 2114
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8363546908139476,
+      "learning_rate": 1.862553681911594e-07,
+      "loss": 0.3456,
+      "step": 2115
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8197485549623056,
+      "learning_rate": 1.8501891372479124e-07,
+      "loss": 0.3283,
+      "step": 2116
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.9637296453576303,
+      "learning_rate": 1.837864193125724e-07,
+      "loss": 0.3826,
+      "step": 2117
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8379188611549326,
+      "learning_rate": 1.8255788706283333e-07,
+      "loss": 0.3479,
+      "step": 2118
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.9065317118193474,
+      "learning_rate": 1.813333190771263e-07,
+      "loss": 0.389,
+      "step": 2119
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.825398324635909,
+      "learning_rate": 1.8011271745022236e-07,
+      "loss": 0.3152,
+      "step": 2120
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.84277570139951,
+      "learning_rate": 1.788960842701079e-07,
+      "loss": 0.3303,
+      "step": 2121
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 2.1819179580807093,
+      "learning_rate": 1.7768342161798124e-07,
+      "loss": 0.3643,
+      "step": 2122
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8011890761495493,
+      "learning_rate": 1.7647473156824635e-07,
+      "loss": 0.339,
+      "step": 2123
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8221066042168634,
+      "learning_rate": 1.7527001618851458e-07,
+      "loss": 0.3569,
+      "step": 2124
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.8261396232814944,
+      "learning_rate": 1.7406927753959635e-07,
+      "loss": 0.4109,
+      "step": 2125
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.805174532249848,
+      "learning_rate": 1.728725176755e-07,
+      "loss": 0.3393,
+      "step": 2126
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8848775064893697,
+      "learning_rate": 1.7167973864342713e-07,
+      "loss": 0.3496,
+      "step": 2127
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8100491319883585,
+      "learning_rate": 1.7049094248377028e-07,
+      "loss": 0.3281,
+      "step": 2128
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8831196622854411,
+      "learning_rate": 1.6930613123010835e-07,
+      "loss": 0.3841,
+      "step": 2129
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.9209980081592088,
+      "learning_rate": 1.6812530690920424e-07,
+      "loss": 0.391,
+      "step": 2130
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.9944918420194047,
+      "learning_rate": 1.669484715409997e-07,
+      "loss": 0.3926,
+      "step": 2131
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.9395008522068116,
+      "learning_rate": 1.6577562713861407e-07,
+      "loss": 0.3883,
+      "step": 2132
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8304355795594052,
+      "learning_rate": 1.646067757083389e-07,
+      "loss": 0.3389,
+      "step": 2133
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8402459886338345,
+      "learning_rate": 1.6344191924963476e-07,
+      "loss": 0.3922,
+      "step": 2134
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8132899531779554,
+      "learning_rate": 1.622810597551297e-07,
+      "loss": 0.3469,
+      "step": 2135
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.921034055540662,
+      "learning_rate": 1.6112419921061357e-07,
+      "loss": 0.3653,
+      "step": 2136
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8056140554264193,
+      "learning_rate": 1.5997133959503586e-07,
+      "loss": 0.3415,
+      "step": 2137
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8588044133851904,
+      "learning_rate": 1.5882248288050212e-07,
+      "loss": 0.3506,
+      "step": 2138
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8108046119552568,
+      "learning_rate": 1.5767763103226973e-07,
+      "loss": 0.388,
+      "step": 2139
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8763098742970408,
+      "learning_rate": 1.5653678600874579e-07,
+      "loss": 0.3664,
+      "step": 2140
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8497677696649508,
+      "learning_rate": 1.553999497614833e-07,
+      "loss": 0.3639,
+      "step": 2141
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.9655979382195377,
+      "learning_rate": 1.5426712423517786e-07,
+      "loss": 0.3506,
+      "step": 2142
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.790628708967862,
+      "learning_rate": 1.5313831136766404e-07,
+      "loss": 0.3406,
+      "step": 2143
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.9608335290613008,
+      "learning_rate": 1.5201351308991224e-07,
+      "loss": 0.3501,
+      "step": 2144
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.7903484977756507,
+      "learning_rate": 1.50892731326025e-07,
+      "loss": 0.3676,
+      "step": 2145
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8855435922984911,
+      "learning_rate": 1.4977596799323535e-07,
+      "loss": 0.3799,
+      "step": 2146
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8324689872102904,
+      "learning_rate": 1.4866322500190101e-07,
+      "loss": 0.3755,
+      "step": 2147
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.8514508819914763,
+      "learning_rate": 1.4755450425550323e-07,
+      "loss": 0.3347,
+      "step": 2148
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.7886393781902339,
+      "learning_rate": 1.4644980765064265e-07,
+      "loss": 0.3585,
+      "step": 2149
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 2.0582700711129904,
+      "learning_rate": 1.45349137077036e-07,
+      "loss": 0.3524,
+      "step": 2150
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.8479484119761578,
+      "learning_rate": 1.442524944175122e-07,
+      "loss": 0.3588,
+      "step": 2151
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.9799010734714735,
+      "learning_rate": 1.431598815480112e-07,
+      "loss": 0.3851,
+      "step": 2152
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.8154268548789099,
+      "learning_rate": 1.4207130033757953e-07,
+      "loss": 0.3659,
+      "step": 2153
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.788826962078112,
+      "learning_rate": 1.409867526483655e-07,
+      "loss": 0.3532,
+      "step": 2154
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.831394990369725,
+      "learning_rate": 1.399062403356191e-07,
+      "loss": 0.3656,
+      "step": 2155
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.860812513843525,
+      "learning_rate": 1.3882976524768694e-07,
+      "loss": 0.3437,
+      "step": 2156
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.8191445465710199,
+      "learning_rate": 1.3775732922600955e-07,
+      "loss": 0.3453,
+      "step": 2157
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.912203673098316,
+      "learning_rate": 1.3668893410511752e-07,
+      "loss": 0.356,
+      "step": 2158
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.7634263389685063,
+      "learning_rate": 1.3562458171262977e-07,
+      "loss": 0.3364,
+      "step": 2159
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.9606772030976105,
+      "learning_rate": 1.345642738692493e-07,
+      "loss": 0.4129,
+      "step": 2160
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.8903730951236122,
+      "learning_rate": 1.3350801238876054e-07,
+      "loss": 0.3552,
+      "step": 2161
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.8151033977020858,
+      "learning_rate": 1.3245579907802647e-07,
+      "loss": 0.3749,
+      "step": 2162
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.798306670267368,
+      "learning_rate": 1.3140763573698368e-07,
+      "loss": 0.3579,
+      "step": 2163
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8830967664423854,
+      "learning_rate": 1.3036352415864317e-07,
+      "loss": 0.3807,
+      "step": 2164
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8497739705616796,
+      "learning_rate": 1.2932346612908236e-07,
+      "loss": 0.3435,
+      "step": 2165
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8775626143477988,
+      "learning_rate": 1.2828746342744642e-07,
+      "loss": 0.3694,
+      "step": 2166
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.86888174731765,
+      "learning_rate": 1.2725551782594297e-07,
+      "loss": 0.3773,
+      "step": 2167
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8106535392312983,
+      "learning_rate": 1.2622763108983943e-07,
+      "loss": 0.3658,
+      "step": 2168
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.7565946139355706,
+      "learning_rate": 1.2520380497745955e-07,
+      "loss": 0.3327,
+      "step": 2169
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8337917759239957,
+      "learning_rate": 1.2418404124018152e-07,
+      "loss": 0.3581,
+      "step": 2170
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8611220399958432,
+      "learning_rate": 1.2316834162243385e-07,
+      "loss": 0.3157,
+      "step": 2171
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.838068997080928,
+      "learning_rate": 1.2215670786169365e-07,
+      "loss": 0.3784,
+      "step": 2172
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.8073457370585166,
+      "learning_rate": 1.2114914168848247e-07,
+      "loss": 0.3601,
+      "step": 2173
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.7903431751846044,
+      "learning_rate": 1.20145644826363e-07,
+      "loss": 0.3387,
+      "step": 2174
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.8158848144806372,
+      "learning_rate": 1.1914621899193762e-07,
+      "loss": 0.34,
+      "step": 2175
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.766558225950262,
+      "learning_rate": 1.181508658948452e-07,
+      "loss": 0.3226,
+      "step": 2176
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.794967554546229,
+      "learning_rate": 1.1715958723775706e-07,
+      "loss": 0.3798,
+      "step": 2177
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.7695727585693561,
+      "learning_rate": 1.1617238471637455e-07,
+      "loss": 0.3678,
+      "step": 2178
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.8143490780194036,
+      "learning_rate": 1.1518926001942655e-07,
+      "loss": 0.3237,
+      "step": 2179
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.8739602172417291,
+      "learning_rate": 1.142102148286664e-07,
+      "loss": 0.3639,
+      "step": 2180
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.7739231697525675,
+      "learning_rate": 1.1323525081886888e-07,
+      "loss": 0.3159,
+      "step": 2181
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 2.105039856872307,
+      "learning_rate": 1.1226436965782767e-07,
+      "loss": 0.432,
+      "step": 2182
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.8072327831095898,
+      "learning_rate": 1.112975730063523e-07,
+      "loss": 0.3792,
+      "step": 2183
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.8941794154335998,
+      "learning_rate": 1.1033486251826403e-07,
+      "loss": 0.3968,
+      "step": 2184
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 2.1193960588145755,
+      "learning_rate": 1.0937623984039552e-07,
+      "loss": 0.3529,
+      "step": 2185
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.9461657576343148,
+      "learning_rate": 1.0842170661258672e-07,
+      "loss": 0.3957,
+      "step": 2186
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.7950594098824268,
+      "learning_rate": 1.0747126446768147e-07,
+      "loss": 0.3405,
+      "step": 2187
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8279881694837754,
+      "learning_rate": 1.065249150315259e-07,
+      "loss": 0.3341,
+      "step": 2188
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.7886544153404058,
+      "learning_rate": 1.0558265992296451e-07,
+      "loss": 0.3536,
+      "step": 2189
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8349909807030522,
+      "learning_rate": 1.0464450075383825e-07,
+      "loss": 0.3485,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.801301802497824,
+      "learning_rate": 1.0371043912898144e-07,
+      "loss": 0.3438,
+      "step": 2191
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 2.2727551395393815,
+      "learning_rate": 1.0278047664621927e-07,
+      "loss": 0.3638,
+      "step": 2192
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8256549405277607,
+      "learning_rate": 1.0185461489636422e-07,
+      "loss": 0.3825,
+      "step": 2193
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8324272080700041,
+      "learning_rate": 1.0093285546321496e-07,
+      "loss": 0.359,
+      "step": 2194
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.9572385811446105,
+      "learning_rate": 1.0001519992355158e-07,
+      "loss": 0.3805,
+      "step": 2195
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8745619608955748,
+      "learning_rate": 9.910164984713477e-08,
+      "loss": 0.3922,
+      "step": 2196
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.8621961607473747,
+      "learning_rate": 9.819220679670172e-08,
+      "loss": 0.3448,
+      "step": 2197
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.900350756763101,
+      "learning_rate": 9.728687232796463e-08,
+      "loss": 0.394,
+      "step": 2198
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8728312451342384,
+      "learning_rate": 9.638564798960748e-08,
+      "loss": 0.3457,
+      "step": 2199
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8715935041572054,
+      "learning_rate": 9.548853532328261e-08,
+      "loss": 0.345,
+      "step": 2200
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8278526441263347,
+      "learning_rate": 9.459553586360998e-08,
+      "loss": 0.3418,
+      "step": 2201
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.9429462293638,
+      "learning_rate": 9.370665113817206e-08,
+      "loss": 0.3986,
+      "step": 2202
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8143835995774138,
+      "learning_rate": 9.282188266751341e-08,
+      "loss": 0.3367,
+      "step": 2203
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 2.0092496115099596,
+      "learning_rate": 9.194123196513776e-08,
+      "loss": 0.345,
+      "step": 2204
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.9645251631663325,
+      "learning_rate": 9.106470053750371e-08,
+      "loss": 0.3701,
+      "step": 2205
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.866996771603958,
+      "learning_rate": 9.019228988402406e-08,
+      "loss": 0.3366,
+      "step": 2206
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.9147717656924788,
+      "learning_rate": 8.932400149706227e-08,
+      "loss": 0.3575,
+      "step": 2207
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.9482208231738272,
+      "learning_rate": 8.84598368619305e-08,
+      "loss": 0.3414,
+      "step": 2208
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8233257333540858,
+      "learning_rate": 8.759979745688623e-08,
+      "loss": 0.3318,
+      "step": 2209
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.8579605341403027,
+      "learning_rate": 8.674388475313073e-08,
+      "loss": 0.3771,
+      "step": 2210
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.8725714643585427,
+      "learning_rate": 8.589210021480581e-08,
+      "loss": 0.406,
+      "step": 2211
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.7878785317641386,
+      "learning_rate": 8.504444529899153e-08,
+      "loss": 0.341,
+      "step": 2212
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.9092348427688413,
+      "learning_rate": 8.420092145570408e-08,
+      "loss": 0.3627,
+      "step": 2213
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.931178088082703,
+      "learning_rate": 8.3361530127892e-08,
+      "loss": 0.3368,
+      "step": 2214
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.8737770088221923,
+      "learning_rate": 8.252627275143587e-08,
+      "loss": 0.3306,
+      "step": 2215
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.868328247002196,
+      "learning_rate": 8.16951507551439e-08,
+      "loss": 0.339,
+      "step": 2216
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.7549812055184346,
+      "learning_rate": 8.086816556075045e-08,
+      "loss": 0.3324,
+      "step": 2217
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.9136663871575121,
+      "learning_rate": 8.00453185829131e-08,
+      "loss": 0.3783,
+      "step": 2218
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.804706334856571,
+      "learning_rate": 7.922661122921116e-08,
+      "loss": 0.3293,
+      "step": 2219
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.8409617616372291,
+      "learning_rate": 7.841204490014215e-08,
+      "loss": 0.3381,
+      "step": 2220
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.8083991705398357,
+      "learning_rate": 7.760162098911978e-08,
+      "loss": 0.3463,
+      "step": 2221
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 22.560759057837398,
+      "learning_rate": 7.679534088247231e-08,
+      "loss": 0.4184,
+      "step": 2222
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 3.0344748163153143,
+      "learning_rate": 7.599320595943815e-08,
+      "loss": 0.3611,
+      "step": 2223
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.8265541046231377,
+      "learning_rate": 7.519521759216691e-08,
+      "loss": 0.3513,
+      "step": 2224
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.8696576868008075,
+      "learning_rate": 7.440137714571277e-08,
+      "loss": 0.3466,
+      "step": 2225
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.8962707956349882,
+      "learning_rate": 7.361168597803614e-08,
+      "loss": 0.3665,
+      "step": 2226
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.845514555585491,
+      "learning_rate": 7.282614543999867e-08,
+      "loss": 0.344,
+      "step": 2227
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.87201142236449,
+      "learning_rate": 7.204475687536238e-08,
+      "loss": 0.3986,
+      "step": 2228
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.8581673469597215,
+      "learning_rate": 7.126752162078643e-08,
+      "loss": 0.3234,
+      "step": 2229
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.8315260718739734,
+      "learning_rate": 7.049444100582503e-08,
+      "loss": 0.3214,
+      "step": 2230
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.7806104471214492,
+      "learning_rate": 6.972551635292618e-08,
+      "loss": 0.3229,
+      "step": 2231
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.7994027257586662,
+      "learning_rate": 6.896074897742827e-08,
+      "loss": 0.3583,
+      "step": 2232
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.7550904179180664,
+      "learning_rate": 6.820014018755761e-08,
+      "loss": 0.3191,
+      "step": 2233
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.937456980479283,
+      "learning_rate": 6.744369128442785e-08,
+      "loss": 0.3771,
+      "step": 2234
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.7707253123670128,
+      "learning_rate": 6.66914035620353e-08,
+      "loss": 0.3286,
+      "step": 2235
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8227176243965777,
+      "learning_rate": 6.594327830725916e-08,
+      "loss": 0.3634,
+      "step": 2236
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8234900630660766,
+      "learning_rate": 6.51993167998577e-08,
+      "loss": 0.3655,
+      "step": 2237
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.9640311575750178,
+      "learning_rate": 6.445952031246678e-08,
+      "loss": 0.3801,
+      "step": 2238
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8231576961256921,
+      "learning_rate": 6.372389011059743e-08,
+      "loss": 0.349,
+      "step": 2239
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8533851518368958,
+      "learning_rate": 6.299242745263445e-08,
+      "loss": 0.354,
+      "step": 2240
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.799650055342726,
+      "learning_rate": 6.226513358983166e-08,
+      "loss": 0.3854,
+      "step": 2241
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.832735461763389,
+      "learning_rate": 6.154200976631358e-08,
+      "loss": 0.3495,
+      "step": 2242
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 2.0635623794685487,
+      "learning_rate": 6.082305721907044e-08,
+      "loss": 0.4287,
+      "step": 2243
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8920812885996825,
+      "learning_rate": 6.010827717795736e-08,
+      "loss": 0.3444,
+      "step": 2244
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.8850298512782395,
+      "learning_rate": 5.9397670865691813e-08,
+      "loss": 0.3713,
+      "step": 2245
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.9351828186998583,
+      "learning_rate": 5.8691239497851436e-08,
+      "loss": 0.406,
+      "step": 2246
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.784816624778513,
+      "learning_rate": 5.7988984282872085e-08,
+      "loss": 0.3607,
+      "step": 2247
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.826165368757773,
+      "learning_rate": 5.729090642204615e-08,
+      "loss": 0.3238,
+      "step": 2248
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8992022791618013,
+      "learning_rate": 5.659700710951982e-08,
+      "loss": 0.3437,
+      "step": 2249
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8504942451770303,
+      "learning_rate": 5.5907287532291354e-08,
+      "loss": 0.344,
+      "step": 2250
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8238313969289717,
+      "learning_rate": 5.5221748870209756e-08,
+      "loss": 0.3805,
+      "step": 2251
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8659566459192327,
+      "learning_rate": 5.4540392295971136e-08,
+      "loss": 0.3322,
+      "step": 2252
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8525482310150154,
+      "learning_rate": 5.386321897511787e-08,
+      "loss": 0.3229,
+      "step": 2253
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8318342126706884,
+      "learning_rate": 5.319023006603668e-08,
+      "loss": 0.3552,
+      "step": 2254
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.9361724490090373,
+      "learning_rate": 5.252142671995669e-08,
+      "loss": 0.3742,
+      "step": 2255
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.9026081066119354,
+      "learning_rate": 5.185681008094579e-08,
+      "loss": 0.3313,
+      "step": 2256
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.9824621364268147,
+      "learning_rate": 5.119638128591148e-08,
+      "loss": 0.3723,
+      "step": 2257
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.8800250495832693,
+      "learning_rate": 5.0540141464596185e-08,
+      "loss": 0.3695,
+      "step": 2258
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.7729891099483797,
+      "learning_rate": 4.988809173957804e-08,
+      "loss": 0.3487,
+      "step": 2259
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.814679974377573,
+      "learning_rate": 4.924023322626592e-08,
+      "loss": 0.3689,
+      "step": 2260
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.8464925551585676,
+      "learning_rate": 4.8596567032900274e-08,
+      "loss": 0.3539,
+      "step": 2261
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.8903105581775805,
+      "learning_rate": 4.7957094260549784e-08,
+      "loss": 0.364,
+      "step": 2262
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.805466964295455,
+      "learning_rate": 4.7321816003109424e-08,
+      "loss": 0.3628,
+      "step": 2263
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.8143326259659651,
+      "learning_rate": 4.6690733347299624e-08,
+      "loss": 0.3661,
+      "step": 2264
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.843803583791867,
+      "learning_rate": 4.6063847372662676e-08,
+      "loss": 0.3458,
+      "step": 2265
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.908507563592907,
+      "learning_rate": 4.5441159151563275e-08,
+      "loss": 0.4031,
+      "step": 2266
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.826011543168382,
+      "learning_rate": 4.4822669749184364e-08,
+      "loss": 0.3152,
+      "step": 2267
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.8153710037318442,
+      "learning_rate": 4.420838022352631e-08,
+      "loss": 0.3342,
+      "step": 2268
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.9759369609394648,
+      "learning_rate": 4.359829162540574e-08,
+      "loss": 0.3566,
+      "step": 2269
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.836606089281238,
+      "learning_rate": 4.2992404998452867e-08,
+      "loss": 0.3373,
+      "step": 2270
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.7419439850247866,
+      "learning_rate": 4.2390721379109434e-08,
+      "loss": 0.3258,
+      "step": 2271
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.8113632431171702,
+      "learning_rate": 4.1793241796627694e-08,
+      "loss": 0.3396,
+      "step": 2272
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.782592709936038,
+      "learning_rate": 4.119996727306896e-08,
+      "loss": 0.3166,
+      "step": 2273
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.8240157047778767,
+      "learning_rate": 4.0610898823300605e-08,
+      "loss": 0.3698,
+      "step": 2274
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.9054984374125201,
+      "learning_rate": 4.0026037454995446e-08,
+      "loss": 0.3836,
+      "step": 2275
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.7353865384456557,
+      "learning_rate": 3.9445384168628474e-08,
+      "loss": 0.3253,
+      "step": 2276
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.9177471239749773,
+      "learning_rate": 3.88689399574782e-08,
+      "loss": 0.3705,
+      "step": 2277
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.767304846703738,
+      "learning_rate": 3.8296705807621124e-08,
+      "loss": 0.3394,
+      "step": 2278
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.8307581850341796,
+      "learning_rate": 3.772868269793312e-08,
+      "loss": 0.3742,
+      "step": 2279
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 2.2094772282591357,
+      "learning_rate": 3.716487160008608e-08,
+      "loss": 0.3737,
+      "step": 2280
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.831225952922472,
+      "learning_rate": 3.660527347854687e-08,
+      "loss": 0.3355,
+      "step": 2281
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.8903588912458535,
+      "learning_rate": 3.604988929057529e-08,
+      "loss": 0.3504,
+      "step": 2282
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.831527380801516,
+      "learning_rate": 3.549871998622334e-08,
+      "loss": 0.3622,
+      "step": 2283
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8110423459241198,
+      "learning_rate": 3.4951766508332377e-08,
+      "loss": 0.3578,
+      "step": 2284
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8576409618357517,
+      "learning_rate": 3.440902979253202e-08,
+      "loss": 0.3385,
+      "step": 2285
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.7451272854809423,
+      "learning_rate": 3.387051076723907e-08,
+      "loss": 0.3799,
+      "step": 2286
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8297163144764026,
+      "learning_rate": 3.333621035365525e-08,
+      "loss": 0.3593,
+      "step": 2287
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.85432730355764,
+      "learning_rate": 3.280612946576556e-08,
+      "loss": 0.3639,
+      "step": 2288
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.7735697250412514,
+      "learning_rate": 3.2280269010337427e-08,
+      "loss": 0.3617,
+      "step": 2289
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.7625079229313618,
+      "learning_rate": 3.175862988691852e-08,
+      "loss": 0.329,
+      "step": 2290
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.105587141461978,
+      "learning_rate": 3.1241212987835614e-08,
+      "loss": 0.4002,
+      "step": 2291
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8530942331843046,
+      "learning_rate": 3.072801919819235e-08,
+      "loss": 0.3574,
+      "step": 2292
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8187789742996616,
+      "learning_rate": 3.021904939586873e-08,
+      "loss": 0.3385,
+      "step": 2293
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8408002748586405,
+      "learning_rate": 2.971430445151885e-08,
+      "loss": 0.3443,
+      "step": 2294
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.8528110336406811,
+      "learning_rate": 2.9213785228569823e-08,
+      "loss": 0.3298,
+      "step": 2295
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.7149723027801804,
+      "learning_rate": 2.8717492583220095e-08,
+      "loss": 0.3607,
+      "step": 2296
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.9360256295655223,
+      "learning_rate": 2.8225427364438063e-08,
+      "loss": 0.3833,
+      "step": 2297
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8530219977927616,
+      "learning_rate": 2.773759041396068e-08,
+      "loss": 0.3414,
+      "step": 2298
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.7555751947021212,
+      "learning_rate": 2.7253982566291525e-08,
+      "loss": 0.376,
+      "step": 2299
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8257207937080222,
+      "learning_rate": 2.677460464870024e-08,
+      "loss": 0.3375,
+      "step": 2300
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.808041093231935,
+      "learning_rate": 2.629945748122087e-08,
+      "loss": 0.3259,
+      "step": 2301
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.844719556713403,
+      "learning_rate": 2.5828541876649628e-08,
+      "loss": 0.32,
+      "step": 2302
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8627811857138301,
+      "learning_rate": 2.5361858640544357e-08,
+      "loss": 0.3467,
+      "step": 2303
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.775658122498733,
+      "learning_rate": 2.489940857122314e-08,
+      "loss": 0.3438,
+      "step": 2304
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 2.0652341897824873,
+      "learning_rate": 2.4441192459762342e-08,
+      "loss": 0.4325,
+      "step": 2305
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8808828610296175,
+      "learning_rate": 2.3987211089996075e-08,
+      "loss": 0.3599,
+      "step": 2306
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8324673797834097,
+      "learning_rate": 2.3537465238513966e-08,
+      "loss": 0.3293,
+      "step": 2307
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.788314595093782,
+      "learning_rate": 2.3091955674660606e-08,
+      "loss": 0.374,
+      "step": 2308
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.9054450644102343,
+      "learning_rate": 2.265068316053387e-08,
+      "loss": 0.3817,
+      "step": 2309
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.7959481618901236,
+      "learning_rate": 2.2213648450983284e-08,
+      "loss": 0.3293,
+      "step": 2310
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.829622659307099,
+      "learning_rate": 2.178085229360999e-08,
+      "loss": 0.3623,
+      "step": 2311
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.7576464280597102,
+      "learning_rate": 2.1352295428763435e-08,
+      "loss": 0.318,
+      "step": 2312
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.7741716877498643,
+      "learning_rate": 2.092797858954193e-08,
+      "loss": 0.3561,
+      "step": 2313
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.7225396697993693,
+      "learning_rate": 2.050790250179041e-08,
+      "loss": 0.3088,
+      "step": 2314
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.783671372833311,
+      "learning_rate": 2.0092067884100175e-08,
+      "loss": 0.3378,
+      "step": 2315
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.878137104399538,
+      "learning_rate": 1.9680475447805826e-08,
+      "loss": 0.3528,
+      "step": 2316
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.7570308196145663,
+      "learning_rate": 1.9273125896986378e-08,
+      "loss": 0.3266,
+      "step": 2317
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.8198609878561622,
+      "learning_rate": 1.8870019928461936e-08,
+      "loss": 0.3866,
+      "step": 2318
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.9491640718315564,
+      "learning_rate": 1.8471158231793962e-08,
+      "loss": 0.3731,
+      "step": 2319
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 2.0157828598711407,
+      "learning_rate": 1.807654148928334e-08,
+      "loss": 0.3225,
+      "step": 2320
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.830799493750694,
+      "learning_rate": 1.7686170375969813e-08,
+      "loss": 0.3378,
+      "step": 2321
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.776821773854751,
+      "learning_rate": 1.7300045559630053e-08,
+      "loss": 0.3426,
+      "step": 2322
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8331771417191418,
+      "learning_rate": 1.691816770077709e-08,
+      "loss": 0.3879,
+      "step": 2323
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8367134813568278,
+      "learning_rate": 1.654053745265921e-08,
+      "loss": 0.3577,
+      "step": 2324
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8655517772206949,
+      "learning_rate": 1.6167155461258298e-08,
+      "loss": 0.3106,
+      "step": 2325
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.7984794341036208,
+      "learning_rate": 1.5798022365289544e-08,
+      "loss": 0.3339,
+      "step": 2326
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8046596260708332,
+      "learning_rate": 1.5433138796198954e-08,
+      "loss": 0.3514,
+      "step": 2327
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.80328251848771,
+      "learning_rate": 1.5072505378164182e-08,
+      "loss": 0.3813,
+      "step": 2328
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8143267874334106,
+      "learning_rate": 1.4716122728092586e-08,
+      "loss": 0.3705,
+      "step": 2329
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.8115674095692667,
+      "learning_rate": 1.4363991455619008e-08,
+      "loss": 0.358,
+      "step": 2330
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.9202718564659147,
+      "learning_rate": 1.401611216310661e-08,
+      "loss": 0.4294,
+      "step": 2331
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.900298805533325,
+      "learning_rate": 1.36724854456452e-08,
+      "loss": 0.3429,
+      "step": 2332
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8195483339075496,
+      "learning_rate": 1.3333111891049023e-08,
+      "loss": 0.3788,
+      "step": 2333
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.7931061906479377,
+      "learning_rate": 1.2997992079858135e-08,
+      "loss": 0.3483,
+      "step": 2334
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.740931180103152,
+      "learning_rate": 1.266712658533481e-08,
+      "loss": 0.353,
+      "step": 2335
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8508839449199255,
+      "learning_rate": 1.2340515973464917e-08,
+      "loss": 0.4318,
+      "step": 2336
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.7123271515142926,
+      "learning_rate": 1.2018160802954592e-08,
+      "loss": 0.3511,
+      "step": 2337
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8655323178391352,
+      "learning_rate": 1.170006162523163e-08,
+      "loss": 0.3334,
+      "step": 2338
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8116660117809809,
+      "learning_rate": 1.1386218984443253e-08,
+      "loss": 0.3517,
+      "step": 2339
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.7942153746632152,
+      "learning_rate": 1.1076633417454463e-08,
+      "loss": 0.3172,
+      "step": 2340
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.7673253216996463,
+      "learning_rate": 1.0771305453849134e-08,
+      "loss": 0.3634,
+      "step": 2341
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8407979709727775,
+      "learning_rate": 1.0470235615927526e-08,
+      "loss": 0.3381,
+      "step": 2342
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.8972203790540076,
+      "learning_rate": 1.0173424418705724e-08,
+      "loss": 0.4329,
+      "step": 2343
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.7916957004045877,
+      "learning_rate": 9.880872369915362e-09,
+      "loss": 0.319,
+      "step": 2344
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.8307160856502611,
+      "learning_rate": 9.592579970001404e-09,
+      "loss": 0.3661,
+      "step": 2345
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.8705249856696158,
+      "learning_rate": 9.30854771212325e-09,
+      "loss": 0.346,
+      "step": 2346
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.9255943662811519,
+      "learning_rate": 9.028776082152246e-09,
+      "loss": 0.3957,
+      "step": 2347
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.8466026013927237,
+      "learning_rate": 8.75326555867112e-09,
+      "loss": 0.3208,
+      "step": 2348
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 2.188566840717273,
+      "learning_rate": 8.482016612974265e-09,
+      "loss": 0.3949,
+      "step": 2349
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.8661101622885223,
+      "learning_rate": 8.215029709065515e-09,
+      "loss": 0.3837,
+      "step": 2350
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.8861120420225161,
+      "learning_rate": 7.952305303658147e-09,
+      "loss": 0.3582,
+      "step": 2351
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.813038172439941,
+      "learning_rate": 7.693843846174055e-09,
+      "loss": 0.3784,
+      "step": 2352
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.732272473538418,
+      "learning_rate": 7.439645778742344e-09,
+      "loss": 0.3083,
+      "step": 2353
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.7583227773146088,
+      "learning_rate": 7.189711536199906e-09,
+      "loss": 0.3407,
+      "step": 2354
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.9598993214652383,
+      "learning_rate": 6.944041546088909e-09,
+      "loss": 0.3442,
+      "step": 2355
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.920153009242741,
+      "learning_rate": 6.702636228657911e-09,
+      "loss": 0.3507,
+      "step": 2356
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.93057180027477,
+      "learning_rate": 6.465495996859639e-09,
+      "loss": 0.3528,
+      "step": 2357
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.757464657460147,
+      "learning_rate": 6.2326212563507126e-09,
+      "loss": 0.31,
+      "step": 2358
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.793945156305048,
+      "learning_rate": 6.004012405492199e-09,
+      "loss": 0.3255,
+      "step": 2359
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.8164806685160202,
+      "learning_rate": 5.77966983534628e-09,
+      "loss": 0.3363,
+      "step": 2360
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.7782525894382621,
+      "learning_rate": 5.5595939296784755e-09,
+      "loss": 0.3416,
+      "step": 2361
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.832295574318913,
+      "learning_rate": 5.343785064954865e-09,
+      "loss": 0.3628,
+      "step": 2362
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.729926329285944,
+      "learning_rate": 5.132243610342924e-09,
+      "loss": 0.3362,
+      "step": 2363
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.8177287184944615,
+      "learning_rate": 4.9249699277093e-09,
+      "loss": 0.344,
+      "step": 2364
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.8317448819287787,
+      "learning_rate": 4.721964371620924e-09,
+      "loss": 0.4136,
+      "step": 2365
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.7302394089099904,
+      "learning_rate": 4.523227289343068e-09,
+      "loss": 0.3212,
+      "step": 2366
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.8449995563751413,
+      "learning_rate": 4.328759020839624e-09,
+      "loss": 0.3686,
+      "step": 2367
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8177433334210171,
+      "learning_rate": 4.138559898771988e-09,
+      "loss": 0.3923,
+      "step": 2368
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8895824040710925,
+      "learning_rate": 3.95263024849879e-09,
+      "loss": 0.4414,
+      "step": 2369
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8778393603343977,
+      "learning_rate": 3.7709703880747795e-09,
+      "loss": 0.351,
+      "step": 2370
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8123035152551727,
+      "learning_rate": 3.5935806282511032e-09,
+      "loss": 0.3596,
+      "step": 2371
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8483506706663495,
+      "learning_rate": 3.4204612724744736e-09,
+      "loss": 0.3666,
+      "step": 2372
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.9665278586805264,
+      "learning_rate": 3.2516126168866123e-09,
+      "loss": 0.3447,
+      "step": 2373
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.9266775035122918,
+      "learning_rate": 3.0870349503231424e-09,
+      "loss": 0.3763,
+      "step": 2374
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8202142350172061,
+      "learning_rate": 2.92672855431414e-09,
+      "loss": 0.3275,
+      "step": 2375
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8013001627180567,
+      "learning_rate": 2.7706937030827495e-09,
+      "loss": 0.3557,
+      "step": 2376
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.906524152202692,
+      "learning_rate": 2.6189306635460156e-09,
+      "loss": 0.3548,
+      "step": 2377
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8346090500701675,
+      "learning_rate": 2.471439695312383e-09,
+      "loss": 0.3672,
+      "step": 2378
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.823882086144735,
+      "learning_rate": 2.328221050683088e-09,
+      "loss": 0.3349,
+      "step": 2379
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.783279345835604,
+      "learning_rate": 2.189274974650768e-09,
+      "loss": 0.3651,
+      "step": 2380
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.8063215302334035,
+      "learning_rate": 2.0546017048994615e-09,
+      "loss": 0.3829,
+      "step": 2381
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.9347786435697172,
+      "learning_rate": 1.924201471804332e-09,
+      "loss": 0.3535,
+      "step": 2382
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.7349198388308216,
+      "learning_rate": 1.79807449843028e-09,
+      "loss": 0.3374,
+      "step": 2383
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.8780406239663383,
+      "learning_rate": 1.6762210005330515e-09,
+      "loss": 0.3509,
+      "step": 2384
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.8146143727655357,
+      "learning_rate": 1.558641186557297e-09,
+      "loss": 0.3994,
+      "step": 2385
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.8397391761113984,
+      "learning_rate": 1.4453352576379587e-09,
+      "loss": 0.354,
+      "step": 2386
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.7893297095614948,
+      "learning_rate": 1.3363034075980496e-09,
+      "loss": 0.3669,
+      "step": 2387
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.7642311756811444,
+      "learning_rate": 1.2315458229500422e-09,
+      "loss": 0.3613,
+      "step": 2388
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.9684442048645372,
+      "learning_rate": 1.1310626828942017e-09,
+      "loss": 0.3317,
+      "step": 2389
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.7590264201884553,
+      "learning_rate": 1.034854159319143e-09,
+      "loss": 0.3164,
+      "step": 2390
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.769739988978357,
+      "learning_rate": 9.42920416801274e-10,
+      "loss": 0.3105,
+      "step": 2391
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8187497681660625,
+      "learning_rate": 8.552616126039637e-10,
+      "loss": 0.3491,
+      "step": 2392
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8697839343972154,
+      "learning_rate": 7.718778966783746e-10,
+      "loss": 0.3381,
+      "step": 2393
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8195517409055602,
+      "learning_rate": 6.927694116623529e-10,
+      "loss": 0.3579,
+      "step": 2394
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.957748954418363,
+      "learning_rate": 6.179362928804278e-10,
+      "loss": 0.3428,
+      "step": 2395
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.7764755140707773,
+      "learning_rate": 5.473786683440896e-10,
+      "loss": 0.3572,
+      "step": 2396
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8254010648782029,
+      "learning_rate": 4.810966587501242e-10,
+      "loss": 0.3385,
+      "step": 2397
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 2.398100665461426,
+      "learning_rate": 4.1909037748227856e-10,
+      "loss": 0.3683,
+      "step": 2398
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.7370690524344994,
+      "learning_rate": 3.613599306093174e-10,
+      "loss": 0.3453,
+      "step": 2399
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.9603879188454718,
+      "learning_rate": 3.079054168866891e-10,
+      "loss": 0.3446,
+      "step": 2400
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8244354138147691,
+      "learning_rate": 2.5872692775430474e-10,
+      "loss": 0.3107,
+      "step": 2401
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.842040045340226,
+      "learning_rate": 2.1382454733848147e-10,
+      "loss": 0.3794,
+      "step": 2402
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.8958611045361276,
+      "learning_rate": 1.7319835244944405e-10,
+      "loss": 0.3499,
+      "step": 2403
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.795028879660856,
+      "learning_rate": 1.3684841258354564e-10,
+      "loss": 0.3571,
+      "step": 2404
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.9239030192337772,
+      "learning_rate": 1.0477478992187984e-10,
+      "loss": 0.3801,
+      "step": 2405
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.7943896830711028,
+      "learning_rate": 7.697753933000318e-11,
+      "loss": 0.3379,
+      "step": 2406
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.8578795406276247,
+      "learning_rate": 5.345670835849026e-11,
+      "loss": 0.3335,
+      "step": 2407
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.7907314686158868,
+      "learning_rate": 3.4212337242656156e-11,
+      "loss": 0.3253,
+      "step": 2408
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.9623684204229361,
+      "learning_rate": 1.9244458902278884e-11,
+      "loss": 0.3983,
+      "step": 2409
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.753382755747083,
+      "learning_rate": 8.553098941876947e-12,
+      "loss": 0.3433,
+      "step": 2410
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.7839093116025138,
+      "learning_rate": 2.1382756498766753e-12,
+      "loss": 0.3743,
+      "step": 2411
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.8247664664090246,
+      "learning_rate": 0.0,
+      "loss": 0.3419,
+      "step": 2412
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2412,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 603,
+  "total_flos": 1136185615319040.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2412/training_args.bin b/checkpoint-2412/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e
--- /dev/null
+++ b/checkpoint-2412/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26
+size 7352
diff --git a/checkpoint-2412/zero_to_fp32.py b/checkpoint-2412/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b
--- /dev/null
+++ b/checkpoint-2412/zero_to_fp32.py
@@ -0,0 +1,592 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Values extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
+    # buffer name -> tensor, already cast to fp32
+    buffers: dict
+    # the checkpoint's PARAM_SHAPES entry; iterated elsewhere as a sequence of {name: shape} mappings
+    param_shapes: list
+    # [key, value] pairs taken from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # read with ``.get(DS_VERSION, None)``, so it may be None - NOTE(review): confirm the stored type is int
+    ds_version: int
+    # both frozen_* fields are None when the checkpoint has no frozen-parameter entries
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to enable the diagnostic prints guarded by ``if debug:`` throughout this script
+debug = 0
+
+# load to cpu
+# (passed as ``map_location`` to every ``torch.load`` call below)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text`` consists only of digits, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Build a sort key that orders strings in human order, e.g. "rank_2" before "rank_10".
+
+    The text is split into alternating non-digit and digit runs and each run is passed through ``atoi``,
+    so digit runs compare as integers. Use as ``alist.sort(key=natural_keys)``.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory expected to contain the model states file
+        - ``zero_stage``: zero stage of the checkpoint; it selects the expected file name
+
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stays unbound and the existence check fails with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the paths inside ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        - ``FileNotFoundError``: no path matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    ckpt_files.sort(key=natural_keys)
+
+    if not ckpt_files:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """
+    Load every model states file and collect what is needed to rebuild the fp32 state dict.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files
+
+    Returns:
+        a list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): the checkpoints are pickled, so torch.load must only be used on trusted files
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional; both entries stay None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and pull out the fp32 weight groups stored in them.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)`` with one ``fp32_flat_groups`` entry per file
+
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files differs from the
+          recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for path in files:
+        loaded = torch.load(path, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(loaded)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Rebuild the fp32 state_dict from the optimizer and model state files of a ds checkpoint.
+
+    The zero stage read from the optimizer files decides which stage-specific merge routine is used.
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        merge = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge(world_size, fp32_flat_groups, zero_model_states)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in ``zero_model_states[0]`` into ``state_dict``.
+
+    Returns early, without printing, when there are no frozen parameter shapes. Each fragment is stored
+    under its parameter name as-is.
+    """
+    first_rank = zero_model_states[0]
+    if first_rank.frozen_param_shapes is None or len(first_rank.frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = first_rank.frozen_param_shapes
+    frozen_param_fragments = first_rank.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        num_elem = wanted_numel
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a zero stage <= 2 checkpoint and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: dict updated in place with one ``name -> tensor`` entry per parameter
+        - ``world_size``: number of partitions; also sets the padding alignment (``2 * world_size``)
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``; the per-rank partitions of each param
+          group are concatenated along dim 0
+        - ``zero_model_states``: parsed model states; only ``zero_model_states[0].param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count differs from the available count
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    # for every param group, join the partitions of all ranks into one flat vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only used by the debug print below, then rebound per group in the loop
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # pair each group's {name: shape} mapping with that group's merged flat vector
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset counts the elements of this group's vector consumed so far
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes without a numel() method fall back to the product of their dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # take the next unpartitioned_numel elements and reshape them to the parameter's shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    """
+    Assemble the state dict of a zero stage <= 2 checkpoint.
+
+    Entries are added in this order: buffers, frozen params, trainable params, and finally the
+    shared parameters, which reuse the entry of the parameter they point to.
+    """
+    state_dict = OrderedDict()
+    first_rank = zero_model_states[0]
+
+    # buffers
+    buffers = first_rank.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for alias, source in first_rank.shared_params:
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` partitions.
+
+    ``partitioned_numel`` is ``unpartitioned_numel / world_size`` rounded up; ``padding_numel`` is the
+    number of elements missing to make ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    remainder = unpartitioned_numel % world_size
+    if remainder:
+        padding_numel = world_size - remainder
+    else:
+        padding_numel = 0
+
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
diff --git a/checkpoint-603/config.json b/checkpoint-603/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76
--- /dev/null
+++ b/checkpoint-603/config.json
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "alpindale/Mistral-7B-v0.2-hf",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32002
+}
diff --git a/checkpoint-603/generation_config.json b/checkpoint-603/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc
--- /dev/null
+++ b/checkpoint-603/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.38.2"
+}
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..448632be36b94fce9a238803682d75dec7ae51e9
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59370e22148d3a8a1d2cd4624d7946cbd7616b76121082a13fe7734a7d5d02c5
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..83b19732a78854dc40d38802c3954036ef62416e
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b53042893025c005ef60ddc8f3397c4183eccf3bb8d43ca6fe90b937586c5be3
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b1e266d33c0a472ceec67b3a3d494980842efd
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cef9cf818255a83cde92a08ece9ca1eff5aaaf294200103287c9446b18ba83cb
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ca29e9a415c45ec9fb56a1a89ec2690bb2c3c3ac
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76a96afb8918f45759eb193305769f1d407427f79b37364fd4066fb832c4cf2b
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3e7dccae6b525c60c58488c3a74da353d6da2766
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76f6f1b8a0c4a54dc38c3811a0160fe52885a7f82982e78eea9dbe8ec5be1799
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e945055499c5d9be0813f06293afcf5a98002f01
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af7f387005cd25db76e0929d2bdbe6eec2be1a8b08e284fffba3a2c6b3c71bb5
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..438bb41c33684f1698f9f0afda965d75656e0e5a
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c06d83b7316728ec5932771c7f7b3b8108c5ce6affa4f06d2f1909b490e0d58e
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..02def1d19549a11b77e6bd136f7cfe49743eb305
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d58b1f6f3a57acccdd9de93ee37604f78994e878101c8d31e0b54ee51667710f
+size 4831623435
diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bce01e08b82b8e35be03f675f745a3edf7065967
--- /dev/null
+++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e25c081ab69293c7624657196aa101e5f93da28fd53ce9c0ed7c2cde1e7bda5
+size 4831623435
diff --git a/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cc86df6dbf064e329b44f0edd948821b403844b0
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7f068ea32760e0b48dac2d6af9eac8a70cb1e844c7f3df607dba31b5b1a65bb
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fbfe195ff56e36f822ae14568491e633e545b11b
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f012302b249642f3647f99b5acf92cbab815fb99a79368a02ab1aa7147d9cc9
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eb1befd2ac756cfc6de31025ff60c1b41f937042
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4aac9dbbcd4643c54be734b072c8a79493529ef0c76048fb783945075bdd0f4a
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0fec34785df33dbcc972ce4ec8fc10cda57918a6
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1df7d9f001aa84b5743fd10d6afb2a15e032818f728d246a9a55f62a90f78eb3
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f877c57ebc122f734c9d8e6cb6dd6046f9e2b020
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:549ad2aae080a1d987992d584bb537308f73c6734215531882ce33c6e440a523
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..94f29c4943b63db704960aa3e56927c1bbeb28f2
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43eb9c0d5739614a0e52559181b5107c8b4a52fe8f2823b3a5fe055a536a2d27
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1e31e537ad39d7343e213fc51911cfe0044593db
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eedbfca9cde614f50defef9af4967b35a4f4ccf27e94df4ad2ff4be358848b54
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c115461d604c2025df1f904be40e4931eccb289a
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ada9a3962cbc0dd20da5343bc7f4aa055b4e752ab514423e555f7cbdbc2b3c2c
+size 153829
diff --git a/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf3a0cf0990a86c554e5e80f59085fe79d62d75
--- /dev/null
+++ b/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f40ba5b0013f52277f2af3d3822c99d8fdae8a69c1c3c7c3794949261d198b86
+size 153829
diff --git a/checkpoint-603/latest b/checkpoint-603/latest
new file mode 100644
index 0000000000000000000000000000000000000000..79c0a3e5ad0a79110e06067b151d8c6b9a0aacd6
--- /dev/null
+++ b/checkpoint-603/latest
@@ -0,0 +1 @@
+global_step603
\ No newline at end of file
diff --git a/checkpoint-603/model-00001-of-00003.safetensors b/checkpoint-603/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..125cf68fd8372f547ac2a310463684c5286d6917
--- /dev/null
+++ b/checkpoint-603/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25433fce8850cf2c8bfb942d2859f8b1ae795d54153015bfda50c860636f2d33
+size 4943178720
diff --git a/checkpoint-603/model-00002-of-00003.safetensors b/checkpoint-603/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..29f87439dd45776ba27a6ffd102bc711eec7c73d
--- /dev/null
+++ b/checkpoint-603/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b4ab548de5173dd2f658babb32a5dc829cbec02780d3349d8fe38ea754d6e0e
+size 4999819336
diff --git a/checkpoint-603/model-00003-of-00003.safetensors b/checkpoint-603/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..652488552dc46b5266ea673729d104b6c83f758e
--- /dev/null
+++ b/checkpoint-603/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:647792278f578fc90f83abfaef9375524293e96a59884f35e68895f545856809
+size 4540532728
diff --git a/checkpoint-603/model.safetensors.index.json b/checkpoint-603/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2
--- /dev/null
+++ b/checkpoint-603/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483496960
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/checkpoint-603/rng_state_0.pth b/checkpoint-603/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7ae643fef71bb5468722e041971c4fd10143dcde
--- /dev/null
+++ b/checkpoint-603/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d78df38122b8b51b69a3cce1a8d8cb0f7d8684196dde8fb6d174ef0fd3440d89
+size 16240
diff --git a/checkpoint-603/rng_state_1.pth b/checkpoint-603/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0dec857bd06d8263dc0d1f195ea4d4288bad4641
--- /dev/null
+++ b/checkpoint-603/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:499f46e15237a5856de1a8f0582d02e4319721d83140e01c31e9e1db92da7108
+size 16240
diff --git a/checkpoint-603/rng_state_2.pth b/checkpoint-603/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d57f4b1f904b392ef605de094c7e5171fced622
--- /dev/null
+++ b/checkpoint-603/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b32ec8b414a3886bf179af827449dee557e95bfa64a7c20f26c186df2659c9f
+size 16240
diff --git a/checkpoint-603/rng_state_3.pth b/checkpoint-603/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4c8bebc9d459d1ed2d1ab4f27d7ec2da721d0445
--- /dev/null
+++ b/checkpoint-603/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82765e3b8fb57ca7779e75617b51182226eed278593e6441a31510115950353d
+size 16240
diff --git a/checkpoint-603/rng_state_4.pth b/checkpoint-603/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..71f7ca7b0554bc7702f1e276ae0cd3924ffba0d2
--- /dev/null
+++ b/checkpoint-603/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dd2c24e041054b45b5bf8c50512ea8c4552e5f2e877fe798759dec7a7f3aae1
+size 16240
diff --git a/checkpoint-603/rng_state_5.pth b/checkpoint-603/rng_state_5.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2393f7d616bfb4cf0ab81957f29d35b455685a54
--- /dev/null
+++ b/checkpoint-603/rng_state_5.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92b3e1210264272a2020cbcb79f6ade48528f5682dadcecb7a94805779548161
+size 16240
diff --git a/checkpoint-603/rng_state_6.pth b/checkpoint-603/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..46f8e8cc8551391d67e345af829445ad610b17a4
--- /dev/null
+++ b/checkpoint-603/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:556ec0b910e14a1a5ab8fb6a1a16d525b89e31c69dd9b6cd8d4a4cccad65b546
+size 16240
diff --git a/checkpoint-603/rng_state_7.pth b/checkpoint-603/rng_state_7.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b0723b7d69eb2d78f3ee4bdd7f838269f3f845d1
--- /dev/null
+++ b/checkpoint-603/rng_state_7.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e830dc416886fe1aafeacfa75da6baacdbe9a61c66d2f1fbc11417753a516513
+size 16240
diff --git a/checkpoint-603/rng_state_8.pth b/checkpoint-603/rng_state_8.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b9da906954a171d52c0afc8baea75914a9bb9a62
--- /dev/null
+++ b/checkpoint-603/rng_state_8.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80d7cb0002af3e22c063c6751b91836d7e06c4267f7ba8e1912c42d6867e4885
+size 16240
diff --git a/checkpoint-603/scheduler.pt b/checkpoint-603/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ca7b6c97f0d9d22ad737fa4ce94633a5d89d4b35
--- /dev/null
+++ b/checkpoint-603/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cf517ae91e21a035522e0b4a4fedb4101eafa6a9cc5b1728a258fae8d83e6cb
+size 1064
diff --git a/checkpoint-603/trainer_state.json b/checkpoint-603/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..14e538bf4467f81121288a2533b546025d0f9a26
--- /dev/null
+++ b/checkpoint-603/trainer_state.json
@@ -0,0 +1,4242 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4998963730569948,
+  "eval_steps": 500,
+  "global_step": 603,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 27.81778461909011,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.7993,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 28.63833175363421,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.9056,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 25.646828828014854,
+      "learning_rate": 1.5e-06,
+      "loss": 0.8473,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.834124771941388,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.8192,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.558095859980105,
+      "learning_rate": 2.5e-06,
+      "loss": 0.7943,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 7.905789045775758,
+      "learning_rate": 3e-06,
+      "loss": 0.7075,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.259519170268483,
+      "learning_rate": 3.5e-06,
+      "loss": 0.7537,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.639042051048664,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.7471,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.515070932390074,
+      "learning_rate": 4.5e-06,
+      "loss": 0.7689,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.916410424632533,
+      "learning_rate": 5e-06,
+      "loss": 0.7194,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.835046497413255,
+      "learning_rate": 4.9999978617243506e-06,
+      "loss": 0.6949,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 10.065648500649479,
+      "learning_rate": 4.9999914469010585e-06,
+      "loss": 0.7039,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.299372887839679,
+      "learning_rate": 4.999980755541098e-06,
+      "loss": 0.7067,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.693110837094718,
+      "learning_rate": 4.999965787662758e-06,
+      "loss": 0.7126,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.983869635716314,
+      "learning_rate": 4.999946543291642e-06,
+      "loss": 0.6496,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.2561193962441175,
+      "learning_rate": 4.999923022460671e-06,
+      "loss": 0.7036,
+      "step": 16
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.011772824968437,
+      "learning_rate": 4.999895225210079e-06,
+      "loss": 0.7009,
+      "step": 17
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.386638415717137,
+      "learning_rate": 4.9998631515874165e-06,
+      "loss": 0.6624,
+      "step": 18
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.764658092125165,
+      "learning_rate": 4.999826801647551e-06,
+      "loss": 0.6687,
+      "step": 19
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.3982096117966614,
+      "learning_rate": 4.999786175452662e-06,
+      "loss": 0.706,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.8051633678260193,
+      "learning_rate": 4.999741273072246e-06,
+      "loss": 0.7031,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.1177784624332614,
+      "learning_rate": 4.999692094583114e-06,
+      "loss": 0.7525,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.2533819675617806,
+      "learning_rate": 4.9996386400693906e-06,
+      "loss": 0.6767,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.61893793162573,
+      "learning_rate": 4.999580909622518e-06,
+      "loss": 0.6432,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.76057623723569,
+      "learning_rate": 4.999518903341251e-06,
+      "loss": 0.6809,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.27983032069553,
+      "learning_rate": 4.999452621331657e-06,
+      "loss": 0.6798,
+      "step": 26
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.501904568120582,
+      "learning_rate": 4.99938206370712e-06,
+      "loss": 0.6412,
+      "step": 27
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.819229290729669,
+      "learning_rate": 4.999307230588338e-06,
+      "loss": 0.6188,
+      "step": 28
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.1233212322022212,
+      "learning_rate": 4.9992281221033224e-06,
+      "loss": 0.6378,
+      "step": 29
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.7806911906686755,
+      "learning_rate": 4.999144738387396e-06,
+      "loss": 0.6653,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4045490257014563,
+      "learning_rate": 4.999057079583199e-06,
+      "loss": 0.6377,
+      "step": 31
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3803717769210446,
+      "learning_rate": 4.998965145840681e-06,
+      "loss": 0.6855,
+      "step": 32
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3976652879633473,
+      "learning_rate": 4.998868937317106e-06,
+      "loss": 0.6284,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2958541157119727,
+      "learning_rate": 4.998768454177051e-06,
+      "loss": 0.6521,
+      "step": 34
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1925196833696154,
+      "learning_rate": 4.998663696592403e-06,
+      "loss": 0.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.361006042901851,
+      "learning_rate": 4.998554664742362e-06,
+      "loss": 0.6155,
+      "step": 36
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1577758143653614,
+      "learning_rate": 4.998441358813443e-06,
+      "loss": 0.6398,
+      "step": 37
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.219872074512664,
+      "learning_rate": 4.998323778999467e-06,
+      "loss": 0.6051,
+      "step": 38
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.2907501521408546,
+      "learning_rate": 4.9982019255015705e-06,
+      "loss": 0.6337,
+      "step": 39
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.1769862324666183,
+      "learning_rate": 4.9980757985281955e-06,
+      "loss": 0.6606,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.4252479779661607,
+      "learning_rate": 4.997945398295101e-06,
+      "loss": 0.6685,
+      "step": 41
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3929541982084657,
+      "learning_rate": 4.99781072502535e-06,
+      "loss": 0.6084,
+      "step": 42
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.932539969840091,
+      "learning_rate": 4.997671778949318e-06,
+      "loss": 0.6123,
+      "step": 43
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.191742541327873,
+      "learning_rate": 4.997528560304688e-06,
+      "loss": 0.6247,
+      "step": 44
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.423376784566499,
+      "learning_rate": 4.997381069336455e-06,
+      "loss": 0.7024,
+      "step": 45
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0599055392481076,
+      "learning_rate": 4.997229306296918e-06,
+      "loss": 0.6612,
+      "step": 46
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.16832922087532,
+      "learning_rate": 4.997073271445686e-06,
+      "loss": 0.5949,
+      "step": 47
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.0483598654319453,
+      "learning_rate": 4.9969129650496775e-06,
+      "loss": 0.6406,
+      "step": 48
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.963056609139284,
+      "learning_rate": 4.996748387383113e-06,
+      "loss": 0.6361,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.2094923844269307,
+      "learning_rate": 4.996579538727527e-06,
+      "loss": 0.5901,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.1088153449411857,
+      "learning_rate": 4.996406419371749e-06,
+      "loss": 0.6458,
+      "step": 51
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.093448940617732,
+      "learning_rate": 4.996229029611926e-06,
+      "loss": 0.6509,
+      "step": 52
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.075116207412987,
+      "learning_rate": 4.996047369751502e-06,
+      "loss": 0.6295,
+      "step": 53
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.138141165277684,
+      "learning_rate": 4.995861440101229e-06,
+      "loss": 0.6088,
+      "step": 54
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.186316382848445,
+      "learning_rate": 4.995671240979161e-06,
+      "loss": 0.6307,
+      "step": 55
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.2513741083982195,
+      "learning_rate": 4.995476772710657e-06,
+      "loss": 0.6175,
+      "step": 56
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0827167336870596,
+      "learning_rate": 4.995278035628379e-06,
+      "loss": 0.5935,
+      "step": 57
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.117977588574442,
+      "learning_rate": 4.995075030072291e-06,
+      "loss": 0.5998,
+      "step": 58
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0996940200235485,
+      "learning_rate": 4.994867756389658e-06,
+      "loss": 0.6159,
+      "step": 59
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.141096165691323,
+      "learning_rate": 4.994656214935045e-06,
+      "loss": 0.6294,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.022748830058395,
+      "learning_rate": 4.994440406070323e-06,
+      "loss": 0.6315,
+      "step": 61
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.209132168720991,
+      "learning_rate": 4.994220330164654e-06,
+      "loss": 0.5645,
+      "step": 62
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.0994557317862674,
+      "learning_rate": 4.993995987594509e-06,
+      "loss": 0.6272,
+      "step": 63
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.204220831053169,
+      "learning_rate": 4.99376737874365e-06,
+      "loss": 0.6379,
+      "step": 64
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.127733932186697,
+      "learning_rate": 4.993534504003141e-06,
+      "loss": 0.622,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.1338506582034316,
+      "learning_rate": 4.993297363771342e-06,
+      "loss": 0.6259,
+      "step": 66
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.104802764460729,
+      "learning_rate": 4.993055958453912e-06,
+      "loss": 0.6414,
+      "step": 67
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0889535347771675,
+      "learning_rate": 4.9928102884638004e-06,
+      "loss": 0.6466,
+      "step": 68
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.252225316694296,
+      "learning_rate": 4.992560354221258e-06,
+      "loss": 0.6167,
+      "step": 69
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015392533516649,
+      "learning_rate": 4.992306156153827e-06,
+      "loss": 0.5958,
+      "step": 70
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.151741408948778,
+      "learning_rate": 4.992047694696343e-06,
+      "loss": 0.5875,
+      "step": 71
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0351299117412696,
+      "learning_rate": 4.991784970290935e-06,
+      "loss": 0.5935,
+      "step": 72
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0000962363827983,
+      "learning_rate": 4.991517983387026e-06,
+      "loss": 0.6091,
+      "step": 73
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.202881736102415,
+      "learning_rate": 4.99124673444133e-06,
+      "loss": 0.6122,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.015074773396151,
+      "learning_rate": 4.990971223917848e-06,
+      "loss": 0.6134,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.009305960567766,
+      "learning_rate": 4.990691452287877e-06,
+      "loss": 0.6308,
+      "step": 76
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.9967884756310221,
+      "learning_rate": 4.990407420029999e-06,
+      "loss": 0.6098,
+      "step": 77
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.0858738033925905,
+      "learning_rate": 4.990119127630085e-06,
+      "loss": 0.6344,
+      "step": 78
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9427707561903895,
+      "learning_rate": 4.989826575581295e-06,
+      "loss": 0.6049,
+      "step": 79
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.157150584766853,
+      "learning_rate": 4.989529764384073e-06,
+      "loss": 0.5965,
+      "step": 80
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0303527419352583,
+      "learning_rate": 4.989228694546151e-06,
+      "loss": 0.6524,
+      "step": 81
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.128799919475717,
+      "learning_rate": 4.988923366582546e-06,
+      "loss": 0.5524,
+      "step": 82
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0122786280510696,
+      "learning_rate": 4.988613781015557e-06,
+      "loss": 0.6268,
+      "step": 83
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.104580177719229,
+      "learning_rate": 4.988299938374769e-06,
+      "loss": 0.6229,
+      "step": 84
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.3894843860356834,
+      "learning_rate": 4.9879818391970455e-06,
+      "loss": 0.6194,
+      "step": 85
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.9615211372441477,
+      "learning_rate": 4.9876594840265355e-06,
+      "loss": 0.6355,
+      "step": 86
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.4509852093141937,
+      "learning_rate": 4.987332873414666e-06,
+      "loss": 0.6405,
+      "step": 87
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.178942375285086,
+      "learning_rate": 4.987002007920142e-06,
+      "loss": 0.5593,
+      "step": 88
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.2625634345900445,
+      "learning_rate": 4.9866668881089515e-06,
+      "loss": 0.6133,
+      "step": 89
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.363092638811143,
+      "learning_rate": 4.986327514554356e-06,
+      "loss": 0.6298,
+      "step": 90
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0401982492138546,
+      "learning_rate": 4.985983887836894e-06,
+      "loss": 0.6276,
+      "step": 91
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.276956647922478,
+      "learning_rate": 4.985636008544381e-06,
+      "loss": 0.5691,
+      "step": 92
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1072762844110233,
+      "learning_rate": 4.985283877271908e-06,
+      "loss": 0.6175,
+      "step": 93
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.2931866879442637,
+      "learning_rate": 4.984927494621836e-06,
+      "loss": 0.6419,
+      "step": 94
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.112474101166308,
+      "learning_rate": 4.984566861203801e-06,
+      "loss": 0.607,
+      "step": 95
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1816059679212634,
+      "learning_rate": 4.984201977634711e-06,
+      "loss": 0.6136,
+      "step": 96
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0620776369966554,
+      "learning_rate": 4.9838328445387415e-06,
+      "loss": 0.6372,
+      "step": 97
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.147592836641578,
+      "learning_rate": 4.983459462547341e-06,
+      "loss": 0.606,
+      "step": 98
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.1808001877062453,
+      "learning_rate": 4.983081832299224e-06,
+      "loss": 0.6019,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3751999527114087,
+      "learning_rate": 4.98269995444037e-06,
+      "loss": 0.6021,
+      "step": 100
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.8769470206406913,
+      "learning_rate": 4.98231382962403e-06,
+      "loss": 0.6082,
+      "step": 101
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.3060925784921347,
+      "learning_rate": 4.981923458510717e-06,
+      "loss": 0.6174,
+      "step": 102
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1543176832473683,
+      "learning_rate": 4.981528841768206e-06,
+      "loss": 0.6092,
+      "step": 103
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1558689520522547,
+      "learning_rate": 4.981129980071538e-06,
+      "loss": 0.587,
+      "step": 104
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3830532005188383,
+      "learning_rate": 4.980726874103014e-06,
+      "loss": 0.6518,
+      "step": 105
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.3333119576634767,
+      "learning_rate": 4.980319524552195e-06,
+      "loss": 0.6096,
+      "step": 106
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1135146855324214,
+      "learning_rate": 4.9799079321159e-06,
+      "loss": 0.5728,
+      "step": 107
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.2300463384326394,
+      "learning_rate": 4.9794920974982095e-06,
+      "loss": 0.6563,
+      "step": 108
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1745234017525443,
+      "learning_rate": 4.979072021410458e-06,
+      "loss": 0.5968,
+      "step": 109
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.1536586182562334,
+      "learning_rate": 4.978647704571237e-06,
+      "loss": 0.6189,
+      "step": 110
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.193809374687326,
+      "learning_rate": 4.97821914770639e-06,
+      "loss": 0.5864,
+      "step": 111
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.0525896373682047,
+      "learning_rate": 4.977786351549017e-06,
+      "loss": 0.6101,
+      "step": 112
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.216099286618384,
+      "learning_rate": 4.977349316839467e-06,
+      "loss": 0.5984,
+      "step": 113
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.155122255962579,
+      "learning_rate": 4.97690804432534e-06,
+      "loss": 0.6311,
+      "step": 114
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2972101190291374,
+      "learning_rate": 4.976462534761487e-06,
+      "loss": 0.5813,
+      "step": 115
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.9925413745245948,
+      "learning_rate": 4.9760127889100044e-06,
+      "loss": 0.6157,
+      "step": 116
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.2802548684036568,
+      "learning_rate": 4.975558807540238e-06,
+      "loss": 0.6079,
+      "step": 117
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.048888007394621,
+      "learning_rate": 4.9751005914287775e-06,
+      "loss": 0.6467,
+      "step": 118
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.28661640438254,
+      "learning_rate": 4.974638141359456e-06,
+      "loss": 0.6029,
+      "step": 119
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.004056683755783,
+      "learning_rate": 4.974171458123351e-06,
+      "loss": 0.6289,
+      "step": 120
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1628470048067667,
+      "learning_rate": 4.97370054251878e-06,
+      "loss": 0.6139,
+      "step": 121
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.056119895466544,
+      "learning_rate": 4.9732253953513e-06,
+      "loss": 0.5798,
+      "step": 122
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1716513163164275,
+      "learning_rate": 4.972746017433709e-06,
+      "loss": 0.6085,
+      "step": 123
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.255856676525811,
+      "learning_rate": 4.97226240958604e-06,
+      "loss": 0.6342,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.1049280498075373,
+      "learning_rate": 4.971774572635563e-06,
+      "loss": 0.6197,
+      "step": 125
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.133349390995361,
+      "learning_rate": 4.97128250741678e-06,
+      "loss": 0.5751,
+      "step": 126
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.2044887467317578,
+      "learning_rate": 4.97078621477143e-06,
+      "loss": 0.6611,
+      "step": 127
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1413863795698145,
+      "learning_rate": 4.970285695548481e-06,
+      "loss": 0.625,
+      "step": 128
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0229587336296615,
+      "learning_rate": 4.969780950604132e-06,
+      "loss": 0.5989,
+      "step": 129
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0983599595244247,
+      "learning_rate": 4.969271980801808e-06,
+      "loss": 0.5747,
+      "step": 130
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1059041140010786,
+      "learning_rate": 4.9687587870121645e-06,
+      "loss": 0.5869,
+      "step": 131
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8967441614595046,
+      "learning_rate": 4.9682413701130815e-06,
+      "loss": 0.6272,
+      "step": 132
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9976164993621088,
+      "learning_rate": 4.967719730989663e-06,
+      "loss": 0.6282,
+      "step": 133
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.8719131324952145,
+      "learning_rate": 4.967193870534235e-06,
+      "loss": 0.6052,
+      "step": 134
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.071702997476533,
+      "learning_rate": 4.9666637896463455e-06,
+      "loss": 0.5785,
+      "step": 135
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.9549455320048341,
+      "learning_rate": 4.966129489232762e-06,
+      "loss": 0.5739,
+      "step": 136
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.0656898626759315,
+      "learning_rate": 4.9655909702074684e-06,
+      "loss": 0.6651,
+      "step": 137
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.1185948604203038,
+      "learning_rate": 4.965048233491669e-06,
+      "loss": 0.5759,
+      "step": 138
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.08566019272993,
+      "learning_rate": 4.964501280013777e-06,
+      "loss": 0.6271,
+      "step": 139
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.117420903965419,
+      "learning_rate": 4.963950110709425e-06,
+      "loss": 0.5968,
+      "step": 140
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9784944143818486,
+      "learning_rate": 4.963394726521453e-06,
+      "loss": 0.6112,
+      "step": 141
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.077292948039572,
+      "learning_rate": 4.9628351283999144e-06,
+      "loss": 0.5636,
+      "step": 142
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.223803520245629,
+      "learning_rate": 4.962271317302068e-06,
+      "loss": 0.6658,
+      "step": 143
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.039369072186367,
+      "learning_rate": 4.9617032941923796e-06,
+      "loss": 0.5853,
+      "step": 144
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.071470113085907,
+      "learning_rate": 4.961131060042522e-06,
+      "loss": 0.601,
+      "step": 145
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.437470272347474,
+      "learning_rate": 4.960554615831372e-06,
+      "loss": 0.6593,
+      "step": 146
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.178684122927139,
+      "learning_rate": 4.959973962545005e-06,
+      "loss": 0.607,
+      "step": 147
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.097006749956471,
+      "learning_rate": 4.9593891011767e-06,
+      "loss": 0.5873,
+      "step": 148
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.9801202541822784,
+      "learning_rate": 4.958800032726931e-06,
+      "loss": 0.5877,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.30001951085656,
+      "learning_rate": 4.958206758203373e-06,
+      "loss": 0.6368,
+      "step": 150
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.990094260131078,
+      "learning_rate": 4.957609278620891e-06,
+      "loss": 0.59,
+      "step": 151
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.262163752076628,
+      "learning_rate": 4.957007595001548e-06,
+      "loss": 0.5779,
+      "step": 152
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.1970152093220983,
+      "learning_rate": 4.956401708374595e-06,
+      "loss": 0.5894,
+      "step": 153
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.220825872684071,
+      "learning_rate": 4.9557916197764745e-06,
+      "loss": 0.6528,
+      "step": 154
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.099472677591387,
+      "learning_rate": 4.955177330250817e-06,
+      "loss": 0.5798,
+      "step": 155
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.159203936881569,
+      "learning_rate": 4.954558840848437e-06,
+      "loss": 0.6206,
+      "step": 156
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.185152414039555,
+      "learning_rate": 4.953936152627338e-06,
+      "loss": 0.5624,
+      "step": 157
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.0679748168992624,
+      "learning_rate": 4.953309266652701e-06,
+      "loss": 0.5859,
+      "step": 158
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.327237187255128,
+      "learning_rate": 4.952678183996891e-06,
+      "loss": 0.5632,
+      "step": 159
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.2865519679977417,
+      "learning_rate": 4.952042905739451e-06,
+      "loss": 0.6965,
+      "step": 160
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.523435408018699,
+      "learning_rate": 4.9514034329671e-06,
+      "loss": 0.6217,
+      "step": 161
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.4992653226709636,
+      "learning_rate": 4.950759766773734e-06,
+      "loss": 0.6175,
+      "step": 162
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.432752824777114,
+      "learning_rate": 4.950111908260423e-06,
+      "loss": 0.5862,
+      "step": 163
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.137500912204061,
+      "learning_rate": 4.949459858535404e-06,
+      "loss": 0.6124,
+      "step": 164
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.2226376224120474,
+      "learning_rate": 4.94880361871409e-06,
+      "loss": 0.5891,
+      "step": 165
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3821839805775165,
+      "learning_rate": 4.9481431899190544e-06,
+      "loss": 0.6008,
+      "step": 166
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.306242834684614,
+      "learning_rate": 4.947478573280044e-06,
+      "loss": 0.6159,
+      "step": 167
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.3298092236851518,
+      "learning_rate": 4.946809769933963e-06,
+      "loss": 0.5809,
+      "step": 168
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.364296499621558,
+      "learning_rate": 4.946136781024883e-06,
+      "loss": 0.5895,
+      "step": 169
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.237241095609228,
+      "learning_rate": 4.945459607704029e-06,
+      "loss": 0.6144,
+      "step": 170
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.4027419761972264,
+      "learning_rate": 4.9447782511297905e-06,
+      "loss": 0.5985,
+      "step": 171
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1547059182244284,
+      "learning_rate": 4.944092712467709e-06,
+      "loss": 0.5763,
+      "step": 172
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.1530221667047984,
+      "learning_rate": 4.9434029928904805e-06,
+      "loss": 0.5692,
+      "step": 173
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.228588593294869,
+      "learning_rate": 4.942709093577954e-06,
+      "loss": 0.5896,
+      "step": 174
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1597295307130198,
+      "learning_rate": 4.942011015717129e-06,
+      "loss": 0.5864,
+      "step": 175
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.321140955498194,
+      "learning_rate": 4.941308760502149e-06,
+      "loss": 0.6089,
+      "step": 176
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.220124736460707,
+      "learning_rate": 4.940602329134309e-06,
+      "loss": 0.5786,
+      "step": 177
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1698038563080417,
+      "learning_rate": 4.939891722822043e-06,
+      "loss": 0.5749,
+      "step": 178
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.244425969121411,
+      "learning_rate": 4.93917694278093e-06,
+      "loss": 0.5877,
+      "step": 179
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.143920008069458,
+      "learning_rate": 4.938457990233687e-06,
+      "loss": 0.6024,
+      "step": 180
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.1786040820345813,
+      "learning_rate": 4.937734866410169e-06,
+      "loss": 0.5845,
+      "step": 181
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.301832824481007,
+      "learning_rate": 4.9370075725473665e-06,
+      "loss": 0.6182,
+      "step": 182
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.3748033727083997,
+      "learning_rate": 4.936276109889403e-06,
+      "loss": 0.6073,
+      "step": 183
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.476334487382023,
+      "learning_rate": 4.935540479687534e-06,
+      "loss": 0.5793,
+      "step": 184
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.2509466352322494,
+      "learning_rate": 4.934800683200143e-06,
+      "loss": 0.6133,
+      "step": 185
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.8391697547684873,
+      "learning_rate": 4.934056721692742e-06,
+      "loss": 0.5967,
+      "step": 186
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.4492364225391765,
+      "learning_rate": 4.933308596437965e-06,
+      "loss": 0.5676,
+      "step": 187
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.685548141821295,
+      "learning_rate": 4.932556308715573e-06,
+      "loss": 0.6069,
+      "step": 188
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.261217637824808,
+      "learning_rate": 4.931799859812443e-06,
+      "loss": 0.6411,
+      "step": 189
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3838284395200966,
+      "learning_rate": 4.931039251022573e-06,
+      "loss": 0.5745,
+      "step": 190
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.2550921344466164,
+      "learning_rate": 4.930274483647074e-06,
+      "loss": 0.5989,
+      "step": 191
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.078406234527636,
+      "learning_rate": 4.929505558994175e-06,
+      "loss": 0.5998,
+      "step": 192
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.592864566091496,
+      "learning_rate": 4.928732478379214e-06,
+      "loss": 0.5842,
+      "step": 193
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.092752299259724,
+      "learning_rate": 4.927955243124638e-06,
+      "loss": 0.5789,
+      "step": 194
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.3799311595696966,
+      "learning_rate": 4.927173854560002e-06,
+      "loss": 0.6265,
+      "step": 195
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.246876688010602,
+      "learning_rate": 4.926388314021964e-06,
+      "loss": 0.6126,
+      "step": 196
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1409898276704578,
+      "learning_rate": 4.925598622854287e-06,
+      "loss": 0.6073,
+      "step": 197
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.5946158421875385,
+      "learning_rate": 4.924804782407834e-06,
+      "loss": 0.6154,
+      "step": 198
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 2.1225494320427982,
+      "learning_rate": 4.924006794040562e-06,
+      "loss": 0.583,
+      "step": 199
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1971323526291338,
+      "learning_rate": 4.923204659117528e-06,
+      "loss": 0.6078,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.289185506404785,
+      "learning_rate": 4.92239837901088e-06,
+      "loss": 0.6127,
+      "step": 201
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0071007751625354,
+      "learning_rate": 4.921587955099858e-06,
+      "loss": 0.5804,
+      "step": 202
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.2981840149068247,
+      "learning_rate": 4.920773388770789e-06,
+      "loss": 0.6027,
+      "step": 203
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.236179116886702,
+      "learning_rate": 4.919954681417087e-06,
+      "loss": 0.6179,
+      "step": 204
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.007422589251611,
+      "learning_rate": 4.91913183443925e-06,
+      "loss": 0.5647,
+      "step": 205
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.1402813555735483,
+      "learning_rate": 4.918304849244857e-06,
+      "loss": 0.5841,
+      "step": 206
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.0456415785177104,
+      "learning_rate": 4.917473727248565e-06,
+      "loss": 0.5524,
+      "step": 207
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9673558126020942,
+      "learning_rate": 4.916638469872109e-06,
+      "loss": 0.5698,
+      "step": 208
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.015111672496819,
+      "learning_rate": 4.9157990785442964e-06,
+      "loss": 0.5957,
+      "step": 209
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.9502065547578398,
+      "learning_rate": 4.9149555547010086e-06,
+      "loss": 0.5592,
+      "step": 210
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 2.167936522558899,
+      "learning_rate": 4.9141078997851945e-06,
+      "loss": 0.5705,
+      "step": 211
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.2066587458997935,
+      "learning_rate": 4.91325611524687e-06,
+      "loss": 0.5526,
+      "step": 212
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9132995625903553,
+      "learning_rate": 4.9124002025431136e-06,
+      "loss": 0.5767,
+      "step": 213
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.0097281107801277,
+      "learning_rate": 4.91154016313807e-06,
+      "loss": 0.6185,
+      "step": 214
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.023532008241332,
+      "learning_rate": 4.910675998502938e-06,
+      "loss": 0.6005,
+      "step": 215
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9253831001776973,
+      "learning_rate": 4.909807710115977e-06,
+      "loss": 0.5769,
+      "step": 216
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.066862408842564,
+      "learning_rate": 4.908935299462497e-06,
+      "loss": 0.5671,
+      "step": 217
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9412704290792853,
+      "learning_rate": 4.908058768034862e-06,
+      "loss": 0.5568,
+      "step": 218
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.185994457097553,
+      "learning_rate": 4.907178117332487e-06,
+      "loss": 0.5621,
+      "step": 219
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.021517127546353,
+      "learning_rate": 4.906293348861829e-06,
+      "loss": 0.5672,
+      "step": 220
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.099703967072734,
+      "learning_rate": 4.905404464136391e-06,
+      "loss": 0.5366,
+      "step": 221
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.030197056583618,
+      "learning_rate": 4.904511464676718e-06,
+      "loss": 0.6064,
+      "step": 222
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.4170102988954896,
+      "learning_rate": 4.903614352010393e-06,
+      "loss": 0.5919,
+      "step": 223
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0819468873015476,
+      "learning_rate": 4.9027131276720355e-06,
+      "loss": 0.5366,
+      "step": 224
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.148008018153629,
+      "learning_rate": 4.901807793203299e-06,
+      "loss": 0.597,
+      "step": 225
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0303725862017186,
+      "learning_rate": 4.900898350152866e-06,
+      "loss": 0.6394,
+      "step": 226
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1598989214704334,
+      "learning_rate": 4.899984800076449e-06,
+      "loss": 0.5932,
+      "step": 227
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.0816312637185255,
+      "learning_rate": 4.899067144536786e-06,
+      "loss": 0.5909,
+      "step": 228
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9024067197329315,
+      "learning_rate": 4.8981453851036365e-06,
+      "loss": 0.5463,
+      "step": 229
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1830926868871043,
+      "learning_rate": 4.897219523353781e-06,
+      "loss": 0.5821,
+      "step": 230
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1156269612794016,
+      "learning_rate": 4.8962895608710195e-06,
+      "loss": 0.5993,
+      "step": 231
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.9653407654210864,
+      "learning_rate": 4.895355499246162e-06,
+      "loss": 0.5525,
+      "step": 232
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.367769051061897,
+      "learning_rate": 4.894417340077036e-06,
+      "loss": 0.5683,
+      "step": 233
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.078327064466567,
+      "learning_rate": 4.893475084968474e-06,
+      "loss": 0.6184,
+      "step": 234
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.1661882731589475,
+      "learning_rate": 4.8925287355323195e-06,
+      "loss": 0.6321,
+      "step": 235
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.182760952002799,
+      "learning_rate": 4.891578293387413e-06,
+      "loss": 0.6254,
+      "step": 236
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.998723579962691,
+      "learning_rate": 4.890623760159605e-06,
+      "loss": 0.5371,
+      "step": 237
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.319922346931926,
+      "learning_rate": 4.8896651374817365e-06,
+      "loss": 0.5941,
+      "step": 238
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.090735197217999,
+      "learning_rate": 4.888702426993648e-06,
+      "loss": 0.577,
+      "step": 239
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1247199987228558,
+      "learning_rate": 4.887735630342173e-06,
+      "loss": 0.5928,
+      "step": 240
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.33151114429804,
+      "learning_rate": 4.8867647491811315e-06,
+      "loss": 0.5838,
+      "step": 241
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.1570026356289147,
+      "learning_rate": 4.885789785171334e-06,
+      "loss": 0.5642,
+      "step": 242
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.049571197047368,
+      "learning_rate": 4.884810739980575e-06,
+      "loss": 0.6684,
+      "step": 243
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.9810062424466381,
+      "learning_rate": 4.883827615283626e-06,
+      "loss": 0.5942,
+      "step": 244
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.145869663660159,
+      "learning_rate": 4.882840412762244e-06,
+      "loss": 0.6356,
+      "step": 245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.19290302186514,
+      "learning_rate": 4.881849134105156e-06,
+      "loss": 0.6189,
+      "step": 246
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.0561043419872984,
+      "learning_rate": 4.880853781008062e-06,
+      "loss": 0.5563,
+      "step": 247
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.8831183793224635,
+      "learning_rate": 4.879854355173638e-06,
+      "loss": 0.5522,
+      "step": 248
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.020981606684741,
+      "learning_rate": 4.878850858311518e-06,
+      "loss": 0.5548,
+      "step": 249
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.060242570493272,
+      "learning_rate": 4.877843292138307e-06,
+      "loss": 0.5715,
+      "step": 250
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.082455778933014,
+      "learning_rate": 4.8768316583775665e-06,
+      "loss": 0.5959,
+      "step": 251
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9830929719438626,
+      "learning_rate": 4.875815958759819e-06,
+      "loss": 0.5813,
+      "step": 252
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.9772267506828567,
+      "learning_rate": 4.8747961950225406e-06,
+      "loss": 0.539,
+      "step": 253
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.1492561995002104,
+      "learning_rate": 4.873772368910161e-06,
+      "loss": 0.6059,
+      "step": 254
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.253757247139787,
+      "learning_rate": 4.872744482174058e-06,
+      "loss": 0.5897,
+      "step": 255
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.3282624851882496,
+      "learning_rate": 4.8717125365725545e-06,
+      "loss": 0.5675,
+      "step": 256
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.15573581133063,
+      "learning_rate": 4.8706765338709185e-06,
+      "loss": 0.5958,
+      "step": 257
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.073289220218241,
+      "learning_rate": 4.869636475841358e-06,
+      "loss": 0.6052,
+      "step": 258
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.293714090249444,
+      "learning_rate": 4.8685923642630165e-06,
+      "loss": 0.5786,
+      "step": 259
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9496544276539172,
+      "learning_rate": 4.867544200921974e-06,
+      "loss": 0.6163,
+      "step": 260
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.5267016753690132,
+      "learning_rate": 4.866491987611239e-06,
+      "loss": 0.6223,
+      "step": 261
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.8731249445320794,
+      "learning_rate": 4.865435726130751e-06,
+      "loss": 0.5632,
+      "step": 262
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.3586331105798863,
+      "learning_rate": 4.86437541828737e-06,
+      "loss": 0.5769,
+      "step": 263
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0258106914510585,
+      "learning_rate": 4.863311065894883e-06,
+      "loss": 0.6103,
+      "step": 264
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.2543614390885955,
+      "learning_rate": 4.862242670773991e-06,
+      "loss": 0.5844,
+      "step": 265
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9440299381244668,
+      "learning_rate": 4.861170234752314e-06,
+      "loss": 0.5559,
+      "step": 266
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.254538268495492,
+      "learning_rate": 4.8600937596643815e-06,
+      "loss": 0.5709,
+      "step": 267
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.007651746385687,
+      "learning_rate": 4.8590132473516346e-06,
+      "loss": 0.573,
+      "step": 268
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.0735253118288837,
+      "learning_rate": 4.857928699662421e-06,
+      "loss": 0.5954,
+      "step": 269
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.024775417101569,
+      "learning_rate": 4.856840118451989e-06,
+      "loss": 0.5992,
+      "step": 270
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 2.1043310699945814,
+      "learning_rate": 4.855747505582488e-06,
+      "loss": 0.6507,
+      "step": 271
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0386353328313214,
+      "learning_rate": 4.854650862922965e-06,
+      "loss": 0.5666,
+      "step": 272
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.978698841367705,
+      "learning_rate": 4.853550192349358e-06,
+      "loss": 0.5593,
+      "step": 273
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.9386534247633986,
+      "learning_rate": 4.852445495744497e-06,
+      "loss": 0.5735,
+      "step": 274
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.049346245018599,
+      "learning_rate": 4.8513367749981e-06,
+      "loss": 0.5415,
+      "step": 275
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1051969521216605,
+      "learning_rate": 4.850224032006765e-06,
+      "loss": 0.5532,
+      "step": 276
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2006792558872315,
+      "learning_rate": 4.849107268673975e-06,
+      "loss": 0.5696,
+      "step": 277
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.0460787736353647,
+      "learning_rate": 4.847986486910088e-06,
+      "loss": 0.5658,
+      "step": 278
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1161843259225406,
+      "learning_rate": 4.846861688632336e-06,
+      "loss": 0.583,
+      "step": 279
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.8882198480393542,
+      "learning_rate": 4.8457328757648224e-06,
+      "loss": 0.5693,
+      "step": 280
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.1578413701109596,
+      "learning_rate": 4.844600050238517e-06,
+      "loss": 0.5409,
+      "step": 281
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.03912467778954,
+      "learning_rate": 4.843463213991255e-06,
+      "loss": 0.5908,
+      "step": 282
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.2333462480826247,
+      "learning_rate": 4.842322368967731e-06,
+      "loss": 0.6088,
+      "step": 283
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.06698702157327,
+      "learning_rate": 4.8411775171194986e-06,
+      "loss": 0.5953,
+      "step": 284
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1433923121572045,
+      "learning_rate": 4.840028660404964e-06,
+      "loss": 0.5851,
+      "step": 285
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.214858780835041,
+      "learning_rate": 4.838875800789386e-06,
+      "loss": 0.5913,
+      "step": 286
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.038128612492624,
+      "learning_rate": 4.837718940244871e-06,
+      "loss": 0.5827,
+      "step": 287
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9894065096959768,
+      "learning_rate": 4.836558080750365e-06,
+      "loss": 0.5769,
+      "step": 288
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.1711590153285822,
+      "learning_rate": 4.835393224291662e-06,
+      "loss": 0.654,
+      "step": 289
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.105004451988696,
+      "learning_rate": 4.834224372861386e-06,
+      "loss": 0.6158,
+      "step": 290
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9554568023729102,
+      "learning_rate": 4.833051528459001e-06,
+      "loss": 0.5807,
+      "step": 291
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2693917834500312,
+      "learning_rate": 4.831874693090797e-06,
+      "loss": 0.5557,
+      "step": 292
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.9081391627126192,
+      "learning_rate": 4.830693868769892e-06,
+      "loss": 0.6057,
+      "step": 293
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.2133664110768585,
+      "learning_rate": 4.82950905751623e-06,
+      "loss": 0.6103,
+      "step": 294
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 2.015392814211589,
+      "learning_rate": 4.8283202613565735e-06,
+      "loss": 0.5578,
+      "step": 295
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.142124020349717,
+      "learning_rate": 4.8271274823245e-06,
+      "loss": 0.5675,
+      "step": 296
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.981611826462286,
+      "learning_rate": 4.825930722460405e-06,
+      "loss": 0.5696,
+      "step": 297
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.966759748348117,
+      "learning_rate": 4.824729983811486e-06,
+      "loss": 0.58,
+      "step": 298
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0117040369769397,
+      "learning_rate": 4.823525268431754e-06,
+      "loss": 0.6005,
+      "step": 299
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9579664917991193,
+      "learning_rate": 4.822316578382019e-06,
+      "loss": 0.5472,
+      "step": 300
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9075723479635032,
+      "learning_rate": 4.821103915729892e-06,
+      "loss": 0.5834,
+      "step": 301
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.289340229011896,
+      "learning_rate": 4.819887282549777e-06,
+      "loss": 0.6088,
+      "step": 302
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.0410700553735235,
+      "learning_rate": 4.818666680922874e-06,
+      "loss": 0.5449,
+      "step": 303
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.074434792511819,
+      "learning_rate": 4.8174421129371675e-06,
+      "loss": 0.5826,
+      "step": 304
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.1377170527698865,
+      "learning_rate": 4.816213580687428e-06,
+      "loss": 0.6262,
+      "step": 305
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.060340839248083,
+      "learning_rate": 4.814981086275209e-06,
+      "loss": 0.5479,
+      "step": 306
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.007036467413588,
+      "learning_rate": 4.813744631808841e-06,
+      "loss": 0.5642,
+      "step": 307
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.016779606220332,
+      "learning_rate": 4.8125042194034285e-06,
+      "loss": 0.5503,
+      "step": 308
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.930004252757651,
+      "learning_rate": 4.811259851180845e-06,
+      "loss": 0.582,
+      "step": 309
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9179477992752856,
+      "learning_rate": 4.810011529269734e-06,
+      "loss": 0.5678,
+      "step": 310
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.023430757276848,
+      "learning_rate": 4.808759255805498e-06,
+      "loss": 0.614,
+      "step": 311
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.8334738409404936,
+      "learning_rate": 4.807503032930306e-06,
+      "loss": 0.5742,
+      "step": 312
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.937332706274502,
+      "learning_rate": 4.806242862793075e-06,
+      "loss": 0.6257,
+      "step": 313
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0265383045700363,
+      "learning_rate": 4.8049787475494786e-06,
+      "loss": 0.5733,
+      "step": 314
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.056444039073761,
+      "learning_rate": 4.803710689361939e-06,
+      "loss": 0.578,
+      "step": 315
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.411132719183335,
+      "learning_rate": 4.802438690399622e-06,
+      "loss": 0.5778,
+      "step": 316
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0233969242222853,
+      "learning_rate": 4.801162752838436e-06,
+      "loss": 0.5649,
+      "step": 317
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.2809121915132815,
+      "learning_rate": 4.799882878861025e-06,
+      "loss": 0.5589,
+      "step": 318
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.9806834041020271,
+      "learning_rate": 4.798599070656768e-06,
+      "loss": 0.5753,
+      "step": 319
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.095099671577702,
+      "learning_rate": 4.797311330421773e-06,
+      "loss": 0.5644,
+      "step": 320
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1697606190375764,
+      "learning_rate": 4.796019660358877e-06,
+      "loss": 0.6009,
+      "step": 321
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9549416103216173,
+      "learning_rate": 4.794724062677635e-06,
+      "loss": 0.5429,
+      "step": 322
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9986949357292838,
+      "learning_rate": 4.793424539594323e-06,
+      "loss": 0.5456,
+      "step": 323
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.9414831957796765,
+      "learning_rate": 4.792121093331935e-06,
+      "loss": 0.5468,
+      "step": 324
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.100702188933012,
+      "learning_rate": 4.7908137261201685e-06,
+      "loss": 0.5763,
+      "step": 325
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.2747471285831025,
+      "learning_rate": 4.789502440195436e-06,
+      "loss": 0.5637,
+      "step": 326
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8996382919319124,
+      "learning_rate": 4.788187237800849e-06,
+      "loss": 0.5285,
+      "step": 327
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.3451495174978847,
+      "learning_rate": 4.786868121186218e-06,
+      "loss": 0.5638,
+      "step": 328
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.0437536068229565,
+      "learning_rate": 4.7855450926080535e-06,
+      "loss": 0.5282,
+      "step": 329
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.1185488514745554,
+      "learning_rate": 4.784218154329555e-06,
+      "loss": 0.5689,
+      "step": 330
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 2.08745956731504,
+      "learning_rate": 4.78288730862061e-06,
+      "loss": 0.5772,
+      "step": 331
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9479507156354359,
+      "learning_rate": 4.781552557757789e-06,
+      "loss": 0.5419,
+      "step": 332
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0211480847937255,
+      "learning_rate": 4.780213904024346e-06,
+      "loss": 0.5757,
+      "step": 333
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9075335749936069,
+      "learning_rate": 4.7788713497102094e-06,
+      "loss": 0.5693,
+      "step": 334
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9590727137410602,
+      "learning_rate": 4.777524897111979e-06,
+      "loss": 0.5501,
+      "step": 335
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0328480247612752,
+      "learning_rate": 4.776174548532926e-06,
+      "loss": 0.587,
+      "step": 336
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.062540517496736,
+      "learning_rate": 4.774820306282982e-06,
+      "loss": 0.5819,
+      "step": 337
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0054452800156195,
+      "learning_rate": 4.773462172678744e-06,
+      "loss": 0.5529,
+      "step": 338
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9641125644599562,
+      "learning_rate": 4.772100150043462e-06,
+      "loss": 0.5895,
+      "step": 339
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9196744569285298,
+      "learning_rate": 4.77073424070704e-06,
+      "loss": 0.5504,
+      "step": 340
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0002752186146484,
+      "learning_rate": 4.76936444700603e-06,
+      "loss": 0.5307,
+      "step": 341
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.1068919823054344,
+      "learning_rate": 4.76799077128363e-06,
+      "loss": 0.5908,
+      "step": 342
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.919597745459612,
+      "learning_rate": 4.766613215889678e-06,
+      "loss": 0.5423,
+      "step": 343
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0670928578728716,
+      "learning_rate": 4.765231783180648e-06,
+      "loss": 0.5901,
+      "step": 344
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.906116148793229,
+      "learning_rate": 4.763846475519648e-06,
+      "loss": 0.5919,
+      "step": 345
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9133575268702454,
+      "learning_rate": 4.762457295276413e-06,
+      "loss": 0.585,
+      "step": 346
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.133902651855379,
+      "learning_rate": 4.7610642448273025e-06,
+      "loss": 0.5444,
+      "step": 347
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.95222194640397,
+      "learning_rate": 4.7596673265552985e-06,
+      "loss": 0.5941,
+      "step": 348
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.095010268380277,
+      "learning_rate": 4.758266542849997e-06,
+      "loss": 0.6045,
+      "step": 349
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.0493864712059655,
+      "learning_rate": 4.756861896107609e-06,
+      "loss": 0.6011,
+      "step": 350
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9222198823064967,
+      "learning_rate": 4.755453388730949e-06,
+      "loss": 0.5521,
+      "step": 351
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.368147154955994,
+      "learning_rate": 4.754041023129442e-06,
+      "loss": 0.6117,
+      "step": 352
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9734596786106697,
+      "learning_rate": 4.752624801719108e-06,
+      "loss": 0.5727,
+      "step": 353
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.151510566977991,
+      "learning_rate": 4.751204726922564e-06,
+      "loss": 0.6085,
+      "step": 354
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.9291219072892685,
+      "learning_rate": 4.74978080116902e-06,
+      "loss": 0.5655,
+      "step": 355
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.838592559018919,
+      "learning_rate": 4.748353026894273e-06,
+      "loss": 0.5508,
+      "step": 356
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.069156589116884,
+      "learning_rate": 4.7469214065407e-06,
+      "loss": 0.5942,
+      "step": 357
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8960817746615841,
+      "learning_rate": 4.745485942557264e-06,
+      "loss": 0.5902,
+      "step": 358
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.0606557307859634,
+      "learning_rate": 4.744046637399497e-06,
+      "loss": 0.556,
+      "step": 359
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9660065879130573,
+      "learning_rate": 4.742603493529505e-06,
+      "loss": 0.5364,
+      "step": 360
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9647921383638112,
+      "learning_rate": 4.741156513415958e-06,
+      "loss": 0.5601,
+      "step": 361
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.049074688423064,
+      "learning_rate": 4.739705699534092e-06,
+      "loss": 0.556,
+      "step": 362
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.962593945802751,
+      "learning_rate": 4.738251054365697e-06,
+      "loss": 0.5609,
+      "step": 363
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.059675349950347,
+      "learning_rate": 4.736792580399119e-06,
+      "loss": 0.5499,
+      "step": 364
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8479566025134508,
+      "learning_rate": 4.7353302801292555e-06,
+      "loss": 0.5621,
+      "step": 365
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9405450724813613,
+      "learning_rate": 4.733864156057545e-06,
+      "loss": 0.5437,
+      "step": 366
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.122487864033456,
+      "learning_rate": 4.7323942106919715e-06,
+      "loss": 0.5984,
+      "step": 367
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.6822841144123046,
+      "learning_rate": 4.730920446547052e-06,
+      "loss": 0.5951,
+      "step": 368
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.001405394086718,
+      "learning_rate": 4.729442866143838e-06,
+      "loss": 0.5552,
+      "step": 369
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.081154186949651,
+      "learning_rate": 4.72796147200991e-06,
+      "loss": 0.587,
+      "step": 370
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.1196544292473236,
+      "learning_rate": 4.72647626667937e-06,
+      "loss": 0.5882,
+      "step": 371
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.107445583509131,
+      "learning_rate": 4.724987252692841e-06,
+      "loss": 0.5389,
+      "step": 372
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9529785007256542,
+      "learning_rate": 4.723494432597462e-06,
+      "loss": 0.6439,
+      "step": 373
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.11513441515607,
+      "learning_rate": 4.72199780894688e-06,
+      "loss": 0.6089,
+      "step": 374
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.9769899713721226,
+      "learning_rate": 4.7204973843012504e-06,
+      "loss": 0.5393,
+      "step": 375
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.063749623036316,
+      "learning_rate": 4.718993161227231e-06,
+      "loss": 0.5987,
+      "step": 376
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.0515862288253883,
+      "learning_rate": 4.717485142297977e-06,
+      "loss": 0.5772,
+      "step": 377
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.8962297741946081,
+      "learning_rate": 4.715973330093135e-06,
+      "loss": 0.5424,
+      "step": 378
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 2.2210958340400087,
+      "learning_rate": 4.7144577271988435e-06,
+      "loss": 0.6072,
+      "step": 379
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.067113337475314,
+      "learning_rate": 4.712938336207724e-06,
+      "loss": 0.5482,
+      "step": 380
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.8985489253954526,
+      "learning_rate": 4.711415159718876e-06,
+      "loss": 0.5593,
+      "step": 381
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.085236381118245,
+      "learning_rate": 4.709888200337879e-06,
+      "loss": 0.5704,
+      "step": 382
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0967664183909784,
+      "learning_rate": 4.708357460676779e-06,
+      "loss": 0.5997,
+      "step": 383
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0454278026009645,
+      "learning_rate": 4.706822943354092e-06,
+      "loss": 0.5669,
+      "step": 384
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9171673309342674,
+      "learning_rate": 4.705284650994793e-06,
+      "loss": 0.517,
+      "step": 385
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.2003223432761287,
+      "learning_rate": 4.70374258623032e-06,
+      "loss": 0.5957,
+      "step": 386
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.936392519491186,
+      "learning_rate": 4.702196751698557e-06,
+      "loss": 0.5767,
+      "step": 387
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.354272003403086,
+      "learning_rate": 4.700647150043841e-06,
+      "loss": 0.6515,
+      "step": 388
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9115059027323418,
+      "learning_rate": 4.699093783916955e-06,
+      "loss": 0.5579,
+      "step": 389
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9878827587010002,
+      "learning_rate": 4.697536655975115e-06,
+      "loss": 0.572,
+      "step": 390
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.9729552535473858,
+      "learning_rate": 4.69597576888198e-06,
+      "loss": 0.5665,
+      "step": 391
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.177634366499155,
+      "learning_rate": 4.694411125307632e-06,
+      "loss": 0.6363,
+      "step": 392
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8955146664976508,
+      "learning_rate": 4.692842727928584e-06,
+      "loss": 0.5682,
+      "step": 393
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.175305874476245,
+      "learning_rate": 4.691270579427769e-06,
+      "loss": 0.5943,
+      "step": 394
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.068140527232831,
+      "learning_rate": 4.689694682494537e-06,
+      "loss": 0.5659,
+      "step": 395
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9112960694448755,
+      "learning_rate": 4.688115039824648e-06,
+      "loss": 0.6048,
+      "step": 396
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9778305624626604,
+      "learning_rate": 4.686531654120272e-06,
+      "loss": 0.5695,
+      "step": 397
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.096904163204813,
+      "learning_rate": 4.684944528089981e-06,
+      "loss": 0.6113,
+      "step": 398
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.0011934144948516,
+      "learning_rate": 4.683353664448745e-06,
+      "loss": 0.5568,
+      "step": 399
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8562851971757464,
+      "learning_rate": 4.681759065917929e-06,
+      "loss": 0.5474,
+      "step": 400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8190547574166316,
+      "learning_rate": 4.680160735225285e-06,
+      "loss": 0.5315,
+      "step": 401
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.9247862956929132,
+      "learning_rate": 4.6785586751049505e-06,
+      "loss": 0.5568,
+      "step": 402
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.8469793674077621,
+      "learning_rate": 4.676952888297442e-06,
+      "loss": 0.5811,
+      "step": 403
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.946943145198674,
+      "learning_rate": 4.675343377549653e-06,
+      "loss": 0.5475,
+      "step": 404
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.991304422730463,
+      "learning_rate": 4.6737301456148445e-06,
+      "loss": 0.5856,
+      "step": 405
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9168241989446437,
+      "learning_rate": 4.672113195252644e-06,
+      "loss": 0.6069,
+      "step": 406
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9305433665377905,
+      "learning_rate": 4.670492529229039e-06,
+      "loss": 0.5536,
+      "step": 407
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8441008898830742,
+      "learning_rate": 4.668868150316377e-06,
+      "loss": 0.5859,
+      "step": 408
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8879301596961315,
+      "learning_rate": 4.667240061293351e-06,
+      "loss": 0.5483,
+      "step": 409
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.024767417636281,
+      "learning_rate": 4.665608264945004e-06,
+      "loss": 0.5414,
+      "step": 410
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.1331610141797395,
+      "learning_rate": 4.663972764062722e-06,
+      "loss": 0.5811,
+      "step": 411
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8132480265817386,
+      "learning_rate": 4.662333561444226e-06,
+      "loss": 0.5573,
+      "step": 412
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.9795813972027145,
+      "learning_rate": 4.6606906598935675e-06,
+      "loss": 0.5814,
+      "step": 413
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8782931074297053,
+      "learning_rate": 4.6590440622211295e-06,
+      "loss": 0.569,
+      "step": 414
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8219945335518706,
+      "learning_rate": 4.657393771243614e-06,
+      "loss": 0.5669,
+      "step": 415
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 2.4047268604371306,
+      "learning_rate": 4.6557397897840454e-06,
+      "loss": 0.5602,
+      "step": 416
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.064501780523946,
+      "learning_rate": 4.654082120671757e-06,
+      "loss": 0.5699,
+      "step": 417
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9183128854940252,
+      "learning_rate": 4.65242076674239e-06,
+      "loss": 0.6112,
+      "step": 418
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9315698971629633,
+      "learning_rate": 4.650755730837894e-06,
+      "loss": 0.5537,
+      "step": 419
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9527809333659218,
+      "learning_rate": 4.649087015806509e-06,
+      "loss": 0.5423,
+      "step": 420
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8940523915995442,
+      "learning_rate": 4.647414624502777e-06,
+      "loss": 0.5708,
+      "step": 421
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9976964785548623,
+      "learning_rate": 4.645738559787524e-06,
+      "loss": 0.6006,
+      "step": 422
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9098681403283917,
+      "learning_rate": 4.64405882452786e-06,
+      "loss": 0.5591,
+      "step": 423
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8695612182804557,
+      "learning_rate": 4.642375421597175e-06,
+      "loss": 0.5219,
+      "step": 424
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8912077704810082,
+      "learning_rate": 4.6406883538751315e-06,
+      "loss": 0.5224,
+      "step": 425
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.9390714726978922,
+      "learning_rate": 4.638997624247664e-06,
+      "loss": 0.5359,
+      "step": 426
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.051545992296337,
+      "learning_rate": 4.637303235606968e-06,
+      "loss": 0.544,
+      "step": 427
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.0657109136265914,
+      "learning_rate": 4.6356051908515e-06,
+      "loss": 0.5429,
+      "step": 428
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0301022307984793,
+      "learning_rate": 4.63390349288597e-06,
+      "loss": 0.5787,
+      "step": 429
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.052515756169346,
+      "learning_rate": 4.632198144621338e-06,
+      "loss": 0.5778,
+      "step": 430
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9741370495474897,
+      "learning_rate": 4.630489148974807e-06,
+      "loss": 0.5142,
+      "step": 431
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9713229498863698,
+      "learning_rate": 4.62877650886982e-06,
+      "loss": 0.6127,
+      "step": 432
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.1609440121306007,
+      "learning_rate": 4.627060227236055e-06,
+      "loss": 0.5886,
+      "step": 433
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.944966445355139,
+      "learning_rate": 4.625340307009418e-06,
+      "loss": 0.5657,
+      "step": 434
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.031003925680835,
+      "learning_rate": 4.623616751132041e-06,
+      "loss": 0.5628,
+      "step": 435
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8774113373137704,
+      "learning_rate": 4.621889562552272e-06,
+      "loss": 0.6068,
+      "step": 436
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0385201543401785,
+      "learning_rate": 4.620158744224677e-06,
+      "loss": 0.5511,
+      "step": 437
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8440750841938207,
+      "learning_rate": 4.618424299110028e-06,
+      "loss": 0.5261,
+      "step": 438
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8978691755923442,
+      "learning_rate": 4.616686230175303e-06,
+      "loss": 0.5862,
+      "step": 439
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8120850246861446,
+      "learning_rate": 4.614944540393679e-06,
+      "loss": 0.5652,
+      "step": 440
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1821084695714914,
+      "learning_rate": 4.613199232744525e-06,
+      "loss": 0.5598,
+      "step": 441
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9626422737625222,
+      "learning_rate": 4.611450310213401e-06,
+      "loss": 0.5267,
+      "step": 442
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9714913234889215,
+      "learning_rate": 4.6096977757920505e-06,
+      "loss": 0.5658,
+      "step": 443
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0179324078198233,
+      "learning_rate": 4.607941632478393e-06,
+      "loss": 0.582,
+      "step": 444
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.8565193856331161,
+      "learning_rate": 4.6061818832765246e-06,
+      "loss": 0.5715,
+      "step": 445
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9798501479599246,
+      "learning_rate": 4.604418531196708e-06,
+      "loss": 0.6007,
+      "step": 446
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0095846956468257,
+      "learning_rate": 4.602651579255369e-06,
+      "loss": 0.5947,
+      "step": 447
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.9316541079988245,
+      "learning_rate": 4.600881030475093e-06,
+      "loss": 0.5501,
+      "step": 448
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.080069353365406,
+      "learning_rate": 4.599106887884616e-06,
+      "loss": 0.5631,
+      "step": 449
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.965973137652201,
+      "learning_rate": 4.5973291545188235e-06,
+      "loss": 0.5267,
+      "step": 450
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.1082225966704087,
+      "learning_rate": 4.595547833418741e-06,
+      "loss": 0.6418,
+      "step": 451
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.0359312594194083,
+      "learning_rate": 4.593762927631536e-06,
+      "loss": 0.5644,
+      "step": 452
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.1254892914109433,
+      "learning_rate": 4.591974440210502e-06,
+      "loss": 0.5693,
+      "step": 453
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9121188587334927,
+      "learning_rate": 4.590182374215064e-06,
+      "loss": 0.5572,
+      "step": 454
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9348642624953207,
+      "learning_rate": 4.588386732710765e-06,
+      "loss": 0.5446,
+      "step": 455
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8667846547370581,
+      "learning_rate": 4.5865875187692695e-06,
+      "loss": 0.5681,
+      "step": 456
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9219061327454674,
+      "learning_rate": 4.5847847354683465e-06,
+      "loss": 0.5508,
+      "step": 457
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.8106132369123122,
+      "learning_rate": 4.5829783858918756e-06,
+      "loss": 0.5626,
+      "step": 458
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.7827483964442634,
+      "learning_rate": 4.5811684731298355e-06,
+      "loss": 0.5575,
+      "step": 459
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9284196979863513,
+      "learning_rate": 4.5793550002783e-06,
+      "loss": 0.5363,
+      "step": 460
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.029647468705457,
+      "learning_rate": 4.577537970439433e-06,
+      "loss": 0.5415,
+      "step": 461
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.0997127029950087,
+      "learning_rate": 4.575717386721482e-06,
+      "loss": 0.5814,
+      "step": 462
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.9589290300656341,
+      "learning_rate": 4.573893252238777e-06,
+      "loss": 0.5156,
+      "step": 463
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.905237143908251,
+      "learning_rate": 4.572065570111717e-06,
+      "loss": 0.5536,
+      "step": 464
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.929519794935609,
+      "learning_rate": 4.570234343466775e-06,
+      "loss": 0.5879,
+      "step": 465
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.096095808886982,
+      "learning_rate": 4.568399575436484e-06,
+      "loss": 0.6241,
+      "step": 466
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9486118894048778,
+      "learning_rate": 4.566561269159437e-06,
+      "loss": 0.6307,
+      "step": 467
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 2.0839490306744586,
+      "learning_rate": 4.564719427780276e-06,
+      "loss": 0.5655,
+      "step": 468
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9439525665822102,
+      "learning_rate": 4.562874054449694e-06,
+      "loss": 0.5437,
+      "step": 469
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9409142791465297,
+      "learning_rate": 4.5610251523244244e-06,
+      "loss": 0.6429,
+      "step": 470
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8664574493795525,
+      "learning_rate": 4.559172724567238e-06,
+      "loss": 0.5826,
+      "step": 471
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.80819349503324,
+      "learning_rate": 4.557316774346934e-06,
+      "loss": 0.5372,
+      "step": 472
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.8680097526865296,
+      "learning_rate": 4.555457304838341e-06,
+      "loss": 0.5503,
+      "step": 473
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.7466938790815696,
+      "learning_rate": 4.553594319222303e-06,
+      "loss": 0.5425,
+      "step": 474
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9610557658505607,
+      "learning_rate": 4.551727820685684e-06,
+      "loss": 0.5755,
+      "step": 475
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.9414839604282412,
+      "learning_rate": 4.549857812421353e-06,
+      "loss": 0.5915,
+      "step": 476
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8484957644576423,
+      "learning_rate": 4.547984297628186e-06,
+      "loss": 0.5676,
+      "step": 477
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.074524028551078,
+      "learning_rate": 4.546107279511055e-06,
+      "loss": 0.6084,
+      "step": 478
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.069692704122282,
+      "learning_rate": 4.544226761280826e-06,
+      "loss": 0.5676,
+      "step": 479
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8975472248317244,
+      "learning_rate": 4.54234274615435e-06,
+      "loss": 0.5904,
+      "step": 480
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0118868982719897,
+      "learning_rate": 4.540455237354466e-06,
+      "loss": 0.5722,
+      "step": 481
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9733105429381828,
+      "learning_rate": 4.5385642381099814e-06,
+      "loss": 0.6112,
+      "step": 482
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.862156914026863,
+      "learning_rate": 4.53666975165568e-06,
+      "loss": 0.5951,
+      "step": 483
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9512940035297868,
+      "learning_rate": 4.53477178123231e-06,
+      "loss": 0.5223,
+      "step": 484
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9202464191558823,
+      "learning_rate": 4.532870330086577e-06,
+      "loss": 0.5638,
+      "step": 485
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9015767656854419,
+      "learning_rate": 4.530965401471143e-06,
+      "loss": 0.5911,
+      "step": 486
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.95190921973106,
+      "learning_rate": 4.529056998644619e-06,
+      "loss": 0.6053,
+      "step": 487
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0058459596081644,
+      "learning_rate": 4.527145124871556e-06,
+      "loss": 0.5466,
+      "step": 488
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8902620959998047,
+      "learning_rate": 4.5252297834224454e-06,
+      "loss": 0.5526,
+      "step": 489
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.985466416169018,
+      "learning_rate": 4.523310977573711e-06,
+      "loss": 0.5958,
+      "step": 490
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.1140148957176415,
+      "learning_rate": 4.521388710607699e-06,
+      "loss": 0.613,
+      "step": 491
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9470601192089525,
+      "learning_rate": 4.51946298581268e-06,
+      "loss": 0.5847,
+      "step": 492
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0227057176069603,
+      "learning_rate": 4.51753380648284e-06,
+      "loss": 0.5784,
+      "step": 493
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.05501863673554,
+      "learning_rate": 4.515601175918269e-06,
+      "loss": 0.5501,
+      "step": 494
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0129325402811715,
+      "learning_rate": 4.513665097424967e-06,
+      "loss": 0.5641,
+      "step": 495
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0322333044110468,
+      "learning_rate": 4.51172557431483e-06,
+      "loss": 0.5422,
+      "step": 496
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9573055659958774,
+      "learning_rate": 4.509782609905644e-06,
+      "loss": 0.516,
+      "step": 497
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8223127451485421,
+      "learning_rate": 4.507836207521085e-06,
+      "loss": 0.5714,
+      "step": 498
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9343089861079434,
+      "learning_rate": 4.50588637049071e-06,
+      "loss": 0.5424,
+      "step": 499
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.8940990649350729,
+      "learning_rate": 4.503933102149948e-06,
+      "loss": 0.5832,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.908617301933682,
+      "learning_rate": 4.501976405840101e-06,
+      "loss": 0.5399,
+      "step": 501
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8290259512093785,
+      "learning_rate": 4.500016284908334e-06,
+      "loss": 0.5561,
+      "step": 502
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9840280991844164,
+      "learning_rate": 4.49805274270767e-06,
+      "loss": 0.5645,
+      "step": 503
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9864953051636856,
+      "learning_rate": 4.496085782596984e-06,
+      "loss": 0.5369,
+      "step": 504
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.979387839103732,
+      "learning_rate": 4.494115407940999e-06,
+      "loss": 0.6196,
+      "step": 505
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9266869362165981,
+      "learning_rate": 4.492141622110279e-06,
+      "loss": 0.5687,
+      "step": 506
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9887461782376619,
+      "learning_rate": 4.4901644284812205e-06,
+      "loss": 0.5264,
+      "step": 507
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8717867803152208,
+      "learning_rate": 4.488183830436052e-06,
+      "loss": 0.5612,
+      "step": 508
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.0044226171493,
+      "learning_rate": 4.486199831362828e-06,
+      "loss": 0.5571,
+      "step": 509
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.1075571016617958,
+      "learning_rate": 4.484212434655414e-06,
+      "loss": 0.5642,
+      "step": 510
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8031612547539957,
+      "learning_rate": 4.482221643713494e-06,
+      "loss": 0.5805,
+      "step": 511
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.8782516337672304,
+      "learning_rate": 4.480227461942556e-06,
+      "loss": 0.5596,
+      "step": 512
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.075073901596185,
+      "learning_rate": 4.478229892753886e-06,
+      "loss": 0.6124,
+      "step": 513
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0588983460568304,
+      "learning_rate": 4.47622893956457e-06,
+      "loss": 0.5589,
+      "step": 514
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.850248236464706,
+      "learning_rate": 4.474224605797476e-06,
+      "loss": 0.5603,
+      "step": 515
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.932844310652863,
+      "learning_rate": 4.472216894881261e-06,
+      "loss": 0.5571,
+      "step": 516
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.09975454805468,
+      "learning_rate": 4.470205810250357e-06,
+      "loss": 0.5975,
+      "step": 517
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.9694087093010304,
+      "learning_rate": 4.468191355344965e-06,
+      "loss": 0.5698,
+      "step": 518
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8794788153917539,
+      "learning_rate": 4.466173533611053e-06,
+      "loss": 0.5559,
+      "step": 519
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0650455557855434,
+      "learning_rate": 4.46415234850035e-06,
+      "loss": 0.5644,
+      "step": 520
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.0062649027982022,
+      "learning_rate": 4.462127803470334e-06,
+      "loss": 0.608,
+      "step": 521
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.043267877462657,
+      "learning_rate": 4.460099901984235e-06,
+      "loss": 0.573,
+      "step": 522
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.056372436619027,
+      "learning_rate": 4.4580686475110235e-06,
+      "loss": 0.5748,
+      "step": 523
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.8871033520138176,
+      "learning_rate": 4.456034043525404e-06,
+      "loss": 0.5339,
+      "step": 524
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.889474616209236,
+      "learning_rate": 4.45399609350781e-06,
+      "loss": 0.5185,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9767406217632912,
+      "learning_rate": 4.451954800944405e-06,
+      "loss": 0.5758,
+      "step": 526
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9588695861513832,
+      "learning_rate": 4.449910169327062e-06,
+      "loss": 0.5472,
+      "step": 527
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8852210889000718,
+      "learning_rate": 4.447862202153372e-06,
+      "loss": 0.5917,
+      "step": 528
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0103638871993077,
+      "learning_rate": 4.445810902926629e-06,
+      "loss": 0.5761,
+      "step": 529
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.201836945389513,
+      "learning_rate": 4.443756275155827e-06,
+      "loss": 0.5614,
+      "step": 530
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.900702305836831,
+      "learning_rate": 4.441698322355656e-06,
+      "loss": 0.5254,
+      "step": 531
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.134694583439314,
+      "learning_rate": 4.4396370480464915e-06,
+      "loss": 0.5607,
+      "step": 532
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8073751630381198,
+      "learning_rate": 4.437572455754391e-06,
+      "loss": 0.536,
+      "step": 533
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.9607338020142653,
+      "learning_rate": 4.435504549011088e-06,
+      "loss": 0.59,
+      "step": 534
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.0756430867435274,
+      "learning_rate": 4.433433331353988e-06,
+      "loss": 0.5538,
+      "step": 535
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.8280570853718465,
+      "learning_rate": 4.431358806326158e-06,
+      "loss": 0.5789,
+      "step": 536
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2005143967434977,
+      "learning_rate": 4.429280977476321e-06,
+      "loss": 0.545,
+      "step": 537
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.896479397543979,
+      "learning_rate": 4.4271998483588565e-06,
+      "loss": 0.5791,
+      "step": 538
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.117773381781195,
+      "learning_rate": 4.425115422533785e-06,
+      "loss": 0.5234,
+      "step": 539
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.4438942429566617,
+      "learning_rate": 4.423027703566769e-06,
+      "loss": 0.5692,
+      "step": 540
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.873481152225171,
+      "learning_rate": 4.4209366950291025e-06,
+      "loss": 0.5739,
+      "step": 541
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8655199147974673,
+      "learning_rate": 4.4188424004977085e-06,
+      "loss": 0.5795,
+      "step": 542
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.948840412241188,
+      "learning_rate": 4.416744823555129e-06,
+      "loss": 0.5304,
+      "step": 543
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8389034133315045,
+      "learning_rate": 4.414643967789523e-06,
+      "loss": 0.5076,
+      "step": 544
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.8269235720085213,
+      "learning_rate": 4.412539836794657e-06,
+      "loss": 0.5837,
+      "step": 545
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.1298715969759505,
+      "learning_rate": 4.410432434169902e-06,
+      "loss": 0.5694,
+      "step": 546
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0057741366005746,
+      "learning_rate": 4.408321763520223e-06,
+      "loss": 0.557,
+      "step": 547
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.7901331374893255,
+      "learning_rate": 4.406207828456177e-06,
+      "loss": 0.5746,
+      "step": 548
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.1994839889416187,
+      "learning_rate": 4.404090632593904e-06,
+      "loss": 0.5407,
+      "step": 549
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9664921082690268,
+      "learning_rate": 4.401970179555123e-06,
+      "loss": 0.5322,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9933486180243851,
+      "learning_rate": 4.399846472967124e-06,
+      "loss": 0.5798,
+      "step": 551
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.986612256562151,
+      "learning_rate": 4.397719516462765e-06,
+      "loss": 0.5213,
+      "step": 552
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.046550123292336,
+      "learning_rate": 4.395589313680459e-06,
+      "loss": 0.5857,
+      "step": 553
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.7902327250340486,
+      "learning_rate": 4.393455868264176e-06,
+      "loss": 0.555,
+      "step": 554
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.0203627138517146,
+      "learning_rate": 4.391319183863432e-06,
+      "loss": 0.6329,
+      "step": 555
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9373549045181289,
+      "learning_rate": 4.389179264133281e-06,
+      "loss": 0.566,
+      "step": 556
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8936753353678124,
+      "learning_rate": 4.387036112734316e-06,
+      "loss": 0.5555,
+      "step": 557
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8493817575820743,
+      "learning_rate": 4.3848897333326545e-06,
+      "loss": 0.5427,
+      "step": 558
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9119588677783816,
+      "learning_rate": 4.382740129599937e-06,
+      "loss": 0.5157,
+      "step": 559
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.8190137094200924,
+      "learning_rate": 4.380587305213321e-06,
+      "loss": 0.503,
+      "step": 560
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9891332712764953,
+      "learning_rate": 4.37843126385547e-06,
+      "loss": 0.5761,
+      "step": 561
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8620896547461154,
+      "learning_rate": 4.376272009214555e-06,
+      "loss": 0.5259,
+      "step": 562
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8896721756477406,
+      "learning_rate": 4.37410954498424e-06,
+      "loss": 0.5632,
+      "step": 563
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8302281976781984,
+      "learning_rate": 4.37194387486368e-06,
+      "loss": 0.5612,
+      "step": 564
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0721820586440165,
+      "learning_rate": 4.369775002557516e-06,
+      "loss": 0.533,
+      "step": 565
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8259926551813157,
+      "learning_rate": 4.367602931775865e-06,
+      "loss": 0.526,
+      "step": 566
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8096334574000785,
+      "learning_rate": 4.3654276662343155e-06,
+      "loss": 0.5306,
+      "step": 567
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9675637591445598,
+      "learning_rate": 4.363249209653922e-06,
+      "loss": 0.5577,
+      "step": 568
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8800389115841605,
+      "learning_rate": 4.361067565761197e-06,
+      "loss": 0.5553,
+      "step": 569
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.827485496395265,
+      "learning_rate": 4.358882738288105e-06,
+      "loss": 0.5587,
+      "step": 570
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.820954908943235,
+      "learning_rate": 4.356694730972056e-06,
+      "loss": 0.6186,
+      "step": 571
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.952072431699686,
+      "learning_rate": 4.3545035475559025e-06,
+      "loss": 0.5488,
+      "step": 572
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8292648968688423,
+      "learning_rate": 4.352309191787924e-06,
+      "loss": 0.5534,
+      "step": 573
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.826293122529813,
+      "learning_rate": 4.350111667421835e-06,
+      "loss": 0.5872,
+      "step": 574
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9251425791166785,
+      "learning_rate": 4.347910978216763e-06,
+      "loss": 0.5298,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8330818196811385,
+      "learning_rate": 4.345707127937253e-06,
+      "loss": 0.5871,
+      "step": 576
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.7842986545873851,
+      "learning_rate": 4.3435001203532555e-06,
+      "loss": 0.4898,
+      "step": 577
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.8778666245156521,
+      "learning_rate": 4.341289959240124e-06,
+      "loss": 0.5385,
+      "step": 578
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9300679499181266,
+      "learning_rate": 4.339076648378605e-06,
+      "loss": 0.5698,
+      "step": 579
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9440861965960357,
+      "learning_rate": 4.336860191554833e-06,
+      "loss": 0.5984,
+      "step": 580
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.929951096053947,
+      "learning_rate": 4.3346405925603265e-06,
+      "loss": 0.6222,
+      "step": 581
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9138258400335695,
+      "learning_rate": 4.332417855191974e-06,
+      "loss": 0.5498,
+      "step": 582
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.058548455869675,
+      "learning_rate": 4.330191983252039e-06,
+      "loss": 0.5218,
+      "step": 583
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.243429045583125,
+      "learning_rate": 4.327962980548142e-06,
+      "loss": 0.5768,
+      "step": 584
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.9213537104634244,
+      "learning_rate": 4.32573085089326e-06,
+      "loss": 0.5784,
+      "step": 585
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9165291289119128,
+      "learning_rate": 4.32349559810572e-06,
+      "loss": 0.5697,
+      "step": 586
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9674279518735756,
+      "learning_rate": 4.321257226009193e-06,
+      "loss": 0.5104,
+      "step": 587
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9051339015323923,
+      "learning_rate": 4.319015738432683e-06,
+      "loss": 0.5711,
+      "step": 588
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.957357618850765,
+      "learning_rate": 4.3167711392105245e-06,
+      "loss": 0.5854,
+      "step": 589
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9859311708308915,
+      "learning_rate": 4.314523432182376e-06,
+      "loss": 0.547,
+      "step": 590
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.773704456523191,
+      "learning_rate": 4.312272621193209e-06,
+      "loss": 0.5259,
+      "step": 591
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.82988033655793,
+      "learning_rate": 4.31001871009331e-06,
+      "loss": 0.5209,
+      "step": 592
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8925134832060522,
+      "learning_rate": 4.307761702738264e-06,
+      "loss": 0.59,
+      "step": 593
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8477075780641046,
+      "learning_rate": 4.305501602988953e-06,
+      "loss": 0.5714,
+      "step": 594
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8568432886623798,
+      "learning_rate": 4.303238414711552e-06,
+      "loss": 0.5877,
+      "step": 595
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8179798660158206,
+      "learning_rate": 4.3009721417775166e-06,
+      "loss": 0.6029,
+      "step": 596
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.8494963193854803,
+      "learning_rate": 4.29870278806358e-06,
+      "loss": 0.5236,
+      "step": 597
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9586017397154731,
+      "learning_rate": 4.296430357451744e-06,
+      "loss": 0.5998,
+      "step": 598
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.926616057974202,
+      "learning_rate": 4.2941548538292765e-06,
+      "loss": 0.5914,
+      "step": 599
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9321738359144827,
+      "learning_rate": 4.291876281088701e-06,
+      "loss": 0.5358,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.8229177571361932,
+      "learning_rate": 4.289594643127788e-06,
+      "loss": 0.5284,
+      "step": 601
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.849252449531427,
+      "learning_rate": 4.287309943849558e-06,
+      "loss": 0.5689,
+      "step": 602
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.985343175388319,
+      "learning_rate": 4.285022187162261e-06,
+      "loss": 0.6101,
+      "step": 603
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2412,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 603,
+  "total_flos": 283958071787520.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-603/training_args.bin b/checkpoint-603/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e
--- /dev/null
+++ b/checkpoint-603/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26
+size 7352
diff --git a/checkpoint-603/zero_to_fp32.py b/checkpoint-603/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b
--- /dev/null
+++ b/checkpoint-603/zero_to_fp32.py
@@ -0,0 +1,592 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of partitions; also used to derive the ``2 * world_size`` alignment
+        - ``fp32_flat_groups``: flat fp32 partitions, indexed as ``[partition][param group]``
+        - ``zero_model_states``: parsed model states; only ``[0].param_shapes`` is read
+
+    Raises:
+        - ``ValueError`` if, after alignment, the consumed element count differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    # for every param group, concatenate that group's partition from each entry into one flat vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total across all groups; only reported in debug mode, the loop below recomputes it per group
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through this group's flat vector, one parameter at a time
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes without a callable ``numel`` are treated as plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of partitions that are zipped together for every parameter
+        - ``fp32_flat_groups``: one flat fp32 tensor per partition, indexed ``0..world_size - 1``
+        - ``zero_model_states``: parsed model states; only ``[0].param_shapes`` is read
+
+    Raises:
+        - ``ValueError`` if the consumed element count differs from
+          ``fp32_flat_groups[0].numel() * world_size``
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is the position inside each partition's flat tensor (the same for every partition)
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        # take this param's slice from every partition, join the slices, then cut off the trailing padding
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    # offset counted elements of a single partition; scale it to compare against all partitions
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76
--- /dev/null
+++ b/config.json
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "alpindale/Mistral-7B-v0.2-hf",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32002
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.38.2"
+}
diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0f1c522c741bc956a541d5544734d12ff3a71b33
--- /dev/null
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89fd0fface188ca3f7988aa53f25e087292d72ca99cd52ef8cb52cf180ad2ff
+size 4943178720
diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6a1c7f2c1a284a17e9b7a9124040ee4bb6680b67
--- /dev/null
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49dd97160e0a8ff75303f02969df38307407c8800ce94aaa86611ceb6727bca0
+size 4999819336
diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3f8cc928e41a10674f627e9a238420111f974bb7
--- /dev/null
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03098a839ef612f1efe325b376aa90bc8311a01c1236120d9ca7934eb9b12fed
+size 4540532728
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483496960
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..40b1c6dadc2aed5b9e61dc7f9c7299e0aee16069
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..8b443ef19c2a19acc3ac64fb9c3db4a72921dff6
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..392e982500a327e3b6f821a513fcae6cc7f4f453
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,60 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26
+size 7352